# Prepare


In [None]:
from prelude import *

In [None]:
""" Test """

# Smoke test: load the raw `place` table and preview the rows for the two
# countries that the BI-11 query graphs below are built around.
PLACE = f"{RAW_DATA_PREFIX}/place.csv"

df = pl.read_csv(PLACE)

# Keep only the rows whose `name` is one of the two countries of interest.
is_target_country = pl.col("name").is_in(["India", "China"])
test_table = df.lazy().filter(is_target_country).collect()

# Project the id / name / type / label columns, in that order.
preview_columns = [":ID(Placeid)", "name", ":TYPE", ":LABEL"]
out = test_table.select([pl.col(column) for column in preview_columns])
out.head(5)

:ID(Placeid),name,:TYPE,:LABEL
i64,str,str,str
0,"""India""","""country""","""place"""
1,"""China""","""country""","""place"""


In [None]:
""" Load `vertices/edges` """

from polars import DataFrame

# Vertex tables, keyed by the CSV file stem (file names without an underscore).
vertices: dict[str, DataFrame] = {}
# Edge tables, keyed by the `(src, relationship, dst)` triple encoded in the
# CSV file stem as `<src>_<relationship>_<dst>`.
raw_edges: dict[tuple[str, str, str], DataFrame] = {}
# Per-namespace mapping `origin_id -> unified id`; filled in by later cells.
switch_namespace: dict[str, dict[int, int]] = {}

# `glob.glob` returns paths in arbitrary (filesystem-dependent) order. The
# unified ids assigned later follow the insertion order of `vertices`, so the
# file list is sorted to make that assignment reproducible across machines.
for file in sorted(glob.glob(f"{RAW_DATA_PREFIX}/*.csv")):
    df_name = os.path.basename(file).split(".")[0]
    if "_" in df_name:
        parts = df_name.split("_")
        if len(parts) != 3:
            # Fail with an actionable message instead of an opaque
            # "too many / not enough values to unpack" error.
            raise ValueError(
                f"Unexpected edge file name {file!r}: "
                "expected `<src>_<relationship>_<dst>.csv`"
            )
        src, relationship, dst = parts
        raw_edges[(src, relationship, dst)] = pl.read_csv(file)
    else:
        vertices[df_name] = pl.read_csv(file)


# Total row counts over all vertex / edge tables.
vertex_num = sum(len(df) for df in vertices.values())
edge_num = sum(len(df) for df in raw_edges.values())

In [None]:
""" Initialize `switch_namespace` """

# Register one empty `origin_id -> unified id` mapping per vertex table. The
# namespace is derived from the table's first column name via `get_namespace`.
switch_namespace.update(
    (get_namespace(vertex_table.columns[0]), dict())
    for vertex_table in vertices.values()
)

switch_namespace

{'Commentid': {},
 'Forumid': {},
 'Organisationid': {},
 'Personid': {},
 'Placeid': {},
 'Postid': {},
 'Tagid': {},
 'TagClassid': {}}

In [None]:
""" Re-arrange all `:ID($AnyNamespace)` """

# Next free id in the unified id space shared by all namespaces.
curr_global_id = 0

with tqdm(desc="Mapping `origin_id` to `uni_id`", total=vertex_num) as bar:
    for df in vertices.values():
        base_label = get_namespace(df.columns[0])
        # Named `id_map` (not `map`) so the builtin `map` is not shadowed in
        # the kernel's global namespace for every cell that runs afterwards.
        id_map = switch_namespace[base_label]
        # Only the first column (the id column) is needed, so iterate that
        # column alone instead of materialising every row as a tuple.
        for origin_id in df.to_series(0):
            id_map[int(origin_id)] = curr_global_id
            curr_global_id += 1
            bar.update(1)

# Every vertex row must have consumed exactly one unified id.
assert curr_global_id == vertex_num
set(switch_namespace.keys())

Mapping `origin_id` to `uni_id`: 100%|██████████| 3181724/3181724 [00:03<00:00, 908464.65it/s] 


{'Commentid',
 'Forumid',
 'Organisationid',
 'Personid',
 'Placeid',
 'Postid',
 'TagClassid',
 'Tagid'}

# BI 11


## Original QG Build


In [None]:
""" Build `India` and `China` query graph """

# Query-graph vertices: 0 is the country, 1-3 are cities, 4-6 are persons.
china_query_graph_labels = [BaseLabel.Country + "China"] + ["city"] * 3 + ["person"] * 3
# Edges are listed as (src, dst) vertex indices.
china_query_graph_edges = [
    # city -> country
    (1, 0),
    (2, 0),
    (3, 0),
    # person -> city
    (4, 1),
    (5, 2),
    (6, 3),
    # triangle among the three persons
    (4, 5),
    (5, 6),
    (6, 4),
]

# The India graph has the same shape; only the country label differs.
india_query_graph_labels = [BaseLabel.Country + "India"] + ["city"] * 3 + ["person"] * 3
india_query_graph_edges = china_query_graph_edges

# File layout: "#0" header, label count, one label per line, edge count, then
# one "src dst" pair per line.
for qg_path, qg_labels, qg_edges in [
    (BI_11_CHINA_QG, china_query_graph_labels, china_query_graph_edges),
    (BI_11_INDIA_QG, india_query_graph_labels, india_query_graph_edges),
]:
    # An already existing file is left untouched (never overwritten).
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

## Optimized QG Build


In [None]:
""" Build `India` and `China` query graph """

# NOTE: these four names are also assigned by the "Original QG Build" cell;
# whichever of the two cells ran last determines their current value.
# Query-graph vertices: 0 is the country, 1-3 are persons (no city vertices).
china_query_graph_labels = [BaseLabel.Country + "China"] + ["person"] * 3
# Edges are listed as (src, dst) vertex indices.
china_query_graph_edges = [
    # person -> country
    (1, 0),
    (2, 0),
    (3, 0),
    # triangle among the three persons
    (1, 2),
    (2, 3),
    (3, 1),
]

# The India graph has the same shape; only the country label differs.
india_query_graph_labels = [BaseLabel.Country + "India"] + ["person"] * 3
india_query_graph_edges = china_query_graph_edges

# File layout: "#0" header, label count, one label per line, edge count, then
# one "src dst" pair per line.
for qg_path, qg_labels, qg_edges in [
    (BI_11_CHINA_QG_OPTIMIZED, china_query_graph_labels, china_query_graph_edges),
    (BI_11_INDIA_QG_OPTIMIZED, india_query_graph_labels, india_query_graph_edges),
]:
    # An already existing file is left untouched (never overwritten).
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

## Execute `Query`


In [None]:
""" Exec `match` on `original` """

# Presumably `run_veq_m_100k` appends one timing per call to the list passed
# as its last argument (the comparison cell uses these lists as columns) --
# TODO confirm against `prelude`.
bi_11_original_time_table = []

# China first, then India: the comparison cell relies on this task order.
for res, task, match_args in (
    (BI_11_ORIGINAL_CHINA_RES, "original_china_match", original_china_match_args),
    (BI_11_ORIGINAL_INDIA_RES, "original_india_match", original_india_match_args),
):
    run_veq_m_100k(res, task, match_args, bi_11_original_time_table)

""" Exec `match` on `optimized` """

bi_11_optimized_time_table = []

for res, task, match_args in (
    (BI_11_OPTIMIZED_CHINA_RES, "optimized_china_match", optimized_china_match_args),
    (BI_11_OPTIMIZED_INDIA_RES, "optimized_india_match", optimized_india_match_args),
):
    run_veq_m_100k(res, task, match_args, bi_11_optimized_time_table)

>>> Running: original_china_match...
    Data file: ./out/original/BI_11/data_graph.txt
    Query file: ./out/original/BI_11/china_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3748
    Total Recursive Call Count: 131167
    Number of Matches: 100000
    Filtering Time (ms): 20.6644
    Verification Time (ms): 6480.42
    Processing Time (ms): 6501.08
<<< Done!
>>> Running: original_india_match...
    Data file: ./out/original/BI_11/data_graph.txt
    Query file: ./out/original/BI_11/india_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3871
    Total Recursive Call Count: 131701
    Number of Matches: 100000
    Filtering Time (ms): 23.6162
    Verification Time (ms): 6028.32
    Processing Time (ms): 6051.94
<<< Done!
>>> Running: optimized_china_match...
    Data file: ./out/optimized/BI_11/data_graph.txt
    Query file: ./out/optimized/BI_11/china_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3157
    Total Recursive Call Count: 17593
    Number of Matches: 10000

In [None]:
""" Show BI-11 `comparison data-frame` """

print(
    "Comparison between: `original_china/india_match` & `optimized_china/india_match`"
)

# One row per task, with the original and optimized timings side by side.
# Column order follows the order of the pairs below.
comparison_columns = [
    ("task", ["china_match", "india_match"]),
    ("original (ms)", bi_11_original_time_table),
    ("optimized (ms)", bi_11_optimized_time_table),
]
df = pl.DataFrame(dict(comparison_columns))
df

Comparison between: `original_china/india_match` & `optimized_china/india_match`


task,original (ms),optimized (ms)
str,f64,f64
"""china_match""",6501.08,3033.07
"""india_match""",6051.94,3079.34
