# Prepare


In [None]:
from prelude import *

In [None]:
""" Test """

PLACE = f"{RAW_DATA_PREFIX}/place.csv"

# Sanity check: read the place table and preview the rows whose `name` is
# India or China, restricted to the id / name / type / label columns.
df = pl.read_csv(PLACE)
wanted_names = ["India", "China"]
test_table = df.filter(pl.col("name").is_in(wanted_names))
preview_columns = [":ID(Placeid)", "name", ":TYPE", ":LABEL"]
out = test_table.select(preview_columns)
out.head(5)

:ID(Placeid),name,:TYPE,:LABEL
i64,str,str,str
0,"""India""","""country""","""place"""
1,"""China""","""country""","""place"""


In [None]:
""" Load `vertices/edges` """

from polars import DataFrame

# Vertex tables are keyed by file stem (e.g. `place`); edge tables are keyed by
# the `(src, relationship, dst)` triple parsed from the file stem.
vertices = dict[str, DataFrame]()
raw_edges = dict[tuple[str, str, str], DataFrame]()
# Per-namespace map of original id -> global id; populated by later cells.
switch_namespace = dict[str, dict[int, int]]()

# `glob.glob` yields paths in arbitrary, filesystem-dependent order. Global
# vertex ids are assigned later by walking `vertices` in insertion order, so
# the paths are sorted to make the generated graph reproducible.
for file in sorted(glob.glob(f"{RAW_DATA_PREFIX}/*.csv")):
    df_name = os.path.basename(file).split(".")[0]
    if "_" in df_name:
        # Edge files are expected to be named `<src>_<relationship>_<dst>.csv`;
        # any other number of `_`-separated parts raises ValueError here.
        src, relationship, dst = df_name.split("_")
        raw_edges[(src, relationship, dst)] = pl.read_csv(file)
    else:
        vertices[df_name] = pl.read_csv(file)


# Totals are used as progress-bar lengths in the cells below.
vertex_num = sum(len(df) for df in vertices.values())
edge_num = sum(len(df) for df in raw_edges.values())

In [None]:
""" Initialize `switch_namespace` """

# Give the id namespace of every vertex table (parsed from the header of its
# first column) a fresh, empty `origin_id -> uni_id` map.
switch_namespace.update(
    {get_namespace(table.columns[0]): {} for table in vertices.values()}
)

switch_namespace

{'Commentid': {},
 'Forumid': {},
 'Organisationid': {},
 'Personid': {},
 'Placeid': {},
 'Postid': {},
 'Tagid': {},
 'TagClassid': {}}

In [None]:
""" Re-arrange all `:ID($AnyNamespace)` """

# Next unused global vertex id; ids are handed out consecutively from 0 in
# `vertices` iteration order.
curr_global_id = 0

with tqdm(desc="Mapping `origin_id` to `uni_id`", total=vertex_num) as bar:
    for df in vertices.values():
        id_column = df.columns[0]
        # Deliberately not called `map`: a top-level `map` would shadow the
        # builtin in every later cell of the notebook.
        id_map = switch_namespace[get_namespace(id_column)]
        # Only the id column is needed, so full rows are not materialized.
        for origin_id in df.get_column(id_column).to_list():
            id_map[int(origin_id)] = curr_global_id
            curr_global_id += 1
            bar.update(1)

assert curr_global_id == vertex_num
# A duplicated origin id inside one namespace overwrites its map entry and
# leaves a gap in the global id range, which would only surface later when the
# data graph is written via `labels[i]`. Fail here instead.
assert (
    sum(len(mapping) for mapping in switch_namespace.values()) == vertex_num
), "duplicate origin ids detected within a namespace"
set(switch_namespace.keys())

Mapping `origin_id` to `uni_id`: 100%|██████████| 3181724/3181724 [00:03<00:00, 908464.65it/s] 


{'Commentid',
 'Forumid',
 'Organisationid',
 'Personid',
 'Placeid',
 'Postid',
 'TagClassid',
 'Tagid'}

# BI 11


## Original Query


In [None]:
# Filled by `place_op` / `normal_op`: global vertex id -> label, plus the set
# of distinct labels encountered.
labels: dict[int, str] = {}
label_set: set[str] = set()

# Place names that `place_op` labels as `BaseLabel.Country + name` instead of
# using the row's `:TYPE` value.
explicit_place_labels = {
    "China",
    "India",
    "Tunisia",
    "Cuba",
    "France",
    "Belgium",
    "Greece",
    "Chile",
}
explicit_tagclass_labels = {
    "NascarDriver",
    "Thing",
    "Politician",
    "Saint",
    "President",
    "Song",
}
explicit_date_labels = set(("2010-01-07", "2010-01-10", "2010-01-26"))
explicit_end_date_labels = set(("2012-11-11", "2012-11-09", "2012-11-03"))


def place_op(df: DataFrame):
    """Label every row of the place table.

    A place whose name is listed in `explicit_place_labels` is labelled
    `BaseLabel.Country + name`; every other place keeps its `:TYPE` value.
    Each label is stored in the module-level `labels` (keyed by the global id
    looked up in `switch_namespace`) and added to `label_set`. The
    module-level progress `bar` is advanced once per row, so a `bar` must
    exist when this is called.
    """
    id_column = df.columns[0]
    uid_of = switch_namespace[get_namespace(id_column)]
    wanted = df.select([pl.col(id_column), pl.col("name"), pl.col(":TYPE")])
    for raw_id, raw_name, raw_type in wanted.rows():
        uid = uid_of[int(raw_id)]
        place_name = str(raw_name)
        if place_name in explicit_place_labels:
            label = BaseLabel.Country + place_name
        else:
            label = str(raw_type)
        labels[uid] = label
        label_set.add(label)
        bar.update(1)


def normal_op(df: DataFrame):
    """Label every row of a vertex table with its `:TYPE` or `:LABEL` value.

    `:TYPE` is used when the table has that column, otherwise `:LABEL`. Each
    label is stored in the module-level `labels` under the row's global id
    and added to `label_set`. The module-level progress `bar` is advanced once
    per row.
    """
    id_column = df.columns[0]
    label_column = ":TYPE" if ":TYPE" in df.columns else ":LABEL"
    uid_of = switch_namespace[get_namespace(id_column)]
    id_label_pairs = df.select([pl.col(id_column), pl.col(label_column)]).rows()
    for raw_id, raw_label in id_label_pairs:
        label = str(raw_label)
        labels[uid_of[int(raw_id)]] = label
        label_set.add(label)
        bar.update(1)


def vertex_op(df_name: str, df: DataFrame):
    """Route a vertex table to its labelling routine.

    The table named `place` goes to `place_op`; every other table goes to
    `normal_op`.
    """
    handler = place_op if df_name == "place" else normal_op
    handler(df)

In [None]:
""" Build map of `vertex.uid -> label` """

# `bar` must be a module-level name: `place_op` / `normal_op` advance it.
bar = tqdm(desc="Build map of `vertex.uni_id -> label`", total=vertex_num)
with bar:
    for table_name in vertices:
        vertex_op(table_name, vertices[table_name])

label_set

Build map of `vertex.uni_id -> label`: 100%|██████████| 3181724/3181724 [00:02<00:00, 1496053.20it/s]


{'China',
 'India',
 'city',
 'comment',
 'company',
 'continent',
 'country',
 'forum',
 'person',
 'post',
 'tag',
 'tagclass',
 'university'}

In [None]:
""" Build edges in format: `(src_id, dst_id)` """

# Directed edges as `(src_global_id, dst_global_id)`; a set, so duplicate
# pairs coming from different edge tables collapse into one entry.
edges = set[tuple[int, int]]()

with tqdm(desc="Build edges in format: `(src_id, dst_id)`", total=edge_num) as bar:
    # Only the tables are needed; the `(src, relationship, dst)` keys are not.
    for df in raw_edges.values():
        src_column, dst_column = df.columns[0], df.columns[1]
        # The namespace in each column header selects the id map to translate
        # that endpoint with.
        src_map = switch_namespace[get_namespace(src_column)]
        dst_map = switch_namespace[get_namespace(dst_column)]
        # Deliberately not called `slice`: a top-level `slice` would shadow
        # the builtin in every later cell of the notebook.
        endpoint_rows = df.select([pl.col(src_column), pl.col(dst_column)]).rows()
        for src_id, dst_id in endpoint_rows:
            edges.add((src_map[int(src_id)], dst_map[int(dst_id)]))
            bar.update(1)

Build edges in format: `(src_id, dst_id)`: 100%|██████████| 17256038/17256038 [00:18<00:00, 927268.42it/s] 


In [None]:
""" Write into `data_graph.txt` """

if not os.path.exists(COMMON_DG):
    # Write to a temporary sibling file and rename it into place only once it
    # is complete. Writing `COMMON_DG` directly would let an interrupted run
    # leave a truncated file that the existence check above then treats as
    # finished on every later run.
    tmp_path = f"{COMMON_DG}.tmp"
    with open(tmp_path, "w") as f:
        # Layout: `#0`, vertex count, one label per line in global-id order,
        # edge count, then one `src dst` pair per line.
        f.write("#0\n")
        f.write(f"{len(labels)}\n")
        with tqdm(
            desc=f"Writing `labels` into `{COMMON_DG}`", total=len(labels)
        ) as bar:
            # Relies on `labels` having exactly the keys 0..len(labels)-1.
            for i in range(len(labels)):
                f.write(f"{labels[i]}\n")
                bar.update(1)
        f.write(f"{len(edges)}\n")
        with tqdm(desc=f"Writing `edges` into `{COMMON_DG}`", total=len(edges)) as bar:
            for src, dst in edges:
                f.write(f"{src} {dst}\n")
                bar.update(1)
    os.replace(tmp_path, COMMON_DG)
else:
    print(f"File `{COMMON_DG}` already exists")

Writing `labels` into `data_graph.txt`: 100%|██████████| 3181724/3181724 [00:02<00:00, 1514438.58it/s]
Writing `edges` into `data_graph.txt: 100%|██████████| 17256033/17256033 [00:20<00:00, 857313.93it/s]


In [None]:
""" Build `India` and `China` query graph """

# Query vertex 0 is the country, 1-3 are cities, 4-6 are persons.
china_query_graph_labels = [BaseLabel.Country + "China"] + ["city"] * 3 + ["person"] * 3
china_query_graph_edges = [
    # each city -> the country
    (1, 0),
    (2, 0),
    (3, 0),
    # each person -> one distinct city
    (4, 1),
    (5, 2),
    (6, 3),
    # a cycle through the three persons
    (4, 5),
    (5, 6),
    (6, 4),
]

# The India query has the same shape; only the country label differs.
india_query_graph_labels = [BaseLabel.Country + "India"] + ["city"] * 3 + ["person"] * 3
india_query_graph_edges = china_query_graph_edges

# Write each query graph once, in the same layout as the data graph:
# `#0`, vertex count, labels, edge count, `src dst` pairs.
for qg_path, qg_labels, qg_edges in (
    (BI_11_CHINA_QG, china_query_graph_labels, china_query_graph_edges),
    (BI_11_INDIA_QG, india_query_graph_labels, india_query_graph_edges),
):
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

## Optimized Query


In [None]:
""" Load all `index edge` """

# Index-edge tables keyed by file stem (file name up to its first `.`).
index_edges: dict[str, DataFrame] = {
    os.path.basename(csv_path).split(".")[0]: pl.read_csv(csv_path)
    for csv_path in glob.glob(f"{INDEX_EDGES_PREFIX}/*.csv")
}

# Total row count, used as the progress-bar length when merging them in.
index_edge_num = sum(map(len, index_edges.values()))

In [None]:
""" Add `index edge` into `edges` """

# NOTE: this extends the existing `edges` set in place. After this cell runs,
# `edges` holds original + index edges, so re-running the earlier "original"
# data-graph write cell would include the index edges as well.
with tqdm(desc="Adding `index edge` into `edges`", total=index_edge_num) as bar:
    for df in index_edges.values():
        src_column, dst_column = df.columns[0], df.columns[1]
        # The namespace in each column header selects the id map to translate
        # that endpoint with.
        src_map = switch_namespace[get_namespace(src_column)]
        dst_map = switch_namespace[get_namespace(dst_column)]
        # Deliberately not called `slice`: a top-level `slice` would shadow
        # the builtin in every later cell of the notebook.
        endpoint_rows = df.select([pl.col(src_column), pl.col(dst_column)]).rows()
        for src_id, dst_id in endpoint_rows:
            edges.add((src_map[int(src_id)], dst_map[int(dst_id)]))
            bar.update(1)

Adding `index edge` into `edges`: 100%|██████████| 20605002/20605002 [00:24<00:00, 851473.18it/s] 


In [None]:
""" Write into `data_graph.txt` """

if not os.path.exists(COMMON_DG_OPTIMIZED):
    # Write to a temporary sibling file and rename it into place only once it
    # is complete. Writing `COMMON_DG_OPTIMIZED` directly would let an
    # interrupted run leave a truncated file that the existence check above
    # then treats as finished on every later run.
    tmp_path = f"{COMMON_DG_OPTIMIZED}.tmp"
    with open(tmp_path, "w") as f:
        # Layout: `#0`, vertex count, one label per line in global-id order,
        # edge count, then one `src dst` pair per line.
        f.write("#0\n")
        f.write(f"{len(labels)}\n")
        with tqdm(
            desc=f"Writing `labels` into `{COMMON_DG_OPTIMIZED}`", total=len(labels)
        ) as bar:
            # Relies on `labels` having exactly the keys 0..len(labels)-1.
            for i in range(len(labels)):
                f.write(f"{labels[i]}\n")
                bar.update(1)
        f.write(f"{len(edges)}\n")
        with tqdm(
            desc=f"Writing `edges` into `{COMMON_DG_OPTIMIZED}`", total=len(edges)
        ) as bar:
            for src, dst in edges:
                f.write(f"{src} {dst}\n")
                bar.update(1)
    os.replace(tmp_path, COMMON_DG_OPTIMIZED)
else:
    print(f"File `{COMMON_DG_OPTIMIZED}` already exists")

Writing `labels` into `data_graph.txt`: 100%|██████████| 3181724/3181724 [00:02<00:00, 1477753.36it/s]
Writing `edges` into `data_graph.txt: 100%|██████████| 34775921/34775921 [00:42<00:00, 825434.42it/s]


In [None]:
""" Build `India` and `China` query graph """

# Query vertex 0 is the country and 1-3 are persons; there are no city
# vertices in this variant.
china_query_graph_labels = [BaseLabel.Country + "China"] + ["person"] * 3
china_query_graph_edges = [
    # each person -> the country
    (1, 0),
    (2, 0),
    (3, 0),
    # a cycle through the three persons
    (1, 2),
    (2, 3),
    (3, 1),
]

# The India query has the same shape; only the country label differs.
india_query_graph_labels = [BaseLabel.Country + "India"] + ["person"] * 3
india_query_graph_edges = china_query_graph_edges

# Write each query graph once, in the same layout as the data graph:
# `#0`, vertex count, labels, edge count, `src dst` pairs.
for qg_path, qg_labels, qg_edges in (
    (BI_11_CHINA_QG_OPTIMIZED, china_query_graph_labels, china_query_graph_edges),
    (BI_11_INDIA_QG_OPTIMIZED, india_query_graph_labels, india_query_graph_edges),
):
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

## Execute `Query`


In [None]:
""" Args """

from args import *

In [None]:
""" Exec `match` on `original` """

# Passed to every "original" run below; later used as the `original (ms)`
# column. Presumably `run_veq_m_100k` appends one timing per run — confirm
# against its definition.
bi_11_original_time_table = []

for result_path, task_name, match_args in (
    (BI_11_ORIGINAL_CHINA_RES, "original_china_match", original_china_match_args),
    (BI_11_ORIGINAL_INDIA_RES, "original_india_match", original_india_match_args),
):
    run_veq_m_100k(result_path, task_name, match_args, bi_11_original_time_table)

""" Exec `match` on `optimized` """

# Same as above, for the "optimized" graphs / `optimized (ms)` column.
bi_11_optimized_time_table = []

for result_path, task_name, match_args in (
    (BI_11_OPTIMIZED_CHINA_RES, "optimized_china_match", optimized_china_match_args),
    (BI_11_OPTIMIZED_INDIA_RES, "optimized_india_match", optimized_india_match_args),
):
    run_veq_m_100k(result_path, task_name, match_args, bi_11_optimized_time_table)

>>> Running: original_china_match...
    Data file: ./out/original/BI_11/data_graph.txt
    Query file: ./out/original/BI_11/china_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3748
    Total Recursive Call Count: 131167
    Number of Matches: 100000
    Filtering Time (ms): 20.6644
    Verification Time (ms): 6480.42
    Processing Time (ms): 6501.08
<<< Done!
>>> Running: original_india_match...
    Data file: ./out/original/BI_11/data_graph.txt
    Query file: ./out/original/BI_11/india_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3871
    Total Recursive Call Count: 131701
    Number of Matches: 100000
    Filtering Time (ms): 23.6162
    Verification Time (ms): 6028.32
    Processing Time (ms): 6051.94
<<< Done!
>>> Running: optimized_china_match...
    Data file: ./out/optimized/BI_11/data_graph.txt
    Query file: ./out/optimized/BI_11/china_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3157
    Total Recursive Call Count: 17593
    Number of Matches: 10000

In [None]:
""" Show BI-11 `comparison data-frame` """

headline = (
    "Comparison between: `original_china/india_match` & `optimized_china/india_match`"
)
print(headline)

# One row per task. Each timing table must hold exactly one entry per task
# (china first, then india) for the columns to line up.
comparison_columns = {
    "task": ["china_match", "india_match"],
    "original (ms)": bi_11_original_time_table,
    "optimized (ms)": bi_11_optimized_time_table,
}
df = pl.DataFrame(comparison_columns)
df

Comparison between: `original_china/india_match` & `optimized_china/india_match`


task,original (ms),optimized (ms)
str,f64,f64
"""china_match""",6501.08,3033.07
"""india_match""",6051.94,3079.34
