# Prepare


In [2]:
import polars as pl
import re
from tqdm.notebook import tqdm

import os

# Everything the notebook reads lives under DATA_PREFIX: the raw CSV tables and the
# pre-computed index-edge CSVs.
DATA_PREFIX = "./data"
RAW_DATA_PREFIX = f"{DATA_PREFIX}/raw"
INDEX_EDGES_PREFIX = f"{DATA_PREFIX}/index"

# `exist_ok=True` makes directory creation a single idempotent call instead of a
# check-then-create pair that can race with another process.
os.makedirs(INDEX_EDGES_PREFIX, exist_ok=True)
os.makedirs(RAW_DATA_PREFIX, exist_ok=True)


def get_inner_namespace(col_name: str) -> str:
    """Return the text inside the first pair of parentheses in ``col_name``.

    For example ``":ID(Forumid)"`` yields ``"Forumid"``. The match is
    non-greedy, so only the first parenthesised group is returned.

    Args:
        col_name: A column header.

    Returns:
        The parenthesised text, or ``""`` when ``col_name`` contains no
        ``(...)`` group.
    """
    # Raw string: in a plain literal `\(` is an invalid escape sequence, which
    # newer Python versions report as a warning.
    match = re.search(r"\((.*?)\)", col_name)
    return "" if match is None else match.group(1)


def get_namespace(col_name: str) -> str:
    """Return the id namespace a column header belongs to.

    The namespace is the parenthesised part of the header, except that the
    place sub-types share ``"Placeid"`` and the organisation sub-types share
    ``"Organisationid"``.

    Args:
        col_name: A column header such as ``":ID(Forumid)"``.

    Returns:
        The (possibly merged) namespace; ``""`` if the header has no
        parenthesised part.
    """
    merged_namespaces = {
        "Country": "Placeid",
        "Continent": "Placeid",
        "City": "Placeid",
        "University": "Organisationid",
        "Company": "Organisationid",
    }
    inner = get_inner_namespace(col_name)
    # Anything not listed above is its own namespace.
    return merged_namespaces.get(inner, inner)


# Smoke test: a header whose parenthesised part is not one of the merged
# place/organisation sub-types is returned unchanged (displays 'Forumid').
get_namespace(":ID(Forumid)")

'Forumid'

In [3]:
""" Test """

# Sanity check on the place table: display the rows of the two countries that
# receive their own vertex label further down.
PLACE = f"{RAW_DATA_PREFIX}/place.csv"
anchor_countries = ["India", "China"]
shown_columns = [":ID(Placeid)", "name", ":TYPE", ":LABEL"]

df = pl.read_csv(PLACE)
is_anchor_country = pl.col("name").is_in(anchor_countries)
test_table = df.lazy().filter(is_anchor_country).collect()
out = test_table.select([pl.col(column) for column in shown_columns])
out.head(5)

:ID(Placeid),name,:TYPE,:LABEL
i64,str,str,str
0,"""India""","""country""","""place"""
1,"""China""","""country""","""place"""


In [4]:
""" Load `vertices/edges` """

import os, glob
from polars import DataFrame

# vertices:          table name -> vertex table
# raw_edges:         (src, relationship, dst), parsed from the file name -> edge table
# switch_namespace:  namespace -> {original id -> global id}; filled by later cells
vertices = dict[str, DataFrame]()
raw_edges = dict[tuple[str, str, str], DataFrame]()
switch_namespace = dict[str, dict[int, int]]()

# `glob` yields files in arbitrary, filesystem-dependent order. Sorting fixes the
# insertion order of `vertices`, and with it the global ids that are handed out
# by iterating over `vertices` later, so they are the same on every run.
for file in sorted(glob.glob(f"{RAW_DATA_PREFIX}/*.csv")):
    df_name = os.path.basename(file).split(".")[0]
    if "_" in df_name:
        # Edge tables are named `<src>_<relationship>_<dst>.csv`.
        src, relationship, dst = df_name.split("_")
        raw_edges[(src, relationship, dst)] = pl.read_csv(file)
    else:
        vertices[df_name] = pl.read_csv(file)


# Row totals, used as progress-bar totals by the cells below.
vertex_num = sum(len(df) for df in vertices.values())
edge_num = sum(len(df) for df in raw_edges.values())

In [5]:
""" Initialize `switch_namespace` """

# Give every vertex namespace (taken from the first column header of each vertex
# table) a fresh, empty `original id -> global id` map.
switch_namespace.update(
    {get_namespace(table.columns[0]): dict() for table in vertices.values()}
)

switch_namespace

{'Commentid': {},
 'Forumid': {},
 'Organisationid': {},
 'Personid': {},
 'Placeid': {},
 'Postid': {},
 'Tagid': {},
 'TagClassid': {}}

In [6]:
""" Re-arrange all `:ID($AnyNamespace)` """

# Next unused global id. Ids are handed out densely from 0, table by table, in
# the iteration order of `vertices`.
curr_global_id = 0

with tqdm(desc="Mapping `origin_id` to `uni_id`", total=vertex_num) as bar:
    for df in vertices.values():
        namespace = get_namespace(df.columns[0])
        # Deliberately not called `map`: at cell level that name would shadow the
        # builtin `map` for every cell executed afterwards.
        id_map = switch_namespace[namespace]
        for row in df.rows():
            # The first column holds the table's original id.
            id_map[int(row[0])] = curr_global_id
            curr_global_id += 1
            bar.update(1)

# Every vertex row must have consumed exactly one global id.
assert curr_global_id == vertex_num
set(switch_namespace.keys())

Mapping `origin_id` to `uni_id`:   0%|          | 0/3181724 [00:00<?, ?it/s]

{'Commentid',
 'Forumid',
 'Organisationid',
 'Personid',
 'Placeid',
 'Postid',
 'TagClassid',
 'Tagid'}

# BI 11


## Original Query


In [7]:
import os

# Output locations for the "original" (no index edges) variant of the BI-11 query.
OUT_PREFIX = "./out"
ORIGINAL_QUERY_PREFIX = f"{OUT_PREFIX}/original"
DATA_GRAPH = f"{ORIGINAL_QUERY_PREFIX}/data_graph.txt"
CHINA_QUERY_GRAPH = f"{ORIGINAL_QUERY_PREFIX}/china_query_graph.txt"
INDIA_QUERY_GRAPH = f"{ORIGINAL_QUERY_PREFIX}/india_query_graph.txt"

# Idempotent creation; no separate existence check needed.
os.makedirs(ORIGINAL_QUERY_PREFIX, exist_ok=True)

In [8]:
""" Build map of `vertex.uni_id -> label` """

# Global vertex id -> label, filled by `place_op` / `normal_op`.
labels: dict[int, str] = {}
# Every distinct label assigned so far.
label_set: set[str] = set()


def place_op(df: DataFrame, gen_new_country_tag_for_bi_10: bool = False):
    """Assign a label to every row of the place table.

    A place is labelled with its ``:TYPE`` value, with two exceptions: places
    named "China" or "India" are labelled with their name, and, only when
    ``gen_new_country_tag_for_bi_10`` is true, any other place whose original
    id is below 10 is labelled ``country_bi_10``.

    Results go into the module-level ``labels`` and ``label_set``. The
    module-level progress bar ``bar`` is advanced once per row, so this must be
    called while a ``with tqdm(...) as bar`` block is active.

    Args:
        df: The place table; its first column is the original id.
        gen_new_country_tag_for_bi_10: Enables the ``country_bi_10`` label.
    """
    id_column = df.columns[0]
    to_uni_id = switch_namespace[get_namespace(id_column)]
    place_rows = df.select([pl.col(id_column), pl.col("name"), pl.col(":TYPE")])
    for raw_id, raw_name, raw_type in place_rows.rows():
        origin_id = int(raw_id)
        name = str(raw_name)
        uni_id = to_uni_id[origin_id]
        if name in ("China", "India"):
            label = name
        elif gen_new_country_tag_for_bi_10 and origin_id < 10:
            label = "country_bi_10"
        else:
            label = str(raw_type)
        labels[uni_id] = label
        label_set.add(label)
        bar.update(1)


def normal_op(df: DataFrame):
    """Assign a label to every row of a non-place vertex table.

    The label is the row's ``:TYPE`` value when the table has such a column and
    its ``:LABEL`` value otherwise. Results go into the module-level ``labels``
    and ``label_set``; the module-level progress bar ``bar`` is advanced once
    per row.

    Args:
        df: A vertex table; its first column is the original id.
    """
    id_column = df.columns[0]
    label_column = ":TYPE" if ":TYPE" in df.columns else ":LABEL"
    to_uni_id = switch_namespace[get_namespace(id_column)]
    id_and_label = df.select([pl.col(id_column), pl.col(label_column)])
    for raw_id, raw_label in id_and_label.rows():
        label = str(raw_label)
        labels[to_uni_id[int(raw_id)]] = label
        label_set.add(label)
        bar.update(1)


def vertex_op(df_name: str, df: DataFrame, gen_new_country_tag_for_bi_10: bool = False):
    """Label one vertex table, routing the ``place`` table to ``place_op``.

    Args:
        df_name: Table name; only ``"place"`` gets the special treatment.
        df: The vertex table.
        gen_new_country_tag_for_bi_10: Forwarded to ``place_op``; ignored for
            every other table.
    """
    if df_name == "place":
        place_op(df, gen_new_country_tag_for_bi_10)
    else:
        normal_op(df)


# Label every vertex table. The progress bar has to be bound to the name `bar`:
# `place_op` and `normal_op` advance it through that module-level name.
with tqdm(desc="Build map of `vertex.uni_id -> label`", total=vertex_num) as bar:
    for df_name, df in vertices.items():
        vertex_op(df_name, df)

# Display the distinct labels that were assigned.
label_set

Build map of `vertex.uni_id -> label`:   0%|          | 0/3181724 [00:00<?, ?it/s]

{'China',
 'India',
 'city',
 'comment',
 'company',
 'continent',
 'country',
 'forum',
 'person',
 'post',
 'tag',
 'tagclass',
 'university'}

In [9]:
""" Build edges in format: `(src_id, dst_id)` """

# A set: an ordered (src, dst) pair that appears in several edge tables is kept once.
edges = set[tuple[int, int]]()

with tqdm(desc="Build edges in format: `(src_id, dst_id)`", total=edge_num) as bar:
    for df in raw_edges.values():
        # The first two column headers carry the namespaces of the endpoints.
        src_namespace = get_namespace(df.columns[0])
        dst_namespace = get_namespace(df.columns[1])
        src_map = switch_namespace[src_namespace]
        dst_map = switch_namespace[dst_namespace]
        # Deliberately not called `slice`: at cell level that name would shadow
        # the builtin `slice` for every cell executed afterwards.
        endpoint_ids = df.select(
            [
                pl.col(df.columns[0]),
                pl.col(df.columns[1]),
            ]
        )
        for src_id, dst_id in endpoint_ids.rows():
            # Translate both endpoints from original ids to global ids.
            src_uni_id = src_map[int(src_id)]
            dst_uni_id = dst_map[int(dst_id)]
            edges.add((src_uni_id, dst_uni_id))
            bar.update(1)

Build edges in format: `(src_id, dst_id)`:   0%|          | 0/17256038 [00:00<?, ?it/s]

In [10]:
""" Write into `data_graph.txt` """

# File layout: `#0`, vertex count, one label per line in global-id order, edge
# count, one `src dst` pair per line.
if os.path.exists(DATA_GRAPH):
    print(f"File `{DATA_GRAPH}` already exists")
else:
    # The existence of DATA_GRAPH is what marks the graph as built, so write to a
    # temporary sibling and rename it only once it is complete. An interrupted
    # write then cannot leave a truncated file that later runs would reuse.
    tmp_data_graph = f"{DATA_GRAPH}.tmp"
    with open(tmp_data_graph, "w") as f:
        f.write("#0\n")
        f.write(f"{len(labels)}\n")
        with tqdm(
            desc="Writing `labels` into `data_graph.txt`", total=len(labels)
        ) as bar:
            # Global ids are dense from 0, so indexing by position yields id order.
            for i in range(len(labels)):
                f.write(f"{labels[i]}\n")
                bar.update(1)
        f.write(f"{len(edges)}\n")
        with tqdm(desc="Writing `edges` into `data_graph.txt", total=len(edges)) as bar:
            for src, dst in edges:
                f.write(f"{src} {dst}\n")
                bar.update(1)
    os.replace(tmp_data_graph, DATA_GRAPH)

File `./out/original/data_graph.txt` already exists


In [11]:
""" Build `India` and `China` query graph """

# Query pattern: vertex 0 is the country, 1-3 are cities pointing at it, 4-6 are
# persons (one per city), and the three persons form a triangle.
china_query_graph_labels = ["China"] + 3 * ["city"] + 3 * ["person"]
china_query_graph_edges = (
    [(city, 0) for city in (1, 2, 3)]
    + [(person, person - 3) for person in (4, 5, 6)]
    + [(4, 5), (5, 6), (6, 4)]
)

# Same shape for India; only the label of vertex 0 differs.
india_query_graph_labels = ["India"] + 3 * ["city"] + 3 * ["person"]
india_query_graph_edges = china_query_graph_edges

for graph_path, graph_labels, graph_edges in (
    (CHINA_QUERY_GRAPH, china_query_graph_labels, china_query_graph_edges),
    (INDIA_QUERY_GRAPH, india_query_graph_labels, india_query_graph_edges),
):
    # An existing query graph file is left untouched.
    if os.path.exists(graph_path):
        continue
    with open(graph_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(graph_labels)}\n")
        f.writelines(f"{label}\n" for label in graph_labels)
        f.write(f"{len(graph_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in graph_edges)

## Optimized Query


In [12]:
# Output locations for the "optimized" (with index edges) variant of the query.
# NOTE: DATA_GRAPH, CHINA_QUERY_GRAPH and INDIA_QUERY_GRAPH are re-bound here, so
# any earlier cell that uses these names targets the optimized directory if it is
# re-run after this one.
OPTIMIZED_QUERY_PREFIX = f"{OUT_PREFIX}/optimized"
DATA_GRAPH = f"{OPTIMIZED_QUERY_PREFIX}/data_graph.txt"
CHINA_QUERY_GRAPH = f"{OPTIMIZED_QUERY_PREFIX}/china_query_graph.txt"
INDIA_QUERY_GRAPH = f"{OPTIMIZED_QUERY_PREFIX}/india_query_graph.txt"

# Idempotent creation; no separate existence check needed.
os.makedirs(OPTIMIZED_QUERY_PREFIX, exist_ok=True)

In [13]:
""" Load all `index edge` """

# Index-edge table name (file name up to the first dot) -> table.
index_edges: dict[str, DataFrame] = {
    os.path.basename(csv_path).split(".")[0]: pl.read_csv(csv_path)
    for csv_path in glob.glob(f"{INDEX_EDGES_PREFIX}/*.csv")
}

# Row total, used as the progress-bar total when the index edges are added.
index_edge_num = sum(map(len, index_edges.values()))

In [14]:
""" Add `index edge` into `edges` """

# NOTE: this adds to the `edges` set in place; from here on it holds whatever it
# held before plus the index edges.
with tqdm(desc="Adding `index edge` into `edges`", total=index_edge_num) as bar:
    for df in index_edges.values():
        # The first two column headers carry the namespaces of the endpoints.
        src_namespace = get_namespace(df.columns[0])
        dst_namespace = get_namespace(df.columns[1])
        src_map = switch_namespace[src_namespace]
        dst_map = switch_namespace[dst_namespace]
        # Deliberately not called `slice`: at cell level that name would shadow
        # the builtin `slice` for every cell executed afterwards.
        endpoint_ids = df.select(
            [
                pl.col(df.columns[0]),
                pl.col(df.columns[1]),
            ]
        )
        for src_id, dst_id in endpoint_ids.rows():
            # Translate both endpoints from original ids to global ids.
            src_uni_id = src_map[int(src_id)]
            dst_uni_id = dst_map[int(dst_id)]
            edges.add((src_uni_id, dst_uni_id))
            bar.update(1)

Adding `index edge` into `edges`:   0%|          | 0/19593582 [00:00<?, ?it/s]

In [15]:
""" Write into `data_graph.txt` """

# File layout: `#0`, vertex count, one label per line in global-id order, edge
# count, one `src dst` pair per line.
if os.path.exists(DATA_GRAPH):
    print(f"File `{DATA_GRAPH}` already exists")
else:
    # The existence of DATA_GRAPH is what marks the graph as built, so write to a
    # temporary sibling and rename it only once it is complete. An interrupted
    # write then cannot leave a truncated file that later runs would reuse.
    tmp_data_graph = f"{DATA_GRAPH}.tmp"
    with open(tmp_data_graph, "w") as f:
        f.write("#0\n")
        f.write(f"{len(labels)}\n")
        with tqdm(
            desc="Writing `labels` into `data_graph.txt`", total=len(labels)
        ) as bar:
            # Global ids are dense from 0, so indexing by position yields id order.
            for i in range(len(labels)):
                f.write(f"{labels[i]}\n")
                bar.update(1)
        f.write(f"{len(edges)}\n")
        with tqdm(desc="Writing `edges` into `data_graph.txt", total=len(edges)) as bar:
            for src, dst in edges:
                f.write(f"{src} {dst}\n")
                bar.update(1)
    os.replace(tmp_data_graph, DATA_GRAPH)

File `./out/optimized/data_graph.txt` already exists


In [16]:
""" Build `India` and `China` query graph """

# Query pattern: vertex 0 is the country, 1-3 are persons pointing straight at it,
# and the three persons form a triangle.
china_query_graph_labels = ["China"] + 3 * ["person"]
china_query_graph_edges = [(person, 0) for person in (1, 2, 3)] + [
    (1, 2),
    (2, 3),
    (3, 1),
]

# Same shape for India; only the label of vertex 0 differs.
india_query_graph_labels = ["India"] + 3 * ["person"]
india_query_graph_edges = china_query_graph_edges

for graph_path, graph_labels, graph_edges in (
    (CHINA_QUERY_GRAPH, china_query_graph_labels, china_query_graph_edges),
    (INDIA_QUERY_GRAPH, india_query_graph_labels, india_query_graph_edges),
):
    # An existing query graph file is left untouched.
    if os.path.exists(graph_path):
        continue
    with open(graph_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(graph_labels)}\n")
        f.writelines(f"{label}\n" for label in graph_labels)
        f.write(f"{len(graph_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in graph_edges)

## Execute `Query`


In [17]:
import subprocess
import platform
from typing import Optional


# Both variants' paths are spelled out under their own names here because
# DATA_GRAPH / CHINA_QUERY_GRAPH / INDIA_QUERY_GRAPH are assigned twice above,
# once per variant.
ORIGINAL_DATA_GRAPH = f"{ORIGINAL_QUERY_PREFIX}/data_graph.txt"
ORIGINAL_CHINA_QUERY_GRAPH = f"{ORIGINAL_QUERY_PREFIX}/china_query_graph.txt"
ORIGINAL_INDIA_QUERY_GRAPH = f"{ORIGINAL_QUERY_PREFIX}/india_query_graph.txt"

OPTIMIZED_DATA_GRAPH = f"{OPTIMIZED_QUERY_PREFIX}/data_graph.txt"
OPTIMIZED_CHINA_QUERY_GRAPH = f"{OPTIMIZED_QUERY_PREFIX}/china_query_graph.txt"
OPTIMIZED_INDIA_QUERY_GRAPH = f"{OPTIMIZED_QUERY_PREFIX}/india_query_graph.txt"


# Matcher output is stored per variant under ./log.
LOG_PREFIX = "./log"
ORIGINAL_LOG_PREFIX = f"{LOG_PREFIX}/original"
OPTIMIZED_LOG_PREFIX = f"{LOG_PREFIX}/optimized"

# Idempotent creation; no separate existence checks needed.
os.makedirs(ORIGINAL_LOG_PREFIX, exist_ok=True)
os.makedirs(OPTIMIZED_LOG_PREFIX, exist_ok=True)

ORIGINAL_CHINA_MATCH_RESULT = f"{ORIGINAL_LOG_PREFIX}/china_match_result.txt"
ORIGINAL_INDIA_MATCH_RESULT = f"{ORIGINAL_LOG_PREFIX}/india_match_result.txt"

OPTIMIZED_CHINA_MATCH_RESULT = f"{OPTIMIZED_LOG_PREFIX}/china_match_result.txt"
OPTIMIZED_INDIA_MATCH_RESULT = f"{OPTIMIZED_LOG_PREFIX}/india_match_result.txt"

# On Windows the matcher binary is launched through `wsl`.
wsl_if_on_windows = ["wsl"] if platform.system() == "Windows" else []


def _veq_match_args(data_graph: str, query_graph: str) -> list[str]:
    """Build the argv for `./VEQ_M_100k -dg <data_graph_path> -qg <query_graph_path>`."""
    return wsl_if_on_windows + ["./VEQ_M_100k", "-dg", data_graph, "-qg", query_graph]


original_china_match_args = _veq_match_args(
    ORIGINAL_DATA_GRAPH, ORIGINAL_CHINA_QUERY_GRAPH
)
original_india_match_args = _veq_match_args(
    ORIGINAL_DATA_GRAPH, ORIGINAL_INDIA_QUERY_GRAPH
)

optimized_china_match_args = _veq_match_args(
    OPTIMIZED_DATA_GRAPH, OPTIMIZED_CHINA_QUERY_GRAPH
)
optimized_india_match_args = _veq_match_args(
    OPTIMIZED_DATA_GRAPH, OPTIMIZED_INDIA_QUERY_GRAPH
)


def _parse_processing_time(last_line: str, origin: str) -> float:
    """Return the number that ends ``last_line``, e.g. ``Processing Time (ms): 6522.3``.

    Raises:
        ValueError: If ``last_line`` is empty or does not end in a number.
    """
    tokens = last_line.split()
    try:
        return float(tokens[-1])
    except (IndexError, ValueError):
        raise ValueError(
            f"Cannot read a processing time from the last line of `{origin}`: "
            f"{last_line!r}. Delete the file to re-run the task."
        ) from None


def run(
    result_path: str,
    task_name: str,
    args: list[str],
    time_table: Optional[list[float]] = None,
):
    """Run a matcher command once, log its stdout, and record its processing time.

    The command's stdout is echoed (indented) and stored in ``result_path``.
    If ``result_path`` already exists the command is not run again; the stored
    output is used instead. In both cases, when ``time_table`` is given, the
    number at the end of the last output line is appended to it, so the table
    gets one entry per call whether or not the result was cached.

    Args:
        result_path: File that stores the command's stdout and acts as the cache.
        task_name: Name shown in the progress messages.
        args: Command line to execute.
        time_table: Optional list that receives the processing time.

    Raises:
        OSError: If the command cannot be started. No result file is left behind.
        ValueError: If ``time_table`` is given and the last output line does not
            end in a number.
    """
    if os.path.exists(result_path):
        print(f"File `{result_path}` already exists")
        if time_table is not None:
            # Reuse the cached log so a re-run still fills the time table.
            with open(result_path, encoding="utf-8") as cached:
                cached_lines = cached.read().splitlines()
            last_cached_line = cached_lines[-1] if cached_lines else ""
            time_table.append(_parse_processing_time(last_cached_line, result_path))
        return

    print(f">>> Running: {task_name}...")
    content = ""
    # Write to a temporary sibling and rename on success: the existence of
    # `result_path` marks the task as done, so a failed or interrupted run must
    # not leave that file behind.
    tmp_path = f"{result_path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            # stderr was never consumed; sending it to DEVNULL instead of an
            # unread pipe keeps the child from blocking once the pipe fills up.
            with subprocess.Popen(
                args,
                stdout=subprocess.PIPE,
                stderr=subprocess.DEVNULL,
            ) as p:
                for line in iter(p.stdout.readline, b""):
                    content = line.decode("utf-8")
                    print("    " + content, end="")
                    f.write(content)
            if not content:
                # Every line read contains at least a newline, so an empty
                # `content` means the command printed nothing at all.
                print("    <No output>")
                f.write("<No output>")
        os.replace(tmp_path, result_path)
    finally:
        # After a successful rename there is nothing left to remove.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
    print("<<< Done!")
    if time_table is not None:
        time_table.append(_parse_processing_time(content, result_path))

In [18]:
""" Exec `match` on `original` """

# Times are appended in call order: China first, then India.
original_time_table = []

for result_path, task_name, match_args in (
    (ORIGINAL_CHINA_MATCH_RESULT, "original_china_match", original_china_match_args),
    (ORIGINAL_INDIA_MATCH_RESULT, "original_india_match", original_india_match_args),
):
    run(result_path, task_name, match_args, original_time_table)

""" Exec `match` on `optimized` """

optimized_time_table = []

for result_path, task_name, match_args in (
    (OPTIMIZED_CHINA_MATCH_RESULT, "optimized_china_match", optimized_china_match_args),
    (OPTIMIZED_INDIA_MATCH_RESULT, "optimized_india_match", optimized_india_match_args),
):
    run(result_path, task_name, match_args, optimized_time_table)

>>> Running: original_china_match...
    Data file: ./out/original/data_graph.txt
    Query file: ./out/original/china_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3748
    Total Recursive Call Count: 131167
    Number of Matches: 100000
    Filtering Time (ms): 17.288
    Verification Time (ms): 6505.01
    Processing Time (ms): 6522.3
<<< Done!
>>> Running: original_india_match...
    Data file: ./out/original/data_graph.txt
    Query file: ./out/original/india_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3871
    Total Recursive Call Count: 131701
    Number of Matches: 100000
    Filtering Time (ms): 22.8355
    Verification Time (ms): 6548.97
    Processing Time (ms): 6571.8
<<< Done!
>>> Running: optimized_china_match...
    Data file: ./out/optimized/data_graph.txt
    Query file: ./out/optimized/china_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3157
    Total Recursive Call Count: 17617
    Number of Matches: 100000
    Filtering Time (ms): 20.3857
    

In [19]:
""" Show BI-11 `comparison data-frame` """

print(
    "Comparison between: `original_china/india_match` & `optimized_china/india_match`"
)

# One row per task; the task order matches the order in which the runs were executed.
bi_11_comparison = {
    "task": ["china_match", "india_match"],
    "original (ms)": original_time_table,
    "optimized (ms)": optimized_time_table,
}
df = pl.DataFrame(bi_11_comparison)
df

Comparison between: `original_china/india_match` & `optimized_china/india_match`


task,original (ms),optimized (ms)
str,f64,f64
"""china_match""",6522.3,2894.08
"""india_match""",6571.8,2921.18


# BI 10


In [20]:
# Short aliases for the per-variant query-graph and log directories.
ORIGINAL_Q_PRE = ORIGINAL_QUERY_PREFIX
OPTIMIZED_Q_PRE = OPTIMIZED_QUERY_PREFIX
ORIGINAL_L_PRE = ORIGINAL_LOG_PREFIX
OPTIMIZED_L_PRE = OPTIMIZED_LOG_PREFIX

DG = "data_graph.txt"

# File names only; each query graph (`*_QG`) is listed next to the file that
# receives its result (`*_RES`).
SHORT_CHINA_POST_QG = "short_china_post_query_graph.txt"
SHORT_CHINA_POST_RES = "short_china_post_result.txt"
LONG_CHINA_POST_QG = "long_china_post_query_graph.txt"
LONG_CHINA_POST_RES = "long_china_post_result.txt"
SHORT_CHINA_COMMENT_QG = "short_china_comment_query_graph.txt"
SHORT_CHINA_COMMENT_RES = "short_china_comment_result.txt"
LONG_CHINA_COMMENT_QG = "long_china_comment_query_graph.txt"
LONG_CHINA_COMMENT_RES = "long_china_comment_result.txt"

SHORT_INDIA_POST_QG = "short_india_post_query_graph.txt"
SHORT_INDIA_POST_RES = "short_india_post_result.txt"
LONG_INDIA_POST_QG = "long_india_post_query_graph.txt"
LONG_INDIA_POST_RES = "long_india_post_result.txt"
SHORT_INDIA_COMMENT_QG = "short_india_comment_query_graph.txt"
SHORT_INDIA_COMMENT_RES = "short_india_comment_result.txt"
LONG_INDIA_COMMENT_QG = "long_india_comment_query_graph.txt"
LONG_INDIA_COMMENT_RES = "long_india_comment_result.txt"

## Original


In [21]:
""" labels & edges """

# Vertex positions: 0 country, 1 city, 2 post/comment, 3-4 tags, 5 tagclass,
# 6-9 persons (10 as well in the long variant). The short pattern links
# country-city-person(6)-message-tags-tagclass and chains the persons 6-7-8-9.
original_short_edges = [
    (0, 1),
    (1, 6),
    (6, 2),
    (2, 3),
    (2, 4),
    (4, 5),
    (6, 7),
    (7, 8),
    (8, 9),
]
# The long pattern extends the person chain by one hop.
original_long_edges = [*original_short_edges, (9, 10)]

# Short label lists for every (country, message kind) combination.
(
    original_short_china_post_labels,
    original_short_china_comment_labels,
    original_short_india_post_labels,
    original_short_india_comment_labels,
) = (
    [country, "city", message, "tag", "tag", "tagclass"] + ["person"] * 4
    for country in ("China", "India")
    for message in ("post", "comment")
)

# Each long variant has one more person at the end.
original_long_china_post_labels = [*original_short_china_post_labels, "person"]
original_long_china_comment_labels = [*original_short_china_comment_labels, "person"]
original_long_india_post_labels = [*original_short_india_post_labels, "person"]
original_long_india_comment_labels = [*original_short_india_comment_labels, "person"]

In [22]:
""" Init Original Query Graph """

# (file name, labels, edges) of every original BI-10 query graph.
original_query_graphs = [
    (SHORT_CHINA_POST_QG, original_short_china_post_labels, original_short_edges),
    (LONG_CHINA_POST_QG, original_long_china_post_labels, original_long_edges),
    (SHORT_CHINA_COMMENT_QG, original_short_china_comment_labels, original_short_edges),
    (LONG_CHINA_COMMENT_QG, original_long_china_comment_labels, original_long_edges),
    (SHORT_INDIA_POST_QG, original_short_india_post_labels, original_short_edges),
    (LONG_INDIA_POST_QG, original_long_india_post_labels, original_long_edges),
    (SHORT_INDIA_COMMENT_QG, original_short_india_comment_labels, original_short_edges),
    (LONG_INDIA_COMMENT_QG, original_long_india_comment_labels, original_long_edges),
]

for qg_file, qg_labels, qg_edges in original_query_graphs:
    qg_path = f"{ORIGINAL_Q_PRE}/{qg_file}"
    # An existing query graph file is left untouched.
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

In [23]:
""" args """

# One argv per query graph, all against the original data graph. The targets
# on the left line up one-to-one with the query graph names on the right.
(
    original_short_china_post_args,
    original_long_china_post_args,
    original_short_china_comment_args,
    original_long_china_comment_args,
    original_short_india_post_args,
    original_long_india_post_args,
    original_short_india_comment_args,
    original_long_india_comment_args,
) = (
    wsl_if_on_windows
    + ["./VEQ_M_100k", "-dg", ORIGINAL_DATA_GRAPH, "-qg", f"{ORIGINAL_Q_PRE}/{qg_file}"]
    for qg_file in (
        SHORT_CHINA_POST_QG,
        LONG_CHINA_POST_QG,
        SHORT_CHINA_COMMENT_QG,
        LONG_CHINA_COMMENT_QG,
        SHORT_INDIA_POST_QG,
        LONG_INDIA_POST_QG,
        SHORT_INDIA_COMMENT_QG,
        LONG_INDIA_COMMENT_QG,
    )
)

In [24]:
""" exec """

# Times are appended in the order of the task list below.
new_original_time_table = []

for result_file, task_name, task_args in (
    (SHORT_CHINA_POST_RES, "original_short_china_post", original_short_china_post_args),
    (LONG_CHINA_POST_RES, "original_long_china_post", original_long_china_post_args),
    (SHORT_CHINA_COMMENT_RES, "original_short_china_comment", original_short_china_comment_args),
    (LONG_CHINA_COMMENT_RES, "original_long_china_comment", original_long_china_comment_args),
    (SHORT_INDIA_POST_RES, "original_short_india_post", original_short_india_post_args),
    (LONG_INDIA_POST_RES, "original_long_india_post", original_long_india_post_args),
    (SHORT_INDIA_COMMENT_RES, "original_short_india_comment", original_short_india_comment_args),
    (LONG_INDIA_COMMENT_RES, "original_long_india_comment", original_long_india_comment_args),
):
    run(
        f"{ORIGINAL_L_PRE}/{result_file}",
        task_name,
        task_args,
        new_original_time_table,
    )

>>> Running: original_short_china_post...
    Data file: ./out/original/data_graph.txt
    Query file: ./out/original/short_china_post_query_graph.txt
    Output file: 
    Sum of |C(u)|: 68294
    Total Recursive Call Count: 7
    Number of Matches: 100176
    Filtering Time (ms): 258.38
    Verification Time (ms): 534.871
    Processing Time (ms): 793.251
<<< Done!
>>> Running: original_long_china_post...
    Data file: ./out/original/data_graph.txt
    Query file: ./out/original/long_china_post_query_graph.txt
    Output file: 
    Sum of |C(u)|: 77102
    Total Recursive Call Count: 34
    Number of Matches: 100062
    Filtering Time (ms): 316.849
    Verification Time (ms): 77.8439
    Processing Time (ms): 394.693
<<< Done!
>>> Running: original_short_china_comment...
    Data file: ./out/original/data_graph.txt
    Query file: ./out/original/short_china_comment_query_graph.txt
    Output file: 
    Sum of |C(u)|: 144792
    Total Recursive Call Count: 7
    Number of Matches: 10

## Optimized


In [25]:
""" labels & edges """

# Vertex positions: 0 country, 1 post/comment, 2-3 tags, 4 tagclass, 5-8 persons
# (9 as well in the long variant). The country is linked straight to person 5,
# which is linked to the message; the persons are chained 5-6-7-8.
optimized_short_edges = [
    (0, 5),
    (5, 1),
    (1, 2),
    (1, 3),
    (3, 4),
    (5, 6),
    (6, 7),
    (7, 8),
]
# The long pattern extends the person chain by one hop.
optimized_long_edges = [*optimized_short_edges, (8, 9)]

# Short label lists for every (country, message kind) combination.
(
    optimized_short_china_post_labels,
    optimized_short_china_comment_labels,
    optimized_short_india_post_labels,
    optimized_short_india_comment_labels,
) = (
    [country, message, "tag", "tag", "tagclass"] + ["person"] * 4
    for country in ("China", "India")
    for message in ("post", "comment")
)

# Each long variant has one more person at the end.
optimized_long_china_post_labels = [*optimized_short_china_post_labels, "person"]
optimized_long_china_comment_labels = [*optimized_short_china_comment_labels, "person"]
optimized_long_india_post_labels = [*optimized_short_india_post_labels, "person"]
optimized_long_india_comment_labels = [*optimized_short_india_comment_labels, "person"]

In [26]:
""" Init Optimized Query Graph """

# (file name, labels, edges) of every optimized BI-10 query graph.
optimized_query_graphs = [
    (SHORT_CHINA_POST_QG, optimized_short_china_post_labels, optimized_short_edges),
    (LONG_CHINA_POST_QG, optimized_long_china_post_labels, optimized_long_edges),
    (SHORT_CHINA_COMMENT_QG, optimized_short_china_comment_labels, optimized_short_edges),
    (LONG_CHINA_COMMENT_QG, optimized_long_china_comment_labels, optimized_long_edges),
    (SHORT_INDIA_POST_QG, optimized_short_india_post_labels, optimized_short_edges),
    (LONG_INDIA_POST_QG, optimized_long_india_post_labels, optimized_long_edges),
    (SHORT_INDIA_COMMENT_QG, optimized_short_india_comment_labels, optimized_short_edges),
    (LONG_INDIA_COMMENT_QG, optimized_long_india_comment_labels, optimized_long_edges),
]

for qg_file, qg_labels, qg_edges in optimized_query_graphs:
    qg_path = f"{OPTIMIZED_Q_PRE}/{qg_file}"
    # An existing query graph file is left untouched.
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

In [27]:
""" args """

# One argv per query graph, all against the optimized data graph. The targets
# on the left line up one-to-one with the query graph names on the right.
(
    optimized_short_china_post_args,
    optimized_long_china_post_args,
    optimized_short_china_comment_args,
    optimized_long_china_comment_args,
    optimized_short_india_post_args,
    optimized_long_india_post_args,
    optimized_short_india_comment_args,
    optimized_long_india_comment_args,
) = (
    wsl_if_on_windows
    + ["./VEQ_M_100k", "-dg", OPTIMIZED_DATA_GRAPH, "-qg", f"{OPTIMIZED_Q_PRE}/{qg_file}"]
    for qg_file in (
        SHORT_CHINA_POST_QG,
        LONG_CHINA_POST_QG,
        SHORT_CHINA_COMMENT_QG,
        LONG_CHINA_COMMENT_QG,
        SHORT_INDIA_POST_QG,
        LONG_INDIA_POST_QG,
        SHORT_INDIA_COMMENT_QG,
        LONG_INDIA_COMMENT_QG,
    )
)

In [28]:
""" exec """

# Times are appended in the order of the task list below.
new_optimized_time_table = []

for result_file, task_name, task_args in (
    (SHORT_CHINA_POST_RES, "optimized_short_china_post", optimized_short_china_post_args),
    (LONG_CHINA_POST_RES, "optimized_long_china_post", optimized_long_china_post_args),
    (SHORT_CHINA_COMMENT_RES, "optimized_short_china_comment", optimized_short_china_comment_args),
    (LONG_CHINA_COMMENT_RES, "optimized_long_china_comment", optimized_long_china_comment_args),
    (SHORT_INDIA_POST_RES, "optimized_short_india_post", optimized_short_india_post_args),
    (LONG_INDIA_POST_RES, "optimized_long_india_post", optimized_long_india_post_args),
    (SHORT_INDIA_COMMENT_RES, "optimized_short_india_comment", optimized_short_india_comment_args),
    (LONG_INDIA_COMMENT_RES, "optimized_long_india_comment", optimized_long_india_comment_args),
):
    run(
        f"{OPTIMIZED_L_PRE}/{result_file}",
        task_name,
        task_args,
        new_optimized_time_table,
    )

>>> Running: optimized_short_china_post...
    Data file: ./out/optimized/data_graph.txt
    Query file: ./out/optimized/short_china_post_query_graph.txt
    Output file: 
    Sum of |C(u)|: 68096
    Total Recursive Call Count: 6
    Number of Matches: 100332
    Filtering Time (ms): 281.724
    Verification Time (ms): 556.815
    Processing Time (ms): 838.539
<<< Done!
>>> Running: optimized_long_china_post...
    Data file: ./out/optimized/data_graph.txt
    Query file: ./out/optimized/long_china_post_query_graph.txt
    Output file: 
    Sum of |C(u)|: 76904
    Total Recursive Call Count: 30
    Number of Matches: 101576
    Filtering Time (ms): 356.262
    Verification Time (ms): 57.8331
    Processing Time (ms): 414.095
<<< Done!
>>> Running: optimized_short_china_comment...
    Data file: ./out/optimized/data_graph.txt
    Query file: ./out/optimized/short_china_comment_query_graph.txt
    Output file: 
    Sum of |C(u)|: 144594
    Total Recursive Call Count: 11
    Number of 

In [29]:
""" Show BI-10 `comparison data-frame` """

# The banner names the BI-10 tasks shown in this table; the previous text was the
# BI-11 banner and named tasks that are not part of it.
print(
    "Comparison between: `original_short/long_china/india_post/comment`"
    " & `optimized_short/long_china/india_post/comment`"
)

# One row per task; the task order matches the order in which the runs were executed.
df = pl.DataFrame(
    {
        "task": [
            "short_china_post",
            "long_china_post",
            "short_china_comment",
            "long_china_comment",
            "short_india_post",
            "long_india_post",
            "short_india_comment",
            "long_india_comment",
        ],
        "original (ms)": new_original_time_table,
        "optimized (ms)": new_optimized_time_table,
    }
)
df

Comparison between: `original_china/india_match` & `optimized_china/india_match`


task,original (ms),optimized (ms)
str,f64,f64
"""short_china_po…",793.251,838.539
"""long_china_pos…",394.693,414.095
"""short_china_co…",1245.01,1906.34
"""long_china_com…",943.441,1237.05
"""short_india_po…",401.101,713.489
"""long_india_pos…",441.94,390.757
"""short_india_co…",1308.22,1654.62
"""long_india_com…",1169.87,1133.07
