# Prepare


In [14]:
import polars as pl
import re, os
from tqdm import tqdm

# Root of all input data. Raw CSV tables are read from `raw/`, precomputed
# index-edge CSVs from `index/`.
DATA_PREFIX = "./data"
RAW_DATA_PREFIX = f"{DATA_PREFIX}/raw"
INDEX_EDGES_PREFIX = f"{DATA_PREFIX}/index"

# `exist_ok=True` makes the cell safely re-runnable without a separate
# (race-prone) existence check.
os.makedirs(INDEX_EDGES_PREFIX, exist_ok=True)
os.makedirs(RAW_DATA_PREFIX, exist_ok=True)

In [15]:
""" Function Tools """

import subprocess
from typing import Optional


def run_veq_m_100k(
    result_path: str,
    task_name: str,
    args: list[str],
    time_table: Optional[list[float]] = None,
):
    """Run one matching command and cache its console output in `result_path`.

    If `result_path` already contains output, the command is NOT re-run: the
    last non-empty line of the file is printed and its final space-separated
    token is parsed as the processing time. Otherwise `args` is executed, its
    output is echoed and written to `result_path`, and the time is parsed from
    the last non-empty line the process printed.

    Args:
        result_path: File used both as the log and as the "already done" marker.
        task_name: Human-readable name printed while the command runs.
        args: Full command line passed to `subprocess.Popen`.
        time_table: Optional list; the parsed time is appended to it.

    Raises:
        ValueError: If the last non-empty output line does not end in a number,
            or a time was requested but the command printed nothing.
    """
    if os.path.exists(result_path):
        with open(result_path, "r") as f:
            non_empty_lines = [line for line in f if line.strip() != ""]
        if non_empty_lines:
            print(f"File `{result_path}` already exists")
            last_line = non_empty_lines[-1]
            print(f"    last_line ~> {last_line}")
            cached_time = float(last_line.split(" ")[-1])
            if time_table is not None:
                time_table.append(cached_time)
            return
        # An empty file is what an aborted earlier run leaves behind; treat it
        # as "not cached" instead of crashing on `non_empty_lines[-1]`.
        print(f"File `{result_path}` exists but is empty, re-running")

    last_non_empty_line = ""
    with open(result_path, "w") as f:
        print(f">>> Running: {task_name}...")
        with subprocess.Popen(
            args,
            stdout=subprocess.PIPE,
            # Merge stderr into stdout: a separate stderr pipe that is never
            # read can fill up and block the child process forever.
            stderr=subprocess.STDOUT,
        ) as p:
            if p.stdout:
                for raw_line in iter(p.stdout.readline, b""):
                    content = raw_line.decode("utf-8")
                    print("    " + content, end="")
                    f.write(content)
                    # Track the last non-blank line so a trailing empty line
                    # does not break time parsing (matches the cached path).
                    if content.strip() != "":
                        last_non_empty_line = content
            else:
                print("    <No output>")
                f.write("<No output>")
        print("<<< Done!")
    if time_table is not None:
        if last_non_empty_line == "":
            raise ValueError(
                f"`{task_name}` produced no output to parse a processing time from"
            )
        processing_time = float(last_non_empty_line.split(" ")[-1])
        time_table.append(processing_time)


def get_inner_namespace(col_name: str) -> str:
    """Return the text inside the first `(...)` of `col_name`, or `""` if none.

    Example: `":ID(Forumid)"` -> `"Forumid"`.
    """
    # Raw string: `"\("` in a normal literal is an invalid escape sequence.
    match = re.search(r"\((.*?)\)", col_name)
    return "" if match is None else match.group(1)


def get_namespace(col_name: str) -> str:
    """Return the id namespace for a column header such as `":ID(Forumid)"`.

    Place sub-types (`Country`, `Continent`, `City`) collapse to `"Placeid"`
    and organisation sub-types (`University`, `Company`) collapse to
    `"Organisationid"`; any other parenthesised name is returned unchanged.
    """
    aliases = {
        "Country": "Placeid",
        "Continent": "Placeid",
        "City": "Placeid",
        "University": "Organisationid",
        "Company": "Organisationid",
    }
    inner = get_inner_namespace(col_name)
    return aliases.get(inner, inner)


# demo
get_namespace(":ID(Forumid)")

'Forumid'

In [16]:
""" Test """

# Smoke test: read the place table and show the rows named India / China.
PLACE = f"{RAW_DATA_PREFIX}/place.csv"

df = pl.read_csv(PLACE)
# Filter through the lazy API, then materialise the result.
test_table = df.lazy().filter(pl.col("name").is_in(["India", "China"])).collect()
# Keep only the id, name, type and label columns for display.
out = test_table.select(
    [
        pl.col(":ID(Placeid)"),
        pl.col("name"),
        pl.col(":TYPE"),
        pl.col(":LABEL"),
    ]
)
out.head(5)

:ID(Placeid),name,:TYPE,:LABEL
i64,str,str,str
0,"""India""","""country""","""place"""
1,"""China""","""country""","""place"""


In [17]:
""" Load `vertices/edges` """

import os, glob
from polars import DataFrame

# Vertex tables, keyed by CSV base name (file names without `_`).
vertices = dict[str, DataFrame]()
# Edge tables, keyed by `(src, relationship, dst)` parsed from
# `<src>_<relationship>_<dst>.csv`.
raw_edges = dict[tuple[str, str, str], DataFrame]()
# Per-namespace map `original id -> global id`; filled by later cells.
switch_namespace = dict[str, dict[int, int]]()

# `glob` returns paths in arbitrary order. Global ids are handed out in the
# iteration order of `vertices`, so sort to make them reproducible.
for file in sorted(glob.glob(f"{RAW_DATA_PREFIX}/*.csv")):
    df_name = os.path.basename(file).split(".")[0]
    if "_" in df_name:
        src, relationship, dst = df_name.split("_")
        raw_edges[(src, relationship, dst)] = pl.read_csv(file)
    else:
        vertices[df_name] = pl.read_csv(file)


# Totals are used as progress-bar lengths in later cells.
vertex_num = sum(len(df) for df in vertices.values())
edge_num = sum(len(df) for df in raw_edges.values())

In [18]:
""" Initialize `switch_namespace` """

# Give every vertex table's id namespace (taken from its first column header)
# a fresh, empty id map. Re-running this cell discards earlier mappings.
for df in vertices.values():
    namespace = get_namespace(df.columns[0])
    switch_namespace[namespace] = dict()

switch_namespace

{'Commentid': {},
 'Forumid': {},
 'Organisationid': {},
 'Personid': {},
 'Placeid': {},
 'Postid': {},
 'Tagid': {},
 'TagClassid': {}}

In [19]:
""" Re-arrange all `:ID($AnyNamespace)` """

# Next global ("uni") id to hand out; ids are dense and start at 0.
curr_global_id = 0

with tqdm(desc="Mapping `origin_id` to `uni_id`", total=vertex_num) as bar:
    for df in vertices.values():
        namespace = get_namespace(df.columns[0])
        # Named `id_map`, not `map`: a top-level `map = ...` would replace the
        # builtin `map` for every later cell in this kernel.
        id_map = switch_namespace[namespace]
        for row in df.rows():
            # The first column of each vertex table holds the original id.
            id_map[int(row[0])] = curr_global_id
            curr_global_id += 1
            bar.update(1)

# Every vertex row must have consumed exactly one global id.
assert curr_global_id == vertex_num
set(switch_namespace.keys())

Mapping `origin_id` to `uni_id`: 100%|██████████| 3181724/3181724 [00:03<00:00, 1037593.01it/s]


{'Commentid',
 'Forumid',
 'Organisationid',
 'Personid',
 'Placeid',
 'Postid',
 'TagClassid',
 'Tagid'}

# BI 11


## Original Query


In [20]:
# Output locations for the un-optimized ("original") BI 11 graphs.
OUT_PREFIX = "./out"
ORIGINAL_QUERY_PREFIX = f"{OUT_PREFIX}/original"
BI_11_DG = f"{ORIGINAL_QUERY_PREFIX}/data_graph.txt"
BI_11_CHINA_QG = f"{ORIGINAL_QUERY_PREFIX}/china_query_graph.txt"
BI_11_INDIA_QG = f"{ORIGINAL_QUERY_PREFIX}/india_query_graph.txt"

import os

# `exist_ok=True` replaces the separate existence check and keeps the cell
# re-runnable.
os.makedirs(ORIGINAL_QUERY_PREFIX, exist_ok=True)

In [21]:
""" Build map of `vertex.uni_id -> label` """

# Global vertex id -> label string; filled by `place_op` / `normal_op`.
labels = dict[int, str]()
# Every distinct label seen while filling `labels`.
label_set = set[str]()


def place_op(df: DataFrame, gen_new_country_tag_for_bi_10: bool = False):
    """Record a label for every row of the place vertex table.

    The label is the row's `:TYPE`, except that rows named `China` or `India`
    are labelled with their own name. When `gen_new_country_tag_for_bi_10` is
    set, remaining rows whose original id is below 10 get `country_bi_10`.

    Writes into the module-level `labels` / `label_set` and advances the
    module-level progress bar `bar`, which must exist when this is called.

    Args:
        df: Place table; the first column is the id, plus `name` and `:TYPE`.
        gen_new_country_tag_for_bi_10: Enable the extra `country_bi_10` label.
    """
    namespace = get_namespace(df.columns[0])
    # Local names avoid shadowing the builtins `map` and `slice`.
    id_map = switch_namespace[namespace]
    place_rows = df.select(
        [
            pl.col(df.columns[0]),
            pl.col("name"),
            pl.col(":TYPE"),
        ]
    )
    for origin_id, name, ty in place_rows.rows():
        origin_id, name, label = int(origin_id), str(name), str(ty)
        uni_id = id_map[origin_id]
        if name in ["China", "India"]:
            # The query graphs match these two countries by name.
            label = name
        elif gen_new_country_tag_for_bi_10 and origin_id < 10:
            label = "country_bi_10"
        labels[uni_id] = label
        label_set.add(label)
        bar.update(1)


def normal_op(df: DataFrame):
    """Record a label for every row of a non-place vertex table.

    The label is taken from the `:TYPE` column when the table has one,
    otherwise from `:LABEL`. Writes into the module-level `labels` /
    `label_set` and advances the module-level progress bar `bar`, which must
    exist when this is called.

    Args:
        df: Vertex table whose first column is the original id.
    """
    namespace = get_namespace(df.columns[0])
    # Local names avoid shadowing the builtins `map` and `slice`.
    id_map = switch_namespace[namespace]
    label_rows = df.select(
        [
            pl.col(df.columns[0]),
            pl.col(":TYPE" if ":TYPE" in df.columns else ":LABEL"),
        ]
    )
    for origin_id, label in label_rows.rows():
        uni_id = id_map[int(origin_id)]
        labels[uni_id] = str(label)
        label_set.add(str(label))
        bar.update(1)


def vertex_op(df_name: str, df: DataFrame, gen_new_country_tag_for_bi_10: bool = False):
    """Label one vertex table: `place` gets `place_op`, the rest `normal_op`."""
    if df_name == "place":
        place_op(df, gen_new_country_tag_for_bi_10)
    else:
        normal_op(df)


# `bar` must be a module-level name here: `place_op` and `normal_op` call
# `bar.update(1)` on it rather than receiving it as an argument.
with tqdm(desc="Build map of `vertex.uni_id -> label`", total=vertex_num) as bar:
    for df_name, df in vertices.items():
        vertex_op(df_name, df)

label_set

Build map of `vertex.uni_id -> label`: 100%|██████████| 3181724/3181724 [00:02<00:00, 1553599.61it/s]


{'China',
 'India',
 'city',
 'comment',
 'company',
 'continent',
 'country',
 'forum',
 'person',
 'post',
 'tag',
 'tagclass',
 'university'}

In [22]:
""" Build edges in format: `(src_id, dst_id)` """

# Directed edges as `(src_uni_id, dst_uni_id)`; a set, so duplicates collapse.
edges = set[tuple[int, int]]()

with tqdm(desc="Build edges in format: `(src_id, dst_id)`", total=edge_num) as bar:
    for df in raw_edges.values():
        # The first two column headers name the source / destination namespaces.
        src_namespace = get_namespace(df.columns[0])
        dst_namespace = get_namespace(df.columns[1])
        src_map = switch_namespace[src_namespace]
        dst_map = switch_namespace[dst_namespace]
        # Named `endpoints`, not `slice`, so the builtin is not rebound globally.
        endpoints = df.select(
            [
                pl.col(df.columns[0]),
                pl.col(df.columns[1]),
            ]
        )
        for src_id, dst_id in endpoints.rows():
            src_uni_id = src_map[int(src_id)]
            dst_uni_id = dst_map[int(dst_id)]
            edges.add((src_uni_id, dst_uni_id))
            bar.update(1)

Build edges in format: `(src_id, dst_id)`: 100%|██████████| 17256038/17256038 [00:17<00:00, 989369.78it/s] 


In [23]:
""" Write into `data_graph.txt` """

# File layout: `#0`, vertex count, one label per vertex (in id order),
# edge count, then one `src dst` pair per line.
if not os.path.exists(BI_11_DG):
    # Write to a temporary sibling and rename at the end: an interrupted write
    # would otherwise leave a truncated file that the check above treats as
    # a finished graph on the next run.
    data_graph_tmp = f"{BI_11_DG}.tmp"
    with open(data_graph_tmp, "w") as f:
        f.write("#0\n")
        f.write(f"{len(labels)}\n")
        with tqdm(
            desc="Writing `labels` into `data_graph.txt`", total=len(labels)
        ) as bar:
            # `labels` is keyed by the dense ids 0..len(labels)-1.
            for i in range(len(labels)):
                f.write(f"{labels[i]}\n")
                bar.update(1)
        f.write(f"{len(edges)}\n")
        with tqdm(desc="Writing `edges` into `data_graph.txt", total=len(edges)) as bar:
            for src, dst in edges:
                f.write(f"{src} {dst}\n")
                bar.update(1)
    os.replace(data_graph_tmp, BI_11_DG)
else:
    print(f"File `{BI_11_DG}` already exists")

File `./out/original/data_graph.txt` already exists


In [24]:
""" Build `India` and `China` query graph """

# Vertex 0 is the country, 1-3 are cities, 4-6 are persons.
china_query_graph_labels = ["China"] + ["city"] * 3 + ["person"] * 3
china_query_graph_edges = [
    # each city -> country
    (1, 0),
    (2, 0),
    (3, 0),
    # each person -> its own city
    (4, 1),
    (5, 2),
    (6, 3),
    # the three persons form a cycle
    (4, 5),
    (5, 6),
    (6, 4),
]

india_query_graph_labels = ["India"] + ["city"] * 3 + ["person"] * 3
india_query_graph_edges = china_query_graph_edges

# One loop instead of a copy-pasted stanza per country; each file is only
# written when it does not exist yet.
for qg_path, qg_labels, qg_edges in (
    (BI_11_CHINA_QG, china_query_graph_labels, china_query_graph_edges),
    (BI_11_INDIA_QG, india_query_graph_labels, india_query_graph_edges),
):
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

## Optimized Query


In [25]:
# Output locations for the index-edge ("optimized") BI 11 graphs.
OPTIMIZED_QUERY_PREFIX = f"{OUT_PREFIX}/optimized"
BI_11_DG_OPTIMIZED = f"{OPTIMIZED_QUERY_PREFIX}/data_graph.txt"
BI_11_CHINA_QG_OPTIMIZED = f"{OPTIMIZED_QUERY_PREFIX}/china_query_graph.txt"
BI_11_INDIA_QG_OPTIMIZED = f"{OPTIMIZED_QUERY_PREFIX}/india_query_graph.txt"

# `exist_ok=True` replaces the separate existence check.
os.makedirs(OPTIMIZED_QUERY_PREFIX, exist_ok=True)

In [26]:
""" Load all `index edge` """

# Index-edge tables, keyed by CSV base name.
index_edges = dict[str, DataFrame]()

for file in glob.glob(f"{INDEX_EDGES_PREFIX}/*.csv"):
    df_name = os.path.basename(file).split(".")[0]
    index_edges[df_name] = pl.read_csv(file)

# Total row count; used as the progress-bar length when the edges are added.
index_edge_num = sum(len(df) for df in index_edges.values())

In [27]:
""" Add `index edge` into `edges` """

# NOTE: this extends the shared `edges` set in place. The cell that writes
# the original data graph reads the same set, so it must run before this one;
# re-running it afterwards would include the index edges.
with tqdm(desc="Adding `index edge` into `edges`", total=index_edge_num) as bar:
    for df in index_edges.values():
        # The first two column headers name the source / destination namespaces.
        src_namespace = get_namespace(df.columns[0])
        dst_namespace = get_namespace(df.columns[1])
        src_map = switch_namespace[src_namespace]
        dst_map = switch_namespace[dst_namespace]
        # Named `endpoints`, not `slice`, so the builtin is not rebound globally.
        endpoints = df.select(
            [
                pl.col(df.columns[0]),
                pl.col(df.columns[1]),
            ]
        )
        for src_id, dst_id in endpoints.rows():
            src_uni_id = src_map[int(src_id)]
            dst_uni_id = dst_map[int(dst_id)]
            edges.add((src_uni_id, dst_uni_id))
            bar.update(1)

Adding `index edge` into `edges`: 100%|██████████| 19593582/19593582 [00:21<00:00, 926901.33it/s] 


In [28]:
""" Write into `data_graph.txt` """

# File layout: `#0`, vertex count, one label per vertex (in id order),
# edge count, then one `src dst` pair per line.
if not os.path.exists(BI_11_DG_OPTIMIZED):
    # Write to a temporary sibling and rename at the end: an interrupted write
    # would otherwise leave a truncated file that the check above treats as
    # a finished graph on the next run.
    optimized_graph_tmp = f"{BI_11_DG_OPTIMIZED}.tmp"
    with open(optimized_graph_tmp, "w") as f:
        f.write("#0\n")
        f.write(f"{len(labels)}\n")
        with tqdm(
            desc="Writing `labels` into `data_graph.txt`", total=len(labels)
        ) as bar:
            # `labels` is keyed by the dense ids 0..len(labels)-1.
            for i in range(len(labels)):
                f.write(f"{labels[i]}\n")
                bar.update(1)
        f.write(f"{len(edges)}\n")
        with tqdm(desc="Writing `edges` into `data_graph.txt", total=len(edges)) as bar:
            for src, dst in edges:
                f.write(f"{src} {dst}\n")
                bar.update(1)
    os.replace(optimized_graph_tmp, BI_11_DG_OPTIMIZED)
else:
    print(f"File `{BI_11_DG_OPTIMIZED}` already exists")

File `./out/optimized/data_graph.txt` already exists


In [29]:
""" Build `India` and `China` query graph """

# Optimized shape: no city vertices. Vertex 0 is the country, 1-3 are persons.
china_query_graph_labels = ["China"] + ["person"] * 3
china_query_graph_edges = [
    # each person -> country
    (1, 0),
    (2, 0),
    (3, 0),
    # the three persons form a cycle
    (1, 2),
    (2, 3),
    (3, 1),
]

india_query_graph_labels = ["India"] + ["person"] * 3
india_query_graph_edges = china_query_graph_edges

# One loop instead of a copy-pasted stanza per country; each file is only
# written when it does not exist yet.
for qg_path, qg_labels, qg_edges in (
    (BI_11_CHINA_QG_OPTIMIZED, china_query_graph_labels, china_query_graph_edges),
    (BI_11_INDIA_QG_OPTIMIZED, india_query_graph_labels, india_query_graph_edges),
):
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

## Execute `Query`


In [30]:
# Data / query graph paths, rebuilt from the two query prefixes.
BI_11_DG = f"{ORIGINAL_QUERY_PREFIX}/data_graph.txt"
BI_11_CHINA_QG = f"{ORIGINAL_QUERY_PREFIX}/china_query_graph.txt"
BI_11_INDIA_QG = f"{ORIGINAL_QUERY_PREFIX}/india_query_graph.txt"
BI_11_DG_OPTIMIZED = f"{OPTIMIZED_QUERY_PREFIX}/data_graph.txt"
BI_11_CHINA_QG_OPTIMIZED = f"{OPTIMIZED_QUERY_PREFIX}/china_query_graph.txt"
BI_11_INDIA_QG_OPTIMIZED = f"{OPTIMIZED_QUERY_PREFIX}/india_query_graph.txt"


# Root directory for the captured matcher output.
LOG_PREFIX = "./log"

# One log sub-tree per variant, with a `BI_11` directory under each.
ORIGINAL_LOG_PREFIX = f"{LOG_PREFIX}/original"
OPTIMIZED_LOG_PREFIX = f"{LOG_PREFIX}/optimized"
BI_11_ORIGINAL_LOG_PRE = f"{ORIGINAL_LOG_PREFIX}/BI_11"
BI_11_OPTIMIZED_LOG_PRE = f"{OPTIMIZED_LOG_PREFIX}/BI_11"



# Result files; `run_veq_m_100k` also uses them as "already ran" markers.
BI_11_ORIGINAL_CHINA_RESULT = f"{BI_11_ORIGINAL_LOG_PRE}/china_match_result.txt"
BI_11_ORIGINAL_INDIA_RESULT = f"{BI_11_ORIGINAL_LOG_PRE}/india_match_result.txt"
BI_11_OPTIMIZED_CHINA_RESULT = f"{BI_11_OPTIMIZED_LOG_PRE}/china_match_result.txt"
BI_11_OPTIMIZED_INDIA_RESULT = f"{BI_11_OPTIMIZED_LOG_PRE}/india_match_result.txt"

In [31]:
# Create both BI 11 log directories; `exist_ok=True` keeps the cell re-runnable
# without a separate existence check.
os.makedirs(BI_11_ORIGINAL_LOG_PRE, exist_ok=True)
os.makedirs(BI_11_OPTIMIZED_LOG_PRE, exist_ok=True)

In [32]:
""" Args """

""" ./VEQ_M_100k -dg <data_graph_path> -qg <query_graph_path> """

import platform

# On Windows the matcher binary is launched through `wsl`.
wsl_if_on_windows = ["wsl"] if platform.system() == "Windows" else []


def _bi_11_match_args(data_graph: str, query_graph: str) -> list[str]:
    """Return the `./VEQ_M_100k -dg <data_graph> -qg <query_graph>` command."""
    return wsl_if_on_windows + [
        "./VEQ_M_100k",
        "-dg",
        data_graph,
        "-qg",
        query_graph,
    ]


original_china_match_args = _bi_11_match_args(BI_11_DG, BI_11_CHINA_QG)
original_india_match_args = _bi_11_match_args(BI_11_DG, BI_11_INDIA_QG)

optimized_china_match_args = _bi_11_match_args(
    BI_11_DG_OPTIMIZED, BI_11_CHINA_QG_OPTIMIZED
)
optimized_india_match_args = _bi_11_match_args(
    BI_11_DG_OPTIMIZED, BI_11_INDIA_QG_OPTIMIZED
)

In [33]:
""" Exec `match` on `original` """

# Times are appended in task order: china first, then india.
original_time_table = []

for result_path, task_name, match_args in (
    (BI_11_ORIGINAL_CHINA_RESULT, "original_china_match", original_china_match_args),
    (BI_11_ORIGINAL_INDIA_RESULT, "original_india_match", original_india_match_args),
):
    run_veq_m_100k(result_path, task_name, match_args, original_time_table)

""" Exec `match` on `optimized` """

optimized_time_table = []

for result_path, task_name, match_args in (
    (BI_11_OPTIMIZED_CHINA_RESULT, "optimized_china_match", optimized_china_match_args),
    (BI_11_OPTIMIZED_INDIA_RESULT, "optimized_india_match", optimized_india_match_args),
):
    run_veq_m_100k(result_path, task_name, match_args, optimized_time_table)

File `./log/original/BI_11/china_match_result.txt` already exists
    last_line ~> Processing Time (ms): 6572.88

File `./log/original/BI_11/india_match_result.txt` already exists
    last_line ~> Processing Time (ms): 6399.13

File `./log/optimized/BI_11/china_match_result.txt` already exists
    last_line ~> Processing Time (ms): 2892.31

File `./log/optimized/BI_11/india_match_result.txt` already exists
    last_line ~> Processing Time (ms): 3315.39



In [34]:
""" Show BI-11 `comparison data-frame` """

print(
    "Comparison between: `original_china/india_match` & `optimized_china/india_match`"
)

# Row order matches the order in which the timing lists were filled.
bi_11_comparison_columns = {
    "task": ["china_match", "india_match"],
    "original (ms)": original_time_table,
    "optimized (ms)": optimized_time_table,
}
df = pl.DataFrame(bi_11_comparison_columns)
df

Comparison between: `original_china/india_match` & `optimized_china/india_match`


task,original (ms),optimized (ms)
str,f64,f64
"""china_match""",6572.88,2892.31
"""india_match""",6399.13,3315.39


# BI 10


In [35]:
# Data graphs: BI 10 reuses the BI 11 data graphs unchanged.
BI_10_DG = BI_11_DG
BI_10_DG_OPTIMIZED = BI_11_DG_OPTIMIZED

# China query graph file names (short / long variant, post / comment variant).
SHORT_CHINA_POST_QG = "short_china_post_query_graph.txt"
SHORT_CHINA_COMMENT_QG = "short_china_comment_query_graph.txt"
LONG_CHINA_POST_QG = "long_china_post_query_graph.txt"
LONG_CHINA_COMMENT_QG = "long_china_comment_query_graph.txt"

# India query graph file names (same four variants).
SHORT_INDIA_POST_QG = "short_india_post_query_graph.txt"
SHORT_INDIA_COMMENT_QG = "short_india_comment_query_graph.txt"
LONG_INDIA_POST_QG = "long_india_post_query_graph.txt"
LONG_INDIA_COMMENT_QG = "long_india_comment_query_graph.txt"

In [36]:
# China result (log) file names, one per query graph variant.
SHORT_CHINA_POST_RES = "short_china_post_result.txt"
SHORT_CHINA_COMMENT_RES = "short_china_comment_result.txt"
LONG_CHINA_POST_RES = "long_china_post_result.txt"
LONG_CHINA_COMMENT_RES = "long_china_comment_result.txt"

In [37]:
# India result (log) file names, one per query graph variant.
SHORT_INDIA_POST_RES = "short_india_post_result.txt"
SHORT_INDIA_COMMENT_RES = "short_india_comment_result.txt"
LONG_INDIA_POST_RES = "long_india_post_result.txt"
LONG_INDIA_COMMENT_RES = "long_india_comment_result.txt"

In [38]:
# Directories: BI 10 query graphs share the BI 11 query directories, while
# its logs go into a separate `BI_10` directory per variant.
BI_10_DIRNAME = "BI_10"
BI_10_ORIGINAL_Q_PRE = ORIGINAL_QUERY_PREFIX
BI_10_OPTIMIZED_Q_PRE = OPTIMIZED_QUERY_PREFIX
BI_10_ORIGINAL_L_PRE = f"{ORIGINAL_LOG_PREFIX}/{BI_10_DIRNAME}"
BI_10_OPTIMIZED_L_PRE = f"{OPTIMIZED_LOG_PREFIX}/{BI_10_DIRNAME}"

In [39]:
# Create both BI 10 log directories; `exist_ok=True` keeps the cell re-runnable
# without a separate existence check.
os.makedirs(BI_10_ORIGINAL_L_PRE, exist_ok=True)
os.makedirs(BI_10_OPTIMIZED_L_PRE, exist_ok=True)

## Original


In [40]:
""" labels & edges """

# Vertex layout: 0 country, 1 city, 2 message, 3-4 tags, 5 tagclass,
# 6-9 persons (the long variant appends person 10).
original_short_edges = [
    (0, 1),
    (1, 6),
    (6, 2),
    (2, 3),
    (2, 4),
    (4, 5),
    # chain of persons
    (6, 7),
    (7, 8),
    (8, 9),
]
original_long_edges = [*original_short_edges, (9, 10)]


def _original_bi_10_short_labels(country: str, message: str) -> list[str]:
    """Return the 10 vertex labels of a short original BI 10 query graph."""
    return [country, "city", message, "tag", "tag", "tagclass"] + ["person"] * 4


original_short_china_post_labels = _original_bi_10_short_labels("China", "post")
original_long_china_post_labels = original_short_china_post_labels + ["person"]
original_short_china_comment_labels = _original_bi_10_short_labels("China", "comment")
original_long_china_comment_labels = original_short_china_comment_labels + ["person"]
original_short_india_post_labels = _original_bi_10_short_labels("India", "post")
original_long_india_post_labels = original_short_india_post_labels + ["person"]
original_short_india_comment_labels = _original_bi_10_short_labels("India", "comment")
original_long_india_comment_labels = original_short_india_comment_labels + ["person"]

In [41]:
""" Init Original Query Graph """

# (file name, vertex labels, edges) for every original BI 10 query graph.
# One loop replaces eight copy-pasted stanzas; each file is only written when
# it does not exist yet.
for qg_name, qg_labels, qg_edges in (
    (SHORT_CHINA_POST_QG, original_short_china_post_labels, original_short_edges),
    (LONG_CHINA_POST_QG, original_long_china_post_labels, original_long_edges),
    (SHORT_CHINA_COMMENT_QG, original_short_china_comment_labels, original_short_edges),
    (LONG_CHINA_COMMENT_QG, original_long_china_comment_labels, original_long_edges),
    (SHORT_INDIA_POST_QG, original_short_india_post_labels, original_short_edges),
    (LONG_INDIA_POST_QG, original_long_india_post_labels, original_long_edges),
    (SHORT_INDIA_COMMENT_QG, original_short_india_comment_labels, original_short_edges),
    (LONG_INDIA_COMMENT_QG, original_long_india_comment_labels, original_long_edges),
):
    qg_path = f"{BI_10_ORIGINAL_Q_PRE}/{qg_name}"
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        # Layout: `#0`, vertex count, labels, edge count, `src dst` pairs.
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

In [42]:
""" args """

def _original_bi_10_args(query_graph_name: str) -> list[str]:
    """Return the matcher command for one original BI 10 query graph file."""
    return wsl_if_on_windows + [
        "./VEQ_M_100k",
        "-dg",
        BI_10_DG,
        "-qg",
        f"{BI_10_ORIGINAL_Q_PRE}/{query_graph_name}",
    ]


original_short_china_post_args = _original_bi_10_args(SHORT_CHINA_POST_QG)
original_long_china_post_args = _original_bi_10_args(LONG_CHINA_POST_QG)
original_short_china_comment_args = _original_bi_10_args(SHORT_CHINA_COMMENT_QG)
original_long_china_comment_args = _original_bi_10_args(LONG_CHINA_COMMENT_QG)
original_short_india_post_args = _original_bi_10_args(SHORT_INDIA_POST_QG)
original_long_india_post_args = _original_bi_10_args(LONG_INDIA_POST_QG)
original_short_india_comment_args = _original_bi_10_args(SHORT_INDIA_COMMENT_QG)
original_long_india_comment_args = _original_bi_10_args(LONG_INDIA_COMMENT_QG)

In [43]:
""" exec """

# Times are appended in the order of the tuple below.
new_original_time_table = []

for result_name, task_name, task_args in (
    (SHORT_CHINA_POST_RES, "original_short_china_post", original_short_china_post_args),
    (LONG_CHINA_POST_RES, "original_long_china_post", original_long_china_post_args),
    (SHORT_CHINA_COMMENT_RES, "original_short_china_comment", original_short_china_comment_args),
    (LONG_CHINA_COMMENT_RES, "original_long_china_comment", original_long_china_comment_args),
    (SHORT_INDIA_POST_RES, "original_short_india_post", original_short_india_post_args),
    (LONG_INDIA_POST_RES, "original_long_india_post", original_long_india_post_args),
    (SHORT_INDIA_COMMENT_RES, "original_short_india_comment", original_short_india_comment_args),
    (LONG_INDIA_COMMENT_RES, "original_long_india_comment", original_long_india_comment_args),
):
    run_veq_m_100k(
        f"{BI_10_ORIGINAL_L_PRE}/{result_name}",
        task_name,
        task_args,
        new_original_time_table,
    )

File `./log/original/BI_10/short_china_post_result.txt` already exists
    last_line ~> Processing Time (ms): 768.04

File `./log/original/BI_10/long_china_post_result.txt` already exists
    last_line ~> Processing Time (ms): 330.352

File `./log/original/BI_10/short_china_comment_result.txt` already exists
    last_line ~> Processing Time (ms): 1486.43

File `./log/original/BI_10/long_china_comment_result.txt` already exists
    last_line ~> Processing Time (ms): 887.005

File `./log/original/BI_10/short_india_post_result.txt` already exists
    last_line ~> Processing Time (ms): 401.596

File `./log/original/BI_10/long_india_post_result.txt` already exists
    last_line ~> Processing Time (ms): 395.917

File `./log/original/BI_10/short_india_comment_result.txt` already exists
    last_line ~> Processing Time (ms): 1254.6

File `./log/original/BI_10/long_india_comment_result.txt` already exists
    last_line ~> Processing Time (ms): 977.802



## Optimized


In [44]:
""" labels & edges """

# Vertex layout: 0 country, 1 message, 2-3 tags, 4 tagclass, 5-8 persons
# (the long variant appends person 9). There is no city vertex here.
optimized_short_edges = [
    (0, 5),
    (5, 1),
    (1, 2),
    (1, 3),
    (3, 4),
    # chain of persons
    (5, 6),
    (6, 7),
    (7, 8),
]
optimized_long_edges = [*optimized_short_edges, (8, 9)]


def _optimized_bi_10_short_labels(country: str, message: str) -> list[str]:
    """Return the 9 vertex labels of a short optimized BI 10 query graph."""
    return [country, message, "tag", "tag", "tagclass"] + ["person"] * 4


optimized_short_china_post_labels = _optimized_bi_10_short_labels("China", "post")
optimized_long_china_post_labels = optimized_short_china_post_labels + ["person"]
optimized_short_china_comment_labels = _optimized_bi_10_short_labels("China", "comment")
optimized_long_china_comment_labels = optimized_short_china_comment_labels + ["person"]
optimized_short_india_post_labels = _optimized_bi_10_short_labels("India", "post")
optimized_long_india_post_labels = optimized_short_india_post_labels + ["person"]
optimized_short_india_comment_labels = _optimized_bi_10_short_labels("India", "comment")
optimized_long_india_comment_labels = optimized_short_india_comment_labels + ["person"]

In [45]:
""" Init Optimized Query Graph """

# (file name, vertex labels, edges) for every optimized BI 10 query graph.
# One loop replaces eight copy-pasted stanzas; each file is only written when
# it does not exist yet.
for qg_name, qg_labels, qg_edges in (
    (SHORT_CHINA_POST_QG, optimized_short_china_post_labels, optimized_short_edges),
    (LONG_CHINA_POST_QG, optimized_long_china_post_labels, optimized_long_edges),
    (SHORT_CHINA_COMMENT_QG, optimized_short_china_comment_labels, optimized_short_edges),
    (LONG_CHINA_COMMENT_QG, optimized_long_china_comment_labels, optimized_long_edges),
    (SHORT_INDIA_POST_QG, optimized_short_india_post_labels, optimized_short_edges),
    (LONG_INDIA_POST_QG, optimized_long_india_post_labels, optimized_long_edges),
    (SHORT_INDIA_COMMENT_QG, optimized_short_india_comment_labels, optimized_short_edges),
    (LONG_INDIA_COMMENT_QG, optimized_long_india_comment_labels, optimized_long_edges),
):
    qg_path = f"{BI_10_OPTIMIZED_Q_PRE}/{qg_name}"
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        # Layout: `#0`, vertex count, labels, edge count, `src dst` pairs.
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

In [46]:
""" args """

def _optimized_bi_10_args(query_graph_name: str) -> list[str]:
    """Return the matcher command for one optimized BI 10 query graph file."""
    return wsl_if_on_windows + [
        "./VEQ_M_100k",
        "-dg",
        BI_10_DG_OPTIMIZED,
        "-qg",
        f"{BI_10_OPTIMIZED_Q_PRE}/{query_graph_name}",
    ]


optimized_short_china_post_args = _optimized_bi_10_args(SHORT_CHINA_POST_QG)
optimized_long_china_post_args = _optimized_bi_10_args(LONG_CHINA_POST_QG)
optimized_short_china_comment_args = _optimized_bi_10_args(SHORT_CHINA_COMMENT_QG)
optimized_long_china_comment_args = _optimized_bi_10_args(LONG_CHINA_COMMENT_QG)
optimized_short_india_post_args = _optimized_bi_10_args(SHORT_INDIA_POST_QG)
optimized_long_india_post_args = _optimized_bi_10_args(LONG_INDIA_POST_QG)
optimized_short_india_comment_args = _optimized_bi_10_args(SHORT_INDIA_COMMENT_QG)
optimized_long_india_comment_args = _optimized_bi_10_args(LONG_INDIA_COMMENT_QG)

In [47]:
""" exec """

# Times are appended in the order of the tuple below.
new_optimized_time_table = []

for result_name, task_name, task_args in (
    (SHORT_CHINA_POST_RES, "optimized_short_china_post", optimized_short_china_post_args),
    (LONG_CHINA_POST_RES, "optimized_long_china_post", optimized_long_china_post_args),
    (SHORT_CHINA_COMMENT_RES, "optimized_short_china_comment", optimized_short_china_comment_args),
    (LONG_CHINA_COMMENT_RES, "optimized_long_china_comment", optimized_long_china_comment_args),
    (SHORT_INDIA_POST_RES, "optimized_short_india_post", optimized_short_india_post_args),
    (LONG_INDIA_POST_RES, "optimized_long_india_post", optimized_long_india_post_args),
    (SHORT_INDIA_COMMENT_RES, "optimized_short_india_comment", optimized_short_india_comment_args),
    (LONG_INDIA_COMMENT_RES, "optimized_long_india_comment", optimized_long_india_comment_args),
):
    run_veq_m_100k(
        f"{BI_10_OPTIMIZED_L_PRE}/{result_name}",
        task_name,
        task_args,
        new_optimized_time_table,
    )

File `./log/optimized/BI_10/short_china_post_result.txt` already exists
    last_line ~> Processing Time (ms): 707.368

File `./log/optimized/BI_10/long_china_post_result.txt` already exists
    last_line ~> Processing Time (ms): 395.018

File `./log/optimized/BI_10/short_china_comment_result.txt` already exists
    last_line ~> Processing Time (ms): 1996.5

File `./log/optimized/BI_10/long_china_comment_result.txt` already exists
    last_line ~> Processing Time (ms): 1235.61

File `./log/optimized/BI_10/short_india_post_result.txt` already exists
    last_line ~> Processing Time (ms): 683.904

File `./log/optimized/BI_10/long_india_post_result.txt` already exists
    last_line ~> Processing Time (ms): 346.558

File `./log/optimized/BI_10/short_india_comment_result.txt` already exists
    last_line ~> Processing Time (ms): 1469.59

File `./log/optimized/BI_10/long_india_comment_result.txt` already exists
    last_line ~> Processing Time (ms): 1287.6



In [48]:
""" Show BI-10 `comparison data-frame` """

# The caption previously repeated the BI 11 task names (`china/india_match`),
# which are not the tasks listed in this table.
print("Comparison between: `original` & `optimized` BI-10 queries")

# Row order matches the order in which both timing lists were filled.
df = pl.DataFrame(
    {
        "task": [
            "short_china_post",
            "long_china_post",
            "short_china_comment",
            "long_china_comment",
            "short_india_post",
            "long_india_post",
            "short_india_comment",
            "long_india_comment",
        ],
        "original (ms)": new_original_time_table,
        "optimized (ms)": new_optimized_time_table,
    }
)
df

Comparison between: `original_china/india_match` & `optimized_china/india_match`


task,original (ms),optimized (ms)
str,f64,f64
"""short_china_po…",768.04,707.368
"""long_china_pos…",330.352,395.018
"""short_china_co…",1486.43,1996.5
"""long_china_com…",887.005,1235.61
"""short_india_po…",401.596,683.904
"""long_india_pos…",395.917,346.558
"""short_india_co…",1254.6,1469.59
"""long_india_com…",977.802,1287.6


# BI 3


## Original


In [49]:
from dataclasses import dataclass, field


@dataclass
class OriginalBI3:
    """Runner for the BI-3 query against the original data graph.

    The query is a simple path over eight labelled vertices (``labels`` are
    the vertex labels, ``edges`` connect consecutive vertex indices).
    ``build_query_graph`` writes that query to a text file and ``run_query``
    passes the file to ``./VEQ_M_100k`` through ``run_veq_m_100k``.

    Attributes without an annotation are plain class-level constants shared by
    all instances. ``time_table`` is the only dataclass field and is created
    per instance, so two runners never append into the same list.
    """

    # Sub-directory (under the log prefix) that holds this query's log file.
    dirname = "BI_3"
    # Vertex labels, indexed by vertex id.
    labels = ["China", "city", "person", "forum", "post", "comment", "tag", "tagclass"]
    # Edges as (src, dst) vertex-id pairs; together they form one path 0-1-...-7.
    edges = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)]
    task_name = "original_china_bi3"
    query_graph_name = "original_china_bi3_query_graph.txt"
    log_name = "original_china_bi3_result.txt"
    # Command line for the matcher.
    # NOTE(review): the data graph passed here is `BI_11_DG` although this is
    # the BI-3 query — presumably both queries share one data graph; confirm.
    args = wsl_if_on_windows + [
        "./VEQ_M_100k",
        "-dg",
        BI_11_DG,
        "-qg",
        f"{ORIGINAL_QUERY_PREFIX}/{query_graph_name}",
    ]

    # Processing times (ms) collected by `run_query`, one entry per call.
    time_table: list[float] = field(default_factory=list)

    def build_query_graph(self) -> None:
        """Write the query-graph file unless it already exists.

        File layout, one item per line: ``#0``, the number of labels, each
        label, the number of edges, then each edge as ``src dst``.
        """
        query_path = f"{ORIGINAL_QUERY_PREFIX}/{self.query_graph_name}"
        if os.path.exists(query_path):
            return

        with open(query_path, "w") as f:
            f.write("#0\n")
            f.write(f"{len(self.labels)}\n")
            f.writelines(f"{label}\n" for label in self.labels)
            f.write(f"{len(self.edges)}\n")
            f.writelines(f"{src} {dst}\n" for src, dst in self.edges)

    def run_query(self) -> None:
        """Run the query (or reuse its existing log) and record the timing.

        The log directory is created on demand. `run_veq_m_100k` appends the
        processing time to ``self.time_table`` on every call, so calling this
        method twice yields two entries.
        """
        log_dir = f"{ORIGINAL_LOG_PREFIX}/{self.dirname}"
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(log_dir, exist_ok=True)
        run_veq_m_100k(
            f"{log_dir}/{self.log_name}",
            self.task_name,
            self.args,
            self.time_table,
        )


original_query_proc = OriginalBI3()

In [50]:
""" Build `query graph` """

# Writes the original BI-3 query-graph file; a file that already exists is
# left untouched, so re-running this cell is harmless.
original_query_proc.build_query_graph()

In [51]:
""" Run query """

# Appends one timing to `original_query_proc.time_table`. When the log file
# already exists the timing is read from its last line instead of re-running
# the binary. Re-running this cell appends a further entry.
original_query_proc.run_query()

File `./log/original/BI_3/original_china_bi3_result.txt` already exists
    last_line ~> Processing Time (ms): 24926.9



## Optimized


In [52]:
# `field` is required for the per-instance `time_table` below.
from dataclasses import dataclass, field


@dataclass
class OptimizedBI3:
    """Runner for the BI-3 query against the optimized data graph.

    The query is a simple path over six labelled vertices (``labels`` are the
    vertex labels, ``edges`` connect consecutive vertex indices).
    ``build_query_graph`` writes that query to a text file and ``run_query``
    passes the file to ``./VEQ_M_100k`` through ``run_veq_m_100k``.

    Attributes without an annotation are plain class-level constants shared by
    all instances. ``time_table`` is the only dataclass field and is created
    per instance, so two runners never append into the same list.
    """

    # Sub-directory (under the log prefix) that holds this query's log file.
    dirname = "BI_3"
    # Vertex labels, indexed by vertex id.
    labels = ["China", "forum", "post", "comment", "tag", "tagclass"]
    # Edges as (src, dst) vertex-id pairs; together they form one path 0-1-...-5.
    edges = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)]
    task_name = "optimized_china_bi3"
    query_graph_name = "optimized_china_bi3_query_graph.txt"
    log_name = "optimized_china_bi3_result.txt"
    # Command line for the matcher.
    # NOTE(review): the data graph passed here is `BI_11_DG_OPTIMIZED` although
    # this is the BI-3 query — presumably both queries share one data graph;
    # confirm.
    args = wsl_if_on_windows + [
        "./VEQ_M_100k",
        "-dg",
        BI_11_DG_OPTIMIZED,
        "-qg",
        f"{OPTIMIZED_QUERY_PREFIX}/{query_graph_name}",
    ]

    # Processing times (ms) collected by `run_query`, one entry per call.
    time_table: list[float] = field(default_factory=list)

    def build_query_graph(self) -> None:
        """Write the query-graph file unless it already exists.

        File layout, one item per line: ``#0``, the number of labels, each
        label, the number of edges, then each edge as ``src dst``.
        """
        query_path = f"{OPTIMIZED_QUERY_PREFIX}/{self.query_graph_name}"
        if os.path.exists(query_path):
            return

        with open(query_path, "w") as f:
            f.write("#0\n")
            f.write(f"{len(self.labels)}\n")
            f.writelines(f"{label}\n" for label in self.labels)
            f.write(f"{len(self.edges)}\n")
            f.writelines(f"{src} {dst}\n" for src, dst in self.edges)

    def run_query(self) -> None:
        """Run the query (or reuse its existing log) and record the timing.

        The log directory is created on demand. `run_veq_m_100k` appends the
        processing time to ``self.time_table`` on every call, so calling this
        method twice yields two entries.
        """
        log_dir = f"{OPTIMIZED_LOG_PREFIX}/{self.dirname}"
        # exist_ok avoids the check-then-create race of exists() + makedirs().
        os.makedirs(log_dir, exist_ok=True)
        run_veq_m_100k(
            f"{log_dir}/{self.log_name}",
            self.task_name,
            self.args,
            self.time_table,
        )


optimized_query_proc = OptimizedBI3()

In [53]:
""" Build `query graph` """

# Writes the optimized BI-3 query-graph file; a file that already exists is
# left untouched, so re-running this cell is harmless.
optimized_query_proc.build_query_graph()

In [54]:
""" Run query """

# Appends one timing to `optimized_query_proc.time_table`. When the log file
# already exists the timing is read from its last line instead of re-running
# the binary. Re-running this cell appends a further entry.
optimized_query_proc.run_query()

File `./log/optimized/BI_3/optimized_china_bi3_result.txt` already exists
    last_line ~> Processing Time (ms): 24998.3



In [55]:
""" Show BI-3 `comparison data-frame` """

# One row for the single BI-3 task; each timing column is taken straight from
# the corresponding runner's `time_table`.
df = pl.DataFrame(
    {
        "task": ["china_bi3"],
        **{
            f"{variant} (ms)": runner.time_table
            for variant, runner in (
                ("original", original_query_proc),
                ("optimized", optimized_query_proc),
            )
        },
    }
)
# Bare last expression so the notebook renders the frame.
df

task,original (ms),optimized (ms)
str,f64,f64
"""china_bi3""",24926.9,24998.3
