# Prepare


In [None]:
from envs import *

In [1]:
import polars as pl
import re, os
from tqdm import tqdm

In [2]:
""" Function Tools """

import subprocess
from typing import Optional
from enum import StrEnum


class BaseLabel(StrEnum):
    def __repr__(self) -> str:
        return self.value

    def __add__(self, other: str) -> str:
        return f"{self.value}({other})"

    Country = "country"
    Date = "date"
    EndDate = "endDate"
    TagClass = "tagClass"


def run_veq_m_100k(
    result_path: str,
    task_name: str,
    args: list[str],
    time_table: Optional[list[float]] = None,
):
    """Run a matcher command once and record the time it reports.

    If ``result_path`` already exists, the command is NOT executed: the file
    is treated as the log of an earlier run and only its last non-empty line
    is read. Otherwise ``args`` is spawned, and every stdout line is echoed
    (indented) and written to ``result_path``.

    In both cases, when ``time_table`` is given, the last space-separated
    token of the last non-empty output line is parsed as a float and appended
    to it.

    Args:
        result_path: Log file to reuse, or to create for a fresh run.
        task_name: Human-readable name printed in the progress banner.
        args: Command line handed to ``subprocess.Popen``.
        time_table: Optional list that receives the parsed time.

    Raises:
        ValueError: ``time_table`` was given but the last non-empty line does
            not end in a number (for example an empty or truncated log).
    """

    def parse_time(line: str) -> float:
        try:
            return float(line.split(" ")[-1])
        except ValueError as err:
            raise ValueError(
                f"Cannot parse a time from the last line of `{result_path}`: "
                f"{line!r} (delete the file to re-run the task)"
            ) from err

    if os.path.exists(result_path):
        print(f"File `{result_path}` already exists")
        with open(result_path, "r", encoding="utf-8") as f:
            non_empty_lines = [line for line in f if line.strip() != ""]
        # An interrupted run can leave an empty log behind; do not index into
        # an empty list in that case.
        last_line = non_empty_lines[-1] if non_empty_lines else ""
        print(f"    last_line ~> {last_line}")
        if time_table is not None:
            time_table.append(parse_time(last_line))
        return

    last_line = ""
    with open(result_path, "w", encoding="utf-8") as f:
        print(f">>> Running: {task_name}...")
        # stderr is discarded rather than piped: a pipe that is never read can
        # fill up and block the child process indefinitely.
        with subprocess.Popen(
            args,
            stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL,
        ) as p:
            if p.stdout:
                for raw_line in iter(p.stdout.readline, b""):
                    content = raw_line.decode("utf-8")
                    print("    " + content, end="")
                    f.write(content)
                    # Track the last non-empty line so that trailing blank
                    # lines do not hide the timing line (this mirrors the
                    # cached-log branch above).
                    if content.strip() != "":
                        last_line = content
            else:
                print("    <No output>")
                f.write("<No output>")
        print("<<< Done!")
    if time_table is not None:
        time_table.append(parse_time(last_line))


def get_inner_namespace(col_name: str) -> str:
    """Return the text inside the first ``(...)`` of *col_name*.

    For example ``":ID(Forumid)"`` gives ``"Forumid"``. Returns ``""`` when
    the column name contains no parenthesised part.
    """
    # Raw string: "\(" in a normal literal is an invalid escape sequence.
    found = re.search(r"\((.*?)\)", col_name)
    return "" if found is None else found.group(1)


def get_namespace(col_name: str) -> str:
    """Return the ID namespace that *col_name* belongs to.

    Place kinds (Country / Continent / City) share the ``Placeid`` namespace
    and organisation kinds (University / Company) share ``Organisationid``;
    any other parenthesised name is returned unchanged.
    """
    shared_namespaces = {
        "Country": "Placeid",
        "Continent": "Placeid",
        "City": "Placeid",
        "University": "Organisationid",
        "Company": "Organisationid",
    }
    inner = get_inner_namespace(col_name)
    return shared_namespaces.get(inner, inner)


# Sanity check: a namespace that is not one of the aliased place/organisation
# kinds is returned unchanged.
assert get_namespace(":ID(Forumid)") == "Forumid"

'Forumid'

In [3]:
""" Test """

# Smoke test: load the place table and show the rows of two known countries.
PLACE = f"{RAW_DATA_PREFIX}/place.csv"

df = pl.read_csv(PLACE)
is_test_country = pl.col("name").is_in(["India", "China"])
test_table = df.lazy().filter(is_test_country).collect()
out = test_table.select(
    [pl.col(column) for column in (":ID(Placeid)", "name", ":TYPE", ":LABEL")]
)
out.head(5)

:ID(Placeid),name,:TYPE,:LABEL
i64,str,str,str
0,"""India""","""country""","""place"""
1,"""China""","""country""","""place"""


In [4]:
""" Load `vertices/edges` """

import os, glob
from polars import DataFrame

# CSV files whose base name contains "_" are edge tables named
# "<src>_<relationship>_<dst>"; all other CSV files are vertex tables.
vertices: dict[str, DataFrame] = {}
raw_edges: dict[tuple[str, str, str], DataFrame] = {}
# namespace -> {original id -> global id}; filled in by later cells.
switch_namespace: dict[str, dict[int, int]] = {}

for file in glob.glob(f"{RAW_DATA_PREFIX}/*.csv"):
    df_name = os.path.basename(file).split(".")[0]
    if "_" not in df_name:
        vertices[df_name] = pl.read_csv(file)
        continue
    src, relationship, dst = df_name.split("_")
    raw_edges[(src, relationship, dst)] = pl.read_csv(file)


vertex_num = sum(len(vertex_table) for vertex_table in vertices.values())
edge_num = sum(len(edge_table) for edge_table in raw_edges.values())

In [5]:
""" Initialize `switch_namespace` """

# One empty id-mapping per namespace found in the first column of each vertex table.
for vertex_table in vertices.values():
    switch_namespace[get_namespace(vertex_table.columns[0])] = {}

switch_namespace

{'Commentid': {},
 'Forumid': {},
 'Organisationid': {},
 'Personid': {},
 'Placeid': {},
 'Postid': {},
 'Tagid': {},
 'TagClassid': {}}

In [6]:
""" Re-arrange all `:ID($AnyNamespace)` """

# Give every vertex a dense, file-independent id in iteration order.
curr_global_id = 0

with tqdm(desc="Mapping `origin_id` to `uni_id`", total=vertex_num) as bar:
    for df in vertices.values():
        id_column = df.columns[0]
        # Named `id_map`, not `map`: a module-level `map` would replace the
        # builtin for every later cell.
        id_map = switch_namespace[get_namespace(id_column)]
        # Only the ID column is needed, so avoid materialising whole rows.
        for origin_id in df.get_column(id_column):
            id_map[int(origin_id)] = curr_global_id
            curr_global_id += 1
            bar.update(1)

assert curr_global_id == vertex_num
set(switch_namespace.keys())

Mapping `origin_id` to `uni_id`: 100%|██████████| 3181724/3181724 [00:03<00:00, 908464.65it/s] 


{'Commentid',
 'Forumid',
 'Organisationid',
 'Personid',
 'Placeid',
 'Postid',
 'TagClassid',
 'Tagid'}

# BI 11


## Original Query


In [None]:
# uni_id -> label for every vertex, plus the set of distinct labels seen.
labels: dict[int, str] = {}
label_set: set[str] = set()

# Values that get a dedicated label instead of the generic one.
explicit_place_labels = {
    "China",
    "India",
    "Tunisia",
    "Cuba",
    "France",
    "Belgium",
    "Greece",
    "Chile",
}
explicit_tagclass_labels = {
    "NascarDriver",
    "Thing",
    "Politician",
    "Saint",
    "President",
    "Song",
}
explicit_date_labels = {"2010-01-07", "2010-01-10", "2010-01-26"}
explicit_end_date_labels = {"2012-11-11", "2012-11-09", "2012-11-03"}


def place_op(df: DataFrame):
    """Label every row of the place table.

    A place whose name is in ``explicit_place_labels`` is labelled
    ``BaseLabel.Country + name``; every other place gets its ``:TYPE`` value.
    Writes into the module-level ``labels`` / ``label_set`` and advances the
    module-level progress bar ``bar``.
    """
    id_column = df.columns[0]
    uni_ids = switch_namespace[get_namespace(id_column)]
    place_rows = df.select(
        [pl.col(id_column), pl.col("name"), pl.col(":TYPE")]
    ).rows()
    for raw_id, raw_name, raw_type in place_rows:
        origin_id = int(raw_id)
        place_name = str(raw_name)
        label = str(raw_type)
        uni_id = uni_ids[origin_id]
        if place_name in explicit_place_labels:
            label = BaseLabel.Country + place_name
        labels[uni_id] = label
        label_set.add(label)
        bar.update(1)


def normal_op(df: DataFrame):
    """Label every row of a vertex table with its ``:TYPE`` (or ``:LABEL``).

    ``:TYPE`` is used when the table has that column, ``:LABEL`` otherwise.
    Writes into the module-level ``labels`` / ``label_set`` and advances the
    module-level progress bar ``bar``.
    """
    id_column = df.columns[0]
    uni_ids = switch_namespace[get_namespace(id_column)]
    label_column = ":TYPE" if ":TYPE" in df.columns else ":LABEL"
    id_and_label = df.select([pl.col(id_column), pl.col(label_column)])
    for raw_id, raw_label in id_and_label.rows():
        uni_id = uni_ids[int(raw_id)]
        label = str(raw_label)
        labels[uni_id] = label
        label_set.add(label)
        bar.update(1)


def vertex_op(df_name: str, df: DataFrame):
    """Dispatch a vertex table to its labelling routine.

    The ``"place"`` table goes through ``place_op``; every other table goes
    through ``normal_op``.
    """
    if df_name == "place":
        place_op(df)
    else:
        normal_op(df)

In [9]:
""" Build map of `vertex.uni_id -> label` """

# `place_op` / `normal_op` advance the progress bar through the module-level
# name `bar`, so the context-manager target below must keep that exact name.
with tqdm(desc="Build map of `vertex.uni_id -> label`", total=vertex_num) as bar:
    for df_name, df in vertices.items():
        vertex_op(df_name, df)

# Display the distinct labels collected while building the map.
label_set

Build map of `vertex.uni_id -> label`: 100%|██████████| 3181724/3181724 [00:02<00:00, 1496053.20it/s]


{'China',
 'India',
 'city',
 'comment',
 'company',
 'continent',
 'country',
 'forum',
 'person',
 'post',
 'tag',
 'tagclass',
 'university'}

In [10]:
""" Build edges in format: `(src_id, dst_id)` """

# Distinct (src_uni_id, dst_uni_id) pairs; duplicates across edge tables collapse.
edges = set[tuple[int, int]]()

with tqdm(desc="Build edges in format: `(src_id, dst_id)`", total=edge_num) as bar:
    for df in raw_edges.values():
        src_column, dst_column = df.columns[0], df.columns[1]
        src_map = switch_namespace[get_namespace(src_column)]
        dst_map = switch_namespace[get_namespace(dst_column)]
        # Named `endpoints`, not `slice`: a module-level `slice` would replace
        # the builtin for every later cell.
        endpoints = df.select([pl.col(src_column), pl.col(dst_column)])
        for src_id, dst_id in endpoints.rows():
            src_uni_id = src_map[int(src_id)]
            dst_uni_id = dst_map[int(dst_id)]
            edges.add((src_uni_id, dst_uni_id))
            bar.update(1)

Build edges in format: `(src_id, dst_id)`: 100%|██████████| 17256038/17256038 [00:18<00:00, 927268.42it/s] 


In [11]:
""" Write into `data_graph.txt` """

# File layout: "#0", vertex count, one label per vertex (by uni_id),
# edge count, then one "src dst" pair per line.
if os.path.exists(BI_11_DG):
    print(f"File `{BI_11_DG}` already exists")
else:
    with open(BI_11_DG, "w") as f:
        label_count = len(labels)
        f.write("#0\n")
        f.write(f"{label_count}\n")
        with tqdm(
            desc="Writing `labels` into `data_graph.txt`", total=label_count
        ) as bar:
            for uni_id in range(label_count):
                f.write(f"{labels[uni_id]}\n")
                bar.update(1)
        edge_count = len(edges)
        f.write(f"{edge_count}\n")
        with tqdm(desc="Writing `edges` into `data_graph.txt", total=edge_count) as bar:
            for edge_src, edge_dst in edges:
                f.write(f"{edge_src} {edge_dst}\n")
                bar.update(1)

Writing `labels` into `data_graph.txt`: 100%|██████████| 3181724/3181724 [00:02<00:00, 1514438.58it/s]
Writing `edges` into `data_graph.txt: 100%|██████████| 17256033/17256033 [00:20<00:00, 857313.93it/s]


In [12]:
""" Build `India` and `China` query graph """

# Query vertices: 0 = country, 1-3 = cities, 4-6 = persons.
china_query_graph_labels = [
    BaseLabel.Country + "China",
    "city",
    "city",
    "city",
    "person",
    "person",
    "person",
]
china_query_graph_edges = [
    (1, 0), (2, 0), (3, 0),
    (4, 1), (5, 2), (6, 3),
    (4, 5), (5, 6), (6, 4),
]

india_query_graph_labels = [BaseLabel.Country + "India"] + china_query_graph_labels[1:]
india_query_graph_edges = china_query_graph_edges

# Write each query graph unless its file already exists.
for qg_path, qg_labels, qg_edges in (
    (BI_11_CHINA_QG, china_query_graph_labels, china_query_graph_edges),
    (BI_11_INDIA_QG, india_query_graph_labels, india_query_graph_edges),
):
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

## Optimized Query


In [15]:
""" Load all `index edge` """

# Index-edge tables keyed by the CSV base name (text before the first ".").
index_edges: dict[str, DataFrame] = {
    os.path.basename(csv_path).split(".")[0]: pl.read_csv(csv_path)
    for csv_path in glob.glob(f"{INDEX_EDGES_PREFIX}/*.csv")
}

index_edge_num = sum(len(index_table) for index_table in index_edges.values())

In [16]:
""" Add `index edge` into `edges` """

# Adds the index edges to the same `edges` set that holds the raw edges.
with tqdm(desc="Adding `index edge` into `edges`", total=index_edge_num) as bar:
    for df in index_edges.values():
        src_column, dst_column = df.columns[0], df.columns[1]
        src_map = switch_namespace[get_namespace(src_column)]
        dst_map = switch_namespace[get_namespace(dst_column)]
        # Named `endpoints`, not `slice`: a module-level `slice` would replace
        # the builtin for every later cell.
        endpoints = df.select([pl.col(src_column), pl.col(dst_column)])
        for src_id, dst_id in endpoints.rows():
            src_uni_id = src_map[int(src_id)]
            dst_uni_id = dst_map[int(dst_id)]
            edges.add((src_uni_id, dst_uni_id))
            bar.update(1)

Adding `index edge` into `edges`: 100%|██████████| 20605002/20605002 [00:24<00:00, 851473.18it/s] 


In [17]:
""" Write into `data_graph.txt` """

# Same file layout as the original data graph: "#0", vertex count, labels,
# edge count, "src dst" pairs. `edges` now also contains the index edges.
if os.path.exists(BI_11_DG_OPTIMIZED):
    print(f"File `{BI_11_DG_OPTIMIZED}` already exists")
else:
    with open(BI_11_DG_OPTIMIZED, "w") as f:
        label_count = len(labels)
        f.write("#0\n")
        f.write(f"{label_count}\n")
        with tqdm(
            desc="Writing `labels` into `data_graph.txt`", total=label_count
        ) as bar:
            for uni_id in range(label_count):
                f.write(f"{labels[uni_id]}\n")
                bar.update(1)
        edge_count = len(edges)
        f.write(f"{edge_count}\n")
        with tqdm(desc="Writing `edges` into `data_graph.txt", total=edge_count) as bar:
            for edge_src, edge_dst in edges:
                f.write(f"{edge_src} {edge_dst}\n")
                bar.update(1)

Writing `labels` into `data_graph.txt`: 100%|██████████| 3181724/3181724 [00:02<00:00, 1477753.36it/s]
Writing `edges` into `data_graph.txt: 100%|██████████| 34775921/34775921 [00:42<00:00, 825434.42it/s]


In [18]:
""" Build `India` and `China` query graph """

# Query vertices: 0 = country, 1-3 = persons (no city vertices here).
china_query_graph_labels = [
    BaseLabel.Country + "China",
    "person",
    "person",
    "person",
]
china_query_graph_edges = [
    (1, 0), (2, 0), (3, 0),
    (1, 2), (2, 3), (3, 1),
]

india_query_graph_labels = [BaseLabel.Country + "India"] + china_query_graph_labels[1:]
india_query_graph_edges = china_query_graph_edges

# Write each query graph unless its file already exists.
for qg_path, qg_labels, qg_edges in (
    (BI_11_CHINA_QG_OPTIMIZED, china_query_graph_labels, china_query_graph_edges),
    (BI_11_INDIA_QG_OPTIMIZED, india_query_graph_labels, india_query_graph_edges),
):
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

## Execute `Query`


In [21]:
""" Args """

from args.bi_11 import *

In [22]:
""" Exec `match` on `original` """

# Times are appended in task order: china first, then india.
original_time_table = []

for result_path, task_name, task_args in (
    (BI_11_ORIGINAL_CHINA_RES, "original_china_match", original_china_match_args),
    (BI_11_ORIGINAL_INDIA_RES, "original_india_match", original_india_match_args),
):
    run_veq_m_100k(result_path, task_name, task_args, original_time_table)

""" Exec `match` on `optimized` """

optimized_time_table = []

for result_path, task_name, task_args in (
    (BI_11_OPTIMIZED_CHINA_RES, "optimized_china_match", optimized_china_match_args),
    (BI_11_OPTIMIZED_INDIA_RES, "optimized_india_match", optimized_india_match_args),
):
    run_veq_m_100k(result_path, task_name, task_args, optimized_time_table)

>>> Running: original_china_match...
    Data file: ./out/original/BI_11/data_graph.txt
    Query file: ./out/original/BI_11/china_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3748
    Total Recursive Call Count: 131167
    Number of Matches: 100000
    Filtering Time (ms): 20.6644
    Verification Time (ms): 6480.42
    Processing Time (ms): 6501.08
<<< Done!
>>> Running: original_india_match...
    Data file: ./out/original/BI_11/data_graph.txt
    Query file: ./out/original/BI_11/india_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3871
    Total Recursive Call Count: 131701
    Number of Matches: 100000
    Filtering Time (ms): 23.6162
    Verification Time (ms): 6028.32
    Processing Time (ms): 6051.94
<<< Done!
>>> Running: optimized_china_match...
    Data file: ./out/optimized/BI_11/data_graph.txt
    Query file: ./out/optimized/BI_11/china_query_graph.txt
    Output file: 
    Sum of |C(u)|: 3157
    Total Recursive Call Count: 17593
    Number of Matches: 10000

In [23]:
""" Show BI-11 `comparison data-frame` """

print(
    "Comparison between: `original_china/india_match` & `optimized_china/india_match`"
)

# One row per task; the time tables were filled in the same task order.
bi_11_columns = {
    "task": ["china_match", "india_match"],
    "original (ms)": original_time_table,
    "optimized (ms)": optimized_time_table,
}
df = pl.DataFrame(bi_11_columns)
df

Comparison between: `original_china/india_match` & `optimized_china/india_match`


task,original (ms),optimized (ms)
str,f64,f64
"""china_match""",6501.08,3033.07
"""india_match""",6051.94,3079.34


# BI 10


## Original


In [29]:
""" labels & edges """

original_short_edges = [
    (0, 1), (1, 6), (6, 2), (2, 3), (2, 4), (4, 5),
    (6, 7), (7, 8), (8, 9),
]
original_long_edges = [*original_short_edges, (9, 10)]


def _original_bi10_short_labels(country: str, message: str) -> list[str]:
    """Build the 10 labels of a short original BI-10 query for one country/message kind."""
    return [
        BaseLabel.Country + country,
        "city",
        message,
        "tag",
        "tag",
        "tagclass",
        "person",
        "person",
        "person",
        "person",
    ]


# Each "long" variant is the "short" one with one more trailing person.
original_short_china_post_labels = _original_bi10_short_labels("China", "post")
original_long_china_post_labels = original_short_china_post_labels + ["person"]
original_short_china_comment_labels = _original_bi10_short_labels("China", "comment")
original_long_china_comment_labels = original_short_china_comment_labels + ["person"]
original_short_india_post_labels = _original_bi10_short_labels("India", "post")
original_long_india_post_labels = original_short_india_post_labels + ["person"]
original_short_india_comment_labels = _original_bi10_short_labels("India", "comment")
original_long_india_comment_labels = original_short_india_comment_labels + ["person"]

In [30]:
""" Init Original Query Graph """

# Each entry: (query-graph file name, vertex labels, edge list).
original_bi10_query_graphs = [
    (SHORT_CHINA_POST_QG, original_short_china_post_labels, original_short_edges),
    (LONG_CHINA_POST_QG, original_long_china_post_labels, original_long_edges),
    (SHORT_CHINA_COMMENT_QG, original_short_china_comment_labels, original_short_edges),
    (LONG_CHINA_COMMENT_QG, original_long_china_comment_labels, original_long_edges),
    (SHORT_INDIA_POST_QG, original_short_india_post_labels, original_short_edges),
    (LONG_INDIA_POST_QG, original_long_india_post_labels, original_long_edges),
    (SHORT_INDIA_COMMENT_QG, original_short_india_comment_labels, original_short_edges),
    (LONG_INDIA_COMMENT_QG, original_long_india_comment_labels, original_long_edges),
]

# Write each query graph unless its file already exists.
for qg_name, qg_labels, qg_edges in original_bi10_query_graphs:
    qg_path = f"{BI_10_ORIGINAL_Q_PRE}/{qg_name}"
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

In [31]:
""" args """

from args.bi_10 import *

In [32]:
""" exec """

# Times are appended in the order the tasks are listed below.
new_original_time_table = []

for task_name, result_file, task_args in (
    ("original_short_china_post", SHORT_CHINA_POST_RES, original_short_china_post_args),
    ("original_long_china_post", LONG_CHINA_POST_RES, original_long_china_post_args),
    ("original_short_china_comment", SHORT_CHINA_COMMENT_RES, original_short_china_comment_args),
    ("original_long_china_comment", LONG_CHINA_COMMENT_RES, original_long_china_comment_args),
    ("original_short_india_post", SHORT_INDIA_POST_RES, original_short_india_post_args),
    ("original_long_india_post", LONG_INDIA_POST_RES, original_long_india_post_args),
    ("original_short_india_comment", SHORT_INDIA_COMMENT_RES, original_short_india_comment_args),
    ("original_long_india_comment", LONG_INDIA_COMMENT_RES, original_long_india_comment_args),
):
    run_veq_m_100k(
        f"{BI_10_ORIGINAL_L_PRE}/{result_file}",
        task_name,
        task_args,
        new_original_time_table,
    )

>>> Running: original_short_china_post...
    Data file: ./out/original/BI_11/data_graph.txt
    Query file: ./out/original/BI_10/short_china_post_query_graph.txt
    Output file: 


    Sum of |C(u)|: 68294
    Total Recursive Call Count: 7
    Number of Matches: 100176
    Filtering Time (ms): 274.408
    Verification Time (ms): 585.114
    Processing Time (ms): 859.521
<<< Done!
>>> Running: original_long_china_post...
    Data file: ./out/original/BI_11/data_graph.txt
    Query file: ./out/original/BI_10/long_china_post_query_graph.txt
    Output file: 
    Sum of |C(u)|: 77102
    Total Recursive Call Count: 34
    Number of Matches: 100062
    Filtering Time (ms): 292.175
    Verification Time (ms): 65.839
    Processing Time (ms): 358.014
<<< Done!
>>> Running: original_short_china_comment...
    Data file: ./out/original/BI_11/data_graph.txt
    Query file: ./out/original/BI_10/short_china_comment_query_graph.txt
    Output file: 
    Sum of |C(u)|: 144792
    Total Recursive Call Count: 7
    Number of Matches: 100336
    Filtering Time (ms): 834.945
    Verification Time (ms): 628.903
    Processing Time (ms): 1463.85
<<< Done!
>>> Running: original_long_

## Optimized


In [33]:
""" labels & edges """

optimized_short_edges = [
    (0, 5), (5, 1), (1, 2), (1, 3), (3, 4),
    (5, 6), (6, 7), (7, 8),
]
optimized_long_edges = [*optimized_short_edges, (8, 9)]


def _optimized_bi10_short_labels(country: str, message: str) -> list[str]:
    """Build the 9 labels of a short optimized BI-10 query for one country/message kind."""
    return [
        BaseLabel.Country + country,
        message,
        "tag",
        "tag",
        "tagclass",
        "person",
        "person",
        "person",
        "person",
    ]


# Each "long" variant is the "short" one with one more trailing person.
optimized_short_china_post_labels = _optimized_bi10_short_labels("China", "post")
optimized_long_china_post_labels = optimized_short_china_post_labels + ["person"]
optimized_short_china_comment_labels = _optimized_bi10_short_labels("China", "comment")
optimized_long_china_comment_labels = optimized_short_china_comment_labels + ["person"]
optimized_short_india_post_labels = _optimized_bi10_short_labels("India", "post")
optimized_long_india_post_labels = optimized_short_india_post_labels + ["person"]
optimized_short_india_comment_labels = _optimized_bi10_short_labels("India", "comment")
optimized_long_india_comment_labels = optimized_short_india_comment_labels + ["person"]

In [34]:
""" Init Optimized Query Graph """

# Each entry: (query-graph file name, vertex labels, edge list).
optimized_bi10_query_graphs = [
    (SHORT_CHINA_POST_QG, optimized_short_china_post_labels, optimized_short_edges),
    (LONG_CHINA_POST_QG, optimized_long_china_post_labels, optimized_long_edges),
    (SHORT_CHINA_COMMENT_QG, optimized_short_china_comment_labels, optimized_short_edges),
    (LONG_CHINA_COMMENT_QG, optimized_long_china_comment_labels, optimized_long_edges),
    (SHORT_INDIA_POST_QG, optimized_short_india_post_labels, optimized_short_edges),
    (LONG_INDIA_POST_QG, optimized_long_india_post_labels, optimized_long_edges),
    (SHORT_INDIA_COMMENT_QG, optimized_short_india_comment_labels, optimized_short_edges),
    (LONG_INDIA_COMMENT_QG, optimized_long_india_comment_labels, optimized_long_edges),
]

# Write each query graph unless its file already exists.
for qg_name, qg_labels, qg_edges in optimized_bi10_query_graphs:
    qg_path = f"{BI_10_OPTIMIZED_Q_PRE}/{qg_name}"
    if os.path.exists(qg_path):
        continue
    with open(qg_path, "w") as f:
        f.write("#0\n")
        f.write(f"{len(qg_labels)}\n")
        f.writelines(f"{label}\n" for label in qg_labels)
        f.write(f"{len(qg_edges)}\n")
        f.writelines(f"{src} {dst}\n" for src, dst in qg_edges)

In [36]:
""" exec """

# Times are appended in the order the tasks are listed below.
new_optimized_time_table = []

for task_name, result_file, task_args in (
    ("optimized_short_china_post", SHORT_CHINA_POST_RES, optimized_short_china_post_args),
    ("optimized_long_china_post", LONG_CHINA_POST_RES, optimized_long_china_post_args),
    ("optimized_short_china_comment", SHORT_CHINA_COMMENT_RES, optimized_short_china_comment_args),
    ("optimized_long_china_comment", LONG_CHINA_COMMENT_RES, optimized_long_china_comment_args),
    ("optimized_short_india_post", SHORT_INDIA_POST_RES, optimized_short_india_post_args),
    ("optimized_long_india_post", LONG_INDIA_POST_RES, optimized_long_india_post_args),
    ("optimized_short_india_comment", SHORT_INDIA_COMMENT_RES, optimized_short_india_comment_args),
    ("optimized_long_india_comment", LONG_INDIA_COMMENT_RES, optimized_long_india_comment_args),
):
    run_veq_m_100k(
        f"{BI_10_OPTIMIZED_L_PRE}/{result_file}",
        task_name,
        task_args,
        new_optimized_time_table,
    )

>>> Running: optimized_short_china_post...
    Data file: ./out/optimized/BI_11/data_graph.txt
    Query file: ./out/optimized/BI_10/short_china_post_query_graph.txt
    Output file: 
    Sum of |C(u)|: 68096
    Total Recursive Call Count: 6
    Number of Matches: 100064
    Filtering Time (ms): 339.592
    Verification Time (ms): 538.469
    Processing Time (ms): 878.061
<<< Done!
>>> Running: optimized_long_china_post...
    Data file: ./out/optimized/BI_11/data_graph.txt
    Query file: ./out/optimized/BI_10/long_china_post_query_graph.txt
    Output file: 
    Sum of |C(u)|: 76904
    Total Recursive Call Count: 29
    Number of Matches: 101164
    Filtering Time (ms): 324.827
    Verification Time (ms): 48.1659
    Processing Time (ms): 372.993
<<< Done!
>>> Running: optimized_short_china_comment...
    Data file: ./out/optimized/BI_11/data_graph.txt
    Query file: ./out/optimized/BI_10/short_china_comment_query_graph.txt
    Output file: 
    Sum of |C(u)|: 160537
    Total Rec

In [37]:
""" Show BI-10 `comparison data-frame` """

print(
    "Comparison between: `original_china/india_match` & `optimized_china/india_match`"
)

# One row per task, in the same order the time tables were filled.
bi_10_tasks = [
    f"{length}_{country}_{message}"
    for country in ("china", "india")
    for message in ("post", "comment")
    for length in ("short", "long")
]
df = pl.DataFrame(
    {
        "task": bi_10_tasks,
        "original (ms)": new_original_time_table,
        "optimized (ms)": new_optimized_time_table,
    }
)
df

Comparison between: `original_china/india_match` & `optimized_china/india_match`


task,original (ms),optimized (ms)
str,f64,f64
"""short_china_po…",859.521,878.061
"""long_china_pos…",358.014,372.993
"""short_china_co…",1463.85,2323.1
"""long_china_com…",912.711,1272.72
"""short_india_po…",375.992,792.272
"""long_india_pos…",483.71,425.723
"""short_india_co…",1248.95,1701.53
"""long_india_com…",1127.95,1314.2


# BI 3


## Original


In [38]:
from dataclasses import dataclass, field

# Query-graph directories for BI-3. `exist_ok=True` creates them when missing
# without a separate check-then-create step.
BI_3_ORIGINAL_Q_PRE = f"{ORIGINAL_QUERY_PREFIX}/BI_3"
os.makedirs(BI_3_ORIGINAL_Q_PRE, exist_ok=True)

BI_3_OPTIMIZED_Q_PRE = f"{OPTIMIZED_QUERY_PREFIX}/BI_3"
os.makedirs(BI_3_OPTIMIZED_Q_PRE, exist_ok=True)


@dataclass
class OriginalBI3:
    """BI-3 query (China) against the original data graph.

    The query is a chain: country - city - person - forum - post - comment -
    tag - tagclass. ``build_query_graph`` writes the query-graph file and
    ``run_query`` runs the matcher and records the reported time in
    ``time_table``.
    """

    dirname = "BI_3"
    labels = [
        BaseLabel.Country + "China",
        "city",
        "person",
        "forum",
        "post",
        "comment",
        "tag",
        "tagclass",
    ]
    edges = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 6), (6, 7)]
    task_name = "original_china_bi3"
    query_graph_name = "original_china_bi3_query_graph.txt"
    log_name = "original_china_bi3_result.txt"
    args = wsl_if_on_windows + [
        "./VEQ_M_100k",
        "-dg",
        BI_11_DG,
        "-qg",
        f"{BI_3_ORIGINAL_Q_PRE}/{query_graph_name}",
    ]
    # A real dataclass field, so every instance gets its own list instead of
    # all instances sharing one mutable class attribute.
    time_table: list[float] = field(default_factory=list)

    def build_query_graph(self):
        """Write the query-graph file unless it already exists."""
        query_path = f"{BI_3_ORIGINAL_Q_PRE}/{self.query_graph_name}"
        if os.path.exists(query_path):
            return
        with open(query_path, "w") as f:
            f.write("#0\n")
            f.write(f"{len(self.labels)}\n")
            f.writelines(f"{label}\n" for label in self.labels)
            f.write(f"{len(self.edges)}\n")
            f.writelines(f"{src} {dst}\n" for src, dst in self.edges)

    def run_query(self):
        """Run the matcher (or reuse its existing log) and record the time."""
        log_prefix = f"{ORIGINAL_LOG_PREFIX}/{self.dirname}"
        os.makedirs(log_prefix, exist_ok=True)
        run_veq_m_100k(
            f"{log_prefix}/{self.log_name}",
            self.task_name,
            self.args,
            self.time_table,
        )


# Runner object for the original BI-3 query; used by the following cells.
original_query_proc = OriginalBI3()

In [39]:
""" Build `query graph` """

# Writes the query-graph file only when it is not already on disk.
original_query_proc.build_query_graph()

In [40]:
""" Run query """

# Runs the matcher (or reuses an existing result log) and appends the parsed
# time to `original_query_proc.time_table`.
original_query_proc.run_query()

>>> Running: original_china_bi3...
    Data file: ./out/original/BI_11/data_graph.txt
    Query file: ./out/original/BI_3/original_china_bi3_query_graph.txt
    Output file: 
    Sum of |C(u)|: 488316
    Total Recursive Call Count: 40584
    Number of Matches: 100000
    Filtering Time (ms): 854.313
    Verification Time (ms): 25661.9
    Processing Time (ms): 26516.2
<<< Done!


## Optimized


In [41]:
@dataclass
class OptimizedBI3:
    """BI-3 query (China) against the optimized data graph.

    The query is a chain: country - forum - post - comment - tag - tagclass.
    ``build_query_graph`` writes the query-graph file and ``run_query`` runs
    the matcher and records the reported time in ``time_table``.
    """

    dirname = "BI_3"
    labels = [
        BaseLabel.Country + "China",
        "forum",
        "post",
        "comment",
        "tag",
        "tagclass",
    ]
    edges = [(0, 1), (1, 2), (2, 3), (3, 4), (4, 5)]
    task_name = "optimized_china_bi3"
    query_graph_name = "optimized_china_bi3_query_graph.txt"
    log_name = "optimized_china_bi3_result.txt"
    args = wsl_if_on_windows + [
        "./VEQ_M_100k",
        "-dg",
        BI_11_DG_OPTIMIZED,
        "-qg",
        f"{BI_3_OPTIMIZED_Q_PRE}/{query_graph_name}",
    ]
    # A real dataclass field, so every instance gets its own list instead of
    # all instances sharing one mutable class attribute.
    time_table: list[float] = field(default_factory=list)

    def build_query_graph(self):
        """Write the query-graph file unless it already exists."""
        query_path = f"{BI_3_OPTIMIZED_Q_PRE}/{self.query_graph_name}"
        if os.path.exists(query_path):
            return
        with open(query_path, "w") as f:
            f.write("#0\n")
            f.write(f"{len(self.labels)}\n")
            f.writelines(f"{label}\n" for label in self.labels)
            f.write(f"{len(self.edges)}\n")
            f.writelines(f"{src} {dst}\n" for src, dst in self.edges)

    def run_query(self):
        """Run the matcher (or reuse its existing log) and record the time."""
        log_prefix = f"{OPTIMIZED_LOG_PREFIX}/{self.dirname}"
        os.makedirs(log_prefix, exist_ok=True)
        run_veq_m_100k(
            f"{log_prefix}/{self.log_name}",
            self.task_name,
            self.args,
            self.time_table,
        )


# Runner object for the optimized BI-3 query; used by the following cells.
optimized_query_proc = OptimizedBI3()

In [42]:
""" Build `query graph` """

# Writes the query-graph file only when it is not already on disk.
optimized_query_proc.build_query_graph()

In [43]:
""" Run query """

# Runs the matcher (or reuses an existing result log) and appends the parsed
# time to `optimized_query_proc.time_table`.
optimized_query_proc.run_query()

>>> Running: optimized_china_bi3...
    Data file: ./out/optimized/BI_11/data_graph.txt
    Query file: ./out/optimized/BI_3/optimized_china_bi3_query_graph.txt
    Output file: 
    Sum of |C(u)|: 125935
    Total Recursive Call Count: 47581
    Number of Matches: 100000
    Filtering Time (ms): 309.348
    Verification Time (ms): 24971.3
    Processing Time (ms): 25280.6
<<< Done!


In [44]:
""" Show BI-3 `comparison data-frame` """

# Single-row comparison of the times recorded by the two BI-3 runners.
bi_3_columns = {
    "task": ["china_bi3"],
    "original (ms)": original_query_proc.time_table,
    "optimized (ms)": optimized_query_proc.time_table,
}
df = pl.DataFrame(bi_3_columns)
df

task,original (ms),optimized (ms)
str,f64,f64
"""china_bi3""",26516.2,25280.6
