# Prepare


In [1]:
from prelude import *

In [2]:
""" Load `vertices/edges` """

from polars import DataFrame

# Raw tables keyed by file stem: vertex tables by name, edge tables by
# `(src, relationship, dst)`.
vertices = dict[str, DataFrame]()
raw_edges = dict[tuple[str, str, str], DataFrame]()
# Per-namespace `origin_id -> uni_id` tables; filled in by the next cells.
switch_namespace = dict[str, dict[int, int]]()

# `glob.glob` returns paths in a filesystem-dependent order. Global vertex ids
# are later handed out in the iteration order of `vertices`, so the files are
# visited in sorted order to make the id assignment reproducible.
for file in sorted(glob.glob(f"{RAW_DATA_PREFIX}/*.csv")):
    df_name = os.path.basename(file).split(".")[0]
    if "_" in df_name:
        # Edge files are named `<src>_<relationship>_<dst>.csv`.
        src, relationship, dst = df_name.split("_")
        raw_edges[(src, relationship, dst)] = pl.read_csv(file)
    else:
        vertices[df_name] = pl.read_csv(file)


# Row totals, used as progress-bar totals in later cells.
vertex_num = sum(len(df) for df in vertices.values())
edge_num = sum(len(df) for df in raw_edges.values())

In [3]:
""" Initialize `switch_namespace` """

# One empty `origin_id -> uni_id` table per vertex ID namespace; the namespace
# is derived from the first column header of each vertex table.
switch_namespace.update(
    {get_namespace(frame.columns[0]): {} for frame in vertices.values()}
)

switch_namespace

{'Commentid': {},
 'Forumid': {},
 'Organisationid': {},
 'Personid': {},
 'Placeid': {},
 'Postid': {},
 'Tagid': {},
 'TagClassid': {}}

In [4]:
""" Re-arrange all `:ID($AnyNamespace)` """

# Next unused global id; ids are handed out consecutively starting from 0,
# in the iteration order of `vertices` and then row order within each table.
curr_global_id = 0

with tqdm(desc="Mapping `origin_id` to `uni_id`", total=vertex_num) as bar:
    for df in vertices.values():
        base_label = get_namespace(df.columns[0])
        # Named `id_map` rather than `map` so the builtin `map` is not
        # rebound for the rest of the kernel session.
        id_map = switch_namespace[base_label]
        for row in df.rows():
            # The first column holds the vertex's original id.
            id_map[int(row[0])] = curr_global_id
            curr_global_id += 1
            bar.update(1)

assert curr_global_id == vertex_num
# The counter check above holds by construction. What can actually go wrong is
# a repeated original id inside one namespace, which silently overwrites an
# entry and leaves a gap in the global id range.
mapped_num = sum(len(table) for table in switch_namespace.values())
assert mapped_num == vertex_num, (
    f"{vertex_num - mapped_num} vertex id(s) collide within their namespace"
)
set(switch_namespace.keys())

Mapping `origin_id` to `uni_id`:   0%|          | 0/3181724 [00:00<?, ?it/s]

Mapping `origin_id` to `uni_id`: 100%|██████████| 3181724/3181724 [00:03<00:00, 920189.47it/s] 


{'Commentid',
 'Forumid',
 'Organisationid',
 'Personid',
 'Placeid',
 'Postid',
 'TagClassid',
 'Tagid'}

## Build the `original` data graph


In [5]:
from ldbc_tools.data_graph import *

# Global id -> label for every vertex, plus the set of distinct labels seen.
labels: dict[int, str] = {}
label_set: set[str] = set()


def place_op(df: DataFrame):
    """Label every row of the place table.

    Reads the id column (first column), `name` and `:TYPE`. A place whose name
    is listed in `explicit_place_labels` is labelled `BaseLabel.Country + name`;
    any other place keeps its `:TYPE` value as label.

    Side effects: writes into the global `labels` and `label_set`, and advances
    the global progress `bar` once per row.
    """
    id_column = df.columns[0]
    uid_of = switch_namespace[get_namespace(id_column)]
    selected = df.select(
        [
            pl.col(id_column),
            pl.col("name"),
            pl.col(":TYPE"),
        ]
    )
    for raw_id, raw_name, raw_type in selected.rows():
        place_name = str(raw_name)
        uid = uid_of[int(raw_id)]
        if place_name in explicit_place_labels:
            label = BaseLabel.Country + place_name
        else:
            label = str(raw_type)
        labels[uid] = label
        label_set.add(label)
        bar.update(1)


def person_op(df: DataFrame):
    """Label every row of the person table.

    Reads the id column (first column) and `:LABEL`. A person whose original id
    is listed in `explicit_personId_labels` is labelled
    `BaseLabel.PersonId + str(id)`; everyone else keeps the `:LABEL` value.

    Side effects: writes into the global `labels` and `label_set`, and advances
    the global progress `bar` once per row.
    """
    id_column = df.columns[0]
    uid_of = switch_namespace[get_namespace(id_column)]
    selected = df.select([pl.col(id_column), pl.col(":LABEL")])
    for raw_id, raw_label in selected.rows():
        person_id = int(raw_id)
        uid = uid_of[person_id]
        if person_id in explicit_personId_labels:
            label = BaseLabel.PersonId + str(person_id)
        else:
            label = str(raw_label)
        labels[uid] = label
        label_set.add(label)
        bar.update(1)


def tagclass_op(df: DataFrame):
    """Label every row of the tagclass table.

    Reads the id column (first column), the second column and `:LABEL`. When the
    second column's value is listed in `explicit_tagclass_labels`, the vertex is
    labelled `BaseLabel.TagClass + <that value>`; otherwise it keeps `:LABEL`.

    Side effects: writes into the global `labels` and `label_set`, and advances
    the global progress `bar` once per row.
    """
    id_column = df.columns[0]
    # Presumably the tag class name column -- it is selected by position only.
    name_column = df.columns[1]
    uid_of = switch_namespace[get_namespace(id_column)]
    selected = df.select(
        [
            pl.col(id_column),
            pl.col(name_column),
            pl.col(":LABEL"),
        ]
    )
    for raw_id, raw_name, raw_label in selected.rows():
        uid = uid_of[int(raw_id)]
        class_name = str(raw_name)
        if class_name in explicit_tagclass_labels:
            label = BaseLabel.TagClass + class_name
        else:
            label = str(raw_label)
        labels[uid] = label
        label_set.add(label)
        bar.update(1)


def forum_op(df: DataFrame):
    """Label every row of the forum table.

    Reads the id column (first column), the third column and `:LABEL`. When the
    third column's integer value is listed in `explicit_date_labels`, the
    vertex is labelled `BaseLabel.Date + str(value)`; otherwise it keeps
    `:LABEL`.

    Side effects: writes into the global `labels` and `label_set`, and advances
    the global progress `bar` once per row.
    """
    namespace = get_namespace(df.columns[0])
    id_map = switch_namespace[namespace]
    selected = df.select(
        [
            pl.col(df.columns[0]),
            # Presumably the creation-date column -- it is selected by position only.
            pl.col(df.columns[2]),
            pl.col(":LABEL"),
        ]
    )
    for oid, creationDate, general_label in selected.rows():
        uid = id_map[int(oid)]
        label = (
            str(general_label)
            if int(creationDate) not in explicit_date_labels
            else BaseLabel.Date + str(creationDate)
        )
        labels[uid] = label
        # Record the label that was actually assigned, so `label_set` also
        # lists date-specific labels (as the sibling `*_op` functions do for
        # their explicit labels).
        label_set.add(label)
        bar.update(1)


def normal_op(df: DataFrame):
    """Label every row of a vertex table that needs no special handling.

    The label is taken from `:TYPE` when the table has that column, otherwise
    from `:LABEL`.

    Side effects: writes into the global `labels` and `label_set`, and advances
    the global progress `bar` once per row.
    """
    id_column = df.columns[0]
    label_column = ":TYPE" if ":TYPE" in df.columns else ":LABEL"
    uid_of = switch_namespace[get_namespace(id_column)]
    selected = df.select([pl.col(id_column), pl.col(label_column)])
    for raw_id, raw_label in selected.rows():
        uid = uid_of[int(raw_id)]
        label = str(raw_label)
        labels[uid] = label
        label_set.add(label)
        bar.update(1)


def vertex_op(df_name: str, df: DataFrame):
    """Run the labelling routine that matches the vertex table's name.

    `place`, `person`, `tagclass` and `forum` have dedicated routines; every
    other table name goes through `normal_op`.
    """
    if df_name == "place":
        place_op(df)
        return
    if df_name == "person":
        person_op(df)
        return
    if df_name == "tagclass":
        tagclass_op(df)
        return
    if df_name == "forum":
        forum_op(df)
        return
    normal_op(df)

In [6]:
""" Build map of `vertex.uid -> label` """

# The progress bar has to be bound to the global name `bar`: the `*_op`
# functions look it up as a global and call `bar.update(1)` once per row.
with tqdm(desc="Build map of `vertex.uni_id -> label`", total=vertex_num) as bar:
    for table_name, table in vertices.items():
        vertex_op(table_name, table)

label_set

Build map of `vertex.uni_id -> label`: 100%|██████████| 3181724/3181724 [00:02<00:00, 1576756.24it/s]


{'city',
 'comment',
 'company',
 'continent',
 'country',
 'country(Belgium)',
 'country(Chile)',
 'country(China)',
 'country(Cuba)',
 'country(France)',
 'country(Greece)',
 'country(India)',
 'country(Tunisia)',
 'forum',
 'person',
 'post',
 'tag',
 'tagClass(NascarDriver)',
 'tagClass(Politician)',
 'tagClass(President)',
 'tagClass(Saint)',
 'tagClass(Song)',
 'tagClass(Thing)',
 'tagclass',
 'university'}

In [7]:
""" Build edges in format: `(src_id, dst_id)` """

# Directed edges as `(src_uni_id, dst_uni_id)`; being a set, repeated pairs
# collapse into one entry.
edges = set[tuple[int, int]]()

with tqdm(desc="Build edges in format: `(src_id, dst_id)`", total=edge_num) as bar:
    # Only the tables are needed here; the `(src, relationship, dst)` keys are not.
    for df in raw_edges.values():
        src_column, dst_column = df.columns[0], df.columns[1]
        # The endpoint namespaces come from the first two column headers.
        src_map = switch_namespace[get_namespace(src_column)]
        dst_map = switch_namespace[get_namespace(dst_column)]
        # Named `endpoints` rather than `slice` so the builtin `slice` is not
        # rebound for the rest of the kernel session.
        endpoints = df.select(
            [
                pl.col(src_column),
                pl.col(dst_column),
            ]
        )
        for src_id, dst_id in endpoints.rows():
            src_uni_id = src_map[int(src_id)]
            dst_uni_id = dst_map[int(dst_id)]
            edges.add((src_uni_id, dst_uni_id))
            bar.update(1)

Build edges in format: `(src_id, dst_id)`: 100%|██████████| 17256038/17256038 [00:17<00:00, 962410.00it/s] 


In [8]:
""" Write into `data_graph.txt` """

if not os.path.exists(COMMON_DG):
    # Make sure the output directory exists before writing.
    os.makedirs(os.path.dirname(COMMON_DG) or ".", exist_ok=True)
    # Write to a temporary sibling and rename it into place at the end. An
    # interrupted run then never leaves a truncated `COMMON_DG` behind, which
    # the existence check above would otherwise treat as a finished file.
    tmp_path = f"{COMMON_DG}.tmp"
    try:
        with open(tmp_path, "w") as f:
            # Layout: header line, vertex count, one label per vertex (by
            # global id), edge count, one `src dst` pair per edge.
            f.write("#0\n")
            f.write(f"{len(labels)}\n")
            with tqdm(
                desc=f"Writing `labels` into `{COMMON_DG}`", total=len(labels)
            ) as bar:
                # Relies on the global ids being exactly 0..len(labels)-1.
                for i in range(len(labels)):
                    f.write(f"{labels[i]}\n")
                    bar.update(1)
            f.write(f"{len(edges)}\n")
            with tqdm(
                desc=f"Writing `edges` into `{COMMON_DG}`", total=len(edges)
            ) as bar:
                for src, dst in edges:
                    f.write(f"{src} {dst}\n")
                    bar.update(1)
        os.replace(tmp_path, COMMON_DG)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
else:
    print(f"File `{COMMON_DG}` already exists")

Writing `labels` into `./out/original/data_graph.txt`: 100%|██████████| 3181724/3181724 [00:02<00:00, 1584289.41it/s]
Writing `edges` into `./out/original/data_graph.txt`: 100%|██████████| 17256033/17256033 [00:19<00:00, 887977.23it/s]


## Build the `optimized` data graph


In [9]:
""" Load all `index edge` """

# Index-edge tables keyed by file stem (the file name up to its first dot).
index_edges: dict[str, DataFrame] = {
    os.path.basename(csv_path).split(".")[0]: pl.read_csv(csv_path)
    for csv_path in glob.glob(f"{INDEX_EDGES_PREFIX}/*.csv")
}

# Total row count across all index-edge tables; used as the progress-bar total.
index_edge_num = sum(len(table) for table in index_edges.values())

In [10]:
""" Add `index edge` into `edges` """

# NOTE: this extends `edges` in place, so from here on `edges` holds the
# original edges plus the index edges.
with tqdm(desc="Adding `index edge` into `edges`", total=index_edge_num) as bar:
    for df in index_edges.values():
        # Index files may repeat a pair (see `./data/index/forum_country.csv`
        # for example). `edges` is a set, so `add` already ignores repeats and
        # no separate per-file "seen" set is needed.
        src_namespace = get_namespace(df.columns[0])
        dst_namespace = get_namespace(df.columns[1])
        src_map = switch_namespace[src_namespace]
        dst_map = switch_namespace[dst_namespace]
        # Named `endpoints` rather than `slice` so the builtin `slice` is not
        # rebound for the rest of the kernel session.
        endpoints = df.select(
            [
                pl.col(df.columns[0]),
                pl.col(df.columns[1]),
            ]
        )
        for src_id, dst_id in endpoints.rows():
            src_uni_id = src_map[int(src_id)]
            dst_uni_id = dst_map[int(dst_id)]
            edges.add((src_uni_id, dst_uni_id))
            bar.update(1)

Adding `index edge` into `edges`: 100%|██████████| 20605002/20605002 [00:32<00:00, 639311.85it/s] 


In [11]:
""" Write into `data_graph.txt` """

if not os.path.exists(COMMON_DG_OPTIMIZED):
    # Make sure the output directory exists before writing.
    os.makedirs(os.path.dirname(COMMON_DG_OPTIMIZED) or ".", exist_ok=True)
    # Write to a temporary sibling and rename it into place at the end. An
    # interrupted run then never leaves a truncated `COMMON_DG_OPTIMIZED`
    # behind, which the existence check above would otherwise treat as a
    # finished file.
    tmp_path = f"{COMMON_DG_OPTIMIZED}.tmp"
    try:
        with open(tmp_path, "w") as f:
            # Layout: header line, vertex count, one label per vertex (by
            # global id), edge count, one `src dst` pair per edge.
            f.write("#0\n")
            f.write(f"{len(labels)}\n")
            with tqdm(
                desc=f"Writing `labels` into `{COMMON_DG_OPTIMIZED}`",
                total=len(labels),
            ) as bar:
                # Relies on the global ids being exactly 0..len(labels)-1.
                for i in range(len(labels)):
                    f.write(f"{labels[i]}\n")
                    bar.update(1)
            f.write(f"{len(edges)}\n")
            with tqdm(
                desc=f"Writing `edges` into `{COMMON_DG_OPTIMIZED}`", total=len(edges)
            ) as bar:
                for src, dst in edges:
                    f.write(f"{src} {dst}\n")
                    bar.update(1)
        os.replace(tmp_path, COMMON_DG_OPTIMIZED)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
else:
    print(f"File `{COMMON_DG_OPTIMIZED}` already exists")

Writing `labels` into `./out/optimized/data_graph.txt`: 100%|██████████| 3181724/3181724 [00:02<00:00, 1557418.00it/s]
Writing `edges` into `./out/optimized/data_graph.txt`: 100%|██████████| 34775921/34775921 [00:38<00:00, 909350.60it/s]
