# Pre Processing

## load Data

In [1]:
from pathlib import Path
import time
import os
import random

import pandas as pd
import requests
from tqdm import tqdm
from PIL import Image
import torch


In [2]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [3]:
edge_index = pd.read_csv('./data/raw/edges.csv')
full_node_features = pd.read_parquet('./data/raw/node_features.parquet')
images = pd.read_csv('./data/raw/images.csv')

In [4]:
node_features = pd.merge(full_node_features, images, on='entity', how='left')
node_features["thumbnail_url"] = node_features["thumbnail_url_x"].fillna(node_features["thumbnail_url_y"])
node_features = node_features.drop(columns=["thumbnail_url_x", "thumbnail_url_y"])
node_features = node_features.reset_index(drop=True)
node_features['node_id'] = node_features.index

In [5]:
name_to_id = {name: node_id for node_id, name in zip(node_features['node_id'], node_features['entity'])}
short_name_to_id = {name: node_id for node_id, name in zip(node_features['node_id'], node_features['short_name'])}
relation_to_id = {rel: i for i, rel in enumerate(edge_index['relation'].unique())}
id_to_relation = {i: rel for rel, i in relation_to_id.items()}
edge_index['src_id'] = edge_index['source'].map(name_to_id)
edge_index['dst_id'] = edge_index['target'].map(name_to_id)
edge_index['rel_id'] = edge_index['relation'].map(relation_to_id)

In [6]:
edge_index

Unnamed: 0,source,relation,target,src_id,dst_id,rel_id
0,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Anarchism,0,0,0
1,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Franciaország,0,9973,0
2,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Pedagogy,0,356,0
3,http://dbpedia.org/resource/Alabama,http://dbpedia.org/ontology/country,http://dbpedia.org/resource/United_States,1,9457,1
4,http://dbpedia.org/resource/Alabama,http://dbpedia.org/ontology/language,http://dbpedia.org/resource/English_Americans,1,10956,2
...,...,...,...,...,...,...
89192,http://dbpedia.org/resource/Feroze_Khan,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,11635,12841,278
89193,http://dbpedia.org/resource/Luc_Besson,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,1575,12841,278
89194,http://dbpedia.org/resource/Gwynedd,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,11637,12841,278
89195,http://dbpedia.org/resource/Square_Enix,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,4928,12841,278


In [7]:
node_features_full = pd.read_csv('./data/raw/node_features_full.csv')
node_features_full[node_features_full['short_name'] == 'Anarchism']

Unnamed: 0,entity,short_name,summary,thumbnail_url,image_path
94,http://dbpedia.org/resource/Anarchism,Anarchism,Anarchism is a political philosophy and moveme...,,images\Anarchism.jpg


## Download all images

In [8]:
for idx, row in node_features_full.iterrows():
    if type(row['thumbnail_url']) == float and not pd.isna(row['thumbnail_url']):
        print(idx, row['entity'], row['short_name'], row['thumbnail_url'])

In [9]:
from tqdm import tqdm
import requests, os, time, random
from pathlib import Path
import pandas as pd

def download_image(url, save_path):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/118.0.0.0 Safari/537.36"
        )
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(save_path, 'wb') as f:
            f.write(response.content)
        return True
    except requests.exceptions.RequestException:
        return False


def download_all_images(sleep_time=1, check_existing=True, limit=None):
    Path('images/').mkdir(parents=True, exist_ok=True)
    skipped, downloaded, failed = 0, 0, 0

    total = len(node_features_full) if limit is None else min(limit, len(node_features_full))

    for idx, row in tqdm(node_features_full.iterrows(), total=total, desc="Downloading images", ncols=100):
        if limit and idx >= limit:
            break

        if pd.notna(row['image_path']) and isinstance(row['image_path'], str):
            try:
                image_path = Path(row['image_path'])
            except Exception:
                failed += 1
                continue

            if os.path.isfile(image_path) and check_existing:
                skipped += 1
                if limit:
                    limit += 1
                continue
            print(f"Downloading image for {row['entity']} from {row['thumbnail_url']} to {image_path}")
            ret = download_image(row['thumbnail_url'], image_path)
            if ret:
                downloaded += 1
                time.sleep(sleep_time + random.uniform(0, 1))
            else:
                failed += 1

    return skipped, downloaded, failed


skipped, downloaded, failed = download_all_images(sleep_time=0.5, check_existing=True)
print(f"Skipped: {skipped}, Downloaded: {downloaded}, Failed: {failed}")


Downloading images:  10%|███▋                                | 1300/12842 [00:00<00:01, 6509.00it/s]

Downloading image for http://dbpedia.org/resource/Anarchism from nan to images\Anarchism.jpg


Downloading images: 100%|███████████████████████████████████| 12842/12842 [00:01<00:00, 8077.33it/s]

Skipped: 11560, Downloaded: 0, Failed: 1





In [10]:
file_path = './images/3_Idiots.jpg'

# Check if the file exists
if os.path.isfile(file_path):
    print(f"The file '{file_path}' exists.")
else:
    print(f"The file '{file_path}' does not exist.")


The file './images/3_Idiots.jpg' exists.


## Sanity check

In [11]:
no_image_counter = 0
try:
    for idx, row in node_features_full.iterrows():
        if pd.notna(row['image_path']) and isinstance(row['image_path'], str):
            image_path = Path(row['image_path'])
            if not image_path.exists() and pd.notna(row['thumbnail_url']):
                # print(f"Missing image for {row['entity']} at {image_path} with URL {row['thumbnail_url']}")
                no_image_counter += 1
except Exception as e:
    print(type(row['image_path']), row['image_path'])

if no_image_counter == 0:
    print("All images are present.")
else:
    print(f"Number of missing images: {no_image_counter}")

All images are present.


In [12]:
node_features_full.tail(3)

Unnamed: 0,entity,short_name,summary,thumbnail_url,image_path
12839,http://dbpedia.org/resource/Athlete,Athlete,An athlete is most commonly a person who compe...,https://upload.wikimedia.org/wikipedia/commons...,images\Athlete.jpg
12840,http://dbpedia.org/resource/Architect,Architect,"An architect is a person who plans, designs, a...",https://upload.wikimedia.org/wikipedia/commons...,images\Architect.jpg
12841,http://dbpedia.org/resource/Species,Species,A species is often defined as the largest grou...,https://upload.wikimedia.org/wikipedia/commons...,images\Species.jpg


## Build node and edge DataFrames

In [13]:
def build_node_mapping(nodes_df: pd.DataFrame):
    # Map entity URL -> node_idx (0..N-1)
    nodes_df = nodes_df.copy()
    nodes_df = nodes_df.reset_index(drop=True)
    nodes_df['node_id'] = nodes_df.index.astype(int)
    entity2id = dict(zip(nodes_df['entity'].tolist(), nodes_df['node_id'].tolist()))
    return nodes_df, entity2id

nodes_df, entity2id = build_node_mapping(node_features_full)


def process_edges(edges_df: pd.DataFrame, entity2id: dict):
    edges = edges_df.copy()
    if 'src_id' in edges.columns and edges['src_id'].notnull().all():
        # ensure ints
        edges['src_id'] = edges['src_id'].astype(int)
        edges['dst_id'] = edges['dst_id'].astype(int)
    else:
        edges['src_id'] = edges['source'].map(entity2id).astype(int)
        edges['dst_id'] = edges['target'].map(entity2id).astype(int)    
    return edges

edges_df = process_edges(edge_index, entity2id)
nodes_df.to_csv('./data/processed/nodes.csv', index=False)
edges_df.to_csv('./data/processed/edges.csv', index=False)
edges_df

Unnamed: 0,source,relation,target,src_id,dst_id,rel_id
0,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Anarchism,0,0,0
1,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Franciaország,0,9973,0
2,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Pedagogy,0,356,0
3,http://dbpedia.org/resource/Alabama,http://dbpedia.org/ontology/country,http://dbpedia.org/resource/United_States,1,9457,1
4,http://dbpedia.org/resource/Alabama,http://dbpedia.org/ontology/language,http://dbpedia.org/resource/English_Americans,1,10956,2
...,...,...,...,...,...,...
89192,http://dbpedia.org/resource/Feroze_Khan,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,11635,12841,278
89193,http://dbpedia.org/resource/Luc_Besson,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,1575,12841,278
89194,http://dbpedia.org/resource/Gwynedd,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,11637,12841,278
89195,http://dbpedia.org/resource/Square_Enix,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,4928,12841,278


## Generate Meta Relations