# Pre Processing

## load Data

In [8]:
import pandas as pd
import torch
import requests
from pathlib import Path
import time
import random
from tqdm import tqdm
import os
from PIL import Image


In [9]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [10]:
edge_index = pd.read_csv('./data/raw/edges.csv')
full_node_features = pd.read_parquet('./data/raw/node_features.parquet')
images = pd.read_csv('./data/raw/images.csv')

In [11]:
node_features = pd.merge(full_node_features, images, on='entity', how='left')
node_features["thumbnail_url"] = node_features["thumbnail_url_x"].fillna(node_features["thumbnail_url_y"])
node_features = node_features.drop(columns=["thumbnail_url_x", "thumbnail_url_y"])
node_features = node_features.reset_index(drop=True)
node_features['node_id'] = node_features.index

In [23]:
name_to_id = {name: node_id for node_id, name in zip(node_features['node_id'], node_features['entity'])}
short_name_to_id = {name: node_id for node_id, name in zip(node_features['node_id'], node_features['short_name'])}
relation_to_id = {rel: i for i, rel in enumerate(edge_index['relation'].unique())}
id_to_relation = {i: rel for rel, i in relation_to_id.items()}
edge_index['src_id'] = edge_index['source'].map(name_to_id)
edge_index['dst_id'] = edge_index['target'].map(name_to_id)
edge_index['rel_id'] = edge_index['relation'].map(relation_to_id)

In [13]:
edge_index

Unnamed: 0,source,relation,target,src_id,dst_id,rel_id
0,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Anarchism,0,0,0
1,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Franciaország,0,9973,0
2,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Pedagogy,0,356,0
3,http://dbpedia.org/resource/Alabama,http://dbpedia.org/ontology/country,http://dbpedia.org/resource/United_States,1,9457,1
4,http://dbpedia.org/resource/Alabama,http://dbpedia.org/ontology/language,http://dbpedia.org/resource/English_Americans,1,10956,2
...,...,...,...,...,...,...
89192,http://dbpedia.org/resource/Feroze_Khan,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,11635,12841,278
89193,http://dbpedia.org/resource/Luc_Besson,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,1575,12841,278
89194,http://dbpedia.org/resource/Gwynedd,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,11637,12841,278
89195,http://dbpedia.org/resource/Square_Enix,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,4928,12841,278


In [14]:
node_features_full = pd.read_csv('./data/raw/node_features_full.csv')
node_features_full[node_features_full['short_name'] == 'Anarchism']

Unnamed: 0,entity,short_name,summary,thumbnail_url,image_path
94,http://dbpedia.org/resource/Anarchism,Anarchism,Anarchism is a political philosophy and moveme...,,images\Anarchism.jpg


## Download all images

In [10]:
for idx, row in node_features_full.iterrows():
    if type(row['thumbnail_url']) == float and not pd.isna(row['thumbnail_url']):
        print(idx, row['entity'], row['short_name'], row['thumbnail_url'])

In [21]:
from tqdm import tqdm
import requests, os, time, random
from pathlib import Path
import pandas as pd

def download_image(url, save_path):
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/118.0.0.0 Safari/537.36"
        )
    }

    try:
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()
        save_path = Path(save_path)
        save_path.parent.mkdir(parents=True, exist_ok=True)
        with open(save_path, 'wb') as f:
            f.write(response.content)
        return True
    except requests.exceptions.RequestException:
        return False


def download_all_images(sleep_time=1, check_existing=True, limit=None):
    Path('images/').mkdir(parents=True, exist_ok=True)
    skipped, downloaded, failed = 0, 0, 0

    total = len(node_features_full) if limit is None else min(limit, len(node_features_full))

    for idx, row in tqdm(node_features_full.iterrows(), total=total, desc="Downloading images", ncols=100):
        if limit and idx >= limit:
            break

        if pd.notna(row['image_path']) and isinstance(row['image_path'], str):
            try:
                image_path = Path(row['image_path'])
            except Exception:
                failed += 1
                continue

            if os.path.isfile(image_path) and check_existing:
                skipped += 1
                if limit:
                    limit += 1
                continue
            print(f"Downloading image for {row['entity']} from {row['thumbnail_url']} to {image_path}")
            ret = download_image(row['thumbnail_url'], image_path)
            if ret:
                downloaded += 1
                time.sleep(sleep_time + random.uniform(0, 1))
            else:
                failed += 1

    return skipped, downloaded, failed


skipped, downloaded, failed = download_all_images(sleep_time=0.5, check_existing=True)
print(f"Skipped: {skipped}, Downloaded: {downloaded}, Failed: {failed}")




[A[A

[A[A

[A[A

Downloading image for http://dbpedia.org/resource/Anarchism from nan to images\Anarchism.jpg
Downloading image for http://dbpedia.org/resource/Star_Trek:_Deep_Space_Nine from nan to images\Star_Trek:_Deep_Space_Nine.jpg
Downloading image for http://dbpedia.org/resource/Star_Trek:_The_Original_Series from nan to images\Star_Trek:_The_Original_Series.jpg
Downloading image for http://dbpedia.org/resource/Star_Trek:_Enterprise from nan to images\Star_Trek:_Enterprise.jpg
Downloading image for http://dbpedia.org/resource/Star_Trek:_Voyager from nan to images\Star_Trek:_Voyager.jpg




[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

[A[A

Downloading image for http://dbpedia.org/resource/G.I._Joe:_The_Rise_of_Cobra from https://upload.wikimedia.org/wikipedia/en/7/78/G.I._Joe_-_The_Rise_of_Cobra_%282009_film%29.jpg to images\G.I._Joe:_The_Rise_of_Cobra.jpg
Downloading image for http://dbpedia.org/resource/Cirque_du_Freak:_The_Vampire's_Assistant from https://upload.wikimedia.org/wikipedia/en/b/ba/Vampires_assistant.jpg to images\Cirque_du_Freak:_The_Vampire's_Assistant.jpg
Downloading image for http://dbpedia.org/resource/Naruto_Shippuden_the_Movie:_Bonds from https://upload.wikimedia.org/wikipedia/en/0/04/Naruto_Shippuden_the_Movie_Bonds.jpg to images\Naruto_Shippuden_the_Movie:_Bonds.jpg
Downloading image for http://dbpedia.org/resource/Star_Wars:_The_Clone_Wars_(film) from https://upload.wikimedia.org/wikipedia/en/7/72/Star_wars_the_clone_wars.jpg to images\Star_Wars:_The_Clone_Wars_(film).jpg
Downloading image for http://dbpedia.org/resource/Resident_Evil:_Afterlife from https://upload.wikimedia.org/wikipedia/en/e/ea



[A[A

Downloading image for http://dbpedia.org/resource/2001:_A_Space_Odyssey_(film) from https://upload.wikimedia.org/wikipedia/en/1/11/2001_A_Space_Odyssey_%281968%29.png to images\2001:_A_Space_Odyssey_(film).jpg




[A[A

Downloading image for http://dbpedia.org/resource/Pirates_of_the_Caribbean:_On_Stranger_Tides from https://upload.wikimedia.org/wikipedia/en/5/5e/Pirates_of_the_Caribbean_-_On_Stranger_Tides.png to images\Pirates_of_the_Caribbean:_On_Stranger_Tides.jpg
Downloading image for http://dbpedia.org/resource/Sherlock_Holmes:_A_Game_of_Shadows from https://upload.wikimedia.org/wikipedia/en/thumb/5/53/Sherlock_Holmes2Poster.jpg/330px-Sherlock_Holmes2Poster.jpg to images\Sherlock_Holmes:_A_Game_of_Shadows.jpg
Downloading image for http://dbpedia.org/resource/Bucky_Larson:_Born_to_Be_a_Star from https://upload.wikimedia.org/wikipedia/en/1/1f/Born_to_Be_a_Star_Poster.jpg to images\Bucky_Larson:_Born_to_Be_a_Star.jpg
Downloading image for http://dbpedia.org/resource/Captain_America:_The_First_Avenger from https://upload.wikimedia.org/wikipedia/en/3/37/Captain_America_The_First_Avenger_poster.jpg to images\Captain_America:_The_First_Avenger.jpg
Downloading image for http://dbpedia.org/resource/Georg



[A[A

Downloading image for http://dbpedia.org/resource/Alvin_and_the_Chipmunks:_Chipwrecked from https://upload.wikimedia.org/wikipedia/en/c/cc/Alvin_and_the_Chipmunks_3_teaser.jpg to images\Alvin_and_the_Chipmunks:_Chipwrecked.jpg




[A[A

Downloading image for http://dbpedia.org/resource/The_Hobbit:_An_Unexpected_Journey from https://upload.wikimedia.org/wikipedia/en/b/b3/The_Hobbit-_An_Unexpected_Journey.jpeg to images\The_Hobbit:_An_Unexpected_Journey.jpg




[A[A

Downloading images: 100%|████████████████████████████████████| 12842/12842 [00:31<00:00, 405.83it/s]

Skipped: 11534, Downloaded: 22, Failed: 5





In [11]:
import os

# Define the path to the file
file_path = './images/3_Idiots.jpg'

# Check if the file exists
if os.path.isfile(file_path):
    print(f"The file '{file_path}' exists.")
else:
    print(f"The file '{file_path}' does not exist.")


The file './images/3_Idiots.jpg' exists.


## Sanity check

In [23]:
no_image_counter = 0
try:
    for idx, row in node_features_full.iterrows():
        if pd.notna(row['image_path']) and isinstance(row['image_path'], str):
            image_path = Path(row['image_path'])
            if not image_path.exists() and pd.notna(row['thumbnail_url']):
                # print(f"Missing image for {row['entity']} at {image_path} with URL {row['thumbnail_url']}")
                no_image_counter += 1
except Exception as e:
    print(type(row['image_path']), row['image_path'])

if no_image_counter == 0:
    print("All images are present.")
else:
    print(f"Number of missing images: {no_image_counter}")

All images are present.


In [24]:
node_features_full.tail()

Unnamed: 0,entity,short_name,summary,thumbnail_url,image_path
12837,http://dbpedia.org/resource/Organization,Organization,An organization or organisation is an entity—s...,https://upload.wikimedia.org/wikipedia/commons...,images\Organization.jpg
12838,http://dbpedia.org/resource/Astronaut,Astronaut,"An astronaut is a person trained, equipped, an...",https://upload.wikimedia.org/wikipedia/commons...,images\Astronaut.jpg
12839,http://dbpedia.org/resource/Athlete,Athlete,An athlete is most commonly a person who compe...,https://upload.wikimedia.org/wikipedia/commons...,images\Athlete.jpg
12840,http://dbpedia.org/resource/Architect,Architect,"An architect is a person who plans, designs, a...",https://upload.wikimedia.org/wikipedia/commons...,images\Architect.jpg
12841,http://dbpedia.org/resource/Species,Species,A species is often defined as the largest grou...,https://upload.wikimedia.org/wikipedia/commons...,images\Species.jpg


## Build node and edge DataFrames

In [16]:
edges_df

Unnamed: 0,source,relation,target,src_id,dst_id,rel_id
0,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Anarchism,0,0,0
1,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Franciaország,0,9973,0
2,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Pedagogy,0,356,0
3,http://dbpedia.org/resource/Alabama,http://dbpedia.org/ontology/country,http://dbpedia.org/resource/United_States,1,9457,1
4,http://dbpedia.org/resource/Alabama,http://dbpedia.org/ontology/language,http://dbpedia.org/resource/English_Americans,1,10956,2
...,...,...,...,...,...,...
89192,http://dbpedia.org/resource/Feroze_Khan,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,11635,12841,278
89193,http://dbpedia.org/resource/Luc_Besson,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,1575,12841,278
89194,http://dbpedia.org/resource/Gwynedd,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,11637,12841,278
89195,http://dbpedia.org/resource/Square_Enix,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,4928,12841,278


In [37]:
def build_node_mapping(nodes_df: pd.DataFrame):
    # Map entity URL -> node_idx (0..N-1)
    nodes_df = nodes_df.copy()
    nodes_df = nodes_df.reset_index(drop=True)
    nodes_df['node_id'] = nodes_df.index.astype(int)
    entity2id = dict(zip(nodes_df['entity'].tolist(), nodes_df['node_id'].tolist()))
    return nodes_df, entity2id

nodes_df, entity2id = build_node_mapping(node_features_full)


def process_edges(edges_df: pd.DataFrame, entity2id: dict):
    edges = edges_df.copy()
    if 'src_id' in edges.columns and edges['src_id'].notnull().all():
        # ensure ints
        edges['src_id'] = edges['src_id'].astype(int)
        edges['dst_id'] = edges['dst_id'].astype(int)
    else:
        edges['src_id'] = edges['source'].map(entity2id).astype(int)
        edges['dst_id'] = edges['target'].map(entity2id).astype(int)    
    return edges

edges_df = process_edges(edge_index, entity2id)
nodes_df.to_csv('./data/processed/nodes.csv', index=False)
edges_df.to_csv('./data/processed/edges.csv', index=False)
edges_df

Unnamed: 0,source,relation,target,src_id,dst_id,rel_id
0,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Anarchism,0,0,0
1,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Franciaország,0,9973,0
2,http://dbpedia.org/resource/Anarchism,http://www.w3.org/2000/01/rdf-schema#seeAlso,http://dbpedia.org/resource/Pedagogy,0,356,0
3,http://dbpedia.org/resource/Alabama,http://dbpedia.org/ontology/country,http://dbpedia.org/resource/United_States,1,9457,1
4,http://dbpedia.org/resource/Alabama,http://dbpedia.org/ontology/language,http://dbpedia.org/resource/English_Americans,1,10956,2
...,...,...,...,...,...,...
89192,http://dbpedia.org/resource/Feroze_Khan,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,11635,12841,278
89193,http://dbpedia.org/resource/Luc_Besson,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,1575,12841,278
89194,http://dbpedia.org/resource/Gwynedd,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,11637,12841,278
89195,http://dbpedia.org/resource/Square_Enix,http://www.w3.org/1999/02/22-rdf-syntax-ns#type,http://dbpedia.org/resource/Athlete,4928,12841,278


## Generate Meta Relations

In [39]:
# print all relations names
for rel_id in sorted(id_to_relation.keys()):
    print(f"{rel_id}: {id_to_relation[rel_id].split('/')[-1]}")

0: rdf-schema#seeAlso
1: country
2: language
3: capital
4: largestCity
5: deathPlace
6: restingPlace
7: party
8: successor
9: region
10: predecessor
11: presenter
12: birthPlace
13: mainInterest
14: influencedBy
15: citizenship
16: subject
17: almaMater
18: award
19: governmentType
20: officialLanguage
21: timeZone
22: currency
23: ethnicGroup
24: leader
25: education
26: era
27: influenced
28: residence
29: field
30: place
31: territory
32: commander
33: type
34: outflow
35: occupation
36: religion
37: leaderParty
38: governingBody
39: isPartOf
40: training
41: movement
42: locationCity
43: industry
44: product
45: anthem
46: philosophicalSchool
47: genre
48: parent
49: location
50: parentCompany
51: headquarter
52: federalState
53: otherParty
54: profession
55: militaryRank
56: battle
57: vicePresident
58: militaryBranch
59: president
60: notableIdea
61: spouse
62: child
63: relative
64: owl#differentFrom
65: city
66: state
67: athletics
68: affiliation
69: spokenIn
70: languageFamil