In [6]:
import pandas as pd
import pathlib 
import torch
import pandas as pd
from torch_geometric.data import HeteroData
import unicodedata
import pickle

data_fp = '../data/PROCESSED/'
model_fp = '../models'

# Make sure both output locations exist before anything is written to them.
# Missing parent directories are created, and an already-existing directory
# is not treated as an error.
for _out_dir in (model_fp, data_fp):
    pathlib.Path(_out_dir).mkdir(parents=True, exist_ok=True)

In [7]:
# Read the raw knowledge-graph edge list.
# low_memory=False lets pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which is also how the filtered CSV is read later
# in this notebook.
df = pd.read_csv("../data/RAW/kg.csv", low_memory=False)

  df=pd.read_csv("../data/RAW/kg.csv")


In [8]:
# Collect every (index, type) pair that appears on either endpoint of an edge,
# keeping each distinct pair once.
node_types = pd.concat(
    [
        df[[f'{side}_index', f'{side}_type']].rename(
            columns={f'{side}_index': 'index', f'{side}_type': 'type'}
        )
        for side in ('x', 'y')
    ]
).drop_duplicates()

# Number of distinct nodes per node type
print("\nNumber of different node types:")
print(node_types['type'].value_counts())

# Number of edges per relation type (computed once, reported twice below)
unique_relations = df["relation"].value_counts()
print("\nNumber of different edge types:")
print(unique_relations)

# Distinct node count: union of the indices seen on either endpoint.
# NOTE(review): assumes an index identifies a single node across all node
# types -- confirm against the dataset description.
unique_nodes = set(df['x_index']) | set(df['y_index'])
print("Total unique nodes:", len(unique_nodes))

# Distinct edge count, where an edge is a (source, target, relation) triplet
unique_edges = df.loc[:, ['x_index', 'y_index', 'relation']].drop_duplicates()
print("Total unique edges:", len(unique_edges))

# Per-relation edge counts
print("\nUnique Relations:\n", unique_relations)


Number of different node types:
type
biological_process    28642
gene/protein          27671
disease               17080
effect/phenotype      15311
anatomy               14035
molecular_function    11169
drug                   7957
cellular_component     4176
pathway                2516
exposure                818
Name: count, dtype: int64

Number of different edge types:
relation
anatomy_protein_present       3036406
drug_drug                     2672628
protein_protein                642150
disease_phenotype_positive     300634
bioprocess_protein             289610
cellcomp_protein               166804
disease_protein                160822
molfunc_protein                139060
drug_effect                    129568
bioprocess_bioprocess          105772
pathway_protein                 85292
disease_disease                 64388
contraindication                61350
drug_protein                    51306
anatomy_protein_absent          39774
phenotype_phenotype             37472
anatom

In [9]:
# Relationships to extract. Each entry must equal a value of the `relation`
# column exactly: isin() performs no trimming, so stray whitespace in a name
# would silently exclude that relation from the output.
selected_relations = [
    "protein_protein",
    "disease_phenotype_positive",
    "disease_phenotype_negative",
    "bioprocess_protein",
    "disease_protein",
    "drug_effect",
    "pathway_protein",
    "disease_disease",
    "contraindication",
    "drug_protein",
    "indication",
    "exposure_disease",
    "anatomy_protein_absent"
]

# Report any requested relation that does not occur in the data, rather than
# letting it disappear from the filtered output unnoticed.
missing_relations = sorted(set(selected_relations) - set(df["relation"].unique()))
if missing_relations:
    print("Warning: selected relations not found in data:", missing_relations)

# Filter the dataframe down to the selected relations
filtered_df = df[df["relation"].isin(selected_relations)]

# Save filtered dataframe (without the pandas row index)
filtered_df.to_csv(f"{data_fp}/filtered_primekg.csv", index=False)


### CREATING THE GRAPH DATA

In [10]:
# Load the filtered edge list from the same data_fp-based location the
# filtering step writes it to, so the two stay in sync if data_fp changes.
# low_memory=False: infer column dtypes from the whole file at once.
df = pd.read_csv(f"{data_fp}/filtered_primekg.csv", low_memory=False)

# Clean names
def clean_text(text):
    """Return a canonical form of ``text`` for use as a lookup key.

    The value is converted with ``str()`` first, so non-string input is
    accepted (a float NaN therefore becomes the literal text ``"nan"``).
    The string is then NFKD-normalized, stripped of leading and trailing
    whitespace, and lower-cased.
    """
    normalized = unicodedata.normalize("NFKD", str(text))
    trimmed = normalized.strip()
    return trimmed.lower()

# Apply the same name clean-up to both endpoint name columns
for _name_col in ("x_name", "y_name"):
    df[_name_col] = df[_name_col].apply(clean_text)

# Normalize types: collapse the raw node-type labels onto short canonical
# names. Labels that are not keys of this table come out of .map() as NaN.
node_type_mapping = {
    "gene/protein": "protein",
    "chemical/drug": "drug",
    "drug": "drug",
    "disease": "disease",
    "biological_process": "bioprocess",
    "bioprocess": "bioprocess",
    "pathway": "pathway",
    "effect/phenotype": "phenotype",
}
for _type_col in ("x_type", "y_type"):
    df[_type_col] = df[_type_col].map(node_type_mapping)

# Extract nodes: for every canonical node type, gather the distinct names that
# occur under that type on either endpoint of an edge.
node_sets = {}
for t in dict.fromkeys(node_type_mapping.values()):
    x_nodes = set(df.loc[df["x_type"] == t, "x_name"].dropna())
    y_nodes = set(df.loc[df["y_type"] == t, "y_name"].dropna())
    node_sets[t] = x_nodes | y_nodes

# Extract relations: resolve each relation to a (source type, target type)
# pair taken from the first row in which the relation appears. Walking those
# first rows in file order, instead of iterating an unordered set of names,
# makes the resulting edge-type order the same on every run and avoids
# rescanning the whole frame once per relation.
# NOTE(review): only the first row's orientation is recorded per relation; if
# a relation's rows mix (x_type, y_type) orientations, the other orientation
# is not represented here -- confirm that is intended.
actual_relations = set(df["relation"].unique())
relation_map = {}
first_rows = df.dropna(subset=["relation"]).drop_duplicates(subset="relation")
for rel, x_type, y_type in zip(
    first_rows["relation"], first_rows["x_type"], first_rows["y_type"]
):
    # Unmapped types are NaN and therefore never a key of node_sets.
    if x_type in node_sets and y_type in node_sets:
        relation_map[rel] = (x_type, y_type)

# Create node maps: per node type, name -> contiguous integer id assigned in
# sorted-name order.
node_maps = {
    node_type: {name: idx for idx, name in enumerate(sorted(names))}
    for node_type, names in node_sets.items()
}

# Create HeteroData
hetero_data = HeteroData()

# Dedicated seeded generator: the random placeholder node features come out
# identical on every run, without altering torch's global RNG state.
feature_rng = torch.Generator().manual_seed(42)
for node_type, name_to_id in node_maps.items():
    hetero_data[node_type].num_nodes = len(name_to_id)
    hetero_data[node_type].x = torch.randn(len(name_to_id), 128, generator=feature_rng)

for rel, (src_type, dst_type) in relation_map.items():
    # Keep only rows whose endpoint types match this edge type, so a name is
    # never looked up in the id map of a different node type (which could
    # yield a wrong edge when the same name exists under two types).
    rel_df = df[
        (df['relation'] == rel)
        & (df['x_type'] == src_type)
        & (df['y_type'] == dst_type)
    ]
    # Names missing from the map become -1 and are filtered out below.
    src_ids = rel_df['x_name'].map(node_maps[src_type]).fillna(-1).astype(int)
    dst_ids = rel_df['y_name'].map(node_maps[dst_type]).fillna(-1).astype(int)
    valid_mask = (src_ids != -1) & (dst_ids != -1)
    # Convert each id column separately and stack them into a (2, num_edges)
    # tensor; building one tensor from a Python list of NumPy arrays is slow.
    edge_index = torch.stack([
        torch.tensor(src_ids[valid_mask].to_numpy(), dtype=torch.long),
        torch.tensor(dst_ids[valid_mask].to_numpy(), dtype=torch.long),
    ])
    hetero_data[(src_type, rel, dst_type)].edge_index = edge_index

# Persist the graph: convert it to a plain dictionary and write it with torch.save
_hetero_payload = hetero_data.to_dict()
torch.save(_hetero_payload, f"{model_fp}/hetero_data_dict_version_final.pt")

# Persist the name -> id lookup tables with pickle.
# (Pickle files should only ever be loaded back from a trusted source.)
_node_maps_path = f"{data_fp}/node_maps_version_final.pkl"
with open(_node_maps_path, "wb") as pkl_file:
    pickle.dump(node_maps, pkl_file)

print("Saved hetero_data and node_maps successfully.")

Saved hetero_data and node_maps successfully.
