In [1]:
!pip install torch_geometric

Collecting torch_geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/63.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Downloading torch_geometric-2.6.1-py3-none-any.whl (1.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.1/1.1 MB[0m [31m44.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: torch_geometric
Successfully installed torch_geometric-2.6.1


In [2]:
import os
import json
import torch
import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from torch_geometric.data import HeteroData

# Empty heterogeneous-graph container.
# NOTE(review): `data` is re-created in the "Construct the Heterogeneous Graph" cell
# before any node or edge is stored on it, so this instance appears to be unused.
data = HeteroData()

In [3]:
# Folder that holds the preprocessed CSV exports (and, later, the saved artefacts).
data_path = "/content/drive/MyDrive/AIT/ML/Project/personalized_medical_recommendation/preprocessing/processed/"


def _read_processed(filename):
    """Read one CSV located directly under ``data_path`` into a DataFrame."""
    return pd.read_csv(os.path.join(data_path, filename))


# One DataFrame per source table; no dtype or date parsing options are applied here.
patients = _read_processed("processed_patients.csv")
diagnoses = _read_processed("processed_diagnoses.csv")
prescriptions = _read_processed("processed_prescriptions.csv")
procedures = _read_processed("processed_procedures.csv")
labevents = _read_processed("processed_labevents.csv")
admissions = _read_processed("processed_admissions.csv")

## Filter to match admission periods

In [4]:
# Attach the admit/discharge timestamps of the matching admission to every prescription.
# The inner join drops prescriptions whose (subject_id, hadm_id) has no admission row.
admission_windows = admissions[["subject_id", "hadm_id", "admittime", "dischtime"]]
presc_merged = pd.merge(
    prescriptions,
    admission_windows,
    how="inner",
    on=["subject_id", "hadm_id"],
)

# Keep prescriptions whose start date lies inside the stay (both bounds inclusive).
# NOTE(review): the CSVs are read without parse_dates, so if these columns are strings
# this is a lexicographic comparison — confirm all three use the same ISO-8601 layout.
started_during_stay = presc_merged["startdate"].between(
    presc_merged["admittime"], presc_merged["dischtime"]
)
prescriptions_filtered = presc_merged.loc[started_during_stay].copy()

In [5]:
# Attach the admit/discharge timestamps of the matching admission to every lab event.
# The inner join drops lab events whose (subject_id, hadm_id) has no admission row.
admission_windows = admissions[["subject_id", "hadm_id", "admittime", "dischtime"]]
labs_merged = pd.merge(
    labevents,
    admission_windows,
    how="inner",
    on=["subject_id", "hadm_id"],
)

# Keep lab events charted inside the stay (both bounds inclusive).
# NOTE(review): the CSVs are read without parse_dates, so if these columns are strings
# this is a lexicographic comparison — confirm all three use the same ISO-8601 layout.
charted_during_stay = labs_merged["charttime"].between(
    labs_merged["admittime"], labs_merged["dischtime"]
)
labevents_filtered = labs_merged.loc[charted_during_stay].copy()

In [6]:
# Admission ids present in the admissions table; diagnosis and procedure rows that
# reference any other hadm_id are discarded.
valid_hadm_ids = set(admissions["hadm_id"])

diagnosis_has_admission = diagnoses["hadm_id"].isin(valid_hadm_ids)
diagnoses_filtered = diagnoses.loc[diagnosis_has_admission].copy()

procedure_has_admission = procedures["hadm_id"].isin(valid_hadm_ids)
procedures_filtered = procedures.loc[procedure_has_admission].copy()

In [7]:
# Restrict the patient table to subjects that have at least one admission row.
valid_subject_ids = set(admissions["subject_id"])
patient_has_admission = patients["subject_id"].isin(valid_subject_ids)
patients_filtered = patients.loc[patient_has_admission].copy()

In [8]:
# Row counts after filtering, in the order:
# patients, diagnoses, prescriptions, procedures, lab events.
tuple(
    len(table)
    for table in (
        patients_filtered,
        diagnoses_filtered,
        prescriptions_filtered,
        procedures_filtered,
        labevents_filtered,
    )
)

(46520, 651000, 3307362, 240095, 18764822)

In [9]:
# The cells below only use the *_filtered tables, so drop the references to the raw ones.
del patients, diagnoses
del prescriptions, procedures
del labevents, admissions

## Create Unique Integer ID Mappings

In [10]:
def _build_index(values):
    """Map every distinct entry of the Series ``values`` to a consecutive integer id.

    Ids start at 0 and follow the order in which ``values.unique()`` yields the entries.
    """
    return {key: position for position, key in enumerate(values.unique())}


# One node per distinct subject_id
patient_ids = _build_index(patients_filtered["subject_id"])

# One node per distinct disease_category
disease_ids = _build_index(diagnoses_filtered["disease_category"])

# One node per distinct drug name found in the filtered prescriptions
medication_ids = _build_index(prescriptions_filtered["drug"])

# One node per distinct procedure_category
procedure_ids = _build_index(procedures_filtered["procedure_category"])

# One node per distinct lab itemid
lab_ids = _build_index(labevents_filtered["itemid"])

## Save Reverse ID Mappings

In [11]:
# Reverse mappings: integer node id -> original label.
id_to_disease = {v: k for k, v in disease_ids.items()}
id_to_medication = {v: k for k, v in medication_ids.items()}
id_to_procedure = {v: k for k, v in procedure_ids.items()}

# open(..., "w") does not create missing parent folders, so make sure the
# "mappings" sub-folder exists before writing into it.
mappings_dir = os.path.join(data_path, "mappings")
os.makedirs(mappings_dir, exist_ok=True)

# One JSON file per mapping. Note that JSON object keys are always strings, so the
# integer ids come back as "0", "1", ... when these files are loaded again.
for filename, mapping in (
    ("id_to_disease.json", id_to_disease),
    ("id_to_medication.json", id_to_medication),
    ("id_to_procedure.json", id_to_procedure),
):
    with open(os.path.join(mappings_dir, filename), "w") as f:
        json.dump(mapping, f)

In [12]:
# Patient node ids are assigned per distinct subject_id, whereas the feature matrix built
# below has one row per row of `patients_filtered`. The two only line up when every
# subject_id occurs exactly once, so fail loudly instead of producing misaligned features.
if not patients_filtered["subject_id"].is_unique:
    raise ValueError(
        "patients_filtered contains duplicate subject_id rows; "
        "patient feature rows would not align with patient node ids"
    )

# Gender + ethnicity -> one-hot columns (dense array).
encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
demo_features = encoder.fit_transform(patients_filtered[["gender", "ethnicity"]])

# Age -> min-max scaled single column.
scaler = MinMaxScaler()
age_scaled = scaler.fit_transform(patients_filtered[["age"]])

# Final patient features: scaled age first, followed by the one-hot columns.
patient_features = torch.tensor(
    np.hstack([age_scaled, demo_features]),
    dtype=torch.float,
)

## Save Encoder and Scaler

In [13]:
import joblib

# Persist the fitted preprocessors under data_path so the same transforms can be reloaded.
encoder_path = os.path.join(data_path, "patient_gender_ethnicity_encoder.pkl")
scaler_path = os.path.join(data_path, "patient_age_scaler.pkl")

# Fitted OneHotEncoder for gender / ethnicity
joblib.dump(encoder, encoder_path)

# Fitted MinMaxScaler for age (last expression, so its return value is the cell output)
joblib.dump(scaler, scaler_path)

['/content/drive/MyDrive/AIT/ML/Project/personalized_medical_recommendation/preprocessing/processed/patient_age_scaler.pkl']

In [14]:
# Load medication_embeddings.csv (from previous step)
med_embed_df = pd.read_csv(os.path.join(data_path, "medication_embeddings.csv"))

# Drug names ordered by their assigned node id, so that row i of the feature
# matrix belongs to medication node i.
sorted_drugs = sorted(medication_ids.items(), key=lambda x: x[1])  # Sort by node_id
ordered_drug_names = [drug for drug, _ in sorted_drugs]

# Align the embedding rows to node-id order. `.loc` raises KeyError when a drug has no
# embedding row at all ...
med_embed_df = med_embed_df.set_index("drug").loc[ordered_drug_names]

# ... but a drug listed more than once in the file yields several rows here, which would
# silently shift every following row away from its node id. Refuse to continue in that case.
if len(med_embed_df) != len(ordered_drug_names):
    raise ValueError(
        f"medication_embeddings.csv yields {len(med_embed_df)} rows for "
        f"{len(ordered_drug_names)} medication nodes; duplicate 'drug' entries?"
    )

# Every column left after moving "drug" into the index becomes a feature dimension.
medication_features = torch.tensor(med_embed_df.values, dtype=torch.float)

In [15]:
# Disease, procedure and lab nodes are each represented by a one-hot identity row:
# an N x N identity matrix, where N is the number of nodes of that type.
num_diseases, num_procedures, num_labs = len(disease_ids), len(procedure_ids), len(lab_ids)

disease_features = torch.eye(num_diseases)
procedure_features = torch.eye(num_procedures)
lab_features = torch.eye(num_labs)

## Build Edge Index Tensor

In [16]:
# Translate both id columns to node indices with a vectorised lookup instead of a
# Python-level iterrows() loop over every diagnosis row.
patient_node_idx = diagnoses_filtered["subject_id"].map(patient_ids)
disease_node_idx = diagnoses_filtered["disease_category"].map(disease_ids)

# Series.map leaves NaN where a key has no entry; surface that as the same KeyError a
# plain dict lookup would raise rather than casting NaN to an integer.
unmapped = patient_node_idx.isna() | disease_node_idx.isna()
if unmapped.any():
    raise KeyError(
        f"{int(unmapped.sum())} diagnosis rows reference a subject_id or "
        "disease_category that has no node id"
    )

# One (patient, disease) pair per diagnosis row; repeated pairs are kept as-is.
patient_disease_edges = np.column_stack([
    patient_node_idx.to_numpy(dtype=np.int64),
    disease_node_idx.to_numpy(dtype=np.int64),
])  # shape: [num_edges, 2]

edge_index_patient_disease = torch.from_numpy(patient_disease_edges).T.contiguous()  # shape: [2, num_edges]

In [17]:
# Translate drug names to medication node indices with a vectorised lookup instead of a
# Python-level iterrows() loop over every prescription row.
medication_node_idx = prescriptions_filtered["drug"].map(medication_ids)

# Rows whose drug has no node id are skipped (not an error), as before.
has_medication_node = medication_node_idx.notna()
medication_node_idx = medication_node_idx[has_medication_node]

# The patient lookup is only performed for the rows that are kept; a missing patient
# is still reported as a KeyError.
patient_node_idx = prescriptions_filtered.loc[has_medication_node, "subject_id"].map(patient_ids)
if patient_node_idx.isna().any():
    raise KeyError(
        f"{int(patient_node_idx.isna().sum())} prescription rows reference a "
        "subject_id that has no patient node id"
    )

# One (patient, medication) pair per kept prescription row; repeated pairs are kept as-is.
patient_medication_edges = np.column_stack([
    patient_node_idx.to_numpy(dtype=np.int64),
    medication_node_idx.to_numpy(dtype=np.int64),
])  # shape: [num_edges, 2]

edge_index_patient_med = torch.from_numpy(patient_medication_edges).T.contiguous()  # shape: [2, num_edges]

In [18]:
# Translate both id columns to node indices with a vectorised lookup instead of a
# Python-level iterrows() loop over every procedure row.
patient_node_idx = procedures_filtered["subject_id"].map(patient_ids)
procedure_node_idx = procedures_filtered["procedure_category"].map(procedure_ids)

# Series.map leaves NaN where a key has no entry; surface that as the same KeyError a
# plain dict lookup would raise rather than casting NaN to an integer.
unmapped = patient_node_idx.isna() | procedure_node_idx.isna()
if unmapped.any():
    raise KeyError(
        f"{int(unmapped.sum())} procedure rows reference a subject_id or "
        "procedure_category that has no node id"
    )

# One (patient, procedure) pair per procedure row; repeated pairs are kept as-is.
patient_procedure_edges = np.column_stack([
    patient_node_idx.to_numpy(dtype=np.int64),
    procedure_node_idx.to_numpy(dtype=np.int64),
])  # shape: [num_edges, 2]

edge_index_patient_proc = torch.from_numpy(patient_procedure_edges).T.contiguous()  # shape: [2, num_edges]

In [19]:
# Translate both id columns to node indices with a vectorised lookup instead of a
# Python-level iterrows() loop; this is the largest table in the notebook.
patient_node_idx = labevents_filtered["subject_id"].map(patient_ids)
lab_node_idx = labevents_filtered["itemid"].map(lab_ids)

# Series.map leaves NaN where a key has no entry; surface that as the same KeyError a
# plain dict lookup would raise rather than casting NaN to an integer.
unmapped = patient_node_idx.isna() | lab_node_idx.isna()
if unmapped.any():
    raise KeyError(
        f"{int(unmapped.sum())} lab event rows reference a subject_id or "
        "itemid that has no node id"
    )

# One (patient, lab) pair per lab event row; repeated pairs are kept as-is.
# NOTE(review): a patient measured many times on the same itemid therefore contributes
# many identical edges — confirm that this multiplicity is intended.
patient_lab_edges = np.column_stack([
    patient_node_idx.to_numpy(dtype=np.int64),
    lab_node_idx.to_numpy(dtype=np.int64),
])  # shape: [num_edges, 2]

edge_index_patient_lab = torch.from_numpy(patient_lab_edges).T.contiguous()  # shape: [2, num_edges]

In [20]:
# Feature matrices and edge tensors are built; the filtered tables are not referenced
# by the remaining cells, so drop them.
del patients_filtered
del diagnoses_filtered, procedures_filtered
del prescriptions_filtered, labevents_filtered

## Construct the Heterogeneous Graph

In [21]:
from torch_geometric.data import HeteroData

data = HeteroData()

# Feature matrix per node type, stored as that type's `x`.
node_features = {
    "patient": patient_features,
    "medication": medication_features,
    "disease": disease_features,
    "procedure": procedure_features,
    "lab": lab_features,
}
for node_type, features in node_features.items():
    data[node_type].x = features

# Edge index per (source type, relation, target type); every relation starts at a patient.
edge_indices = {
    ("patient", "has_disease", "disease"): edge_index_patient_disease,
    ("patient", "prescribed", "medication"): edge_index_patient_med,
    ("patient", "underwent", "procedure"): edge_index_patient_proc,
    ("patient", "has_lab", "lab"): edge_index_patient_lab,
}
for edge_type, edge_index in edge_indices.items():
    data[edge_type].edge_index = edge_index

In [22]:
from torch_geometric.transforms import ToUndirected

# Run the graph through PyG's ToUndirected transform and rebind `data` to the result.
# NOTE(review): because `data` is overwritten, re-running only this cell applies the
# transform to its own output — re-run the graph construction cell first.
make_undirected = ToUndirected()
data = make_undirected(data)

## Save the Graph Data

In [23]:
# Serialise the finished graph object to Drive with torch.save.
graph_output_path = "/content/drive/MyDrive/AIT/ML/Project/personalized_medical_recommendation/preprocessing/processed_graph.pt"
torch.save(data, graph_output_path)