In [2]:
import torch 
import json
import os
import pandas as pd

In [5]:
input_data = """
{
  "user": {
    "id": "user_001",
    "name": "John Doe",
    "age": 45,
    "gender": "male",
    "blood_type": "O+",
    "genetic_conditions": ["G6PD deficiency"],
    "allergies": ["penicillin", "sulfa drugs"],
    "medical_history": ["hypertension", "asthma"]
  },
  "symptoms": {
    "reported": ["shortness of breath", "chest pain"],
    "duration_days": 3
  },
  "diagnosis": {
    "predicted_conditions": ["coronary artery disease"],
    "confidence_scores": {
      "coronary artery disease": 0.88
    }
  },
  "timestamp": "2025-04-18T10:20:00Z"
}"""

In [3]:
sider_file_path = 'data/meddra_all_se.tsv'
sider_names_file_path = 'data/drug_names.tsv'
sider_df = pd.read_csv(sider_file_path,sep='\t',header=None, compression=None)
drug_names_df = pd.read_csv(sider_names_file_path,sep='\t',header=None,compression=None)
drug_names_df.columns = ['STITCH_flat', 'Drug_Name']
sider_df.columns = [
    'STITCH_compound_flat',  # Example: CID100000085
    'STITCH_compound_stereo',  # Example: CID000010917
    'UMLS_concept_id',         # Example: C0000729
    'MedDRA_type',             # e.g., LLT
    'MedDRA_concept_id',       # Example: C0000729
    'LLT_preferred_term'       # e.g., "Abdominal cramps"
]
sider_df.dropna(subset=['MedDRA_type', 'MedDRA_concept_id'], inplace=True)
ctd_dir='data'
ctd_chem_disease_file = os.path.join(ctd_dir, 'CTD_chemicals_diseases.csv.gz')
ctd_chem_gene_file = os.path.join(ctd_dir, 'CTD_chem_gene_ixns.csv.gz')
ctd_chemicals_file = os.path.join(ctd_dir, 'CTD_chemicals.csv.gz')
ctd_genes_file = os.path.join(ctd_dir, 'CTD_genes.csv.gz')
ctd_chem_disease_df = pd.read_csv(
        ctd_chem_disease_file,
        comment='#', 
        compression='gzip'
    )
ctd_chemicals_df = pd.read_csv(
    ctd_chemicals_file,
    sep='\t',
    comment='#',
    compression='gzip'
)
ctd_genes_df = pd.read_csv(
    ctd_genes_file,
    sep='\t',
    comment='#',
    compression='gzip'
)
ctd_chem_disease_df.columns = [
    "ChemicalName", "ChemicalID", "CasRN", "DiseaseName", "DiseaseID", 
    "DirectEvidence", "InferenceGeneSymbol", "InferenceScore", 
    "OmimIDs", "PubMedIDs"
]

In [5]:
ctd_genes_df.head(1)

Unnamed: 0,"=,16S ribosomal RNA,73953399,,NQW33_mgr01,,,"
0,"03B03F,""DNA segment, 03B03F (Research Genetics..."


In [4]:
import pandas as pd
import torch
from sklearn.preprocessing import LabelEncoder
from torch_geometric.data import HeteroData

# --- Label Encoding ---
drug_encoder = LabelEncoder()
side_effect_encoder = LabelEncoder()
gene_encoder = LabelEncoder()
disease_encoder = LabelEncoder()

# Step 1: Encode drug names from all sources to unify
all_drugs = pd.concat([
    sider_df['STITCH_compound_flat'],
    ctd_genes_df['ChemicalName'],
    ctd_chem_disease_df['ChemicalName']
]).drop_duplicates().reset_index(drop=True)

drug_encoder.fit(all_drugs)

# Step 2: Encode entities
sider_df['drug_id'] = drug_encoder.transform(sider_df['STITCH_compound_flat'])
sider_df['side_effect_id'] = side_effect_encoder.fit_transform(sider_df['LLT_preferred_term'])

ctd_genes_df = ctd_genes_df[ctd_genes_df['GeneSymbol'].notna()]
ctd_genes_df['drug_id'] = drug_encoder.transform(ctd_genes_df['ChemicalName'])
ctd_genes_df['gene_id'] = gene_encoder.fit_transform(ctd_genes_df['GeneSymbol'])

ctd_chem_disease_df = ctd_chem_disease_df[ctd_chem_disease_df['DiseaseName'].notna()]
ctd_chem_disease_df['drug_id'] = drug_encoder.transform(ctd_chem_disease_df['ChemicalName'])
ctd_chem_disease_df['disease_id'] = disease_encoder.fit_transform(ctd_chem_disease_df['DiseaseName'])

# --- Initialize HeteroData ---
data = HeteroData()

# --- Node Features ---
num_drugs = len(drug_encoder.classes_)
num_side_effects = len(side_effect_encoder.classes_)
num_genes = len(gene_encoder.classes_)
num_diseases = len(disease_encoder.classes_)

data['drug'].x = torch.eye(num_drugs)
data['side_effect'].x = torch.eye(num_side_effects)
data['gene'].x = torch.eye(num_genes)
data['disease'].x = torch.eye(num_diseases)

# --- drug -> side_effect (SIDER) ---
drug_se_src = torch.tensor(sider_df['drug_id'].values, dtype=torch.long)
drug_se_dst = torch.tensor(sider_df['side_effect_id'].values, dtype=torch.long)
data['drug', 'causes', 'side_effect'].edge_index = torch.stack([drug_se_src, drug_se_dst], dim=0)
data['side_effect', 'is_caused_by', 'drug'].edge_index = torch.stack([drug_se_dst, drug_se_src], dim=0)

# --- drug -> gene (CTD) ---
drug_gene_src = torch.tensor(ctd_genes_df['drug_id'].values, dtype=torch.long)
drug_gene_dst = torch.tensor(ctd_genes_df['gene_id'].values, dtype=torch.long)
data['drug', 'interacts', 'gene'].edge_index = torch.stack([drug_gene_src, drug_gene_dst], dim=0)
data['gene', 'is_interacted_by', 'drug'].edge_index = torch.stack([drug_gene_dst, drug_gene_src], dim=0)

# --- drug -> disease (CTD) ---
drug_dis_src = torch.tensor(ctd_chem_disease_df['drug_id'].values, dtype=torch.long)
drug_dis_dst = torch.tensor(ctd_chem_disease_df['disease_id'].values, dtype=torch.long)
data['drug', 'associates', 'disease'].edge_index = torch.stack([drug_dis_src, drug_dis_dst], dim=0)
data['disease', 'is_associated_with_drug', 'drug'].edge_index = torch.stack([drug_dis_dst, drug_dis_src], dim=0)

print(data)


KeyError: 'ChemicalName'