# Prerequisites

In [1]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
import pickle
import os

# Preprocessing

In [2]:
# Location of the TWOSIDES CSV export.
# Set the TWOSIDES_CSV environment variable to run the notebook on another
# machine; the original local path is kept as the fallback so existing
# setups keep working unchanged.
file_path = os.environ.get(
    "TWOSIDES_CSV",
    r"C:\Users\Asus\Downloads\TWOSIDES\TWOSIDES.csv",
)

In [3]:
# Load every column as a string (dtype=str); the numeric columns are
# converted explicitly in a later cell.
df = pd.read_csv(file_path, dtype=str)

In [4]:
# Preview the first rows of the loaded table.
df.head()

Unnamed: 0,drug_1_rxnorn_id,drug_1_concept_name,drug_2_rxnorm_id,drug_2_concept_name,condition_meddra_id,condition_concept_name,A,B,C,D,PRR,PRR_error,mean_reporting_frequency
0,10355,Temazepam,136411,sildenafil,10003239,Arthralgia,7,149,24,1536,2.91667,0.421275,0.0448718
1,1808,Bumetanide,7824,Oxytocin,10003239,Arthralgia,1,13,2,138,5.0,1.19224,0.0714286
2,221147,POLYETHYLENE GLYCOL 3350,5521,Hydroxychloroquine,10003239,Arthralgia,6,103,20,1070,3.0,0.454505,0.0550459
3,10324,Tamoxifen,8640,Prednisone,10012735,Diarrhoea,18,123,35,1375,5.14286,0.276271,0.12766
4,10355,Temazepam,136411,sildenafil,10012735,Diarrhoea,2,154,37,1523,0.540541,0.721093,0.0128205


In [5]:
# (rows, columns) of the loaded table.
df.shape

(42920391, 13)

In [6]:
# Number of missing values in each column.
df.isna().sum()

drug_1_rxnorn_id            0
drug_1_concept_name         0
drug_2_rxnorm_id            0
drug_2_concept_name         0
condition_meddra_id         0
condition_concept_name      0
A                           0
B                           0
C                           0
D                           0
PRR                         0
PRR_error                   0
mean_reporting_frequency    0
dtype: int64

In [7]:
# Count rows that are exact duplicates of an earlier row across all columns.
df.duplicated().sum()

1000

In [8]:
# Keep one row per (drug 1, drug 2, side effect) triple; the first occurrence wins.
# NOTE(review): this runs before the drug pair order is normalized, so a triple
# that appears with the two drugs swapped is not treated as a duplicate here.
triple_key = ['drug_1_rxnorn_id', 'drug_2_rxnorm_id', 'condition_meddra_id']
df = df.drop_duplicates(subset=triple_key, keep='first')

In [9]:
# Re-check the number of exact duplicate rows after de-duplication.
df.duplicated().sum()

0

In [10]:
# (rows, columns) after de-duplication.
df.shape

(42919391, 13)

In [11]:
# Normalize drug pair order so that (A, B) and (B, A) describe the same pair.
# The IDs are still strings at this point, so the comparison is lexicographic;
# any consistent ordering is enough to make the pair canonical, and it yields
# the same drug_1 / drug_2 assignment as a row-wise min / max on the ID columns.
swap = (df['drug_1_rxnorn_id'] > df['drug_2_rxnorm_id']).to_numpy()

# Swap IDs and concept names together. Reordering only the IDs would leave
# drug_1_concept_name / drug_2_concept_name attached to the wrong drug on every
# swapped row. All right-hand sides are evaluated against the frame as it is
# before the assignment, so the four columns are swapped consistently.
df = df.assign(
    drug_1_rxnorn_id=np.where(swap, df['drug_2_rxnorm_id'], df['drug_1_rxnorn_id']),
    drug_2_rxnorm_id=np.where(swap, df['drug_1_rxnorn_id'], df['drug_2_rxnorm_id']),
    drug_1_concept_name=np.where(swap, df['drug_2_concept_name'], df['drug_1_concept_name']),
    drug_2_concept_name=np.where(swap, df['drug_1_concept_name'], df['drug_2_concept_name']),
)

# The mask is as long as the table; release it once the swap is done.
del swap

In [12]:
# Summarize column dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 42919391 entries, 0 to 42920390
Data columns (total 13 columns):
 #   Column                    Dtype 
---  ------                    ----- 
 0   drug_1_rxnorn_id          object
 1   drug_1_concept_name       object
 2   drug_2_rxnorm_id          object
 3   drug_2_concept_name       object
 4   condition_meddra_id       object
 5   condition_concept_name    object
 6   A                         object
 7   B                         object
 8   C                         object
 9   D                         object
 10  PRR                       object
 11  PRR_error                 object
 12  mean_reporting_frequency  object
dtypes: object(13)
memory usage: 4.5+ GB


In [13]:
# Cast the ID, count and statistic columns from strings to numbers.
# Values that cannot be parsed become NaN (errors='coerce').
numeric_cols = [
    'drug_1_rxnorn_id', 'drug_2_rxnorm_id', 'condition_meddra_id',
    'A', 'B', 'C', 'D',
    'PRR', 'PRR_error', 'mean_reporting_frequency',
]
for col in numeric_cols:
    df[col] = pd.to_numeric(df[col], errors='coerce')

In [14]:
# Keep only interactions with PRR above 1.5 and a mean reporting frequency
# above 0.01; rows where either value is NaN fail the comparison and are dropped.
strong_signal = df['PRR'].gt(1.5)
frequent_enough = df['mean_reporting_frequency'].gt(0.01)
df = df.loc[strong_signal & frequent_enough].copy()

In [15]:
# Keep top 50 most frequent side effects
top_n = 50

# Duplicates were removed before the drug pair order was normalized, so the
# same (pair, side effect) triple can still occur twice if it was reported in
# both drug orders. Collapse those rows first so each pair counts once per
# side effect in the frequency ranking. Every row here already passed the
# signal filter, so dropping a repeated triple cannot remove a label.
pair_effect_cols = ['drug_1_rxnorn_id', 'drug_2_rxnorm_id', 'condition_meddra_id']
df = df.drop_duplicates(subset=pair_effect_cols)

# Rank side effects by the number of distinct drug pairs exhibiting them.
top_side_effects = df['condition_meddra_id'].value_counts().head(top_n).index.tolist()
df = df[df['condition_meddra_id'].isin(top_side_effects)].copy()

# Creating the Graph

In [16]:
# Multi-label setup: collect, for every drug pair, the set of side-effect IDs
# observed for that pair. The resulting dict is keyed by (drug_1, drug_2).
pair_cols = ['drug_1_rxnorn_id', 'drug_2_rxnorm_id']
effects_by_pair = df.groupby(pair_cols)['condition_meddra_id']
edge_df = effects_by_pair.apply(set)
edge_dict = edge_df.to_dict()

In [17]:
# Assign each drug that appears in any pair a consecutive index,
# numbered in ascending order of drug ID.
all_drugs = {drug for pair in edge_dict for drug in pair}
ordered_drugs = sorted(all_drugs)
drug2idx = dict(zip(ordered_drugs, range(len(ordered_drugs))))

In [18]:
# Assign each side effect that appears on any pair a consecutive index,
# numbered in ascending order of side-effect ID.
all_side_effects = set().union(*edge_dict.values())
ordered_side_effects = sorted(all_side_effects)
se2idx = dict(zip(ordered_side_effects, range(len(ordered_side_effects))))

In [19]:
# Build one [source, target] index pair and one multi-hot label row per drug pair.
n_labels = len(se2idx)
edge_index, labels = [], []

for pair, effects in edge_dict.items():
    src, dst = pair
    edge_index.append([drug2idx[src], drug2idx[dst]])

    # Start from an all-zero row and switch on the columns of this pair's side effects.
    row = np.zeros(n_labels, dtype=np.float32)
    row[[se2idx[effect] for effect in effects]] = 1.0
    labels.append(row)

In [20]:
# The list holds one [source, target] row per edge; transpose it so the
# tensor has one column per edge, and make the memory layout contiguous.
pair_rows = torch.tensor(edge_index)
edge_index = pair_rows.t().contiguous()

# Stack the per-edge label rows into a single array before converting.
label_rows = np.array(labels)
labels = torch.tensor(label_rows)

In [22]:
# Assemble the graph object: drugs are the nodes, `edge_index` holds the drug
# pairs, and `y` carries the label tensor built alongside the edges.
num_drugs = len(drug2idx)
data = Data(
    edge_index=edge_index,
    y=labels,
    num_nodes=num_drugs,
)

In [27]:
# Save the graph into a `processed` folder located one level above the
# current working directory, creating the folder if it does not exist yet.
base_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
processed_dir = os.path.join(base_dir, 'processed')
os.makedirs(processed_dir, exist_ok=True)

graph_path = os.path.join(processed_dir, 'ddi_graph.pt')
torch.save(data, graph_path)


In [29]:
# Persist the ID -> index mappings so the graph's node and label indices can be
# translated back to drug and side-effect IDs.
# The target folder is relative to the current working directory and is not
# guaranteed to exist, so create it first; opening the files would otherwise
# raise FileNotFoundError.
# NOTE(review): the graph itself is saved under `<cwd>/../processed`, while
# these files go to `<cwd>/processed` — confirm whether both artifacts are
# meant to live in the same folder.
os.makedirs('processed', exist_ok=True)

with open('processed/drug2idx.pkl', 'wb') as f:
    pickle.dump(drug2idx, f)

with open('processed/se2idx.pkl', 'wb') as f:
    pickle.dump(se2idx, f)