# Dataset Split from TWOSIDES DDI

## Prepare Seen and Unseen datasets

In [15]:
import pandas as pd
import numpy as np
import json
import pickle

random_seed = 71
sample_size = 100_000

In [16]:
with open('./Data/TWOSIDES/TWOSIDES-categories.json', 'r') as fp:
    categories = json.load(fp)

In [17]:
df = pd.read_csv('./Data/TWOSIDES/TWOSIDES-xz-triplets-known-grouped.csv')

#### Select negative samples
Given the __closed world hypothesis__ we use all the possible pairs of the drugs not present in the set as __negative__ samples

In [4]:
# get all the unique drugs that are in the drug1 column
drugs1 = df[['drug1']].drop_duplicates()
# get all the uniqe drugs that are in the drug2 column
tmp = df[['drug2']].drop_duplicates()
# join them
tmp.rename(columns={'drug2':'drug1'}, inplace=True)
drugs1 = pd.concat([drugs1, tmp], ignore_index=True).drop_duplicates()
# now drugs1 column has all the possible drugs, copy them to drugs2 column
drugs2 = drugs1.rename(columns={'drug1':'drug2'})
# Create all the possible combinations as negatives (we will remove the pairs that are positive latter)
negatives = drugs1.join(drugs2, how='cross')
# remember for all pairs in the dataset drug1 < drug2
# negatives = negatives[negatives['drug1'] <= negatives['drug2']]
negatives['class_00'] = 1.0
print('all possible pairs:', len(negatives))
# Remove the positives from the negatives
# get all the positives
positives = df[['drug1', 'drug2']].drop_duplicates()
# disquise them as negatives
positives['class_00'] = 1.0
print('positives pairs:', len(positives))
# add the disguised positives to the negatives
# they will be duplicates as negatives contains all the posssible combinations
# so dropping all duplicates leaves us with the real negatives
negatives = pd.concat([negatives, positives], ignore_index=True).drop_duplicates(keep=False, ignore_index=True)
print('real negatives pairs:', len(negatives))

all possible pairs: 1077444
positives pairs: 106917
real negatives pairs: 970527


### Distinct Classes and Drugs

In [5]:
print('Drugs:', pd.concat([df['drug1'], df['drug2']]).drop_duplicates().nunique())

Drugs: 1038


### Pairs per class

In [6]:
df.loc[:,'class_00':'class_55'].sum(axis=0).sort_values(ascending=True)[:10]

class_00        0.0
class_18    50852.0
class_45    51778.0
class_27    56812.0
class_51    58623.0
class_42    61771.0
class_52    63719.0
class_03    64647.0
class_22    65137.0
class_44    68794.0
dtype: float64

### Select the 2 classes as unseen and the rest as seen
__Unseen:__ [18, 45] __Seen:__ [...]

In [7]:
selected_classes = [
 'class_00', 'class_01', 'class_02', 'class_03', 'class_04', 'class_05', 'class_06', 'class_07', 'class_08', 'class_09',
 'class_10', 'class_11', 'class_12', 'class_13', 'class_14', 'class_15', 'class_16', 'class_17',             'class_19',
 'class_20', 'class_21', 'class_22', 'class_23', 'class_24', 'class_25', 'class_26', 'class_27', 'class_28', 'class_29',
 'class_30', 'class_31', 'class_32', 'class_33', 'class_34', 'class_35', 'class_36', 'class_37', 'class_38', 'class_39',
 'class_40', 'class_41', 'class_42', 'class_43', 'class_44',             'class_46', 'class_47', 'class_48', 'class_49',
 'class_50', 'class_51', 'class_52', 'class_53', 'class_54', 'class_55'
]

unseen = df[
    (df['class_18'] == 1.0) |
    (df['class_45'] == 1.0) 
].copy()
seen = df[
    (df['class_18'] == 0.0) &
    (df['class_45'] == 0.0)
].copy()
seen = seen.rename(columns={
    'class_18':'unseen_class_18', 
    'class_45':'unseen_class_45', 
})
unseen = unseen.drop(columns=selected_classes[1:])
print('seen.shape:', seen.shape)
print('unseen.shape:', unseen.shape)
print('seen.columns:', seen.columns)
print('unseen.columns:', unseen.columns)


seen.shape: (55137, 58)
unseen.shape: (51780, 5)
seen.columns: Index(['drug1', 'drug2', 'class_00', 'class_01', 'class_02', 'class_03',
       'class_04', 'class_05', 'class_06', 'class_07', 'class_08', 'class_09',
       'class_10', 'class_11', 'class_12', 'class_13', 'class_14', 'class_15',
       'class_16', 'class_17', 'unseen_class_18', 'class_19', 'class_20',
       'class_21', 'class_22', 'class_23', 'class_24', 'class_25', 'class_26',
       'class_27', 'class_28', 'class_29', 'class_30', 'class_31', 'class_32',
       'class_33', 'class_34', 'class_35', 'class_36', 'class_37', 'class_38',
       'class_39', 'class_40', 'class_41', 'class_42', 'class_43', 'class_44',
       'unseen_class_45', 'class_46', 'class_47', 'class_48', 'class_49',
       'class_50', 'class_51', 'class_52', 'class_53', 'class_54', 'class_55'],
      dtype='object')
unseen.columns: Index(['drug1', 'drug2', 'class_00', 'class_18', 'class_45'], dtype='object')


#### Select negative samples
We select negative samples with drugs already existing in the __train__ and __test__ datasets respectively

In [8]:
seen_drugs = pd.concat([pd.Series(seen['drug1'].unique()), pd.Series(seen['drug2'].unique())]).unique()
print('seen_drugs:', len(seen_drugs))
seen_negatives = negatives[negatives['drug1'].isin(seen_drugs)]
seen_negatives = seen_negatives[seen_negatives['drug2'].isin(seen_drugs)]
seen_negatives = seen_negatives.sample(sample_size, random_state=random_seed, replace=True).reset_index(drop=True)
# Negatives belong in class_00
seen_negatives['class_00'] = 1.0
seen = pd.concat([seen, seen_negatives], ignore_index=True).sample(frac=1.0, ignore_index=True)
# replace NaN for the negative class columns with 0.0
seen.fillna(0.0, inplace=True)
seen.head()

seen_drugs: 1035


Unnamed: 0,drug1,drug2,class_00,class_01,class_02,class_03,class_04,class_05,class_06,class_07,...,class_46,class_47,class_48,class_49,class_50,class_51,class_52,class_53,class_54,class_55
0,55681,1243041,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,134547,6313,0.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0
2,4094,47858,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,8627,3920,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,797195,41493,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,...,1.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,1.0,0.0


#### Duplicate the pairs for each DrugA-DrugB also add DrugB-DrugA

In [9]:
# print('Initial seen set:', seen.shape)
# duplicates= seen.copy()
# duplicates[['drug2', 'drugbank2', 'drug1', 'drugbank1']] = seen[['drug1', 'drugbank1', 'drug2', 'drugbank2']]
# seen = pd.concat([seen, duplicates], ignore_index=True).drop_duplicates()
# print('Duplicates seen set:', seen.shape)

#### Prepare the unseen set

In [10]:
unseen_sample_size = len(unseen)
print('Unseen positive:', unseen_sample_size)

try:
    unseen_drugs = pd.concat([pd.Series(unseen['drug1'].unique()), pd.Series(unseen['drug2'].unique())]).unique()
    print('unseen_drugs:', len(unseen_drugs))
    unseen_negatives = negatives[negatives['drug1'].isin(unseen_drugs)]
    unseen_negatives = unseen_negatives[unseen_negatives['drug2'].isin(seen_drugs)]
    unseen_negatives = unseen_negatives.sample(unseen_sample_size*2, random_state=random_seed, replace=False).reset_index(drop=True)
    unseen_negatives = unseen_negatives.drop_duplicates()
except:
    pass
# Remove common rows from unseen_negatives and seen_negatives
seen_negatives_tmp = seen_negatives.copy()
# Negatives belong in class_00
unseen_negatives['class_00'] = 1.0
unseen_negatives = unseen_negatives[~unseen_negatives.apply(tuple, axis=1).isin(pd.merge(unseen_negatives, seen_negatives_tmp).apply(tuple, axis=1))]
print('all unseen_negatives:', len(unseen_negatives))
unseen_negatives = unseen_negatives.sample(unseen_sample_size, random_state=random_seed, replace=False).reset_index(drop=True)
unseen = pd.concat([unseen, unseen_negatives], ignore_index=True)
# Fill NA with 0.0 and mix Negative and Positive samples
unseen = unseen.fillna(0.0).sample(frac=1.0, ignore_index=True)
unseen

Unseen positive: 51780
unseen_drugs: 901
all unseen_negatives: 94598


Unnamed: 0,drug1,drug2,class_00,class_18,class_45
0,3616,30121,1.0,0.0,0.0
1,11246,15996,0.0,1.0,1.0
2,121243,30131,1.0,0.0,0.0
3,3648,474128,1.0,0.0,0.0
4,38685,14845,1.0,0.0,0.0
...,...,...,...,...,...
103555,17128,21949,0.0,1.0,1.0
103556,4917,9899,0.0,1.0,1.0
103557,11124,1373458,1.0,0.0,0.0
103558,6902,596724,1.0,0.0,0.0


#### Save the dataset

In [11]:
seen.to_csv('./Data/C56_'+str(random_seed)+'_TWOSIDES_one_hot_train.csv', index=False)
unseen.to_csv('./Data/C56_'+str(random_seed)+'_TWOSIDES_one_hot_test.csv', index=False)
print('./Data/C56_'+str(random_seed)+'_TWOSIDES_one_hot_train.csv')
print('./Data/C56_'+str(random_seed)+'_TWOSIDES_one_hot_test.csv')

./Data/C56_71_TWOSIDES_one_hot_train.csv
./Data/C56_71_TWOSIDES_one_hot_test.csv


### Create dictionaries mapping RxNORM to DrugBank ids and Drug Names

In [12]:
kg_embeddings = pd.read_csv("./Data/Entity2Vec_sg_200_5_5_15_2_500_d5_uniform.txt")
kg_db = kg_embeddings['DB'].drop_duplicates()
ssp_embeddings = pd.read_csv("./Data/DeepDDI-drug_similarity.csv")
ssp_db = ssp_embeddings['DB'].drop_duplicates()
db_rx = pd.read_csv('./Data/TWOSIDES-DB-RxNorm.csv')
db_rx = db_rx[db_rx['drugbank_id'].isin(kg_db) &
              db_rx['drugbank_id'].isin(ssp_db)]
# DB_IDs without SMILES 
db_no_SMILES = ['DB00104', 'DB00375', 'DB06219', 'DB08869', 'DB09265']
db_rx = db_rx[~db_rx['drugbank_id'].isin(db_no_SMILES)]
rx_norms = db_rx['rx_norm_id'].drop_duplicates()

In [13]:
rx_norm_to_db = {}
for index, row in db_rx.iterrows():
    rx_norm_to_db[row['rx_norm_id']] = row['drugbank_id']
with open('./Data/TWOSIDES/rx_norm_to_db.pkl', 'wb') as file:
    pickle.dump(rx_norm_to_db, file)

In [14]:
rx_norm_names = pd.read_csv('./Data/TWOSIDES/TWOSIDES-drugs-names.csv')
rx_norm_to_name = {}
for index, row in rx_norm_names.iterrows():
    rx_norm_to_name[row['rxnorm_id']] = row['concept_name']
with open('./Data/TWOSIDES/rx_norm_to_name.pkl', 'wb') as file:
    pickle.dump(rx_norm_to_name, file)