# DDI TWOSIDES Processing

In [None]:
!pip install progressbar2

In [1]:
import pandas as pd
import numpy as np
import json
from progressbar import ProgressBar

  from pandas.core import (


#### Process TWOSIDES triplets

In [2]:
# Stream the raw TWOSIDES file in chunks of 1,000,000 rows and write only the
# three ID columns (drug 1, drug 2, condition) to a smaller triplets CSV.
# NOTE(review): 42920392 is presumably the row count of the raw file — confirm.
bar = ProgressBar(max_value=42920392 // 1000000 + 1)
chunk_reader = pd.read_csv(
    './Data/TWOSIDES/TWOSIDES-xz.csv',
    header=0,
    names=['drug_1_rxnorm_id', 'drug_1_concept_name', 'drug_2_rxnorm_id', 'drug_2_concept_name',
           'condition_meddra_id', 'condition_concept_name', 'A', 'B', 'C', 'D', 'PRR',
           'PRR_error', 'mean_reporting_frequency'],
    usecols=[0, 2, 4],
    dtype='Int64',
    chunksize=1000000,
)
# The first chunk creates the file and writes the header; later chunks append.
add_header, file_mode = True, 'w'
for df in chunk_reader:
    bar.next()
    df.to_csv(
        './Data/TWOSIDES/TWOSIDES-xz-triplets.csv',
        mode=file_mode,
        header=add_header,
        columns=['drug_1_rxnorm_id', 'drug_2_rxnorm_id', 'condition_meddra_id'],
        index=False,
    )
    add_header, file_mode = False, 'a'
bar.finish()

[38;2;0;255;0m100%[39m [38;2;0;255;0m(43 of 43)[39m |########################| Elapsed Time: 0:02:34 Time:  0:02:340314


#### Extract all Drug RxNORMs

In [3]:
# Load the full triplets file written by the previous step, using nullable
# integer columns, and show its size plus the first two rows.
triplets_path = './Data/TWOSIDES/TWOSIDES-xz-triplets.csv'
df = pd.read_csv(triplets_path, dtype='Int64')
print(df.shape)
df.head(2)

(42920390, 3)


Unnamed: 0,drug_1_rxnorm_id,drug_2_rxnorm_id,condition_meddra_id
0,10355,136411,10003239
1,1808,7824,10003239


In [4]:
# Distinct RxNorm IDs appearing in each of the two drug columns.
drugs1, drugs2 = (
    df[id_column].drop_duplicates()
    for id_column in ('drug_1_rxnorm_id', 'drug_2_rxnorm_id')
)

In [5]:
# Union of the IDs from both drug columns, de-duplicated across the two.
all_drug_ids = pd.concat([drugs1, drugs2])
drugs = all_drug_ids.drop_duplicates()
# index=False is not passed, so the index is written as an extra first column.
drugs.to_csv('./Data/TWOSIDES/TWOSIDES-drugs.csv', header=['rxnorm_id'])

#### Extract Drugs Names

In [6]:
# Collect the distinct (rxnorm_id, concept_name) pairs found in either drug
# position of the raw TWOSIDES file and store them as a lookup table.
# NOTE(review): 42920392 is presumably the row count of the raw file — confirm.
bar = ProgressBar(max_value=42920392//1000000+1)
# Per-chunk pieces are gathered in a list and concatenated once at the end,
# instead of re-copying an ever-growing DataFrame on every iteration.
name_chunks = []
for df in pd.read_csv('./Data/TWOSIDES/TWOSIDES-xz.csv', usecols=[0,1,2,3], chunksize=1000000,
                     header=0, names=['drug_1_rxnorm_id','drug_1_concept_name','drug_2_rxnorm_id','drug_2_concept_name',
                                      'condition_meddra_id','condition_concept_name','A','B','C','D','PRR',
                                      'PRR_error','mean_reporting_frequency']):
    bar.next()
    df = df.drop_duplicates()
    # Drug-1 rows first, then drug-2 rows, so the first-seen order is kept.
    for position in ('drug_1', 'drug_2'):
        renamed = df.rename(
            columns={position + '_rxnorm_id': 'rxnorm_id', position + '_concept_name': 'concept_name'},
        )[['rxnorm_id', 'concept_name']]
        # De-duplicate early so only a few rows per chunk are retained.
        name_chunks.append(renamed.drop_duplicates())
bar.finish()
if name_chunks:
    names = pd.concat(name_chunks).drop_duplicates()
else:
    # pd.concat raises on an empty list; fall back to an empty table.
    names = pd.DataFrame(columns=['rxnorm_id', 'concept_name'])
names.to_csv('./Data/TWOSIDES/TWOSIDES-drugs-names.csv', header=['rxnorm_id', 'concept_name'], index=False)
print(names.shape)
names.head(2)

[38;2;0;255;0m100%[39m [38;2;0;255;0m(43 of 43)[39m |########################| Elapsed Time: 0:00:33 Time:  0:00:330003


(1916, 2)


Unnamed: 0,rxnorm_id,concept_name
0,10355,Temazepam
1,1808,Bumetanide


#### Extract Condition-Interaction Names

In [7]:
# Collect the distinct (condition_meddra_id, condition_concept_name) pairs
# from the raw TWOSIDES file and store them as a lookup table.
# NOTE(review): 42920392 is presumably the row count of the raw file — confirm.
bar = ProgressBar(max_value=(42920392//1000000+1))
# Per-chunk pieces are gathered in a list and concatenated once at the end,
# instead of re-copying an ever-growing DataFrame on every iteration.
condition_chunks = []
for df in pd.read_csv('./Data/TWOSIDES/TWOSIDES-xz.csv', usecols=[4,5], chunksize=1000000,
                     header=0, names=['drug_1_rxnorm_id','drug_1_concept_name','drug_2_rxnorm_id','drug_2_concept_name',
                                      'condition_meddra_id','condition_concept_name','A','B','C','D','PRR',
                                      'PRR_error','mean_reporting_frequency']):
    bar.next()
    condition_chunks.append(df.drop_duplicates())
bar.finish()
if condition_chunks:
    conditions = pd.concat(condition_chunks).drop_duplicates()
else:
    # pd.concat raises on an empty list; fall back to an empty table.
    conditions = pd.DataFrame(columns=['condition_meddra_id','condition_concept_name'])
print(conditions.shape)
# NOTE(review): this file is written to ./Data/ rather than ./Data/TWOSIDES/
# like the other outputs of this notebook — confirm the intended location.
conditions.to_csv('./Data/TWOSIDES-conditions-names.csv', header=['condition_meddra_id','condition_concept_name'], index=False)
conditions.head(2)

[38;2;0;255;0m100%[39m [38;2;0;255;0m(43 of 43)[39m |########################| Elapsed Time: 0:00:32 Time:  0:00:320003


(12725, 2)


Unnamed: 0,condition_meddra_id,condition_concept_name
0,10003239,Arthralgia
3,10012735,Diarrhoea


#### List of Drugs that shouldn't be used

In [8]:
# DrugBank IDs that must be excluded because they have no SMILES.
db_no_SMILES = ['DB00104', 'DB00375', 'DB06219', 'DB08869', 'DB09265']

# Distinct values of the 'DB' column of the knowledge-graph embeddings file.
kg_embeddings = pd.read_csv("./Data/Embeddings/Entity2Vec_sg_200_5_5_15_2_500_d5_uniform.txt")
kg_db = kg_embeddings['DB'].drop_duplicates()

# Distinct values of the 'DB' column of the drug-similarity file.
ssp_embeddings = pd.read_csv("./Data/Embeddings/DeepDDI-drug_similarity.csv")
ssp_db = ssp_embeddings['DB'].drop_duplicates()

In [9]:
# Narrow the DrugBank <-> RxNorm mapping down to drugs that appear in both
# embedding ID lists and are not in the missing-SMILES list.
db_rx = pd.read_csv('./Data/TWOSIDES/TWOSIDES-DB-RxNorm.csv')
print('All drugs:', len(db_rx))

db_rx = db_rx.loc[db_rx['drugbank_id'].isin(kg_db)]
# Both masks are evaluated on the KG-filtered table, so the second count only
# covers drugs that already passed the kg_db filter.
in_kg = db_rx['drugbank_id'].isin(kg_db)
in_ssp = db_rx['drugbank_id'].isin(ssp_db)
print('Drugs with kb_db embedding:', len(db_rx[in_kg]))
print('Drugs with ssp_db embedding:', len(db_rx[in_ssp]))
print('Also remove 5 drugs due to missing SMILES')

db_rx = db_rx[in_kg & in_ssp]
has_smiles = ~db_rx['drugbank_id'].isin(db_no_SMILES)
db_rx = db_rx[has_smiles]

rx_norms = db_rx['rx_norm_id'].drop_duplicates()
print('Final number of Drugs:', len(rx_norms))

All drugs: 5260
Drugs with kb_db embedding: 1656
Drugs with ssp_db embedding: 1407
Also remove 5 drugs due to missing SMILES
Final number of Drugs: 1402


#### Store the triplets with available Embeddings

In [10]:
# Keep only the triplets in which both drugs are in the usable RxNorm set,
# then persist them and show the size plus the first two rows.
df = pd.read_csv('./Data/TWOSIDES/TWOSIDES-xz-triplets.csv')
drug_1_known = df['drug_1_rxnorm_id'].isin(rx_norms)
drug_2_known = df['drug_2_rxnorm_id'].isin(rx_norms)
df = df[drug_1_known & drug_2_known]
df.to_csv('./Data/TWOSIDES/TWOSIDES-xz-triplets-known.csv', index=False)
print(df.shape)
df.head(2)

(26038428, 3)


Unnamed: 0,drug_1_rxnorm_id,drug_2_rxnorm_id,condition_meddra_id
0,10355,136411,10003239
3,10324,8640,10012735


#### Keep only the Conditions-Interactions for the Triplets we will use

In [11]:
# Reload the filtered triplets from disk and show the size plus the first two rows.
known_triplets_path = './Data/TWOSIDES/TWOSIDES-xz-triplets-known.csv'
df = pd.read_csv(known_triplets_path)
print(df.shape)
df.head(2)

(26038428, 3)


Unnamed: 0,drug_1_rxnorm_id,drug_2_rxnorm_id,condition_meddra_id
0,10355,136411,10003239
1,10324,8640,10012735


In [12]:
# Number of distinct condition IDs among the known triplets.
len(df['condition_meddra_id'].drop_duplicates())

12227

In [13]:
# Restrict the condition lookup table to the IDs that occur in the known
# triplets, then store the full table and a names-only file.
used_condition_ids = df['condition_meddra_id'].unique()
is_used = conditions['condition_meddra_id'].isin(used_condition_ids)
conditions = conditions[is_used].drop_duplicates()
conditions.to_csv('./Data/TWOSIDES/TWOSIDES-conditions-names-known.csv', index=False)
condition_names_only = conditions['condition_concept_name']
condition_names_only.to_csv('./Data/TWOSIDES/TWOSIDES-condition_names-known.csv', index=False)

## Create Categories

In [14]:
# Read the known triplets from the location they are written to earlier in
# this notebook (./Data/TWOSIDES/), so the notebook runs top to bottom.
df = pd.read_csv('./Data/TWOSIDES/TWOSIDES-xz-triplets-known.csv')
df.head(2)

Unnamed: 0,drug_1_rxnorm_id,drug_2_rxnorm_id,condition_meddra_id
0,10355,136411,10003239
1,10324,8640,10012735


In [15]:
# Read the known condition names from the location they are written to
# earlier in this notebook (./Data/TWOSIDES/), so the notebook runs top to bottom.
conditions = pd.read_csv('./Data/TWOSIDES/TWOSIDES-conditions-names-known.csv')
conditions.head(2)

Unnamed: 0,condition_meddra_id,condition_concept_name
0,10003239,Arthralgia
1,10012735,Diarrhoea


In [16]:
# Lookup from condition name (second column, as str) to its ID (first column,
# as int). If a name occurs more than once, the last row wins.
conditions_dict = {
    str(concept_name): int(meddra_id)
    for meddra_id, concept_name in zip(conditions.iloc[:, 0], conditions.iloc[:, 1])
}

#### Read all available Categories and the Categories assigned to each Side-Effect

In [17]:
# Load the category mapping; its values are used later as class column names.
with open('./Data/TWOSIDES/TWOSIDES-categories.json') as categories_file:
    categories = json.load(categories_file)

# Load the per-side-effect category lists, keyed by side-effect name.
with open('./Data/TWOSIDES/TWOSIDES-sideeffect-categories.json') as side_effects_file:
    side_effect_category = json.load(side_effects_file)

In [18]:
# Distinct (drug1, drug2, effect) rows with short column names, plus one
# class column per category value, all initialised to 0.0.
df_pairs = (
    df[['drug_1_rxnorm_id', 'drug_2_rxnorm_id', 'condition_meddra_id']]
    .drop_duplicates()
    .rename(columns={
        'drug_1_rxnorm_id': 'drug1',
        'drug_2_rxnorm_id': 'drug2',
        'condition_meddra_id': 'effect',
    })
)
class_column_names = list(categories.values())
df_pairs[class_column_names] = 0.0
df_pairs.head(2)

Unnamed: 0,drug1,drug2,effect,class_00,class_01,class_02,class_03,class_04,class_05,class_06,...,class_46,class_47,class_48,class_49,class_50,class_51,class_52,class_53,class_54,class_55
0,10355,136411,10003239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10324,8640,10012735,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Map each known condition ID to the set of class names of its categories.
# Side effects whose name is not in conditions_dict are skipped, and category
# names missing from `categories` are ignored. Explicit membership checks
# replace the former bare `except: pass`, which hid every kind of error.
side_effect_dict = {}
for effect_name, category_names in side_effect_category.items():
    if effect_name not in conditions_dict:
        continue
    side_effect_dict[conditions_dict[effect_name]] = {
        categories[category_name]
        for category_name in category_names
        if category_name in categories
    }
len(side_effect_dict)

12199

#### Create the Triplets-Classes file

In [20]:
# For every condition ID with assigned classes, set those class columns to 1.0
# on all rows of df_pairs that carry that effect.
bar = ProgressBar(max_value=len(side_effect_dict))
for meddra_id in side_effect_dict:
    bar.next()
    rows_with_effect = df_pairs['effect'] == meddra_id
    class_columns_for_effect = list(side_effect_dict[meddra_id])
    df_pairs.loc[rows_with_effect, class_columns_for_effect] = 1.0
bar.finish()
df_pairs.head()

[38;2;0;255;0m100%[39m [38;2;0;255;0m(12199 of 12199)[39m |##################| Elapsed Time: 0:11:23 Time:  0:11:230555


Unnamed: 0,drug1,drug2,effect,class_00,class_01,class_02,class_03,class_04,class_05,class_06,...,class_46,class_47,class_48,class_49,class_50,class_51,class_52,class_53,class_54,class_55
0,10355,136411,10003239,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,10324,8640,10012735,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,10355,136411,10012735,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,161,1546438,10012735,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7781,9863,10012735,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
# (rows, columns) of the de-duplicated triplets table: drug1, drug2 and effect
# plus one column per category value.
df_pairs.shape

(26037737, 59)

#### Keep a single row for each Drug-Drug pair

In [22]:
# Collapse the per-side-effect rows into one row per (drug1, drug2) pair. Each
# class column holds the maximum over the pair's rows, i.e. 1.0 if any of the
# pair's side effects was flagged with that class.
columns = ['drug1', 'drug2']
columns = columns + list(categories.values())
# Keep only rows whose side effect has a class assignment. The 'effect' values
# are left as they are (not overwritten with a constant), so re-running this
# cell applies the same filter again instead of emptying the table.
df_pairs = df_pairs[df_pairs['effect'].isin(list(side_effect_dict.keys()))]
# Every column other than the two drug IDs and the effect ID is aggregated.
class_columns = [name for name in df_pairs.columns if name not in ('drug1', 'drug2', 'effect')]
df_grouped = (
    df_pairs
    .groupby(['drug1', 'drug2'], sort=False)[class_columns]
    .max()
    .reset_index()
)
df_grouped.to_csv('./Data/TWOSIDES/TWOSIDES-xz-triplets-known-grouped.csv', index=False)
print(df_grouped.shape)
df_grouped.head(2)

(106917, 58)


Unnamed: 0,drug1,drug2,class_00,class_01,class_02,class_03,class_04,class_05,class_06,class_07,...,class_46,class_47,class_48,class_49,class_50,class_51,class_52,class_53,class_54,class_55
0,10355,136411,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,10324,8640,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
