In [8]:
import pandas as pd
import numpy as np
import pickle
from typing import Optional, Union, Tuple
from cmapPy.pandasGEXpress.parse import parse
import numpy as np
import json
from collections import Counter
import sys
from numpy.core.multiarray import ndarray
import sys

# Directory prefix for every CSV this notebook reads and writes.
# It is a relative path, so it resolves against the kernel's working directory.
# The trailing slash matters: file names are appended with plain `+` concatenation.
path = "../data/drug_class_identification/phase1/"

### Importing Metadata

In [26]:
# Read the metadata table from the data directory, then show its first
# five rows as this cell's output.
meta = pd.read_csv(filepath_or_buffer=path + "metadata.csv")
meta.head(n=5)

Unnamed: 0,smiles,name,id,inchi_key
0,-666,AKT2,56582,-666
1,-666,HSF1,5981,-666
2,-666,NFE2L2,7150,-666
3,-666,ABL1,ABL1_G2A,-666
4,-666,ABL1,ABL1_T315I,-666


In [66]:
# Dimensions of the metadata table as (rows, columns).
# The former `Counter(meta['inchi_key'])` statement was removed: its result was
# neither assigned nor the last expression of the cell, so it was never shown
# and only cost a full pass over the column.
meta.shape

(51383, 4)

In [30]:
# Row count versus distinct-name count for the metadata table.
# `len(meta)` and `Series.nunique` avoid copying the column into a Python list
# and a sorted NumPy array first. `dropna=False` counts a missing name as one
# value instead of failing on mixed str/NaN data or silently ignoring it.
# Bracket access is used because `.name` attribute access is easy to confuse
# with pandas' own `name` attribute.
print("Total number of rows: ", len(meta))
print("Unique number of perturbagen names: ", meta["name"].nunique(dropna=False))

Total number of rows:  51383
Unique number of perturbagen names:  28957


### Importing Drugbank

In [19]:
# Read the DrugBank table; `low_memory=False` asks pandas to parse the file
# in one pass rather than in chunks. Then preview the first five rows.
drugbank = pd.read_csv(filepath_or_buffer=path + "drugbank.csv", low_memory=False)
drugbank.head(n=5)

Unnamed: 0,name,atc,inchi_key,smiles
0,lepirudin,B01AE02,,
1,cetuximab,L01XC06,,
2,dornase alfa,R05CB13,,
3,denileukin diftitox,L01XX29,,
4,etanercept,L04AB01,,


In [31]:
# Row count versus distinct-name count for the DrugBank table.
# Uses `len(frame)` and `Series.nunique` instead of materialising the column
# as a list / sorted NumPy array. `dropna=False` counts a missing name as one
# value rather than raising on mixed str/NaN data.
print("Total number of rows: ", len(drugbank))
print("Unique number of perturbagen names: ", drugbank["name"].nunique(dropna=False))

Total number of rows:  13339
Unique number of perturbagen names:  13339


### Importing ATC

In [20]:
# Read the ATC table (name -> ATC code); `low_memory=False` asks pandas to
# parse the file in one pass rather than in chunks. Then preview five rows.
atc = pd.read_csv(filepath_or_buffer=path + "atc.csv", low_memory=False)
atc.head(n=5)

Unnamed: 0,name,atc
0,silicones,A03AX13
1,cefatrizine,J01DB07
2,"Technetium 99m compounds, central nervous syst...",V09AA
3,urofollitropin,G03GA04
4,promethazine,D04AA10


In [32]:
# Row count versus distinct-name count for the ATC table.
# Uses `len(frame)` and `Series.nunique` instead of materialising the column
# as a list / sorted NumPy array. `dropna=False` counts a missing name as one
# value rather than raising on mixed str/NaN data.
print("Total number of rows: ", len(atc))
print("Unique number of perturbagen names: ", atc["name"].nunique(dropna=False))

Total number of rows:  6277
Unique number of perturbagen names:  5592


### Drugbank + Metadata

In [38]:
# Read the pre-built DrugBank + metadata table from disk and preview its
# first five rows.
drug_meta = pd.read_csv(filepath_or_buffer=path + "drug_meta.csv")
drug_meta.head(n=5)

Unnamed: 0,smiles,name,id,inchi_key,atc
0,-666,AKT2,56582,-666,
1,-666,HSF1,5981,-666,
2,-666,NFE2L2,7150,-666,
3,-666,ABL1,ABL1_G2A,-666,
4,-666,ABL1,ABL1_T315I,-666,


In [37]:
# Row count versus distinct-name count for the DrugBank + metadata table.
# Uses `len(frame)` and `Series.nunique` instead of materialising the column
# as a list / sorted NumPy array. `dropna=False` counts a missing name as one
# value rather than raising on mixed str/NaN data.
print("Total number of rows: ", len(drug_meta))
print("Unique number of perturbagen names: ", drug_meta["name"].nunique(dropna=False))

Total number of rows:  51383
Unique number of perturbagen names:  28957


### ATC + Metadata

In [54]:
# Read the pre-built ATC + metadata table from disk and preview its first
# two rows.
atc_meta = pd.read_csv(filepath_or_buffer=path + "atc_meta.csv")
atc_meta.head(n=2)

Unnamed: 0,smiles,name,id,inchi_key,atc
0,-666,AKT2,56582,-666,
1,-666,HSF1,5981,-666,


In [55]:
# Row count versus distinct-name count for the ATC + metadata table.
# Uses `len(frame)` and `Series.nunique` instead of copying the column into a
# Python list and then a sorted NumPy array. `dropna=False` still counts a
# missing name as one distinct value.
print("Total number of rows: ", len(atc_meta))
print("Unique number of perturbagen names: ", atc_meta["name"].nunique(dropna=False))

Total number of rows:  52012
Unique number of perturbagen names:  28957


### atc_meta + drug_meta

In [58]:
# Combine the two annotated metadata tables with an outer join and save the
# result. No `on=` is given, so pandas joins on every column the two frames
# have in common. The CSV is written without the index column.
all3 = drug_meta.merge(atc_meta, how="outer")
all3.to_csv(path + "all3.csv", index=False)

In [59]:
# Row count versus distinct-name count for the merged table, then a two-row
# preview. Uses `len(frame)` and `Series.nunique` instead of copying the
# column into a Python list and then a sorted NumPy array. `dropna=False`
# still counts a missing name as one distinct value.
print("Total: ", len(all3))
print("Unique: ", all3["name"].nunique(dropna=False))
all3.head(2)

Total:  53456
Unique:  28957


Unnamed: 0,smiles,name,id,inchi_key,atc
0,-666,AKT2,56582,-666,
1,-666,HSF1,5981,-666,


### Removing rows with missing ATC codes

In [60]:
# Size of the merged table, followed by the number of missing values in
# each of its columns.
print("Total number of rows (includes duplicate perts): ", all3.shape[0])
print(all3.isna().sum())

Total number of rows (includes duplicate perts):  53456
smiles           0
name             0
id               0
inchi_key        0
atc          51342
dtype: int64


In [61]:
# Keep only the rows that carry an ATC code (drops rows whose `atc` is NaN),
# then report how many rows and how many distinct names remain.
# The counts use `len(frame)` and `Series.nunique` instead of copying the
# column into a Python list and then a sorted NumPy array.
newall = all3.dropna(subset=["atc"])
print("Total: ", len(newall))
print("Unique: ", newall["name"].nunique(dropna=False))

Total:  2114
Unique:  1105


In [62]:
# Save the ATC-annotated subset. `index=False` matches how `all3.csv` is
# written: after `dropna` the index is just the leftover, non-contiguous row
# labels of `all3`, and writing it would add a spurious "Unnamed: 0" column
# when the file is read back.
newall.to_csv(path + "all3_without_nan.csv", index=False)

In [63]:
# Spot-check one compound: count its distinct rows (exact duplicates
# collapsed), then preview the first five matching rows (not de-duplicated).
print("Number of entries: ", newall.loc[newall["name"] == "dexamethasone"].drop_duplicates().shape[0])
newall.loc[newall["name"] == "dexamethasone"].head(n=5)

Number of entries:  77


Unnamed: 0,smiles,name,id,inchi_key,atc
51436,C[C@@H]1CC2C3CCC4=CC(=O)C=C[C@]4(C)[C@@]3(F)[C...,dexamethasone,BRD-A10188456,UREBDLICKHMUKA-QCYOSJOCSA-N,D07AB19
51437,C[C@@H]1CC2C3CCC4=CC(=O)C=C[C@]4(C)[C@@]3(F)[C...,dexamethasone,BRD-A10188456,UREBDLICKHMUKA-QCYOSJOCSA-N,C05AA09
51438,C[C@@H]1CC2C3CCC4=CC(=O)C=C[C@]4(C)[C@@]3(F)[C...,dexamethasone,BRD-A10188456,UREBDLICKHMUKA-QCYOSJOCSA-N,H02AB02
51439,C[C@@H]1CC2C3CCC4=CC(=O)C=C[C@]4(C)[C@@]3(F)[C...,dexamethasone,BRD-A10188456,UREBDLICKHMUKA-QCYOSJOCSA-N,S01BA01
51440,C[C@@H]1CC2C3CCC4=CC(=O)C=C[C@]4(C)[C@@]3(F)[C...,dexamethasone,BRD-A10188456,UREBDLICKHMUKA-QCYOSJOCSA-N,D10AA03


In [64]:
# Shape of the array of distinct `id` values among the ATC-annotated rows,
# i.e. a one-element tuple holding the number of unique ids.
np.unique(list(newall["id"])).shape

(1481,)

In [68]:
# (rows, columns) of the subset that has an ATC code.
newall.shape

(2114, 5)

In [70]:
# (rows, columns) of the full merged table, for comparison with `newall`.
all3.shape

(53456, 5)

In [71]:
# Number of rows removed by the ATC filter. It is computed from the frames
# themselves rather than from row counts copied out of earlier outputs, so
# it stays correct when the input data changes.
len(all3) - len(newall)

51342

In [72]:
# Missing values per column of the merged table; the `atc` count should
# equal the number of rows dropped above.
all3.isna().sum()

smiles           0
name             0
id               0
inchi_key        0
atc          51342
dtype: int64