In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from rdkit.Chem import MACCSkeys


In [2]:
# Load the training split from the processed-data folder.
train = pd.read_csv("../data/processed/train_w_desc.csv")

In [3]:
# Preview the first rows of the training table.
train.head()

Unnamed: 0,Drug_ID,Drug,Y,mol,tpsa,mol_w,qed,HBA,HBD,logP,MR
0,Drug 0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,0,<rdkit.Chem.rdchem.Mol object at 0x00000201534...,45.2,319.096508,0.434358,0,0,2.6669,70.426
1,Drug 2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,0,<rdkit.Chem.rdchem.Mol object at 0x00000201534...,20.31,291.071785,0.581359,2,0,4.989,87.333
2,Drug 5,CCOP(=O)(Nc1cccc(Cl)c1)OCC,0,<rdkit.Chem.rdchem.Mol object at 0x00000201531...,47.56,263.047808,0.790087,3,1,3.933,65.9332
3,Drug 6,O=C(O)c1ccccc1O,0,<rdkit.Chem.rdchem.Mol object at 0x00000201531...,57.53,138.031694,0.610259,3,2,1.0904,35.0661
4,Drug 8,O=[N+]([O-])c1ccc(SSc2ccc([N+](=O)[O-])cc2[N+]...,0,<rdkit.Chem.rdchem.Mol object at 0x00000201531...,172.56,397.962705,0.374691,10,0,4.1188,91.2156


In [4]:
import rdkit

In [5]:
from rdkit import Chem

In [6]:
from rdkit.Chem import AllChem

In [8]:
# The 'mol' column read from the CSV only holds the text repr of the Mol
# objects, so rebuild real RDKit molecules from the SMILES column.
train['mol'] = train['Drug'].apply(Chem.MolFromSmiles)



In [19]:
# Show the RDKFingerprint documentation with plain Python instead of the
# IPython-only `??` pager syntax, so the cell also runs outside an
# interactive session.
help(Chem.rdmolops.RDKFingerprint)

[1;31mDocstring:[0m
RDKFingerprint( (Mol)mol [, (int)minPath=1 [, (int)maxPath=7 [, (int)fpSize=2048 [, (int)nBitsPerHash=2 [, (bool)useHs=True [, (float)tgtDensity=0.0 [, (int)minSize=128 [, (bool)branchedPaths=True [, (bool)useBondOrder=True [, (AtomPairsParameters)atomInvariants=0 [, (AtomPairsParameters)fromAtoms=0 [, (AtomPairsParameters)atomBits=None [, (AtomPairsParameters)bitInfo=None]]]]]]]]]]]]]) -> ExplicitBitVect :
    Returns an RDKit topological fingerprint for a molecule
    
      Explanation of the algorithm below.
    
      ARGUMENTS:
    
        - mol: the molecule to use
    
        - minPath: (optional) minimum number of bonds to include in the subgraphs
          Defaults to 1.
    
        - maxPath: (optional) maximum number of bonds to include in the subgraphs
          Defaults to 7.
    
        - fpSize: (optional) number of bits in the fingerprint
          Defaults to 2048.
    
        - nBitsPerHash: (optional) number of bits to set per path
       

In [24]:
# RDKit topological fingerprint of the first training molecule.
vector = Chem.rdmolops.RDKFingerprint(train["mol"][0])

In [29]:
# Number of bits in the fingerprint.
len(vector)

2048

In [33]:
# One RDKit fingerprint per molecule in the training table.
vectors = list(map(AllChem.rdmolops.RDKFingerprint, train["mol"]))

In [38]:
# Number of bits in the first fingerprint of the list.
len(vectors[0])

2048

In [39]:
# Expand every fingerprint into a row of bit values, one column per bit position.
columns = [f'Bit_{bit}' for bit in range(2048)]
bits = [list(fp) for fp in vectors]
df_rdkfp = pd.DataFrame(data=bits, columns=columns)
df_rdkfp.head()

Unnamed: 0,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,Bit_9,...,Bit_2038,Bit_2039,Bit_2040,Bit_2041,Bit_2042,Bit_2043,Bit_2044,Bit_2045,Bit_2046,Bit_2047
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,0,0,0,0,0,1,...,0,0,1,1,0,0,0,1,1,1
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [7]:
def smiles_to_rdkfp(data, save_path=None, fp_size=2048):
    """
    Convert SMILES strings into a dataframe of RDKit topological fingerprints.

    Each SMILES is parsed with RDKit, explicit hydrogens are added, and the
    resulting fingerprint is expanded into one column per bit.

    Parameters
    ----------
    data : iterable of str
        SMILES strings, e.g. a dataframe column. Row order is preserved; the
        returned frame gets a fresh 0..n-1 index.
    save_path : str or path-like, optional
        Where to write the fingerprints as CSV (without the index). Nothing
        is written when this is None.
    fp_size : int, optional
        Number of bits requested for each fingerprint. Defaults to 2048.

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES, columns named Bit_0 .. Bit_{fp_size - 1}.

    Raises
    ------
    ValueError
        If any entry is not a string or cannot be parsed by RDKit.
    """
    smiles = list(data)

    mols = []
    invalid = []
    for position, smi in enumerate(smiles):
        # MolFromSmiles returns None for SMILES it cannot parse; catch that
        # here instead of letting AddHs fail with an opaque argument error.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            invalid.append((position, smi))
        else:
            mols.append(Chem.AddHs(mol))

    if invalid:
        preview = ", ".join(f"{pos}: {smi!r}" for pos, smi in invalid[:5])
        raise ValueError(
            f"{len(invalid)} SMILES could not be parsed (position: value): {preview}"
        )

    vectors = [AllChem.rdmolops.RDKFingerprint(mol, fpSize=fp_size) for mol in mols]

    columns = [f'Bit_{i}' for i in range(fp_size)]
    bits = [list(vector) for vector in vectors]
    df = pd.DataFrame(bits, columns=columns)

    if save_path is not None:
        df.to_csv(save_path, index=False)

    return df

In [42]:
# MACCS keys of the first training molecule. Uses the explicitly imported
# MACCSkeys module rather than the `Chem.MACCSkeys` attribute, which is only
# available when the submodule happens to have been imported already.
maccs = MACCSkeys.GenMACCSKeys(train["mol"][0])

In [43]:
# Show the object returned by GenMACCSKeys.
maccs

<rdkit.DataStructs.cDataStructs.ExplicitBitVect at 0x2810aa24f30>

In [56]:
def smiles_to_maccs(data, save_path=None):
    """
    Convert SMILES strings into a dataframe of MACCS keys.

    Each SMILES is parsed with RDKit, explicit hydrogens are added, and the
    MACCS key vector is expanded into one column per bit.

    Parameters
    ----------
    data : iterable of str
        SMILES strings, e.g. a dataframe column. Row order is preserved; the
        returned frame gets a fresh 0..n-1 index.
    save_path : str or path-like, optional
        Where to write the keys as CSV (without the index). Nothing is
        written when this is None.

    Returns
    -------
    pandas.DataFrame
        One row per input SMILES, columns named Bit_0 .. Bit_{n - 1}, where n
        is the length of the generated key vectors.

    Raises
    ------
    ValueError
        If any entry is not a string or cannot be parsed by RDKit.
    """
    # Column count used when there is no molecule to measure; matches the
    # Bit_0 .. Bit_166 layout this notebook produces for real data.
    default_n_bits = 167

    smiles = list(data)

    mols = []
    invalid = []
    for position, smi in enumerate(smiles):
        # MolFromSmiles returns None for SMILES it cannot parse; catch that
        # here instead of letting AddHs fail with an opaque argument error.
        mol = Chem.MolFromSmiles(smi) if isinstance(smi, str) else None
        if mol is None:
            invalid.append((position, smi))
        else:
            mols.append(Chem.AddHs(mol))

    if invalid:
        preview = ", ".join(f"{pos}: {smi!r}" for pos, smi in invalid[:5])
        raise ValueError(
            f"{len(invalid)} SMILES could not be parsed (position: value): {preview}"
        )

    vectors = [MACCSkeys.GenMACCSKeys(mol) for mol in mols]

    # Empty input would make vectors[0] fail, so fall back to the known width.
    n_bits = len(vectors[0]) if vectors else default_n_bits
    columns = [f'Bit_{i}' for i in range(n_bits)]
    bits = [list(vector) for vector in vectors]
    df = pd.DataFrame(bits, columns=columns)

    if save_path is not None:
        df.to_csv(save_path, index=False)

    return df

In [54]:
# Try the MACCS helper on the training SMILES. `save_path` is a required
# argument of the function defined above, so pass the training output file.
smiles_to_maccs(train["Drug"], "../data/processed/train_maccs.csv")



Unnamed: 0,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,Bit_9,...,Bit_157,Bit_158,Bit_159,Bit_160,Bit_161,Bit_162,Bit_163,Bit_164,Bit_165,Bit_166
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,0,0,1,1,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,1,0,1,1,1,1,1,1,0
2,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
3,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,1,1,1,1,0
4,0,0,0,0,0,0,0,0,0,0,...,0,1,1,0,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
28784,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
28785,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0
28786,0,0,0,0,0,0,0,0,0,0,...,1,1,1,1,1,1,1,1,1,0
28787,0,0,0,0,0,0,0,0,0,0,...,0,1,1,1,1,1,1,1,1,0


In [7]:
# Load the test and validation splits from the processed-data folder.
test, valid = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../data/processed/test_w_desc.csv",
        "../data/processed/valid_w_desc.csv",
    )
)


In [58]:
# Compute and save both fingerprint types for every split:
# first the RDKit fingerprints (train, test, valid), then the MACCS keys.
train_rdkfp, test_rdkfp, valid_rdkfp = (
    smiles_to_rdkfp(frame["Drug"], csv_path)
    for frame, csv_path in (
        (train, "../data/processed/train_rdkfp.csv"),
        (test, "../data/processed/test_rdkfp.csv"),
        (valid, "../data/processed/valid_rdkfp.csv"),
    )
)
train_maccs, test_maccs, valid_maccs = (
    smiles_to_maccs(frame["Drug"], csv_path)
    for frame, csv_path in (
        (train, "../data/processed/train_maccs.csv"),
        (test, "../data/processed/test_maccs.csv"),
        (valid, "../data/processed/valid_maccs.csv"),
    )
)



In [59]:
import pickle

In [60]:
# Persist the RDKit-fingerprint helper as a pickle file.
# NOTE(review): pickle stores a function as a reference to its module and
# name, not its code, so loading this file elsewhere only works where a
# function with this name is already defined -- confirm how it is consumed.
with open("../utils/smiles_to_rdkfp.pkl", mode="wb") as handle:
    handle.write(pickle.dumps(smiles_to_rdkfp))

In [61]:
# Persist the MACCS-keys helper as a pickle file.
# NOTE(review): pickle stores a function as a reference to its module and
# name, not its code, so loading this file elsewhere only works where a
# function with this name is already defined -- confirm how it is consumed.
with open("../utils/smiles_to_maccs.pkl", mode="wb") as handle:
    handle.write(pickle.dumps(smiles_to_maccs))

In [14]:
from rdkit.Chem.AtomPairs import Pairs

In [41]:
# Atom-pair fingerprint (as a bit vector) of the first training molecule.
pairsfp = Pairs.GetAtomPairFingerprintAsBitVect(train["mol"][0])

In [42]:
# Show the object returned for the atom-pair fingerprint.
pairsfp

<rdkit.DataStructs.cDataStructs.SparseBitVect at 0x20fbb9c8db0>

In [43]:
# Number of bits in the atom-pair bit vector.
len(pairsfp)

8388608

In [47]:
# One column label per bit position of the atom-pair fingerprint.
columns = [f'Bit_{bit}' for bit in range(len(pairsfp))]


In [50]:
# Read every bit of the atom-pair fingerprint into a plain Python list.
bits = [pairsfp[position] for position in range(len(pairsfp))]

In [53]:
# Wrap the bit list so it becomes a single row for the DataFrame constructor.
# Re-running this cell nests the list one level deeper each time.
bits = [bits]

In [55]:
# Build a one-row frame from the atom-pair bits to see how large it gets.
df_mock = pd.DataFrame(bits, columns=columns)

In [57]:
# Display the single-row atom-pair frame.
df_mock

Unnamed: 0,Bit_0,Bit_1,Bit_2,Bit_3,Bit_4,Bit_5,Bit_6,Bit_7,Bit_8,Bit_9,...,Bit_8388598,Bit_8388599,Bit_8388600,Bit_8388601,Bit_8388602,Bit_8388603,Bit_8388604,Bit_8388605,Bit_8388606,Bit_8388607
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
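# Atom-pair fingerprints have 8,388,608 bits each, so expanding them into one column per bit is probably too large to compute for the whole dataset.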

In [65]:
from rdkit.Chem.AtomPairs import Torsions

In [66]:
# Topological-torsion fingerprint of the first training molecule.
torsions = Torsions.GetTopologicalTorsionFingerprint(train["mol"][0])

In [75]:
# Count the entries of the torsion fingerprint by iterating over it one by one.
# NOTE(review): this run was interrupted by hand (see the KeyboardInterrupt
# below), so the loop never finished. The vector is presumably sparse and far
# too long to walk element by element -- consider querying its length or its
# non-zero entries directly instead (TODO confirm the RDKit accessor).
count= 0
for i in torsions:
    count+=1
count

KeyboardInterrupt: 

In [76]:
count # value the counter had reached when the loop was interrupted after 5 min

388775973

In [9]:
# Real-world drug data to test the model

In [14]:
# Load the tab-separated repurposing samples file from the raw-data folder.
drugs = pd.read_csv("../data/raw/repurposing_samples_20180907.txt", sep="\t")

In [15]:
# Keep only the SMILES column (as a one-column dataframe).
drugs_df = drugs[["smiles"]]

In [16]:
# Drop rows without a SMILES string; the original row labels are kept.
drugs_df = drugs_df.dropna()

In [17]:
# Check the row count and dtype after dropping missing SMILES.
drugs_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10141 entries, 0 to 10146
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   smiles  10141 non-null  object
dtypes: object(1)
memory usage: 158.5+ KB


In [20]:
# Compute and save RDKit fingerprints for the repurposing SMILES.
drugs_rdkfp = smiles_to_rdkfp(drugs_df["smiles"], "../data/processed/drugs_rdkfp.csv")