In [135]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.Chem import rdFingerprintGenerator

In [115]:
# Load the drug-name -> SMILES table; the first CSV column is used as the row index.
smiles_drug = pd.read_csv(
    "drug_to_smiles_after_master_pipeline.csv",
    index_col=0,
)

In [116]:
# Preview the first rows of the drug / SMILES table.
smiles_drug.head()

Unnamed: 0,drug,smiles
0,5-FU,C1=C(C(=O)NC(=O)N1)F
1,ABT-888,C[C@@]1(CCCN1)C2=NC3=C(C=CC=C3N2)C(=O)N
2,AZD1775,CC(C)(C1=NC(=CC=C1)N2C3=NC(=NC=C3C(=O)N2CC=C)N...
3,BEZ-235,CC(C)(C#N)C1=CC=C(C=C1)N2C3=C4C=C(C=CC4=NC=C3N...
4,BORTEZOMIB,B([C@H](CC(C)C)NC(=O)[C@H](CC1=CC=CC=C1)NC(=O)...


In [117]:
# (rows, columns) of the table as loaded, before any cleaning.
smiles_drug.shape

(4150, 2)

In [118]:
# Discard every row that has a missing value in any column.
smiles_drug = smiles_drug.dropna()

In [119]:
# Shape after dropping rows with missing values.
smiles_drug.shape

(4052, 2)

In [120]:
# Total count of missing cells across the whole table (expected to be 0 after dropna).
smiles_drug.isna().sum().sum()

np.int64(0)

In [121]:

# Morgan (circular) fingerprint settings: atom-neighbourhood radius and bit-vector length.
radius = 2
nBits = 2048

morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)

# Renumber rows 0..n-1 up front: earlier row drops leave gaps in the index, and the
# 'drug' column below is attached by index alignment against the fresh fingerprint frame.
smiles_drug = smiles_drug.reset_index(drop=True)

fps = []
failed_drugs = []  # names of drugs whose SMILES RDKit could not parse

for drug_name, smi in zip(smiles_drug['drug'], smiles_drug['smiles']):
    # Start from an all-zero vector; it doubles as the placeholder for unparsable SMILES.
    arr = np.zeros((nBits,), dtype=int)
    mol = Chem.MolFromSmiles(smi)  # returns None when the string is not valid SMILES
    if mol is not None:
        DataStructs.ConvertToNumpyArray(morgan_gen.GetFingerprint(mol), arr)
    else:
        # Keep the zero-vector fallback, but remember which drug it belongs to so the
        # placeholder row is not mistaken for a real fingerprint later on.
        failed_drugs.append(drug_name)
    fps.append(arr)

# A DataFrame built from a list already has a 0..n-1 index, so no reset is required.
fp_df_with_smile = pd.DataFrame(fps)
fp_df_with_smile['drug'] = smiles_drug['drug']

assert len(fp_df_with_smile) == len(smiles_drug), "fingerprint rows and drug rows diverged"

if failed_drugs:
    print(
        f"{len(failed_drugs)} SMILES could not be parsed; "
        f"all-zero fingerprints were used for: {failed_drugs}"
    )


[20:30:50] SMILES Parse Error: syntax error while parsing: <!DOCTYPE
[20:30:50] SMILES Parse Error: check for mistakes around position 1:
[20:30:50] <!DOCTYPE
[20:30:50] ^
[20:30:50] SMILES Parse Error: Failed parsing SMILES '<!DOCTYPE' for input: '<!DOCTYPE'


In [122]:
# Number of missing drug names; a single .sum() already reduces the Series to a scalar.
smiles_drug['drug'].isna().sum()

np.int64(0)

In [123]:
# Total missing cells in the fingerprint table; a non-zero value would mean the
# index-aligned 'drug' assignment did not line up with the fingerprint rows.
fp_df_with_smile.isna().sum().sum()

np.int64(0)

In [124]:
# Rows whose 'smiles' field holds an HTML page instead of a SMILES string.
has_html_marker = smiles_drug['smiles'].str.contains('<!DOCTYPE', na=False)
invalid_smiles = smiles_drug.loc[has_html_marker]
invalid_smiles

Unnamed: 0,drug,smiles
1564,PLURISIN #1 (NSC 14613),<!DOCTYPE html>\n\n<html>\n\n<head>\n\t<meta h...


In [125]:
# Remove rows whose 'smiles' value begins with an HTML doctype (not a SMILES string).
starts_with_html = smiles_drug['smiles'].str.startswith('<!DOCTYPE', na=False)
smiles_drug = smiles_drug.loc[~starts_with_html]

In [126]:
# Shape after removing the rows whose 'smiles' value starts with '<!DOCTYPE'.
smiles_drug.shape

(4051, 2)

In [127]:
# Re-check that filtering introduced no missing cells.
smiles_drug.isna().sum().sum()

np.int64(0)

In [128]:

# Morgan (circular) fingerprint settings: atom-neighbourhood radius and bit-vector length.
radius = 2
nBits = 2048

morgan_gen = rdFingerprintGenerator.GetMorganGenerator(radius=radius, fpSize=nBits)

# Renumber rows 0..n-1 up front: the preceding row filter leaves gaps in the index, and
# the 'drug' column below is attached by index alignment against the fresh fingerprint frame.
smiles_drug = smiles_drug.reset_index(drop=True)

fps = []
failed_drugs = []  # names of drugs whose SMILES RDKit could not parse

for drug_name, smi in zip(smiles_drug['drug'], smiles_drug['smiles']):
    # Start from an all-zero vector; it doubles as the placeholder for unparsable SMILES.
    arr = np.zeros((nBits,), dtype=int)
    mol = Chem.MolFromSmiles(smi)  # returns None when the string is not valid SMILES
    if mol is not None:
        DataStructs.ConvertToNumpyArray(morgan_gen.GetFingerprint(mol), arr)
    else:
        # Keep the zero-vector fallback, but remember which drug it belongs to so the
        # placeholder row is not mistaken for a real fingerprint later on.
        failed_drugs.append(drug_name)
    fps.append(arr)

# A DataFrame built from a list already has a 0..n-1 index, so no reset is required.
fp_df_with_smile = pd.DataFrame(fps)
fp_df_with_smile['drug'] = smiles_drug['drug']

assert len(fp_df_with_smile) == len(smiles_drug), "fingerprint rows and drug rows diverged"

if failed_drugs:
    print(
        f"{len(failed_drugs)} SMILES could not be parsed; "
        f"all-zero fingerprints were used for: {failed_drugs}"
    )


In [129]:
# Preview the fingerprint table: one column per bit, with 'drug' appended as the last column.
fp_df_with_smile.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,2039,2040,2041,2042,2043,2044,2045,2046,2047,drug
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,5-FU
1,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ABT-888
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,AZD1775
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,BEZ-235
4,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,BORTEZOMIB


In [130]:
# Move the 'drug' label to the front, keeping the bit columns in their existing order.
cols = ['drug']
cols.extend(col for col in fp_df_with_smile.columns if col != 'drug')
fp_df_with_smile = fp_df_with_smile.loc[:, cols]

In [131]:
# Preview again to confirm 'drug' is now the leading column.
fp_df_with_smile.head()

Unnamed: 0,drug,0,1,2,3,4,5,6,7,8,...,2038,2039,2040,2041,2042,2043,2044,2045,2046,2047
0,5-FU,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,ABT-888,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,AZD1775,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,BEZ-235,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,BORTEZOMIB,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [132]:
# Final check that the feature table contains no missing cells before export.
fp_df_with_smile.isna().sum().sum()

np.int64(0)

In [133]:
# Count distinct drug names; comparing this with the row count reveals duplicate drugs.
fp_df_with_smile['drug'].nunique()

4051

In [134]:
# Persist the feature table. The row index is written as an unnamed first column
# (the pandas default, spelled out here), so readers should pass index_col=0.
fp_df_with_smile.to_csv("drug_feature.csv", index=True)