In [1]:
import os
import pandas as pd
import numpy as np

import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors

from models.mol2vec.features import mol2alt_sentence, mol2sentence, MolSentence, DfVec, sentences2vec
from models.mol2vec.helpers import depict_identifier, mol_to_svg
from gensim.models import word2vec

import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from math import sqrt
from statistics import mean, stdev
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold

In [24]:
class CustomModel_2(nn.Module):
    """Fully connected network: in_features -> 1500 -> 500 -> 100 -> out_features.

    ReLU is applied after each of the first three linear layers; the last
    layer is returned without an activation.

    Args:
        in_features: Width of the input feature vector. When omitted, the
            notebook-level global ``input_dim`` is used, so the original
            zero-argument call ``CustomModel_2()`` keeps working.
        out_features: Number of values produced per sample (default 59).

    Raises:
        NameError: If ``in_features`` is omitted and no global ``input_dim``
            has been defined yet.
    """

    def __init__(self, in_features=None, out_features=59):
        super(CustomModel_2, self).__init__()
        if in_features is None:
            # Backward-compatible fallback to the global set by the prediction cell.
            in_features = input_dim
        # Attribute names fc1..fc4 are kept so previously saved state_dicts still load.
        self.fc1 = nn.Linear(in_features, 1500)
        self.fc2 = nn.Linear(1500, 500)
        self.fc3 = nn.Linear(500, 100)
        self.fc4 = nn.Linear(100, out_features)

    def forward(self, x):
        """Run the forward pass on a batch ``x`` and return the raw outputs of ``fc4``."""
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.relu(self.fc3(x))
        x = self.fc4(x)

        return x
    
def toxic_label(value):
    """Return the four-level toxicity class name for a numeric value.

    Each threshold is an inclusive lower bound, checked from highest to
    lowest: >= 5000 -> 'Safe_chemicals', >= 500 -> 'Slightly_toxic',
    >= 50 -> 'Moderately_toxic'. Anything else (including values for which
    every comparison is False) -> 'Highest_toxic'.
    """
    graded_bounds = (
        (5000, 'Safe_chemicals'),
        (500, 'Slightly_toxic'),
        (50, 'Moderately_toxic'),
    )
    for lower_bound, class_name in graded_bounds:
        if value >= lower_bound:
            return class_name
    return 'Highest_toxic'

def toxic_label2(value):
    """Return the binary class name: 'Safe_chemicals' when value >= 2000, else 'Toxic_chemicals'."""
    return 'Safe_chemicals' if value >= 2000 else 'Toxic_chemicals'

In [3]:
# Path setup: model and data directories are resolved relative to the working directory.
path = os.getcwd()
model_path = path + '/models/'
data_path = path + '/data/'
# exist_ok=True makes directory creation idempotent and avoids the
# check-then-create race of os.path.exists() followed by os.makedirs().
for required_dir in (model_path, data_path):
    os.makedirs(required_dir, exist_ok=True)

# Inference runs on the CPU.
# GPU alternative: torch.device('cuda:3') if torch.cuda.is_available() else torch.device('cpu')
device = torch.device('cpu')

In [4]:
print('Generate Molwt column')
# Extract MolWt; it is needed later to reverse the standardization of the predictions.
# NOTE(review): absolute, machine-specific path — consider building it from data_path.
data = pd.read_csv("/home/psy/LD50/Toxicity_prediction/data/FDA_smiles_jy.csv")
# .copy() gives an independent frame, so adding the MolWt column below does not
# write into a slice of the CSV frame (avoids SettingWithCopyWarning).
data = data.iloc[:, 0:2].copy()
smiles_list = data['canonical smiles'].values  # SMILES strings only
mol = [Chem.MolFromSmiles(smiles) for smiles in smiles_list]

# MolFromSmiles returns None for SMILES it cannot parse; Descriptors.MolWt(None)
# would then fail with an opaque error, so report the offending strings explicitly.
unparsable_smiles = [smiles for smiles, parsed in zip(smiles_list, mol) if parsed is None]
if unparsable_smiles:
    raise ValueError(f"Could not parse {len(unparsable_smiles)} SMILES string(s): {unparsable_smiles}")

molecular_weight = [Descriptors.MolWt(mols) for mols in mol]
data['MolWt'] = molecular_weight
print('Done')

Generate Molwt column
Done




In [5]:
print('Generate Avalon_fp')
# Convert each SMILES string to an RDKit molecule and compute its Avalon fingerprint.
N_AVALON_BITS = 1024
avalon_fps = []
# Index labels of the `data` rows whose SMILES parsed. smiles_list was taken from
# `data`, so positions correspond. Keeping the labels lets the later axis=1 concat
# align fingerprints with the right compound even when a SMILES is skipped; the
# original default 0..k-1 index shifted every row after a failure.
parsed_rows = []
for row_label, smiles in zip(data.index, smiles_list):
    molecule = Chem.MolFromSmiles(smiles)  # SMILES -> molecule
    if molecule is None:
        # The SMILES could not be parsed; report it and leave this row without a fingerprint.
        print(f"Failed to generate Avalon fingerprint for SMILES: {smiles}")
        continue
    avalon_fp = pyAvalonTools.GetAvalonFP(molecule, nBits=N_AVALON_BITS)  # molecule -> fingerprint
    avalon_fps.append(avalon_fp.ToList())  # list of 0/1 values, one per bit
    parsed_rows.append(row_label)
# reshape keeps a well-formed (0, N_AVALON_BITS) array when nothing parsed.
avalon_fps = np.array(avalon_fps).reshape(-1, N_AVALON_BITS)

# Name the fingerprint columns Avalon_0 ... Avalon_1023.
col_list = [f'Avalon_{i}' for i in range(N_AVALON_BITS)]

avalon_bits = pd.DataFrame(avalon_fps, index=parsed_rows, columns=col_list)
print('Done')

Generate Avalon_fp




Done


In [6]:
print('Generate mol2vec embedding vector')
# Build mol2vec embedding vectors for the unseen data.
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of `data` (avoids SettingWithCopyWarning).
mol2vec_df = data.iloc[:, 0:2].copy()  # ID and SMILES columns only

mol = [Chem.MolFromSmiles(x) for x in smiles_list]
mol2vec_df['ROMol'] = mol

# Build one sentence per molecule.
# NOTE(review): the second argument (1) is presumably the fingerprint radius —
# confirm against models.mol2vec.features.
mol2vec_df['sentence'] = mol2vec_df.apply(lambda x: MolSentence(mol2alt_sentence(x['ROMol'], 1)), axis=1)

# Load the pretrained mol2vec model.
# NOTE: the name `model` is re-bound to the PyTorch network in the prediction cell.
model = word2vec.Word2Vec.load(model_path + 'mol2vec/mol2vec_300dim.pkl')

# Compute the embedding vector of every sentence; unseen='UNK' is passed through to sentences2vec.
mol2vec_df['mol2vec'] = [DfVec(x) for x in sentences2vec(mol2vec_df['sentence'], model, unseen='UNK')]

# Stack the per-molecule vectors into one 2-D array.
mol2vec_emb = np.array([x.vec for x in mol2vec_df['mol2vec']])

# Name the embedding columns Mol2vec_0 ... Mol2vec_299.
MOL2VEC_DIM = 300
col_list = [f'Mol2vec_{i}' for i in range(MOL2VEC_DIM)]

Mol2vec = pd.DataFrame(mol2vec_emb)
Mol2vec.columns = col_list
print('Done')

Generate mol2vec embedding vector




Done


In [8]:
print('Prediction start')
# Assemble the model input and run the trained network on it.
test_set = pd.concat([data, avalon_bits, Mol2vec], axis=1)
# The first three columns come from `data` (two CSV columns + MolWt);
# everything after them is fed to the network.
feature_frame = test_set.iloc[:, 3:]
X_test = feature_frame.values
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
input_dim = X_test.shape[1]  # read as a global by CustomModel_2.__init__

model = CustomModel_2()
saved_weights = torch.load(model_path + 'Best_model_fold_1', map_location='cpu')
model.load_state_dict(saved_weights)

# Evaluation mode, no gradient tracking: inference only.
model.eval()
with torch.no_grad():
    y_pred = model(X_test_tensor).numpy()
print('Done')

Prediction start
Done


# Classification assignment 1: four-level toxicity labels (`toxic_label`)

In [23]:
print('Make result file format')
# Task names become the column labels of the prediction frame.
task_list = np.load(data_path + "task_list.npy")
n_tasks = len(task_list)  # replaces the hard-coded 59
n_id_cols = data.shape[1]  # columns contributed by `data` (replaces the hard-coded 3)

# Reverse standardization, vectorized over all rows at once:
#   value = (1 / 10**prediction) * MolWt * 10**3
# NOTE(review): presumably converts -log10(mol/kg) predictions to mg/kg — confirm.
mol_weights = data['MolWt'].to_numpy(dtype=np.float64)
log_values = np.asarray(y_pred, dtype=np.float64)
pred = pd.DataFrame((1 / 10 ** log_values) * mol_weights[:, None] * 10 ** 3, columns=task_list)
pred['molwt'] = mol_weights

# Label every task column (everything except the trailing 'molwt').
# Series.map per column replaces the deprecated DataFrame.applymap.
pred_label = pred.iloc[:, 0:-1].apply(lambda col: col.map(toxic_label))
pred_label = pd.concat([data, pred_label], axis=1)
task_labels = pred_label.iloc[:, n_id_cols:]
count_label = task_labels.T

# Per-compound count of tasks in each class: label returned by toxic_label -> output column.
# A label that never occurs yields 0, so no KeyError handling is needed, and
# the builtin `dict` is no longer shadowed.
label_to_column = {
    'Safe_chemicals': 'Safe_chemicals',
    'Slightly_toxic': 'Slightly_toxicity',
    'Moderately_toxic': 'Moderately_toxicity',
    'Highest_toxic': 'Highest_toxicity',
}
for label, column in label_to_column.items():
    pred_label[column] = (task_labels == label).sum(axis=1)
count_cols = list(label_to_column.values())

toxic_count = pred_label[count_cols].values
toxic_percentage = np.round((toxic_count / n_tasks) * 100, 2)

col_names = ['Safe_chemicals_per','Slightly_toxicity_per','Moderately_toxicity_per','Highest_toxicity_per']
toxic_per = pd.DataFrame(toxic_percentage, columns=col_names, index=pred_label.index)

results = pd.concat([pred_label.iloc[:, 0:2], pred_label[count_cols], toxic_per], axis=1)
# Weighted score: class weights 0 / 3.33 / 6.66 / 10, averaged over the tasks.
results['toxic_score'] = np.round((results['Safe_chemicals']*0 + results['Slightly_toxicity']*3.33 + results['Moderately_toxicity']*6.66 + results['Highest_toxicity']*10)/n_tasks, 2)
# Standard deviation of the four class counts of each compound.
results['std'] = np.round(results[count_cols].std(axis=1), 2)

# Create the output directory first; to_csv does not create missing directories.
os.makedirs(path + '/Results', exist_ok=True)
results.to_csv(path+'/Results/LD50_prediction_of_FDA.csv',index=False)
print('Done')
# results

Make result file format
Done


# Classification assignment 2: binary safe/toxic labels (`toxic_label2`)

In [25]:
print('Make result file format')
# Task names become the column labels of the prediction frame.
task_list = np.load(data_path + "task_list.npy")
n_tasks = len(task_list)  # replaces the hard-coded 59
n_id_cols = data.shape[1]  # columns contributed by `data` (replaces the hard-coded 3)

# Reverse standardization, vectorized over all rows at once:
#   value = (1 / 10**prediction) * MolWt * 10**3
# NOTE(review): presumably converts -log10(mol/kg) predictions to mg/kg — confirm.
mol_weights = data['MolWt'].to_numpy(dtype=np.float64)
log_values = np.asarray(y_pred, dtype=np.float64)
pred = pd.DataFrame((1 / 10 ** log_values) * mol_weights[:, None] * 10 ** 3, columns=task_list)
pred['molwt'] = mol_weights

# Label every task column (everything except the trailing 'molwt').
# Series.map per column replaces the deprecated DataFrame.applymap.
pred_label = pred.iloc[:, 0:-1].apply(lambda col: col.map(toxic_label2))
pred_label = pd.concat([data, pred_label], axis=1)
task_labels = pred_label.iloc[:, n_id_cols:]
count_label = task_labels.T

# Per-compound count of tasks in each class. A label that never occurs yields 0,
# so no KeyError handling is needed, and the builtin `dict` is no longer shadowed.
count_cols = ['Safe_chemicals', 'Toxic_chemicals']
for column in count_cols:
    pred_label[column] = (task_labels == column).sum(axis=1)

toxic_count = pred_label[count_cols].values
toxic_percentage = np.round((toxic_count / n_tasks) * 100, 2)

col_names = ['Safe_chemicals_per','Toxic_chemicals_per']
toxic_per = pd.DataFrame(toxic_percentage, columns=col_names, index=pred_label.index)

results = pd.concat([pred_label.iloc[:, 0:2], pred_label[count_cols], toxic_per], axis=1)

# Create the output directory first; to_csv does not create missing directories.
os.makedirs(path + '/Results', exist_ok=True)
results.to_csv(path+'/Results/LD50_prediction_of_FDA2.csv',index=False)
print('Done')
# results

Make result file format
Done


In [26]:
# Display the `results` frame (last written to LD50_prediction_of_FDA2.csv by the cell above).
results

Unnamed: 0,drug_name,canonical smiles,Safe_chemicals,Toxic_chemicals,Safe_chemicals_per,Toxic_chemicals_per
0,ROSUVASTATIN,CC(C)C1=NC(=NC(=C1C=CC(CC(CC(=O)O)O)O)C2=CC=C(...,58,1,98.31,1.69
1,ATORVASTATIN CALCIUM TRIHYDRATE,CC(C)C1=C(C(=C(N1CCC(CC(CC(=O)[O-])O)O)C2=CC=C...,44,15,74.58,25.42
2,TAPENTADOL HYDROCHLORIDE,CCC(C1=CC(=CC=C1)O)C(C)CN(C)C.Cl,58,1,98.31,1.69
3,BUPRENORPHINE,CC(C)(C)C(C)(C1CC23CCC1(C4C25CCN(C3CC6=C5C(=C(...,57,2,96.61,3.39
4,TOLCAPONE,CC1=CC=C(C=C1)C(=O)C2=CC(=C(C(=C2)O)O)[N+](=O)...,59,0,100.00,0.00
...,...,...,...,...,...,...
1372,TELAPREVIR,CCCC(C(=O)C(=O)NC1CC1)NC(=O)C2C3CCCC3CN2C(=O)C...,55,4,93.22,6.78
1373,COBIMETINIB,C1CCNC(C1)C2(CN(C2)C(=O)C3=C(C(=C(C=C3)F)F)NC4...,56,3,94.92,5.08
1374,DROSPIRENONE,CC12CCC(=O)C=C1C3CC3C4C2CCC5(C4C6CC6C57CCC(=O)...,58,1,98.31,1.69
1375,ABACAVIR,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)C4CC(C=C4)CO,56,3,94.92,5.08


In [23]:
# Display the per-task label frame.
# NOTE(review): this cell's execution count is out of sequence, so the stored output
# may come from an earlier kernel state rather than from the cells above.
pred_label

Unnamed: 0,drug_name,canonical smiles,MolWt,mouse_intraperitoneal_LD50,mammal (species unspecified)_intraperitoneal_LD50,guinea pig_intraperitoneal_LD50,rat_intraperitoneal_LD50,rabbit_intraperitoneal_LD50,mouse_intraperitoneal_LDLo,rat_intraperitoneal_LDLo,...,guinea pig_subcutaneous_LDLo,rat_subcutaneous_LDLo,rabbit_subcutaneous_LDLo,frog_subcutaneous_LDLo,mouse_intramuscular_LD50,rat_intramuscular_LD50,mouse_parenteral_LD50,Safe_chemicals,Toxicity_chemicals,Toxic_chemicals
0,ROSUVASTATIN,CC(C)C1=NC(=NC(=C1C=CC(CC(CC(=O)O)O)O)C2=CC=C(...,481.546,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,0,0,0.0
1,ATORVASTATIN CALCIUM TRIHYDRATE,CC(C)C1=C(C(=C(N1CCC(CC(CC(=O)[O-])O)O)C2=CC=C...,1209.407,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,11,0,0.0
2,TAPENTADOL HYDROCHLORIDE,CCC(C1=CC(=CC=C1)O)C(C)CN(C)C.Cl,257.805,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,1,0,0.0
3,BUPRENORPHINE,CC(C)(C)C(C)(C1CC23CCC1(C4C25CCN(C3CC6=C5C(=C(...,467.650,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,2,0,0.0
4,TOLCAPONE,CC1=CC=C(C=C1)C(=O)C2=CC(=C(C(=C2)O)O)[N+](=O)...,273.244,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372,TELAPREVIR,CCCC(C(=O)C(=O)NC1CC1)NC(=O)C2C3CCCC3CN2C(=O)C...,679.863,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,3,0,0.0
1373,COBIMETINIB,C1CCNC(C1)C2(CN(C2)C(=O)C3=C(C(=C(C=C3)F)F)NC4...,531.316,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,3,0,0.0
1374,DROSPIRENONE,CC12CCC(=O)C=C1C3CC3C4C2CCC5(C4C6CC6C57CCC(=O)...,366.501,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,0,0,0.0
1375,ABACAVIR,C1CC1NC2=C3C(=NC(=N2)N)N(C=N3)C4CC(C=C4)CO,286.339,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,2,0,0.0


In [22]:
# Count, per compound, how many tasks were labelled 'Safe_chemicals' / 'Toxic_chemicals'.
# Both counter columns are initialised under the same names that are filled below.
# The original initialised 'Toxicity_chemicals' but wrote to 'Toxic_chemicals',
# which left a stray all-zero column.
pred_label['Safe_chemicals'] = 0
pred_label['Toxic_chemicals'] = 0

# One {label: count} mapping per column of count_label (i.e. per compound).
# The loop variable is not called `dict`, so the builtin stays usable afterwards.
count_list = [
    count_label.iloc[:, i].value_counts().to_dict()
    for i in range(len(count_label.columns))
]

for i, label_counts in enumerate(count_list):
    # .get(..., 0): a label that never occurs for this compound counts as zero.
    pred_label.loc[i, 'Safe_chemicals'] = label_counts.get('Safe_chemicals', 0)
    pred_label.loc[i, 'Toxic_chemicals'] = label_counts.get('Toxic_chemicals', 0)

In [17]:
# Prepend the columns of `data` to the label frame, then transpose everything
# from the fourth column onward so that each column of count_label is one row of pred_label.
# NOTE(review): not idempotent — pred_label is rebuilt from itself, so every re-run
# of this cell prepends the columns of `data` again.
pred_label = pd.concat([data,pred_label],axis=1)
count_label = pred_label.iloc[:,3:].T

In [19]:
# Transpose everything from the fourth column of pred_label onward
# (one column of count_label per row of pred_label).
count_label = pred_label.iloc[:,3:].T

In [20]:
# Display the transposed label frame.
# NOTE(review): this cell's execution count is out of sequence, so the stored output
# may come from an earlier kernel state.
count_label

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1367,1368,1369,1370,1371,1372,1373,1374,1375,1376
mouse_intraperitoneal_LD50,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Safe_chemicals,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
mammal (species unspecified)_intraperitoneal_LD50,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
guinea pig_intraperitoneal_LD50,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Safe_chemicals,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
rat_intraperitoneal_LD50,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Safe_chemicals,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
rabbit_intraperitoneal_LD50,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Safe_chemicals,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
mouse_intraperitoneal_LDLo,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
rat_intraperitoneal_LDLo,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
mouse_intravenous_LD50,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Safe_chemicals,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
guinea pig_intravenous_LD50,Toxic,Safe_chemicals,Toxic,Toxic,Toxic,Toxic,Safe_chemicals,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
rat_intravenous_LD50,Toxic,Safe_chemicals,Toxic,Toxic,Toxic,Toxic,Safe_chemicals,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic


In [12]:
print('Make result file format')
task_list = np.load(data_path + "task_list.npy")
pred = pd.DataFrame(y_pred)
pred.columns = task_list
pred['molwt'] = data['MolWt'].values

for i in range(len(pred)):
    reverse_standardization = (1/10**(pred.iloc[i,0:59]))*pred.iloc[i,59]*10**(3)
    pred.iloc[i,0:59] = reverse_standardization

Make result file format


In [14]:
# Label every column except the last one of `pred` with toxic_label2.
# Series.map per column gives the same element-wise result as DataFrame.applymap
# without relying on the deprecated method.
pred_label = pred.iloc[:, 0:-1].apply(lambda col: col.map(toxic_label2))

In [15]:
# Display the labelled prediction frame.
# NOTE(review): this cell's execution count is out of sequence, so the stored output
# may come from an earlier kernel state.
pred_label

Unnamed: 0,mouse_intraperitoneal_LD50,mammal (species unspecified)_intraperitoneal_LD50,guinea pig_intraperitoneal_LD50,rat_intraperitoneal_LD50,rabbit_intraperitoneal_LD50,mouse_intraperitoneal_LDLo,rat_intraperitoneal_LDLo,mouse_intravenous_LD50,guinea pig_intravenous_LD50,rat_intravenous_LD50,...,rat_subcutaneous_LD50,rabbit_subcutaneous_LD50,mouse_subcutaneous_LDLo,guinea pig_subcutaneous_LDLo,rat_subcutaneous_LDLo,rabbit_subcutaneous_LDLo,frog_subcutaneous_LDLo,mouse_intramuscular_LD50,rat_intramuscular_LD50,mouse_parenteral_LD50
0,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
1,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Safe_chemicals,Safe_chemicals,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
2,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
3,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
4,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
1373,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
1374,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
1375,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,...,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic,Toxic
