In [1]:
import gc

import numpy as np
import pandas as pd
import cupy as cp
from rdkit import Chem

# NOTE: the wildcard import is deliberately kept ahead of the imports below so
# that any name it happens to export cannot shadow them.
from SCFP_functions.feature import *
import SCFP_functions.SCFPfunctions as Mf
import SCFP_functions.SCFPmodel as Mm

from sklearn import metrics
import chainer
import chainer.functions as F
from chainer import Variable
from chainer import serializers

--------------------------------------------------------------------------------
CuPy (cupy-cuda116) version 10.4.0 may not be compatible with this version of Chainer.
Please consider installing the supported version by running:
  $ pip install 'cupy-cuda116>=7.7.0,<8.0.0'

See the following page for more details:
  https://docs.cupy.dev/en/latest/install.html
--------------------------------------------------------------------------------



In [2]:
 # featurevector size
# Both feature groups have the same width. Their sum (`lensize`) is the size of
# the last axis of the feature tensor and is also handed to the CNN constructor.
# (Presumably per-atom vs. structural features, judging by the names -- TODO confirm.)
atomInfo = structInfo = 21
lensize = sum((atomInfo, structInfo))

In [4]:
# Run configuration, kept as a plain dict that is indexed by string key below.
args = dict(
    # Not referenced by the evaluation cells visible in this notebook.
    batchsize=32,
    # Snapshots are evaluated for epochs frequency, 2*frequency, ..., up to `epoch`.
    epoch=500,
    frequency=1,
    # CUDA device id; a value >= 0 selects CuPy as the array module.
    gpu=0,
    # Not referenced by the evaluation cells visible in this notebook.
    output="./test_out",
    # Maximum SMILES length accepted, and the third axis of the feature tensor.
    atomsize=400,
    # Not referenced by the evaluation cells visible in this notebook.
    boost=-1,
    # The remaining entries are forwarded positionally to Mm.CNN
    # (presumably kernel sizes, strides, filter counts and layer widths -- TODO confirm).
    k1=11,
    s1=1,
    f1=128,
    k2=5,
    s2=1,
    k3=11,
    s3=1,
    f3=64,
    k4=5,
    s4=1,
    n_hid=96,
    n_out=1,
)

In [5]:
# ------------------------------------------------------------------
# Select the array module: NumPy on CPU (negative device id), CuPy on GPU.
if args["gpu"] < 0:
    xp = np
else:
    print('GPU mode')
    xp = cp

GPU mode


In [6]:
# Build the test set: parse the SMILES file, featurise every molecule that fits
# into `atomsize`, and turn features and labels into NumPy arrays.
print('Making Test Dataset...')
file = 'data/tox21_SR-p53/tox21_data_scoring.smiles'
print('Loading smiles: ', file)
smi = Chem.SmilesMolSupplier(file, delimiter='\t', titleLine=False)
# Entries the supplier yields as None are dropped.
mols = [parsed for parsed in smi if parsed is not None]

F_list, T_list = [], []
for mol in mols:
    if len(Chem.MolToSmiles(mol, kekuleSmiles=True, isomericSmiles=True)) <= args["atomsize"]:
        # `mol_to_feature` presumably comes from the wildcard import of
        # SCFP_functions.feature -- TODO confirm.
        F_list.append(mol_to_feature(mol, -1, args["atomsize"]))
        # The molecule's `_Name` property is used as its integer class label.
        T_list.append(mol.GetProp('_Name'))
    else:
        print("SMILES is too long. This mol will be ignored.")

# NOTE(review): features and labels are shuffled by two separate calls. They only
# stay aligned if Mf.random_list applies the same permutation each time (e.g. a
# fixed seed) -- verify against SCFP_functions.SCFPfunctions.
Mf.random_list(F_list)
Mf.random_list(T_list)

data_f = np.asarray(F_list, dtype=np.float32).reshape(-1, 1, args["atomsize"], lensize)
data_t = np.asarray(T_list, dtype=np.int32).reshape(-1, 1)
print(data_t.shape, data_f.shape)

Making Test Dataset...
Loading smiles:  tox21_data_scoring.smiles
(614, 1) (614, 1, 400, 42)


In [7]:
# Number of evaluation mini-batches the test set is cut into.
N_EVAL_BATCHES = 30

# borders[i]:borders[i+1] delimits mini-batch i. The integer arithmetic spreads
# any remainder so that batch sizes differ by at most one sample.
borders = [len(data_t) * i // N_EVAL_BATCHES for i in range(N_EVAL_BATCHES + 1)]

# Copy the whole test set to the selected GPU once, so the evaluation loop only
# has to slice device arrays.
with cp.cuda.Device(args["gpu"]):
    data_f_gpu = cp.array(data_f)
    data_t_gpu = cp.array(data_t)

# Release the host-side intermediates. The names may already be gone when this
# cell is re-run, and `mol` is never bound if no molecule was loaded, so a
# missing name is tolerated instead of aborting the cell. `mol` is listed last
# so that the larger objects are deleted first.
try:
    del mols, F_list, T_list, mol
except NameError:
    pass
gc.collect()

0

In [12]:
# Instantiate the network. Apart from the feature width `lensize`, every
# constructor argument is read from `args`, in the positional order Mm.CNN is
# called with.
model = Mm.CNN(
    args["atomsize"],
    lensize,
    *(args[key] for key in ("k1", "s1", "f1",
                            "k2", "s2",
                            "k3", "s3", "f3",
                            "k4", "s4",
                            "n_hid", "n_out"))
)

model.compute_accuracy = False
# Move the parameters to the selected GPU (last expression: shown as cell output).
model.to_gpu(args["gpu"])

<SCFPmodel.CNN at 0x2742416aef0>

In [None]:
def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero.

    A zero denominator occurs on degenerate epochs (e.g. the model predicts no
    positives, so TP + FP == 0). NaN is what the plain NumPy scalar division
    yields in that case as well; returning it explicitly avoids the
    RuntimeWarning and documents the edge case.
    """
    return numerator / denominator if denominator else float("nan")


# Evaluate every saved snapshot on the test set and write one CSV row per epoch.
# The batch count is derived from `borders` so the two can never disagree.
n_batches = len(borders) - 1

# `with` guarantees the file is closed even if writing the header fails.
with open('./model_output/evaluation_epoch.csv', 'w') as f:
    f.write("epoch,TP,FN,FP,TN,Loss,Accuracy,B_accuracy,Sepecificity,Precision,Recall,F-measure,AUC\n")

    for epoch in range(args["frequency"], args["epoch"]+1, args["frequency"]):
        pred_score, loss = [], []

        # Load the parameters saved for this epoch into the existing model object.
        with cp.cuda.Device(args["gpu"]):
            serializers.load_npz('./model_output/model_snapshot_' + str(epoch), model)

        for i in range(n_batches):
            if borders[i] == borders[i+1]:
                # Fewer samples than batches: nothing to evaluate in this slice.
                continue
            # no_backprop_mode() skips building the autograd graph: nothing is
            # back-propagated here, and forward values are unaffected.
            # NOTE(review): Chainer's `train` config is left unchanged. If Mm.CNN
            # contains layers that behave differently at training time (dropout,
            # batch normalisation -- not visible from this notebook), evaluation
            # should probably also run under chainer.using_config('train', False).
            with cp.cuda.Device(args["gpu"]), chainer.no_backprop_mode():
                x_gpu = data_f_gpu[borders[i]:borders[i+1]]
                y_gpu = data_t_gpu[borders[i]:borders[i+1]]
                pred_tmp_gpu, sr = model.predict(Variable(x_gpu))
                pred_tmp_gpu = F.sigmoid(pred_tmp_gpu)
                pred_tmp = pred_tmp_gpu.data.get()
                # Calling the model with inputs and targets returns the batch loss.
                loss_tmp = model(Variable(x_gpu), Variable(y_gpu)).data.get()
            pred_score.extend(pred_tmp.reshape(-1).tolist())
            loss.append(loss_tmp.tolist())

        # Unweighted mean of the per-batch losses.
        loss = np.mean(loss)
        pred_score = np.array(pred_score).reshape(-1, 1)
        # Hard decision at probability 0.5.
        pred = 1*(pred_score >= 0.5)

        # Confusion-matrix counts against the host-side labels.
        count_TP = np.sum(np.logical_and(data_t == pred, pred == 1)*1)
        count_FP = np.sum(np.logical_and(data_t != pred, pred == 1)*1)
        count_FN = np.sum(np.logical_and(data_t != pred, pred == 0)*1)
        count_TN = np.sum(np.logical_and(data_t == pred, pred == 0)*1)

        Accuracy = _safe_ratio(count_TP + count_TN, count_TP + count_FP + count_FN + count_TN)
        Sepecificity = _safe_ratio(count_TN, count_TN + count_FP)
        Precision = _safe_ratio(count_TP, count_TP + count_FP)
        Recall = _safe_ratio(count_TP, count_TP + count_FN)
        Fmeasure = _safe_ratio(2*Recall*Precision, Recall + Precision)
        B_accuracy = (Sepecificity + Recall)/2
        AUC = metrics.roc_auc_score(data_t, pred_score, average='weighted')

        # Same values, same order, for both the console line and the CSV row.
        row = (epoch, count_TP, count_FN, count_FP, count_TN, loss, Accuracy,
               B_accuracy, Sepecificity, Precision, Recall, Fmeasure, AUC)
        print(*row, sep="\t")
        text = '{0},{1},{2},{3},{4},{5},{6},{7},{8},{9},{10},{11},{12}\n'.format(*row)
        f.write(text)


In [8]:
# Load the per-epoch metrics written by the evaluation loop and pick the epoch
# with the highest test AUC.
df_output = pd.read_csv("./model_output/evaluation_epoch.csv")
# idxmax() returns an index *label*, so the row is fetched with label-based .loc
# (positional .iloc only agrees with it while the index is the default 0..n-1).
index_of_best = df_output['AUC'].idxmax()
best = df_output.loc[index_of_best]
best

epoch             7.000000
TP                9.000000
FN               31.000000
FP               17.000000
TN              557.000000
Loss              0.213412
Accuracy          0.921824
B_accuracy        0.597692
Sepecificity      0.970383
Precision         0.346154
Recall            0.225000
F-measure         0.272727
AUC               0.829443
Name: 6, dtype: float64

In [14]:
# Sanity check: display the shape of the GPU-resident feature array.
data_f_gpu.shape

(614, 1, 400, 42)