In [1]:
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import os
import sys
import nbimporter

# The project packages imported below are resolved relative to the parent of
# the current working directory, so that directory is put on sys.path.
project_root = os.path.join(os.getcwd(), '..')
# Guard the append: re-running this cell in a live kernel would otherwise add
# the same entry to sys.path again on every execution.
if project_root not in sys.path:
    sys.path.append(project_root)

from datapreparation.Process_graph_2d_data import *
from models.Model_building import *
from augmentation.data_augmentation import *
from cv_strategies.train_cv_strategy_123D import *
from Extract_features.extract_features_from_dataset import *
from testing.offline_test import *

# Training

In [2]:
# Training set on disk; the path is relative to the notebook's working directory.
canonical_file = "../dataset/final_canonical_trainset.csv"
# Load it as-is; the augmentation step in the next cell consumes this frame.
ori_smiles_df = pd.read_csv(filepath_or_buffer=canonical_file)

## data augmentation

In [3]:
# Build the augmented training frame from the raw one.
# NOTE(review): augmentation runs before any train/validation split. If variants
# of one molecule can land on both sides of a split, validation scores will be
# optimistic - confirm how the training helpers partition the data.
smiles_df = smiles_augmentation(
    ori_smiles_df,
    original_multiplier=4,
    num_variants=4,
)

In [4]:
# Sanity check: (rows, columns) of the augmented frame.
augmented_shape = smiles_df.shape
print(augmented_shape)

(9928, 2)


In [5]:
# Split the augmented frame into its two columns: SMILES strings and labels.
smiles_list, labels = smiles_df["SMILES"], smiles_df["Label"]
# Per-dataset atom information from the project helper; it is reused below for
# graph construction and again by the offline-test cells.
atom_numbers = get_atom_number(smiles_list)

In [6]:
# Convert every (SMILES, label) pair into the project's 2D graph representation.
graph_data_2d = preprocess_2d_graph_data(smiles_list, labels, atom_numbers)

# Read the feature widths off the first graph; they size the model's input layer.
first_graph = graph_data_2d[0]
num_node_features = first_graph['nodes_features'].shape[1]
num_edge_features = first_graph['edge_attr'].shape[1]

get graph_data: 100%|█████████████████████████████████████████████████████████████| 9928/9928 [00:10<00:00, 918.81it/s]


## set parameters

In [7]:
# --- Reproducibility ---------------------------------------------------------
# Seed PyTorch's random number generators so weight initialisation and other
# torch-driven randomness repeat across "Restart & Run All".
# NOTE(review): this does not cover randomness used by cells that ran earlier,
# nor any non-torch RNG the project helpers may use - seed those separately.
SEED = 42
torch.manual_seed(SEED)

# --- Device, output head and loss --------------------------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Single-logit binary head paired with BCEWithLogitsLoss.
# Alternative kept for reference: output_dim = 2 with nn.CrossEntropyLoss().
output_dim = 1
criterion = nn.BCEWithLogitsLoss()

# --- Optimisation schedule -----------------------------------------------------
batch_size = 64
epoch_num = 150

# --- Feature scaling -----------------------------------------------------------
# Scaler file for the general run.
# Options: None | 'standardize_scale_2d.pkl' | 'normalize_scale_2d.pkl'
scale_path_2d = 'standardize_scale_2d.pkl'
# Scaler file for the cross-validation run.
# Options: None | 'standardize_scale_cv_2d.pkl'
scale_path_cv_2d = 'standardize_scale_cv_2d.pkl'
# Scaling mode per run. Options: None | 'standardize' | 'normalize'
Preprocess2d = 'standardize'
Preprocesscv_2d = 'standardize'

## general training

In [8]:
# Network and optimizer for the single (non-cross-validated) training run.
model_general = EnhancedGCNNnodes(num_node_features, output_dim, dropout_rate=0)
optimizer_general = optim.Adam(model_general.parameters(), lr=0.01)

# Positional order here is (..., criterion, batch_size, epoch_num, device, tag),
# which differs from the cross-validation helper called later in the notebook.
training_general(
    graph_data_2d,
    model_general,
    optimizer_general,
    criterion,
    batch_size,
    epoch_num,
    device,
    '2d',
    mode='train',
    Preprocess=Preprocess2d,
    scale_path=scale_path_2d,
)

Epoch: 1/150, Train Loss: 0.6485, Train Acc: 0.6202, Val Loss: 0.6430, Val Acc: 0.6324, SEN: 0.5766, SPE: 0.6890, MCC: 0.2672
Epoch: 2/150, Train Loss: 0.6326, Train Acc: 0.6476, Val Loss: 0.6176, Val Acc: 0.6591, SEN: 0.7427, SPE: 0.5745, MCC: 0.3219
Epoch: 3/150, Train Loss: 0.6192, Train Acc: 0.6574, Val Loss: 0.6109, Val Acc: 0.6732, SEN: 0.7167, SPE: 0.6292, MCC: 0.3473
Epoch: 4/150, Train Loss: 0.6129, Train Acc: 0.6657, Val Loss: 0.6199, Val Acc: 0.6611, SEN: 0.8609, SPE: 0.4590, MCC: 0.3496
Epoch: 5/150, Train Loss: 0.6012, Train Acc: 0.6667, Val Loss: 0.6229, Val Acc: 0.6365, SEN: 0.8889, SPE: 0.3810, MCC: 0.3136
Epoch: 6/150, Train Loss: 0.5879, Train Acc: 0.6804, Val Loss: 0.5958, Val Acc: 0.6687, SEN: 0.6066, SPE: 0.7315, MCC: 0.3407
Epoch: 7/150, Train Loss: 0.5871, Train Acc: 0.6823, Val Loss: 0.5838, Val Acc: 0.6697, SEN: 0.7638, SPE: 0.5745, MCC: 0.3446
Epoch: 8/150, Train Loss: 0.5865, Train Acc: 0.6813, Val Loss: 0.5795, Val Acc: 0.6772, SEN: 0.6587, SPE: 0.6960, MCC:

Epoch: 66/150, Train Loss: 0.3056, Train Acc: 0.8562, Val Loss: 0.3550, Val Acc: 0.8409, SEN: 0.7778, SPE: 0.9048, MCC: 0.6877
Epoch: 67/150, Train Loss: 0.3008, Train Acc: 0.8631, Val Loss: 0.3184, Val Acc: 0.8343, SEN: 0.8468, SPE: 0.8217, MCC: 0.6688
Epoch: 68/150, Train Loss: 0.3085, Train Acc: 0.8573, Val Loss: 0.3481, Val Acc: 0.8374, SEN: 0.7518, SPE: 0.9240, MCC: 0.6855
Epoch: 69/150, Train Loss: 0.3167, Train Acc: 0.8581, Val Loss: 0.3615, Val Acc: 0.8444, SEN: 0.7818, SPE: 0.9078, MCC: 0.6947
Epoch: 70/150, Train Loss: 0.3011, Train Acc: 0.8607, Val Loss: 0.3523, Val Acc: 0.8323, SEN: 0.8919, SPE: 0.7720, MCC: 0.6691
Epoch: 71/150, Train Loss: 0.2891, Train Acc: 0.8664, Val Loss: 0.3243, Val Acc: 0.8364, SEN: 0.8579, SPE: 0.8146, MCC: 0.6732
Epoch: 72/150, Train Loss: 0.3037, Train Acc: 0.8633, Val Loss: 0.3158, Val Acc: 0.8535, SEN: 0.8348, SPE: 0.8723, MCC: 0.7076
Epoch: 73/150, Train Loss: 0.2805, Train Acc: 0.8691, Val Loss: 0.3120, Val Acc: 0.8676, SEN: 0.8358, SPE: 0.89

Epoch: 131/150, Train Loss: 0.2028, Train Acc: 0.9140, Val Loss: 0.2584, Val Acc: 0.8842, SEN: 0.8989, SPE: 0.8693, MCC: 0.7686
Epoch: 132/150, Train Loss: 0.2130, Train Acc: 0.9102, Val Loss: 0.2505, Val Acc: 0.8978, SEN: 0.8789, SPE: 0.9169, MCC: 0.7962
Epoch: 133/150, Train Loss: 0.2235, Train Acc: 0.9087, Val Loss: 0.3088, Val Acc: 0.8590, SEN: 0.8649, SPE: 0.8531, MCC: 0.7180
Epoch: 134/150, Train Loss: 0.2340, Train Acc: 0.9005, Val Loss: 0.2426, Val Acc: 0.8933, SEN: 0.8789, SPE: 0.9078, MCC: 0.7869
Epoch: 135/150, Train Loss: 0.2057, Train Acc: 0.9161, Val Loss: 0.2280, Val Acc: 0.8983, SEN: 0.8869, SPE: 0.9098, MCC: 0.7968
Epoch: 136/150, Train Loss: 0.2058, Train Acc: 0.9137, Val Loss: 0.2477, Val Acc: 0.8912, SEN: 0.8899, SPE: 0.8926, MCC: 0.7825
Epoch: 137/150, Train Loss: 0.2014, Train Acc: 0.9189, Val Loss: 0.3022, Val Acc: 0.8696, SEN: 0.9159, SPE: 0.8227, MCC: 0.7421
Epoch: 138/150, Train Loss: 0.2502, Train Acc: 0.8940, Val Loss: 0.3008, Val Acc: 0.8671, SEN: 0.9029, S

## 10-fold cross-validation training

In [9]:
# Separate network and optimizer for the cross-validated run.
model_cv = EnhancedGCNNnodes(num_node_features, output_dim, dropout_rate=0)
optimizer_cv = optim.Adam(model_cv.parameters(), lr=0.01)

# NOTE(review): reset=False presumably keeps the same weights from fold to fold.
# The log is consistent with that: fold 10 starts at ~0.90 train accuracy in its
# first epoch. If so, each fold's validation split was seen as training data in
# earlier folds, and per-fold validation metrics are not independent estimates.
# Confirm against training_with_10fold_cv before relying on them.
training_with_10fold_cv(
    graph_data_2d,
    model_cv,
    optimizer_cv,
    criterion,
    device,
    batch_size,
    15,  # epochs per fold (the log prints "Epoch: k/15")
    '2d',
    reset=False,
    n_splits=10,
    mode='train',
    Preprocess=Preprocesscv_2d,
    scale_path=scale_path_cv_2d,
)

Fold 1/10
Epoch: 1/15, Train Loss: 0.6569, Train Acc: 0.6145, Val Loss: 0.6309, Val Acc: 0.6626, SEN: 0.6713, SPE: 0.6537, MCC: 0.3250
Epoch: 2/15, Train Loss: 0.6345, Train Acc: 0.6402, Val Loss: 0.6213, Val Acc: 0.6647, SEN: 0.7564, SPE: 0.5697, MCC: 0.3323
Epoch: 3/15, Train Loss: 0.6285, Train Acc: 0.6463, Val Loss: 0.6527, Val Acc: 0.5740, SEN: 0.9525, SPE: 0.1824, MCC: 0.2123
Epoch: 4/15, Train Loss: 0.6225, Train Acc: 0.6522, Val Loss: 0.6206, Val Acc: 0.6677, SEN: 0.8198, SPE: 0.5102, MCC: 0.3477
Epoch: 5/15, Train Loss: 0.6163, Train Acc: 0.6630, Val Loss: 0.6069, Val Acc: 0.6687, SEN: 0.6931, SPE: 0.6434, MCC: 0.3370
Epoch: 6/15, Train Loss: 0.6126, Train Acc: 0.6597, Val Loss: 0.6157, Val Acc: 0.6626, SEN: 0.6851, SPE: 0.6393, MCC: 0.3249
Epoch: 7/15, Train Loss: 0.6047, Train Acc: 0.6703, Val Loss: 0.5964, Val Acc: 0.6767, SEN: 0.7683, SPE: 0.5820, MCC: 0.3569
Epoch: 8/15, Train Loss: 0.5925, Train Acc: 0.6741, Val Loss: 0.6270, Val Acc: 0.6647, SEN: 0.6099, SPE: 0.7213, MC

Epoch: 6/15, Train Loss: 0.3339, Train Acc: 0.8499, Val Loss: 0.3562, Val Acc: 0.8348, SEN: 0.9098, SPE: 0.7624, MCC: 0.6784
Epoch: 7/15, Train Loss: 0.3370, Train Acc: 0.8426, Val Loss: 0.3151, Val Acc: 0.8691, SEN: 0.8627, SPE: 0.8752, MCC: 0.7381
Epoch: 8/15, Train Loss: 0.3270, Train Acc: 0.8530, Val Loss: 0.3182, Val Acc: 0.8681, SEN: 0.8914, SPE: 0.8455, MCC: 0.7372
Epoch: 9/15, Train Loss: 0.3198, Train Acc: 0.8550, Val Loss: 0.3039, Val Acc: 0.8651, SEN: 0.8770, SPE: 0.8535, MCC: 0.7305
Epoch: 10/15, Train Loss: 0.3536, Train Acc: 0.8429, Val Loss: 0.3478, Val Acc: 0.8359, SEN: 0.8996, SPE: 0.7743, MCC: 0.6782
Epoch: 11/15, Train Loss: 0.3287, Train Acc: 0.8536, Val Loss: 0.3228, Val Acc: 0.8530, SEN: 0.9365, SPE: 0.7723, MCC: 0.7170
Epoch: 12/15, Train Loss: 0.3229, Train Acc: 0.8537, Val Loss: 0.3228, Val Acc: 0.8630, SEN: 0.8996, SPE: 0.8277, MCC: 0.7285
Epoch: 13/15, Train Loss: 0.3131, Train Acc: 0.8553, Val Loss: 0.3183, Val Acc: 0.8570, SEN: 0.8770, SPE: 0.8376, MCC: 0.7

Epoch: 12/15, Train Loss: 0.2415, Train Acc: 0.8948, Val Loss: 0.2183, Val Acc: 0.9032, SEN: 0.8852, SPE: 0.9226, MCC: 0.8073
Epoch: 13/15, Train Loss: 0.2419, Train Acc: 0.8972, Val Loss: 0.2189, Val Acc: 0.9022, SEN: 0.8619, SPE: 0.9456, MCC: 0.8081
Epoch: 14/15, Train Loss: 0.2338, Train Acc: 0.8986, Val Loss: 0.2211, Val Acc: 0.9062, SEN: 0.8677, SPE: 0.9477, MCC: 0.8159
Epoch: 15/15, Train Loss: 0.2296, Train Acc: 0.9022, Val Loss: 0.2188, Val Acc: 0.9103, SEN: 0.9241, SPE: 0.8954, MCC: 0.8204
Fold 10/10
Epoch: 1/15, Train Loss: 0.2403, Train Acc: 0.8977, Val Loss: 0.2239, Val Acc: 0.9062, SEN: 0.9027, SPE: 0.9100, MCC: 0.8124
Epoch: 2/15, Train Loss: 0.2334, Train Acc: 0.9029, Val Loss: 0.2396, Val Acc: 0.9093, SEN: 0.9222, SPE: 0.8954, MCC: 0.8183
Epoch: 3/15, Train Loss: 0.2308, Train Acc: 0.9023, Val Loss: 0.2469, Val Acc: 0.8982, SEN: 0.8716, SPE: 0.9268, MCC: 0.7981
Epoch: 4/15, Train Loss: 0.2308, Train Acc: 0.9031, Val Loss: 0.2517, Val Acc: 0.9022, SEN: 0.9494, SPE: 0.851

# Offline testing and model selection

In [10]:
# CSV scored by the offline-test cells below (path relative to the working directory).
offlinetest_file = "../dataset/canonical_offlinetestset.csv"

In [11]:
# Checkpoint directories handed to integrated_test_model in the cells below.
model_save_dir1 = "best_model_2d/"      # evaluated together with model_general
model_save_dir2 = "foldcv_models_2d/"   # evaluated together with model_cv

In [12]:
# Score the checkpoints in model_save_dir1 on the offline test file, using the
# general-run scaling settings.
# NOTE(review): atom_numbers is the value computed from the training SMILES in
# an earlier cell, and the positional 1 is presumably the batch size - confirm
# both against integrated_test_model.
integrated_test_model(
    offlinetest_file,
    '2d',
    model_general,
    criterion,
    device,
    1,
    model_save_dir1,
    tokenizer=None,
    atom_numbers=atom_numbers,
    mode='test',
    Preprocess=Preprocess2d,
    scale_path=scale_path_2d,
)

get graph_data: 100%|███████████████████████████████████████████████████████████████| 205/205 [00:00<00:00, 786.30it/s]


Model: best_model_val_acc_0.9033.pth, ACC: 0.7268, SEN: 0.8085, SPE: 0.6577, MCC: 0.4675
Model: model_periodic_epoch_10.pth, ACC: 0.6293, SEN: 0.7872, SPE: 0.4955, MCC: 0.2925
Model: model_periodic_epoch_100.pth, ACC: 0.7463, SEN: 0.8191, SPE: 0.6847, MCC: 0.5043
Model: model_periodic_epoch_105.pth, ACC: 0.7366, SEN: 0.8511, SPE: 0.6396, MCC: 0.4963
Model: model_periodic_epoch_110.pth, ACC: 0.7366, SEN: 0.8723, SPE: 0.6216, MCC: 0.5035
Model: model_periodic_epoch_115.pth, ACC: 0.7171, SEN: 0.8404, SPE: 0.6126, MCC: 0.4599
Model: model_periodic_epoch_120.pth, ACC: 0.7366, SEN: 0.8617, SPE: 0.6306, MCC: 0.4998
Model: model_periodic_epoch_125.pth, ACC: 0.7317, SEN: 0.8511, SPE: 0.6306, MCC: 0.4881
Model: model_periodic_epoch_130.pth, ACC: 0.7171, SEN: 0.8511, SPE: 0.6036, MCC: 0.4634
Model: model_periodic_epoch_135.pth, ACC: 0.7512, SEN: 0.8723, SPE: 0.6486, MCC: 0.5279
Model: model_periodic_epoch_140.pth, ACC: 0.7415, SEN: 0.8191, SPE: 0.6757, MCC: 0.4957
Model: model_periodic_epoch_145.

In [13]:
# Score the checkpoints in model_save_dir2 on the offline test file, using the
# cross-validation scaling settings.
# NOTE(review): atom_numbers is the value computed from the training SMILES in
# an earlier cell, and the positional 1 is presumably the batch size - confirm
# both against integrated_test_model.
integrated_test_model(
    offlinetest_file,
    '2d',
    model_cv,
    criterion,
    device,
    1,
    model_save_dir2,
    tokenizer=None,
    atom_numbers=atom_numbers,
    mode='test',
    Preprocess=Preprocesscv_2d,
    scale_path=scale_path_cv_2d,
)

get graph_data: 100%|███████████████████████████████████████████████████████████████| 205/205 [00:00<00:00, 779.83it/s]


Model: best_model_fold_10_epoch_5.pth, ACC: 0.6683, SEN: 0.7660, SPE: 0.5856, MCC: 0.3544
Model: best_model_fold_1_epoch_14.pth, ACC: 0.7561, SEN: 0.7979, SPE: 0.7207, MCC: 0.5171
Model: best_model_fold_2_epoch_10.pth, ACC: 0.6537, SEN: 0.9362, SPE: 0.4144, MCC: 0.4015
Model: best_model_fold_3_epoch_15.pth, ACC: 0.6780, SEN: 0.8298, SPE: 0.5495, MCC: 0.3903
Model: best_model_fold_4_epoch_9.pth, ACC: 0.6927, SEN: 0.7872, SPE: 0.6126, MCC: 0.4025
Model: best_model_fold_5_epoch_2.pth, ACC: 0.7024, SEN: 0.7766, SPE: 0.6396, MCC: 0.4170
Model: best_model_fold_6_epoch_14.pth, ACC: 0.7415, SEN: 0.8298, SPE: 0.6667, MCC: 0.4985
Model: best_model_fold_7_epoch_13.pth, ACC: 0.7122, SEN: 0.8085, SPE: 0.6306, MCC: 0.4421
Model: best_model_fold_8_epoch_14.pth, ACC: 0.7073, SEN: 0.8511, SPE: 0.5856, MCC: 0.4471
Model: best_model_fold_9_epoch_7.pth, ACC: 0.7073, SEN: 0.8191, SPE: 0.6126, MCC: 0.4367
