In [1]:
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import os
import sys
import nbimporter

# Make the repository root importable so the project packages imported below
# (datapreparation, models, augmentation, ...) can be resolved.
# The path is built from the kernel's working directory, so the notebook is
# expected to be run from a folder one level below the project root.
project_root = os.path.join(os.getcwd(), '..')
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from datapreparation.Process_graph_2d_data import get_atom_number
from datapreparation.Process_graph_3d_data import *
from models.Model_building import *
from augmentation.data_augmentation import *
from cv_strategies.train_cv_strategy_123D import *
from Extract_features.extract_features_from_dataset import *
from testing.offline_test import *

# Training

In [2]:
# Read the canonical-SMILES training set (path is relative to the notebook's
# working directory) into a DataFrame.
canonical_file = '../dataset/final_canonical_trainset.csv'
ori_smiles_df = pd.read_csv(filepath_or_buffer=canonical_file)

## Data augmentation

In [3]:
# Build the augmented training frame with the project helper; both multipliers
# are forwarded to `smiles_augmentation` unchanged.
smiles_df = smiles_augmentation(
    ori_smiles_df,
    original_multiplier=4,
    num_variants=4,
)

In [4]:
# Pull the model inputs out of the augmented frame: the SMILES strings and
# their labels, then derive the atom-number information from the SMILES.
smiles_list = smiles_df['SMILES']
labels = smiles_df['Label']
atom_numbers = get_atom_number(smiles_list)

In [5]:
# Convert every SMILES into its 3D graph representation (slow: the recorded
# run took ~24 minutes for 9928 molecules).
graph_data_3d = preprocess_3d_graph(smiles_list, labels, atom_numbers)

# Feature widths are read off the first graph: the second dimension of its
# node-feature and edge-attribute arrays.
num_node_features, num_edge_features = (
    graph_data_3d[0][field].shape[1]
    for field in ('nodes_features', 'edge_attr')
)

get graph_data: 100%|██████████████████████████████████████████████████████████████| 9928/9928 [24:25<00:00,  6.78it/s]


In [6]:
# Number of graphs produced by the 3D preprocessing step.
print(f'{len(graph_data_3d)}')

9928


## Set parameters

In [7]:
# Reproducibility: seed PyTorch's RNG so that weight initialisation (and any
# torch-driven shuffling) is repeatable across Restart & Run All.
# Only torch's generator is seeded here; other RNGs (e.g. `random` / numpy)
# used by project code, if any, are not covered.
SEED = 42
torch.manual_seed(SEED)

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Binary classification with a single logit and BCE-with-logits loss.
# Alternative noted by the author: output_dim = 2 with nn.CrossEntropyLoss().
output_dim = 1
criterion = nn.BCEWithLogitsLoss()

batch_size = 64
epoch_num = 150  # epoch count passed to the general (non-CV) training run

# Feature-scaling configuration, passed to the training and test helpers as
# `Preprocess=` / `scale_path=`. The trailing comments keep the author's list
# of alternative values.
scale_path_3d = 'standardize_scale_3d.pkl'        # alternatives: None, 'standardize_scale_2d.pkl'
scale_path_cv_3d = 'standardize_scale_cv_3d.pkl'  # alternatives: None, 'standardize_scale_cv_2d.pkl'
Preprocess3d = 'standardize'                      # one of: None, 'standardize', 'normalize'
Preprocesscv_3d = 'standardize'                   # one of: None, 'standardize', 'normalize'

## General training

In [8]:
# Build the 3D graph network on the selected device and pair it with an Adam
# optimizer (learning rate 0.01).
model_general = GraphNN3D(num_node_features, num_edge_features, output_dim).to(device)
optimizer_general = optim.Adam(model_general.parameters(), lr=0.01)

# Single training run over the whole graph dataset using the settings from
# the parameter cell; per-epoch metrics are printed by the helper.
training_general(
    graph_data_3d,
    model_general,
    optimizer_general,
    criterion,
    batch_size,
    epoch_num,
    device,
    '3d',
    mode='train',
    Preprocess=Preprocess3d,
    scale_path=scale_path_3d,
)

Epoch: 1/150, Train Loss: 0.6547, Train Acc: 0.6109, Val Loss: 0.6657, Val Acc: 0.6113, SEN: 0.5255, SPE: 0.6981, MCC: 0.2269
Epoch: 2/150, Train Loss: 0.6444, Train Acc: 0.6220, Val Loss: 0.6493, Val Acc: 0.6319, SEN: 0.7407, SPE: 0.5218, MCC: 0.2691
Epoch: 3/150, Train Loss: 0.6295, Train Acc: 0.6377, Val Loss: 0.6277, Val Acc: 0.6576, SEN: 0.8408, SPE: 0.4721, MCC: 0.3370
Epoch: 4/150, Train Loss: 0.6219, Train Acc: 0.6519, Val Loss: 0.6280, Val Acc: 0.6571, SEN: 0.7057, SPE: 0.6079, MCC: 0.3152
Epoch: 5/150, Train Loss: 0.6152, Train Acc: 0.6551, Val Loss: 0.6317, Val Acc: 0.6626, SEN: 0.6116, SPE: 0.7143, MCC: 0.3276
Epoch: 6/150, Train Loss: 0.6085, Train Acc: 0.6579, Val Loss: 0.6031, Val Acc: 0.6697, SEN: 0.8158, SPE: 0.5218, MCC: 0.3534
Epoch: 7/150, Train Loss: 0.5917, Train Acc: 0.6735, Val Loss: 0.5994, Val Acc: 0.6762, SEN: 0.6957, SPE: 0.6565, MCC: 0.3525
Epoch: 8/150, Train Loss: 0.5930, Train Acc: 0.6666, Val Loss: 0.5978, Val Acc: 0.6747, SEN: 0.6927, SPE: 0.6565, MCC:

Epoch: 66/150, Train Loss: 0.2157, Train Acc: 0.9107, Val Loss: 0.2771, Val Acc: 0.8938, SEN: 0.8789, SPE: 0.9088, MCC: 0.7879
Epoch: 67/150, Train Loss: 0.2151, Train Acc: 0.9063, Val Loss: 0.2209, Val Acc: 0.9038, SEN: 0.9339, SPE: 0.8734, MCC: 0.8090
Epoch: 68/150, Train Loss: 0.1961, Train Acc: 0.9178, Val Loss: 0.2215, Val Acc: 0.8978, SEN: 0.8879, SPE: 0.9078, MCC: 0.7958
Epoch: 69/150, Train Loss: 0.2030, Train Acc: 0.9144, Val Loss: 0.2213, Val Acc: 0.9094, SEN: 0.9009, SPE: 0.9179, MCC: 0.8189
Epoch: 70/150, Train Loss: 0.1972, Train Acc: 0.9182, Val Loss: 0.2163, Val Acc: 0.9124, SEN: 0.9039, SPE: 0.9210, MCC: 0.8249
Epoch: 71/150, Train Loss: 0.2074, Train Acc: 0.9090, Val Loss: 0.2312, Val Acc: 0.8998, SEN: 0.8809, SPE: 0.9189, MCC: 0.8003
Epoch: 72/150, Train Loss: 0.2144, Train Acc: 0.9102, Val Loss: 0.2299, Val Acc: 0.9104, SEN: 0.9089, SPE: 0.9119, MCC: 0.8207
Epoch: 73/150, Train Loss: 0.2008, Train Acc: 0.9189, Val Loss: 0.2143, Val Acc: 0.9074, SEN: 0.9199, SPE: 0.89

Epoch: 131/150, Train Loss: 0.1259, Train Acc: 0.9513, Val Loss: 0.1993, Val Acc: 0.9189, SEN: 0.9580, SPE: 0.8794, MCC: 0.8403
Epoch: 132/150, Train Loss: 0.1377, Train Acc: 0.9436, Val Loss: 0.1792, Val Acc: 0.9310, SEN: 0.9359, SPE: 0.9260, MCC: 0.8621
Epoch: 133/150, Train Loss: 0.1338, Train Acc: 0.9461, Val Loss: 0.1885, Val Acc: 0.9245, SEN: 0.9469, SPE: 0.9017, MCC: 0.8497
Epoch: 134/150, Train Loss: 0.1598, Train Acc: 0.9386, Val Loss: 0.2977, Val Acc: 0.8998, SEN: 0.8969, SPE: 0.9027, MCC: 0.7996
Epoch: 135/150, Train Loss: 0.2221, Train Acc: 0.9258, Val Loss: 0.2586, Val Acc: 0.9109, SEN: 0.9349, SPE: 0.8865, MCC: 0.8226
Epoch: 136/150, Train Loss: 0.1784, Train Acc: 0.9305, Val Loss: 0.1717, Val Acc: 0.9350, SEN: 0.9219, SPE: 0.9483, MCC: 0.8704
Epoch: 137/150, Train Loss: 0.1314, Train Acc: 0.9479, Val Loss: 0.1898, Val Acc: 0.9260, SEN: 0.8849, SPE: 0.9676, MCC: 0.8550
Epoch: 138/150, Train Loss: 0.1226, Train Acc: 0.9532, Val Loss: 0.1767, Val Acc: 0.9335, SEN: 0.9419, S

## 10-fold cross-validation training

In [9]:
# Separate network + Adam optimizer (learning rate 0.01) for the fold-based
# cross-validation run.
model_cv = GraphNN3D(num_node_features, num_edge_features, output_dim).to(device)
optimizer_cv = optim.Adam(model_cv.parameters(), lr=0.01)

# The positional `10` is presumably the number of epochs per fold (the log
# prints "Epoch: k/10" inside each "Fold i/10") -- TODO confirm.
# NOTE(review): with reset=False the recorded log shows fold 8 starting at a
# train loss of ~0.25 (fold 1 started at ~0.66), which suggests the weights
# carry over from fold to fold. Confirm this is intended: later validation
# folds would then contain samples the model has already been trained on.
training_with_10fold_cv(
    graph_data_3d,
    model_cv,
    optimizer_cv,
    criterion,
    device,
    batch_size,
    10,
    '3d',
    reset=False,
    n_splits=10,
    mode='train',
    Preprocess=Preprocesscv_3d,
    scale_path=scale_path_cv_3d,
)

Fold 1/10
Epoch: 1/10, Train Loss: 0.6590, Train Acc: 0.6079, Val Loss: 0.6391, Val Acc: 0.6475, SEN: 0.7822, SPE: 0.5082, MCC: 0.3023
Epoch: 2/10, Train Loss: 0.6359, Train Acc: 0.6301, Val Loss: 0.6309, Val Acc: 0.6556, SEN: 0.7960, SPE: 0.5102, MCC: 0.3201
Epoch: 3/10, Train Loss: 0.6216, Train Acc: 0.6420, Val Loss: 0.6273, Val Acc: 0.6395, SEN: 0.5545, SPE: 0.7275, MCC: 0.2859
Epoch: 4/10, Train Loss: 0.6166, Train Acc: 0.6556, Val Loss: 0.6076, Val Acc: 0.6777, SEN: 0.7208, SPE: 0.6332, MCC: 0.3555
Epoch: 5/10, Train Loss: 0.5979, Train Acc: 0.6633, Val Loss: 0.5883, Val Acc: 0.6596, SEN: 0.6139, SPE: 0.7070, MCC: 0.3220
Epoch: 6/10, Train Loss: 0.5815, Train Acc: 0.6753, Val Loss: 0.5808, Val Acc: 0.6828, SEN: 0.6752, SPE: 0.6906, MCC: 0.3658
Epoch: 7/10, Train Loss: 0.5761, Train Acc: 0.6738, Val Loss: 0.5769, Val Acc: 0.6737, SEN: 0.6000, SPE: 0.7500, MCC: 0.3536
Epoch: 8/10, Train Loss: 0.5616, Train Acc: 0.6943, Val Loss: 0.5670, Val Acc: 0.6888, SEN: 0.7703, SPE: 0.6045, MC

Epoch: 6/10, Train Loss: 0.2564, Train Acc: 0.8673, Val Loss: 0.2722, Val Acc: 0.8580, SEN: 0.8092, SPE: 0.9174, MCC: 0.7236
Epoch: 7/10, Train Loss: 0.2455, Train Acc: 0.8735, Val Loss: 0.3158, Val Acc: 0.8288, SEN: 0.7890, SPE: 0.8772, MCC: 0.6632
Epoch: 8/10, Train Loss: 0.2498, Train Acc: 0.8729, Val Loss: 0.3371, Val Acc: 0.8298, SEN: 0.7798, SPE: 0.8906, MCC: 0.6679
Epoch: 9/10, Train Loss: 0.2500, Train Acc: 0.8736, Val Loss: 0.2536, Val Acc: 0.8640, SEN: 0.7982, SPE: 0.9442, MCC: 0.7408
Epoch: 10/10, Train Loss: 0.2339, Train Acc: 0.8809, Val Loss: 0.2866, Val Acc: 0.8439, SEN: 0.7651, SPE: 0.9397, MCC: 0.7055
Fold 8/10
Epoch: 1/10, Train Loss: 0.2455, Train Acc: 0.8752, Val Loss: 0.2563, Val Acc: 0.8630, SEN: 0.7788, SPE: 0.9591, MCC: 0.7428
Epoch: 2/10, Train Loss: 0.2516, Train Acc: 0.8742, Val Loss: 0.2380, Val Acc: 0.8751, SEN: 0.8242, SPE: 0.9332, MCC: 0.7571
Epoch: 3/10, Train Loss: 0.2662, Train Acc: 0.8684, Val Loss: 0.2669, Val Acc: 0.8570, SEN: 0.8053, SPE: 0.9159, M

# Offline testing and model selection

In [10]:
# CSV holding the offline test set (path relative to the notebook's working
# directory); handed to integrated_test_model in the cells below.
offlinetest_file = '../dataset/canonical_offlinetestset.csv'

In [11]:
# Checkpoint directories used by the offline tests below:
#   model_save_dir1 -> evaluated together with model_general
#   model_save_dir2 -> evaluated together with model_cv
model_save_dir1, model_save_dir2 = 'best_model_3d/', 'foldcv_models_3d/'

In [12]:
# Score the checkpoints found under model_save_dir1 on the offline test set;
# the helper prints ACC / SEN / SPE / MCC for each checkpoint file.
# The positional `1` is presumably the evaluation batch size -- TODO confirm.
# NOTE(review): `atom_numbers` was computed from the augmented *training*
# SMILES, not from the offline test file -- confirm that is what
# integrated_test_model expects.
integrated_test_model(
    offlinetest_file,
    '3d',
    model_general,
    criterion,
    device,
    1,
    model_save_dir1,
    tokenizer=None,
    atom_numbers=atom_numbers,
    mode='test',
    Preprocess=Preprocess3d,
    scale_path=scale_path_3d,
)

get graph_data: 100%|████████████████████████████████████████████████████████████████| 205/205 [00:17<00:00, 11.43it/s]


Model: best_model_val_acc_0.9471.pth, ACC: 0.7707, SEN: 0.8191, SPE: 0.7297, MCC: 0.5475
Model: model_periodic_epoch_10.pth, ACC: 0.7512, SEN: 0.8404, SPE: 0.6757, MCC: 0.5182
Model: model_periodic_epoch_100.pth, ACC: 0.7463, SEN: 0.8298, SPE: 0.6757, MCC: 0.5069
Model: model_periodic_epoch_105.pth, ACC: 0.7317, SEN: 0.7979, SPE: 0.6757, MCC: 0.4735
Model: model_periodic_epoch_110.pth, ACC: 0.7707, SEN: 0.8191, SPE: 0.7297, MCC: 0.5475
Model: model_periodic_epoch_115.pth, ACC: 0.7317, SEN: 0.8617, SPE: 0.6216, MCC: 0.4916
Model: model_periodic_epoch_120.pth, ACC: 0.7463, SEN: 0.8085, SPE: 0.6937, MCC: 0.5018
Model: model_periodic_epoch_125.pth, ACC: 0.7415, SEN: 0.8404, SPE: 0.6577, MCC: 0.5014
Model: model_periodic_epoch_130.pth, ACC: 0.7659, SEN: 0.8723, SPE: 0.6757, MCC: 0.5525
Model: model_periodic_epoch_135.pth, ACC: 0.7122, SEN: 0.8511, SPE: 0.5946, MCC: 0.4552
Model: model_periodic_epoch_140.pth, ACC: 0.7756, SEN: 0.8298, SPE: 0.7297, MCC: 0.5584
Model: model_periodic_epoch_145.

In [13]:
# Score the per-fold checkpoints found under model_save_dir2 on the offline
# test set; the helper prints ACC / SEN / SPE / MCC for each checkpoint file.
# The positional `1` is presumably the evaluation batch size -- TODO confirm.
# NOTE(review): `atom_numbers` was computed from the augmented *training*
# SMILES, not from the offline test file -- confirm that is what
# integrated_test_model expects.
integrated_test_model(
    offlinetest_file,
    '3d',
    model_cv,
    criterion,
    device,
    1,
    model_save_dir2,
    tokenizer=None,
    atom_numbers=atom_numbers,
    mode='test',
    Preprocess=Preprocesscv_3d,
    scale_path=scale_path_cv_3d,
)

get graph_data: 100%|████████████████████████████████████████████████████████████████| 205/205 [00:17<00:00, 11.57it/s]


Model: best_model_fold_10_epoch_3.pth, ACC: 0.7463, SEN: 0.9255, SPE: 0.5946, MCC: 0.5412
Model: best_model_fold_1_epoch_10.pth, ACC: 0.6927, SEN: 0.9043, SPE: 0.5135, MCC: 0.4455
Model: best_model_fold_2_epoch_8.pth, ACC: 0.7512, SEN: 0.8936, SPE: 0.6306, MCC: 0.5355
Model: best_model_fold_3_epoch_9.pth, ACC: 0.7268, SEN: 0.7021, SPE: 0.7477, MCC: 0.4499
Model: best_model_fold_4_epoch_9.pth, ACC: 0.7122, SEN: 0.6809, SPE: 0.7387, MCC: 0.4200
Model: best_model_fold_5_epoch_3.pth, ACC: 0.7268, SEN: 0.7128, SPE: 0.7387, MCC: 0.4509
Model: best_model_fold_6_epoch_2.pth, ACC: 0.6829, SEN: 0.6702, SPE: 0.6937, MCC: 0.3632
Model: best_model_fold_7_epoch_9.pth, ACC: 0.7073, SEN: 0.6809, SPE: 0.7297, MCC: 0.4106
Model: best_model_fold_8_epoch_8.pth, ACC: 0.7073, SEN: 0.7234, SPE: 0.6937, MCC: 0.4157
Model: best_model_fold_9_epoch_2.pth, ACC: 0.7463, SEN: 0.9255, SPE: 0.5946, MCC: 0.5412
