In [1]:
import pandas as pd
import torch
import torch.optim as optim
import torch.nn as nn
import os
import sys
import nbimporter

# Make the parent of the current working directory importable so the project
# packages imported below (datapreparation, models, augmentation, ...) resolve.
project_root = os.path.join(os.getcwd(), '..')
# Guard the append so re-running this cell does not stack duplicate entries
# on sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

from datapreparation.Process_1D_data import *
from models.Model_building import *
from augmentation.data_augmentation import *
from cv_strategies.train_cv_strategy_123D import *
from Extract_features.extract_features_from_dataset import *
from testing.offline_test import *


# Training

In [2]:
# Training-set CSV, addressed relative to the notebook's working directory.
canonical_file = '../dataset/final_canonical_trainset.csv'
# Raw (pre-augmentation) table; it is handed to smiles_augmentation in the
# data-augmentation cell.
ori_smiles_df = pd.read_csv(canonical_file)

## data augmentation

In [3]:
# Build the augmented training table from the raw one.
# NOTE(review): augmentation runs before any train/validation split that the
# training helpers presumably perform; if they split row-wise, variants of the
# same molecule could land on both sides of a split — confirm.
smiles_df = smiles_augmentation(
    ori_smiles_df,
    original_multiplier=2,
    num_variants=2,
)

In [4]:
# Fail fast with a readable message if the augmented frame lacks the two
# columns that the next cell reads ('SMILES' and 'Label').
missing_columns = {'SMILES', 'Label'} - set(smiles_df.columns)
assert not missing_columns, f"augmented frame is missing columns: {sorted(missing_columns)}"
# Report (rows, columns) of the augmented frame.
print(smiles_df.shape)

(4964, 2)


In [5]:
# Plain Python list of SMILES strings taken from the augmented frame.
smiles_list = smiles_df['SMILES'].tolist()
# Binary targets: 'Positive' -> 1, every other label value -> 0.
labels = [1 if label == 'Positive' else 0 for label in smiles_df['Label']]
# Pair each SMILES string with its target via the project helper.
data = pack_smiles_label(smiles_list, labels)

## set parameters

In [6]:
# Seed torch's global RNG so parameter initialisation is repeatable between runs.
# NOTE(review): this covers torch only; if the project helpers draw from Python's
# `random` or NumPy, those generators need seeding as well — confirm.
SEED = 42
torch.manual_seed(SEED)

# Vocabulary built from the SMILES strings; the size of its 'stoi' mapping is
# used as the model's input dimension.
tokenizer = create_vocab(smiles_list)
stoi = tokenizer['stoi']
input_dim = len(stoi)

# Layer sizes passed to RNNModel.
embed_dim = 64
hidden_dim = 128

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Single-logit output paired with BCEWithLogitsLoss.
# Alternative kept for reference: output_dim = 2 with nn.CrossEntropyLoss().
output_dim = 1
criterion = nn.BCEWithLogitsLoss()

# Training-loop settings shared by the training cells.
batch_size = 4
epoch_num = 30

## general training

In [7]:
# Single training run; the recorded log reports train and validation metrics
# for each epoch.
model_general = RNNModel(input_dim, embed_dim, hidden_dim, output_dim).to(device)
optimizer_general = optim.Adam(model_general.parameters(), lr=0.001)
training_general(
    data,
    model_general,
    optimizer_general,
    criterion,
    batch_size,
    epoch_num,
    device,
    '1d',
    mode='train',
    Preprocess=None,
    scale_path=None,
)

Epoch: 1/30, Train Loss: 0.6898, Train Acc: 0.5439, Val Loss: 0.6787, Val Acc: 0.5770, SEN: 0.8200, SPE: 0.3306, MCC: 0.1728
Epoch: 2/30, Train Loss: 0.6818, Train Acc: 0.5648, Val Loss: 0.6647, Val Acc: 0.5801, SEN: 0.8560, SPE: 0.3002, MCC: 0.1881
Epoch: 3/30, Train Loss: 0.6704, Train Acc: 0.5825, Val Loss: 0.6543, Val Acc: 0.5911, SEN: 0.8720, SPE: 0.3063, MCC: 0.2164
Epoch: 4/30, Train Loss: 0.6523, Train Acc: 0.6180, Val Loss: 0.6432, Val Acc: 0.6264, SEN: 0.8640, SPE: 0.3854, MCC: 0.2843
Epoch: 5/30, Train Loss: 0.6099, Train Acc: 0.6555, Val Loss: 0.6029, Val Acc: 0.6798, SEN: 0.8320, SPE: 0.5254, MCC: 0.3758
Epoch: 6/30, Train Loss: 0.5763, Train Acc: 0.7013, Val Loss: 0.5967, Val Acc: 0.6747, SEN: 0.8820, SPE: 0.4645, MCC: 0.3818
Epoch: 7/30, Train Loss: 0.5393, Train Acc: 0.7240, Val Loss: 0.5575, Val Acc: 0.7049, SEN: 0.7040, SPE: 0.7059, MCC: 0.4099
Epoch: 8/30, Train Loss: 0.5010, Train Acc: 0.7610, Val Loss: 0.5298, Val Acc: 0.7221, SEN: 0.8340, SPE: 0.6085, MCC: 0.4545


## 10-fold cross-validation training

In [8]:
# Cross-validation run on a freshly constructed model/optimizer pair.
# NOTE(review): with reset=False the recorded losses continue from one fold to
# the next (fold 2 starts below where fold 1 ended), which suggests the weights
# are carried over, so later folds may validate on samples the model has
# already been trained on — confirm this is intended.
model_cv = RNNModel(input_dim, embed_dim, hidden_dim, output_dim).to(device)
optimizer_cv = optim.Adam(model_cv.parameters(), lr=0.001)
training_with_10fold_cv(
    data,
    model_cv,
    optimizer_cv,
    criterion,
    device,
    batch_size,
    5,  # appears in the log as the per-fold epoch count ("Epoch: 1/5")
    '1d',
    reset=False,
    n_splits=10,
    mode='train',
    Preprocess=None,
    scale_path=None,
)


Fold 1/10
Epoch: 1/5, Train Loss: 0.6876, Train Acc: 0.5554, Val Loss: 0.6834, Val Acc: 0.5412, SEN: 0.9000, SPE: 0.2062, MCC: 0.1467
Epoch: 2/5, Train Loss: 0.6710, Train Acc: 0.5865, Val Loss: 0.6404, Val Acc: 0.6398, SEN: 0.6333, SPE: 0.6459, MCC: 0.2792
Epoch: 3/5, Train Loss: 0.6420, Train Acc: 0.6291, Val Loss: 0.6154, Val Acc: 0.6499, SEN: 0.8833, SPE: 0.4319, MCC: 0.3510
Epoch: 4/5, Train Loss: 0.6190, Train Acc: 0.6586, Val Loss: 0.5822, Val Acc: 0.6922, SEN: 0.5958, SPE: 0.7821, MCC: 0.3854
Epoch: 5/5, Train Loss: 0.5815, Train Acc: 0.6922, Val Loss: 0.5465, Val Acc: 0.7264, SEN: 0.7250, SPE: 0.7276, MCC: 0.4524
Fold 2/10
Epoch: 1/5, Train Loss: 0.5317, Train Acc: 0.7358, Val Loss: 0.5236, Val Acc: 0.7465, SEN: 0.7385, SPE: 0.7553, MCC: 0.4932
Epoch: 2/5, Train Loss: 0.4747, Train Acc: 0.7759, Val Loss: 0.4781, Val Acc: 0.7767, SEN: 0.7385, SPE: 0.8186, MCC: 0.5573
Epoch: 3/5, Train Loss: 0.4296, Train Acc: 0.7983, Val Loss: 0.4634, Val Acc: 0.7787, SEN: 0.7346, SPE: 0.8270, 

# Offline testing and model selection

In [9]:
# Offline test-set CSV (relative path); passed to integrated_test_model in the
# evaluation cells.
offlinetest_file = '../dataset/canonical_offlinetestset.csv'

In [10]:
# Checkpoint directories handed to integrated_test_model:
#   model_save_dir1 is evaluated together with model_general,
#   model_save_dir2 is evaluated together with model_cv.
# NOTE(review): presumably these are the folders that training_general and
# training_with_10fold_cv write their checkpoints to — confirm in those helpers.
model_save_dir1 = 'best_model_1d/'
model_save_dir2 = 'foldcv_models_1d/'

In [11]:
# Evaluate the checkpoints in model_save_dir1 on the offline test file; the
# recorded output lists one metrics line per .pth file.
integrated_test_model(
    offlinetest_file,
    '1d',
    model_general,
    criterion,
    device,
    1,  # NOTE(review): presumably the evaluation batch size — confirm
    model_save_dir1,
    tokenizer=tokenizer,
    atom_numbers=None,
    mode='test',
    Preprocess=None,
    scale_path=None,
)

Model: best_model_val_acc_0.8268.pth, ACC: 0.7268, SEN: 0.8085, SPE: 0.6577, MCC: 0.4675
Model: model_periodic_epoch_10.pth, ACC: 0.7512, SEN: 0.9043, SPE: 0.6216, MCC: 0.5397
Model: model_periodic_epoch_15.pth, ACC: 0.7366, SEN: 0.8723, SPE: 0.6216, MCC: 0.5035
Model: model_periodic_epoch_20.pth, ACC: 0.7171, SEN: 0.7872, SPE: 0.6577, MCC: 0.4453
Model: model_periodic_epoch_25.pth, ACC: 0.7268, SEN: 0.8298, SPE: 0.6396, MCC: 0.4733
Model: model_periodic_epoch_30.pth, ACC: 0.7268, SEN: 0.8085, SPE: 0.6577, MCC: 0.4675
Model: model_periodic_epoch_5.pth, ACC: 0.6732, SEN: 0.8936, SPE: 0.4865, MCC: 0.4087


In [12]:
# Evaluate the per-fold checkpoints in model_save_dir2 on the offline test
# file; the recorded output lists one metrics line per .pth file.
integrated_test_model(
    offlinetest_file,
    '1d',
    model_cv,
    criterion,
    device,
    1,  # NOTE(review): presumably the evaluation batch size — confirm
    model_save_dir2,
    tokenizer=tokenizer,
    atom_numbers=None,
    mode='test',
    Preprocess=None,
    scale_path=None,
)

Model: best_model_fold_10_epoch_4.pth, ACC: 0.7220, SEN: 0.8617, SPE: 0.6036, MCC: 0.4753
Model: best_model_fold_1_epoch_5.pth, ACC: 0.6683, SEN: 0.7447, SPE: 0.6036, MCC: 0.3493
Model: best_model_fold_2_epoch_4.pth, ACC: 0.7268, SEN: 0.7128, SPE: 0.7387, MCC: 0.4509
Model: best_model_fold_3_epoch_3.pth, ACC: 0.6878, SEN: 0.8085, SPE: 0.5856, MCC: 0.4000
Model: best_model_fold_4_epoch_1.pth, ACC: 0.7024, SEN: 0.7234, SPE: 0.6847, MCC: 0.4067
Model: best_model_fold_5_epoch_1.pth, ACC: 0.7561, SEN: 0.8511, SPE: 0.6757, MCC: 0.5295
Model: best_model_fold_6_epoch_1.pth, ACC: 0.7171, SEN: 0.8617, SPE: 0.5946, MCC: 0.4672
Model: best_model_fold_7_epoch_2.pth, ACC: 0.7268, SEN: 0.8511, SPE: 0.6216, MCC: 0.4798
Model: best_model_fold_8_epoch_1.pth, ACC: 0.7415, SEN: 0.8617, SPE: 0.6396, MCC: 0.5080
Model: best_model_fold_9_epoch_1.pth, ACC: 0.7610, SEN: 0.8723, SPE: 0.6667, MCC: 0.5442
