In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import os
import sys
import nbimporter

# Put the parent of the current working directory on the import path; the
# project-local packages imported right after this are presumably resolved from there.
project_root = os.path.join(os.getcwd(), '..')
# Guard the append so re-running this cell does not pile up duplicate entries.
if project_root not in sys.path:
    sys.path.append(project_root)

from datapreparation.Process_1D_data import *
from datapreparation.Process_graph_2d_data import *
from datapreparation.Process_graph_3d_data import *
from datapreparation.Process_mlp_data import *

from models.Model_building import *
from augmentation.data_augmentation import *
from cv_strategies.train_cv_strategy_123D import *
from Extract_features.extract_features_from_dataset import *
from testing.offline_test import *


# Extract Features

In [2]:
# Read the training CSV into a DataFrame; it is the input to the augmentation step.
canonical_file = '../dataset/final_canonical_trainset.csv'
ori_smiles_df = pd.read_csv(canonical_file)

In [3]:
# One augmented frame per representation (1D / 2D / 3D). The generator is consumed
# left to right, so smiles_augmentation is still called three times with the same
# arguments and the results are bound in the same order.
# NOTE(review): if smiles_augmentation is randomised, the three frames may not be
# row-aligned even though their features are later concatenated row-wise — confirm.
# NOTE(review): augmenting before any fold split may place variants of one molecule
# in both train and validation folds — verify against ML_training.
smiles_df_1d, smiles_df_2d, smiles_df_3d = (
    smiles_augmentation(ori_smiles_df, original_multiplier=3, num_variants=3)
    for _ in range(3)
)

## set parameters

In [4]:
# 1D branch: build the vocabulary from the augmented SMILES strings and keep its
# 'stoi' entry, whose length later defines input_dim.
smiles_list_1d = smiles_df_1d['SMILES'].tolist()
tokenizer = create_vocab(smiles_list_1d)
stoi = tokenizer['stoi']

# Graph branches: the 'SMILES' column is handed to get_atom_number as-is
# (no list conversion, unlike the 1D branch).
smiles_list_2d = smiles_df_2d['SMILES']
atom_numbers = get_atom_number(smiles_list_2d)

In [5]:
# Prefer the GPU when CUDA is available, otherwise run on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Sizes passed to the 1D model; input_dim is the number of entries in `stoi`.
input_dim = len(stoi)
embed_dim = 64
hidden_dim = 128
output_dim = 1

# Feature counts passed to the graph models.
num_node_features = 17
num_edge_features = 1

batch_size = 4

## load models

In [6]:
# Checkpoint files that are passed, together with the matching model object,
# to integrated_feature_extraction.
model_path_1d = '../Feature_extract/foldcv_models_1d/best_model_fold_9_epoch_1.pth'
model_path_2d = '../Feature_extract/best_model_2d/model_periodic_epoch_135.pth'
model_path_3d = '../Feature_extract/best_model_3d/model_periodic_epoch_50.pth'

# One network per representation, each placed on `device`.
model_1d = RNNModel(input_dim, embed_dim, hidden_dim, output_dim=1).to(device)
model_2d = EnhancedGCNNnodes(num_node_features, output_dim, dropout_rate=0).to(device)
model_3d = GraphNN3D(num_node_features, num_edge_features, output_dim).to(device)

## feature extraction

In [7]:
# Scaling settings forwarded to integrated_feature_extraction for the 2D / 3D
# branches. Options noted by the author: None, 'standardize', 'normalize'.
Preprocess_2d = 'standardize'
scale_path_2d = '../Feature_extract/standardize_scale_cv_2d.pkl'
Preprocess_3d = 'standardize'
# Fixed: the original literal ended in a stray space ('...cv_3d.pkl '), which names
# a different (usually non-existent) file on filesystems that keep trailing spaces.
scale_path_3d = '../Feature_extract/standardize_scale_cv_3d.pkl'

In [8]:
# Run each extractor over its own augmented frame. Only the 1D call receives the
# tokenizer, only the graph calls receive atom_numbers, and the 1D call passes no
# Preprocess / scale_path.
all_features_1d, all_labels_1d = integrated_feature_extraction(
    '1d', model_1d, model_path_1d, smiles_df_1d, device, 1,
    tokenizer=tokenizer, atom_numbers=None,
    mode='test', Preprocess=None, scale_path=None,
)
all_features_2d, all_labels_2d = integrated_feature_extraction(
    '2d', model_2d, model_path_2d, smiles_df_2d, device, 1,
    tokenizer=None, atom_numbers=atom_numbers,
    mode='test', Preprocess=Preprocess_2d, scale_path=scale_path_2d,
)
all_features_3d, all_labels_3d = integrated_feature_extraction(
    '3d', model_3d, model_path_3d, smiles_df_3d, device, 1,
    tokenizer=None, atom_numbers=atom_numbers,
    mode='test', Preprocess=Preprocess_3d, scale_path=scale_path_3d,
)

get graph_data: 100%|█████████████████████████████████████████████████████████████| 7446/7446 [00:08<00:00, 893.35it/s]
get graph_data: 100%|██████████████████████████████████████████████████████████████| 7446/7446 [18:52<00:00,  6.57it/s]


In [9]:
# Sanity check: print the feature and label shapes of every branch, 1D first.
for _feats, _labels in (
    (all_features_1d, all_labels_1d),
    (all_features_2d, all_labels_2d),
    (all_features_3d, all_labels_3d),
):
    print(_feats.shape, _labels.shape)

(7446, 128) (7446,)
(7446, 64) (7446,)
(7446, 129) (7446,)


## fuse features

In [10]:
# Row-wise fusion is only meaningful when row i of every branch is the same sample.
# The branches were extracted from separately augmented frames, so check the
# necessary condition that their label vectors agree before concatenating.
if not (np.array_equal(all_labels_1d, all_labels_2d)
        and np.array_equal(all_labels_2d, all_labels_3d)):
    raise ValueError("1D/2D/3D label vectors differ; feature rows are not aligned")

# Concatenate along the feature axis: one fused vector per sample.
fused_features = np.concatenate([all_features_1d, all_features_2d, all_features_3d], axis=1)
fused_labels = all_labels_2d

In [11]:
print(fused_features.shape, fused_labels.shape)

(7446, 321) (7446,)


# Training model

## Training_ML

In [12]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [13]:
# Inputs and targets handed to ML_training for the scikit-learn models.
X = fused_features
y = fused_labels

In [14]:
# Scaling and PCA are both switched off for the classical models.
# Alternatives noted by the author:
#   scaler_option  : 'standardize' | 'normalize'
#   scale_path_ml  : 'standardize_scaler.pkl' | 'normalize_scaler.pkl'
#   pca_components : 100, 50, ...
#   pca_path       : 'pca.pkl'
scaler_option = scale_path_ml = None
pca_components = pca_path = None

In [15]:
# Linear-kernel SVM run through the project's ML_training helper, which prints
# per-fold and average ACC / SEN / SPE / MCC.
svm_model = SVC(C=1, kernel='linear')
ML_training(
    X, y, svm_model,
    scaler_option=scaler_option, scale_path=scale_path_ml,
    pca_components=pca_components, pca_path=pca_path,
)

Fold 0: ACC: 0.6926, SEN: 0.6824, SPE: 0.7033, MCC: 0.3856
Fold 1: ACC: 0.6966, SEN: 0.7218, SPE: 0.6703, MCC: 0.3927
Fold 2: ACC: 0.6872, SEN: 0.6955, SPE: 0.6786, MCC: 0.3741
Fold 3: ACC: 0.7060, SEN: 0.7139, SPE: 0.6978, MCC: 0.4117
Fold 4: ACC: 0.7020, SEN: 0.7060, SPE: 0.6978, MCC: 0.4038
Fold 5: ACC: 0.7047, SEN: 0.7139, SPE: 0.6951, MCC: 0.4090
Fold 6: ACC: 0.7030, SEN: 0.7270, SPE: 0.6777, MCC: 0.4053
Fold 7: ACC: 0.7083, SEN: 0.7244, SPE: 0.6915, MCC: 0.4161
Fold 8: ACC: 0.6855, SEN: 0.6903, SPE: 0.6804, MCC: 0.3707
Fold 9: ACC: 0.6922, SEN: 0.7087, SPE: 0.6749, MCC: 0.3838
Average ACC: 0.6978, Average SEN: 0.7084, Average SPE: 0.6867, Average MCC: 0.3953


In [16]:
# Random forest run through ML_training.
# random_state pins the estimator's bootstrap and feature sampling so a re-run
# gives the same forest for the same data; the original was unseeded.
# NOTE(review): fold assignment happens inside ML_training and may need its own seed.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
ML_training(
    X, y, rf_model,
    scaler_option=scaler_option, scale_path=scale_path_ml,
    pca_components=pca_components, pca_path=pca_path,
)

Fold 0: ACC: 0.9114, SEN: 0.9160, SPE: 0.9066, MCC: 0.8227
Fold 1: ACC: 0.9141, SEN: 0.9318, SPE: 0.8956, MCC: 0.8284
Fold 2: ACC: 0.9060, SEN: 0.9055, SPE: 0.9066, MCC: 0.8120
Fold 3: ACC: 0.9235, SEN: 0.9318, SPE: 0.9148, MCC: 0.8469
Fold 4: ACC: 0.9342, SEN: 0.9475, SPE: 0.9203, MCC: 0.8686
Fold 5: ACC: 0.9450, SEN: 0.9528, SPE: 0.9368, MCC: 0.8899
Fold 6: ACC: 0.9341, SEN: 0.9318, SPE: 0.9366, MCC: 0.8683
Fold 7: ACC: 0.9234, SEN: 0.9528, SPE: 0.8926, MCC: 0.8478
Fold 8: ACC: 0.9140, SEN: 0.9160, SPE: 0.9118, MCC: 0.8279
Fold 9: ACC: 0.9140, SEN: 0.9265, SPE: 0.9008, MCC: 0.8280
Average ACC: 0.9220, Average SEN: 0.9312, Average SPE: 0.9123, Average MCC: 0.8440


In [17]:
# Gradient-boosted trees run through ML_training.
# random_state pins the estimator's internal randomness so a re-run gives the same
# model for the same data; the original was unseeded.
# NOTE(review): fold assignment happens inside ML_training and may need its own seed.
gbt_model = GradientBoostingClassifier(n_estimators=100, random_state=42)
ML_training(
    X, y, gbt_model,
    scaler_option=scaler_option, scale_path=scale_path_ml,
    pca_components=pca_components, pca_path=pca_path,
)

Fold 0: ACC: 0.8282, SEN: 0.8425, SPE: 0.8132, MCC: 0.6562
Fold 1: ACC: 0.8564, SEN: 0.8740, SPE: 0.8379, MCC: 0.7127
Fold 2: ACC: 0.8403, SEN: 0.8084, SPE: 0.8736, MCC: 0.6827
Fold 3: ACC: 0.8604, SEN: 0.8714, SPE: 0.8489, MCC: 0.7207
Fold 4: ACC: 0.8685, SEN: 0.8898, SPE: 0.8462, MCC: 0.7371
Fold 5: ACC: 0.8523, SEN: 0.8609, SPE: 0.8434, MCC: 0.7045
Fold 6: ACC: 0.8723, SEN: 0.8478, SPE: 0.8981, MCC: 0.7460
Fold 7: ACC: 0.8589, SEN: 0.8898, SPE: 0.8264, MCC: 0.7183
Fold 8: ACC: 0.8387, SEN: 0.8268, SPE: 0.8512, MCC: 0.6778
Fold 9: ACC: 0.8481, SEN: 0.8688, SPE: 0.8264, MCC: 0.6962
Average ACC: 0.8524, Average SEN: 0.8580, Average SPE: 0.8465, Average MCC: 0.7052


# offline test

In [18]:
# CSV read in the next cell as the offline test set.
offlinetest_file = '../dataset/canonical_offlinetestset.csv'

In [19]:
# Unlike the training data, the test frame is used as read: no augmentation is applied.
smiles_df_test = pd.read_csv(offlinetest_file)

In [20]:
# Extract 1D / 2D / 3D features for the offline test frame using the same models,
# checkpoints and scaler files as for training. All three calls read the same
# frame, and every call here receives both the tokenizer and atom_numbers.
all_features_1d_test, all_labels_1d_test = integrated_feature_extraction(
    '1d', model_1d, model_path_1d, smiles_df_test, device, 1,
    tokenizer=tokenizer, atom_numbers=atom_numbers,
    mode='test', Preprocess=None, scale_path=None,
)
all_features_2d_test, all_labels_2d_test = integrated_feature_extraction(
    '2d', model_2d, model_path_2d, smiles_df_test, device, 1,
    tokenizer=tokenizer, atom_numbers=atom_numbers,
    mode='test', Preprocess=Preprocess_2d, scale_path=scale_path_2d,
)
all_features_3d_test, all_labels_3d_test = integrated_feature_extraction(
    '3d', model_3d, model_path_3d, smiles_df_test, device, 1,
    tokenizer=tokenizer, atom_numbers=atom_numbers,
    mode='test', Preprocess=Preprocess_3d, scale_path=scale_path_3d,
)

get graph_data: 100%|███████████████████████████████████████████████████████████████| 205/205 [00:00<00:00, 791.30it/s]
get graph_data: 100%|████████████████████████████████████████████████████████████████| 205/205 [00:18<00:00, 11.28it/s]


In [21]:
# Test-set fusion mirrors the training-set fusion: labels come from the 2D branch,
# features are joined along axis 1 in 1D, 2D, 3D order.
fused_labels_test = all_labels_2d_test
fused_features_test = np.concatenate(
    (all_features_1d_test, all_features_2d_test, all_features_3d_test),
    axis=1,
)

## ML test

In [22]:
# Directories (relative to the notebook's working directory) passed to ML_testing,
# one per classifier; the test output lists files named model_fold_<k>.joblib.
model_save_dir_svm = 'SVC_foldcv/'
model_save_dir_RF = 'RandomForestClassifier_foldcv/'
model_save_dir_GB = 'GradientBoostingClassifier_foldcv/'

In [23]:
# Score the saved SVC fold models on the fused offline-test features.
ML_testing(
    fused_features_test,
    fused_labels_test,
    model_save_dir_svm,
    scale_path=scale_path_ml,
    pca_path=pca_path,
)

Model model_fold_0.joblib: ACC: 0.6537, SEN: 0.8617, SPE: 0.4775, MCC: 0.3617
Model model_fold_1.joblib: ACC: 0.6732, SEN: 0.9255, SPE: 0.4595, MCC: 0.4259
Model model_fold_2.joblib: ACC: 0.6683, SEN: 0.9043, SPE: 0.4685, MCC: 0.4062
Model model_fold_3.joblib: ACC: 0.6439, SEN: 0.8936, SPE: 0.4324, MCC: 0.3607
Model model_fold_4.joblib: ACC: 0.6585, SEN: 0.8936, SPE: 0.4595, MCC: 0.3848
Model model_fold_5.joblib: ACC: 0.6537, SEN: 0.9043, SPE: 0.4414, MCC: 0.3824
Model model_fold_6.joblib: ACC: 0.6341, SEN: 0.8936, SPE: 0.4144, MCC: 0.3445
Model model_fold_7.joblib: ACC: 0.6585, SEN: 0.9043, SPE: 0.4505, MCC: 0.3904
Model model_fold_8.joblib: ACC: 0.6439, SEN: 0.9043, SPE: 0.4234, MCC: 0.3664
Model model_fold_9.joblib: ACC: 0.6537, SEN: 0.8723, SPE: 0.4685, MCC: 0.3665
Average ACC: 0.6541, Average SEN: 0.8957, Average SPE: 0.4495, Average MCC: 0.3790


In [24]:
# Score the saved random-forest fold models on the fused offline-test features.
ML_testing(
    fused_features_test,
    fused_labels_test,
    model_save_dir_RF,
    scale_path=scale_path_ml,
    pca_path=pca_path,
)

Model model_fold_0.joblib: ACC: 0.7707, SEN: 0.9255, SPE: 0.6396, MCC: 0.5800
Model model_fold_1.joblib: ACC: 0.7366, SEN: 0.8830, SPE: 0.6126, MCC: 0.5074
Model model_fold_2.joblib: ACC: 0.7463, SEN: 0.9043, SPE: 0.6126, MCC: 0.5318
Model model_fold_3.joblib: ACC: 0.7415, SEN: 0.9149, SPE: 0.5946, MCC: 0.5286
Model model_fold_4.joblib: ACC: 0.7415, SEN: 0.8936, SPE: 0.6126, MCC: 0.5195
Model model_fold_5.joblib: ACC: 0.7561, SEN: 0.9149, SPE: 0.6216, MCC: 0.5520
Model model_fold_6.joblib: ACC: 0.7268, SEN: 0.8723, SPE: 0.6036, MCC: 0.4873
Model model_fold_7.joblib: ACC: 0.7659, SEN: 0.9255, SPE: 0.6306, MCC: 0.5722
Model model_fold_8.joblib: ACC: 0.7512, SEN: 0.8936, SPE: 0.6306, MCC: 0.5355
Model model_fold_9.joblib: ACC: 0.7561, SEN: 0.9362, SPE: 0.6036, MCC: 0.5617
Average ACC: 0.7493, Average SEN: 0.9064, Average SPE: 0.6162, Average MCC: 0.5376


In [25]:
# Score the saved gradient-boosting fold models on the fused offline-test features.
ML_testing(
    fused_features_test,
    fused_labels_test,
    model_save_dir_GB,
    scale_path=scale_path_ml,
    pca_path=pca_path,
)

Model model_fold_0.joblib: ACC: 0.7610, SEN: 0.9149, SPE: 0.6306, MCC: 0.5599
Model model_fold_1.joblib: ACC: 0.7512, SEN: 0.8936, SPE: 0.6306, MCC: 0.5355
Model model_fold_2.joblib: ACC: 0.7366, SEN: 0.9043, SPE: 0.5946, MCC: 0.5161
Model model_fold_3.joblib: ACC: 0.7366, SEN: 0.8936, SPE: 0.6036, MCC: 0.5116
Model model_fold_4.joblib: ACC: 0.7415, SEN: 0.8936, SPE: 0.6126, MCC: 0.5195
Model model_fold_5.joblib: ACC: 0.7415, SEN: 0.8723, SPE: 0.6306, MCC: 0.5116
Model model_fold_6.joblib: ACC: 0.7463, SEN: 0.9043, SPE: 0.6126, MCC: 0.5318
Model model_fold_7.joblib: ACC: 0.7561, SEN: 0.8936, SPE: 0.6396, MCC: 0.5435
Model model_fold_8.joblib: ACC: 0.7512, SEN: 0.8830, SPE: 0.6396, MCC: 0.5316
Model model_fold_9.joblib: ACC: 0.7366, SEN: 0.8936, SPE: 0.6036, MCC: 0.5116
Average ACC: 0.7459, Average SEN: 0.8947, Average SPE: 0.6198, Average MCC: 0.5273


# Training_MLP

In [None]:
# Width of the fused feature vector, used as the MLP input size.
# Fixed: the original read `fused_features_mlp`, a name not assigned in any visible
# cell of this notebook, so the cell raised NameError on a fresh kernel.
fused_dim = fused_features.shape[1]

# Single-logit binary setup. Author's noted alternative: output_dim = 2 with
# nn.CrossEntropyLoss().
output_dim = 1
criterion = nn.BCEWithLogitsLoss()

batch_size = 4
epoch_num = 20

# Scaler settings for the general run and the cross-validated run.
# Options noted by the author: None, 'standardize', 'normalize' (matching *.pkl path).
scale_path_mlp = 'standardize_scale_mlp.pkl'
scale_path_cv_mlp = 'standardize_scale_cv_mlp.pkl'
Preprocessmlp = 'standardize'
Preprocesscv_mlp = 'standardize'

In [None]:
# Bundle the fused training features and labels into the object that both MLP
# training helpers take as their first argument.
mlp_data = pack_fusefeatures_labels(fused_features,fused_labels)

In [None]:
# General (single-run) MLP on the fused features.
# Author's noted alternative: MLP(input_size=fused_dim, hidden_size=100, output_dim=output_dim).
# Fixed: move the model to `device` explicitly, as the other models in this notebook
# do; the original left it on the CPU unless training_general moved it.
mlp_model_general = ComplexMLP(fused_dim, [128, 64, 32], 1, 0).to(device)
optimizer_general = optim.Adam(mlp_model_general.parameters(), lr=0.001)
training_general(
    mlp_data, mlp_model_general, optimizer_general, criterion, batch_size, epoch_num, device, 'mlp',
    mode='train', Preprocess=Preprocessmlp, scale_path=scale_path_mlp,
)

In [None]:
# Cross-validated MLP on the fused features.
# Author's noted alternative: MLP(input_size=fused_dim, hidden_size=100, output_dim=output_dim).
# Fixed: move the model to `device` explicitly, as the other models in this notebook
# do; the original left it on the CPU unless the training helper moved it.
mlp_model_cv = ComplexMLP(fused_dim, [256, 128, 64], 1, 0).to(device)
# Adam with its default learning rate (no lr given).
optimizer_cv = optim.Adam(mlp_model_cv.parameters())
# NOTE(review): the positional 10 after batch_size is presumably the epoch count;
# it is not tied to epoch_num (20) — confirm this is intended.
training_with_10fold_cv(
    mlp_data, mlp_model_cv, optimizer_cv, criterion, device, batch_size, 10, 'mlp',
    reset=False, n_splits=10, mode='train', Preprocess=Preprocesscv_mlp, scale_path=scale_path_cv_mlp,
)

## MLP test

In [None]:
# Pass the fused test features through standard_data in 'test' mode with the
# general-run scaler settings.
fused_features_mlp_test, labels_mlp_test = standard_data(
    fused_features_test,
    fused_labels_test,
    'test',
    Preprocess=Preprocessmlp,
    scale_path=scale_path_mlp,
)

In [None]:
# Loader over the general-run test features; the trailing 1 is presumably the
# batch size — TODO confirm against load_data_for_offline_test.
mlp_test_data_loader = load_data_for_offline_test(fused_features_mlp_test, labels_mlp_test, 1)

In [None]:
# Same test features, passed through standard_data with the cross-validation run's
# scaler settings.
fused_features_mlp_cv_test, labels_mlp_cv_test = standard_data(
    fused_features_test,
    fused_labels_test,
    'test',
    Preprocess=Preprocesscv_mlp,
    scale_path=scale_path_cv_mlp,
)

In [None]:
# Loader over the CV-run test features; the trailing 1 is presumably the batch
# size — TODO confirm against load_data_for_offline_test.
mlp_cv_test_data_loader = load_data_for_offline_test(fused_features_mlp_cv_test, labels_mlp_cv_test, 1)

In [None]:
# Checkpoint directories scanned by the two evaluation loops below:
# model_save_dir1 is evaluated with mlp_model_general, model_save_dir2 with mlp_model_cv.
model_save_dir1 = 'best_model_mlp/'
model_save_dir2 = 'foldcv_models_mlp/'

In [None]:
# Evaluate every file found in model_save_dir1 with the general MLP on the offline
# test loader and print its metrics.
# os.listdir returns entries in arbitrary order, so sort for a stable report.
for model_file in sorted(os.listdir(model_save_dir1)):
    model_path = os.path.join(model_save_dir1, model_file)
    ACC, SEN, SPE, MCC = offline_test_model(mlp_model_general, criterion, model_path, mlp_test_data_loader, device, 'mlp')
    print(f"Model: {model_file}, ACC: {ACC:.4f}, SEN: {SEN:.4f}, SPE: {SPE:.4f}, MCC: {MCC:.4f}")

In [None]:
# Evaluate every file found in model_save_dir2 with the cross-validated MLP on its
# own offline test loader and print its metrics.
# os.listdir returns entries in arbitrary order, so sort for a stable report.
for model_file in sorted(os.listdir(model_save_dir2)):
    model_path = os.path.join(model_save_dir2, model_file)
    ACC, SEN, SPE, MCC = offline_test_model(mlp_model_cv, criterion, model_path, mlp_cv_test_data_loader, device, 'mlp')
    print(f"Model: {model_file}, ACC: {ACC:.4f}, SEN: {SEN:.4f}, SPE: {SPE:.4f}, MCC: {MCC:.4f}")