In [1]:
import copy
import os
from math import sqrt

import numpy as np
import pandas as pd
import torch
from sklearn.model_selection import KFold

import os
import sys
import nbimporter

project_root = os.path.join(os.getcwd(), '..')
sys.path.append(project_root)

from datapreparation.Process_1D_data import *
from datapreparation.Process_graph_2d_data import *
from datapreparation.Process_graph_3d_data import *
from datapreparation.Process_mlp_data import *

In [2]:
def get_data_label(data, data_type, device):
    """Split one batch into ``(inputs, labels)`` and move both to ``device``.

    Args:
        data: A single batch. For ``'1d'``, ``'3d_voxels'`` and ``'mlp'`` it is
            indexed as ``data[0]`` (inputs) and ``data[1]`` (labels). For
            ``'2d'`` and ``'3d'`` the batch object itself is the input and its
            ``.y`` attribute holds the labels.
        data_type: One of ``'1d'``, ``'2d'``, ``'3d'``, ``'3d_voxels'``, ``'mlp'``.
        device: Target passed to every ``.to(...)`` call.

    Returns:
        ``(inputs, labels)`` on ``device``, or ``(None, None)`` when
        ``data_type`` is not one of the recognised values.
    """
    # Batches delivered as an (inputs, labels) pair.
    if data_type in ('1d', '3d_voxels', 'mlp'):
        batch_inputs = data[0].to(device)
        batch_labels = data[1].to(device)
        return batch_inputs, batch_labels

    # Batches where the labels live on the batch object as ``.y``.
    if data_type in ('2d', '3d'):
        batch_inputs = data.to(device)
        batch_labels = data.y.to(device)
        return batch_inputs, batch_labels

    # Unknown type: callers detect this sentinel and raise.
    return None, None

# train model for each epoch

In [3]:
def train_epoch(model, train_loader, optimizer, criterion, device, data_type):
    """Train ``model`` for one pass over ``train_loader``.

    Args:
        model: Module to optimise; switched to training mode here.
        train_loader: Iterable of batches understood by ``get_data_label``.
        optimizer: Optimizer whose gradients are zeroed and stepped per batch.
        criterion: Loss callable. When it is a ``torch.nn.BCEWithLogitsLoss``
            the labels are cast to float, given a trailing dimension, and
            predictions are ``sigmoid(outputs) > 0.5``; otherwise predictions
            are the arg-max over dimension 1.
        device: Device the batch is moved to.
        data_type: Batch layout key forwarded to ``get_data_label``.

    Returns:
        ``(average_loss, accuracy)`` where ``average_loss`` is the mean of the
        per-batch loss values and ``accuracy`` is correct predictions divided
        by the number of labels seen. Both are ``0.0`` if the loader yields no
        batches (instead of raising ``ZeroDivisionError``).

    Raises:
        ValueError: If ``get_data_label`` does not recognise ``data_type``.
    """
    model.train()
    total_loss = total_correct = total = 0
    # Count batches while iterating so loaders without ``__len__`` also work.
    num_batches = 0

    for data in train_loader:
        inputs, labels = get_data_label(data, data_type, device)
        if inputs is None or labels is None:
            raise ValueError(f"Unsupported data type: {data_type}")

        optimizer.zero_grad()
        outputs = model(inputs)

        if isinstance(criterion, torch.nn.BCEWithLogitsLoss):
            # BCEWithLogitsLoss needs float targets shaped like the logits.
            labels = labels.float().unsqueeze(1)
            predicted = (torch.sigmoid(outputs) > 0.5).float()
        else:
            predicted = outputs.argmax(dim=1)

        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        total_correct += (predicted == labels).sum().item()
        total += labels.size(0)
        num_batches += 1

    # Guard both divisions so an empty loader yields zeros rather than a crash.
    average_loss = total_loss / num_batches if num_batches else 0.0
    accuracy = total_correct / total if total else 0.0
    return average_loss, accuracy

# validate model for each epoch

In [4]:
def validate_epoch(model, test_loader, criterion, device, data_type):
    """Evaluate ``model`` on ``test_loader`` without gradient tracking.

    Args:
        model: Module to evaluate; switched to eval mode here.
        test_loader: Iterable of batches understood by ``get_data_label``.
        criterion: Loss callable. When it is a ``torch.nn.BCEWithLogitsLoss``
            the labels are cast to float, given a trailing dimension, and
            predictions are ``sigmoid(outputs) > 0.5``; otherwise predictions
            are the arg-max over dimension 1.
        device: Device the batch is moved to.
        data_type: Batch layout key forwarded to ``get_data_label``.

    Returns:
        ``(average_loss, accuracy, SEN, SPE, MCC)``. ``average_loss`` is the
        mean of the per-batch loss values. SEN, SPE and MCC are computed from
        confusion counts that treat label ``1`` as positive and ``0`` as
        negative. Every ratio falls back to ``0`` when its denominator is zero,
        including loss and accuracy for an empty loader.

    Raises:
        ValueError: If ``get_data_label`` does not recognise ``data_type``.
    """
    model.eval()
    total_loss = total_correct = total = 0
    TP = FP = TN = FN = 0
    # Count batches while iterating so loaders without ``__len__`` also work.
    num_batches = 0

    with torch.no_grad():
        for data in test_loader:
            inputs, labels = get_data_label(data, data_type, device)
            if inputs is None or labels is None:
                raise ValueError(f"Unsupported data type: {data_type}")

            outputs = model(inputs)

            if isinstance(criterion, torch.nn.BCEWithLogitsLoss):
                # BCEWithLogitsLoss needs float targets shaped like the logits.
                labels = labels.float().unsqueeze(1)
                predicted = (torch.sigmoid(outputs) > 0.5).float()
            else:
                predicted = outputs.argmax(dim=1)

            loss = criterion(outputs, labels)
            total_loss += loss.item()
            total_correct += (predicted == labels).sum().item()
            total += labels.size(0)
            num_batches += 1

            # Confusion-matrix counts; only values 0 and 1 are considered.
            TP += ((predicted == 1) & (labels == 1)).sum().item()
            TN += ((predicted == 0) & (labels == 0)).sum().item()
            FP += ((predicted == 1) & (labels == 0)).sum().item()
            FN += ((predicted == 0) & (labels == 1)).sum().item()

    SEN = TP / (TP + FN) if TP + FN else 0
    SPE = TN / (TN + FP) if TN + FP else 0
    mcc_denominator = (TP + FP) * (TP + FN) * (TN + FP) * (TN + FN)
    MCC = (TP * TN - FP * FN) / sqrt(mcc_denominator) if mcc_denominator else 0

    # Same zero-denominator convention as SEN/SPE/MCC above.
    average_loss = total_loss / num_batches if num_batches else 0.0
    accuracy = total_correct / total if total else 0.0

    return average_loss, accuracy, SEN, SPE, MCC


# general training for 1d 2d 3d mlp

In [5]:
def training_general(data, model, optimizer, criterion, batch_size, epoch_num, device, data_type, mode='train', Preprocess=None, scale_path=None):
    """Train ``model`` on a single train/test split and checkpoint it.

    The loaders are built by the ``datapreparation`` helper matching
    ``data_type``. After every epoch the metrics are printed; every fifth
    epoch a periodic checkpoint is written, and once training ends the weights
    of the epoch with the highest validation accuracy are saved.

    Args:
        data: Raw dataset handed to the loader helpers for ``data_type``.
        model, optimizer, criterion: Training objects passed to
            ``train_epoch`` / ``validate_epoch``.
        batch_size: Batch size forwarded to the loader helpers.
        epoch_num: Number of epochs to run.
        device: Device batches are moved to.
        data_type: ``'1d'``, ``'2d'``, ``'3d'``, ``'3d_voxels'`` or ``'mlp'``.
        mode, Preprocess, scale_path: Forwarded unchanged to
            ``get_torch_graph_data_list`` (2d/3d) or ``standard_data`` (mlp).

    Raises:
        ValueError: If ``data_type`` is not supported. This is checked before
            any directory is created or data is loaded.

    Side effects:
        Creates ``best_model_<data_type>/`` and writes ``.pth`` state dicts
        into it; prints progress to stdout.
    """
    # Validate first so a typo does not leave an empty directory behind.
    if data_type not in ('1d', '2d', '3d', '3d_voxels', 'mlp'):
        raise ValueError(f"Unsupported data type: {data_type}")

    best_val_accuracy = 0
    best_model_state = None
    model_save_dir = f'best_model_{data_type}'
    os.makedirs(model_save_dir, exist_ok=True)

    if data_type == '1d':
        smiles_list, labels = unpack_smiles_label(data)
        train_loader, test_loader = load_data_1d(smiles_list, labels, batch_size)
    elif data_type in ('2d', '3d'):
        torch_graph_data_list = get_torch_graph_data_list(data, mode, Preprocess, scale_path)
        train_loader, test_loader = load_graph_data(torch_graph_data_list, batch_size)
    elif data_type == '3d_voxels':
        train_loader, test_loader = load_3d_voxels_data(data, batch_size)
    else:  # 'mlp'
        fused_features, processed_labels = unpack_fusefeatures_labels(data)
        fused_features, processed_labels = standard_data(fused_features, processed_labels, mode, Preprocess, scale_path)
        train_loader, test_loader = load_data_mlp(fused_features, processed_labels, batch_size)

    for epoch in range(epoch_num):
        train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, criterion, device, data_type)
        val_loss, val_accuracy, SEN, SPE, MCC = validate_epoch(model, test_loader, criterion, device, data_type)

        print(f'Epoch: {epoch+1}/{epoch_num}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, '
              f'Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}, SEN: {SEN:.4f}, SPE: {SPE:.4f}, MCC: {MCC:.4f}')

        if val_accuracy > best_val_accuracy:
            best_val_accuracy = val_accuracy
            # state_dict() returns references to the live parameter tensors;
            # without a deep copy later epochs would overwrite this snapshot
            # and the "best" checkpoint would hold the final weights instead.
            best_model_state = copy.deepcopy(model.state_dict())

        if (epoch + 1) % 5 == 0:
            periodic_save_path = os.path.join(model_save_dir, f'model_periodic_epoch_{epoch+1}.pth')
            torch.save(model.state_dict(), periodic_save_path)

    # None means no epoch ever scored a validation accuracy above zero.
    if best_model_state is not None:
        best_model_path = os.path.join(model_save_dir, f'best_model_val_acc_{best_val_accuracy:.4f}.pth')
        torch.save(best_model_state, best_model_path)
        print(f"Best model saved: {best_model_path}")

    print("Training complete.")

In [6]:
def reset_weights(m):
    """Call ``m.reset_parameters()`` when ``m`` provides it; otherwise do nothing.

    Written to be handed to ``model.apply(...)`` so every submodule that
    defines ``reset_parameters`` is re-initialised.
    """
    try:
        reinitialise = m.reset_parameters
    except AttributeError:
        # Objects without the hook are left untouched.
        return
    reinitialise()

# 10 fold cv training for 1d 2d 3d mlp

In [7]:
def training_with_10fold_cv(data, model, optimizer, criterion, device, batch_size, epoch_num, data_type, reset=False, n_splits=10, mode='train',Preprocess=None,scale_path=None):
    """Train and validate ``model`` with K-fold cross-validation.

    For each fold the model is trained for ``epoch_num`` epochs, the weights of
    the epoch with the highest validation accuracy are saved, and the per-fold
    results are averaged and printed at the end.

    Args:
        data: Raw dataset handed to the loader helpers for ``data_type``.
        model, optimizer, criterion: Training objects passed to
            ``train_epoch`` / ``validate_epoch``.
        device: Device batches are moved to.
        batch_size: Batch size forwarded to the fold loader helpers.
        epoch_num: Number of epochs per fold.
        data_type: ``'1d'``, ``'2d'``, ``'3d'``, ``'3d_voxels'`` or ``'mlp'``.
        reset: When true, ``reset_weights`` is applied to the model at the
            start of every fold and ``optimizer`` is replaced by a fresh
            ``torch.optim.Adam`` with default hyper-parameters. When false the
            same model and optimizer keep training across folds.
        n_splits: Number of folds (shuffled ``KFold`` with ``random_state=42``).
        mode, Preprocess, scale_path: Forwarded unchanged to the data helpers.

    Raises:
        ValueError: If ``data_type`` is not supported. This is checked before
            any directory is created or data is loaded.

    Side effects:
        Creates ``foldcv_models_<data_type>/`` and writes one ``.pth`` state
        dict per fold into it; prints progress to stdout.
    """
    # Validate first: previously an unknown type surfaced later as an
    # UnboundLocalError on ``split_data``.
    if data_type not in ('1d', '2d', '3d', '3d_voxels', 'mlp'):
        raise ValueError(f"Unsupported data type: {data_type}")

    model_save_dir = f'foldcv_models_{data_type}'
    os.makedirs(model_save_dir, exist_ok=True)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    val_accuracies = []
    val_losses = []

    if data_type == '1d':
        smiles_list, labels = unpack_smiles_label(data)
        split_data = kf.split(smiles_list)
    elif data_type in ('2d', '3d'):
        torch_graph_data_list = get_torch_graph_data_list(data, mode, Preprocess, scale_path)
        split_data = kf.split(torch_graph_data_list)
    elif data_type == '3d_voxels':
        voxels, labels = get_voxels_labels(data)
        split_data = kf.split(voxels)
    else:  # 'mlp'
        # NOTE(review): training_general calls unpack_fusefeatures_labels(data)
        # with a single argument; confirm which signature is correct.
        fused_features, processed_labels = unpack_fusefeatures_labels(data, mode, Preprocess, scale_path)
        fused_features, processed_labels = standard_data(fused_features, processed_labels, mode, Preprocess, scale_path)
        split_data = kf.split(fused_features)

    for fold, (train_idx, test_idx) in enumerate(split_data):
        print(f'Fold {fold + 1}/{n_splits}')

        if reset:
            model.apply(reset_weights)
            # NOTE(review): this discards the caller's optimizer type and
            # hyper-parameters in favour of default Adam.
            optimizer = torch.optim.Adam(model.parameters())

        if data_type == '1d':
            train_loader, test_loader = load_data_1d_10fold_cv(smiles_list, labels, train_idx, test_idx, batch_size)
        elif data_type in ('2d', '3d'):
            train_loader, test_loader = load_graph_data_10fold_cv(torch_graph_data_list, train_idx, test_idx, batch_size)
        elif data_type == '3d_voxels':
            train_loader, test_loader = load_3d_voxels_data_10fold_cv(voxels, labels, train_idx, test_idx, batch_size)
        else:  # 'mlp'
            train_loader, test_loader = load_data_mlp_10fold_cv(fused_features, processed_labels, train_idx, test_idx, batch_size)

        best_val_accuracy = 0
        best_model_state = None
        best_epoch = -1
        # Defined up front so epoch_num == 0 does not hit an unbound name below.
        val_loss = float('nan')

        for epoch in range(epoch_num):
            train_loss, train_accuracy = train_epoch(model, train_loader, optimizer, criterion, device, data_type)
            val_loss, val_accuracy, SEN, SPE, MCC = validate_epoch(model, test_loader, criterion, device, data_type)

            print(f'Epoch: {epoch+1}/{epoch_num}, Train Loss: {train_loss:.4f}, Train Acc: {train_accuracy:.4f}, '
                  f'Val Loss: {val_loss:.4f}, Val Acc: {val_accuracy:.4f}, SEN: {SEN:.4f}, SPE: {SPE:.4f}, MCC: {MCC:.4f}')

            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                # state_dict() returns references to the live parameter
                # tensors; deep-copy so later epochs (and later folds) cannot
                # overwrite the snapshot of the best epoch.
                best_model_state = copy.deepcopy(model.state_dict())
                best_epoch = epoch

        if best_model_state is not None:
            model_save_path = os.path.join(model_save_dir, f'best_model_fold_{fold+1}_epoch_{best_epoch+1}.pth')
            torch.save(best_model_state, model_save_path)

        val_accuracies.append(best_val_accuracy)
        # This is the final epoch's loss, whereas the accuracy above is the
        # best epoch's.
        val_losses.append(val_loss)

    avg_val_accuracy = np.mean(val_accuracies)
    avg_val_loss = np.mean(val_losses)
    print(f'Average Validation Accuracy: {avg_val_accuracy:.4f}, Average Validation Loss: {avg_val_loss:.4f}')


# train models with classical machine learning methods

In [8]:
import numpy as np
import joblib
import os
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, recall_score, matthews_corrcoef
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA

def ML_training(X, y, model, scaler_option=None, scale_path=None,pca_components=None,pca_path=None,cv_folds=10):
    """Cross-validate a scikit-learn style estimator and save each fold's model.

    Args:
        X: Feature matrix.
        y: Class labels; sensitivity uses label ``1`` as positive and
            specificity uses label ``0``.
        model: Estimator exposing ``fit`` and ``predict``. The same object is
            refitted on every fold.
        scaler_option: ``'standardize'`` (``StandardScaler``), ``'normalize'``
            (``MinMaxScaler``); any other value leaves ``X`` unscaled.
        scale_path: Where to persist the fitted scaler. When ``None`` the
            scaler is still applied but not saved.
        pca_components: If not ``None``, ``n_components`` for a ``PCA`` applied
            after scaling.
        pca_path: Where to persist the fitted PCA. When ``None`` it is not saved.
        cv_folds: Number of stratified folds (shuffled, ``random_state=42``).

    Side effects:
        Creates ``<ModelClassName>_foldcv/`` and writes one ``.joblib`` file
        per fold; prints per-fold and averaged ACC/SEN/SPE/MCC to stdout.

    Note:
        The scaler and PCA are fitted on all of ``X`` before the folds are
        split, so each fold's test rows influence those transforms.
    """
    scaler_classes = {'standardize': StandardScaler, 'normalize': MinMaxScaler}
    if scaler_option in scaler_classes:
        scaler = scaler_classes[scaler_option]()
        X = scaler.fit_transform(X)
        # joblib.dump rejects a None filename, so only persist when asked to.
        if scale_path is not None:
            joblib.dump(scaler, scale_path)

    if pca_components is not None:
        pca = PCA(n_components=pca_components)
        X = pca.fit_transform(X)
        if pca_path is not None:
            joblib.dump(pca, pca_path)

    def _positionally_indexable(values):
        # The fold indices are positional: pandas objects would treat them as
        # labels/columns and plain sequences cannot take an index array.
        if hasattr(values, 'to_numpy'):
            return values.to_numpy()
        if isinstance(values, (list, tuple)):
            return np.asarray(values)
        return values

    X = _positionally_indexable(X)
    y = _positionally_indexable(y)

    model_name = type(model).__name__
    model_dir = f"{model_name}_foldcv"
    os.makedirs(model_dir, exist_ok=True)

    accuracies, sensitivities, specificities, mccs = [], [], [], []

    skf = StratifiedKFold(n_splits=cv_folds, shuffle=True, random_state=42)
    for fold, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train)
        model_filename = os.path.join(model_dir, f'model_fold_{fold}.joblib')
        joblib.dump(model, model_filename)

        y_pred = model.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        # Recall of the positive class is sensitivity; of class 0, specificity.
        sensitivity = recall_score(y_test, y_pred, pos_label=1)
        specificity = recall_score(y_test, y_pred, pos_label=0)
        mcc = matthews_corrcoef(y_test, y_pred)

        accuracies.append(accuracy)
        sensitivities.append(sensitivity)
        specificities.append(specificity)
        mccs.append(mcc)

        print(f"Fold {fold}: ACC: {accuracy:.4f}, SEN: {sensitivity:.4f}, SPE: {specificity:.4f}, MCC: {mcc:.4f}")

    avg_accuracy = np.mean(accuracies)
    avg_sensitivity = np.mean(sensitivities)
    avg_specificity = np.mean(specificities)
    avg_mcc = np.mean(mccs)

    print(f"Average ACC: {avg_accuracy:.4f}, Average SEN: {avg_sensitivity:.4f}, "
          f"Average SPE: {avg_specificity:.4f}, Average MCC: {avg_mcc:.4f}")
