In [1]:
# Cell 1: Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report, roc_auc_score, precision_score, recall_score, roc_curve, precision_recall_curve, matthews_corrcoef
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, Dropout, LayerNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from imblearn.over_sampling import SMOTE
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

print("Libraries loaded successfully.")

2024-09-09 20:44:46.878510: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-09-09 20:44:46.896001: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:485] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-09 20:44:46.915524: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:8454] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-09 20:44:46.921284: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1452] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-09 20:44:46.936129: I tensorflow/core/platform/cpu_feature_guar

Libraries loaded successfully.


In [2]:
# Cell 2: Load and preprocess dataset
df = pd.read_csv('merged_data_clean.csv')
df = df[(df['Source'] == 'ToxiM') | (df['Source'] == 'MolToxPred')]
categorical_columns = ['SMILES', 'Source']
df = df.drop(columns=categorical_columns)
Y = df['Toxicity']
# X_pca_df = pd.read_csv('X_boruta_df_ToxiM_and_MoltoxPred.csv')
X_pca_df = pd.read_csv('X_pca_clean_with_boruta_ToxiM_and_MolToxPred.csv')
# Handle class imbalance using SMOTE
smote = SMOTE(random_state=42)
X_resampled, Y_resampled = smote.fit_resample(X_pca_df, Y)

# Split data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(X_resampled, Y_resampled, test_size=0.2, random_state=42, shuffle=True, stratify=Y_resampled)

print("Data preprocessing completed.")

Data preprocessing completed.


In [3]:
# Cell 3: Define and train Deep Neural Network with Keras Tuner
from kerastuner.tuners import BayesianOptimization

def build_model(hp):
    model = Sequential()
    for i in range(hp.Int('num_layers', 2, 6)):
        model.add(Dense(units=hp.Int(f'units_{i}', min_value=32, max_value=512, step=32), 
                        kernel_regularizer=tf.keras.regularizers.l2(0.01)))
        model.add(LeakyReLU(alpha=0.01))
        model.add(LayerNormalization())
        model.add(Dropout(hp.Float(f'dropout_{i}', min_value=0.3, max_value=0.5, step=0.05)))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=Adam(learning_rate=hp.Float('learning_rate', 1e-4, 1e-2, sampling='LOG')), 
                  loss='binary_crossentropy', 
                  metrics=['accuracy'])
    return model

tuner = BayesianOptimization(build_model, 
                             objective='val_accuracy', 
                             max_trials=20, 
                             executions_per_trial=2, 
                             directory='tuner_dir', 
                             project_name='hyperparameter_tuning_bayes')

reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=10, min_lr=0.0001)
early_stopping = EarlyStopping(monitor='val_loss', patience=20, restore_best_weights=True)

tuner.search(X_train, Y_train, epochs=50, validation_split=0.2, callbacks=[reduce_lr, early_stopping])

best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
dnn_model = tuner.hypermodel.build(best_hps)
history = dnn_model.fit(X_train, Y_train, epochs=300, batch_size=32, validation_split=0.2, verbose=1, callbacks=[reduce_lr, early_stopping])

print("Deep Neural Network trained.")

Reloading Tuner from tuner_dir/hyperparameter_tuning_bayes/tuner0.json


  from kerastuner.tuners import BayesianOptimization
2024-09-09 20:46:16.941676: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2343] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


Epoch 1/300
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 7ms/step - accuracy: 0.6566 - loss: 5.5983 - val_accuracy: 0.7852 - val_loss: 4.3343 - learning_rate: 1.6803e-04
Epoch 2/300
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7686 - loss: 4.0928 - val_accuracy: 0.8071 - val_loss: 3.2992 - learning_rate: 1.6803e-04
Epoch 3/300
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8164 - loss: 3.0908 - val_accuracy: 0.8278 - val_loss: 2.6032 - learning_rate: 1.6803e-04
Epoch 4/300
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8260 - loss: 2.4492 - val_accuracy: 0.8361 - val_loss: 2.1254 - learning_rate: 1.6803e-04
Epoch 5/300
[1m302/302[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8399 - loss: 1.9746 - val_accuracy: 0.8328 - val_loss: 1.7840 - learning_rate: 1.6803e-04
Epoch 6/300
[1m302/302[0m [32m━━━━━━━━━━━━

In [4]:
models = {
    'LR': LogisticRegression(random_state=42,max_iter=200, C =0.01),
    'DT': DecisionTreeClassifier(random_state = 42,max_depth=10),
    'RF': RandomForestClassifier(random_state = 42, n_estimators=100),
    'GB': GradientBoostingClassifier(random_state=42,learning_rate=0.2),
    'XGB': XGBClassifier(random_state = 42,use_label_encoder=False, eval_metric='logloss', learning_rate=0.1),
    'SVM': SVC(probability=True, C=1),
    'KNN': KNeighborsClassifier(n_neighbors=7),
    'NB': GaussianNB()
}

fitted_models = {}
for model_name, model in models.items():
    print("This model is running", model_name)
    model.fit(X_train, Y_train)  # Fit the model to the training data
    fitted_models[model_name] = model  # Store the trained model
fitted_models['DNN'] = dnn_model;
print("Traditional machine learning models trained.")

This model is running LR
This model is running DT
This model is running RF
This model is running GB
This model is running XGB
This model is running SVM
This model is running KNN
This model is running NB
Traditional machine learning models trained.


In [5]:
# Cell 5: Evaluate all models and store metrics
def evaluate_models(models, X_train, X_test, Y_train, Y_test):
    results = []
    for model_name, model in models.items():
        print(f"Evaluating {model_name}...")
        if model_name == 'DNN':
            train_pred = (model.predict(X_train) > 0.5).astype(int)
            test_pred = (model.predict(X_test) > 0.5).astype(int)
            train_pred_prob = model.predict(X_train).ravel()
            test_pred_prob = model.predict(X_test).ravel()
        else:
            train_pred = model.predict(X_train)
            test_pred = model.predict(X_test)
            train_pred_prob = model.predict_proba(X_train)[:, 1]
            test_pred_prob = model.predict_proba(X_test)[:, 1]
        
        metrics = {
            'Model': model_name,
            'Train Accuracy': accuracy_score(Y_train, train_pred),
            'Test Accuracy': accuracy_score(Y_test, test_pred),
            'Train F1': f1_score(Y_train, train_pred),
            'Test F1': f1_score(Y_test, test_pred),
            'Train MCC': matthews_corrcoef(Y_train, train_pred),
            'Test MCC': matthews_corrcoef(Y_test, test_pred),
            'Train ROC AUC': roc_auc_score(Y_train, train_pred_prob),
            'Test ROC AUC': roc_auc_score(Y_test, test_pred_prob),
            'Train Precision': precision_score(Y_train, train_pred),
            'Test Precision': precision_score(Y_test, test_pred),
            'Train Recall': recall_score(Y_train, train_pred),
            'Test Recall': recall_score(Y_test, test_pred)
        }
        results.append(metrics)
    
    results_df = pd.DataFrame(results)
    results_df.to_csv("model_metrics_latest.csv")
    print("Model metrics saved to 'model_metrics.csv'.")

evaluate_models({'DNN': dnn_model, **fitted_models}, X_train, X_test, Y_train, Y_test)

Evaluating DNN...
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m378/378[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
[1m95/95[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Evaluating LR...
Evaluating DT...
Evaluating RF...
Evaluating GB...
Evaluating XGB...
Evaluating SVM...
Evaluating KNN...
Evaluating NB...
Model metrics saved to 'model_metrics.csv'.


In [6]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, precision_recall_curve

def plot_curves(models, X_train, X_test, Y_train, Y_test):
    plt.rcParams.update({'font.size': 12})
    
    # ROC Curve for Training Data
    plt.figure(figsize=(7, 5))
    for model_name, model in models.items():
        if model_name == 'DNN':
            train_pred_prob = model.predict(X_train).ravel()
        else:
            train_pred_prob = model.predict_proba(X_train)[:, 1]
        
        fpr_train, tpr_train, _ = roc_curve(Y_train, train_pred_prob)
        plt.plot(fpr_train, tpr_train, label=f'{model_name}')
    
    plt.plot([0, 1], [0, 1], linestyle='--', color='black')  # Diagonal 45-degree line
    plt.title('ROC Curve - Training Data')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.savefig('roc_curve_train.png', dpi=300)
    plt.show()

    # ROC Curve for Testing Data
    plt.figure(figsize=(7, 5))
    for model_name, model in models.items():
        if model_name == 'DNN':
            test_pred_prob = model.predict(X_test).ravel()
        else:
            test_pred_prob = model.predict_proba(X_test)[:, 1]
        
        fpr_test, tpr_test, _ = roc_curve(Y_test, test_pred_prob)
        plt.plot(fpr_test, tpr_test, label=f'{model_name}')
    
    plt.plot([0, 1], [0, 1], linestyle='--', color='black')  # Diagonal 45-degree line
    plt.title('ROC Curve - Testing Data')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.legend()
    plt.savefig('roc_curve_test.png', dpi=300)
    plt.show()

    # Precision-Recall Curve for Training Data
    plt.figure(figsize=(7, 5))
    for model_name, model in models.items():
        if model_name == 'DNN':
            train_pred_prob = model.predict(X_train).ravel()
        else:
            train_pred_prob = model.predict_proba(X_train)[:, 1]
        
        precision_train, recall_train, _ = precision_recall_curve(Y_train, train_pred_prob)
        plt.plot(recall_train, precision_train, label=f'{model_name}')
    # plt.plot([1, 0], [0, 1], linestyle='--', color='black')
    plt.title('Precision-Recall Curve - Training Data')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.savefig('precision_recall_curve_train.png', dpi=300)
    plt.show()

    # Precision-Recall Curve for Testing Data
    plt.figure(figsize=(7, 5))
    for model_name, model in models.items():
        if model_name == 'DNN':
            test_pred_prob = model.predict(X_test).ravel()
        else:
            test_pred_prob = model.predict_proba(X_test)[:, 1]
        
        precision_test, recall_test, _ = precision_recall_curve(Y_test, test_pred_prob)
        plt.plot(recall_test, precision_test, label=f'{model_name}')
    # plt.plot([1, 0], [0, 1], linestyle='--', color='black')
    plt.title('Precision-Recall Curve - Testing Data')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend()
    plt.savefig('precision_recall_curve_test.png', dpi=300)
    plt.show()

    print("All plots have been saved as separate PNG files with 300 dpi.")

In [None]:
print("Plotting ROC and Precision-Recall curves...")
plot_curves(fitted_models, X_train, X_test, Y_train, Y_test)
print("Plots saved successfully.")

Plotting ROC and Precision-Recall curves...
