## Import Libraries

In [43]:
# Librerie per lettura file
import pandas as pd
from pathlib import Path
import json
import os
import psutil
import numpy as np
from tqdm import tqdm
import warnings
import joblib

#Kfold and GridSearch
from imblearn.over_sampling import SMOTE
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from scipy.stats import randint, uniform
from scipy.stats import pearsonr


# Grafici
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

# Data Analysis
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score,
    precision_recall_curve, classification_report, confusion_matrix,
    roc_curve, roc_auc_score, average_precision_score
)
from imblearn.over_sampling import SMOTE
from sklearn.multioutput import MultiOutputRegressor

# Modelli con Alberi
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor


from scipy.stats import uniform, randint

# Librerie Torch per MLP
import torch
import pytorch_lightning as pl
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split, SubsetRandomSampler
from torchmetrics import F1Score, Accuracy, Precision, Recall
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger

# Explainable AI (SHAP)
import shap

# Librerie per gestione dati parallela
import modin.pandas as mpd
import modin.config as cfg

# Visualizzazione matrice di confusione
from sklearn.metrics import ConfusionMatrixDisplay
import optuna
from sklearn.metrics import average_precision_score


In [44]:
matplotlib.use('Agg') 
warnings.filterwarnings("ignore", category=UserWarning, module="matplotlib")
optuna.logging.set_verbosity(optuna.logging.WARNING)
import warnings
warnings.filterwarnings(
    "ignore",
    message="FigureCanvasAgg is non-interactive, and thus cannot be shown"
)


## Reading File

In [45]:
# Reading the files
score_df = pd.read_csv("Preprocessing/drug_scores_with_targets.csv")
df_clean = pd.read_csv("Preprocessing/transcrittoma_pulito.csv")

In [46]:
score_df.iloc[1:30,:] #show the best 30 drugs (the first one has score None due to the lack of data, so it is not taken in consideration)

Unnamed: 0,Drug,Score,Drug_name,Targets
1,290.0,0.179823,KIN001-260,['IKK-complex']
2,306.0,0.168887,TG101348,['JAK2']
3,222.0,0.162292,BX-912,['PDPK1']
4,329.0,0.161085,QL-XI-92,['DDR1']
5,265.0,0.151817,Tubastatin A,['HDAC6']
6,326.0,0.142042,GSK690693,['AKT']
7,253.0,0.140113,XMD14-99,"['CAMK1', 'EPHB3']"
8,257.0,0.137881,NPK76-II-72-1,['PLK3']
9,309.0,0.136929,Y-39983,"['ROCK2', 'ROCK1']"
10,219.0,0.132665,AT-7519,['CDK9']


## Model IC50

In this section, you can run models that predict IC50 and Drug sensitivity. The analysis specifically focuses on the top 30 drugs, although you can include more if desired (l>31). However, note that drugs with a negative score perform worse than a random model and are therefore not recommended for use—this applies especially to drugs ranked beyond the 194th position.

Currently, only the functions used in the article are implemented here. However, lots of models that uses scikit-learn can be easily added by modifying this structure, which is based on the XGBoost (XGB) model.

#### Here you can modify the ensemble models
models = [GradientBoostingRegressor for _ in range(5)]

#### Seed
rf_params = {
        "random_state": 42,
    }

#### Hyperparameter Tuning with optuna
params = {
    "n_estimators": randint(10, 301),            
    "max_depth": randint(3, 11),                   
    "learning_rate": [1, 0.1, 0.01, 0.3],          
    "subsample": [0.4, 0.5, 0.6, 0.8, 1],          
    "min_samples_split": randint(2, 11),          
    "min_samples_leaf": randint(1, 11),            
    "max_features": ['sqrt', 'log2', None]         
}

#### Directory
shap_dir = 'Results/Models_IC50/GB_IC50'

#### Run the function (smo = False to avoid oversampling and smo = True to apply SMOTE+KNN)
train_ic50(df_clean, rf_params, params, models= models, shap_dir = shap_dir, l=31, smo = False)

### Optuna objective

In [47]:
def objective_reg_r2_low_ic50_percentile(trial, model, X_train, y_train, X_val, y_val, base_params, param_grid, percentile=30):

    '''
    Objective function for hyperparameter optimization using Optuna.

    Parameters:
    - trial: Optuna Trial object, used to suggest new hyperparameter values.
    - model: scikit-learn-compatible model to be optimized.
    - X_train, y_train: training data.
    - X_val, y_val: validation data 
    - base_params: dictionary of fixed model parameters (you can see it when you call the main function)
    - param_grid: dictionary defining the search space for each hyperparameter ((you can see it when you call the main function))
    - percentile: float, percentile of the validation set to select for performance evaluation.

    returns:
    - R² score computed on the selected percentile of the validation set, which will be maximized by Optuna.
    '''

    params = base_params.copy()

    for key, space in param_grid.items():
        if isinstance(space, list):
            params[key] = trial.suggest_categorical(key, space)
        elif isinstance(space, range):
            params[key] = trial.suggest_int(key, min(space), max(space))
        elif hasattr(space, "rvs"):
            a, b = space.a, space.b
            params[key] = trial.suggest_int(key, a, b - 1)
        else:
            raise ValueError(f"Tipo non supportato per il parametro {key}: {type(space)}")

    model.set_params(**params)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    cutoff = np.percentile(y_val, percentile)
    mask = y_val < cutoff

    if mask.sum() < 2:
        return -np.inf

    return r2_score(y_val[mask], y_pred[mask])

### Training model

In [48]:
def model_ic50(train_set, train_cells, rf_params={}, 
               base_models=None, param_grid=None, n_splits=5, random_state=42, smo = False, n_trials = 20):
    '''
    Function to train and evaluate ensemble ML models using K-Fold cross-validation as explained in the article,
    SMOTE for class balancing, and Optuna for hyperparameter optimization.

    Parameters:
    - train_set: DataFrame containing the full training data, including features and labels.
    - train_cells: Array-like list of cell line identifiers used for splitting the data into folds.
    - rf_params: Dictionary of fixed parameters for the base model(s) (optional).
    - base_models: List of model constructors (e.g., [RandomForestClassifier, ...]); one per fold. If None, RandomForestRegressor is used for all folds.
    - param_grid: Dictionary defining the search space for hyperparameters to tune with Optuna.
    - n_splits: Number of folds for cross-validation (default is 5).
    - n_trials: Number of Optuna trials per fold for hyperparameter tuning (default is 10).
    - random_state: Seed for reproducibility (used in SMOTE, CV splitting, and Optuna sampler).
    - smo : True if you want to do the oversampling (second approach in the article), else False

    Returns:
    - models_per_fold: A list of dictionaries, each containing the trained model, evaluation scores (AUC, average precision, F1), and best hyperparameters for a fold.
    - y_val_all: Combined list of true validation labels across all folds.
    - y_val_pred_all: Combined list of predicted probabilities across all folds.
    '''

    #Base models if the models given to the function are None
    if base_models is None:
        base_models = [RandomForestRegressor for _ in range(n_splits)]
    #Parameter grid if the ones given to the function are None
    if param_grid is None:
        param_grid = {
            "n_estimators": [10],
            "max_depth": [7]
        }
    
    # Ensemble Models
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    y_val_ic50_all = []
    y_val_pred_ic50_all = []
    models_per_fold = []
    y_val_sens_all = []


    for i, (train_idx, val_idx) in enumerate(kf.split(train_cells)):
        #Trainin and Validation for each split
        train_cells_fold = train_cells[train_idx]
        val_cells_fold = train_cells[val_idx]


        train_fold = train_set[train_set["Cell_line_cosmic_identifiers"].isin(train_cells_fold)]
        val_fold = train_set[train_set["Cell_line_cosmic_identifiers"].isin(val_cells_fold)]

        X_train = train_fold.drop(columns=["IC50", "Sensitivity", "Cell_line_cosmic_identifiers", 
                                           "Screened_Compounds_", "Sample_Names", "Drug_id","cell_name"])
        y_train_ic50 = train_fold["IC50"]
        y_train_sens = train_fold["Sensitivity"]

        X_val = val_fold.drop(columns=["IC50", "Sensitivity", "Cell_line_cosmic_identifiers", 
                                       "Screened_Compounds_", "Sample_Names", "Drug_id","cell_name"])
        y_val_ic50 = val_fold["IC50"]
        y_val_sens = val_fold["Sensitivity"]

        if smo:
                #Smote+KNN for balancing classes  
                smote = SMOTE(random_state=42)
                X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_sens)

       
                knn = KNeighborsRegressor(n_neighbors=3)
                knn.fit(X_train, y_train_ic50)

                y_ic50_resampled = knn.predict(X_train_resampled)

  
                df_train_final = X_train_resampled.copy()
                df_train_final["IC50"] = y_ic50_resampled
                df_train_final["Sensitivity"] = y_train_resampled


                X_train_smo = df_train_final.drop(columns=["IC50"])
                y_train_ic50_smo = df_train_final["IC50"]

                X_train = pd.DataFrame(X_train_smo, columns=X_train.columns)
                y_train_ic50 = pd.Series(y_train_ic50_smo, index=X_train.index)


        # Optuna
        mod = base_models[i](**rf_params)
        sampler = optuna.samplers.TPESampler(seed=42)
        study = optuna.create_study(direction="maximize", sampler=sampler)
        study.optimize(
            lambda trial: objective_reg_r2_low_ic50_percentile(trial, mod, X_train, y_train_ic50, X_val, y_val_ic50, rf_params, param_grid),
            n_trials=n_trials
        )

        best_params = study.best_params
        best_params["random_state"] = 42
        best_model = mod.set_params(**best_params)

        #Predict IC50
        best_model.fit(X_train, y_train_ic50)
        y_pred_ic50 = best_model.predict(X_val)
        y_val_ic50_all.extend(y_val_ic50)
        y_val_pred_ic50_all.extend(y_pred_ic50)
        y_val_sens_all.extend(y_val_sens)


        models_per_fold.append(best_model)

    return models_per_fold, y_val_ic50_all, y_val_pred_ic50_all, y_val_sens_all

### Already trained

In [49]:
def testing(train_set, train_cells,shap_dir,drug_name,smo,n_splits=5,random_state = 42):

    '''
    Function to test and evaluate already trained ensemble ML models using K-Fold cross-validation as explained in the article,
    SMOTE for class balancing, and Optuna for hyperparameter optimization.

    Parameters:
    - train_set: DataFrame containing the full training data, including features and labels.
    - train_cells: Array-like list of cell line identifiers used for splitting the data into folds.
    - shap_dir: directory where are saved the models
    – drug_name: name of the drug
    - n_splits: Number of folds for cross-validation (default is 5).
    - n_trials: Number of Optuna trials per fold for hyperparameter tuning (default is 10).
    - random_state: Seed for reproducibility (used in SMOTE, CV splitting, and Optuna sampler).

    Returns:
    - models_per_fold: A list of dictionaries, each containing the trained model, evaluation scores (AUC, average precision, F1), and best hyperparameters for a fold.
    - y_val_all: Combined list of true validation labels across all folds.
    - y_val_pred_all: Combined list of predicted probabilities across all folds.
    '''

    # Ensemble Models
    
    kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    y_val_ic50_all = []
    y_val_pred_ic50_all = []
    models_per_fold = []
    y_val_sens_all = []
    models_per_fold = []


    for i, (train_idx, val_idx) in enumerate(kf.split(train_cells)):
        #Trainin and Validation for each split
        train_cells_fold = train_cells[train_idx]
        val_cells_fold = train_cells[val_idx]
        train_fold = train_set[train_set["Cell_line_cosmic_identifiers"].isin(train_cells_fold)]
        val_fold = train_set[train_set["Cell_line_cosmic_identifiers"].isin(val_cells_fold)]

        X_train = train_fold.drop(columns=["IC50", "Sensitivity", "Cell_line_cosmic_identifiers", 
                                           "Screened_Compounds_", "Sample_Names", "Drug_id","cell_name"])
        y_train_ic50 = train_fold["IC50"]
        y_train_sens = train_fold["Sensitivity"]

        X_val = val_fold.drop(columns=["IC50", "Sensitivity", "Cell_line_cosmic_identifiers", 
                                       "Screened_Compounds_", "Sample_Names", "Drug_id","cell_name"])
        y_val_ic50 = val_fold["IC50"]
        y_val_sens = val_fold["Sensitivity"]

        if smo:
                # SMOTE+KNN for balancing classes
                smote = SMOTE(random_state=42)
                X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train_sens)

       
                knn = KNeighborsRegressor(n_neighbors=3)
                knn.fit(X_train, y_train_ic50)

                y_ic50_resampled = knn.predict(X_train_resampled)

  
                df_train_final = X_train_resampled.copy()
                df_train_final["IC50"] = y_ic50_resampled
                df_train_final["Sensitivity"] = y_train_resampled


                X_train_smo = df_train_final.drop(columns=["IC50"])
                y_train_ic50_smo = df_train_final["IC50"]

                X_train = pd.DataFrame(X_train_smo, columns=X_train.columns)
                y_train_ic50 = pd.Series(y_train_ic50_smo, index=X_train.index)

        
        models_dir = os.path.join(shap_dir, f"{drug_name}_models")
        model_path = os.path.join(models_dir, f"model_fold_{i}.pkl")
        model = joblib.load(model_path)


        # Predict IC50
        y_pred_ic50 = model.predict(X_val)
        y_val_ic50_all.extend(y_val_ic50)
        y_val_pred_ic50_all.extend(y_pred_ic50)
        y_val_sens_all.extend(y_val_sens)
        
        models_per_fold.append(model)

    return models_per_fold, y_val_ic50_all, y_val_pred_ic50_all, y_val_sens_all

#### Plots

In [50]:
def gen_plots(
    y_test_ic50,                 
    y_pred_test_ic50,           
    sensitivities,               
    y_pred_sensitivity,         
    fpr_val, tpr_val, auc_score_val,  
    fpr_test, tpr_test, auc_score_test,  
    recall_val, precision_val, avg_auc_pr_val, best_idx_val, best_threshold_val,  
    recall_test, precision_test, avg_auc_pr_test,                                
    drug_name,                    
    shap_dir                    

):
        

        '''
        Function to generate plots
        '''
        
        df_plot = pd.DataFrame({
            "True_IC50": y_test_ic50,
            "Predicted_IC50": y_pred_test_ic50,
            "Sensitive": sensitivities.map({0: "Non-sensitive", 1: "Sensitive"})
        })

        plt.figure(figsize=(6, 6))
        sns.scatterplot(data=df_plot, x="True_IC50", y="Predicted_IC50", hue="Sensitive", alpha=0.7)
        plt.plot([df_plot["True_IC50"].min(), df_plot["True_IC50"].max()],
                [df_plot["True_IC50"].min(), df_plot["True_IC50"].max()], 'r--')
        plt.xlabel("True IC50")
        plt.ylabel("Predicted IC50")
        plt.title(f"IC50 Prediction- {drug_name}")
        plt.legend(title="Cell Sensitivity")
        plt.grid(True)
        plt.tight_layout()
        plt.savefig(f"{shap_dir}/{drug_name}_ic50_scatter_rf.png")
        plt.close()


        cutoff_range = np.arange(y_pred_test_ic50.min(), y_pred_test_ic50.max(), 0.1)
        precisions_cutoff = []
        recalls_cutoff = []
        true_sensitivity = np.array(sensitivities)
        y_pred_test_ic50 = np.array(y_pred_test_ic50)

        for cutoff in cutoff_range:
            predicted_sensitivity = (y_pred_test_ic50 < cutoff).astype(int)
            precision = precision_score(true_sensitivity, predicted_sensitivity, zero_division=0)
            recall = recall_score(true_sensitivity, predicted_sensitivity, zero_division=0)

            precisions_cutoff.append(precision)
            recalls_cutoff.append(recall)

        plt.figure(figsize=(7, 4))
        plt.plot(cutoff_range, precisions_cutoff, marker='o', label='Precision', color='blue')
        plt.plot(cutoff_range, recalls_cutoff, marker='x', label='Recall', color='orange')
        plt.title(f"Precision & Recall vs IC50 Cut-off - {drug_name}")
        plt.xlabel("IC50 Cut-off (on predicted values)")
        plt.ylabel("Score")
        plt.ylim(0, 1.05)
        plt.grid(True)
        plt.legend()
        plt.tight_layout()
        plt.savefig(f"{shap_dir}/{drug_name}_precision_recall_curve.png")
        plt.close()

        cm = confusion_matrix(true_sensitivity, y_pred_sensitivity)

        # Save confusion matrix plot
        plt.figure(figsize=(4, 4))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=["Non-sensitive", "Sensitive"], yticklabels=["Non-sensitive", "Sensitive"])
        plt.xlabel("Predicted")
        plt.ylabel("True")
        plt.title(f"Confusion Matrix with cutoff- {drug_name}")
        plt.tight_layout()
        plt.savefig(f"{shap_dir}/{drug_name}_confusion_matrix.png")
        plt.close()



        roc_fig_path = os.path.join(shap_dir, f"{drug_name}_roc_curve.png")
        plt.figure(figsize=(12, 5))

        plt.subplot(1, 2, 1)
        plt.plot(fpr_val, tpr_val, label=f'Validation AUC = {auc_score_test:.3f}')
        plt.plot(fpr_test, tpr_test, label=f'Test AUC = {auc_score_test:.3f}')
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel("False Positive Rate")
        plt.ylabel("True Positive Rate")
        plt.title(f"ROC Curve - {drug_name}")
        plt.legend()

        plt.subplot(1, 2, 2)
        plt.plot(recall_val, precision_val, label=f'Validation AUC-PR = {avg_auc_pr_val:.3f}')
        plt.plot(recall_test, precision_test, label=f'Test AUC-PR = {avg_auc_pr_test:.3f}')
        plt.axvline(x=recall_val[best_idx_val], color='r', linestyle="--", label=f"Best Threshold = {best_threshold_val:.3f}")
        plt.xlabel("Recall")
        plt.ylabel("Precision")
        plt.title(f"Precision-Recall Curve - {drug_name}")
        plt.legend()
        plt.tight_layout()
        plt.savefig(roc_fig_path)
        plt.close()


#### SHAP

In [66]:
def generate_shap_plots(
    models_per_fold,
    X_test,
    test_set,
    df_clean,
    shap_dir,
    drug_name,
    specific_drug
):
        '''
        Function to compute SHAP and to generate plots
        '''

        # Directory SHAP
        shap_dir_drug = os.path.join(shap_dir, f"{drug_name}")
        os.makedirs(shap_dir_drug, exist_ok=True)  

        shap_values_list = []
        print(f"\nCalcolo SHAP per {drug_name} su {len(models_per_fold)} modelli...")


        ## Shap values for each model-cell_line
        for i, model_info in enumerate(models_per_fold):
            model = model_info
            explainer = shap.TreeExplainer(model)
            X_test_aligned = X_test[model.feature_names_in_]  
            shap_values = -explainer.shap_values(X_test_aligned, check_additivity=False) # negative, so it is comparable with the model sensitivity
            if isinstance(shap_values, list):
                sv = shap_values[1] if len(shap_values) > 1 else shap_values[0]
            else:
                sv = shap_values

            if sv.shape[1] != X_test.shape[1]:
                print(f"⚠️ SHAP shape mismatch: got {sv.shape}, expected ({X_test.shape[0]}, {X_test.shape[1]})")
                continue

            shap_values_list.append(sv)

        ## MEAN SHAP for each model
        if shap_values_list:
            shap_df_list = []
            for sv in shap_values_list:
                if sv.ndim == 3:
                    sv = sv[:, :, 1]
                shap_df_list.append(pd.DataFrame(sv, columns=X_test.columns, index=X_test.index))

            mean_shap_df = sum(shap_df_list) / len(shap_df_list)

            for idx, row in X_test.iterrows():
       
                cell_line_id = test_set.loc[idx]["Cell_line_cosmic_identifiers"]
                matching_rows = df_clean[df_clean["Cell_line_cosmic_identifiers"] == cell_line_id]

                if not matching_rows.empty:
                    cell_name = matching_rows["cell_name"].iloc[0] 
                else:
                    print(f" Nessuna riga trovata per cell_line_id {cell_line_id}")
                    continue  

                shap_values_row = mean_shap_df.loc[idx]
                sorted_idx = np.argsort(np.abs(shap_values_row.values))[::-1]
                sorted_features = X_test.columns[sorted_idx]
                sorted_shap = shap_values_row.values[sorted_idx]
                sorted_vals = row.values[sorted_idx]

                if isinstance(explainer.expected_value, (list, np.ndarray)):
                        base_value = explainer.expected_value[1] if len(explainer.expected_value) > 1 else explainer.expected_value[0]
                else:
                        base_value = explainer.expected_value

                ## 1) Waterfall Plot
                shap.waterfall_plot(shap.Explanation(
                        values=sorted_shap,
                        base_values=base_value,  
                        data=sorted_vals,
                        feature_names=sorted_features
                    ))

                plt.suptitle(f"SHAP Waterfall - Cell Line: {cell_name}", fontsize=14)
                fig = plt.gcf()
                fig_path = os.path.join(shap_dir_drug, f"{cell_name}_shap_waterfall.png")
                fig.savefig(fig_path, format="png") 
                plt.close(fig)  


                ## 2) shap_bar_top 5 sensitivity and resistance
                shap_series = shap_values_row.sort_values()
                negativi = shap_series.head(5)
                positivi = shap_series.tail(5)
                combined = pd.concat([negativi, positivi])
                colori = ['lightcoral'] * 5 + ['skyblue'] * 5

                plt.figure(figsize=(5, 4))
                plt.barh(combined.index[::-1], combined.values[::-1], color=colori[::-1])
                plt.axvline(0, color='gray', linestyle='--')
                plt.title(f"Top 5 ± SHAP - {cell_name}", fontsize=10)
                plt.xlabel("SHAP Value (Effect)")
                plt.tight_layout()
                fig_bar_path = os.path.join(shap_dir_drug, f"{cell_name}_shap_bar_top5.png")
                plt.savefig(fig_bar_path, format="png")
                plt.close()

            ### Grafici SHAP sul farmaco ###

            ## shap_top5_mean_bar
            shap_mean = mean_shap_df.mean(axis=0).sort_values()
            top5_neg = shap_mean.head(5)
            top5_pos = shap_mean.tail(5)
            combined = pd.concat([top5_neg, top5_pos])
            colori = ['lightcoral'] * 5 + ['skyblue'] * 5
            plt.figure(figsize=(6, 4))
            plt.barh(combined.index[::-1], combined.values[::-1], color=colori[::-1])
            plt.axvline(0, color='gray', linestyle='--')
            plt.title(f"SHAP Top 5 ± - {drug_name} (media su tutte le cell)", fontsize=11)
            plt.xlabel("SHAP value (mean)")
            plt.tight_layout()
            fig_group_bar_path = os.path.join(shap_dir_drug, f"{drug_name}_shap_top5_mean_bar.png")
            plt.savefig(fig_group_bar_path, format="png")
            plt.close()

            abs_shap_df_list = [df.abs() for df in shap_df_list]
            mean_abs_shap_df = sum(abs_shap_df_list) / len(abs_shap_df_list)
            mean_abs_importance = mean_abs_shap_df.mean(axis=0)
            ## top 20 features
            top20_features = (
                mean_shap_df.abs().mean(axis=0)
                .sort_values(ascending=False)
                .head(20)
            )
            top20_features = mean_abs_importance.sort_values(ascending=False).head(20)
            top20_features_path = os.path.join(shap_dir_drug, "top20_features.csv")
            top20_features.to_csv(top20_features_path, header=["mean_abs_shap_value"])


            ## SHAP summary plot
            top10_indices = mean_abs_importance.sort_values(ascending=False).head(10).index
            top10_shap_values = mean_shap_df[top10_indices].values
            top10_X_test = X_test[top10_indices]

            plt.figure()
            shap.summary_plot(top10_shap_values, top10_X_test, feature_names=top10_indices, show=False)
            plt.title(f"SHAP Summary Plot (Top 10) - {drug_name}")
            plt.tight_layout()
            plt.savefig(os.path.join(shap_dir_drug, "shap_summary_plot.png"))
            plt.close()
            
            ## SHAP bar plot
            shap_bar_fig_path = os.path.join(shap_dir_drug, "shap_bar_plot.png")
            plt.figure()
            shap.summary_plot(mean_shap_df.values, X_test, feature_names=X_test.columns, plot_type="bar", show=False)
            plt.title(f"SHAP Bar Plot (Media) - {drug_name}")
            plt.tight_layout()
            plt.savefig(shap_bar_fig_path)
            plt.close()


### Funzione

In [62]:
def train_ic50(df_clean, rf_params, param_grid, models = None, shap_dir = 'RF_IC50', l=31,smo = False):
    '''
    Train models to predict drug IC50 for multiple drugs, 
    evaluate their performance, generate visualizations, and compute SHAP explanations.

    The function performs the following steps per drug:
    - Splits the dataset by cell lines into training and test sets.
    - Uses K-Fold cross-validation and SMOTE on training data.
    - Applies Optuna for hyperparameter optimization.
    - Evaluates on both validation and test sets (AUC, F1, Precision, Recall).
    - Saves trained models, performance metrics, confusion matrices, ROC/PR curves.
    - Computes SHAP values for test set samples and saves their plots.

    Parameters:
    - df_clean: Preprocessed and cleaned DataFrame .
    - rf_params: Dictionary of fixed parameters.
    - base_models: List of model classes to use per fold (e.g. [RandomForestClassifier, ...]).
    - param_grid: Dictionary of hyperparameters to tune via Optuna.
    - shap_dir: Directory where models, plots, and SHAP outputs will be saved (default 'RF_SENSITIVITY').
    - l: Number of drugs to process (default is 31; starts from index 1 (because the first drug has score None) of `score_df["Drug"]`).

    Returns:
    - Saves per-drug results and visualizations in the specified `shap_dir`.
    - Outputs a summary CSV with metrics per drug.
    - Prints final classification report and cumulative F1 score over all predictions.
    '''

    # Directory
    os.makedirs(shap_dir, exist_ok=True)  

    # Initialize variables
    models_results_ic50 = {}
    y_test_all = []
    y_pred_all = []
    sensitivity_all = []
    df_nan_sens = df_clean[df_clean["Sensitivity"].isna()]  
    df_clean = df_clean[df_clean["Sensitivity"].notna()]  
    y_test_sens_all = []
    y_pred_sens_all = []


    for row in tqdm(score_df.iloc[1:l].itertuples(index=False), desc="Processing Drugs"):
        ## Model for each drug
        specific_drug = row.Drug
        drug_name = row.Drug_name
        df_drug = df_clean[df_clean["Drug_id"] == specific_drug]
        cell_counts = df_drug["Cell_line_cosmic_identifiers"].value_counts()

        # Training and Test
        train_cells, test_cells = train_test_split(
            cell_counts.index,
            test_size=0.2,
            stratify = df_drug.groupby("Cell_line_cosmic_identifiers")["Sensitivity"].apply(lambda x: x.mean()),
            random_state=42
        )

        train_set = df_drug[df_drug["Cell_line_cosmic_identifiers"].isin(train_cells)]
        test_set = df_drug[df_drug["Cell_line_cosmic_identifiers"].isin(test_cells)]
        train_set = pd.concat([train_set, df_nan_sens[df_nan_sens["Drug_id"] == specific_drug]])
        X_test = test_set.drop(columns=["IC50", "Sensitivity", "Cell_line_cosmic_identifiers", 
                                           "Screened_Compounds_", "Sample_Names", "Drug_id","cell_name"])
        y_test_ic50 = test_set["IC50"]
        sensitivities = test_set["Sensitivity"]

        
        '''
        #  If you want to train again the models
        models_per_fold, y_true_val, y_pred_val,y_val_sens = model_ic50(
                                train_set=train_set,
                                train_cells=train_cells,
                                rf_params=rf_params,
                                base_models=models,
                                param_grid=param_grid,
                                smo = smo
                            )
        models_dir = os.path.join(shap_dir, f"{drug_name}_models")
        os.makedirs(models_dir, exist_ok=True)
    
        for fold_idx, model_info in enumerate(models_per_fold):
            model_path = os.path.join(models_dir, f"model_fold_{fold_idx}.pkl")
            joblib.dump(model_info, model_path)
        '''

        # Evaluate models
        models_per_fold, y_true_val, y_pred_val,y_val_sens = testing(train_set, train_cells,shap_dir,drug_name,smo,n_splits=5,random_state = 42)
    
        # Predict IC50
        y_pred_test_ic50 = np.mean([m.predict(X_test) for m in models_per_fold], axis=0)

        # Calculate metrics
        mae = mean_absolute_error(y_test_ic50, y_pred_test_ic50)
        rmse = np.sqrt(mean_squared_error(y_test_ic50, y_pred_test_ic50))
        r2 = r2_score(y_test_ic50, y_pred_test_ic50)
        pearson_corr, _ = pearsonr(y_test_ic50, y_pred_test_ic50)
        y_score_test = -y_pred_test_ic50  
        y_score_val =  -np.array(y_pred_val)
        fpr_test, tpr_test, roc_thresholds = roc_curve(sensitivities, y_score_test)
        fpr_val, tpr_val, roc_thresholds_val = roc_curve(y_val_sens, y_score_val)
        auc_score_test = roc_auc_score(sensitivities, y_score_test)
        auc_score_val = roc_auc_score(y_val_sens, y_score_val)

        # Precision-Recall
        precision_test, recall_test, pr_thresholds = precision_recall_curve(sensitivities, y_score_test)
        avg_auc_pr_test = average_precision_score(sensitivities, y_score_test)

        precision_val, recall_val, pr_thresholds_val = precision_recall_curve(y_val_sens, y_score_val)
        avg_auc_pr_val = average_precision_score(y_val_sens, y_score_val)


        # Calcola F1 per ogni soglia
        f1_scores_val = 2 * (precision_val[:-1] * recall_val[:-1]) / (precision_val[:-1] + recall_val[:-1] + 1e-8)
        best_idx_val = np.argmax(f1_scores_val)
        best_threshold_val = -pr_thresholds_val[best_idx_val]

        y_pred_sensitivity = (y_pred_test_ic50 <= best_threshold_val).astype(int)
        



        gen_plots(
            y_test_ic50,                 
            y_pred_test_ic50,           
            sensitivities,               
            y_pred_sensitivity,         
            fpr_val, tpr_val, auc_score_val,  
            fpr_test, tpr_test, auc_score_test,  
            recall_val, precision_val, avg_auc_pr_val, best_idx_val, best_threshold_val,  
            recall_test, precision_test, avg_auc_pr_test,                                
            drug_name,                    
            shap_dir                    
        )

        generate_shap_plots(
            models_per_fold,
            X_test,
            test_set,
            df_clean,
            shap_dir,
            drug_name,
            specific_drug
        )

        true_sensitivity = np.array(sensitivities)
        f1_test = f1_score(true_sensitivity, y_pred_sensitivity)



      
        y_test_all.extend(y_test_ic50)
        y_pred_all.extend(y_pred_test_ic50)
        y_test_sens_all.extend(true_sensitivity)
        y_pred_sens_all.extend(y_pred_sensitivity)
        sensitivity_all.extend(sensitivities)
        

        models_results_ic50[drug_name] = {
            "MAE_IC50": mae,
            "RMSE_IC50": rmse,
            "R2_IC50": r2,
            "Pearson_IC50": pearson_corr,
            "ROC_AUC": auc_score_test,
            "PR_AUC": avg_auc_pr_test,
            "F1-score": f1_test
        }
        results_df = pd.DataFrame.from_dict(models_results_ic50, orient="index")
        results_df.to_csv(os.path.join(shap_dir, "rf_drug_sensitivity_results.csv"))


        #print(f"\n** Results for {drug_name} **")
        #print(f"IC50 - MAE: {mae:.3f}, RMSE: {rmse:.3f}, R²: {r2:.3f}")
        


    mae_global = mean_absolute_error(y_test_all, y_pred_all)
    rmse_global = np.sqrt(mean_squared_error(y_test_all, y_pred_all))
    r2_global = r2_score(y_test_all, y_pred_all)

    print("\n\n========= GLOBAL METRICS ACROSS ALL DRUGS =========")
    print(f"Global IC50 - MAE: {mae_global:.3f}, RMSE: {rmse_global:.3f}, R²: {r2_global:.3f}")

    
    df_global = pd.DataFrame({
        "True_IC50": y_test_all,
        "Predicted_IC50": y_pred_all,
        "Sensitive": pd.Series(sensitivity_all).map({0: "Non-sensitive", 1: "Sensitive"})
    })

    plt.figure(figsize=(7, 7))
    sns.scatterplot(data=df_global, x="True_IC50", y="Predicted_IC50", hue="Sensitive", alpha=0.5)
    plt.plot([df_global["True_IC50"].min(), df_global["True_IC50"].max()],
            [df_global["True_IC50"].min(), df_global["True_IC50"].max()], 'r--')
    plt.xlabel("True IC50")
    plt.ylabel("Predicted IC50")
    plt.title(f"Global IC50 Prediction Across All Drugs R²: {r2_global:.3f}")
    plt.legend(title="Cell Sensitivity")
    plt.grid(True)
    plt.tight_layout()
    plt.tight_layout()
    plt.savefig(f"{shap_dir}/global_ic50_prediction_rf.png")
    plt.close()

    final_f1 = f1_score(y_test_sens_all, y_pred_sens_all)

    cm_global = confusion_matrix(y_test_sens_all, y_pred_sens_all)

    plt.figure(figsize=(5, 5))
    sns.heatmap(cm_global, annot=True, fmt='d', cmap='Purples', xticklabels=["Non-sensitive", "Sensitive"], yticklabels=["Non-sensitive", "Sensitive"])
    plt.xlabel("Predicted")
    plt.ylabel("True")
    plt.title(f"Global Confusion Matrix (All Drugs) with f1: {final_f1}")
    plt.tight_layout()
    plt.savefig(f"{shap_dir}/global_confusion_matrix_rf.png")
    plt.close()

### NO Oversampling

Here are implemented the function which does not use oversampling

#### XGBoosting

In [64]:
models = [XGBRegressor for _ in range(5)]

rf_params = {
        "random_state": 42,
    }

params = {
    "n_estimators": randint(10, 501),              
    "max_depth": randint(3, 11),                     
    "learning_rate": [1,0.1,0.01,0.3],             
    "subsample": [0.4,0.5,0.6,0.8,1],                
    "colsample_bytree": [0.7,0.9,1],   
    "min_child_weight": randint(1, 11),          
    "gamma": [0,0.1,0.01,0.3,0.5]                         
}

shap_dir = 'Results/Models_IC50/XGB_IC50'
train_ic50(df_clean, rf_params, params, models= models, shap_dir = shap_dir, l=31, smo = False)

Processing Drugs: 0it [00:00, ?it/s]


Calcolo SHAP per KIN001-260 su 5 modelli...


Processing Drugs: 1it [00:34, 34.67s/it]


Calcolo SHAP per TG101348 su 5 modelli...


Processing Drugs: 2it [01:08, 34.40s/it]


Calcolo SHAP per BX-912 su 5 modelli...


Processing Drugs: 3it [01:43, 34.37s/it]


Calcolo SHAP per QL-XI-92 su 5 modelli...


Processing Drugs: 4it [02:14, 33.29s/it]


Calcolo SHAP per Tubastatin A su 5 modelli...


Processing Drugs: 5it [02:47, 33.02s/it]


Calcolo SHAP per GSK690693 su 5 modelli...


Processing Drugs: 6it [03:20, 33.10s/it]


Calcolo SHAP per XMD14-99 su 5 modelli...


Processing Drugs: 6it [04:07, 41.19s/it]


KeyboardInterrupt: 

#### Random Forest

In [67]:
models = [RandomForestRegressor for _ in range(5)]

rf_params = {
        "random_state": 42,
    }

param_grid = {
    "n_estimators": randint(10, 501),               
    "max_depth": [5, 10, 20, None],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(1, 6),
    "max_features": ["sqrt", "log2"]
}

shap_dir = 'Results/Models_IC50/RF_IC50'
train_ic50(df_clean, rf_params, param_grid = param_grid, models= models, shap_dir = shap_dir, l=31, smo = False)

Processing Drugs: 0it [00:00, ?it/s]


Calcolo SHAP per KIN001-260 su 5 modelli...


Processing Drugs: 1it [00:37, 37.33s/it]


Calcolo SHAP per TG101348 su 5 modelli...


Processing Drugs: 2it [01:09, 34.58s/it]


Calcolo SHAP per BX-912 su 5 modelli...


Processing Drugs: 3it [01:46, 35.47s/it]


Calcolo SHAP per QL-XI-92 su 5 modelli...


Processing Drugs: 4it [02:22, 35.84s/it]


Calcolo SHAP per Tubastatin A su 5 modelli...


Processing Drugs: 5it [03:02, 37.29s/it]


Calcolo SHAP per GSK690693 su 5 modelli...


Processing Drugs: 6it [03:39, 37.15s/it]


Calcolo SHAP per XMD14-99 su 5 modelli...


Processing Drugs: 7it [05:03, 52.38s/it]


Calcolo SHAP per NPK76-II-72-1 su 5 modelli...


Processing Drugs: 8it [05:36, 46.27s/it]


Calcolo SHAP per Y-39983 su 5 modelli...


Processing Drugs: 9it [07:00, 58.05s/it]


Calcolo SHAP per AT-7519 su 5 modelli...


Processing Drugs: 10it [07:34, 50.72s/it]


Calcolo SHAP per KIN001-236 su 5 modelli...


Processing Drugs: 11it [08:09, 45.87s/it]


Calcolo SHAP per TL-2-105 su 5 modelli...


Processing Drugs: 12it [08:43, 42.08s/it]


Calcolo SHAP per ABT-263 su 5 modelli...


Processing Drugs: 13it [10:54, 69.25s/it]


Calcolo SHAP per GSK1070916 su 5 modelli...


Processing Drugs: 14it [11:28, 58.36s/it]


Calcolo SHAP per Methotrexate su 5 modelli...


Processing Drugs: 15it [11:58, 49.88s/it]


Calcolo SHAP per TL-1-85 su 5 modelli...


Processing Drugs: 16it [12:32, 45.00s/it]


Calcolo SHAP per T0901317 su 5 modelli...


Processing Drugs: 17it [13:36, 51.00s/it]


Calcolo SHAP per PHA-793887 su 5 modelli...


Processing Drugs: 18it [14:08, 45.01s/it]


Calcolo SHAP per JW-7-24-1 su 5 modelli...


Processing Drugs: 19it [14:40, 41.25s/it]


Calcolo SHAP per TPCA-1 su 5 modelli...


Processing Drugs: 20it [15:13, 38.64s/it]


Calcolo SHAP per CX-5461 su 5 modelli...


Processing Drugs: 21it [15:45, 36.66s/it]


Calcolo SHAP per STF-62247 su 5 modelli...


Processing Drugs: 22it [16:16, 35.10s/it]


Calcolo SHAP per Dabrafenib su 5 modelli...


Processing Drugs: 23it [16:47, 34.00s/it]


Calcolo SHAP per GSK429286A su 5 modelli...


Processing Drugs: 24it [17:24, 34.74s/it]


Calcolo SHAP per Trametinib su 5 modelli...


Processing Drugs: 25it [17:56, 33.84s/it]


Calcolo SHAP per NG-25 su 5 modelli...


Processing Drugs: 26it [18:30, 33.88s/it]


Calcolo SHAP per BIX02189 su 5 modelli...


Processing Drugs: 26it [18:52, 43.57s/it]


KeyboardInterrupt: 

#### Gradient Boosting

In [None]:
models = [GradientBoostingRegressor for _ in range(5)]

rf_params = {
        "random_state": 42,
    }
params = {
    "n_estimators": randint(10, 301),            
    "max_depth": randint(3, 11),                   
    "learning_rate": [1, 0.1, 0.01, 0.3],          
    "subsample": [0.4, 0.5, 0.6, 0.8, 1],          
    "min_samples_split": randint(2, 11),          
    "min_samples_leaf": randint(1, 11),            
    "max_features": ['sqrt', 'log2', None]         
}

shap_dir = 'Results/Models_IC50/GB_IC50'
train_ic50(df_clean, rf_params, params, models= models, shap_dir = shap_dir, l=31, smo = False)

Processing Drugs: 0it [00:00, ?it/s]


** Results for KIN001-260 **
IC50 - MAE: 0.744, RMSE: 0.951, R²: 0.231

Calcolo SHAP per KIN001-260 su 5 modelli...


Processing Drugs: 1it [00:18, 18.40s/it]


** Results for TG101348 **
IC50 - MAE: 0.995, RMSE: 1.261, R²: 0.351

Calcolo SHAP per TG101348 su 5 modelli...


Processing Drugs: 2it [00:36, 18.38s/it]


** Results for BX-912 **
IC50 - MAE: 1.022, RMSE: 1.264, R²: 0.545

Calcolo SHAP per BX-912 su 5 modelli...


Processing Drugs: 3it [00:54, 18.25s/it]


** Results for QL-XI-92 **
IC50 - MAE: 0.859, RMSE: 1.031, R²: 0.386

Calcolo SHAP per QL-XI-92 su 5 modelli...


Processing Drugs: 4it [01:13, 18.24s/it]


** Results for Tubastatin A **
IC50 - MAE: 0.694, RMSE: 0.870, R²: 0.442

Calcolo SHAP per Tubastatin A su 5 modelli...


Processing Drugs: 5it [01:31, 18.25s/it]


** Results for GSK690693 **
IC50 - MAE: 1.042, RMSE: 1.333, R²: 0.268

Calcolo SHAP per GSK690693 su 5 modelli...


Processing Drugs: 6it [01:49, 18.20s/it]


** Results for XMD14-99 **
IC50 - MAE: 0.678, RMSE: 0.881, R²: 0.196

Calcolo SHAP per XMD14-99 su 5 modelli...


Processing Drugs: 7it [02:25, 24.17s/it]


** Results for NPK76-II-72-1 **
IC50 - MAE: 1.134, RMSE: 1.423, R²: 0.305

Calcolo SHAP per NPK76-II-72-1 su 5 modelli...


Processing Drugs: 8it [02:44, 22.27s/it]


** Results for Y-39983 **
IC50 - MAE: 0.947, RMSE: 1.254, R²: 0.288

Calcolo SHAP per Y-39983 su 5 modelli...


Processing Drugs: 9it [03:19, 26.29s/it]


** Results for AT-7519 **
IC50 - MAE: 1.354, RMSE: 1.674, R²: 0.123

Calcolo SHAP per AT-7519 su 5 modelli...


Processing Drugs: 10it [03:37, 23.81s/it]


** Results for KIN001-236 **
IC50 - MAE: 0.634, RMSE: 0.811, R²: 0.301

Calcolo SHAP per KIN001-236 su 5 modelli...


Processing Drugs: 11it [03:56, 22.19s/it]


** Results for TL-2-105 **
IC50 - MAE: 0.839, RMSE: 1.110, R²: 0.405

Calcolo SHAP per TL-2-105 su 5 modelli...


Processing Drugs: 12it [04:14, 21.00s/it]


** Results for ABT-263 **
IC50 - MAE: 1.294, RMSE: 1.611, R²: 0.353

Calcolo SHAP per ABT-263 su 5 modelli...


Processing Drugs: 13it [05:02, 29.21s/it]


** Results for GSK1070916 **
IC50 - MAE: 1.137, RMSE: 1.477, R²: 0.493

Calcolo SHAP per GSK1070916 su 5 modelli...


Processing Drugs: 14it [05:20, 25.86s/it]


** Results for Methotrexate **
IC50 - MAE: 1.097, RMSE: 1.325, R²: 0.318

Calcolo SHAP per Methotrexate su 5 modelli...


Processing Drugs: 15it [05:36, 22.98s/it]


** Results for TL-1-85 **
IC50 - MAE: 1.166, RMSE: 1.465, R²: 0.355

Calcolo SHAP per TL-1-85 su 5 modelli...


Processing Drugs: 16it [05:55, 21.64s/it]


** Results for T0901317 **
IC50 - MAE: 0.688, RMSE: 0.941, R²: 0.268

Calcolo SHAP per T0901317 su 5 modelli...


Processing Drugs: 17it [06:30, 25.74s/it]


** Results for PHA-793887 **
IC50 - MAE: 1.422, RMSE: 1.691, R²: 0.239

Calcolo SHAP per PHA-793887 su 5 modelli...


Processing Drugs: 18it [06:48, 23.52s/it]


** Results for JW-7-24-1 **
IC50 - MAE: 1.030, RMSE: 1.356, R²: 0.307

Calcolo SHAP per JW-7-24-1 su 5 modelli...


Processing Drugs: 19it [07:07, 21.87s/it]


** Results for TPCA-1 **
IC50 - MAE: 1.162, RMSE: 1.453, R²: 0.337

Calcolo SHAP per TPCA-1 su 5 modelli...


Processing Drugs: 20it [07:25, 20.72s/it]


** Results for CX-5461 **
IC50 - MAE: 1.417, RMSE: 1.711, R²: 0.264

Calcolo SHAP per CX-5461 su 5 modelli...


Processing Drugs: 21it [07:42, 19.82s/it]


** Results for STF-62247 **
IC50 - MAE: 0.621, RMSE: 0.801, R²: 0.282

Calcolo SHAP per STF-62247 su 5 modelli...


Processing Drugs: 22it [08:00, 19.28s/it]


** Results for Dabrafenib **
IC50 - MAE: 1.146, RMSE: 1.648, R²: 0.217

Calcolo SHAP per Dabrafenib su 5 modelli...


Processing Drugs: 23it [08:17, 18.66s/it]


** Results for GSK429286A **
IC50 - MAE: 0.767, RMSE: 1.037, R²: 0.402

Calcolo SHAP per GSK429286A su 5 modelli...


Processing Drugs: 24it [08:36, 18.61s/it]


** Results for Trametinib **
IC50 - MAE: 1.757, RMSE: 2.293, R²: 0.230

Calcolo SHAP per Trametinib su 5 modelli...


Processing Drugs: 25it [08:53, 18.28s/it]


** Results for NG-25 **
IC50 - MAE: 1.128, RMSE: 1.432, R²: 0.342

Calcolo SHAP per NG-25 su 5 modelli...


Processing Drugs: 26it [09:11, 18.19s/it]


** Results for BIX02189 **
IC50 - MAE: 0.743, RMSE: 0.998, R²: 0.397

Calcolo SHAP per BIX02189 su 5 modelli...


Processing Drugs: 27it [09:29, 18.12s/it]


** Results for PIK-93 **
IC50 - MAE: 1.110, RMSE: 1.415, R²: 0.284

Calcolo SHAP per PIK-93 su 5 modelli...


Processing Drugs: 28it [10:05, 23.40s/it]


** Results for XMD15-27 **
IC50 - MAE: 0.529, RMSE: 0.701, R²: 0.440

Calcolo SHAP per XMD15-27 su 5 modelli...


Processing Drugs: 29it [11:36, 43.60s/it]


** Results for AC220 **
IC50 - MAE: 0.664, RMSE: 1.061, R²: 0.275

Calcolo SHAP per AC220 su 5 modelli...


Processing Drugs: 30it [11:55, 23.84s/it]




Global IC50 - MAE: 0.944, RMSE: 1.267, R²: 0.629


#### ALL

In [None]:
models = [XGBRegressor, RandomForestRegressor, GradientBoostingRegressor, RandomForestRegressor,XGBRegressor ]
rf_params = {
        "random_state": 42,
    }

params = {
    "n_estimators": randint(10, 501),              
    "max_depth": [5, 7, 10, 20, None],
}

shap_dir = 'Results/Models_IC50/GB_XGB_RF'

train_ic50(df_clean, rf_params, params, models= models, shap_dir = shap_dir, l=31, smo = False)

Processing Drugs: 0it [00:00, ?it/s]


** Results for KIN001-260 **
IC50 - MAE: 0.747, RMSE: 0.941, R²: 0.247

Calcolo SHAP per KIN001-260 su 5 modelli...


Processing Drugs: 1it [00:23, 23.47s/it]


** Results for TG101348 **
IC50 - MAE: 1.038, RMSE: 1.309, R²: 0.300

Calcolo SHAP per TG101348 su 5 modelli...


Processing Drugs: 2it [00:42, 21.09s/it]


** Results for BX-912 **
IC50 - MAE: 1.050, RMSE: 1.289, R²: 0.527

Calcolo SHAP per BX-912 su 5 modelli...


Processing Drugs: 3it [01:02, 20.43s/it]


** Results for QL-XI-92 **
IC50 - MAE: 0.865, RMSE: 1.040, R²: 0.375

Calcolo SHAP per QL-XI-92 su 5 modelli...


Processing Drugs: 4it [01:24, 21.10s/it]


** Results for Tubastatin A **
IC50 - MAE: 0.703, RMSE: 0.871, R²: 0.441

Calcolo SHAP per Tubastatin A su 5 modelli...


Processing Drugs: 5it [01:43, 20.16s/it]


** Results for GSK690693 **
IC50 - MAE: 1.036, RMSE: 1.295, R²: 0.310

Calcolo SHAP per GSK690693 su 5 modelli...


Processing Drugs: 6it [02:01, 19.53s/it]


** Results for XMD14-99 **
IC50 - MAE: 0.676, RMSE: 0.885, R²: 0.189

Calcolo SHAP per XMD14-99 su 5 modelli...


Processing Drugs: 7it [02:41, 26.29s/it]


** Results for NPK76-II-72-1 **
IC50 - MAE: 1.125, RMSE: 1.389, R²: 0.338

Calcolo SHAP per NPK76-II-72-1 su 5 modelli...


Processing Drugs: 8it [03:01, 24.12s/it]


** Results for Y-39983 **
IC50 - MAE: 0.946, RMSE: 1.261, R²: 0.280

Calcolo SHAP per Y-39983 su 5 modelli...


Processing Drugs: 9it [03:41, 29.25s/it]


** Results for AT-7519 **
IC50 - MAE: 1.378, RMSE: 1.694, R²: 0.102

Calcolo SHAP per AT-7519 su 5 modelli...


Processing Drugs: 10it [04:02, 26.73s/it]


** Results for KIN001-236 **
IC50 - MAE: 0.646, RMSE: 0.824, R²: 0.279

Calcolo SHAP per KIN001-236 su 5 modelli...


Processing Drugs: 11it [04:22, 24.68s/it]


** Results for TL-2-105 **
IC50 - MAE: 0.842, RMSE: 1.085, R²: 0.432

Calcolo SHAP per TL-2-105 su 5 modelli...


Processing Drugs: 12it [04:43, 23.49s/it]


** Results for ABT-263 **
IC50 - MAE: 1.350, RMSE: 1.672, R²: 0.303

Calcolo SHAP per ABT-263 su 5 modelli...


Processing Drugs: 13it [05:36, 32.38s/it]


** Results for GSK1070916 **
IC50 - MAE: 1.099, RMSE: 1.449, R²: 0.512

Calcolo SHAP per GSK1070916 su 5 modelli...


Processing Drugs: 14it [05:59, 29.58s/it]


** Results for Methotrexate **
IC50 - MAE: 1.113, RMSE: 1.337, R²: 0.306

Calcolo SHAP per Methotrexate su 5 modelli...


Processing Drugs: 15it [06:16, 25.71s/it]


** Results for TL-1-85 **
IC50 - MAE: 1.156, RMSE: 1.421, R²: 0.394

Calcolo SHAP per TL-1-85 su 5 modelli...


Processing Drugs: 16it [06:34, 23.48s/it]


** Results for T0901317 **
IC50 - MAE: 0.689, RMSE: 0.939, R²: 0.270

Calcolo SHAP per T0901317 su 5 modelli...


Processing Drugs: 17it [07:11, 27.50s/it]


** Results for PHA-793887 **
IC50 - MAE: 1.451, RMSE: 1.708, R²: 0.224

Calcolo SHAP per PHA-793887 su 5 modelli...


Processing Drugs: 18it [07:29, 24.77s/it]


** Results for JW-7-24-1 **
IC50 - MAE: 1.069, RMSE: 1.363, R²: 0.300

Calcolo SHAP per JW-7-24-1 su 5 modelli...


Processing Drugs: 19it [07:48, 22.87s/it]


** Results for TPCA-1 **
IC50 - MAE: 1.173, RMSE: 1.474, R²: 0.318

Calcolo SHAP per TPCA-1 su 5 modelli...


Processing Drugs: 20it [08:08, 22.10s/it]


** Results for CX-5461 **
IC50 - MAE: 1.410, RMSE: 1.705, R²: 0.269

Calcolo SHAP per CX-5461 su 5 modelli...


Processing Drugs: 21it [08:33, 23.01s/it]


** Results for STF-62247 **
IC50 - MAE: 0.620, RMSE: 0.798, R²: 0.287

Calcolo SHAP per STF-62247 su 5 modelli...


Processing Drugs: 22it [08:52, 21.67s/it]


** Results for Dabrafenib **
IC50 - MAE: 1.144, RMSE: 1.657, R²: 0.208

Calcolo SHAP per Dabrafenib su 5 modelli...


Processing Drugs: 23it [09:09, 20.42s/it]


** Results for GSK429286A **
IC50 - MAE: 0.788, RMSE: 1.049, R²: 0.388

Calcolo SHAP per GSK429286A su 5 modelli...


Processing Drugs: 24it [09:29, 20.33s/it]


** Results for Trametinib **
IC50 - MAE: 1.778, RMSE: 2.337, R²: 0.200

Calcolo SHAP per Trametinib su 5 modelli...


Processing Drugs: 25it [09:52, 21.02s/it]


** Results for NG-25 **
IC50 - MAE: 1.155, RMSE: 1.438, R²: 0.337

Calcolo SHAP per NG-25 su 5 modelli...


Processing Drugs: 26it [10:11, 20.45s/it]


** Results for BIX02189 **
IC50 - MAE: 0.752, RMSE: 0.998, R²: 0.397

Calcolo SHAP per BIX02189 su 5 modelli...


Processing Drugs: 27it [10:33, 20.97s/it]


** Results for PIK-93 **
IC50 - MAE: 1.136, RMSE: 1.428, R²: 0.270

Calcolo SHAP per PIK-93 su 5 modelli...


Processing Drugs: 28it [11:10, 25.80s/it]


** Results for XMD15-27 **
IC50 - MAE: 0.530, RMSE: 0.695, R²: 0.450

Calcolo SHAP per XMD15-27 su 5 modelli...


Processing Drugs: 29it [12:53, 49.00s/it]


** Results for AC220 **
IC50 - MAE: 0.659, RMSE: 1.072, R²: 0.260

Calcolo SHAP per AC220 su 5 modelli...


Processing Drugs: 30it [13:17, 26.59s/it]




Global IC50 - MAE: 0.955, RMSE: 1.276, R²: 0.624


### Oversampling

#### XGBoosting

In [None]:
models = [XGBRegressor for _ in range(5)]

rf_params = {
        "random_state": 42,
    }
params = {
    "n_estimators": randint(10, 501),              
    "max_depth": randint(3, 11),                     
    "learning_rate": [1,0.1,0.01,0.3],             
    "subsample": [0.4,0.5,0.6,0.8,1],                
    "colsample_bytree": [0.7,0.9,1],   
    "min_child_weight": randint(1, 11),          
    "gamma": [0,0.1,0.01,0.3,0.5]                         
}

shap_dir = 'Results/Models_IC50_SMOTE/XGB_IC50'
train_ic50(df_clean, rf_params, params, models= models, shap_dir = shap_dir, l=31, smo = True)

Processing Drugs: 0it [00:00, ?it/s]


** Results for KIN001-260 **
IC50 - MAE: 0.761, RMSE: 0.926, R²: 0.271

Calcolo SHAP per KIN001-260 su 5 modelli...


Processing Drugs: 1it [00:19, 19.09s/it]


** Results for TG101348 **
IC50 - MAE: 1.149, RMSE: 1.515, R²: 0.064

Calcolo SHAP per TG101348 su 5 modelli...


Processing Drugs: 2it [00:37, 18.79s/it]


** Results for BX-912 **
IC50 - MAE: 1.339, RMSE: 1.664, R²: 0.211

Calcolo SHAP per BX-912 su 5 modelli...


Processing Drugs: 3it [00:56, 18.78s/it]


** Results for QL-XI-92 **
IC50 - MAE: 0.905, RMSE: 1.066, R²: 0.343

Calcolo SHAP per QL-XI-92 su 5 modelli...


Processing Drugs: 4it [01:15, 18.75s/it]


** Results for Tubastatin A **
IC50 - MAE: 0.751, RMSE: 0.903, R²: 0.399

Calcolo SHAP per Tubastatin A su 5 modelli...


Processing Drugs: 5it [01:33, 18.70s/it]


** Results for GSK690693 **
IC50 - MAE: 1.000, RMSE: 1.266, R²: 0.341

Calcolo SHAP per GSK690693 su 5 modelli...


Processing Drugs: 6it [01:52, 18.68s/it]


** Results for XMD14-99 **
IC50 - MAE: 0.723, RMSE: 0.910, R²: 0.143

Calcolo SHAP per XMD14-99 su 5 modelli...


Processing Drugs: 7it [02:28, 24.27s/it]


** Results for NPK76-II-72-1 **
IC50 - MAE: 1.388, RMSE: 1.698, R²: 0.011

Calcolo SHAP per NPK76-II-72-1 su 5 modelli...


Processing Drugs: 8it [02:46, 22.54s/it]


** Results for Y-39983 **
IC50 - MAE: 0.972, RMSE: 1.267, R²: 0.273

Calcolo SHAP per Y-39983 su 5 modelli...


Processing Drugs: 9it [03:22, 26.70s/it]


** Results for AT-7519 **
IC50 - MAE: 1.742, RMSE: 2.170, R²: -0.473

Calcolo SHAP per AT-7519 su 5 modelli...


Processing Drugs: 10it [03:41, 24.20s/it]


** Results for KIN001-236 **
IC50 - MAE: 0.735, RMSE: 0.908, R²: 0.125

Calcolo SHAP per KIN001-236 su 5 modelli...


Processing Drugs: 11it [04:00, 22.48s/it]


** Results for TL-2-105 **
IC50 - MAE: 0.987, RMSE: 1.222, R²: 0.280

Calcolo SHAP per TL-2-105 su 5 modelli...


Processing Drugs: 12it [04:18, 21.36s/it]


** Results for ABT-263 **
IC50 - MAE: 1.450, RMSE: 1.715, R²: 0.266

Calcolo SHAP per ABT-263 su 5 modelli...


Processing Drugs: 13it [05:06, 29.23s/it]


** Results for GSK1070916 **
IC50 - MAE: 1.286, RMSE: 1.625, R²: 0.387

Calcolo SHAP per GSK1070916 su 5 modelli...


Processing Drugs: 14it [05:24, 25.94s/it]


** Results for Methotrexate **
IC50 - MAE: 1.386, RMSE: 1.617, R²: -0.015

Calcolo SHAP per Methotrexate su 5 modelli...


Processing Drugs: 15it [05:41, 23.23s/it]


** Results for TL-1-85 **
IC50 - MAE: 1.358, RMSE: 1.658, R²: 0.174

Calcolo SHAP per TL-1-85 su 5 modelli...


Processing Drugs: 16it [06:00, 21.89s/it]


** Results for T0901317 **
IC50 - MAE: 0.901, RMSE: 1.128, R²: -0.052

Calcolo SHAP per T0901317 su 5 modelli...


Processing Drugs: 17it [06:35, 25.98s/it]


** Results for PHA-793887 **
IC50 - MAE: 1.782, RMSE: 2.177, R²: -0.262

Calcolo SHAP per PHA-793887 su 5 modelli...


Processing Drugs: 18it [06:54, 23.77s/it]


** Results for JW-7-24-1 **
IC50 - MAE: 1.180, RMSE: 1.580, R²: 0.059

Calcolo SHAP per JW-7-24-1 su 5 modelli...


Processing Drugs: 19it [07:13, 22.23s/it]


** Results for TPCA-1 **
IC50 - MAE: 1.476, RMSE: 1.836, R²: -0.059

Calcolo SHAP per TPCA-1 su 5 modelli...


Processing Drugs: 20it [07:31, 21.15s/it]


** Results for CX-5461 **
IC50 - MAE: 1.491, RMSE: 1.804, R²: 0.182

Calcolo SHAP per CX-5461 su 5 modelli...


Processing Drugs: 21it [07:50, 20.32s/it]


** Results for STF-62247 **
IC50 - MAE: 0.753, RMSE: 0.900, R²: 0.094

Calcolo SHAP per STF-62247 su 5 modelli...


Processing Drugs: 22it [08:08, 19.80s/it]


** Results for Dabrafenib **
IC50 - MAE: 1.198, RMSE: 1.676, R²: 0.190

Calcolo SHAP per Dabrafenib su 5 modelli...


Processing Drugs: 23it [08:26, 19.16s/it]


** Results for GSK429286A **
IC50 - MAE: 1.009, RMSE: 1.265, R²: 0.110

Calcolo SHAP per GSK429286A su 5 modelli...


Processing Drugs: 24it [08:44, 19.01s/it]


** Results for Trametinib **
IC50 - MAE: 1.716, RMSE: 2.306, R²: 0.221

Calcolo SHAP per Trametinib su 5 modelli...


Processing Drugs: 25it [09:03, 18.75s/it]


** Results for NG-25 **
IC50 - MAE: 1.179, RMSE: 1.537, R²: 0.242

Calcolo SHAP per NG-25 su 5 modelli...


Processing Drugs: 26it [09:21, 18.74s/it]


** Results for BIX02189 **
IC50 - MAE: 1.061, RMSE: 1.320, R²: -0.055

Calcolo SHAP per BIX02189 su 5 modelli...


Processing Drugs: 27it [09:40, 18.68s/it]


** Results for PIK-93 **
IC50 - MAE: 1.443, RMSE: 1.791, R²: -0.148

Calcolo SHAP per PIK-93 su 5 modelli...


Processing Drugs: 28it [10:15, 23.73s/it]


** Results for XMD15-27 **
IC50 - MAE: 0.685, RMSE: 0.854, R²: 0.170

Calcolo SHAP per XMD15-27 su 5 modelli...


Processing Drugs: 29it [11:43, 43.04s/it]


** Results for AC220 **
IC50 - MAE: 0.851, RMSE: 1.145, R²: 0.155

Calcolo SHAP per AC220 su 5 modelli...


Processing Drugs: 30it [12:03, 24.11s/it]




Global IC50 - MAE: 1.104, RMSE: 1.440, R²: 0.521


#### Random Forest

In [None]:
models = [RandomForestRegressor for _ in range(5)]

rf_params = {
        "random_state": 42,
    }

param_grid = {
    "max_depth": [5, 10, 20, None],
    "min_samples_split": randint(2, 11),
    "min_samples_leaf": randint(1, 6),
    "max_features": ["sqrt", "log2"]
}

shap_dir = 'Results/Models_IC50_SMOTE/RF_IC50'
train_ic50(df_clean, rf_params, param_grid = param_grid, models= models, shap_dir = shap_dir, l=31, smo = True)

Processing Drugs: 0it [00:00, ?it/s]


** Results for KIN001-260 **
IC50 - MAE: 0.729, RMSE: 0.903, R²: 0.307

Calcolo SHAP per KIN001-260 su 5 modelli...


Processing Drugs: 1it [00:19, 19.31s/it]


** Results for TG101348 **
IC50 - MAE: 1.016, RMSE: 1.301, R²: 0.310

Calcolo SHAP per TG101348 su 5 modelli...


Processing Drugs: 2it [00:38, 19.08s/it]


** Results for BX-912 **
IC50 - MAE: 1.068, RMSE: 1.331, R²: 0.495

Calcolo SHAP per BX-912 su 5 modelli...


Processing Drugs: 3it [00:56, 18.73s/it]


** Results for QL-XI-92 **
IC50 - MAE: 0.921, RMSE: 1.083, R²: 0.323

Calcolo SHAP per QL-XI-92 su 5 modelli...


Processing Drugs: 4it [01:15, 18.83s/it]


** Results for Tubastatin A **
IC50 - MAE: 0.752, RMSE: 0.904, R²: 0.397

Calcolo SHAP per Tubastatin A su 5 modelli...


Processing Drugs: 5it [01:34, 18.71s/it]


** Results for GSK690693 **
IC50 - MAE: 0.997, RMSE: 1.253, R²: 0.354

Calcolo SHAP per GSK690693 su 5 modelli...


Processing Drugs: 6it [01:52, 18.66s/it]


** Results for XMD14-99 **
IC50 - MAE: 0.683, RMSE: 0.888, R²: 0.183

Calcolo SHAP per XMD14-99 su 5 modelli...


Processing Drugs: 7it [02:29, 24.52s/it]


** Results for NPK76-II-72-1 **
IC50 - MAE: 1.106, RMSE: 1.350, R²: 0.375

Calcolo SHAP per NPK76-II-72-1 su 5 modelli...


Processing Drugs: 8it [02:47, 22.69s/it]


** Results for Y-39983 **
IC50 - MAE: 0.970, RMSE: 1.262, R²: 0.279

Calcolo SHAP per Y-39983 su 5 modelli...


Processing Drugs: 9it [03:30, 28.80s/it]


** Results for AT-7519 **
IC50 - MAE: 1.372, RMSE: 1.689, R²: 0.107

Calcolo SHAP per AT-7519 su 5 modelli...


Processing Drugs: 10it [03:49, 25.87s/it]


** Results for KIN001-236 **
IC50 - MAE: 0.631, RMSE: 0.792, R²: 0.335

Calcolo SHAP per KIN001-236 su 5 modelli...


Processing Drugs: 11it [04:07, 23.58s/it]


** Results for TL-2-105 **
IC50 - MAE: 0.874, RMSE: 1.108, R²: 0.408

Calcolo SHAP per TL-2-105 su 5 modelli...


Processing Drugs: 12it [04:26, 22.07s/it]


** Results for ABT-263 **
IC50 - MAE: 1.429, RMSE: 1.705, R²: 0.275

Calcolo SHAP per ABT-263 su 5 modelli...


Processing Drugs: 13it [05:13, 29.65s/it]


** Results for GSK1070916 **
IC50 - MAE: 1.226, RMSE: 1.562, R²: 0.433

Calcolo SHAP per GSK1070916 su 5 modelli...


Processing Drugs: 14it [05:32, 26.39s/it]


** Results for Methotrexate **
IC50 - MAE: 1.157, RMSE: 1.370, R²: 0.271

Calcolo SHAP per Methotrexate su 5 modelli...


Processing Drugs: 15it [05:48, 23.38s/it]


** Results for TL-1-85 **
IC50 - MAE: 1.140, RMSE: 1.406, R²: 0.407

Calcolo SHAP per TL-1-85 su 5 modelli...


Processing Drugs: 16it [06:07, 22.10s/it]


** Results for T0901317 **
IC50 - MAE: 0.707, RMSE: 0.939, R²: 0.271

Calcolo SHAP per T0901317 su 5 modelli...


Processing Drugs: 17it [06:44, 26.35s/it]


** Results for PHA-793887 **
IC50 - MAE: 1.502, RMSE: 1.759, R²: 0.176

Calcolo SHAP per PHA-793887 su 5 modelli...


Processing Drugs: 18it [07:02, 24.05s/it]


** Results for JW-7-24-1 **
IC50 - MAE: 1.015, RMSE: 1.385, R²: 0.277

Calcolo SHAP per JW-7-24-1 su 5 modelli...


Processing Drugs: 19it [07:21, 22.30s/it]


** Results for TPCA-1 **
IC50 - MAE: 1.183, RMSE: 1.468, R²: 0.323

Calcolo SHAP per TPCA-1 su 5 modelli...


Processing Drugs: 20it [07:39, 21.17s/it]


** Results for CX-5461 **
IC50 - MAE: 1.456, RMSE: 1.755, R²: 0.226

Calcolo SHAP per CX-5461 su 5 modelli...


Processing Drugs: 21it [07:58, 20.41s/it]


** Results for STF-62247 **
IC50 - MAE: 0.644, RMSE: 0.795, R²: 0.293

Calcolo SHAP per STF-62247 su 5 modelli...


Processing Drugs: 22it [08:16, 19.79s/it]


** Results for Dabrafenib **
IC50 - MAE: 1.256, RMSE: 1.723, R²: 0.144

Calcolo SHAP per Dabrafenib su 5 modelli...


Processing Drugs: 23it [08:35, 19.55s/it]


** Results for GSK429286A **
IC50 - MAE: 0.839, RMSE: 1.077, R²: 0.354

Calcolo SHAP per GSK429286A su 5 modelli...


Processing Drugs: 24it [09:02, 21.61s/it]


** Results for Trametinib **
IC50 - MAE: 1.820, RMSE: 2.316, R²: 0.214

Calcolo SHAP per Trametinib su 5 modelli...


Processing Drugs: 25it [09:31, 23.90s/it]


** Results for NG-25 **
IC50 - MAE: 1.107, RMSE: 1.374, R²: 0.394

Calcolo SHAP per NG-25 su 5 modelli...


Processing Drugs: 26it [10:02, 26.06s/it]


** Results for BIX02189 **
IC50 - MAE: 0.739, RMSE: 0.996, R²: 0.399

Calcolo SHAP per BIX02189 su 5 modelli...


Processing Drugs: 27it [10:31, 27.00s/it]


** Results for PIK-93 **
IC50 - MAE: 1.159, RMSE: 1.475, R²: 0.222

Calcolo SHAP per PIK-93 su 5 modelli...


Processing Drugs: 28it [11:28, 35.87s/it]


** Results for XMD15-27 **
IC50 - MAE: 0.557, RMSE: 0.726, R²: 0.399

Calcolo SHAP per XMD15-27 su 5 modelli...


Processing Drugs: 29it [13:47, 66.93s/it]


** Results for AC220 **
IC50 - MAE: 0.721, RMSE: 1.033, R²: 0.312

Calcolo SHAP per AC220 su 5 modelli...


Processing Drugs: 30it [14:18, 28.61s/it]




Global IC50 - MAE: 0.979, RMSE: 1.290, R²: 0.616


#### Gradiend Boosting

In [None]:
models = [GradientBoostingRegressor for _ in range(5)]

rf_params = {
        "random_state": 42,
    }

params = {
    "n_estimators": randint(10, 301),            
    "max_depth": randint(3, 11),                   
    "learning_rate": [1, 0.1, 0.01, 0.3],          
    "subsample": [0.4, 0.5, 0.6, 0.8, 1],          
    "min_samples_split": randint(2, 11),          
    "min_samples_leaf": randint(1, 11),            
    "max_features": ['sqrt', 'log2', None]         
}

shap_dir = 'Results/Models_IC50_SMOTE/GB_IC50'
train_ic50(df_clean, rf_params, params, models= models, shap_dir = shap_dir, l=31, smo = True)

Processing Drugs: 0it [00:00, ?it/s]


** Results for KIN001-260 **
IC50 - MAE: 0.782, RMSE: 0.944, R²: 0.243

Calcolo SHAP per KIN001-260 su 5 modelli...


Processing Drugs: 1it [00:28, 28.96s/it]


** Results for TG101348 **
IC50 - MAE: 1.098, RMSE: 1.459, R²: 0.132

Calcolo SHAP per TG101348 su 5 modelli...


Processing Drugs: 2it [00:58, 29.53s/it]


** Results for BX-912 **
IC50 - MAE: 1.183, RMSE: 1.472, R²: 0.383

Calcolo SHAP per BX-912 su 5 modelli...


Processing Drugs: 3it [01:27, 29.18s/it]


** Results for QL-XI-92 **
IC50 - MAE: 0.883, RMSE: 1.053, R²: 0.360

Calcolo SHAP per QL-XI-92 su 5 modelli...


Processing Drugs: 4it [01:57, 29.57s/it]


** Results for Tubastatin A **
IC50 - MAE: 0.788, RMSE: 0.936, R²: 0.354

Calcolo SHAP per Tubastatin A su 5 modelli...


Processing Drugs: 5it [02:26, 29.27s/it]


** Results for GSK690693 **
IC50 - MAE: 1.020, RMSE: 1.289, R²: 0.316

Calcolo SHAP per GSK690693 su 5 modelli...


Processing Drugs: 6it [02:55, 29.03s/it]


** Results for XMD14-99 **
IC50 - MAE: 0.697, RMSE: 0.884, R²: 0.191

Calcolo SHAP per XMD14-99 su 5 modelli...


Processing Drugs: 7it [03:51, 37.88s/it]


** Results for NPK76-II-72-1 **
IC50 - MAE: 1.296, RMSE: 1.576, R²: 0.148

Calcolo SHAP per NPK76-II-72-1 su 5 modelli...


Processing Drugs: 8it [04:19, 34.94s/it]


** Results for Y-39983 **
IC50 - MAE: 0.970, RMSE: 1.266, R²: 0.274

Calcolo SHAP per Y-39983 su 5 modelli...


Processing Drugs: 9it [05:20, 42.84s/it]


** Results for AT-7519 **
IC50 - MAE: 1.526, RMSE: 1.892, R²: -0.120

Calcolo SHAP per AT-7519 su 5 modelli...


Processing Drugs: 10it [05:48, 38.45s/it]


** Results for KIN001-236 **
IC50 - MAE: 0.723, RMSE: 0.893, R²: 0.153

Calcolo SHAP per KIN001-236 su 5 modelli...


Processing Drugs: 11it [06:18, 35.92s/it]


** Results for TL-2-105 **
IC50 - MAE: 0.976, RMSE: 1.203, R²: 0.302

Calcolo SHAP per TL-2-105 su 5 modelli...


Processing Drugs: 12it [06:49, 34.26s/it]


** Results for ABT-263 **
IC50 - MAE: 1.506, RMSE: 1.766, R²: 0.223

Calcolo SHAP per ABT-263 su 5 modelli...


Processing Drugs: 13it [08:05, 46.89s/it]


** Results for GSK1070916 **
IC50 - MAE: 1.403, RMSE: 1.731, R²: 0.304

Calcolo SHAP per GSK1070916 su 5 modelli...


Processing Drugs: 14it [08:34, 41.57s/it]


** Results for Methotrexate **
IC50 - MAE: 1.321, RMSE: 1.560, R²: 0.055

Calcolo SHAP per Methotrexate su 5 modelli...


Processing Drugs: 15it [09:00, 36.96s/it]


** Results for TL-1-85 **
IC50 - MAE: 1.231, RMSE: 1.539, R²: 0.289

Calcolo SHAP per TL-1-85 su 5 modelli...


Processing Drugs: 16it [09:30, 34.64s/it]


** Results for T0901317 **
IC50 - MAE: 0.918, RMSE: 1.130, R²: -0.057

Calcolo SHAP per T0901317 su 5 modelli...


Processing Drugs: 17it [10:26, 41.05s/it]


** Results for PHA-793887 **
IC50 - MAE: 1.753, RMSE: 2.138, R²: -0.217

Calcolo SHAP per PHA-793887 su 5 modelli...


Processing Drugs: 18it [10:54, 37.37s/it]


** Results for JW-7-24-1 **
IC50 - MAE: 1.149, RMSE: 1.551, R²: 0.093

Calcolo SHAP per JW-7-24-1 su 5 modelli...


Processing Drugs: 19it [11:23, 34.86s/it]


** Results for TPCA-1 **
IC50 - MAE: 1.422, RMSE: 1.775, R²: 0.010

Calcolo SHAP per TPCA-1 su 5 modelli...


Processing Drugs: 20it [11:52, 33.12s/it]


** Results for CX-5461 **
IC50 - MAE: 1.435, RMSE: 1.743, R²: 0.236

Calcolo SHAP per CX-5461 su 5 modelli...


Processing Drugs: 21it [12:21, 31.73s/it]


** Results for STF-62247 **
IC50 - MAE: 0.761, RMSE: 0.907, R²: 0.079

Calcolo SHAP per STF-62247 su 5 modelli...


Processing Drugs: 22it [12:50, 30.81s/it]


** Results for Dabrafenib **
IC50 - MAE: 1.198, RMSE: 1.688, R²: 0.178

Calcolo SHAP per Dabrafenib su 5 modelli...


Processing Drugs: 23it [13:18, 30.07s/it]


** Results for GSK429286A **
IC50 - MAE: 0.975, RMSE: 1.228, R²: 0.162

Calcolo SHAP per GSK429286A su 5 modelli...


Processing Drugs: 24it [13:47, 29.79s/it]


** Results for Trametinib **
IC50 - MAE: 1.760, RMSE: 2.302, R²: 0.224

Calcolo SHAP per Trametinib su 5 modelli...


Processing Drugs: 25it [14:15, 29.24s/it]


** Results for NG-25 **
IC50 - MAE: 1.162, RMSE: 1.484, R²: 0.293

Calcolo SHAP per NG-25 su 5 modelli...


Processing Drugs: 26it [14:45, 29.35s/it]


** Results for BIX02189 **
IC50 - MAE: 0.946, RMSE: 1.205, R²: 0.121

Calcolo SHAP per BIX02189 su 5 modelli...


Processing Drugs: 27it [15:14, 29.30s/it]


** Results for PIK-93 **
IC50 - MAE: 1.403, RMSE: 1.757, R²: -0.104

Calcolo SHAP per PIK-93 su 5 modelli...


Processing Drugs: 28it [16:09, 36.93s/it]


** Results for XMD15-27 **
IC50 - MAE: 0.672, RMSE: 0.841, R²: 0.193

Calcolo SHAP per XMD15-27 su 5 modelli...


Processing Drugs: 29it [18:22, 65.95s/it]


** Results for AC220 **
IC50 - MAE: 0.792, RMSE: 1.081, R²: 0.247

Calcolo SHAP per AC220 su 5 modelli...


Processing Drugs: 30it [18:52, 37.75s/it]




Global IC50 - MAE: 1.081, RMSE: 1.407, R²: 0.543


#### ALL

In [None]:
models = [XGBRegressor, RandomForestRegressor, GradientBoostingRegressor, RandomForestRegressor,XGBRegressor ]
rf_params = {
        "random_state": 42,
    }

params = {
    "n_estimators": randint(10, 501),              
    "max_depth": [5, 7, 10, 20, None],
}

shap_dir = 'Results/Models_IC50_SMOTE/GB_XGB_RF'

train_ic50(df_clean, rf_params, params, models= models, shap_dir = shap_dir, l=31, smo = True)

Processing Drugs: 0it [00:00, ?it/s]


** Results for KIN001-260 **
IC50 - MAE: 0.737, RMSE: 0.923, R²: 0.275

Calcolo SHAP per KIN001-260 su 5 modelli...


Processing Drugs: 1it [00:35, 35.32s/it]


** Results for TG101348 **
IC50 - MAE: 0.992, RMSE: 1.260, R²: 0.352

Calcolo SHAP per TG101348 su 5 modelli...


Processing Drugs: 2it [01:13, 36.81s/it]


** Results for BX-912 **
IC50 - MAE: 1.050, RMSE: 1.295, R²: 0.522

Calcolo SHAP per BX-912 su 5 modelli...


Processing Drugs: 3it [01:49, 36.52s/it]


** Results for QL-XI-92 **
IC50 - MAE: 0.913, RMSE: 1.074, R²: 0.333

Calcolo SHAP per QL-XI-92 su 5 modelli...


Processing Drugs: 4it [02:19, 34.04s/it]


** Results for Tubastatin A **
IC50 - MAE: 0.748, RMSE: 0.905, R²: 0.396

Calcolo SHAP per Tubastatin A su 5 modelli...


Processing Drugs: 5it [02:48, 32.35s/it]


** Results for GSK690693 **
IC50 - MAE: 0.979, RMSE: 1.242, R²: 0.365

Calcolo SHAP per GSK690693 su 5 modelli...


Processing Drugs: 6it [03:18, 31.24s/it]


** Results for XMD14-99 **
IC50 - MAE: 0.697, RMSE: 0.908, R²: 0.146

Calcolo SHAP per XMD14-99 su 5 modelli...


Processing Drugs: 7it [04:24, 42.71s/it]


** Results for NPK76-II-72-1 **
IC50 - MAE: 1.142, RMSE: 1.388, R²: 0.339

Calcolo SHAP per NPK76-II-72-1 su 5 modelli...


Processing Drugs: 8it [04:54, 38.64s/it]


** Results for Y-39983 **
IC50 - MAE: 0.980, RMSE: 1.269, R²: 0.271

Calcolo SHAP per Y-39983 su 5 modelli...


Processing Drugs: 9it [05:52, 44.90s/it]


** Results for AT-7519 **
IC50 - MAE: 1.361, RMSE: 1.668, R²: 0.129

Calcolo SHAP per AT-7519 su 5 modelli...


Processing Drugs: 10it [06:23, 40.36s/it]


** Results for KIN001-236 **
IC50 - MAE: 0.651, RMSE: 0.812, R²: 0.301

Calcolo SHAP per KIN001-236 su 5 modelli...


Processing Drugs: 11it [06:52, 36.95s/it]


** Results for TL-2-105 **
IC50 - MAE: 0.836, RMSE: 1.074, R²: 0.444

Calcolo SHAP per TL-2-105 su 5 modelli...


Processing Drugs: 12it [07:23, 35.25s/it]


** Results for ABT-263 **
IC50 - MAE: 1.375, RMSE: 1.658, R²: 0.315

Calcolo SHAP per ABT-263 su 5 modelli...


Processing Drugs: 13it [08:40, 47.74s/it]


** Results for GSK1070916 **
IC50 - MAE: 1.274, RMSE: 1.613, R²: 0.396

Calcolo SHAP per GSK1070916 su 5 modelli...


Processing Drugs: 14it [09:09, 42.20s/it]


** Results for Methotrexate **
IC50 - MAE: 1.101, RMSE: 1.312, R²: 0.331

Calcolo SHAP per Methotrexate su 5 modelli...


Processing Drugs: 15it [09:41, 39.24s/it]


** Results for TL-1-85 **
IC50 - MAE: 1.103, RMSE: 1.364, R²: 0.441

Calcolo SHAP per TL-1-85 su 5 modelli...


Processing Drugs: 16it [10:14, 37.27s/it]


** Results for T0901317 **
IC50 - MAE: 0.714, RMSE: 0.971, R²: 0.220

Calcolo SHAP per T0901317 su 5 modelli...


Processing Drugs: 17it [11:12, 43.36s/it]


** Results for PHA-793887 **
IC50 - MAE: 1.494, RMSE: 1.746, R²: 0.188

Calcolo SHAP per PHA-793887 su 5 modelli...


Processing Drugs: 18it [11:44, 40.15s/it]


** Results for JW-7-24-1 **
IC50 - MAE: 1.011, RMSE: 1.377, R²: 0.285

Calcolo SHAP per JW-7-24-1 su 5 modelli...


Processing Drugs: 19it [12:15, 37.32s/it]


** Results for TPCA-1 **
IC50 - MAE: 1.159, RMSE: 1.445, R²: 0.344

Calcolo SHAP per TPCA-1 su 5 modelli...


Processing Drugs: 20it [12:45, 35.05s/it]


** Results for CX-5461 **
IC50 - MAE: 1.453, RMSE: 1.758, R²: 0.223

Calcolo SHAP per CX-5461 su 5 modelli...


Processing Drugs: 21it [13:20, 35.14s/it]


** Results for STF-62247 **
IC50 - MAE: 0.633, RMSE: 0.790, R²: 0.302

Calcolo SHAP per STF-62247 su 5 modelli...


Processing Drugs: 22it [13:53, 34.48s/it]


** Results for Dabrafenib **
IC50 - MAE: 1.201, RMSE: 1.699, R²: 0.167

Calcolo SHAP per Dabrafenib su 5 modelli...


Processing Drugs: 23it [14:21, 32.64s/it]


** Results for GSK429286A **
IC50 - MAE: 0.805, RMSE: 1.042, R²: 0.396

Calcolo SHAP per GSK429286A su 5 modelli...


Processing Drugs: 24it [14:52, 32.15s/it]


** Results for Trametinib **
IC50 - MAE: 1.742, RMSE: 2.307, R²: 0.220

Calcolo SHAP per Trametinib su 5 modelli...


Processing Drugs: 25it [15:22, 31.33s/it]


** Results for NG-25 **
IC50 - MAE: 1.147, RMSE: 1.398, R²: 0.373

Calcolo SHAP per NG-25 su 5 modelli...


Processing Drugs: 26it [15:49, 30.11s/it]


** Results for BIX02189 **
IC50 - MAE: 0.735, RMSE: 1.005, R²: 0.388

Calcolo SHAP per BIX02189 su 5 modelli...


Processing Drugs: 27it [16:08, 26.61s/it]


** Results for PIK-93 **
IC50 - MAE: 1.150, RMSE: 1.464, R²: 0.233

Calcolo SHAP per PIK-93 su 5 modelli...


Processing Drugs: 28it [16:44, 29.47s/it]


** Results for XMD15-27 **
IC50 - MAE: 0.531, RMSE: 0.692, R²: 0.455

Calcolo SHAP per XMD15-27 su 5 modelli...


Processing Drugs: 29it [20:09, 82.24s/it]


** Results for AC220 **
IC50 - MAE: 0.700, RMSE: 1.024, R²: 0.325

Calcolo SHAP per AC220 su 5 modelli...


Processing Drugs: 30it [20:42, 41.41s/it]




Global IC50 - MAE: 0.965, RMSE: 1.279, R²: 0.622
