# Dataset CLeaning and Drug Selector

## Import Libraries

In [18]:
# Librerie per lettura file
import pandas as pd
from pathlib import Path
import json
import os
import psutil
import numpy as np
from tqdm import tqdm
import warnings

#Kfold and GridSearch

from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, average_precision_score, f1_score
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV


# Grafici
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib

# Data Analysis
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import (
    mean_absolute_error, mean_squared_error, r2_score,
    accuracy_score, precision_score, recall_score, f1_score,
    precision_recall_curve, classification_report, confusion_matrix,
    roc_curve, roc_auc_score, average_precision_score
)
from imblearn.over_sampling import SMOTE
from sklearn.multioutput import MultiOutputRegressor

# Modelli con Alberi
from xgboost import XGBRegressor, XGBClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import xgboost as xgb

# Librerie Torch per MLP
import torch
import pytorch_lightning as pl
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split, SubsetRandomSampler
from torchmetrics import F1Score, Accuracy, Precision, Recall
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import CSVLogger

# Explainable AI (SHAP)
import shap

# Librerie per gestione dati parallela
import modin.pandas as mpd
import modin.config as cfg

# Visualizzazione matrice di confusione
from sklearn.metrics import ConfusionMatrixDisplay


## Reading

In [19]:
folder_path = Path("Gene_expression_Sign/transcrittomica_finale")

file_paths = sorted(folder_path.glob("*.csv"))

print(file_paths)

df_list = []

def optimize_memory(df):

    for col in df.select_dtypes(include=['float64']).columns:
        df[col] = df[col].astype('float32')  

    for col in df.select_dtypes(include=['int64']).columns:
        df[col] = df[col].astype('int32')  

    for col in df.select_dtypes(include=['object']).columns:
        df[col] = df[col].astype('category') 

    return df

for file in file_paths:

    chunk = pd.read_csv(file)


    chunk = optimize_memory(chunk)


    df_list.append(chunk)


df_trans = pd.concat(df_list, ignore_index=True)


print(df_trans.shape)  
print(df_trans.head())  

(241010, 507)
   Unnamed: 0       Drug cell.name  Cell line cosmic identifiers Sample Names  \
0           1  Cisplatin      PSN1                        910546         PSN1   
1           2  Cisplatin    MHHES1                        908134     MHH-ES-1   
2           3  Cisplatin   NCIH146                        910899     NCI-H146   
3           4  Cisplatin   NCIH460                        905943     NCI-H460   
4           5  Cisplatin     HCC56                       1290907       HCC-56   

       IC50 Screened Compounds: Sensitivity Target_clean  Drug_id  ...  \
0  0.568371                PSN1           S   DNA_damage   1005.0  ...   
1  0.831471            MHH-ES-1           S   DNA_damage   1005.0  ...   
2  3.085050            NCI-H146           R   DNA_damage   1005.0  ...   
3  1.671034            NCI-H460           R   DNA_damage   1005.0  ...   
4  3.948344              HCC-56           R   DNA_damage   1005.0  ...   

       ZEB1      ZEB2     ZFP42     ZFPM1     ZFPM2   

## Dataset Cleaning

In [20]:
df_full = df_trans.drop(columns=['Unnamed: 0', 'Drug','Target_in_signor','sign','ID'])

df_full[['Sample Names', 'Screened Compounds:', 'Target_clean', 'Drug_Target']] = df_full[['Sample Names', 'Screened Compounds:', 'Target_clean', 'Drug_Target']].apply(lambda x: x.astype('category'))
df_full['Sample Names'] = df_full['Sample Names'].cat.codes
df_full['Screened Compounds:'] = df_full['Screened Compounds:'].cat.codes
df_full['Target_clean'] = df_full['Target_clean'].cat.codes
df_full['Drug_Target'] = df_full['Drug_Target'].cat.codes

df_full.head()

Unnamed: 0,cell.name,Cell line cosmic identifiers,Sample Names,IC50,Screened Compounds:,Sensitivity,Target_clean,Drug_id,Drug_Target,AP1,...,ZEB1,ZEB2,ZFP42,ZFPM1,ZFPM2,ZIC1,ZNF148,ZNF382,ZNF384,ZNF91
0,PSN1,910546,476,0.568371,476,S,42,1005.0,39,0.46714,...,0.334324,0.236429,-0.270358,-0.372084,-0.688642,0.389129,-0.471846,-1.038019,-0.295491,-0.759984
1,MHHES1,908134,321,0.831471,321,S,42,1005.0,39,-0.71976,...,1.785094,-0.0101,-0.42348,-1.254897,-0.986739,0.699671,-0.208668,-0.926012,-1.751162,-1.255542
2,NCIH146,910899,354,3.08505,354,R,42,1005.0,39,-2.54263,...,1.794722,-0.429532,-0.750003,0.030268,-0.199089,1.516712,-0.421734,-1.819404,-1.749611,-1.245667
3,NCIH460,905943,414,1.671034,414,R,42,1005.0,39,0.848041,...,1.578582,0.346746,-0.947639,-0.479116,-0.834051,1.27983,-0.068265,-1.790925,-1.572221,-1.565712
4,HCC56,1290907,156,3.948344,156,R,42,1005.0,39,0.744634,...,-0.617793,-0.985933,-0.662368,-0.407095,-0.306222,1.056803,-0.01135,-1.942993,-1.791034,-1.098688


In [21]:
df_full.columns = df_full.columns.str.replace(r'[^\w]', '_', regex=True)

df_clean = df_full.dropna(subset=['Drug_id'])

colonne_con_nan = df_clean.columns[df_clean.isna().any()].tolist()
print("Colonne con almeno un NaN:", colonne_con_nan[1:])

Colonne con almeno un NaN: ['Sensitivity']


In [22]:
total_nan = df_clean.isna().sum().sum()

print(f"Totale NaN nel dataframe: {total_nan}")

righe_con_nan = df_clean.isna().any(axis=1).sum()
print(f"Numero di righe con almeno un NaN: {righe_con_nan}")


Totale NaN nel dataframe: 91226
Numero di righe con almeno un NaN: 45685


In [23]:
df_clean = df_clean.copy()

df_clean["Sensitivity"] = df_clean["Sensitivity"].astype(str)
df_clean["Sensitivity"] = df_clean["Sensitivity"].apply(
    lambda x: 0 if x == "R" else (1 if x == "S" else x)
)

df_clean.head()

Unnamed: 0,cell_name,Cell_line_cosmic_identifiers,Sample_Names,IC50,Screened_Compounds_,Sensitivity,Target_clean,Drug_id,Drug_Target,AP1,...,ZEB1,ZEB2,ZFP42,ZFPM1,ZFPM2,ZIC1,ZNF148,ZNF382,ZNF384,ZNF91
0,PSN1,910546,476,0.568371,476,1,42,1005.0,39,0.46714,...,0.334324,0.236429,-0.270358,-0.372084,-0.688642,0.389129,-0.471846,-1.038019,-0.295491,-0.759984
1,MHHES1,908134,321,0.831471,321,1,42,1005.0,39,-0.71976,...,1.785094,-0.0101,-0.42348,-1.254897,-0.986739,0.699671,-0.208668,-0.926012,-1.751162,-1.255542
2,NCIH146,910899,354,3.08505,354,0,42,1005.0,39,-2.54263,...,1.794722,-0.429532,-0.750003,0.030268,-0.199089,1.516712,-0.421734,-1.819404,-1.749611,-1.245667
3,NCIH460,905943,414,1.671034,414,0,42,1005.0,39,0.848041,...,1.578582,0.346746,-0.947639,-0.479116,-0.834051,1.27983,-0.068265,-1.790925,-1.572221,-1.565712
4,HCC56,1290907,156,3.948344,156,0,42,1005.0,39,0.744634,...,-0.617793,-0.985933,-0.662368,-0.407095,-0.306222,1.056803,-0.01135,-1.942993,-1.791034,-1.098688


In [24]:
df_clean.to_csv("transcrittoma_pulito.csv", index = False)

#### Drug Selector

In [None]:
models_results_rf_combined = {}
df_clean = df_clean.dropna()

for specific_drug in tqdm(df_clean["Drug_id"].unique(), desc="Processing Drugs"):

    # Evaluation for each drug
    df_drug = df_clean[df_clean["Drug_id"] == specific_drug]
    cell_counts = df_drug["Cell_line_cosmic_identifiers"].value_counts()

    if len(cell_counts) < 10:
        continue

    stratify_vals = df_drug.groupby("Cell_line_cosmic_identifiers")["Sensitivity"].mean().reindex(cell_counts.index)

    train_cells, test_cells = train_test_split(
        cell_counts.index,
        test_size=0.2,
        stratify=stratify_vals,
        random_state=42
    )

    train_set = df_drug[df_drug["Cell_line_cosmic_identifiers"].isin(train_cells)]
    test_set = df_drug[df_drug["Cell_line_cosmic_identifiers"].isin(test_cells)]

    categorical_cols = train_set.select_dtypes(include=['object', 'category']).columns
    label_encoders = {}
    for col in categorical_cols:
        le = LabelEncoder()
        train_set[col] = le.fit_transform(train_set[col].astype(str))
        test_set[col] = test_set[col].map(lambda x: le.transform([x])[0] if x in le.classes_ else -1)
        label_encoders[col] = le

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    val_mae_list, val_rmse_list, val_r2_list = [], [], []
    val_auc_list, val_aucpr_list, val_f1_list = [], [], []

    train_cells = np.array(train_cells)

    for train_idx, val_idx in kf.split(train_cells):
        inner_train_cells = train_cells[train_idx]
        inner_val_cells = train_cells[val_idx]

        inner_train_set = train_set[train_set["Cell_line_cosmic_identifiers"].isin(inner_train_cells)]
        inner_val_set = train_set[train_set["Cell_line_cosmic_identifiers"].isin(inner_val_cells)]

        X_train_fold = inner_train_set.drop(columns=["IC50", "Sensitivity", "Cell_line_cosmic_identifiers", "Screened_Compounds_", "Sample_Names", "Drug_id", "Drug_Target"], errors='ignore')
        y_train_fold_reg = inner_train_set["IC50"]
        y_train_fold_sens = inner_train_set["Sensitivity"]

        X_val_fold = inner_val_set.drop(columns=["IC50", "Sensitivity", "Cell_line_cosmic_identifiers", "Screened_Compounds_", "Sample_Names", "Drug_id", "Drug_Target"], errors='ignore')
        y_val_fold_reg = inner_val_set["IC50"]
        y_val_fold_sens = inner_val_set["Sensitivity"]

        rf_reg_val = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=42)
        rf_reg_val.fit(X_train_fold, y_train_fold_reg)
        y_pred_ic50_val = rf_reg_val.predict(X_val_fold)

        X_train_clf_fold = X_train_fold.copy()
        X_val_clf_fold = X_val_fold.copy()
        
        # Check class count before SMOTE
        class_counts = y_train_fold_sens.value_counts()
        if class_counts.min() > 2:
            smote = SMOTE(random_state=42, k_neighbors=2)
            X_train_clf_sm_fold, y_train_clf_sm_fold = smote.fit_resample(X_train_clf_fold, y_train_fold_sens)

            rf_clf_val = RandomForestClassifier(n_estimators=100, max_depth=6, class_weight='balanced', random_state=42)
            rf_clf_val.fit(X_train_clf_sm_fold, y_train_clf_sm_fold)

            y_pred_prob_val = rf_clf_val.predict_proba(X_val_clf_fold)[:, 1]
            y_pred_val_sens = (y_pred_prob_val > 0.5).astype(int)

            if len(np.unique(y_val_fold_sens)) == 2:
                val_auc_list.append(roc_auc_score(y_val_fold_sens, y_pred_prob_val))
                val_aucpr_list.append(average_precision_score(y_val_fold_sens, y_pred_prob_val))
                val_f1_list.append(f1_score(y_val_fold_sens, y_pred_val_sens))
            else:
                val_auc_list.append(np.nan)
                val_aucpr_list.append(np.nan)
                val_f1_list.append(np.nan)

        val_mae_list.append(mean_absolute_error(y_val_fold_reg, y_pred_ic50_val))
        val_rmse_list.append(mean_squared_error(y_val_fold_reg, y_pred_ic50_val, squared=False))
        val_r2_list.append(r2_score(y_val_fold_reg, y_pred_ic50_val))

    

    models_results_rf_combined[specific_drug] = {
        "Val_MAE_IC50": np.mean(val_mae_list),
        "Val_RMSE_IC50": np.mean(val_rmse_list),
        "Val_R2_IC50": np.mean(val_r2_list),
        "Val_AUC_Sens": np.nanmean(val_auc_list),
        "Val_AUC_PR_Sens": np.nanmean(val_aucpr_list),
        "Val_F1_Sens": np.nanmean(val_f1_list),
        "n_cells": len(cell_counts),
    }

    print(f"\n Validation Metrics for {specific_drug}")
    print(f"Val R² IC50: {np.mean(val_r2_list):.2f}")
    print(f"Val F1 Sensitivity: {np.nanmean(val_f1_list):.2f}")
    print(f"Val AUC Sensitivity: {np.nanmean(val_auc_list):.2f}")
    print(f"Val AUC PR Sensitivity: {np.nanmean(val_aucpr_list):.2f}")

print(f"\n Numero di modelli validi: {len(models_results_rf_combined)}")


In [None]:
ordered_drug_score_list = sorted(
    [
        (drug, data["Val_R2_IC50"] * (data["Val_AUC_PR_Sens"]**2))
        for drug, data in models_results_rf_combined.items()
    ],
    key=lambda x: x[1],
    reverse=True
)

score_df = pd.DataFrame(ordered_drug_score_list, columns=["Drug", "Score"])

drug_id_to_name = df_trans.drop_duplicates(subset="Drug_id")[["Drug_id", "Drug"]].set_index("Drug_id")["Drug"].to_dict()
score_df["Drug_name"] = score_df["Drug"].map(drug_id_to_name)


score_df["Drug_name"] = score_df["Drug"].map(drug_id_to_name)


drug_id_to_targets = (
    df_trans.groupby("Drug_id")["Target_clean"]
    .agg(lambda x: list(set(x)))
    .to_dict()
)

score_df["Targets"] = score_df["Drug"].map(drug_id_to_targets)


score_df.head(30)

In [None]:
score_df.to_csv("drug_scores_with_targets.csv", index=False)