# ML model trainings are done in different code cells of this notebook

## XGBOOST REGRESSOR ON ALL COUMARINS

In [None]:
import pandas as pd
import numpy as np
from xgboost import XGBRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error,
    median_absolute_error, r2_score, make_scorer
)
from tqdm import tqdm
import warnings
import pickle
import os
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# ============================================================
# CONFIGURATION
# Paths for dataset, trained models, and evaluation reports.
# ============================================================
DATA_PATH = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/Total_Data.csv'
RESULT_DIR = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/GridSearchingResults/Models/XGBOOST_REGRESSOR'
REPORT_DIR = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/GridSearchingResults/Reports/XGBOOST_REGRESSOR'
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(REPORT_DIR, exist_ok=True)

# ============================================================
# LOAD & PREPARE DATA
# Includes: loading dataset, column selection, cleaning, label encoding
# ============================================================
data = pd.read_csv(DATA_PATH, encoding='utf-8')
required_columns = ['Coumarin Type', 'Cancer Type', 'Coumarin Dose', 'Time', 'Viability']
data = data[required_columns].dropna()

# Convert dose to numeric and restrict doses ≤ 400
data['Coumarin Dose'] = pd.to_numeric(data['Coumarin Dose'], errors='coerce')
data = data[data['Coumarin Dose'] <= 400]

# Encode cancer types for ML models
cancerType_Encoder = LabelEncoder();
coumarin_Encoder = LabelEncoder();
data['Cancer Type'] = cancerType_Encoder.fit_transform(data['Cancer Type'])
data['Coumarin Type'] = coumarin_Encoder.fit_transform(data['Coumarin Type'])

# Some compounds have an additional 96h measurement
allowed_times = [24, 48, 72, 96];
auraptene_allowed_time = [24, 48, 72];
data = data[
    ((data['Coumarin Type'] != "Auraptene") & (data['Time'].isin(allowed_times))) |
    ((data['Coumarin Type'] == "Auraptene") & (data['Time'].isin(auraptene_allowed_time)))
]

# ------------------------------------------------------------
# GRID SEARCHING FOR OPTIMAL HYPER PARAMETERS
# ------------------------------------------------------------

print("\n=====================================<GRID SEARCHING>=====================================")


xgb_paramGrid = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 1.0],
    'colsample_bytree': [0.7, 1.0],
    'gamma': [0, 1],
    'reg_lambda': [1, 5],
    'reg_alpha': [0, 1],
}

param_dist = {
    'n_estimators': np.arange(100, 1200, 100),
    'max_depth': np.arange(2, 12, 1),
    'min_child_weight': np.arange(1, 10, 1),
    'learning_rate': np.linspace(0.005, 0.3, 20),
    'subsample': np.linspace(0.5, 1.0, 6),
    'colsample_bytree': np.linspace(0.5, 1.0, 6),
    'gamma': np.linspace(0, 10, 11),
    'reg_alpha': np.logspace(-3, 1, 10),   # L1
    'reg_lambda': np.logspace(-1, 2, 10),  # L2
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
X = data[['Coumarin Type', 'Coumarin Dose', 'Time', 'Cancer Type']]
y = data['Viability']
# {'colsample_bytree': 1.0, 'gamma': 1, 'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 300, 'reg_alpha': 1, 'reg_lambda': 1, 'subsample': 1.0}
# BEST R2 SCORE ON : 0.6486041733242749
xgb_gridSearcher = GridSearchCV(estimator=XGBRegressor(random_state=42,),
                                            param_grid=xgb_paramGrid,
                                            scoring='r2',
                                            verbose=1,
                                            cv=kf,
                                            n_jobs=-1,
                                            )

xgb_gridSearcher.fit(X,y)
bestParameters_xgb = xgb_gridSearcher.best_params_;
bestR2Score_xgb = xgb_gridSearcher.best_score_;

print(f"\nBEST R2 SCORE ON : {bestR2Score_xgb}")
print(bestParameters_xgb)

bestPrametersDataFrame = pd.DataFrame({
                                        "ParameterName" : list(bestParameters_xgb.keys()),
                                        "OptimalParameter" : list(bestParameters_xgb.values()),
                                        "R2": [bestR2Score_xgb] * len(bestParameters_xgb)
                                        });

bestPrametersDataFrame.to_excel(RESULT_DIR + f"/General_Best_Parameters.xlsx", index=False);

print(f"\n============<DONE WITH GRID SEARCHING>============");

#     # ------------------------------------------------------------
#     # MODEL TRAINING (HistGradientBoostingRegressor)
#     # ------------------------------------------------------------

# model = HistGradientBoostingRegressor(**bestParameters_xgb,
#                                         random_state=42);

# # Define scoring metrics
# scorers = {
#     'MSE': make_scorer(mean_squared_error, greater_is_better=False),
#     'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#     'MedAE': make_scorer(median_absolute_error, greater_is_better=False),
#     'R2': make_scorer(r2_score)
# }

# # Cross-validation evaluation
# cv_results = {}
# for name, scorer in scorers.items():
#     scores = cross_val_score(model, X, y, cv=kf, scoring=scorer)
#     # Flip sign for error metrics
#     if name in ['MSE', 'MAE', 'MedAE']:
#         scores = -scores
#     cv_results[name] = (round(scores.mean(), 2), round(scores.std(), 2))

# # Fit final model for reporting + predictions
# model.fit(X, y)
# y_pred_train = model.predict(X)

# # Training set evaluation
# train_results = {
#     'MSE': (round(mean_squared_error(y, y_pred_train), 2), 0),
#     'MAE': (round(mean_absolute_error(y, y_pred_train), 2), 0),
#     'MedAE': (round(median_absolute_error(y, y_pred_train), 2), 0),
#     'R2': (round(r2_score(y, y_pred_train), 2), 0)
# }

# # Save evaluation report
# eval_report = pd.DataFrame({
#     'Metric': list(scorers.keys()),
#     'Cross_Validation': [cv_results[m][0] for m in scorers.keys()],
#     'CV_SD': [cv_results[m][1] for m in scorers.keys()],
#     'Train': [train_results[m][0] for m in scorers.keys()],
#     'Train_SD': [train_results[m][1] for m in scorers.keys()]
# })
# eval_report.to_csv(f'{REPORT_DIR}Evaluation_Report_General.csv', index=False)
# print(f"\n--- Model Evaluation (General) ---\n{eval_report}\n")

# # Save trained model
# with open(f'{RESULT_DIR}histgb_model_General.pkl', 'wb') as f:
#     pickle.dump(model, f)

# # ------------------------------------------------------------
# # FEATURE IMPORTANCE
# # Computed using model's built-in method or permutation fallback.
# # ------------------------------------------------------------
# feature_names = ['Coumarin Dose', 'Time', 'Cancer Type']

# try:
#     importance = model.feature_importances_
# except AttributeError:
#     from sklearn.inspection import permutation_importance
#     result = permutation_importance(model, X, y, n_repeats=10, random_state=42)
#     importance = result.importances_mean

# # Rank features
# sorted_idx = np.argsort(importance)
# sorted_features = [feature_names[i] for i in sorted_idx]
# sorted_importance = importance[sorted_idx]

# # Save importance table
# importance_df = pd.DataFrame({
#     'Feature': sorted_features,
#     'Importance': sorted_importance
# })
# importance_df.to_csv(
#     f'{REPORT_DIR}Feature_Importance_General_HistGradientBoostingRegressor.csv',
#     index=False
# )

# # Save TIFF plot
# plt.figure(figsize=(8, 5))
# plt.barh(sorted_features, sorted_importance, color='black')
# plt.xlabel("Importance")
# plt.title(f"Feature Importance – General – HistGradientBoostingRegressor")
# plt.tight_layout()
# plt.savefig(
#     f'{REPORT_DIR}Feature_Importance_General_HistGradientBoostingRegressor.tiff',
#     format='tiff', dpi=300
# )
# plt.close()

# # ------------------------------------------------------------
# # OPTIMAL DOSE/TIME PREDICTION FOR TARGET VIABILITY 50
# # Scans a dose-time grid to find setting closest to viability=50.
# # ------------------------------------------------------------
# def predict_viability(cancer_code):
#     predictions = []
#     for dose in np.linspace(0, 400, 50):
#         for time in allowed_times:
#             viability = model.predict([[dose, time, cancer_code]])[0]
#             predictions.append((dose, time, viability))
#     df_pred = pd.DataFrame(predictions, columns=['Dose', 'Time', 'Viability'])
#     df_pred['AbsError'] = abs(df_pred['Viability'] - 50)
#     return df_pred.loc[df_pred['AbsError'].idxmin()]

# # Generate predictions for each reliable cancer type
# results = []
# for cancer_code in tqdm(reliable_cancers, desc=f"Predicting {coumarin}"):
#     cancer_name = CancerType_Encoder.inverse_transform([cancer_code])[0]
#     count = cancer_counts.get(cancer_code, 0)
#     best = predict_viability(cancer_code)
#     results.append({
#         'Cancer Type': cancer_name,
#         'Best Dose': best['Dose'],
#         'Best Time': best['Time'],
#         'Predicted Viability': best['Viability'],
#         'Sample Count': count,
#         'Reliability': 'Reliable'
#     })

# # Save final prediction summary
# results_df = pd.DataFrame(results)
# results_df.to_csv(f'{REPORT_DIR}Prediction_Report_{coumarin}.csv', index=False)
# print(f"Prediction report for {coumarin} saved successfully.\n")

# print("\n✅ All Coumarin types processed successfully.")

## GradientBoosting (SEPERATE COUMARINS)

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error,
    median_absolute_error, r2_score, make_scorer
)
from tqdm import tqdm
import warnings
import pickle
import os
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# ============================================================
# CONFIGURATION
# Paths for dataset, trained models, and evaluation reports.
# ============================================================
DATA_PATH = 'Total_Data.csv'
RESULT_DIR = 'GridSearchingResults/Models/GradientBoostingOnSeperateCoumarins/'
REPORT_DIR = 'GridSearchingReports/Reports/GradientBoostingOnSeperateCoumarins/'
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(REPORT_DIR, exist_ok=True)

# ============================================================
# LOAD & PREPARE DATA
# Includes: loading dataset, column selection, cleaning, label encoding
# ============================================================
data = pd.read_csv(DATA_PATH, encoding='utf-8')
required_columns = ['Coumarin Type', 'Cancer Type', 'Coumarin Dose', 'Time', 'Viability']
data = data[required_columns].dropna()

# Convert dose to numeric and restrict doses ≤ 400
data['Coumarin Dose'] = pd.to_numeric(data['Coumarin Dose'], errors='coerce')
data = data[data['Coumarin Dose'] <= 400]

# Encode cancer types for ML models
CANCER_ENCODER = LabelEncoder()
data['Cancer Type'] = CANCER_ENCODER.fit_transform(data['Cancer Type'])

# ============================================================
# PROCESS EACH COUMARIN TYPE
# Each coumarin is processed independently to produce:
# - Reliability filtering via Gaussian Mixture Model
# - Model training using HistGradientBoostingRegressor
# - Full cross-validation metrics
# - Feature importance (CSV + TIFF plot)
# - Optimal dose/time predictions for viability≈50
# ============================================================
for coumarin in data['Coumarin Type'].unique():
    print(f"\n=== Processing {coumarin} ===")

    if coumarin in ['Auraptene', 'Esculetin'] :
        continue

    # Extract coumarin-specific subset
    coumarin_data = data[data['Coumarin Type'] == coumarin].copy()

    # Some compounds have an additional 96h measurement
    allowed_times = [24, 48, 72, 96] if coumarin.lower() == 'auraptene' else [24, 48, 72]
    coumarin_data = coumarin_data[coumarin_data['Time'].isin(allowed_times)]

    if coumarin_data.empty:
        print(f"⚠️ No valid time points for {coumarin}. Skipping...")
        continue

    # ------------------------------------------------------------
    # RELIABILITY FILTERING USING GMM
    # Identifies cancer types with sufficient sample size.
    # ------------------------------------------------------------
    cancer_counts = coumarin_data['Cancer Type'].value_counts().to_dict()
    count_df = pd.DataFrame(list(cancer_counts.items()), columns=['Cancer Type', 'Sample Count'])

    gmm = GaussianMixture(n_components=2, random_state=42)
    gmm.fit(count_df[['Sample Count']])

    # Threshold = midpoint between two mixture components
    threshold = np.mean(gmm.means_.flatten())

    count_df['Reliability'] = count_df['Sample Count'].apply(
        lambda x: 'Reliable' if x >= threshold else 'Unreliable'
    )

    reliable_cancers = count_df[count_df['Reliability'] == 'Reliable']['Cancer Type'].tolist()
    reliable_data = coumarin_data[coumarin_data['Cancer Type'].isin(reliable_cancers)].copy()

    print(f"Reliability threshold for {coumarin}: {threshold:.2f}")
    print(f"Reliable cancer types retained: {len(reliable_cancers)} of {len(cancer_counts)}")

    if reliable_data.empty:
        print(f"⚠️ No reliable data for {coumarin}. Skipping...")
        continue

    #-------------------------------------------------------------
    # BEGIN GRID SEARCHING FOR OPTIMAL R2
    #-------------------------------------------------------------
    
    gradientBoosting_paramGrid = {
        'n_estimators': [50, 150, 300],
        'learning_rate': [0.01 , 0.1, 0.2],
        'max_depth': [2, 3, 6],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4],
        'subsample': [0.7, 0.9, 1.0],
        'max_features': [None, 'sqrt', 'log2']
    }

    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    gridSearcher = GridSearchCV(estimator=GradientBoostingRegressor(random_state=42),
                                param_grid=gradientBoosting_paramGrid,
                                scoring='r2',
                                cv=kf,
                                n_jobs=-1,
                                verbose=1)
    
    X = reliable_data[['Coumarin Dose', 'Time', 'Cancer Type']]
    y = reliable_data['Viability']
    
    print(f"\n===<GRID SEARCHING FOR {coumarin.upper()}>===")
    gridSearcher.fit(X,y);
    
    BEST_PARAMETERS = gridSearcher.best_params_;
    BEST_PARAMETERS["R2"] = gridSearcher.best_score_;
    df = pd.DataFrame([BEST_PARAMETERS]);
    print("BEST PARAMETERS: ", df)

    df.to_csv(RESULT_DIR + f"{coumarin}_gba_BEST_PARAMETERS.csv",index=False)
    BEST_PARAMETERS.pop('R2')

    print("===<DONE WITH GRID SEARCHING>===")
    # ------------------------------------------------------------
    # MODEL TRAINING (GradientBoostingRegressor)
    # ------------------------------------------------------------

    model = GradientBoostingRegressor(**BEST_PARAMETERS, random_state=42)


    # Define scoring metrics
    scorers = {
        'MSE': make_scorer(mean_squared_error, greater_is_better=False),
        'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
        'MedAE': make_scorer(median_absolute_error, greater_is_better=False),
        'R2': make_scorer(r2_score)
    }

    # Cross-validation evaluation
    cv_results = {}
    for name, scorer in scorers.items():
        scores = cross_val_score(model, X, y, cv=kf, scoring=scorer)
        # Flip sign for error metrics
        if name in ['MSE', 'MAE', 'MedAE']:
            scores = -scores
        cv_results[name] = (round(scores.mean(), 2), round(scores.std(), 2))

    # Fit final model for reporting + predictions
    model.fit(X, y)
    
    cv_results = pd.DataFrame(zip(cv_results));

    print(cv_results);

    cv_results.to_csv(f'{REPORT_DIR}Evaluation_Report_{coumarin}.csv', index=False)
    print(f"\n--- Model Evaluation ({coumarin}) ---\n{cv_results}\n")

    # Save trained model
    with open(f'{RESULT_DIR}gba_model_{coumarin}.pkl', 'wb') as f:
        pickle.dump(model, f)

    # ------------------------------------------------------------
    # FEATURE IMPORTANCE
    # Computed using model's built-in method or permutation fallback.
    # ------------------------------------------------------------
    feature_names = ['Coumarin Dose', 'Time', 'Cancer Type']

    try:
        importance = model.feature_importances_
    except AttributeError:
        from sklearn.inspection import permutation_importance
        result = permutation_importance(model, X, y, n_repeats=10, random_state=42)
        importance = result.importances_mean

    # Rank features
    sorted_idx = np.argsort(importance)
    sorted_features = [feature_names[i] for i in sorted_idx]
    sorted_importance = importance[sorted_idx]

    # Save importance table
    importance_df = pd.DataFrame({
        'Feature': sorted_features,
        'Importance': sorted_importance
    })
    importance_df.to_csv(
        f'{REPORT_DIR}Feature_Importance_{coumarin}_gbaOnSeperateCoumarins.csv',
        index=False
    )

    # Save TIFF plot
    plt.figure(figsize=(8, 5))
    plt.barh(sorted_features, sorted_importance, color='black')
    plt.xlabel("Importance")
    plt.title(f"Feature Importance – {coumarin} – gbaOnSeperateCoumarins")
    plt.tight_layout()
    plt.savefig(
        f'{REPORT_DIR}Feature_Importance_{coumarin}_gbaOnSeperateCoumarins.tiff',
        format='tiff', dpi=300
    )
    plt.close()

    # ------------------------------------------------------------
    # OPTIMAL DOSE/TIME PREDICTION FOR TARGET VIABILITY 50
    # Scans a dose-time grid to find setting closest to viability=50.
    # ------------------------------------------------------------
    def predict_viability(cancer_code):
        predictions = []
        for dose in np.linspace(0, 400, 50):
            for time in allowed_times:
                viability = model.predict([[dose, time, cancer_code]])[0]
                predictions.append((dose, time, viability))
        df_pred = pd.DataFrame(predictions, columns=['Dose', 'Time', 'Viability'])
        df_pred['AbsError'] = abs(df_pred['Viability'] - 50)
        return df_pred.loc[df_pred['AbsError'].idxmin()]

    cancers = data['Cancer Type'].unique();

    # Generate predictions for each reliable cancer type
    results = []
    for cancer_code in tqdm(cancers, desc=f"Predicting {coumarin}"):
        cancer_name = CANCER_ENCODER.inverse_transform([cancer_code])[0]
        # count = cancer_counts.get(cancer_code, 0)
        best = predict_viability(cancer_code)
        results.append({
            'Cancer Type': cancer_name,
            'Best Dose': best['Dose'],
            'Best Time': best['Time'],
            'Predicted Viability': best['Viability'],
            # 'Sample Count': count,
            'Reliability': 'Reliable'
        })

    # Save final prediction summary
    results_df = pd.DataFrame(results)
    results_df.to_csv(f'{REPORT_DIR}Prediction_Report_{coumarin}.csv', index=False)
    print(f"Prediction report for {coumarin} saved successfully.\n")

print("\n✅ All Coumarin types processed successfully.")

## GradientBoositngRegressor

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor, AdaBoostRegressor, RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error,
    median_absolute_error, r2_score, make_scorer
)
from tqdm import tqdm
import warnings
import pickle
import os
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# ============================================================
# CONFIGURATION
# Paths for dataset, trained models, and evaluation reports.
# ============================================================
DATA_PATH = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/Total_Data.csv'
RESULT_DIR = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/GridSearchingResults/Models/GradientBoosting'
REPORT_DIR = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/GridSearchingResults/Reports/GradientBoosting'
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(REPORT_DIR, exist_ok=True)

# ============================================================
# LOAD & PREPARE DATA
# Includes: loading dataset, column selection, cleaning, label encoding
# ============================================================
data = pd.read_csv(DATA_PATH, encoding='utf-8')
required_columns = ['Coumarin Type', 'Cancer Type', 'Coumarin Dose', 'Time', 'Viability']
data = data[required_columns].dropna()

# Convert dose to numeric and restrict doses ≤ 400
data['Coumarin Dose'] = pd.to_numeric(data['Coumarin Dose'], errors='coerce')
data = data[data['Coumarin Dose'] <= 400]

# Encode cancer types for ML models
cancerType_Encoder = LabelEncoder();
coumarin_Encoder = LabelEncoder();
data['Cancer Type'] = cancerType_Encoder.fit_transform(data['Cancer Type'])
data['Coumarin Type'] = coumarin_Encoder.fit_transform(data['Coumarin Type'])

# Some compounds have an additional 96h measurement
allowed_times = [24, 48, 72, 96];
auraptene_allowed_time = [24, 48, 72];
data = data[
    ((data['Coumarin Type'] != "Auraptene") & (data['Time'].isin(allowed_times))) |
    ((data['Coumarin Type'] == "Auraptene") & (data['Time'].isin(auraptene_allowed_time)))
]

# ------------------------------------------------------------
# GRID SEARCHING FOR OPTIMAL HYPER PARAMETERS
# ------------------------------------------------------------

print("\n=====================================<GRID SEARCHING>=====================================")


gradientBoosting_paramGrid = {
    'n_estimators': [50, 150, 300],
    'learning_rate': [0.01 , 0.1, 0.2],
    'max_depth': [2, 3, 4, 6],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'max_features': [None, 'sqrt', 'log2']
}
kf = KFold(n_splits=5, shuffle=True, random_state=42)
X = data[['Coumarin Type', 'Coumarin Dose', 'Time', 'Cancer Type']]
y = data['Viability']



GradientBoosting_gridSearcher = GridSearchCV(estimator=GradientBoostingRegressor(random_state=42),
                                            param_grid=gradientBoosting_paramGrid,
                                            scoring='r2',
                                            cv=kf,
                                            n_jobs=-1,
                                            verbose=1
                                            )

GradientBoosting_gridSearcher.fit(X,y)
bestParameters_GradientBoosting = GradientBoosting_gridSearcher.best_params_;
bestR2Score_GradientBoosting = GradientBoosting_gridSearcher.best_score_;

print(f"\nBEST R2 SCORE : {bestR2Score_GradientBoosting}")
print(bestParameters_GradientBoosting)
#{'learning_rate': 0.1, 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 300, 'subsample': 1.0}
# BEST R2 SCORE : 0.6540099951366823
bestPrametersDataFrame = pd.DataFrame({
                                        "ParameterName" : list(bestParameters_GradientBoosting.keys()),
                                        "OptimalParameter" : list(bestParameters_GradientBoosting.values()),
                                        "R2": [bestR2Score_GradientBoosting] * len(bestParameters_GradientBoosting)
                                        });

bestPrametersDataFrame.to_excel(RESULT_DIR + f"/General_Best_Parameters.xlsx", index=False);

print(f"\n============<DONE WITH GRID SEARCHING>============");

    # ------------------------------------------------------------
    # MODEL TRAINING (HistGradientBoostingRegressor)
    # ------------------------------------------------------------

# model = HistGradientBoostingRegressor(**bestParameters_adaRegressor,
#                                         random_state=42);

# # Define scoring metrics
# scorers = {
#     'MSE': make_scorer(mean_squared_error, greater_is_better=False),
#     'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#     'MedAE': make_scorer(median_absolute_error, greater_is_better=False),
#     'R2': make_scorer(r2_score)
# }

# # Cross-validation evaluation
# cv_results = {}
# for name, scorer in scorers.items():
#     scores = cross_val_score(model, X, y, cv=kf, scoring=scorer)
#     # Flip sign for error metrics
#     if name in ['MSE', 'MAE', 'MedAE']:
#         scores = -scores
#     cv_results[name] = (round(scores.mean(), 2), round(scores.std(), 2))

# # Fit final model for reporting + predictions
# model.fit(X, y)
# y_pred_train = model.predict(X)

# # Training set evaluation
# train_results = {
#     'MSE': (round(mean_squared_error(y, y_pred_train), 2), 0),
#     'MAE': (round(mean_absolute_error(y, y_pred_train), 2), 0),
#     'MedAE': (round(median_absolute_error(y, y_pred_train), 2), 0),
#     'R2': (round(r2_score(y, y_pred_train), 2), 0)
# }

# # Save evaluation report
# eval_report = pd.DataFrame({
#     'Metric': list(scorers.keys()),
#     'Cross_Validation': [cv_results[m][0] for m in scorers.keys()],
#     'CV_SD': [cv_results[m][1] for m in scorers.keys()],
#     'Train': [train_results[m][0] for m in scorers.keys()],
#     'Train_SD': [train_results[m][1] for m in scorers.keys()]
# })
# eval_report.to_csv(f'{REPORT_DIR}Evaluation_Report_General.csv', index=False)
# print(f"\n--- Model Evaluation (General) ---\n{eval_report}\n")

# # Save trained model
# with open(f'{RESULT_DIR}histgb_model_General.pkl', 'wb') as f:
#     pickle.dump(model, f)

# # ------------------------------------------------------------
# # FEATURE IMPORTANCE
# # Computed using model's built-in method or permutation fallback.
# # ------------------------------------------------------------
# feature_names = ['Coumarin Dose', 'Time', 'Cancer Type']

# try:
#     importance = model.feature_importances_
# except AttributeError:
#     from sklearn.inspection import permutation_importance
#     result = permutation_importance(model, X, y, n_repeats=10, random_state=42)
#     importance = result.importances_mean

# # Rank features
# sorted_idx = np.argsort(importance)
# sorted_features = [feature_names[i] for i in sorted_idx]
# sorted_importance = importance[sorted_idx]

# # Save importance table
# importance_df = pd.DataFrame({
#     'Feature': sorted_features,
#     'Importance': sorted_importance
# })
# importance_df.to_csv(
#     f'{REPORT_DIR}Feature_Importance_General_HistGradientBoostingRegressor.csv',
#     index=False
# )

# # Save TIFF plot
# plt.figure(figsize=(8, 5))
# plt.barh(sorted_features, sorted_importance, color='black')
# plt.xlabel("Importance")
# plt.title(f"Feature Importance – General – HistGradientBoostingRegressor")
# plt.tight_layout()
# plt.savefig(
#     f'{REPORT_DIR}Feature_Importance_General_HistGradientBoostingRegressor.tiff',
#     format='tiff', dpi=300
# )
# plt.close()

# # ------------------------------------------------------------
# # OPTIMAL DOSE/TIME PREDICTION FOR TARGET VIABILITY 50
# # Scans a dose-time grid to find setting closest to viability=50.
# # ------------------------------------------------------------
# def predict_viability(cancer_code):
#     predictions = []
#     for dose in np.linspace(0, 400, 50):
#         for time in allowed_times:
#             viability = model.predict([[dose, time, cancer_code]])[0]
#             predictions.append((dose, time, viability))
#     df_pred = pd.DataFrame(predictions, columns=['Dose', 'Time', 'Viability'])
#     df_pred['AbsError'] = abs(df_pred['Viability'] - 50)
#     return df_pred.loc[df_pred['AbsError'].idxmin()]

# # Generate predictions for each reliable cancer type
# results = []
# for cancer_code in tqdm(reliable_cancers, desc=f"Predicting {coumarin}"):
#     cancer_name = CancerType_Encoder.inverse_transform([cancer_code])[0]
#     count = cancer_counts.get(cancer_code, 0)
#     best = predict_viability(cancer_code)
#     results.append({
#         'Cancer Type': cancer_name,
#         'Best Dose': best['Dose'],
#         'Best Time': best['Time'],
#         'Predicted Viability': best['Viability'],
#         'Sample Count': count,
#         'Reliability': 'Reliable'
#     })

# # Save final prediction summary
# results_df = pd.DataFrame(results)
# results_df.to_csv(f'{REPORT_DIR}Prediction_Report_{coumarin}.csv', index=False)
# print(f"Prediction report for {coumarin} saved successfully.\n")

# print("\n✅ All Coumarin types processed successfully.")

## HistGradientBoostingRegressor

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import (
    mean_squared_error, mean_absolute_error,
    median_absolute_error, r2_score, make_scorer
)
from tqdm import tqdm
import warnings
import pickle
import os
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# ============================================================
# CONFIGURATION
# Paths for dataset, trained models, and evaluation reports.
# ============================================================
DATA_PATH = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/Total_Data.csv'
RESULT_DIR = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/GridSearchingResults/Models/HistGradientBoostingRegressor'
REPORT_DIR = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/GridSearchingResults/Reports/HistGradientBoostingRegressor'
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(REPORT_DIR, exist_ok=True)

# ============================================================
# LOAD & PREPARE DATA
# Includes: loading dataset, column selection, cleaning, label encoding
# ============================================================
data = pd.read_csv(DATA_PATH, encoding='utf-8')
required_columns = ['Coumarin Type', 'Cancer Type', 'Coumarin Dose', 'Time', 'Viability']
data = data[required_columns].dropna()

# Convert dose to numeric and restrict doses ≤ 400
data['Coumarin Dose'] = pd.to_numeric(data['Coumarin Dose'], errors='coerce')
data = data[data['Coumarin Dose'] <= 400]

# Encode cancer types for ML models
cancerType_Encoder = LabelEncoder();
coumarin_Encoder = LabelEncoder();
data['Cancer Type'] = cancerType_Encoder.fit_transform(data['Cancer Type'])
data['Coumarin Type'] = coumarin_Encoder.fit_transform(data['Coumarin Type'])

# Some compounds have an additional 96h measurement
allowed_times = [24, 48, 72, 96];
auraptene_allowed_time = [24, 48, 72];
data = data[
    ((data['Coumarin Type'] != "Auraptene") & (data['Time'].isin(allowed_times))) |
    ((data['Coumarin Type'] == "Auraptene") & (data['Time'].isin(auraptene_allowed_time)))
]

# ------------------------------------------------------------
# GRID SEARCHING FOR OPTIMAL HYPER PARAMETERS
# ------------------------------------------------------------

print("\n=====================================<GRID SEARCHING>=====================================")

# {'early_stopping': True, 'l2_regularization': 1.5, 'learning_rate': 0.1, 'max_bins': 128, 'max_depth': None,
#  'max_iter': 100, 'max_leaf_nodes': 31, 'min_samples_leaf': 10, 'n_iter_no_change': 20, 'validation_fraction': 0.1}
# BEST R2 SCORE ON : 0.6206090802796808
histGradientRegressor_paramGrid = {
    'learning_rate': [0.05, 0.1, 0.2],
    'max_iter': [100, 200],
    'max_leaf_nodes': [31, 63, 255],
    'max_depth': [None, 10, 20],
    'min_samples_leaf': [10, 20, 50],
    'l2_regularization': [0.0, 0.1, 1.5],
    'max_bins': [128, 255],
    'early_stopping': [True],
    'validation_fraction': [0.1],
    'n_iter_no_change': [10, 20]
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
X = data[['Coumarin Type', 'Coumarin Dose', 'Time', 'Cancer Type']]
y = data['Viability']

histGradient_gridSearcher = GridSearchCV(estimator=HistGradientBoostingRegressor(random_state=42),
                                            param_grid=histGradientRegressor_paramGrid,
                                            scoring='r2',
                                            cv=kf,
                                            n_jobs=-1,
                                            verbose=1
                                            )

histGradient_gridSearcher.fit(X,y)
bestParameters_HistGradientBoostingRegressor = histGradient_gridSearcher.best_params_;
bestR2Score_HistGradientBoostingRegressor = histGradient_gridSearcher.best_score_;

print(f"\nBEST R2 SCORE ON : {bestR2Score_HistGradientBoostingRegressor}")
print(bestParameters_HistGradientBoostingRegressor)

bestPrametersDataFrame = pd.DataFrame({
                                        "ParameterName" : list(bestParameters_HistGradientBoostingRegressor.keys()),
                                        "OptimalParameter" : list(bestParameters_HistGradientBoostingRegressor.values()),
                                        "R2": [bestR2Score_HistGradientBoostingRegressor] * len(bestParameters_HistGradientBoostingRegressor)
                                        });

bestPrametersDataFrame.to_excel(RESULT_DIR + f"/General_Best_Parameters.xlsx", index=False);

print(f"\n============<DONE WITH GRID SEARCHING>============");

#     # ------------------------------------------------------------
#     # MODEL TRAINING (HistGradientBoostingRegressor)
#     # ------------------------------------------------------------

# model = HistGradientBoostingRegressor(**bestParameters_HistGradientBoostingRegressor,
#                                         random_state=42);

# # Define scoring metrics
# scorers = {
#     'MSE': make_scorer(mean_squared_error, greater_is_better=False),
#     'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#     'MedAE': make_scorer(median_absolute_error, greater_is_better=False),
#     'R2': make_scorer(r2_score)
# }

# # Cross-validation evaluation
# cv_results = {}
# for name, scorer in scorers.items():
#     scores = cross_val_score(model, X, y, cv=kf, scoring=scorer)
#     # Flip sign for error metrics
#     if name in ['MSE', 'MAE', 'MedAE']:
#         scores = -scores
#     cv_results[name] = (round(scores.mean(), 2), round(scores.std(), 2))

# # Fit final model for reporting + predictions
# model.fit(X, y)
# y_pred_train = model.predict(X)

# # Training set evaluation
# train_results = {
#     'MSE': (round(mean_squared_error(y, y_pred_train), 2), 0),
#     'MAE': (round(mean_absolute_error(y, y_pred_train), 2), 0),
#     'MedAE': (round(median_absolute_error(y, y_pred_train), 2), 0),
#     'R2': (round(r2_score(y, y_pred_train), 2), 0)
# }

# # Save evaluation report
# eval_report = pd.DataFrame({
#     'Metric': list(scorers.keys()),
#     'Cross_Validation': [cv_results[m][0] for m in scorers.keys()],
#     'CV_SD': [cv_results[m][1] for m in scorers.keys()],
#     'Train': [train_results[m][0] for m in scorers.keys()],
#     'Train_SD': [train_results[m][1] for m in scorers.keys()]
# })
# eval_report.to_csv(f'{REPORT_DIR}Evaluation_Report_General.csv', index=False)
# print(f"\n--- Model Evaluation (General) ---\n{eval_report}\n")

# # Save trained model
# with open(f'{RESULT_DIR}histgb_model_General.pkl', 'wb') as f:
#     pickle.dump(model, f)

# # ------------------------------------------------------------
# # FEATURE IMPORTANCE
# # Computed using model's built-in method or permutation fallback.
# # ------------------------------------------------------------
# feature_names = ['Coumarin Dose', 'Time', 'Cancer Type']

# try:
#     importance = model.feature_importances_
# except AttributeError:
#     from sklearn.inspection import permutation_importance
#     result = permutation_importance(model, X, y, n_repeats=10, random_state=42)
#     importance = result.importances_mean

# # Rank features
# sorted_idx = np.argsort(importance)
# sorted_features = [feature_names[i] for i in sorted_idx]
# sorted_importance = importance[sorted_idx]

# # Save importance table
# importance_df = pd.DataFrame({
#     'Feature': sorted_features,
#     'Importance': sorted_importance
# })
# importance_df.to_csv(
#     f'{REPORT_DIR}Feature_Importance_General_HistGradientBoostingRegressor.csv',
#     index=False
# )

# # Save TIFF plot
# plt.figure(figsize=(8, 5))
# plt.barh(sorted_features, sorted_importance, color='black')
# plt.xlabel("Importance")
# plt.title(f"Feature Importance – General – HistGradientBoostingRegressor")
# plt.tight_layout()
# plt.savefig(
#     f'{REPORT_DIR}Feature_Importance_General_HistGradientBoostingRegressor.tiff',
#     format='tiff', dpi=300
# )
# plt.close()

# # ------------------------------------------------------------
# # OPTIMAL DOSE/TIME PREDICTION FOR TARGET VIABILITY 50
# # Scans a dose-time grid to find setting closest to viability=50.
# # ------------------------------------------------------------
# def predict_viability(cancer_code):
#     predictions = []
#     for dose in np.linspace(0, 400, 50):
#         for time in allowed_times:
#             viability = model.predict([[dose, time, cancer_code]])[0]
#             predictions.append((dose, time, viability))
#     df_pred = pd.DataFrame(predictions, columns=['Dose', 'Time', 'Viability'])
#     df_pred['AbsError'] = abs(df_pred['Viability'] - 50)
#     return df_pred.loc[df_pred['AbsError'].idxmin()]

# # Generate predictions for each reliable cancer type
# results = []
# for cancer_code in tqdm(reliable_cancers, desc=f"Predicting {coumarin}"):
#     cancer_name = CancerType_Encoder.inverse_transform([cancer_code])[0]
#     count = cancer_counts.get(cancer_code, 0)
#     best = predict_viability(cancer_code)
#     results.append({
#         'Cancer Type': cancer_name,
#         'Best Dose': best['Dose'],
#         'Best Time': best['Time'],
#         'Predicted Viability': best['Viability'],
#         'Sample Count': count,
#         'Reliability': 'Reliable'
#     })

# # Save final prediction summary
# results_df = pd.DataFrame(results)
# results_df.to_csv(f'{REPORT_DIR}Prediction_Report_{coumarin}.csv', index=False)
# print(f"Prediction report for {coumarin} saved successfully.\n")

# print("\n✅ All Coumarin types processed successfully.")

## RandomForestRegressor

In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score, make_scorer
from tqdm import tqdm
import warnings
import pickle
import os
import matplotlib.pyplot as plt

warnings.filterwarnings("ignore")

# ============================================================
# CONFIGURATION
# ============================================================
DATA_PATH = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/Total_Data.csv'
RESULT_DIR = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/GridSearchingResults/Models/RandomForestRegressor'
REPORT_DIR = '/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/GridSearchingResults/Reports/RandomForestRegressor'
os.makedirs(RESULT_DIR, exist_ok=True)
os.makedirs(REPORT_DIR, exist_ok=True)

# ============================================================
# LOAD & CLEAN DATA
# ============================================================
data = pd.read_csv(DATA_PATH, encoding='utf-8')
required_columns = ['Coumarin Type', 'Cancer Type', 'Coumarin Dose', 'Time', 'Viability']
data = data[required_columns].dropna()

data['Coumarin Dose'] = pd.to_numeric(data['Coumarin Dose'], errors='coerce')
data = data[data['Coumarin Dose'] <= 400]

# Exclude Scopoletin
data = data[data['Coumarin Type'].str.lower() != 'scopoletin']
print(f"✅ Scopoletin data excluded. Remaining coumarins: {data['Coumarin Type'].unique().tolist()}")

# ============================================================
# MAIN LOOP: PROCESS EACH COUMARIN TYPE
# ============================================================
print(f"\n=== Processing ===")

# Time filter
allowed_times = [24, 48, 72, 96];
auraptene_allowed_time = [24, 48, 72];
data = data[
    ((data['Coumarin Type'] != "Auraptene") & (data['Time'].isin(allowed_times))) |
    ((data['Coumarin Type'] == "Auraptene") & (data['Time'].isin(auraptene_allowed_time)))
]
# Encode cancer types for this coumarin
cancerType_Encoder = LabelEncoder();
coumarin_Encoder = LabelEncoder();
data['Cancer Type'] = cancerType_Encoder.fit_transform(data['Cancer Type'])
data['Coumarin Type'] = coumarin_Encoder.fit_transform(data['Coumarin Type'])

# # ------------------------------------------------------------
# # RELIABILITY FILTERING
# # ------------------------------------------------------------
# cancer_counts = coumarin_data['Cancer Type'].value_counts().to_dict()
# count_df = pd.DataFrame(list(cancer_counts.items()), columns=['Cancer Type', 'Sample Count'])

# gmm = GaussianMixture(n_components=2, random_state=42)
# gmm.fit(count_df[['Sample Count']])
# threshold = np.mean(gmm.means_.flatten())

# count_df['Reliability'] = count_df['Sample Count'].apply(lambda x: 'Reliable' if x >= threshold else 'Unreliable')
# count_df.to_csv(f'{REPORT_DIR}Reliability_Threshold_{coumarin}.csv', index=False)

# reliable_cancers = count_df[count_df['Reliability'] == 'Reliable']['Cancer Type'].tolist()
# reliable_data = coumarin_data[coumarin_data['Cancer Type'].isin(reliable_cancers)].copy()

# print(f"Reliability threshold for {coumarin}: {threshold:.2f}")
# print(f"Reliable cancer types retained: {len(reliable_cancers)} of {len(cancer_counts)}")

# if reliable_data.empty:
#     print(f"⚠️ No reliable data for {coumarin}. Skipping...")
#     continue


#------<BEGIN GRIDSEARCHING FOR OPTIMAL R2>----------------

# {'bootstrap': True, 'max_depth': 20, 'max_features': 'sqrt', 'max_samples': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
# BEST R2 SCORE : 0.5946859558299303
X = data[['Coumarin Type', 'Coumarin Dose', 'Time', 'Cancer Type']]
y = data['Viability']

param_grid_ForestRegressor = {
    'n_estimators': [100, 300, 600],
    'max_depth': [None, 10, 20, 40],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 6],
    'max_features': ['auto', 'sqrt', 0.3, 0.7],
    'bootstrap': [True, False],
    'max_samples': [None, 0.7, 0.9],
}

kf = KFold(n_splits=5, shuffle=True, random_state=42);

gridSearcher_ForestRegressor = GridSearchCV(estimator=RandomForestRegressor(random_state=42),
                                            param_grid=param_grid_ForestRegressor,
                                            scoring='r2',
                                            cv=kf,
                                            n_jobs=-1,
                                            verbose=1)


gridSearcher_ForestRegressor.fit(X,y)
bestParameters_ForestRegressor = gridSearcher_ForestRegressor.best_params_;
bestR2Score_ForestRegressor = gridSearcher_ForestRegressor.best_score_;

print(f"\nBEST R2 SCORE : {bestR2Score_ForestRegressor}")

print(bestParameters_ForestRegressor);

bestPrametersDataFrame = pd.DataFrame({
                                        "ParameterName" : list(bestParameters_ForestRegressor.keys()),
                                        "OptimalParameter" : list(bestParameters_ForestRegressor.values()),
                                        "R2": [bestR2Score_ForestRegressor] * len(bestParameters_ForestRegressor)
                                        });

bestPrametersDataFrame.to_excel(RESULT_DIR + f"/best_parameters.xlsx", index=False);

print(f"============<DONE WITH GRID SEARCHING>============");


#     # ------------------------------------------------------------
#     # MODEL TRAINING (RandomForest)
#     # ------------------------------------------------------------

#     model = RandomForestRegressor(
#         n_estimators=200,
#         max_depth=20,
#         min_samples_split=5,
#         min_samples_leaf=2,
#         max_features='log2',
#         random_state=42,
#         n_jobs=-1
#     )

#     kf = KFold(n_splits=5, shuffle=True, random_state=42)
#     scorers = {
#         'MSE': make_scorer(mean_squared_error, greater_is_better=False),
#         'MAE': make_scorer(mean_absolute_error, greater_is_better=False),
#         'MedAE': make_scorer(median_absolute_error, greater_is_better=False),
#         'R2': make_scorer(r2_score)
#     }

#     cv_results = {}
#     for name, scorer in scorers.items():
#         scores = cross_val_score(model, X, y, cv=kf, scoring=scorer)
#         if name in ['MSE', 'MAE', 'MedAE']:
#             scores = -scores
#         cv_results[name] = (round(scores.mean(), 2), round(scores.std(), 2))

#     # Fit final model
#     model.fit(X, y)
#     y_train_pred = model.predict(X)
#     train_results = {
#         'MSE': (round(mean_squared_error(y, y_train_pred), 2), 0),
#         'MAE': (round(mean_absolute_error(y, y_train_pred), 2), 0),
#         'MedAE': (round(median_absolute_error(y, y_train_pred), 2), 0),
#         'R2': (round(r2_score(y, y_train_pred), 2), 0)
#     }

#     # ------------------------------------------------------------
#     # EVALUATION REPORT
#     # ------------------------------------------------------------
#     eval_report = pd.DataFrame({
#         'Metric': list(scorers.keys()),
#         'Cross_Validation': [cv_results[m][0] for m in scorers.keys()],
#         'CV_SD': [cv_results[m][1] for m in scorers.keys()],
#         'Train': [train_results[m][0] for m in scorers.keys()],
#         'Train_SD': [train_results[m][1] for m in scorers.keys()]
#     })
#     eval_report.to_csv(f'{REPORT_DIR}Evaluation_Report_{coumarin}.csv', index=False)
#     print(f"\n--- Model Evaluation ({coumarin}) ---\n{eval_report}\n")

#     # Save model
#     with open(f'{RESULT_DIR}rf_model_{coumarin}.pkl', 'wb') as f:
#         pickle.dump(model, f)

#     # ------------------------------------------------------------
#     # FEATURE IMPORTANCE
#     # ------------------------------------------------------------
#     feature_names = ['Coumarin Dose', 'Time', 'Cancer Type']
#     importance = model.feature_importances_

#     sorted_idx = np.argsort(importance)
#     sorted_features = [feature_names[i] for i in sorted_idx]
#     sorted_importance = importance[sorted_idx]

#     importance_df = pd.DataFrame({
#         'Feature': sorted_features,
#         'Importance': sorted_importance
#     })
#     importance_df.to_csv(f'{REPORT_DIR}Feature_Importance_{coumarin}_RandomForestRegressor.csv', index=False)

#     plt.figure(figsize=(8, 5))
#     plt.barh(sorted_features, sorted_importance, color='black')
#     plt.xlabel("Importance")
#     plt.title(f"Feature Importance – {coumarin} – RandomForestRegressor")
#     plt.tight_layout()
#     plt.savefig(f'{REPORT_DIR}Feature_Importance_{coumarin}_RandomForestRegressor.tiff', format='tiff', dpi=300)
#     plt.close()

#     # ------------------------------------------------------------
#     # PREDICTION REPORT
#     # ------------------------------------------------------------
#     def predict_viability(cancer_code):
#         predictions = []
#         for dose in np.linspace(0, 400, 50):
#             for time in times:
#                 viability = model.predict([[dose, time, cancer_code]])[0]
#                 predictions.append((dose, time, viability))
#         df_pred = pd.DataFrame(predictions, columns=['Dose', 'Time', 'Viability'])
#         df_pred['AbsError'] = abs(df_pred['Viability'] - 50)
#         return df_pred.loc[df_pred['AbsError'].idxmin()]

#     results = []
#     for cancer_code in tqdm(reliable_cancers, desc=f"Predicting {coumarin}"):
#         cancer_name = label_encoder.inverse_transform([cancer_code])[0]
#         count = cancer_counts.get(cancer_code, 0)
#         best = predict_viability(cancer_code)
#         results.append({
#             'Cancer Type': cancer_name,
#             'Best Dose': best['Dose'],
#             'Best Time': best['Time'],
#             'Predicted Viability': best['Viability'],
#             'Sample Count': count,
#             'Reliability': 'Reliable'
#         })

#     results_df = pd.DataFrame(results)
#     results_df.to_csv(f'{REPORT_DIR}Prediction_Report_{coumarin}.csv', index=False)
#     print(f"\nPrediction report for {coumarin} saved successfully.\n")

# print("\n✅ All Coumarin types processed successfully.")

✅ Scopoletin data excluded. Remaining coumarins: ['Auraptene', 'Esculetin', 'Galbanic Acid', 'Umbelliprenin']

=== Processing ===
Fitting 5 folds for each of 2592 candidates, totalling 12960 fits


KeyboardInterrupt: 

## Testing File

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import (RandomForestRegressor, GradientBoostingRegressor, HistGradientBoostingRegressor)
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.mixture import GaussianMixture
from sklearn.metrics import mean_squared_error, mean_absolute_error, median_absolute_error, r2_score, make_scorer
from tqdm import tqdm
import warnings
import pickle
import os
import matplotlib.pyplot as plt

# Please Change the paths to match your own enviroment.

DATA_PATH = '../Total_Data.csv'
RESULT_DIR = '../GridSearchingResults/Models/'
REPORT_DIR = '../GridSearchingReports/Reports/'

data = pd.read_csv(DATA_PATH);

data = data[['Cancer Type', 'Coumarin Type', 'Coumarin Dose', 'Time', 'Viability']].dropna();

/home/arashp/Programming_Files/DoseTimeOptimizations_Coumarins/src
['Prostate' 'Leukemia/Lymphoma' 'Breast' 'Colon' 'Glioma' 'Bone' 'Ovarian'
 'Cervical' 'Liver' 'Gastric' 'Lung' 'Pancreatic' 'Oral' 'Renal'
 'Cholangiocarcinoma' 'Salivary Gland' 'Skin' 'Melanoma']
