In [None]:
# Install Bayesian Optimization
!pip install bayesian-optimization



In [None]:
# Mount Google Drive at /content/drive; the dataset CSV is read from there below.
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from bayes_opt import BayesianOptimization

# Load the dataset, keep the first six columns and discard incomplete rows.
# Columns 0-4 are the features, column 5 is the target.
df = pd.read_csv('/content/drive/My Drive/drugdata.csv').iloc[:, :6].dropna()
X, y = df.iloc[:, :5], df.iloc[:, 5]
d = 1_000_000  # divisor applied to targets/predictions before computing metrics

# Two successive 80/20 splits: 20% test first, then 20% of the remaining 80%
# as validation -> 64% train, 16% validation, 20% test.
split_kwargs = dict(test_size=0.2, random_state=42)
X_temp, X_test, y_temp, y_test = train_test_split(X, y, **split_kwargs)
X_train, X_val, y_train, y_val = train_test_split(X_temp, y_temp, **split_kwargs)

# Feature scaler: fitted on the training split only, then applied to all three splits.
scaler_X = MinMaxScaler().fit(X_train)
X_train_scaled, X_val_scaled, X_test_scaled = (
    scaler_X.transform(part) for part in (X_train, X_val, X_test)
)

# Target scaler: same scheme; each result is flattened back to a 1-D array.
scaler_y = MinMaxScaler().fit(y_train.values.reshape(-1, 1))
y_train_scaled, y_val_scaled, y_test_scaled = (
    scaler_y.transform(part.values.reshape(-1, 1)).flatten()
    for part in (y_train, y_val, y_test)
)

# Evaluation metrics
def calculate_metrics(y_true, y_pred):
    """Compute regression error metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same number of elements as ``y_true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, rrmse, r2, aapre, rae)`` where

        * ``rrmse`` is the RMSE divided by the range (max - min) of ``y_true``;
        * ``aapre`` is the mean absolute relative error in percent, taken over
          the samples whose true value is non-zero (``nan`` if there are none);
        * ``rae`` is the summed absolute error divided by the summed absolute
          deviation of ``y_true`` from its mean.
    """
    # Flatten both inputs to 1-D float arrays so that a column vector paired
    # with a flat vector cannot silently broadcast to an (n, n) matrix in the
    # element-wise formulas below.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    abs_err = np.abs(y_true - y_pred)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    rrmse = rmse / (np.max(y_true) - np.min(y_true))
    r2 = r2_score(y_true, y_pred)

    # A relative error is undefined for a zero target: dividing by it yields
    # inf/nan and a RuntimeWarning. Those samples are left out of the average.
    nonzero = y_true != 0
    if nonzero.any():
        aapre = np.mean(abs_err[nonzero] / np.abs(y_true[nonzero])) * 100
    else:
        aapre = np.nan

    rae = np.sum(abs_err) / np.sum(np.abs(y_true - np.mean(y_true)))
    return mae, mse, rmse, rrmse, r2, aapre, rae

# Objective for the Bayesian optimizer (scored on the validation set, not the test set!)
def bayes_optimization(n_estimators_rf, max_depth_rf, n_estimators_gb, learning_rate_gb):
    """Train an RF + GB stack with the proposed hyperparameters and score it.

    The three count-like arguments arrive as floats; each is truncated to an
    integer and doubled before being handed to the estimators.
    ``learning_rate_gb`` is used as given.

    Returns
    -------
    float
        R² of the fitted stack on the scaled validation split.
    """
    rf_trees, rf_depth, gb_stages = (
        int(value) * 2 for value in (n_estimators_rf, max_depth_rf, n_estimators_gb)
    )

    base_learners = [
        ('rf', RandomForestRegressor(n_estimators=rf_trees, max_depth=rf_depth, random_state=42)),
        ('gb', GradientBoostingRegressor(n_estimators=gb_stages,
                                         learning_rate=learning_rate_gb,
                                         random_state=42)),
    ]
    candidate = StackingRegressor(estimators=base_learners, final_estimator=Ridge(), n_jobs=-1)

    # Fit on the training split only; the validation split is used purely for scoring.
    candidate.fit(X_train_scaled, y_train_scaled)
    return r2_score(y_val_scaled, candidate.predict(X_val_scaled))

# Search space handed to the optimizer. The three count-like values are
# truncated and doubled inside `bayes_optimization` before use.
pbounds = dict(
    n_estimators_rf=(25, 100),
    max_depth_rf=(5, 15),
    n_estimators_gb=(25, 100),
    learning_rate_gb=(0.01, 0.2),
)

# Build the optimizer around the validation-R² objective and run it
# with 5 initial points followed by 100 iterations.
optimizer = BayesianOptimization(f=bayes_optimization, pbounds=pbounds, random_state=42, verbose=2)
optimizer.maximize(n_iter=100, init_points=5)

# Decode the best point found by the optimizer using the same
# truncate-and-double rule that the objective function applies.
best_params = optimizer.max['params']
best_rf_trees, best_rf_depth, best_gb_stages = (
    int(best_params[key]) * 2
    for key in ('n_estimators_rf', 'max_depth_rf', 'n_estimators_gb')
)
best_base_models = [
    ('rf', RandomForestRegressor(n_estimators=best_rf_trees,
                                 max_depth=best_rf_depth,
                                 random_state=42)),
    ('gb', GradientBoostingRegressor(n_estimators=best_gb_stages,
                                     learning_rate=best_params['learning_rate_gb'],
                                     random_state=42)),
]

# Merge train + validation, refit both scalers on the merged data and
# retrain the stack on it.
X_trainval = pd.concat([X_train, X_val])
y_trainval = pd.concat([y_train, y_val])
X_trainval_scaled = scaler_X.fit_transform(X_trainval)
y_trainval_scaled = scaler_y.fit_transform(y_trainval.values.reshape(-1, 1)).flatten()

best_stacking_model = StackingRegressor(estimators=best_base_models, final_estimator=Ridge())
best_stacking_model.fit(X_trainval_scaled, y_trainval_scaled)

# Predict the held-out test set and map the predictions back to the
# original target scale.
X_test_rescaled = scaler_X.transform(X_test)
y_test_pred_scaled = best_stacking_model.predict(X_test_rescaled)
y_test_pred = scaler_y.inverse_transform(y_test_pred_scaled.reshape(-1, 1)).flatten()
y_test_true = y_test.values

# Final metrics, computed on targets and predictions divided by `d`.
(test_mae, test_mse, test_rmse, test_rrmse,
 test_r2, test_aapre, test_rae) = calculate_metrics(y_test_true / d, y_test_pred / d)

# Report: one line per metric, emitted with a single print call.
report_lines = [
    "\n--- Final Evaluation on Test Set ---",
    f"Test MAE: {test_mae}",
    f"Test MSE: {test_mse}",
    f"Test RMSE: {test_rmse:.4f}",
    f"Test RRMSE: {test_rrmse:.4f}",
    f"Test R²: {test_r2:.4f}",
    f"Test AAPRE: {test_aapre:.4f}%",
    f"Test RAE: {test_rae:.4f}",
]
print("\n".join(report_lines))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
|   iter    |  target   | learni... | max_de... | n_esti... | n_esti... |
-------------------------------------------------------------------------
| [39m1        [39m | [39m0.9177   [39m | [39m0.08116  [39m | [39m14.51    [39m | [39m79.9     [39m | [39m69.9     [39m |
| [39m2        [39m | [39m0.8936   [39m | [39m0.03964  [39m | [39m6.56     [39m | [39m29.36    [39m | [39m89.96    [39m |
| [39m3        [39m | [39m0.914    [39m | [39m0.1242   [39m | [39m12.08    [39m | [39m26.54    [39m | [39m97.74    [39m |
| [35m4        [39m | [35m0.9244   [39m | [35m0.1682   [39m | [35m7.123    [39m | [35m38.64    [39m | [35m38.76    [39m |
| [39m5        [39m | [39m0.9188   [39m | [39m0.06781  [39m | [39m10.25    [39m | [39m57.4     [39m | [39m46.84    [39m |
| [39m6        [39m | [39m0.9201   [39m | [39m0.

KeyboardInterrupt: 

In [None]:
# Perform cross-validation on the selected model.
# Import necessary libraries (Google Drive is mounted first; the CSV is read from it below).
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge

# Load data
# Keep the first six columns (five features + target) and drop incomplete
# rows, i.e. the same cleaning the hyperparameter-tuning cell applies, so
# both cells work on the same set of samples.
df = pd.read_csv('/content/drive/My Drive/drugdata.csv')
df = df.iloc[:, :6].dropna()
X = df.iloc[:, :5]
y = df.iloc[:, 5]
d = 1000000  # divisor applied to targets and predictions before computing metrics

# Scale the data
scaler_X = MinMaxScaler()
X_scaled = scaler_X.fit_transform(X)

scaler_y = MinMaxScaler()
y_scaled = scaler_y.fit_transform(y.values.reshape(-1, 1)).flatten()

# Function to calculate evaluation metrics including RAE
def calculate_metrics(y_true, y_pred):
    """Compute regression error metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, same number of elements as ``y_true``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, rrmse, r2, aapre, rae)`` where

        * ``rrmse`` is the RMSE divided by the range (max - min) of ``y_true``;
        * ``aapre`` is the mean absolute relative error in percent, taken over
          the samples whose true value is non-zero (``nan`` if there are none);
        * ``rae`` is the summed absolute error divided by the summed absolute
          deviation of ``y_true`` from its mean.
    """
    # Flatten both inputs to 1-D float arrays so that a column vector paired
    # with a flat vector cannot silently broadcast to an (n, n) matrix in the
    # element-wise formulas below.
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    abs_err = np.abs(y_true - y_pred)

    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    rrmse = rmse / (np.max(y_true) - np.min(y_true))
    r2 = r2_score(y_true, y_pred)

    # A relative error is undefined for a zero target: dividing by it yields
    # inf/nan and a RuntimeWarning. Those samples are left out of the average.
    nonzero = y_true != 0
    if nonzero.any():
        aapre = np.mean(abs_err[nonzero] / np.abs(y_true[nonzero])) * 100
    else:
        aapre = np.nan

    # RAE calculation
    rae = np.sum(abs_err) / np.sum(np.abs(y_true - np.mean(y_true)))

    return mae, mse, rmse, rrmse, r2, aapre, rae

# Base learners of the stacking ensemble, with fixed hyperparameters.
rf_base = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
gb_base = GradientBoostingRegressor(n_estimators=200, learning_rate=0.2, random_state=42)
base_models = [('rf', rf_base), ('gb', gb_base)]

# Stacking ensemble: a Ridge regression meta-model sits on top of the base learners.
stacking_model = StackingRegressor(estimators=base_models, final_estimator=Ridge())

# Set up 4-fold cross-validation
kf = KFold(n_splits=4, shuffle=True, random_state=42)

# Targets in their original units, indexable by fold position.
y_raw = y.to_numpy()

# List to store all metrics for each fold
all_metrics = []

for fold, (train_index, test_index) in enumerate(kf.split(X), 1):
    # Ground truth is kept in original units. The predictions are
    # de-normalized before scoring, so scoring them against min-max scaled
    # targets would mix two different scales and make every metric meaningless.
    y_train_true, y_test_true = y_raw[train_index], y_raw[test_index]

    # Fit the scalers on the training fold only, so the held-out fold does
    # not influence preprocessing (no leakage into the evaluation).
    fold_scaler_X = MinMaxScaler()
    X_train_fold = fold_scaler_X.fit_transform(X.iloc[train_index])
    X_test_fold = fold_scaler_X.transform(X.iloc[test_index])

    fold_scaler_y = MinMaxScaler()
    y_train_fold = fold_scaler_y.fit_transform(y_train_true.reshape(-1, 1)).flatten()

    # Fit the stacking model on the scaled training fold
    stacking_model.fit(X_train_fold, y_train_fold)

    # Make predictions (still in the scaled target space)
    y_train_pred_scaled_fold = stacking_model.predict(X_train_fold)
    y_test_pred_scaled_fold = stacking_model.predict(X_test_fold)

    # Denormalize the predictions back to original target units
    y_train_pred_fold = fold_scaler_y.inverse_transform(y_train_pred_scaled_fold.reshape(-1, 1)).flatten()
    y_test_pred_fold = fold_scaler_y.inverse_transform(y_test_pred_scaled_fold.reshape(-1, 1)).flatten()

    # Calculate metrics for training and testing sets; truth and prediction
    # are both in original units and both divided by `d`.
    train_mae, train_mse, train_rmse, train_rrmse, train_r2, train_aapre, train_rae = calculate_metrics(y_train_true / d, y_train_pred_fold / d)
    test_mae, test_mse, test_rmse, test_rrmse, test_r2, test_aapre, test_rae = calculate_metrics(y_test_true / d, y_test_pred_fold / d)

    # Store metrics for this fold
    all_metrics.append({
        'fold': fold,
        'train_mae': train_mae, 'train_mse': train_mse, 'train_rmse': train_rmse,
        'train_rrmse': train_rrmse, 'train_r2': train_r2, 'train_aapre': train_aapre, 'train_rae': train_rae,
        'test_mae': test_mae, 'test_mse': test_mse, 'test_rmse': test_rmse,
        'test_rrmse': test_rrmse, 'test_r2': test_r2, 'test_aapre': test_aapre, 'test_rae': test_rae
    })

# Convert the list of dictionaries to a DataFrame for easier analysis
metrics_df = pd.DataFrame(all_metrics)

# Calculate the mean of the metrics across all folds.
# 'fold' is only an identifier, so it is excluded from the average: its mean
# is not a metric. errors='ignore' keeps this working if the column is absent
# (e.g. an empty result list).
mean_metrics = metrics_df.drop(columns='fold', errors='ignore').mean()
print("Average Metrics Across Folds:")
print(mean_metrics)

# # Optionally, save the metrics to a CSV file
# metrics_df.to_csv('/content/drive/My Drive/stacking_model_cv_results.csv', index=False)


Mounted at /content/drive


  aapre = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
  aapre = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
  aapre = np.mean(np.abs((y_true - y_pred) / y_true)) * 100


Average Metrics Across Folds:
fold           2.500000e+00
train_mae      2.255797e-04
train_mse      2.045773e-07
train_rmse     4.470136e-04
train_rrmse    4.815856e+02
train_r2      -2.508579e+07
train_aapre             inf
train_rae      5.144466e+03
test_mae       2.223443e-04
test_mse       1.692787e-07
test_rmse      4.107105e-04
test_rrmse     5.632285e+02
test_r2       -2.544133e+07
test_aapre              inf
test_rae       5.332296e+03
dtype: float64


  aapre = np.mean(np.abs((y_true - y_pred) / y_true)) * 100


In [None]:
# Display the per-fold metric dictionaries collected by the cross-validation loop.
all_metrics

[{'fold': 1,
  'train_mae': 0.0002542388945143164,
  'train_mse': 2.7821935253254157e-07,
  'train_rmse': 0.0005274650249377125,
  'train_rrmse': 527.4650249377124,
  'train_r2': -30013568.45729348,
  'train_aapre': inf,
  'train_rae': 5395.652058613807,
  'test_mae': 0.00022630908598068525,
  'test_mse': 1.6156643806920122e-07,
  'test_rmse': 0.00040195327846554655,
  'test_rrmse': 695.9002632965554,
  'test_r2': -39819711.96613975,
  'test_aapre': 9011516.62121235,
  'test_rae': 6739.682671462682},
 {'fold': 2,
  'train_mae': 0.00023890046161613272,
  'train_mse': 2.3578945021082066e-07,
  'train_rmse': 0.0004855815587631193,
  'train_rrmse': 485.5815587631193,
  'train_r2': -26323626.63345711,
  'train_aapre': inf,
  'train_rae': 5094.763406152443,
  'test_mae': 0.00020043082571533164,
  'test_mse': 1.4050279391364615e-07,
  'test_rmse': 0.0003748370231362507,
  'test_rrmse': 551.3096895573537,
  'test_r2': -28206642.70280763,
  'test_aapre': 14938608.624251941,
  'test_rae': 5837.6