# MultiOutput Gradient Boosting Regressor

In [1]:
import logging
from typing import Any, Callable, Dict, List, Tuple

import matplotlib.pyplot as plt
import numpy as np
import optuna
import pandas as pd
from optuna.trial import FrozenTrial, TrialState
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn import metrics, preprocessing, model_selection, pipeline, ensemble

optuna.logging.set_verbosity(logging.ERROR)

### Define Random State and Test Size

In [2]:
RANDOM_STATE = 35
TEST_SIZE = 0.20

### Read In the Data

In [3]:
data = pd.read_csv("2022_08_29_AllCycles_LHS.csv")
data.head()

Unnamed: 0,Conductivity,CycleNumber,Porosity,Permeability,AverageFiberDiameter,MeanPoreDiameter,VoltageEfficiency,CoulombicEfficiency,EnergyEfficiency,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
0,67.326517,2,0.926359,1.7e-10,1.4e-05,0.000135,0.746404,0.956847,0.714195,,,,
1,67.326517,4,0.926359,1.7e-10,1.4e-05,0.000135,0.746284,0.957371,0.71447,,,,
2,67.326517,5,0.926359,1.7e-10,1.4e-05,0.000135,0.746303,0.95718,0.714346,,,,
3,67.326517,6,0.926359,1.7e-10,1.4e-05,0.000135,0.746261,0.956961,0.714142,,,,
4,86.086664,2,0.818147,3.55e-11,2e-05,0.000124,0.751707,0.952122,0.715717,,,,


### Define Features and Targets

In [4]:
features = [
    "Conductivity",
    "CycleNumber",
    "Porosity",
    "Permeability",
    "AverageFiberDiameter",
    "MeanPoreDiameter"
] 

targets =  [
    "VoltageEfficiency",
    "CoulombicEfficiency",
    "EnergyEfficiency"
]

### Split Data into Training and Testing Subsets

In [5]:
train_data, test_data = model_selection.train_test_split(data, test_size=TEST_SIZE)

In [6]:
train_data.head()

Unnamed: 0,Conductivity,CycleNumber,Porosity,Permeability,AverageFiberDiameter,MeanPoreDiameter,VoltageEfficiency,CoulombicEfficiency,EnergyEfficiency,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12
149,104.237553,5,0.81326,2.78e-11,1.8e-05,0.000143,0.755884,0.9418,0.711892,,,,
382,68.362214,4,0.806097,1.44e-11,1.4e-05,0.000191,0.742333,0.950476,0.70557,,,,
332,105.866825,3,0.863991,5.25e-11,1.7e-05,0.000183,0.756375,0.95379,0.721423,,,,
59,75.633303,3,0.751791,8.23e-12,1.5e-05,0.000141,0.746558,0.912899,0.681532,,,,
115,102.8834,2,0.900617,1.04e-09,1e-05,0.000111,0.760353,0.952827,0.724485,,,,


In [7]:
x_train = train_data[features]
y_train = train_data[targets]

x_test = test_data[features]
y_test = test_data[targets]

### Define Model Scoring Metrics 
- Root Mean Squared Error (RMSE) 
- Mean Absolute Percentage Error (MAPE)

In [8]:
def print_results(y_true: np.ndarray, y_pred: np.ndarray) -> None:
    print(f"RMSE = {mean_squared_error(y_true, y_pred, squared=False):,.6f}")
    print(f"MAPE = {mean_absolute_percentage_error(y_true, y_pred):.5%}")


### Baseline Multioutput Gradient Boosting Regressor 
**Baseline**: Before Hyperparameter Tuning to get Final Surrogate Model

#### Training the Model and Printing the MAPE and RMSE Scores for the Testing Data and Training Data, Respectively

In [9]:
model = MultiOutputRegressor(GradientBoostingRegressor())
model.fit(x_train, y_train)
y_pred = model.predict(x_test)
print_results(y_test, y_pred)

RMSE = 0.005093
MAPE = 0.31255%


In [10]:
y_pred_train = model.predict(x_train)
print_results(y_train, y_pred_train)

RMSE = 0.002286
MAPE = 0.14714%


### Creating Optuna Study for the Baseline GBR Model

In [11]:
ResponseVector = Tuple[float, float, float, float, float, float]
ObjectiveFunction = Callable[[optuna.Trial], ResponseVector]


def create_objective(model: MultiOutputRegressor) -> ObjectiveFunction:
    def objective(trial: optuna.Trial) -> ResponseVector:
        x = [
            trial.suggest_float("Conductivity", 61.32085384, 109.7652427),
            trial.suggest_float("CycleNumber", 2, 6),
            trial.suggest_float("Porosity", 0.7217427, 0.949060441),
            trial.suggest_float("Permeability", 4.40E-12, 1.14E-09),
            trial.suggest_float("AverageFiberDiameter", 1.00E-05, 2.00E-05),
            trial.suggest_float("MeanPoreDiameter", 0.0001, 0.000198985)
        ]
        x = pd.DataFrame([x], columns=features)
        y = model.predict(x)
        return tuple(y.ravel())

    return objective

In [12]:
study = optuna.study.create_study(
    storage="sqlite:///2022_10_23_MultiOutputRegressor_GBR.db",
    sampler=optuna.samplers.NSGAIISampler(),
    study_name="OptunaTrial_35_GBR",
    load_if_exists=True,
    directions=["maximize" for _ in range(len(targets))],
)

In [13]:
model = MultiOutputRegressor(GradientBoostingRegressor())
model.fit(x_train, y_train)
objective_function = create_objective(model)

In [14]:
study.optimize(objective_function, n_trials=250, show_progress_bar=True)

  self._init_valid()


  0%|          | 0/250 [00:00<?, ?it/s]

In [15]:
def frozen_trials_to_frame(trials: List[FrozenTrial]) -> pd.DataFrame:
    return pd.DataFrame([frozen_trial_to_dict(trial) for trial in trials])


def frozen_trial_to_dict(trial: FrozenTrial) -> Dict[str, Any]:
    return {
        "number": trial.number,
        "state": trial.state,
        "values": trial.values,
        "datetime_start": trial.datetime_start,
        "datetime_complete": trial.datetime_complete,
        "params": trial.params,
        "distributions": trial.distributions,
        "user_attrs": trial.user_attrs,
        "system_attrs": trial.system_attrs,
        "intermediate_values": trial.intermediate_values,
    }

In [16]:
## Make Sure ascending is equal to false
trials = frozen_trials_to_frame(study.get_trials())
trials = trials.loc[trials["state"] == TrialState.COMPLETE]
trials["max_value"] = trials["values"].apply(np.max)
trials["mean_value"] = trials["values"].apply(np.mean)
trials.sort_values(by=["max_value"], ascending=False, inplace=True)
trials.iloc[0].loc["params"]


{'Conductivity': 108.28379058632942,
 'CycleNumber': 5.599871489767488,
 'Porosity': 0.9267730938856802,
 'Permeability': 6.679223673475659e-10,
 'AverageFiberDiameter': 1.992131983791198e-05,
 'MeanPoreDiameter': 0.00012736467870438486}

In [17]:
trials.params.iloc[:10]

386    {'Conductivity': 108.28379058632942, 'CycleNum...
353    {'Conductivity': 108.28379058632942, 'CycleNum...
459    {'Conductivity': 108.28379058632942, 'CycleNum...
467    {'Conductivity': 108.28379058632942, 'CycleNum...
282    {'Conductivity': 108.28379058632942, 'CycleNum...
387    {'Conductivity': 108.28379058632942, 'CycleNum...
442    {'Conductivity': 108.28379058632942, 'CycleNum...
477    {'Conductivity': 108.28379058632942, 'CycleNum...
430    {'Conductivity': 108.28379058632942, 'CycleNum...
405    {'Conductivity': 108.28379058632942, 'CycleNum...
Name: params, dtype: object

In [18]:
top_ten_design_params = trials.params.iloc[:10]
top_ten_design_params = pd.DataFrame.from_records(top_ten_design_params.to_list(), index=top_ten_design_params.index)
top_ten_design_params

Unnamed: 0,Conductivity,CycleNumber,Porosity,Permeability,AverageFiberDiameter,MeanPoreDiameter
386,108.283791,5.599871,0.926773,6.679224e-10,2e-05,0.000127
353,108.283791,4.553355,0.926773,1.111423e-09,1.9e-05,0.000127
459,108.283791,4.553355,0.926773,6.679224e-10,1.9e-05,0.000127
467,108.283791,5.618011,0.926773,7.64272e-10,1.9e-05,0.000127
282,108.283791,5.618011,0.926773,1.478988e-10,1.9e-05,0.000127
387,108.283791,5.600676,0.926773,8.033383e-10,1.9e-05,0.000127
442,108.283791,5.618011,0.926773,1.478988e-10,1.9e-05,0.000127
477,108.283791,5.618011,0.926773,1.478988e-10,1.9e-05,0.000127
430,108.283791,5.625892,0.922408,4.492128e-10,1.9e-05,0.000156
405,108.283791,3.655492,0.893776,4.492128e-10,1.9e-05,0.000127


In [19]:
pd.DataFrame(model.predict(top_ten_design_params), columns=targets, index=top_ten_design_params.index)

Unnamed: 0,VoltageEfficiency,CoulombicEfficiency,EnergyEfficiency
386,0.75886,0.964739,0.731888
353,0.759042,0.964469,0.731592
459,0.759042,0.964469,0.731592
467,0.759042,0.964465,0.731592
282,0.759042,0.964465,0.731592
387,0.759042,0.964465,0.731592
442,0.759042,0.964465,0.731592
477,0.759042,0.964465,0.731592
430,0.758531,0.964137,0.731078
405,0.758492,0.963203,0.729482


# Hyperparameter Tuning Using Bayesian Methods and NSGA-II Optimizer 

### Creating the Optuna Study with the Hyperparameters 

In [20]:
def hyperparameter_objective(trial: optuna.Trial) -> float:
    hyperparams = {
        "learning_rate": trial.suggest_float("learning_rate", 0.0001, 1, log=True),
        "subsample": trial.suggest_float("subsample", 0.5, 0.95),
        "min_samples_split": trial.suggest_int("min_samples_split", 2, 200),
        "max_depth": trial.suggest_int("max_depth", 1, 50),
        "alpha": trial.suggest_float("alpha", 0.001, 0.99)
    }

    model = MultiOutputRegressor(GradientBoostingRegressor(
            n_estimators=1_000,
            n_iter_no_change=10,
            random_state=RANDOM_STATE,
            **hyperparams,
        )
    )
    model.fit(x_train, y_train)

    y_pred = model.predict(x_test)
    MAPE = mean_absolute_percentage_error(
        y_test, y_pred, multioutput="uniform_average"#, squared=False
    )
    return MAPE

In [21]:
model_selection_study = optuna.study.create_study(
    storage="sqlite:///model_selection.db",
    sampler=optuna.samplers.NSGAIISampler(),
    study_name="OptunaTrial_35_GBR",
    load_if_exists=True,
    direction="minimize",
)

In [22]:
model_selection_study.optimize(
    hyperparameter_objective, n_trials=250, show_progress_bar=True
)

  self._init_valid()


  0%|          | 0/250 [00:00<?, ?it/s]

In [23]:
#Want the MAPE to be Ascending since we want the hyperparameters with the smallest Error
trials = frozen_trials_to_frame(model_selection_study.get_trials())
trials = trials.loc[trials["state"] == TrialState.COMPLETE]
trials["values"] = trials["values"].apply(lambda x: x[0])
trials.sort_values(by=["values"], ascending=True, inplace=True)
trials.iloc[0].loc["params"]

{'learning_rate': 0.3305386167919417,
 'subsample': 0.9256945121041953,
 'min_samples_split': 31,
 'max_depth': 48,
 'alpha': 0.08947432516111173}

In [24]:
best_model = MultiOutputRegressor(
    GradientBoostingRegressor(
        n_estimators=1000,
        n_iter_no_change=10,
        random_state=RANDOM_STATE,
        **trials.iloc[0].loc["params"],
    )
)
best_model.fit(x_train, y_train)
y_pred_test = best_model.predict(x_test)
print_results(y_test, y_pred_test)

RMSE = 0.004512
MAPE = 0.27929%


In [25]:
y_pred_train = best_model.predict(x_train)
print_results(y_train, y_pred_train)

RMSE = 0.002488
MAPE = 0.16010%


In [26]:
parameters = trials.iloc[0].loc["params"]
parameters = {f"estimator__{k}": [v] for k,v in parameters.items()}
parameters

{'estimator__learning_rate': [0.3305386167919417],
 'estimator__subsample': [0.9256945121041953],
 'estimator__min_samples_split': [31],
 'estimator__max_depth': [48],
 'estimator__alpha': [0.08947432516111173]}

In [27]:
from sklearn.model_selection import GridSearchCV
RandomForestRegressor
grid_search = GridSearchCV(MultiOutputRegressor(GradientBoostingRegressor()), parameters)
grid_search

GridSearchCV(estimator=MultiOutputRegressor(estimator=GradientBoostingRegressor()),
             param_grid={'estimator__alpha': [0.08947432516111173],
                         'estimator__learning_rate': [0.3305386167919417],
                         'estimator__max_depth': [48],
                         'estimator__min_samples_split': [31],
                         'estimator__subsample': [0.9256945121041953]})

In [28]:
grid_search.fit(data[features], data[targets])

GridSearchCV(estimator=MultiOutputRegressor(estimator=GradientBoostingRegressor()),
             param_grid={'estimator__alpha': [0.08947432516111173],
                         'estimator__learning_rate': [0.3305386167919417],
                         'estimator__max_depth': [48],
                         'estimator__min_samples_split': [31],
                         'estimator__subsample': [0.9256945121041953]})

In [29]:
pd.DataFrame(grid_search.cv_results_).T

Unnamed: 0,0
mean_fit_time,0.234718
std_fit_time,0.008271
mean_score_time,0.005829
std_score_time,0.000828
param_estimator__alpha,0.089474
param_estimator__learning_rate,0.330539
param_estimator__max_depth,48
param_estimator__min_samples_split,31
param_estimator__subsample,0.925695
params,"{'estimator__alpha': 0.08947432516111173, 'est..."


In [30]:
ResponseVectorOpt = Tuple[float, float, float, float, float, float]
ObjectiveFunctionOpt = Callable[[optuna.Trial], ResponseVectorOpt]


def create_objective(model: MultiOutputRegressor) -> ObjectiveFunctionOpt:
    def objective(trial: optuna.Trial) -> ResponseVectorOpt:
        x = [
            trial.suggest_float("Conductivity", 61.32085384, 109.7652427),
            trial.suggest_float("CycleNumber", 2, 6),
            trial.suggest_float("Porosity", 0.7217427, 0.949060441),
            trial.suggest_float("Permeability", 4.40E-12, 1.14E-09),
            trial.suggest_float("AverageFiberDiameter", 1.00E-05, 2.00E-05),
            trial.suggest_float("MeanPoreDiameter", 0.0001, 0.000198985)
        ]
        x = pd.DataFrame([x], columns=features)
        y = model.predict(x)
        return tuple(y.ravel())

    return objective

In [31]:
study = optuna.study.create_study(
    storage="sqlite:///2022_10_23_MultiOutputRegressor_GBROpt.db",
    sampler=optuna.samplers.NSGAIISampler(),
    study_name="OptunaTrial_35_GBROpt",
    load_if_exists=True,
    directions=["maximize" for _ in range(len(targets))],
)

In [32]:
{'estimator__learning_rate': [0.3305386167919417],
 'estimator__subsample': [0.9325988249887232],
 'estimator__min_samples_split': [3],
 'estimator__max_depth': [15],
 'estimator__alpha': [0.3892732881795934]}

{'estimator__learning_rate': [0.3305386167919417],
 'estimator__subsample': [0.9325988249887232],
 'estimator__min_samples_split': [3],
 'estimator__max_depth': [15],
 'estimator__alpha': [0.3892732881795934]}

In [33]:
model_opt = MultiOutputRegressor(GradientBoostingRegressor(alpha=0.3892732881795934,
                                                           learning_rate= 0.3305386167919417,
                                                           max_depth=15,
                                                           min_samples_split=3,
                                                           subsample=0.9325988249887232))
                                                       
                                                       
    
model_opt.fit(x_train, y_train)
objective_function = create_objective(model)

In [34]:
study.optimize(objective_function, n_trials=250, show_progress_bar=True)

  self._init_valid()


  0%|          | 0/250 [00:00<?, ?it/s]

In [35]:
def frozen_trials_to_frame(trials: List[FrozenTrial]) -> pd.DataFrame:
    return pd.DataFrame([frozen_trial_to_dict(trial) for trial in trials])


def frozen_trial_to_dict(trial: FrozenTrial) -> Dict[str, Any]:
    return {
        "number": trial.number,
        "state": trial.state,
        "values": trial.values,
        "datetime_start": trial.datetime_start,
        "datetime_complete": trial.datetime_complete,
        "params": trial.params,
        "distributions": trial.distributions,
        "user_attrs": trial.user_attrs,
        "system_attrs": trial.system_attrs,
        "intermediate_values": trial.intermediate_values,
    }

In [57]:
trials = frozen_trials_to_frame(study.get_trials())
trials = trials.loc[trials["state"] == TrialState.COMPLETE]
trials["max_value"] = trials["values"].apply(np.max)
trials["mean_value"] = trials["values"].apply(np.mean)
trials.sort_values(by=["max_value"], ascending=True, inplace=True)
trials.iloc[0].loc["params"]

{'AverageFiberDiameter': 1.8245427191053035e-05,
 'Conductivity': 68.3147704722198,
 'CycleNumber': 3.233640045257663,
 'MeanPoreDiameter': 0.0001957479666083721,
 'Permeability': 1.5028237950985855e-10,
 'Porosity': 0.7614592207112789}

In [58]:
trials.params.iloc[:10]

48     {'AverageFiberDiameter': 1.8245427191053035e-0...
141    {'AverageFiberDiameter': 1.1150708949394157e-0...
42     {'AverageFiberDiameter': 1.0147778391442922e-0...
0      {'AverageFiberDiameter': 1.2026442628977904e-0...
22     {'AverageFiberDiameter': 1.0608951614277198e-0...
26     {'AverageFiberDiameter': 1.1137106494849194e-0...
162    {'AverageFiberDiameter': 1.1826108007402182e-0...
415    {'Conductivity': 107.39500191591281, 'CycleNum...
293    {'Conductivity': 63.405927368538485, 'CycleNum...
242    {'AverageFiberDiameter': 1.1380025465045386e-0...
Name: params, dtype: object

In [59]:
# results = pd.DataFrame(trials.params)

# results.head()

In [60]:
# results.sort_values(["EnergyEfficiency"], ascending=False)

In [61]:
top_ten_design_params_tuned = trials.params.iloc[:10]
top_ten_design_params_tuned = pd.DataFrame.from_records(top_ten_design_params_tuned.to_list(), index=top_ten_design_params_tuned.index)
top_ten_design_params_tuned

Unnamed: 0,AverageFiberDiameter,Conductivity,CycleNumber,MeanPoreDiameter,Permeability,Porosity
48,1.8e-05,68.31477,3.23364,0.000196,1.502824e-10,0.761459
141,1.1e-05,74.701724,2.06438,0.000194,1.031847e-09,0.818969
42,1e-05,71.054193,2.932544,0.000117,6.878695e-10,0.769662
0,1.2e-05,66.633639,2.269934,0.000185,4.421823e-10,0.764098
22,1.1e-05,96.167219,2.080856,0.000184,3.114766e-10,0.735746
26,1.1e-05,93.245291,3.628675,0.000116,7.610185e-10,0.755395
162,1.2e-05,106.230795,2.676809,0.0001,1.129623e-09,0.755251
415,1.9e-05,107.395002,3.553031,0.0001,9.988287e-10,0.943366
293,1.9e-05,63.405927,5.679608,0.0001,2.224271e-10,0.944782
242,1.1e-05,98.242651,4.598581,0.000135,8.391054e-10,0.755553


In [62]:
pd.DataFrame(model.predict(top_ten_design_params_tuned), columns=targets, index=top_ten_design_params_tuned.index)

Unnamed: 0,VoltageEfficiency,CoulombicEfficiency,EnergyEfficiency
48,0.74331,0.951836,0.71053
141,0.74331,0.951836,0.71053
42,0.74331,0.951836,0.71053
0,0.74331,0.951836,0.71053
22,0.74331,0.951836,0.71053
26,0.74331,0.951836,0.71053
162,0.74331,0.951836,0.71053
415,0.74331,0.951836,0.71053
293,0.74331,0.951836,0.71053
242,0.74331,0.951836,0.71053
