# Primer intento Optuna

In [29]:
import pandas as pd 
import optuna as opt
import numpy as np
from sklearn.ensemble import RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import plotly

In [3]:
def series_to_supervised(data, n_in=1, n_out=1, dropnan=True):
   """
   Frame a time series as a supervised learning dataset.
   Arguments:
   data: Sequence of observations as a list or NumPy array.
   n_in: Number of lag observations as input (X).
   n_out: Number of observations as output (y).
   """
   df = pd.DataFrame(data)
   col_names = df.columns
   cols, names = list(), list()
   # input sequence (t-n, ... t-1)
   for i in range(n_in, 0, -1):
      cols.append(df.shift(i))
      names += [(f'{col}(t-{i})') for col in col_names]
   # forecast sequence (t, t+1, ... t+n)
   for i in range(0, n_out):
      cols.append(df.shift(-i))
   if i == 0:
      names += [(f'{col}(t)') for col in col_names]
   else:
      names += [(f'{col}(t+{i})') for col in col_names]
   # put it all together
   agg = pd.concat(cols, axis=1)
   agg.columns = names
   # drop rows with NaN values
   if dropnan:
      agg.dropna(inplace=True)
   return agg

In [4]:
df = pd.read_csv('../data/processed/processed_data_energy.csv')
df_supervised = series_to_supervised(df, n_in = 3, n_out = 1)

In [5]:
df_supervised.head()

Unnamed: 0,hora(t-3),dia(t-3),mes(t-3),anio(t-3),tmax-cab(t-3),tmax-hmo(t-3),tmax-obr(t-3),tmax-lmo(t-3),tmax-cul(t-3),tmin-cab(t-3),...,martes_postfestivo(t),semana_santa(t),1_mayo(t),10_mayo(t),16_sep(t),2_nov.(t),pre-navidad_y_new_year(t),navidad_y_new_year(t),post-navidad_y_new_year(t),demanda_energia(t)
3,0.0,1.0,1.0,2007.0,21.0,22.0,25.0,30.0,29.0,2.0,...,0,0,0,0,0,0,0,1,0,1222
4,1.0,1.0,1.0,2007.0,21.0,22.0,25.0,30.0,29.0,2.0,...,0,0,0,0,0,0,0,1,0,1168
5,2.0,1.0,1.0,2007.0,21.0,22.0,25.0,30.0,29.0,2.0,...,0,0,0,0,0,0,0,1,0,1128
6,3.0,1.0,1.0,2007.0,21.0,22.0,25.0,30.0,29.0,2.0,...,0,0,0,0,0,0,0,1,0,1100
7,4.0,1.0,1.0,2007.0,21.0,22.0,25.0,30.0,29.0,2.0,...,0,0,0,0,0,0,0,1,0,1083


In [6]:
X = df_supervised[df_supervised.columns[0:-1]]
y = df_supervised[df_supervised.columns[-1]]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=12)

In [7]:
def objective(trial):
    # Suggest values for hyperparameters
    n_estimators = trial.suggest_int("n_estimators", 10, 200, log=True)
    max_depth = trial.suggest_int("max_depth", 2, 32)
    min_samples_split = trial.suggest_int("min_samples_split", 2, 10)
    min_samples_leaf = trial.suggest_int("min_samples_leaf", 1, 10)

    # Create and fit random forest model
    model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42
    )
    model.fit(X_train, y_train)

    # Make predictions and calculate RMSE
    y_pred = model.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))

    # Return MAE
    return rmse

In [8]:
# Create study object
study = opt.create_study(direction="minimize")

# Run optimization process
study.optimize(objective, n_trials=5, show_progress_bar=True)

[I 2024-05-21 13:58:38,388] A new study created in memory with name: no-name-09cccb79-392d-463e-8db1-cb337ea81a57


  0%|          | 0/5 [00:00<?, ?it/s]

[I 2024-05-21 13:59:37,029] Trial 0 finished with value: 25.850011871367492 and parameters: {'n_estimators': 23, 'max_depth': 24, 'min_samples_split': 6, 'min_samples_leaf': 5}. Best is trial 0 with value: 25.850011871367492.
[I 2024-05-21 14:00:44,330] Trial 1 finished with value: 46.239515848776 and parameters: {'n_estimators': 55, 'max_depth': 8, 'min_samples_split': 2, 'min_samples_leaf': 6}. Best is trial 0 with value: 25.850011871367492.
[I 2024-05-21 14:02:22,748] Trial 2 finished with value: 65.60515135371158 and parameters: {'n_estimators': 115, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 3}. Best is trial 0 with value: 25.850011871367492.
[I 2024-05-21 14:02:33,044] Trial 3 finished with value: 144.2638852372342 and parameters: {'n_estimators': 27, 'max_depth': 3, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 0 with value: 25.850011871367492.
[I 2024-05-21 14:09:07,114] Trial 4 finished with value: 29.709543789917166 and parameters: {'n_estima

In [28]:
# Import optuna.visualization
import optuna.visualization as vis

# Plot optimization history
vis.plot_optimization_history(study)

In [23]:
# Plot parameter importance
vis.plot_param_importances(study)

In [25]:
# Plot contour plot
fig_contour = vis.plot_contour(study, params=["min_samples_split", "min_samples_leaf"])
fig_contour

In [31]:
plotly.offline.plot(fig_contour, filename='images/fig_contour')

'images/fig_contour.html'

In [29]:
# Plot parallel_coordinate
vis.plot_parallel_coordinate(study)