In [None]:
import sys
from pathlib import Path

# Make the project's src/ directory importable. The path is resolved relative to
# the current working directory (its parent folder + "src").
# Guarded so that re-running this cell does not append duplicate sys.path entries.
SRC_DIR = str(Path().resolve().parent / "src")
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)

In [None]:
import pandas as pd
from paths import TRANSFORMED_DATA_DIR

# Load the transformed data from its parquet file and display the resulting frame
parquet_path = TRANSFORMED_DATA_DIR / 'tabular_data_year_2024_4weeks_lags.parquet'
df = pd.read_parquet(parquet_path)

df

In [None]:
from data_split import train_test_split
from datetime import datetime

# Split the data into train / test features and targets using a cutoff date
split_cutoff = datetime(2024, 1, 25)
X_train, y_train, X_test, y_test = train_test_split(
    df, cutoff_date=split_cutoff, target_column_name='target'
)

# Report the shape of every split, one per line
print(
    f'{X_train.shape=}',
    f'{y_train.shape=}',
    f'{X_test.shape=}',
    f'{y_test.shape=}',
    sep='\n',
)

In [None]:
# Dependency: optuna. If it is not installed yet, install it once with one of:
#   poetry add optuna
#   %pip install optuna

In [None]:
# Importamos las librerías necesarias
import numpy as np
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_absolute_error
import optuna


# Usamos un pipeline
from model import get_pipeline

# Optuna objective function
def objective(trial: optuna.trial.Trial) -> float:
    """
    Score one hyper-parameter candidate with time-ordered cross-validation.

    Hyper-parameters are sampled from `trial`; for every split produced by a
    3-split `TimeSeriesSplit` over the notebook-level `X_train` / `y_train`, a
    new pipeline is built with `get_pipeline`, fitted on the training rows and
    scored on the validation rows.

    Args:
        trial: Optuna trial used to sample the hyper-parameter values.

    Returns:
        The mean of the per-split mean absolute errors.
    """
    # Fixed settings first, then the sampled search space (order is preserved)
    params = dict(
        metric='mae',
        verbose=-1,
        num_leaves=trial.suggest_int("num_leaves", 2, 256),
        feature_fraction=trial.suggest_float("feature_fraction", 0.2, 1.0),
        bagging_fraction=trial.suggest_float("bagging_fraction", 0.2, 1.0),
        min_child_samples=trial.suggest_int("min_child_samples", 3, 100),
    )

    # n_splits=3 keeps the example quick
    splitter = TimeSeriesSplit(n_splits=3)

    def fold_mae(fit_idx, val_idx):
        """Fit a new pipeline on one split and return its validation MAE."""
        X_fit, X_val = X_train.iloc[fit_idx, :], X_train.iloc[val_idx, :]
        y_fit, y_val = y_train.iloc[fit_idx], y_train.iloc[val_idx]

        model = get_pipeline(**params)
        model.fit(X_fit, y_fit)

        return mean_absolute_error(y_val, model.predict(X_val))

    fold_errors = [
        fold_mae(fit_idx, val_idx)
        for fit_idx, val_idx in splitter.split(X_train)
    ]

    # Average validation error across splits
    return np.array(fold_errors).mean()

In [None]:
# Optuna study: minimise the error returned by `objective` over 50 trials.
# The sampler is seeded explicitly so the hyper-parameter suggestions do not
# change randomly between runs (TPESampler is Optuna's default sampler type,
# so only the seed differs from the unseeded default).
study = optuna.create_study(
    direction='minimize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=50)

In [None]:
# Show the hyper-parameters of the best trial found by the study
best_params = study.best_params
print(f'{best_params=}')

In [None]:
# Refit on the full training set with the best hyper-parameters.
# `best_params` only contains the values sampled by Optuna, so the fixed settings
# that `objective` passes to `get_pipeline` (metric, verbose) are repeated here to
# keep the final model configured like the models that were tuned.
pipeline = get_pipeline(metric='mae', verbose=-1, **best_params)
pipeline.fit(X_train, y_train)

In [None]:
# Evaluate the fitted pipeline on the test set (mean absolute error)
predictions = pipeline.predict(X_test)
test_mae = mean_absolute_error(y_true=y_test, y_pred=predictions)
print(f'{test_mae=:.4f}')