In [None]:
import mlflow
import os
import numpy as np
from sklearn.model_selection import train_test_split

from pickle import dump

from hyperopt import fmin, tpe, Trials, STATUS_OK, hp, space_eval
from functools import partial

from scripts.Preprocessing import Preprocessing
from scripts.LinearRegressionTraining import LinearRegressionTraining

from scripts.config import (year_month_train,
    model_name_pref,
    input_data_path_train,
    seed)

In [None]:
# Notebook-local aliases for the training-period settings imported from scripts.config.
input_data_path = input_data_path_train
year_month = year_month_train

# Local directory that receives the files this notebook writes (e.g. the pickled encoder).
local_path_save = "./local_artifacts_tmp/02_4_LinearRegression_comparison/"

### MLflow settings

In [None]:
# Make sure the local artifact directory exists (a no-op when it is already there).
os.makedirs(local_path_save, exist_ok=True)

# Store run metadata (parameters, metrics, tags) in a local SQLite database.
# NOTE(review): no artifact location is configured here; with MLflow's defaults
# artifacts are expected under ./mlruns rather than ./mlflow -- confirm.
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Name of the experiment
# NOTE(review): this label says "02.3" while local_path_save uses "02_4_..." --
# confirm which numbering is intended.
exp_name = "02.3 - LR comparison"

# Select the MLflow experiment by name. set_experiment returns an Experiment
# object, not an id, so keep the object and expose the real id separately.
experiment = mlflow.set_experiment(exp_name)
experiment_id = experiment.experiment_id

### Experiment

In [None]:
# Label for this run: passed as run_name to the training calls and used as the
# prefix of the saved encoder file.
run_name = "base"

In [None]:
# The four trainers share the same data path, artifact directory and period;
# only the model-type string differs, so bind the common arguments once.
_build_trainer = partial(
    LinearRegressionTraining,
    input_data_path,
    local_path_save,
    year_month,
)

linear_regression_training = _build_trainer('linear_regression')
ridge_regression_training = _build_trainer('ridge')
lasso_regression_training = _build_trainer('lasso')
elastic_net_regression_training = _build_trainer('elastic_net')

In [None]:
# Number of hyperopt evaluations per tuned model.
max_evals = 5

# Search spaces for the regularised models.
#   alpha        -- regularisation strength, sampled from a log-normal prior;
#                   alpha = 0 would mean no regularisation (plain Linear Regression).
#   l1_ratio     -- Elastic Net only; mixing parameter where 0 means only the
#                   L2 penalty and 1 means only the L1 penalty.
#   random_state -- fixed to the project seed; a constant, not a tuned dimension.
elastic_net_parameters_search = dict(
    alpha=hp.lognormal('alpha', 0, 1),
    l1_ratio=hp.uniform('l1_ratio', 0, 1),
    random_state=seed,
)
ridge_parameters_search = dict(
    alpha=hp.lognormal('alpha', 0, 1),
    random_state=seed,
)
lasso_parameters_search = dict(
    alpha=hp.lognormal('alpha', 0, 1),
    random_state=seed,
)

# Register each space on its trainer. Plain linear regression has nothing to tune.
elastic_net_regression_training.set_hyperparameter_space(elastic_net_parameters_search)
ridge_regression_training.set_hyperparameter_space(ridge_parameters_search)
lasso_regression_training.set_hyperparameter_space(lasso_parameters_search)

In [None]:
#### Data Preprocessing ####

# Read features and target from the same input path the trainers were given.
prepr = Preprocessing(input_data_path)
X, Y = prepr.read_dataframe(request_tgt=True)

# Hold out 30% of the rows for evaluation; the split is seeded so it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

# One-hot encoding: the encoder is fitted on the training split only and then
# reused as-is on the test split.
shapes_pre = (X_train.shape[0], X_test.shape[0])
X_train_ohe, ohe = prepr.preprocess_for_regression(df=X_train, fit_ohe=True, drop_first_column=True)
X_test_ohe, _ = prepr.preprocess_for_regression(df=X_test, fit_ohe=False, ohe=ohe, drop_first_column=True)

# The encoded matrices must keep one row per input row, otherwise they would no
# longer line up with Y_train / Y_test. Compare the encoded outputs (not the
# untouched inputs) against the row counts recorded before encoding.
shapes_post = (X_train_ohe.shape[0], X_test_ohe.shape[0])
assert shapes_pre == shapes_post, (
    f"row count changed during preprocessing: {shapes_pre} -> {shapes_post}"
)

# Persist the fitted encoder; the context manager closes the file handle.
with open(os.path.join(local_path_save, run_name + '_ohe.pkl'), 'wb') as ohe_file:
    dump(ohe, ohe_file)

In [None]:
# Regression baseline for this run, computed from the train/test targets only
# (no feature matrices are passed to the call).
linear_regression_training.calculate_regression_baseline(
    Y_train=Y_train,
    Y_test=Y_test,
    run_name=run_name,
)

In [None]:
models = [
    linear_regression_training,
    ridge_regression_training,
    lasso_regression_training,
    elastic_net_regression_training,
]

# Keyword arguments shared by every objective call.
objective_kwargs = dict(
    X_train=X_train_ohe,
    X_test=X_test_ohe,
    Y_train=Y_train,
    Y_test=Y_test,
    run_name=run_name,
)

# Result per model name, so every outcome is kept and can be displayed at the
# end (a bare expression inside the loop body is not rendered by Jupyter).
best_results = {}

for model in models:
    if model.model_name == 'linear_regression':
        # Nothing to tune: call the objective once, directly.
        best_result = model.objective_lr(**objective_kwargs)
    else:
        # Each tuned model gets its own Trials history and a freshly seeded
        # generator, so the search is reproducible per model.
        trials = Trials()
        # The objective method is looked up by name: objective_lr_<model_name>.
        objective_function = getattr(model, 'objective_lr_' + model.model_name)

        best_result = fmin(
            fn=partial(objective_function, **objective_kwargs),
            space=model.hp_space,
            algo=tpe.suggest,
            max_evals=max_evals,
            trials=trials,
            rstate=np.random.default_rng(seed),
        )

    best_results[model.model_name] = best_result

# Last expression of the cell: shows the collected results for all models.
best_results