In [None]:
import mlflow
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
import seaborn as sns
import matplotlib.pyplot as plt

from hyperopt import fmin, tpe, Trials, STATUS_OK, hp, space_eval
from functools import partial

from pickle import dump

from scripts.Preprocessing import Preprocessing
from scripts.BinaryClassificationTraining import BinaryClassificationTraining

from scripts.config import (year_month_train, 
    input_data_path_train,
    seed)

In [None]:
# Training-period settings come from scripts.config; the short aliases are used by the cells below.
input_data_path = input_data_path_train
year_month = year_month_train

# Local scratch directory for artifacts written by this notebook.
# The trailing slash matters: file names are appended with plain string concatenation.
local_path_save = './local_artifacts_tmp/04_Logistic_Regression/'

### MLflow settings

In [None]:
# Make sure the local artifact directory exists. exist_ok=True keeps the cell
# idempotent on re-runs and avoids the check-then-create race of a separate
# os.path.exists() test.
os.makedirs(local_path_save, exist_ok=True)

# Store all run metadata in a local SQLite database.
# NOTE(review): the original note says artifacts are saved in a local ./mlflow folder;
# MLflow's default local artifact directory is ./mlruns — confirm which one applies.
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Name of the experiment
exp_name = "04 - Logistic Regression"
# Set up the MLflow experiment (it becomes the active experiment).
# NOTE(review): recent MLflow versions return an Experiment object here rather than
# a bare id — confirm before using this variable where an id string is expected.
experiment_id = mlflow.set_experiment(exp_name)

### Experiments

In [None]:
# Label for this set of runs: passed as run_name to the baseline and tuning calls,
# and used as the file-name prefix of the pickled encoder and scaler.
run_name = 'base'

In [None]:
# Project helper (scripts.BinaryClassificationTraining) used below to compute the
# baseline, hold the hyperparameter space and provide the tuning objective.
# Positional arguments passed: input data path, local artifact directory,
# training year-month and the model name 'logistic_regression'.
# NOTE(review): argument meanings are inferred from the variables passed — confirm
# against the class signature.
logistic_regression_training = BinaryClassificationTraining(
    input_data_path,
    local_path_save,
    year_month,
    'logistic_regression')

In [None]:
# Load features and target through the project preprocessing helper.
# Uses the same input_data_path alias as the trainer above, so both read the same data.
prepr = Preprocessing(input_data_path, task_type='classification')
X, Y = prepr.read_dataframe(request_tgt=True)

# Hold out 30% of the rows for evaluation; the split is seeded for reproducibility.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

# Preprocessing (one-hot encoding + scaling).
# The encoder is fitted on the training split (fit_ohe=True); the returned encoder and
# scaler are then reused on the test split so it is transformed with training statistics.
shapes_pre = (X_train.shape[0], X_test.shape[0])
X_train_ohe, ohe, scaler = prepr.preprocess_for_classification(df=X_train, fit_ohe=True, perform_scaling=True)
X_test_ohe, _, _ = prepr.preprocess_for_classification(df=X_test,
                                                    fit_ohe=False,
                                                    ohe=ohe,
                                                    perform_scaling=True,
                                                    scaler=scaler)

# Sanity checks: preprocessing must neither alter the input frames nor drop/add rows
# in its outputs, otherwise the features would no longer line up with Y_train / Y_test.
assert shapes_pre == (X_train.shape[0], X_test.shape[0]), \
    "preprocessing changed the row count of the input frames"
assert shapes_pre == (X_train_ohe.shape[0], X_test_ohe.shape[0]), \
    "preprocessing changed the number of rows in its output"

# Persist the fitted transformers. Context managers guarantee the files are flushed
# and closed even if pickling fails.
with open(local_path_save + run_name + '_ohe.pkl', 'wb') as ohe_file:
    dump(ohe, ohe_file)
with open(local_path_save + run_name + '_scaler.pkl', 'wb') as scaler_file:
    dump(scaler, scaler_file)

In [None]:
# Compute the classification baseline for this run; only the targets are passed in.
logistic_regression_training.calculate_classification_baseline(
    Y_train=Y_train,
    Y_test=Y_test,
    run_name=run_name,
)

In [None]:
# Number of evaluations the hyperopt search is allowed to run.
max_evals = 5

# Hyperparameter search space: only C is sampled, every other entry is a fixed value.
# To tune the penalty as well, replace the fixed value with e.g.
# hp.choice('penalty', ['l1', 'l2', 'elasticnet', None]).
logistic_regression_parameters_search = dict(
    penalty='l2',
    C=hp.lognormal('C', -1, 0.5),
    random_state=seed,
    max_iter=250,
)

# Register the space on the training helper; the search cell reads it back as hp_space.
logistic_regression_training.set_hyperparameter_space(logistic_regression_parameters_search)

In [None]:
# Container in which hyperopt records every evaluation of the search.
trials = Trials()

# Bind the data splits, run label and threshold up front so that fmin only has to
# supply the sampled hyperparameters.
tuning_objective = partial(
    logistic_regression_training.objective_logistic_regression,
    X_train=X_train_ohe,
    X_test=X_test_ohe,
    Y_train=Y_train,
    Y_test=Y_test,
    run_name=run_name,
    threshold=0.5,
)

# TPE search over the registered space, seeded for reproducibility.
best_result = fmin(
    fn=tuning_objective,
    space=logistic_regression_training.hp_space,
    algo=tpe.suggest,
    max_evals=max_evals,
    trials=trials,
    rstate=np.random.default_rng(seed),
)

best_result

### Learning Curve of the best model

In [None]:
from sklearn.model_selection import learning_curve
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, make_scorer

In [None]:
# Use scikit-learn's built-in ROC AUC scorer.
# make_scorer(..., needs_proba=True) was deprecated in scikit-learn 1.4 and removed in
# 1.6, where the stray keyword would be forwarded to roc_auc_score and make scoring fail.
# The named scorer is available across versions and is still a scorer callable.
from sklearn.metrics import get_scorer

roc_auc_scorer = get_scorer('roc_auc')

In [None]:
# Rebuild the complete parameter set from the search space and the best assignment.
# space_eval substitutes every sampled label (not only 'C') and resolves hp.choice
# results, which fmin reports as indices, back to their actual values. This keeps the
# cell correct if further hyperparameters are added to the search space.
# dict(...) yields a fresh plain dict, so the search-space definition is not modified.
best_hps = dict(space_eval(logistic_regression_parameters_search, best_result))

In [None]:
# Display the parameters that will be passed to LogisticRegression.
best_hps

In [None]:
# Unfitted estimator configured with the selected hyperparameters; it is handed to
# learning_curve in the next cell.
best_logistic_regression = LogisticRegression(**best_hps)

In [None]:
# Learning curve of the tuned model: ROC AUC on ten evenly spaced fractions
# (10% to 100%) of the training data, with cv=10 and all available cores.
train_sizes, train_scores, test_scores = learning_curve(
    estimator=best_logistic_regression,
    X=X_train_ohe,
    y=Y_train,
    train_sizes=np.linspace(0.1, 1.0, 10),
    cv=10,
    scoring=roc_auc_scorer,
    n_jobs=-1,
)

In [None]:
# Collapse axis 1 of the score matrices so there is one mean and one standard
# deviation per training-set size.
train_mean, train_std = train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std = test_scores.mean(axis=1), test_scores.std(axis=1)

In [None]:
# One entry per curve: (mean scores, score std, line styling).
curves = (
    (train_mean, train_std,
     dict(color='blue', marker='o', markersize=5, label='Training ROC AUC')),
    (test_mean, test_std,
     dict(color='green', marker='+', markersize=5, linestyle='--', label='Validation ROC AUC')),
)

# Draw each mean curve with a shaded band of +/- one standard deviation around it.
for mean_scores, std_scores, line_style in curves:
    plt.plot(train_sizes, mean_scores, **line_style)
    plt.fill_between(train_sizes, mean_scores + std_scores, mean_scores - std_scores,
                     alpha=0.15, color=line_style['color'])

plt.title('Learning Curve')
plt.xlabel('Training Data Size')
plt.ylabel('Model ROC AUC')
plt.grid()
plt.legend(loc='lower right')
plt.show()