In [None]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

import os

print(os.getcwd())


def update_working_directory(root_marker="src"):
    """Make the project root the kernel's working directory.

    The rest of the notebook imports ``src.*`` and opens ``data/...`` and
    ``models/...`` with relative paths, so it must run from the directory that
    contains ``src``. The notebook itself is expected to start one level below
    that directory.

    Args:
        root_marker: Name of a sub-directory that identifies the project root.
            When the current directory already contains it, the directory is
            left unchanged. This keeps the setup cell idempotent: re-running it
            no longer climbs one extra level on every execution.

    Side effects:
        Changes the process working directory and prints the directory in use.
    """
    from pathlib import Path

    current = Path(os.getcwd())
    # Only climb when we are not at the project root yet; `.parent` (unlike
    # `.parents[0]`) is also safe when `current` is the filesystem root.
    target = current if (current / root_marker).is_dir() else current.parent
    os.chdir(target)
    print(target)


update_working_directory()

# Import

In [None]:
import dill
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)

import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

from src.models.gradient_boosting import ModelGradientBoosting
import src.models.performance_metrics as performance_metrics

In [None]:
# Pickled train / validation datasets, given relative to the working directory
# selected by the setup cell.
path_dataset_train = "data/raw/20210119/dataset_train.pkl"
path_dataset_valid = "data/raw/20210119/dataset_valid.pkl"

# Dataset

In [None]:
# NOTE(review): dill.load can execute arbitrary code stored in the file; only load
# pickles produced by this project.
with open(path_dataset_train, "rb") as input_file:
    dataset_train = dill.load(input_file)

In [None]:
# NOTE(review): dill.load can execute arbitrary code stored in the file; only load
# pickles produced by this project.
with open(path_dataset_valid, "rb") as input_file:
    dataset_valid = dill.load(input_file)

# Overall

In [None]:
# Instantiate the project's gradient-boosting wrapper. The trailing expression displays
# its version, which is reused further down to name the saved model file.
model = ModelGradientBoosting()
model.version

In [None]:
# NOTE(review): rebinds dataset_train to its preprocessed form, so re-running this cell
# feeds already-preprocessed data back in. Reload the pickle before re-running.
dataset_train = model.preprocessing_training(dataset_train)

In [None]:
# NOTE(review): rebinds dataset_valid to its preprocessed form, so re-running this cell
# feeds already-preprocessed data back in. Reload the pickle before re-running.
dataset_valid = model.preprocessing_inference(dataset_valid)

In [None]:
# Fit the model on the preprocessed training set; the preprocessed validation set is
# passed alongside it (presumably for evaluation during training — confirm in
# ModelGradientBoosting.train).
model.train(dataset_train, dataset_valid)

In [None]:
# Persist the whole model object with dill, named after the model version.
# NOTE(review): open() does not create missing directories, so `models/` must already
# exist under the working directory.
with open(f"models/{model.version}__model.pkl", "wb") as file:
    dill.dump(model, file)

# Data Transformation

In [None]:
# NOTE(review): `vardict` is not defined in any visible cell above, so this fails on a
# fresh kernel. `model.vardict` (used further down) may be what was intended — confirm.
vardict.keys()

## Target

## Numerical

## Diff time

## Boolean

## Categorical

## Overall

### vardict

In [None]:
# Build the full feature list by concatenating the per-type variable lists.
# NOTE(review): `vardict` is not defined in any visible cell above; this cell relies on
# hidden kernel state and fails under Restart & Run All.
vardict["all"] = (
    vardict["numerical"]
    + vardict["diff_time"]
    + vardict["dummy_boolean"]
    + vardict["dummy_categorical"]
)

# 1st model

# Validation results

## Overall

In [None]:
# Reload the validation pickle: `dataset_valid` was rebound to its preprocessed form
# earlier, and the reporting helper is given the freshly loaded data here.
# NOTE(review): dill.load can execute arbitrary code stored in the file.
with open(path_dataset_valid, "rb") as input_file:
    dataset_valid = dill.load(input_file)
model.predict_and_show_results(dataset_valid, save_folder="data/pipeline/20210121")

## Details

In [None]:
# Reload the validation pickle so the step-by-step cells below start from the data as
# stored on disk rather than from a previously preprocessed `dataset_valid`.
with open(path_dataset_valid, "rb") as input_file:
    dataset_valid = dill.load(input_file)

In [None]:
# Alias used by the cells below, which refer to the model as `self`
# (presumably so the code can be pasted into a model method — TODO confirm or remove).
self = model

### Predictions

In [None]:
# Keep a copy of the target column(s) before a later cell rebinds dataset_valid to its
# preprocessed form.
y_valid = dataset_valid[self.vardict["target"]].copy()

In [None]:
# NOTE(review): rebinds dataset_valid, so this cell is not idempotent; reload the pickle
# before re-running it.
dataset_valid = self.preprocessing_inference(dataset_valid)

In [None]:
# Predict with target_present=False; the ground truth saved in y_valid is attached in
# the next cell instead.
predictions = self.predict(dataset=dataset_valid, target_present=False)

In [None]:
# Attach the ground truth saved before preprocessing. A plain list carries no index, so
# (assuming predictions is a DataFrame) values are matched by position — row order must
# be the same as in y_valid; TODO confirm preprocessing/predict preserve it.
predictions["y_true"] = y_valid.values.tolist()

In [None]:
# Display the predictions together with the y_true column added above.
predictions

### Results

In [None]:
# Binary-classification metrics for the validation predictions, labelled with the model
# version plus a "_valid" suffix. The result is reused by the precision/recall plot below.
binary_classification_results = performance_metrics.get_binary_classification_results(
    predictions, model_name=f"{model.version}_valid"
)

binary_classification_results

In [None]:
# Regression metrics for the same validation predictions, labelled with the model
# version plus a "_valid" suffix.
regression_results = performance_metrics.get_regression_results(
    predictions, model_name=f"{model.version}_valid"
)

regression_results

In [None]:
# ROC curve for the validation predictions.
performance_metrics.plot_roc_auc_curve(predictions, model_name=f"{model.version}_valid")

In [None]:
# Precision/recall curve; it also receives the binary-classification results computed
# in an earlier cell.
performance_metrics.plot_precision_recall_curve(
    predictions, binary_classification_results, model_name=f"{model.version}_valid"
)

In [None]:
# Plot of the validation predictions.
performance_metrics.plot_predictions(predictions, model_name=f"{model.version}_valid")

In [None]:
def predict_and_show_results(model, dataset_valid, save_folder="data/processed"):
    """Predict on a validation set and report the results.

    Args:
        model: Object exposing ``vardict["target"]``, ``preprocessing_inference``,
            ``predict``, ``version`` and ``global_config`` (with the keys
            ``"show_plot"`` and ``"save_plot"``).
        dataset_valid: Validation data that still contains the target column(s).
        save_folder: Folder forwarded to ``show_results``.

    Returns:
        None. Metrics and plots are produced by ``show_results``.
    """
    # Set the ground truth aside first; the preprocessed data is what gets predicted on.
    target_column = model.vardict["target"]
    ground_truth = dataset_valid[target_column].copy()

    features = model.preprocessing_inference(dataset_valid)
    predictions = model.predict(dataset=features, target_present=False)
    predictions["y_true"] = ground_truth.values.tolist()

    # Display/saving switches come from the model's own configuration.
    report_options = dict(
        model_name=model.version,
        show_plot=model.global_config["show_plot"],
        save_plot=model.global_config["save_plot"],
        save_folder=save_folder,
    )
    show_results(predictions, **report_options)

In [None]:
def show_results(
    predictions,
    model_name,
    show_plot=True,
    save_plot=True,
    save_folder="data/processed",
):
    """Compute the metrics and draw the plots for a set of predictions.

    Every argument is forwarded positionally to the ``performance_metrics`` helpers.

    Args:
        predictions: Predictions passed as first argument to every helper.
        model_name: Label passed to every helper.
        show_plot: Forwarded to the three plotting helpers.
        save_plot: Forwarded to the three plotting helpers.
        save_folder: Forwarded to every helper.

    Returns:
        None.
    """
    # Argument tails shared by the metric helpers and by the plotting helpers.
    metric_args = (model_name, save_folder)
    plot_args = (model_name, show_plot, save_plot, save_folder)

    classification_summary = performance_metrics.get_binary_classification_results(
        predictions, *metric_args
    )
    # The regression results are not used any further inside this function.
    performance_metrics.get_regression_results(predictions, *metric_args)

    performance_metrics.plot_roc_auc_curve(predictions, *plot_args)
    # The precision/recall plot additionally needs the classification results.
    performance_metrics.plot_precision_recall_curve(
        predictions, classification_summary, *plot_args
    )
    performance_metrics.plot_predictions(predictions, *plot_args)

# Hyperparameters search

In [None]:
# NOTE(review): same values as the path constants defined near the top of the notebook;
# repeated here so the hyperparameter-search section can be run on its own.
path_dataset_train = "data/raw/20210119/dataset_train.pkl"
path_dataset_valid = "data/raw/20210119/dataset_valid.pkl"

##### Import

In [None]:
import dill
import json
import numpy as np
import optuna
import pandas as pd

pd.set_option("display.max_columns", None)

import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

from src.models.gradient_boosting import ModelGradientBoosting
import src.models.performance_metrics as performance_metrics
import src.visualization.visualize_hyperparameter as visualize_hyperparameter

##### Dataset

In [None]:
# Reload the training pickle from disk for the hyperparameter search.
# NOTE(review): dill.load can execute arbitrary code stored in the file.
with open(path_dataset_train, "rb") as input_file:
    dataset_train = dill.load(input_file)

In [None]:
# Reload the validation pickle from disk for the hyperparameter search.
# NOTE(review): dill.load can execute arbitrary code stored in the file.
with open(path_dataset_valid, "rb") as input_file:
    dataset_valid = dill.load(input_file)

In [None]:
# Stack train and validation rows into one frame for the hyperparameter search.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0; pd.concat is the
# supported equivalent and, like append, keeps the original index labels.
dataset_hyperoptim = pd.concat([dataset_train, dataset_valid])

##### create different training/valid folds

In [None]:
# Number of sessions, derived as highest id + 1 (assumes id_session is a 0-based
# integer id — TODO confirm). Series.max() is vectorised and skips missing values,
# unlike the builtin max() applied element by element.
nb_sessions = dataset_hyperoptim["id_session"].max() + 1
nb_folds = 10
nb_sessions_valid = 1

In [None]:
# NOTE(review): `create_folds_for_hyperparameters_tuning` is neither defined nor imported
# in the visible notebook, so this cell relies on hidden kernel state. It receives only
# the three counts, not the data itself.
list_train_dataset, list_valid_dataset = create_folds_for_hyperparameters_tuning(
    nb_sessions, nb_folds, nb_sessions_valid
)

##### functions for hyperparametrization

##### launch hyperparameter tuning

In [None]:
# The objective is maximised. The study name matches the gb_20210123 folder that the
# analysis cells further down read from.
study = optuna.create_study(direction="maximize", study_name="gb_20210123")

In [None]:
# NOTE(review): `hyperparameter_objective` and `callback_object` are not defined in the
# visible notebook (the "functions for hyperparametrization" section has no cells), so
# this cell fails on a fresh kernel. Optuna documents `callbacks` as a list of
# callables — confirm that callback_object is a list and not a single callable.
study.optimize(func=hyperparameter_objective, n_trials=20, callbacks=callback_object)

In [None]:
# Hyperparameter values of the best trial recorded by the study.
study.best_trial.params

In [None]:
# trials_dataframe is a method: without the call the cell only displays the bound
# method's repr instead of the table of trials.
study.trials_dataframe()

##### study hyperparameter tuning

In [None]:
# Load the recorded trials from CSV (presumably written during tuning by the callback —
# TODO confirm) instead of reading them from the in-memory `study`.
hyperparameters_df = pd.read_csv(
    "data/interim/hyperparameter_tuning/gb_20210123/all_trials.csv"
)

# 1-based running number of each row, in file order.
hyperparameters_df["n_trial_all"] = list(range(1, len(hyperparameters_df) + 1))

hyperparameters_df

In [None]:
# Row of the trial with the highest objective value (the study maximises "value").
# A Series has a single axis, so idxmax is called without axis=1: pandas raises a
# ValueError when axis 1 is requested on a Series.
best_hyperparameters = hyperparameters_df.loc[hyperparameters_df["value"].idxmax()]
best_hyperparameters

In [None]:
# One plot of the objective per tuned hyperparameter. Only the learning rate is drawn
# on a log scale. The folder is a plain string literal: the original f-prefix had no
# placeholders to interpolate.
for hyperparameter_to_plot in [
    "params_max_depth",
    "params_num_leaves",
    "params_bagging_fraction",
    "params_feature_fraction",
    "params_learning_rate",
]:
    visualize_hyperparameter.plot_result_hyperparameter(
        hyperparameters_df=hyperparameters_df,
        hyperparameter_to_plot=hyperparameter_to_plot,
        variable_objective="value",
        use_log_scale=(hyperparameter_to_plot in ["params_learning_rate"]),
        minimize_objective=False,
        folder_save="data/interim/hyperparameter_tuning/gb_20210123",
    )

##### time taken for each trial

In [None]:
# Plot the time taken by each trial. The folder is a plain string literal: the original
# f-prefix had no placeholders to interpolate.
visualize_hyperparameter.plot_time_hyperparameter(
    hyperparameters_df=hyperparameters_df,
    folder_save="data/interim/hyperparameter_tuning/gb_20210123",
)

# Probas & Predictions