In [None]:
# Notebook extensions: nb_black formats cells, autoreload (mode 2) re-imports
# edited modules before each cell runs.
%load_ext nb_black
%load_ext autoreload
%autoreload 2

import os

# Show the directory the kernel started in, before it is changed below.
print(os.getcwd())


def update_working_directory():
    """Change the process's working directory to its parent and print the new path."""
    from pathlib import Path

    current_dir = Path(os.getcwd())
    parent_dir = current_dir.parents[0]  # first ancestor, i.e. one level up
    os.chdir(parent_dir)
    print(parent_dir)


# NOTE(review): every call moves one level up from the *current* directory, so
# re-running this cell without restarting the kernel climbs another level.
update_working_directory()

In [None]:
# Locations of the serialized train / validation / test datasets
# (paths are relative to the working directory set above).
path_dataset_train, path_dataset_valid, path_dataset_test = (
    "data/raw/20210119/dataset_train.pkl",
    "data/raw/20210119/dataset_valid.pkl",
    "data/raw/20210119/dataset_test.pkl",
)

# Import

In [None]:
import dill
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

from src.models.dummy_classifier import ModelDummyClassifier
from src.models.logistic_regression import ModelLogisticRegression
from src.models.gradient_boosting import ModelGradientBoosting
import src.models.performance_metrics as performance_metrics

# Training models

## Dummy Classifier

In [None]:
# Load the training dataset from disk.
# NOTE(review): dill, like pickle, can execute arbitrary code while loading —
# only open files produced by this project.
with open(path_dataset_train, "rb") as input_file:
    dataset_train = dill.load(input_file)

In [None]:
# Instantiate the dummy-classifier model; the trailing expression shows its
# version, which is also used to name the saved model file below.
model = ModelDummyClassifier()
model.version

In [None]:
# Apply the model's training-time preprocessing; the result replaces the
# loaded dataset_train (re-run the load cell to get the raw data back).
dataset_train = model.preprocessing_training(dataset_train)

In [None]:
# Train the dummy classifier on the preprocessed training data.
model.train(dataset_train)

In [None]:
# Serialize the trained model to models/<version>__model.pkl and report the path.
model_path = f"models/{model.version}__model.pkl"
with open(model_path, "wb") as file:
    dill.dump(model, file)
    print(f"Saved at {file.name}")

## Logistic Regression

In [None]:
# Reload the raw training dataset: the previous section overwrote
# dataset_train with its own preprocessed version.
# NOTE(review): dill, like pickle, can execute arbitrary code while loading —
# only open files produced by this project.
with open(path_dataset_train, "rb") as input_file:
    dataset_train = dill.load(input_file)

In [None]:
# Instantiate the logistic-regression model; the trailing expression shows its
# version, which is also used to name the saved model file below.
model = ModelLogisticRegression()
model.version

In [None]:
# Apply the model's training-time preprocessing; the result replaces the
# loaded dataset_train (re-run the load cell to get the raw data back).
dataset_train = model.preprocessing_training(dataset_train)

In [None]:
# Train the logistic regression on the preprocessed training data.
model.train(dataset_train)

In [None]:
# Presumably displays the fitted coefficients of the trained model —
# TODO confirm against src.models.logistic_regression.
model.plot_coefficients()

In [None]:
# Serialize the trained model to models/<version>__model.pkl.
with open(f"models/{model.version}__model.pkl", "wb") as file:
    dill.dump(model, file)

In [None]:
# Report the destination; the path is rebuilt from model.version with the same
# pattern the save cell above uses.
print(f"saved at models/{model.version}__model.pkl")

## Gradient Boosting

In [None]:
# Reload the raw training dataset: the previous section overwrote
# dataset_train with its own preprocessed version.
# NOTE(review): dill, like pickle, can execute arbitrary code while loading —
# only open files produced by this project.
with open(path_dataset_train, "rb") as input_file:
    dataset_train = dill.load(input_file)

In [None]:
# Load the validation dataset, which is passed to model.train() alongside the
# training data further down.
with open(path_dataset_valid, "rb") as input_file:
    dataset_valid = dill.load(input_file)

In [None]:
# Instantiate the gradient-boosting model; the trailing expression shows its
# version, which is also used to name the saved model file below.
model = ModelGradientBoosting()
model.version

In [None]:
# Apply the model's training-time preprocessing; the result replaces the
# loaded dataset_train (re-run the load cell to get the raw data back).
dataset_train = model.preprocessing_training(dataset_train)

In [None]:
# The validation set goes through the *inference* preprocessing, not the
# training one; the result replaces the loaded dataset_valid.
dataset_valid = model.preprocessing_inference(dataset_valid)

In [None]:
# Train on the preprocessed training data, also handing over the preprocessed
# validation data (presumably for monitoring / early stopping — TODO confirm).
model.train(dataset_train, dataset_valid)

In [None]:
# Serialize the trained model to models/<version>__model.pkl and report the
# destination, as the other model sections do, so the saved name can be copied
# into the comparison list further down.
with open(f"models/{model.version}__model.pkl", "wb") as file:
    dill.dump(model, file)
    print(f"Saved at {file.name}")

# Dataset

In [None]:
# Load the test dataset once and keep it untouched; the evaluation loop below
# works on a fresh copy of it for each model.
# NOTE(review): dill, like pickle, can execute arbitrary code while loading —
# only open files produced by this project.
with open(path_dataset_test, "rb") as input_file:
    dataset_test_original = dill.load(input_file)

# Initialisation

In [None]:
# Accumulators filled by the evaluation loop below: a table keyed on "metric"
# that each model's results are outer-merged into, and two empty figures that
# receive one trace per model.
# NOTE(review): re-run this cell before re-running the loop, otherwise columns
# and traces from the previous run are still present.
results_test_all_models = pd.DataFrame(columns=["metric"])
fig_precision_recall_curve = go.Figure()
fig_roc_auc_curve = go.Figure()

In [None]:
# Models to evaluate. Each entry must match a saved file
# models/<entry>__model.pkl, i.e. the model.version printed when it was saved.
list_models_to_compare = [
    # "dummy_classifier__20210123",
    "logistic_regression__20210131",
    "gradient_boosting__20210128",
]

# Evaluating models on the test set

In [None]:
# Evaluate every listed model on the test set: add one metrics column per model
# to results_test_all_models and one trace per model to each comparison figure.
for model_name in list_models_to_compare:

    # Each model gets its own copy of the original test set.
    dataset_test = dataset_test_original.copy()

    with open(f"models/{model_name}__model.pkl", "rb") as input_file:
        model = dill.load(input_file)

    # Set the ground truth aside before preprocessing, then predict without it.
    y_test = dataset_test[model.vardict["target"]].copy()
    dataset_test = model.preprocessing_inference(dataset_test)
    predictions = model.predict(dataset=dataset_test, target_present=False)
    predictions["y_true"] = y_test.values.tolist()

    results_label = f"{model_name}_test"
    binary_classification_results = (
        performance_metrics.get_binary_classification_results(
            predictions, model_name=results_label
        )
    )
    regression_results = performance_metrics.get_regression_results(
        predictions, model_name=results_label
    )

    # Merge both metric dictionaries; on a key clash the regression value wins.
    results_test_model = {**binary_classification_results, **regression_results}

    # One row per metric, with the value column named after the model.
    results_test_model_table = (
        pd.DataFrame.from_dict(results_test_model, orient="index", columns=["value"])
        .reset_index()
        .rename(columns={"index": "metric", "value": model_name})
    )

    # Outer merge keeps metrics that only some of the models report.
    results_test_all_models = results_test_all_models.merge(
        results_test_model_table, on="metric", how="outer"
    )

    # Create traces
    fig_precision_recall_curve = performance_metrics.add_precision_recall_curve(
        fig_precision_recall_curve, predictions, model_name
    )
    fig_roc_auc_curve = performance_metrics.add_roc_auc_curve(
        fig_roc_auc_curve, predictions, model_name
    )

# Analysis

## Model metrics

In [None]:
# Objective attached to each metric and consumed by ifthenelse_color():
# "maximum" -> the largest value in a row is shown green and the smallest red,
# "minimum" -> the opposite, "neutral" -> the row is never highlighted.
metric_objective = dict(
    total_population="neutral",
    total_positive="neutral",
    total_negative="neutral",
    random_precision="neutral",
    true_positive="maximum",
    false_negative="minimum",
    false_positive="minimum",
    true_negative="maximum",
    recall="maximum",
    miss_rate="minimum",
    fall_out="minimum",
    specificity="maximum",
    precision="maximum",
    false_discovery_rate="minimum",
    false_omission_rate="minimum",
    negative_predictive_value="maximum",
    accuracy="maximum",
    prevalence="neutral",
    positive_likelihood_ratio="maximum",
    negative_likelihood_ratio="minimum",
    diagnostic_odds_ratio="maximum",
    f1_score="maximum",
    logit_roc_auc="maximum",
    explained_variance_score="maximum",
    max_error="minimum",
    mean_absolute_error="minimum",
    root_mean_squared_error="minimum",
    r2_score="maximum",
    normalised_log_loss="minimum",
    normalised_cross_entropy="minimum",
    brier_score="minimum",
)

In [None]:
# Tag every metric row with its objective; a metric that is not a key of
# metric_objective gets a missing value.
results_test_all_models = results_test_all_models.assign(
    objective=results_test_all_models["metric"].map(metric_objective)
)

In [None]:
import pandas as pd
import numpy as np


def ifthenelse_color(v, objective, min_value, max_value):
    """Return the CSS colour rule for one value of a metric row.

    Parameters
    ----------
    v
        The value being styled.
    objective
        "maximum" or "minimum"; any other value leaves the cell black.
    min_value, max_value
        Smallest and largest values of the row ``v`` belongs to.

    Returns
    -------
    str
        "color: green" when ``v`` is the extreme matching the objective,
        "color: red" when it is the opposite extreme, "color: black" otherwise.
    """
    is_lowest = v == min_value
    is_highest = v == max_value

    # min == max == v: every compared value is identical, nothing to rank.
    if is_lowest and is_highest:
        return "color: black"

    if objective == "maximum":
        if is_lowest:
            return "color: red"
        if is_highest:
            return "color: green"
    elif objective == "minimum":
        if is_highest:
            return "color: red"
        if is_lowest:
            return "color: green"

    return "color: black"


def highlight_max(s):
    """Build the per-cell CSS rules for one row of the metrics comparison table.

    Meant to be used as ``DataFrame.style.apply(highlight_max, axis=1)``. The
    row is read by position: the first entry is the metric name, the last entry
    is the objective, and everything in between is one value per model.

    Parameters
    ----------
    s : pandas.Series
        One row of the results table.

    Returns
    -------
    list of str
        One CSS rule per entry of ``s``; the first and last entries are always
        "color: black", the others are decided by ``ifthenelse_color``.
    """
    # Positional access goes through .iloc: plain s[-1] on a label-indexed
    # Series relies on a deprecated positional fallback in pandas.
    s_wo_metric = s.iloc[1:-1]
    objective = s.iloc[-1]

    max_value = max(s_wo_metric)
    min_value = min(s_wo_metric)

    s_color = [
        ifthenelse_color(v, objective, min_value, max_value) for v in s_wo_metric
    ]
    return ["color: black"] + s_color + ["color: black"]


# Display the full metrics table, colouring each row's model values through
# highlight_max (applied row-wise).
results_test_all_models.style.apply(highlight_max, axis=1)

In [None]:
# Same styled table, restricted to a shortlist of metrics.
metrics_shortlist = [
    "total_population",
    "precision",
    "recall",
    "f1_score",
    "accuracy",
    "logit_roc_auc",
    "mean_absolute_error",
]
is_shortlisted = results_test_all_models["metric"].isin(metrics_shortlist)
results_test_all_models[is_shortlisted].style.apply(highlight_max, axis=1)

## ROC AUC curve

In [None]:
# Dashed black diagonal labelled "random", drawn as a reference line from
# (0, 0) to (1, 1).
random_baseline = go.Scatter(
    x=[0, 1],
    y=[0, 1],
    mode="lines",
    name="random",
    line={"color": "black", "dash": "dash"},
)
fig_roc_auc_curve.add_trace(random_baseline)

# Outline the unit square on the figure.
fig_roc_auc_curve = performance_metrics.add_square(
    fig_roc_auc_curve, x0=0, x1=1, y0=0, y1=1
)

# Title, legend and axes; both axes get a small margin around [0, 1].
axis_range = [-0.05, 1.05]
fig_roc_auc_curve.update_layout(
    title="Receiver operating characteristic (ROC) curve",
    legend=dict(itemsizing="constant"),
)
fig_roc_auc_curve.update_xaxes(title_text="False Positive Rate", range=axis_range)
fig_roc_auc_curve.update_yaxes(title_text="True Positive Rate", range=axis_range)

# Render inline, then export a standalone HTML report.
fig_roc_auc_curve.show()
fig_roc_auc_curve.write_html("reports/20210119_comparison_roc_auc_curve.html")

## Precision - Recall curve

In [None]:
# Outline the unit square on the figure.
fig_precision_recall_curve = performance_metrics.add_square(
    fig_precision_recall_curve, x0=0, x1=1, y0=0, y1=1
)

# Horizontal dashed reference line at the "random_precision" level.
# NOTE(review): binary_classification_results is whatever the last iteration of
# the evaluation loop left behind; this presumably relies on random_precision
# being identical for every model evaluated on the same test set — confirm.
fig_precision_recall_curve.add_trace(
    go.Scatter(
        x=[0, 1],
        y=[
            binary_classification_results["random_precision"],
            binary_classification_results["random_precision"],
        ],
        mode="lines",
        name="Random precision",
        line=dict(color="black", dash="dash"),
    )
)

fig_precision_recall_curve.update_layout(
    title="Precision-Recall curve",
    legend={"itemsizing": "constant"},
)

# Both axes get a small margin around [0, 1].
fig_precision_recall_curve.update_xaxes(title_text="Recall", range=[-0.05, 1.05])
fig_precision_recall_curve.update_yaxes(title_text="Precision", range=[-0.05, 1.05])

fig_precision_recall_curve.show()

# Export the precision-recall figure (not the ROC one) to its own report file.
fig_precision_recall_curve.write_html(
    "reports/20210119_comparison_precision_recall_curve.html"
)