In [1]:
# Notebook extensions: nb_black (presumably auto-formats cells with black — confirm)
# and autoreload in mode 2, so edits to imported project modules are picked up
# without restarting the kernel.
%load_ext nb_black
%load_ext autoreload
%autoreload 2

import os

# Show the directory the kernel started in, before it is changed further down.
print(os.getcwd())


def update_working_directory(notebooks_dirname="notebooks"):
    """Move the working directory from the notebooks folder to its parent.

    The move happens only while the current directory is named
    ``notebooks_dirname``. Re-running the cell is therefore a no-op instead of
    climbing one more level on every execution, which would break the
    relative ``data/...`` paths and the ``src`` imports used by the notebook.

    Args:
        notebooks_dirname: Name of the folder the kernel is started in.

    Returns:
        None. The working directory in effect after the call is printed.
    """
    from pathlib import Path

    current_dir = Path(os.getcwd())
    if current_dir.name == notebooks_dirname:
        # Still inside the notebooks folder: step up exactly once.
        current_dir = current_dir.parent
        os.chdir(current_dir)
    print(current_dir)


# Leave the notebooks folder so that relative paths such as "data/..." and the
# `src` package imports used later resolve from the parent directory.
update_working_directory()

/Users/admin/Projects/vocabulary_learning/notebooks
/Users/admin/Projects/vocabulary_learning


<IPython.core.display.Javascript object>

In [2]:
# Pickled test dataset, given relative to the working directory set in the first cell.
path_dataset_test = "data/raw/20201009/dataset_test.pkl"

<IPython.core.display.Javascript object>

# Import

In [3]:
import dill
import numpy as np
import pandas as pd
import plotly.graph_objects as go

pd.set_option("display.max_columns", None)

from sklearn.linear_model import LogisticRegression

from src.data.make_dataset import get_vardict

import src.models.performance_metrics as performance_metrics


<IPython.core.display.Javascript object>

# Dataset

In [5]:
# Load the test dataset (presumably a pandas DataFrame, given how it is indexed
# later — confirm). NOTE: dill/pickle deserialisation can execute arbitrary code,
# so this file must come from a trusted source.
with open(path_dataset_test, "rb") as input_file:
    dataset_test = dill.load(input_file)

<IPython.core.display.Javascript object>

# Initialisation

In [30]:
# Accumulators filled by the per-model loop below:
# - a metrics table keyed on "metric", which gains one column per model;
# - one figure per curve type, passed to the performance_metrics helpers for each model.
results_test_all_models = pd.DataFrame(columns=["metric"])
fig_precision_recall_curve = go.Figure()
fig_roc_auc_curve = go.Figure()

<IPython.core.display.Javascript object>

In [31]:
# Names of the models to compare. Each name must have matching
# data/processed/<name>_model.pkl and data/processed/<name>_vardict.pkl files.
list_models_to_compare = ["Logistic Regression", "Logistic Regression 2"]

<IPython.core.display.Javascript object>

# Evaluating each model on the test set

In [32]:
# Start from empty accumulators so that re-running this cell does not append
# duplicate metric columns or duplicate curve traces left over from a previous run.
results_test_all_models = pd.DataFrame(columns=["metric"])
fig_precision_recall_curve = go.Figure()
fig_roc_auc_curve = go.Figure()

for model_name in list_models_to_compare:

    # NOTE: unpickling can execute arbitrary code; only load artefacts that
    # were produced by a trusted source.
    with open(f"data/processed/{model_name}_model.pkl", "rb") as input_file:
        model = dill.load(input_file)

    with open(f"data/processed/{model_name}_vardict.pkl", "rb") as input_file:
        vardict = dill.load(input_file)

    # Each model is scored on the columns listed in its own vardict.
    X_test = dataset_test[vardict["all"]]
    y_test = dataset_test[vardict["target"]]

    predictions = X_test.copy()

    predictions["y_pred"] = model.predict(X_test)
    # Second column of predict_proba (presumably the positive class — confirm
    # against the model's class order), taken as one array slice instead of a
    # Python-level loop over the rows.
    predictions["y_proba"] = np.asarray(model.predict_proba(X_test))[:, 1]
    predictions["y_true"] = y_test

    binary_classification_results = (
        performance_metrics.get_binary_classification_results(
            predictions, model_name=f"{model_name}_test"
        )
    )

    regression_results = performance_metrics.get_regression_results(
        predictions, model_name=f"{model_name}_test"
    )

    # On a key collision the regression entry overrides the classification one.
    results_test_model = {**binary_classification_results, **regression_results}

    # One row per metric with the values in a column named after the model,
    # built as a single chain rather than with in-place mutation.
    results_test_model_table = (
        pd.DataFrame.from_dict(
            results_test_model, orient="index", columns=[model_name]
        )
        .rename_axis("metric")
        .reset_index()
    )

    # Outer join keeps metrics that only some of the models report.
    results_test_all_models = pd.merge(
        results_test_all_models,
        results_test_model_table,
        on="metric",
        how="outer",
    )

    # Create traces

    fig_precision_recall_curve = performance_metrics.add_precision_recall_curve(
        fig_precision_recall_curve, predictions, model_name
    )

    fig_roc_auc_curve = performance_metrics.add_roc_auc_curve(
        fig_roc_auc_curve, predictions, model_name
    )

<IPython.core.display.Javascript object>

# Analysis

## Model metrics

In [33]:
# Side-by-side test metrics: one row per metric, one column per model.
results_test_all_models

Unnamed: 0,metric,Logistic Regression,Logistic Regression 2
0,total_population,112.0,112.0
1,total_positive,70.0,70.0
2,total_negative,42.0,42.0
3,random_precision,0.625,0.625
4,true_positive,24.0,23.0
5,false_negative,46.0,47.0
6,false_positive,12.0,13.0
7,true_negative,30.0,29.0
8,recall,0.342857,0.328571
9,miss_rate,0.657143,0.671429


<IPython.core.display.Javascript object>

## ROC curve

In [37]:
# Dashed black diagonal from (0, 0) to (1, 1), labelled "random", drawn as a
# reference line next to the per-model curves.
fig_roc_auc_curve.add_trace(
    go.Scatter(
        name="random",
        mode="lines",
        x=[0, 1],
        y=[0, 1],
        line={"color": "black", "dash": "dash"},
    )
)

# add_square presumably outlines the unit square — confirm in performance_metrics.
fig_roc_auc_curve = performance_metrics.add_square(
    fig_roc_auc_curve, x0=0, x1=1, y0=0, y1=1
)

# Title, legend and axes are set in one chain; both axes get a small margin
# around [0, 1].
(
    fig_roc_auc_curve.update_layout(
        title="Receiver operating characteristic (ROC) curve",
        legend=dict(itemsizing="constant"),
    )
    .update_xaxes(title_text="False Positive Rate", range=[-0.05, 1.05])
    .update_yaxes(title_text="True Positive Rate", range=[-0.05, 1.05])
    .show()
)

<IPython.core.display.Javascript object>

## Precision - Recall curve

In [38]:
# add_square presumably outlines the unit square — confirm in performance_metrics.
fig_precision_recall_curve = performance_metrics.add_square(
    fig_precision_recall_curve, x0=0, x1=1, y0=0, y1=1
)

# NOTE(review): `binary_classification_results` is the variable left behind by
# the model loop, so this baseline is the value computed for the last model.
random_precision = binary_classification_results["random_precision"]

# Horizontal dashed black line at the baseline precision.
fig_precision_recall_curve.add_trace(
    go.Scatter(
        name="Random precision",
        mode="lines",
        x=[0, 1],
        y=[random_precision, random_precision],
        line={"color": "black", "dash": "dash"},
    )
)

# Title, legend and axes are set in one chain; both axes get a small margin
# around [0, 1].
(
    fig_precision_recall_curve.update_layout(
        title="Precision-Recall curve",
        legend=dict(itemsizing="constant"),
    )
    .update_xaxes(title_text="Recall", range=[-0.05, 1.05])
    .update_yaxes(title_text="Precision", range=[-0.05, 1.05])
    .show()
)

<IPython.core.display.Javascript object>