In [None]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

import os

# Show the kernel's starting directory before it is changed further down in this cell.
print(os.getcwd())


def update_working_directory():
    """Move the process working directory one level up and print the new location.

    Every call climbs one more directory, so calling this twice in the same
    kernel session ends up two levels above the starting directory.
    (Presumably this moves from a notebooks folder to the project root, since
    later cells use paths relative to ``data/`` and ``models/`` -- confirm.)

    Raises
    ------
    IndexError
        If the current directory has no parent (filesystem root).
    """
    import pathlib

    current_dir = pathlib.Path(os.getcwd())
    parent_dir = current_dir.parents[0]  # closest ancestor of the current directory
    os.chdir(parent_dir)
    print(parent_dir)


# Step up one directory. Not idempotent: every re-run of this cell climbs one
# more level, so run it exactly once per kernel session.
update_working_directory()

In [None]:
# Relative locations of the pickled training and validation datasets; they are
# resolved against the working directory selected in the first cell.
path_dataset_train = "data/raw/20201009/dataset_train.pkl"
path_dataset_valid = "data/raw/20201009/dataset_valid.pkl"

# Import

In [None]:
import dill
import numpy as np
import pandas as pd

# None removes the column cap, so wide frames are displayed with every column.
pd.set_option("display.max_columns", None)

from src.models.logistic_regression import ModelLogisticRegression
import src.models.performance_metrics as performance_metrics

# Dataset

In [None]:
def _load_pickled_dataset(path):
    """Return the object deserialized with dill from the file at ``path``."""
    # NOTE: dill, like pickle, can execute arbitrary code while loading --
    # only open files that come from a trusted source.
    with open(path, "rb") as pickle_file:
        return dill.load(pickle_file)


dataset_train = _load_pickled_dataset(path_dataset_train)
dataset_valid = _load_pickled_dataset(path_dataset_valid)

# Overall: train and save the packaged model

In [None]:
# Instantiate the project's logistic-regression wrapper imported from
# src.models.logistic_regression.
model = ModelLogisticRegression()

# Display the version attribute of the fresh instance; it is displayed again
# after training and used to name the saved model file.
model.version

In [None]:
# Run the model's training-time preprocessing and rebind dataset_train to the result.
# NOTE(review): the loaded frame is overwritten, so re-running this cell feeds
# already-preprocessed data back in -- reload the dataset before re-running.
dataset_train = model.preprocessing_training(dataset_train)

In [None]:
# Train on the preprocessed training data; train() is defined in
# src.models.logistic_regression and its side effects are not visible here.
model.train(dataset_train)

In [None]:
# Display the version again now that train() has run.
# NOTE(review): presumably train() updates it -- confirm.
model.version

In [None]:
# Persist the trained model under a file name derived from its version.
model_path = f"models/{model.version}__model.pkl"
with open(model_path, "wb") as model_file:
    dill.dump(model, model_file)

# Data Transformation

In [None]:
# NOTE(review): `vardict` is never assigned in the visible cells above, so this
# cell raises NameError on Restart & Run All -- confirm where it should come from.
vardict.keys()

## Target

In [None]:
# Summary statistics of the column(s) listed under vardict["target"].
dataset_train[vardict["target"]].describe()

## Numerical

In [None]:
# Count the missing values of every column listed under vardict["numerical"].
dataset_train[vardict["numerical"]].isnull().sum()

In [None]:
def data_transform_numerical(dataset, vardict):
    """Replace missing values in the ``previous_*`` numerical columns with -1.

    The columns are filled on ``dataset`` itself (the caller's frame is
    modified) and that same frame is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the four columns listed in ``columns_to_fill``.
    vardict : dict
        Variable dictionary; it is neither read nor modified here and is
        returned as received.

    Returns
    -------
    tuple
        ``(dataset, vardict)`` -- the two objects that were passed in.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``dataset``.
    """
    columns_to_fill = (
        "previous_levenshtein_distance_guess_answer",
        "previous_question_time",
        "previous_write_it_again_german",
        "previous_write_it_again_english",
    )

    # Assign the filled column back instead of calling
    # ``dataset[col].fillna(-1, inplace=True)``: the inplace form is chained
    # assignment, which warns on pandas 2.2 and no longer updates ``dataset``
    # once Copy-on-Write is active.
    for column in columns_to_fill:
        dataset[column] = dataset[column].fillna(-1)

    return dataset, vardict

## Diff time

In [None]:
# Count the missing values of every column listed under vardict["diff_time"].
dataset_train[vardict["diff_time"]].isnull().sum()

In [None]:
def data_transform_diff_time(dataset, vardict):
    """Replace missing values in the ``days_since_*`` columns with -1.

    The columns are filled on ``dataset`` itself (the caller's frame is
    modified) and that same frame is returned.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing the six columns listed in ``columns_to_fill``.
    vardict : dict
        Variable dictionary; it is neither read nor modified here and is
        returned as received.

    Returns
    -------
    tuple
        ``(dataset, vardict)`` -- the two objects that were passed in.

    Raises
    ------
    KeyError
        If one of the expected columns is missing from ``dataset``.
    """
    columns_to_fill = (
        "days_since_last_occurrence_same_language",
        "days_since_last_occurrence_any_language",
        "days_since_last_success_same_language",
        "days_since_last_success_any_language",
        "days_since_first_occur_same_language",
        "days_since_first_occur_any_language",
    )

    # Assign the filled column back instead of calling
    # ``dataset[col].fillna(-1, inplace=True)``: the inplace form is chained
    # assignment, which warns on pandas 2.2 and no longer updates ``dataset``
    # once Copy-on-Write is active.
    for column in columns_to_fill:
        dataset[column] = dataset[column].fillna(-1)

    return dataset, vardict

## Boolean

In [None]:
# Display the columns listed under vardict["boolean"] before they are dummy-encoded.
dataset_train[vardict["boolean"]]

In [None]:
def data_transform_boolean(dataset, vardict):
    """One-hot encode every variable listed in ``vardict["boolean"]``.

    Each listed column is expanded with ``pandas.get_dummies`` into one column
    per observed value plus a ``<name>__nan`` column (``dummy_na=True``).
    The returned frame holds the remaining original columns followed by the
    dummy columns, in the order of ``vardict["boolean"]``.

    The frame passed in is not modified; ``vardict`` is updated in place with
    the new ``"dummy_boolean"`` entry.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing every column named in ``vardict["boolean"]``.
    vardict : dict
        Variable dictionary; ``vardict["boolean"]`` lists the columns to encode.

    Returns
    -------
    tuple
        ``(dataset, vardict)`` where ``dataset`` is the encoded frame (the
        input frame itself when there is nothing to encode) and
        ``vardict["dummy_boolean"]`` lists the dummy column names.

    Raises
    ------
    KeyError
        If ``"boolean"`` is missing from ``vardict`` or a listed column is
        missing from ``dataset``.
    """
    boolean_vars = list(vardict["boolean"])

    # All levels are kept (drop_first defaults to False in get_dummies).
    dummy_frames = [
        pd.get_dummies(
            dataset[boolean_var],
            prefix=boolean_var,
            prefix_sep="__",
            dummy_na=True,
        )
        for boolean_var in boolean_vars
    ]

    vardict["dummy_boolean"] = [
        column for frame in dummy_frames for column in frame.columns.tolist()
    ]

    if not dummy_frames:
        return dataset, vardict

    # Drop the source columns from the frame that was passed in (not from a
    # notebook-level global) and attach all dummies with a single concat.
    dataset = pd.concat(
        [dataset.drop(columns=boolean_vars)] + dummy_frames, axis=1
    )

    return dataset, vardict

## Categorical

In [None]:
# Display the columns listed under vardict["categorical"] before they are dummy-encoded.
dataset_train[vardict["categorical"]]

In [None]:
def data_transform_categorical(dataset, vardict):
    """One-hot encode every variable listed in ``vardict["categorical"]``.

    Each listed column is expanded with ``pandas.get_dummies`` into one column
    per observed value plus a ``<name>__nan`` column (``dummy_na=True``).
    The returned frame holds the remaining original columns followed by the
    dummy columns, in the order of ``vardict["categorical"]``.

    The frame passed in is not modified; ``vardict`` is updated in place with
    the new ``"dummy_categorical"`` entry.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing every column named in ``vardict["categorical"]``.
    vardict : dict
        Variable dictionary; ``vardict["categorical"]`` lists the columns to
        encode.

    Returns
    -------
    tuple
        ``(dataset, vardict)`` where ``dataset`` is the encoded frame (the
        input frame itself when there is nothing to encode) and
        ``vardict["dummy_categorical"]`` lists the dummy column names.

    Raises
    ------
    KeyError
        If ``"categorical"`` is missing from ``vardict`` or a listed column is
        missing from ``dataset``.
    """
    categorical_vars = list(vardict["categorical"])

    # All levels are kept (drop_first defaults to False in get_dummies).
    dummy_frames = [
        pd.get_dummies(
            dataset[categorical_var],
            prefix=categorical_var,
            prefix_sep="__",
            dummy_na=True,
        )
        for categorical_var in categorical_vars
    ]

    vardict["dummy_categorical"] = [
        column for frame in dummy_frames for column in frame.columns.tolist()
    ]

    if not dummy_frames:
        return dataset, vardict

    # Build the result in one concat: the source columns are dropped from a
    # copy, so the caller's frame is never partially modified.
    dataset = pd.concat(
        [dataset.drop(columns=categorical_vars)] + dummy_frames, axis=1
    )

    return dataset, vardict

## Overall: apply all transformations

In [None]:
# Apply the four transformations in sequence; each step receives the frame and
# variable dictionary returned by the previous one.
# Not idempotent: the categorical step removes its source columns, so running
# this cell a second time raises KeyError.
for transform_step in (
    data_transform_numerical,
    data_transform_diff_time,
    data_transform_boolean,
    data_transform_categorical,
):
    dataset_train, vardict = transform_step(dataset_train, vardict)

### vardict

In [None]:
# Full feature list: the numerical and diff_time variables followed by the
# dummy columns produced by the boolean and categorical transformations.
vardict["all"] = [
    variable
    for group in ("numerical", "diff_time", "dummy_boolean", "dummy_categorical")
    for variable in vardict[group]
]

# 1st model

In [None]:
# Feature matrix: every column gathered in vardict["all"].
X_train = dataset_train[vardict["all"]]
# Target selected through vardict["target"].
y_train = dataset_train[vardict["target"]]

In [None]:
# Display the feature matrix.
X_train

In [None]:
# Display the target.
y_train

In [None]:
# NOTE(review): LogisticRegression is not imported in the visible cells, so this
# raises NameError on a fresh kernel. The assignment also rebinds `model`,
# replacing the ModelLogisticRegression instance whose preprocessing_inference()
# and .version are used by the validation cells further down.
model = LogisticRegression(random_state=0)

In [None]:
# Fit on the feature matrix and target built in the cells above.
model.fit(X_train, y_train)

In [None]:
# NOTE(review): `model_name` is not defined in the visible cells, so this cell
# raises NameError on a fresh kernel -- confirm the intended value.
# Persist the fitted model ...
with open(f"data/processed/{model_name}_model.pkl", "wb") as file:
    dill.dump(model, file)

# ... and the variable dictionary used to build its features.
with open(f"data/processed/{model_name}_vardict.pkl", "wb") as file:
    dill.dump(vardict, file)

# Validation results

In [None]:
# Apply the model's inference-time preprocessing and rebind dataset_valid to the result.
# NOTE(review): this needs `model` to be the ModelLogisticRegression instance,
# but the "1st model" section rebinds `model` to LogisticRegression(random_state=0)
# before this cell runs. Re-running this cell also feeds preprocessed data back in.
dataset_valid = model.preprocessing_inference(dataset_valid)

In [None]:
# Predict on the preprocessed validation set; the structure of `predictions`
# is defined by the model's predict() (not visible here).
predictions = model.predict(dataset=dataset_valid)

In [None]:
# Compute the binary-classification results from the predictions, labelled
# "<model.version>_valid".
binary_classification_results = performance_metrics.get_binary_classification_results(
    predictions, model_name=f"{model.version}_valid"
)

# Display the computed results.
binary_classification_results

In [None]:
# Compute the regression results from the same predictions, labelled
# "<model.version>_valid".
regression_results = performance_metrics.get_regression_results(
    predictions, model_name=f"{model.version}_valid"
)

# Display the computed results.
regression_results

In [None]:
# Plot the ROC AUC curve for the validation predictions.
performance_metrics.plot_roc_auc_curve(predictions, model_name=f"{model.version}_valid")

In [None]:
# Plot the precision-recall curve; the plotting helper also receives the
# binary-classification results computed above.
performance_metrics.plot_precision_recall_curve(
    predictions, binary_classification_results, model_name=f"{model.version}_valid"
)