In [1]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

import os

print(os.getcwd())


def update_working_directory():
    from pathlib import Path

    p = Path(os.getcwd()).parents[0]
    os.chdir(p)
    print(p)


update_working_directory()

/Users/admin/Projects/vocabulary_learning/notebooks
/Users/admin/Projects/vocabulary_learning


<IPython.core.display.Javascript object>

In [2]:
path_dataset_train = "data/raw/20201009/dataset_train.pkl"
path_dataset_valid = "data/raw/20201009/dataset_valid.pkl"

<IPython.core.display.Javascript object>

# Import

In [3]:
import dill
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)

import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

from src.models.logistic_regression import ModelLogisticRegression
import src.models.performance_metrics as performance_metrics

<IPython.core.display.Javascript object>

# Dataset

In [None]:
with open(path_dataset_train, "rb") as input_file:
    dataset_train = dill.load(input_file)

# Overall

In [None]:
model = ModelLogisticRegression()
model.version

In [None]:
dataset_train = model.preprocessing_training(dataset_train)

In [None]:
model.train(dataset_train)

In [None]:
model.plot_coefficients()

In [None]:
with open(f"models/{model.version}__model.pkl", "wb") as file:
    dill.dump(model, file)

# Data Transformation

In [None]:
vardict.keys()

## Target

In [None]:
dataset_train[model.vardict["target"]].describe()

## Numerical

In [None]:
dataset_train[vardict["numerical"]].isnull().sum()

In [None]:
def data_transform_numerical(dataset, vardict):

    dataset["previous_levenshtein_distance_guess_answer"].fillna(-1, inplace=True)
    dataset["previous_question_time"].fillna(-1, inplace=True)
    dataset["previous_write_it_again_german"].fillna(-1, inplace=True)
    dataset["previous_write_it_again_english"].fillna(-1, inplace=True)

    return dataset, vardict

## Diff time

In [None]:
dataset_train[vardict["diff_time"]].isnull().sum()

In [None]:
def data_transform_diff_time(dataset, vardict):

    dataset["days_since_last_occurrence_same_language"].fillna(-1, inplace=True)
    dataset["days_since_last_occurrence_any_language"].fillna(-1, inplace=True)
    dataset["days_since_last_success_same_language"].fillna(-1, inplace=True)
    dataset["days_since_last_success_any_language"].fillna(-1, inplace=True)
    dataset["days_since_first_occur_same_language"].fillna(-1, inplace=True)
    dataset["days_since_first_occur_any_language"].fillna(-1, inplace=True)

    return dataset, vardict

## Boolean

In [None]:
dataset_train[vardict["boolean"]]

In [None]:
def data_transform_boolean(dataset, vardict):

    # Transform to dummies

    vardict["dummy_boolean"] = []

    for i_var_boolean in vardict["boolean"]:

        # possible improvement: pandas.get_dummies(drop_first=False)
        i_dummy_boolean = pd.get_dummies(
            dataset[i_var_boolean],
            prefix=i_var_boolean,
            prefix_sep="__",
            dummy_na=True,
        )

        del dataset_train[i_var_boolean]

        vardict["dummy_boolean"] = (
            vardict["dummy_boolean"] + i_dummy_boolean.columns.tolist()
        )

        dataset = pd.concat([dataset, i_dummy_boolean], axis=1)

    dataset[vardict["dummy_boolean"]].describe()

    return dataset, vardict

## Categorical

In [None]:
dataset_train[model.vardict["categorical"]]["previous_language_asked"].value_counts()

In [None]:
def data_transform_categorical(dataset, vardict):

    # Transform to dummies

    vardict["dummy_categorical"] = []

    for i_var_categorical in vardict["categorical"]:

        # possible improvement: pandas.get_dummies(drop_first=False)
        i_dummy_categorical = pd.get_dummies(
            dataset[i_var_categorical],
            prefix=i_var_categorical,
            prefix_sep="__",
            dummy_na=True,
        )

        del dataset[i_var_categorical]

        vardict["dummy_categorical"] = (
            vardict["dummy_categorical"] + i_dummy_categorical.columns.tolist()
        )

        dataset = pd.concat([dataset, i_dummy_categorical], axis=1)

    return dataset, vardict

## Overall

In [None]:
dataset_train, vardict = data_transform_numerical(dataset_train, vardict)
dataset_train, vardict = data_transform_diff_time(dataset_train, vardict)
dataset_train, vardict = data_transform_boolean(dataset_train, vardict)
dataset_train, vardict = data_transform_categorical(dataset_train, vardict)

### vardict

In [None]:
vardict["all"] = (
    vardict["numerical"]
    + vardict["diff_time"]
    + vardict["dummy_boolean"]
    + vardict["dummy_categorical"]
)

# 1st model

In [None]:
model = ModelLogisticRegression()
model.version

In [None]:
dataset_train = model.preprocessing_training(dataset_train)

In [None]:
dataset = dataset_train.copy()

In [None]:
X_train = dataset[model.vardict["into_model"]]
y_train = dataset[model.vardict["target"]]

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [None]:
from sklearn.linear_model import LogisticRegression

logistic_Reg = LogisticRegression(solver="liblinear")

In [None]:
pipe = Pipeline(steps=[("logistic_Reg", logistic_Reg)])

In [None]:
C = np.logspace(-4, 4, 50)
penalty = ["l1", "l2"]

In [None]:
parameters = dict(logistic_Reg__C=C, logistic_Reg__penalty=penalty)

In [None]:
clf_GS = GridSearchCV(pipe, parameters)
clf_GS.fit(X_train, y_train)

In [None]:
print("Best Penalty:", clf_GS.best_estimator_.get_params()["logistic_Reg__penalty"])
print("Best C:", clf_GS.best_estimator_.get_params()["logistic_Reg__C"])
print()
print(clf_GS.best_estimator_.get_params()["logistic_Reg"])

In [None]:
os = SMOTE(random_state=0)
X_train = dataset[model.vardict["all"]]
y_train = dataset[[model.vardict["target"]]]
X_train_os, y_train_os_series = os.fit_sample(X_train, y_train)
y_train_os = pd.DataFrame()
y_train_os[model.vardict["target"]] = y_train_os_series
dataset = pd.concat([X_train_os, y_train_os], axis=1)


In [None]:
y_train_os_series

In [None]:
from imblearn.over_sampling import SMOTE

In [None]:
os = SMOTE(random_state=0)

In [None]:
len(X_train)

In [None]:
X_train_os, y_train_os = os.fit_sample(X_train, y_train)

In [None]:
y_train_os

In [None]:
dataset_test2 = pd.concat([X_train_os, y_train_os], axis=0)

In [None]:
dataset_test2

In [None]:
len(X_train_os)

In [None]:
y_train_os

In [None]:
os_data_X = pd.DataFrame(data=X_train_os, columns=X_train.columns)
os_data_y = pd.DataFrame()
os_data_y["y"] = y_train_os

In [None]:
os_data_y

In [None]:
# we can Check the numbers of our data
print("length of oversampled data is ", len(os_data_X))
print(
    "Number of no subscription in oversampled data", len(os_data_y[os_data_y["y"] == 0])
)
print("Number of subscription", len(os_data_y[os_data_y["y"] == 1]))
print(
    "Proportion of no subscription data in oversampled data is ",
    len(os_data_y[os_data_y["y"] == 0]) / len(os_data_X),
)
print(
    "Proportion of subscription data in oversampled data is ",
    len(os_data_y[os_data_y["y"] == 1]) / len(os_data_X),
)

In [None]:
X_train

In [None]:
y_train

In [None]:
model = LogisticRegression(random_state=0)

In [None]:
model.fit(X_train, y_train)

In [None]:
with open(f"data/processed/{model_name}_model.pkl", "wb") as file:
    dill.dump(model, file)

with open(f"data/processed/{model_name}_vardict.pkl", "wb") as file:
    dill.dump(vardict, file)

# Validation results

In [None]:
with open(path_dataset_valid, "rb") as input_file:
    dataset_valid = dill.load(input_file)
y_valid = dataset_valid[model.vardict["target"]].copy()

In [None]:
dataset_valid = model.preprocessing_inference(dataset_valid)

In [None]:
predictions = model.predict(dataset=dataset_valid, target_present=False)

In [None]:
predictions["y_true"] = y_valid.values.tolist()

In [None]:
predictions

In [None]:
binary_classification_results = performance_metrics.get_binary_classification_results(
    predictions, model_name=f"{model.version}_valid"
)

binary_classification_results

In [None]:
regression_results = performance_metrics.get_regression_results(
    predictions, model_name=f"{model.version}_valid"
)

regression_results

In [None]:
performance_metrics.plot_roc_auc_curve(predictions, model_name=f"{model.version}_valid")

In [None]:
performance_metrics.plot_precision_recall_curve(
    predictions, binary_classification_results, model_name=f"{model.version}_valid"
)

In [None]:
performance_metrics.plot_predictions(predictions, model_name=f"{model.version}_valid")

# Hyperparameters search

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

<IPython.core.display.Javascript object>

In [5]:
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import LogisticRegression

<IPython.core.display.Javascript object>

In [6]:
with open(path_dataset_train, "rb") as input_file:
    dataset_train = dill.load(input_file)

<IPython.core.display.Javascript object>

In [7]:
model = ModelLogisticRegression()
model.version

'logistic_regression_mle__20201016'

<IPython.core.display.Javascript object>

In [67]:
scaler = StandardScaler()
dimension_reduction = PCA()
model_logistic_regression = LogisticRegression()
feature_selection = RFE(estimator=LogisticRegression())

<IPython.core.display.Javascript object>

## 1st part preprocessing - fix

In [9]:
dataset_train = model.preprocessing_training_numerical(dataset_train)
dataset_train = model.preprocessing_training_diff_time(dataset_train)
dataset_train = model.preprocessing_training_boolean(dataset_train)
dataset_train = model.preprocessing_training_categorical(dataset_train)

model.vardict["preprocessed"] = (
        model.vardict["numerical"]
        + model.vardict["diff_time"]
        + model.vardict["dummy_boolean"]
        + model.vardict["dummy_categorical"]
)

# SMOTE
# dataset_train = model.apply_sampling(dataset_train)


X_train = dataset_train[model.vardict["preprocessed"]]
y_train = dataset_train[[model.vardict["target"]]]


<IPython.core.display.Javascript object>

## pipeline

In [11]:
pipe = Pipeline(
    steps=[
        ("std_slc", scaler),
        ("pca", dimension_reduction),
        ("feat_select", feature_selection),
        ("logistic_Reg", model_logistic_regression),
    ]
)

<IPython.core.display.Javascript object>

## Grid

In [72]:
param_grid = {
    "pca__n_components": list(range(1, X_train.shape[1] + 1, 1)),
    "logistic_Reg__penalty": ["l1", "l2"],
    "logistic_Reg__C": np.logspace(-4, 4, 20),
    "logistic_Reg__solver": ["liblinear"],
}

<IPython.core.display.Javascript object>

## Create grid search object

In [73]:
clf = GridSearchCV(pipe, param_grid=param_grid, cv=5, verbose=True, n_jobs=2)

<IPython.core.display.Javascript object>

## Fit on data

In [74]:
best_clf = clf.fit(X_train, y_train)

Fitting 5 folds for each of 360 candidates, totalling 1800 fits


[Parallel(n_jobs=2)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=2)]: Done 360 tasks      | elapsed:    2.8s
[Parallel(n_jobs=2)]: Done 1800 out of 1800 | elapsed:    9.9s finished

A column-vector y was passed when a 1d array was expected. Please change the shape of y to (n_samples, ), for example using ravel().



<IPython.core.display.Javascript object>

In [75]:
print("----- Best parameters -----")
for i_parameter in list(param_grid.keys()):
    print("{} - {}".format(i_parameter, clf.best_params_[i_parameter]))

----- Best parameters -----
pca__n_components - 5
logistic_Reg__penalty - l2
logistic_Reg__C - 0.03359818286283781
logistic_Reg__solver - liblinear


<IPython.core.display.Javascript object>

In [76]:
hyperparameters_df = pd.DataFrame.from_dict(clf.cv_results_)
hyperparameters_df["rank_test_score_inverse"] = (
    max(hyperparameters_df["rank_test_score"]) - hyperparameters_df["rank_test_score"]
)
hyperparameters_df.sort_values("rank_test_score_inverse", inplace=True)
hyperparameters_df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logistic_Reg__C,param_logistic_Reg__penalty,param_logistic_Reg__solver,param_pca__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,rank_test_score_inverse
0,0.017385,0.013253,0.002783,0.000267,0.0001,l1,liblinear,1,"{'logistic_Reg__C': 0.0001, 'logistic_Reg__pen...",0.629630,0.629630,0.629630,0.653846,0.653846,0.639316,0.011864,298,0
36,0.006721,0.000459,0.002576,0.000195,0.000695193,l1,liblinear,1,"{'logistic_Reg__C': 0.0006951927961775605, 'lo...",0.629630,0.629630,0.629630,0.653846,0.653846,0.639316,0.011864,298,0
37,0.006234,0.000119,0.002396,0.000039,0.000695193,l1,liblinear,2,"{'logistic_Reg__C': 0.0006951927961775605, 'lo...",0.629630,0.629630,0.629630,0.653846,0.653846,0.639316,0.011864,298,0
38,0.006584,0.000721,0.002447,0.000263,0.000695193,l1,liblinear,3,"{'logistic_Reg__C': 0.0006951927961775605, 'lo...",0.629630,0.629630,0.629630,0.653846,0.653846,0.639316,0.011864,298,0
39,0.006911,0.000769,0.002474,0.000196,0.000695193,l1,liblinear,4,"{'logistic_Reg__C': 0.0006951927961775605, 'lo...",0.629630,0.629630,0.629630,0.653846,0.653846,0.639316,0.011864,298,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
121,0.005982,0.000190,0.002388,0.000108,0.0335982,l2,liblinear,5,"{'logistic_Reg__C': 0.03359818286283781, 'logi...",0.703704,0.592593,0.814815,0.769231,0.807692,0.737607,0.082525,1,297
122,0.005976,0.000020,0.002371,0.000023,0.0335982,l2,liblinear,6,"{'logistic_Reg__C': 0.03359818286283781, 'logi...",0.703704,0.592593,0.814815,0.769231,0.807692,0.737607,0.082525,1,297
123,0.007047,0.001084,0.003121,0.000638,0.0335982,l2,liblinear,7,"{'logistic_Reg__C': 0.03359818286283781, 'logi...",0.703704,0.592593,0.814815,0.769231,0.807692,0.737607,0.082525,1,297
124,0.006529,0.000372,0.002566,0.000167,0.0335982,l2,liblinear,8,"{'logistic_Reg__C': 0.03359818286283781, 'logi...",0.703704,0.592593,0.814815,0.769231,0.807692,0.737607,0.082525,1,297


<IPython.core.display.Javascript object>

In [77]:
for i_parameter in list(param_grid.keys()):
    # variable_to_plot = "param_pca__n_components"
    variable_to_plot = f"param_{i_parameter}"

    import plotly.graph_objects as go

    # hyperparameters_df.sort_values(variable_to_plot, inplace=True)

    # Create traces
    fig = go.Figure()

    fig.add_trace(
        go.Scatter(
            x=hyperparameters_df[variable_to_plot],
            y=hyperparameters_df["rank_test_score_inverse"],
            mode="markers",
            marker=dict(
                color=hyperparameters_df["mean_test_score"],
                colorscale="Viridis_r",  # one of plotly colorscales
                colorbar=dict(title="mean_test_score"),
                showscale=True,
            ),
            hovertemplate="<b>Trial %{y}</b><br><br>"
            + "Value: %{x:.5f}<br>"
            + "MAE: %{marker.color:.3f}<br>"
            + "<extra></extra>",
            showlegend=False,
            name=variable_to_plot,
        )
    )

    if i_parameter in ["logistic_Reg__C"]:
        fig.update_xaxes(type="log")

    fig.add_trace(
        go.Scatter(
            x=[
                clf.best_params_[i_parameter],
                clf.best_params_[i_parameter],
            ],
            y=[
                min(hyperparameters_df["rank_test_score_inverse"]),
                max(hyperparameters_df["rank_test_score_inverse"]),
            ],
            mode="lines",
            showlegend=False,
            line=dict(color="#e377c2", dash="dash"),
        )
    )

    fig.update_layout(
        title="Evolution of hyperparameter {} by trial".format(variable_to_plot),
        xaxis_title=variable_to_plot,
        yaxis_title="Rank number (the higher, the better ranked)",
        legend={"itemsizing": "constant"},
    )

    fig.update_layout(
        annotations=[
            go.layout.Annotation(
                text="Vertical line: best score",
                align="center",
                showarrow=False,
                xref="paper",
                yref="paper",
                x=0.5,
                y=-0.22,
                bordercolor="black",
                borderwidth=1,
            )
        ]
    )

    fig.show()

<IPython.core.display.Javascript object>