In [None]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

import os

# Show the directory the kernel starts in, before it is changed further down.
print(os.getcwd())


def update_working_directory():
    """Change the process working directory to its parent and print it.

    The parent is derived from the working directory at call time, so each
    call climbs exactly one more level.
    """
    from pathlib import Path

    parent_dir = Path.cwd().parents[0]
    os.chdir(parent_dir)
    print(parent_dir)


# Step one level up so that the `src.*` imports and the relative `data/` and
# `models/` paths used below resolve (assumes the notebook sits one directory
# below the project root — TODO confirm).
# NOTE: re-running this cell climbs one more level each time.
update_working_directory()

In [None]:
import datetime
import dill
import numpy as np
import pandas as pd

import src.data.get_dataset as get_dataset
import src.data.make_dataset as make_dataset
import src.data.make_predictions_next_session as make_predictions_next_session

# None removes pandas' limit on the number of columns shown when displaying frames.
pd.set_option("display.max_columns", None)

# Overall: end-to-end run in a single call

In [None]:
# All paths are relative, i.e. resolved against the working directory set in
# the first cell.
historical_data_path = "data/raw/historical_data.csv"
vocab_path = "data/raw/german_english.csv"

# Pickled model file handed to the pipeline.
model_path = "models/logistic_regression_mle__20201017__model.pkl"

# Judging by their names these two are output locations (intermediate dataset
# and final probabilities) — TODO confirm against make_predictions_next_session.
dataset_predictions_path = "data/raw/dataset_predictions.pkl"
probas_next_session_path = "data/raw/predictions_next_session.csv"

# Single call that receives all five paths positionally.
make_predictions_next_session.make_predictions_next_session(
    historical_data_path,
    vocab_path,
    model_path,
    dataset_predictions_path,
    probas_next_session_path,
)

# Details: step-by-step run on the `data/raw/20201017` files

In [None]:
# Same file names as in the "Overall" cell, but under the dated
# data/raw/20201017/ folder. These assignments rebind the names set there.
historical_data_path = "data/raw/20201017/historical_data.csv"
vocab_path = "data/raw/20201017/german_english.csv"

dataset_predictions_path = "data/raw/20201017/dataset_predictions.pkl"
probas_next_session_path = "data/raw/20201017/predictions_next_session.csv"

In [None]:
# Create dataset_new_session

In [None]:
# Called for its side effects only (the return value is discarded). The third
# argument is presumably the output location: the same path is read back
# further down before predicting — TODO confirm.
make_dataset.create_dataset_new_session(
    historical_data_path, vocab_path, dataset_predictions_path
)

In [None]:
# Load model

In [None]:
# Load the serialized model. The path is a plain string literal: there is
# nothing to interpolate, so it needs no f-prefix.
# NOTE: dill.load can execute arbitrary code while unpickling — only load
# model files that come from a trusted source.
with open("models/logistic_regression_mle__20201017__model.pkl", "rb") as input_file:
    model = dill.load(input_file)

In [None]:
# Compute predictions

In [None]:
# Read back the pickled dataset stored at dataset_predictions_path.
with open(dataset_predictions_path, "rb") as pickled_dataset:
    dataset_predictions = dill.load(pickled_dataset)

# Set aside the identifying columns so they can be placed next to the model
# output later on.
dataset_to_keep = dataset_predictions.loc[
    :, ["id_vocab", "german_word", "english_word", "language_asked"]
]

In [None]:
# Run the model's own inference-time preprocessing on the loaded dataset.
# NOTE(review): the result is bound to the same name as the input, so
# re-running this cell feeds already-preprocessed data back in; re-execute the
# loading cell first when running it again.
dataset_predictions = model.preprocessing_inference(dataset_predictions)

In [None]:
# Ask the model for predictions on the preprocessed dataset.
# target_present=False presumably tells it that no label column is available
# — TODO confirm against the model class.
predictions = model.predict(dataset=dataset_predictions, target_present=False)

In [None]:
# Put the identifying columns back next to the model output. Rows are matched
# on the index, so both frames must share it.
# `predictions` is rebound to the combined frame, so on a re-run it already
# holds the identifying columns; they are dropped first to keep the cell
# idempotent (errors="ignore" makes the drop a no-op on the first run).
predictions = pd.concat(
    [
        dataset_to_keep,
        predictions.drop(columns=dataset_to_keep.columns, errors="ignore"),
    ],
    axis=1,
)

In [None]:
# Display the combined frame for a visual check.
predictions

In [None]:
# Reshape to one row per id_vocab with one probability column per value of
# language_asked, then give the german/english columns explicit *_proba names.
probas_next_session = (
    predictions[["id_vocab", "language_asked", "y_proba"]]
    .pivot(index="id_vocab", columns="language_asked", values="y_proba")
    .reset_index()
    .rename(columns={"german": "german_proba", "english": "english_proba"})
)
# The pivot leaves "language_asked" as the name of the column index; clear it.
probas_next_session.columns.name = None

probas_next_session

In [None]:
# Write the per-word probabilities to CSV (index=False keeps the row index out
# of the file) and report where the file was written.
probas_next_session.to_csv(probas_next_session_path, index=False)
print(f"Saved at {probas_next_session_path}")

In [None]:
import plotly.graph_objects as go

# One histogram per asked language on a shared figure, with bins 0.01 wide.
# Optional extra argument for go.Histogram: histnorm="probability".
fig = go.Figure()

for proba_column, trace_name in (
    ("german_proba", "german"),
    ("english_proba", "english"),
):
    fig.add_trace(
        go.Histogram(
            x=probas_next_session[proba_column],
            name=trace_name,
            xbins={"size": 0.01},
        )
    )

fig.update_layout(
    title="predictions",
    xaxis_title="prediction",
    yaxis_title="count",
    legend=dict(itemsizing="constant"),
)

# Semi-transparent bars for every trace.
# Optional: fig.update_layout(barmode="overlay")
fig.update_traces(opacity=0.75)

fig.show()