In [None]:
# Notebook extensions: nb_black formats cells with Black, autoreload re-imports edited modules.
%load_ext nb_black
%load_ext autoreload
# Mode 2: reload all modules before executing a cell, so edits to project modules are picked up.
%autoreload 2

import os

# Show the directory the kernel was started in, before it is changed below.
print(os.getcwd())


def update_working_directory():
    """Move the process one directory up and print the new working directory.

    Each call changes the working directory to the parent of the current one,
    so calling it repeatedly keeps climbing the directory tree.
    """
    from pathlib import Path

    parent_dir = Path.cwd().parents[0]
    os.chdir(parent_dir)
    print(parent_dir)


# NOTE(review): every execution of this cell moves the working directory up one more level,
# so re-running it without restarting the kernel breaks the relative paths used below.
update_working_directory()

In [None]:
# All three pickles live in the same dated snapshot directory.
DATASET_DIR = "data/raw/20201105"

path_dataset_train = f"{DATASET_DIR}/dataset_train.pkl"
path_dataset_valid = f"{DATASET_DIR}/dataset_valid.pkl"
path_dataset_valid_historical = f"{DATASET_DIR}/dataset_valid_historical.pkl"

# Import

In [None]:
import dill
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)

import plotly.graph_objects as go
import matplotlib.pyplot as plt
import seaborn as sns

from src.models.fully_connected_nn import ModelFullyConnectedNN
import src.models.performance_metrics as performance_metrics

# Dataset

In [None]:
def _load_dill(path):
    """Deserialize and return the object stored in the dill file at ``path``."""
    # SECURITY: dill, like pickle, can execute arbitrary code while loading;
    # only open files that come from a trusted source.
    with open(path, "rb") as input_file:
        return dill.load(input_file)


# Training set, validation set, and the history that accompanies the validation set.
dataset_train = _load_dill(path_dataset_train)
dataset_valid = _load_dill(path_dataset_valid)
dataset_valid_historical = _load_dill(path_dataset_valid_historical)

# Overall

In [None]:
# Instantiate the project's fully connected model and display its `version` attribute.
model = ModelFullyConnectedNN()
model.version

In [None]:
# NOTE(review): overwrites the loaded training set with its preprocessed form, so re-running
# this cell feeds already-preprocessed data back into preprocessing_training.
dataset_train = model.preprocessing_training(dataset_train)

In [None]:
# Preprocess the validation set; the historical dataset is passed alongside it.
# NOTE(review): `dataset_valid` is overwritten, so this cell is not idempotent either.
dataset_valid = model.preprocessing_inference(
    dataset=dataset_valid, dataset_historical=dataset_valid_historical
)

In [None]:
# Train on the preprocessed training set; the validation set is passed as `dataset_test`.
model.train(dataset=dataset_train, dataset_test=dataset_valid)

In [None]:
# Presumably plots the loss recorded during model.train() — confirm in ModelFullyConnectedNN.
model.plot_loss()

In [None]:
# Presumably plots the accuracy recorded during model.train() — confirm in ModelFullyConnectedNN.
model.plot_accuracy()

# Initialize model

In [None]:
# NOTE(review): repeats the instantiation from the "Overall" section; on Run All this
# replaces the `model` object used above with a new instance.
model = ModelFullyConnectedNN()
model.version

In [None]:
# NOTE(review): `dataset_train` was already passed through preprocessing_training in the
# "Overall" section; on Run All it is preprocessed a second time here — confirm this is intended.
dataset_train = model.preprocessing_training(dataset_train)

In [None]:
# Inspect the training set returned by preprocessing_training.
dataset_train

Dataset for validation

**Objective:** have the latest max_session, fill with None

We need to use the whole history: train + valid.

In [None]:
# NOTE(review): same call as in the "Overall" section; on Run All `dataset_valid` has already
# been replaced by its preprocessed form when this cell executes.
dataset_valid = model.preprocessing_inference(
    dataset=dataset_valid, dataset_historical=dataset_valid_historical
)

In [None]:
# Inspect the validation set returned by preprocessing_inference.
dataset_valid

# Data Transformation

In [None]:
# NOTE(review): `dataset_train_valid` is not defined in any visible cell of this notebook, so
# this cell fails on a fresh kernel unless it is created first (presumably train + valid history).
nb_sessions_inference = (
    max(dataset_train_valid["id_session"]) + 1
)  # +1 because an id_session equal to 0 exists
nb_sessions_inference

In [None]:
# Worked example: every attempt on the German word "bald" when German was the language asked.
is_bald = dataset_train_valid["german_word"] == "bald"
asked_in_german = dataset_train_valid["language_asked"] == "german"
i_dataset_vocab = dataset_train_valid[is_bald & asked_in_german]

# Only the columns that matter for following the word across sessions.
columns_shown = [
    "id_vocab",
    "german_word",
    "english_word",
    "language_asked",
    "result",
    "id_session",
]
i_dataset_vocab[columns_shown]

In [None]:
# Session ids of the example rows, as a plain list in row order.
i_id_session = i_dataset_vocab["id_session"].tolist()
i_id_session

In [None]:
# Results of the example rows, aligned position by position with `i_id_session`.
i_result = i_dataset_vocab["result"].tolist()
i_result

In [None]:
# One `None` placeholder per session index, converted to a plain Python list.
i_results_session = np.full(shape=nb_sessions_inference, fill_value=None).tolist()
i_results_session

In [None]:
# Put each result at the index of its session; sessions without an attempt stay `None`.
for r, s in zip(i_result, i_id_session):
    i_results_session[s] = r
i_results_session

In [None]:
# `strip` was only imported in a later cell, so this cell raised NameError on a fresh kernel.
from more_itertools import strip


def pred(x):
    """Return True for the ``None`` placeholders that pad the session list."""
    return x is None


# Remove the leading and trailing placeholders; `None` values between results are kept.
i_results_session = list(strip(iterable=i_results_session, pred=pred))
i_results_session

In [None]:
from more_itertools import strip


def define_word_language_sessions(i_dataset_vocab, nb_sessions, inference=False):
    """Build the per-session result timeline of one (word, language) group.

    Args:
        i_dataset_vocab: DataFrame with ``id_session`` and ``result`` columns.
        nb_sessions: Timeline length. When ``inference`` is False it is only a
            minimum: the timeline grows to hold the largest ``id_session``.
            When ``inference`` is True it is used as the exact length.
        inference: Selects how ``nb_sessions`` is interpreted (see above).

    Returns:
        A one-element ``pd.Series`` indexed by ``"results_session"`` whose value
        is the list of results placed at their session index, with leading and
        trailing ``None`` entries removed.

    Raises:
        IndexError: If ``inference`` is True and an ``id_session`` does not fit
            in ``nb_sessions`` slots.
    """
    session_ids = i_dataset_vocab["id_session"].tolist()
    outcomes = i_dataset_vocab["result"].tolist()

    if inference:
        timeline_length = nb_sessions
    else:
        timeline_length = max(nb_sessions, max(session_ids) + 1)

    # Start from a timeline of placeholders, then drop each result into its slot.
    timeline = np.full(shape=timeline_length, fill_value=None).tolist()
    for session_id, outcome in zip(session_ids, outcomes):
        timeline[session_id] = outcome

    def is_placeholder(value):
        return value in {None}

    trimmed_timeline = list(strip(iterable=timeline, pred=is_placeholder))

    return pd.Series(data=[trimmed_timeline], index=["results_session"])

In [None]:
# One row per (id_vocab, language_asked) pair holding its session timeline.
# NOTE(review): `nb_sessions` is not defined in any visible cell (only `nb_sessions_inference`
# is), so this cell fails on a fresh kernel unless it is defined first.
dataset_train_vocab = (
    dataset_train.groupby(["id_vocab", "language_asked"])
    .apply(lambda x: define_word_language_sessions(x, nb_sessions))
    .reset_index()
)
dataset_train_vocab

In [None]:
# Persist the per-pair timelines with dill.
# NOTE(review): this writes a derived artefact into the data/raw/ directory.
with open("data/raw/20201105/dataset_train_journey.pkl", "wb") as file:
    dill.dump(dataset_train_vocab, file)


In [None]:
from more_itertools import locate


def multiply_word_language_sessions(i_dataset_vocab):
    """Expand one session timeline into one example per answered session.

    The timeline is the list stored in the first row of the ``results_session``
    column. For every position ``i`` whose value is not ``None`` one row is
    produced: ``before`` holds the timeline up to (excluding) ``i`` and
    ``result`` holds the value at ``i``.

    Args:
        i_dataset_vocab: DataFrame with a ``results_session`` column whose first
            row contains the timeline (a list of results and ``None``).

    Returns:
        DataFrame with the columns ``before`` and ``result``; it has no rows
        when the timeline contains no result.
    """
    timeline = i_dataset_vocab["results_session"].iloc[0]

    # Identity check instead of `!= None`, and a single pass over the timeline.
    answered = [
        (timeline[:position], outcome)
        for position, outcome in enumerate(timeline)
        if outcome is not None
    ]

    return pd.DataFrame(
        {
            "before": [history for history, _ in answered],
            "result": [outcome for _, outcome in answered],
        }
    )

In [None]:
# Expand every (id_vocab, language_asked) timeline into one row per answered session.
# reset_index() exposes the unnamed inner index of the applied frames as "level_2";
# that helper column is dropped right away.
dataset_train_vocab_multiplied = (
    dataset_train_vocab.groupby(["id_vocab", "language_asked"])
    .apply(multiply_word_language_sessions)
    .reset_index()
    .drop(columns="level_2")
)


In [None]:
# Inspect the expanded training examples (columns: id_vocab, language_asked, before, result).
dataset_train_vocab_multiplied

In [None]:
# Persist the expanded examples with dill.
# NOTE(review): this writes a derived artefact into the data/raw/ directory.
with open("data/raw/20201105/dataset_train_journey_multiplied.pkl", "wb") as file:
    dill.dump(dataset_train_vocab_multiplied, file)


In [None]:
from src.features.make_sessions import (
    define_word_language_sessions,
    multiply_word_language_sessions,
)

In [None]:
# Recomputed from `dataset_valid` this time (the earlier cell used `dataset_train_valid`).
nb_sessions_inference = (
    max(dataset_valid["id_session"]) + 1
)  # +1 because an id_session equal to 0 exists
nb_sessions_inference

In [None]:
# Session timeline of every (id_vocab, language_asked) pair over the combined history.
# NOTE(review): relies on `dataset_train_valid`, which no visible cell defines. After the
# import cell above, `define_word_language_sessions` refers to the version from
# src.features.make_sessions, not to the function defined earlier in this notebook.
dataset_word_language_sessions_valid = (
    dataset_train_valid.groupby(["id_vocab", "language_asked"])
    .apply(lambda x: define_word_language_sessions(x, nb_sessions_inference))
    .reset_index()
)

In [None]:
# Keep, for each (id_vocab, language_asked) pair, only the row(s) of its latest session.
# The string "max" selects pandas' own group-wise maximum; passing the builtin `max`
# is deprecated in recent pandas versions and emits a FutureWarning.
latest_session = dataset_valid.groupby(["id_vocab", "language_asked"])[
    "id_session"
].transform("max")
dataset_valid = dataset_valid[latest_session == dataset_valid["id_session"]]

In [None]:
# Attach each pair's session timeline to its validation rows (pd.merge defaults to an inner join).
# NOTE(review): `dataset_valid` is overwritten, so re-running this cell merges the timelines
# into the already-merged frame.
dataset_valid = pd.merge(
    dataset_valid,
    dataset_word_language_sessions_valid,
    on=["id_vocab", "language_asked"],
)

In [None]:
# Inspect the validation rows after the merge.
dataset_valid

# Fully connected model

## Import

In [None]:
import tensorflow as tf

## Dataset

Objective: create an array of sub-arrays. Each sub-array has X elements: the most recent failures/successes. To start with, all missing elements are replaced by None.

* ~~1st test:~~ keeping None, 0, 1. We don't apply preprocessing -> **None is not supported**
* 2nd test (if the 1st fails): recode the values as -1 = failure, 0 = None, 1 = success

We transform it to tensor.

To this tensor, we apply shuffle, batch and repeat.

We apply the same for validation set.

In [None]:
import src.features.make_sessions as make_sessions

##### sessions_numeric

In [None]:
# Length of the longest "before" history in the training set.
max_sessions = dataset_train["before"].map(len).max()

# add None to complete sessions
# (presumably pads every history to `max_sessions` entries — confirm in
# make_sessions.standardize_sessions)
dataset_train["sessions_standardized"] = dataset_train["before"].map(
    lambda x: make_sessions.standardize_sessions(x, max_sessions)
)

# Numeric encoding of the standardized histories (mapping defined in
# make_sessions.map_session_to_numeric).
dataset_train["sessions_numeric"] = dataset_train["sessions_standardized"].map(
    make_sessions.map_session_to_numeric
)

In [None]:
# Inspect the training set with the two new session columns.
dataset_train

In [None]:
# add None to complete sessions
# Same transformation as for the training set; `max_sessions` is the value computed
# on the training set, not recomputed on the validation set.
dataset_valid["sessions_standardized"] = dataset_valid["before"].map(
    lambda x: make_sessions.standardize_sessions(x, max_sessions)
)

# Numeric encoding of the standardized histories (mapping defined in
# make_sessions.map_session_to_numeric).
dataset_valid["sessions_numeric"] = dataset_valid["sessions_standardized"].map(
    make_sessions.map_session_to_numeric
)



In [None]:
# Inspect the validation set with the two new session columns.
dataset_valid

##### array of array

In [None]:
# transform to array of array
# Stack the per-row numeric histories into a single int8 array (the model inputs).
sessions_train = np.array(dataset_train["sessions_numeric"].tolist(), dtype="int8")
sessions_train.shape

In [None]:
# Training labels taken from the "result" column, stored as int8.
targets_train = np.array(dataset_train["result"].tolist(), dtype="int8")  # to try: bool
targets_train.shape

In [None]:
# transform to array of array
# Same conversion as for the training inputs.
sessions_valid = np.array(dataset_valid["sessions_numeric"].tolist(), dtype="int8")
sessions_valid.shape

In [None]:
# Validation labels taken from the "result" column, stored as int8.
targets_valid = np.array(dataset_valid["result"].tolist(), dtype="int8")  # to try: bool
targets_valid.shape

##### batch

In [None]:
# transform into batch

batch_size = 32
num_epochs = 10

# No .repeat() here: the training loop further down already iterates over `ds_train`
# once per epoch. Repeating the data `num_epochs` times as well made every reported
# "epoch" cover `num_epochs` passes over the training set (num_epochs ** 2 in total).
# shuffle() draws a new order each time the dataset is iterated (its default behaviour).
ds_train = (
    tf.data.Dataset.from_tensor_slices((sessions_train, targets_train))
    .shuffle(buffer_size=10 * batch_size)
    .batch(batch_size)
)

In [None]:
# Display the dataset object (element shapes and dtypes).
ds_train

In [None]:
# Validation pipeline: batched only, with no shuffling and no repetition.
ds_valid = tf.data.Dataset.from_tensor_slices((sessions_valid, targets_valid)).batch(
    batch_size
)

In [None]:
# Display the dataset object (element shapes and dtypes).
ds_valid

## Model

In [None]:
# Take the per-example input shape from the training array instead of hard-coding (8,),
# so the first layer keeps matching the data when the session history length changes.
input_shape_test = sessions_train.shape[1:]

In [None]:
from tensorflow.keras import Sequential

from tensorflow.keras.layers import Dense


In [None]:
# Two hidden ReLU layers of 10 units, then a 2-unit output layer without activation.
first_hidden = Dense(
    10, activation="relu", input_shape=input_shape_test
)  # input shape required
second_hidden = Dense(10, activation="relu")
output_layer = Dense(2)

nn_model = Sequential([first_hidden, second_hidden, output_layer])

## Loss

In [None]:
# Integer class labels against raw model outputs: from_logits=True because the last
# Dense layer of `nn_model` has no activation.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

### Metrics to measure during epoch

In [None]:
# Running metrics updated batch by batch in train_step / test_step and reset at the
# start of every epoch by the training loop.
loss_train = tf.keras.metrics.Mean(name="loss_train")
accuracy_train = tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy_train")

loss_valid = tf.keras.metrics.Mean(name="loss_valid")
accuracy_valid = tf.keras.metrics.SparseCategoricalAccuracy(name="accuracy_valid")

## Optimizer

In [None]:
# Adam with its default hyper-parameters (no arguments are passed).
optimizer = tf.keras.optimizers.Adam()

## Training function

In [None]:
@tf.function
def train_step(sessions, labels):
    """Run one optimisation step on a batch and update the training metrics.

    Args:
        sessions: Batch of model inputs.
        labels: Class labels of the batch.
    """
    # The tape records the forward pass so the loss can be differentiated afterwards.
    with tf.GradientTape() as tape:
        # training=True only matters for layers that behave differently
        # between training and inference (e.g. Dropout).
        logits = nn_model(sessions, training=True)
        batch_loss = loss_object(labels, logits)

    weights = nn_model.trainable_variables
    grads = tape.gradient(batch_loss, weights)
    optimizer.apply_gradients(zip(grads, weights))

    # Accumulate into the running metrics of the current epoch.
    loss_train(batch_loss)
    accuracy_train(labels, logits)

## Testing function

In [None]:
@tf.function
def test_step(sessions, labels):
    """Evaluate one batch and update the validation metrics; no weights are changed.

    Args:
        sessions: Batch of model inputs.
        labels: Class labels of the batch.
    """
    # training=False only matters for layers that behave differently
    # between training and inference (e.g. Dropout).
    logits = nn_model(sessions, training=False)
    batch_loss = loss_object(labels, logits)

    # Accumulate into the running metrics of the current epoch.
    loss_valid(batch_loss)
    accuracy_valid(labels, logits)

## Launch

In [None]:
# One history list per tracked metric; the training loop appends one value per epoch.
results = {
    metric_name: []
    for metric_name in ("loss_train", "accuracy_train", "loss_valid", "accuracy_valid")
}

In [None]:
# Metric objects keyed by the name of the history list they feed in `results`.
epoch_metrics = {
    "loss_train": loss_train,
    "accuracy_train": accuracy_train,
    "loss_valid": loss_valid,
    "accuracy_valid": accuracy_valid,
}

for epoch in range(num_epochs):

    # Every epoch starts from empty accumulators.
    for metric in epoch_metrics.values():
        metric.reset_states()

    for sessions, labels in ds_train:
        train_step(sessions, labels)

    for test_sessions, test_labels in ds_valid:
        test_step(test_sessions, test_labels)

    # End of epoch: store the accumulated values in the history.
    for metric_name, metric in epoch_metrics.items():
        results[metric_name].append(metric.result())

    print(
        f"Epoch {epoch + 1}, \n"
        f"Loss: {loss_train.result()}, \n"
        f"Accuracy: {accuracy_train.result() * 100},\n"
        f"Test Loss: {loss_valid.result()}, \n"
        f"Test Accuracy: {accuracy_valid.result() * 100}\n"
    )

In [None]:
import plotly.graph_objects as go

# Create traces
fig = go.Figure()

# One line per split, drawn against the 0-based epoch index.
for series_name in ("loss_train", "loss_valid"):
    series = results[series_name]
    fig.add_trace(
        go.Scatter(
            x=list(range(len(series))),
            y=series,
            mode="lines",  # ['markers', 'lines']
            name=series_name,
        )
    )

fig.update_layout(
    title="Loss train vs valid per epoch",
    xaxis_title="epoch",
    yaxis_title="loss",
    legend={"itemsizing": "constant"},
)

fig.show()

In [None]:
import plotly.graph_objects as go

# Create traces
fig = go.Figure()

# One line per split, drawn against the 0-based epoch index.
for series_name in ("accuracy_train", "accuracy_valid"):
    series = results[series_name]
    fig.add_trace(
        go.Scatter(
            x=list(range(len(series))),
            y=series,
            mode="lines",  # ['markers', 'lines']
            name=series_name,
        )
    )

fig.update_layout(
    title="Accuracy train vs valid per epoch",
    xaxis_title="epoch",
    yaxis_title="accuracy",
    legend={"itemsizing": "constant"},
)

fig.show()