In [1]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

import os

print(os.getcwd())


def update_working_directory(notebook_dir_name="notebooks"):
    """Move the working directory from the notebook folder to its parent.

    The notebook uses project-relative paths such as ``data/raw/...`` and imports
    the ``src`` package, so the kernel has to run from the parent of the folder
    that holds the notebooks.

    Parameters
    ----------
    notebook_dir_name : str, optional
        Name of the folder that holds the notebooks. The directory is only
        changed when the current working directory has this name, so re-running
        the cell without restarting the kernel is a no-op instead of climbing
        one level higher on every run.

    Returns
    -------
    None
        The resulting working directory is printed.
    """
    from pathlib import Path

    current = Path(os.getcwd())
    # Only step up when we are still inside the notebook folder (idempotent).
    target = current.parent if current.name == notebook_dir_name else current
    os.chdir(target)
    print(target)


update_working_directory()

/Users/admin/Projects/vocabulary_learning/notebooks
/Users/admin/Projects/vocabulary_learning


<IPython.core.display.Javascript object>

In [2]:
import numpy as np
import pandas as pd

pd.set_option("display.max_columns", None)

import datetime
import dill

from sklearn.model_selection import train_test_split

<IPython.core.display.Javascript object>

# Overall

In [None]:
# Paths are relative to the directory update_working_directory() switched to.
# Inputs of create_dataset: the historical data and the vocabulary CSV files.
historical_data_path = "data/raw/20201009/historical_data.csv"
vocab_path = "data/raw/20201009/german_english.csv"
# Third argument of create_dataset.
dataset_path = "data/raw/20201009/dataset.pkl"

# Paths passed to split_train_valid_test_dataset for the three subsets.
train_dataset_path = "data/raw/20201009/dataset_train.pkl"
valid_dataset_path = "data/raw/20201009/dataset_valid.pkl"
test_dataset_path = "data/raw/20201009/dataset_test.pkl"

In [None]:
from src.data.make_dataset import create_dataset

# Build the full dataset from the two CSV files; dataset_path is the third argument
# (the notebook copy of create_dataset in the Details section pickles the result there).
dataset = create_dataset(historical_data_path, vocab_path, dataset_path)

In [None]:
from collections import Counter

# Number of dataset rows per session id.
Counter(dataset["id_session"])

In [None]:
from src.data.make_dataset import split_train_valid_test_dataset

# Split the dataset into train / validation / test subsets.
# NOTE(review): presumably each subset is pickled to the matching path, as in the
# step-by-step version in the Details section — confirm in src.data.make_dataset.
split_train_valid_test_dataset(
    dataset, train_dataset_path, valid_dataset_path, test_dataset_path
)

# Details

In [None]:
def create_dataset(historical_data_path, vocab_path, dataset_path):
    """Build the feature dataset, pickle it to ``dataset_path`` and return it.

    Notebook copy of the pipeline, kept here to walk through its steps. Defining
    it shadows the ``create_dataset`` imported from ``src.data.make_dataset``
    earlier in the notebook. It relies on ``get_historical_data``,
    ``create_historical_features``, ``get_vocab``, ``create_vocab_features``,
    ``merge_feature_datasets``, ``get_vardict``, ``transform_type`` and ``dill``
    being in scope; the helper functions are imported in the cells below.

    Parameters
    ----------
    historical_data_path
        Location of the historical data, passed to ``get_historical_data``.
    vocab_path
        Location of the vocabulary, passed to ``get_vocab``.
    dataset_path
        Destination file; the dataset is serialized there with ``dill``.

    Returns
    -------
    The merged, type-converted dataset. Previously nothing was returned, so
    ``dataset = create_dataset(...)`` left the caller with ``None``.
    """
    # Historical side: load, then add features.
    historical_data = get_historical_data(historical_data_path)
    historical_data = create_historical_features(historical_data)

    # Vocabulary side: load, then add features.
    vocab = get_vocab(vocab_path)
    vocab = create_vocab_features(vocab)

    dataset = merge_feature_datasets(historical_data, vocab)

    # Convert the column types according to the variable dictionary.
    vardict = get_vardict()
    dataset = transform_type(dataset, vardict)

    with open(dataset_path, "wb") as file:
        dill.dump(dataset, file)

    print("Saved")

    return dataset

In [None]:
# Historical dataset
from src.data.get_dataset import get_historical_data
from src.data.make_historical_features import create_historical_features

# Step-by-step run of the first two stages of create_dataset.
# NOTE(review): this reads "data/raw/historical_data__large.csv", not the
# historical_data_path defined in the Overall section — confirm this is intended.
historical_data = get_historical_data("data/raw/historical_data__large.csv")
historical_data = create_historical_features(historical_data)
historical_data

In [None]:
# Vocab dataset
from src.data.get_dataset import get_vocab
from src.data.make_vocab_features import create_vocab_features

# Load the vocabulary and add its features.
# NOTE(review): this reads "data/raw/german_english.csv", not the vocab_path defined
# in the Overall section ("data/raw/20201009/german_english.csv") — confirm.
vocab = get_vocab("data/raw/german_english.csv")
vocab = create_vocab_features(vocab)
vocab

In [None]:
# Dataset
from src.data.make_dataset import merge_feature_datasets

# Combine the historical features and the vocabulary features into one dataset.
# This rebinds `dataset` from the Overall section.
dataset = merge_feature_datasets(historical_data, vocab)
dataset

In [None]:
# Vardict
from src.data.make_dataset import get_vardict

# Variable dictionary; later cells read its "all" and "target" entries.
vardict = get_vardict()
vardict

In [None]:
# Transform datatype
from src.data.make_dataset import transform_type

# Convert the dataset columns using the variable dictionary.
# NOTE: `dataset` is overwritten with its own transformation, so re-running this
# cell applies transform_type to an already transformed frame.
dataset = transform_type(dataset, vardict)
dataset

In [None]:
# Dataset columns that are not listed in vardict["all"].
[column for column in dataset.columns if column not in vardict["all"]]

In [None]:
# Rows of the dataset for the word "oft": words, target and all registered variables.
dataset.loc[
    dataset["german_word"] == "oft",
    ["german_word", "english_word", vardict["target"], *vardict["all"]],
]

In [None]:
# Same word in the historical data, to compare against the dataset rows.
historical_data.loc[historical_data["german_word"] == "oft"]

In [None]:
# Serialize an object with dill, which is used here the same way as pickle.
# NOTE(review): `Q` is not defined anywhere in the visible notebook, so this cell
# raises NameError on a fresh kernel — it looks like a leftover from another
# notebook; TODO confirm and remove it, or define `Q` first.

with open('data/interim/{}__Q.pkl'.format('MC_every_visit'), 'wb') as file:
    dill.dump(Q, file)

# Historical dataset

In [None]:
from src.data.get_dataset import get_historical_data

# Load the historical data from the "__feature" CSV file used in this walkthrough.
historical_data_test = get_historical_data("data/raw/historical_data__feature.csv")
historical_data_test

## Add features

In [None]:
from src.data.make_historical_features import create_historical_features

# Add the historical features.
# NOTE: the input variable is overwritten, so re-running this cell feeds the already
# transformed frame back into create_historical_features.
historical_data_test = create_historical_features(historical_data_test)
historical_data_test

# Vocab dataset

In [None]:
from src.data.get_dataset import get_vocab

# Load the vocabulary from the "__feature" CSV file; this rebinds `vocab` from the
# Details section.
vocab = get_vocab("data/raw/german_english__feature.csv")
vocab

## Add features

In [None]:
from src.data.make_vocab_features import create_vocab_features

# Add the vocabulary features; the result is kept under a separate name (vocab_test).
vocab_test = create_vocab_features(vocab)
vocab_test

# Mix datasets

In [None]:
# Join the historical and vocabulary features on the vocabulary id, then order the
# rows by datetime.
dataset = (
    historical_data_test
    .merge(vocab_test, on="id_vocab")
    .sort_values("datetime")
)
dataset

# Variable type transformation

In [None]:
# List of feature columns

In [None]:
# All column names of the merged dataset.
list(dataset.columns)

In [None]:
# Variable dictionary, filled group by group in the cells below.
vardict = {}

## Target

In [None]:
# Name of the column used as the target variable.
vardict["target"] = "result"

In [None]:
# Preview the target column (kept as a one-column DataFrame).
dataset.loc[:, [vardict["target"]]]

## Numerical

In [None]:
# Columns treated as numerical; the next cell casts each of them to float.
vardict["numerical"] = [
    "previous_occurrences_same_language",
    "previous_successes_same_language",
    "previous_fails_same_language",
    "previous_occurrences_any_language",
    "previous_successes_any_language",
    "previous_fails_any_language",
    "levenshtein_distance_guess_answer",
    "previous_question_time",
    "write_it_again_german",
    "write_it_again_english",
    "levenshtein_distance_german_english",
]

In [None]:
# Cast all numerical feature columns to float in one assignment.
numerical_columns = vardict["numerical"]
dataset[numerical_columns] = dataset[numerical_columns].astype(float)

In [None]:
# Preview the numerical columns next to the word and the guess.
dataset[["german_word", "guess", *vardict["numerical"]]]

## Difference in time

In [None]:
# Time-difference columns; the next cell converts each of them to a number of days
# (float) through the `.dt.days` accessor.
vardict["diff_time"] = [
    "days_since_last_occurrence_same_language",
    "days_since_last_occurrence_any_language",
    "days_since_last_success_same_language",
    "days_since_last_success_any_language",
    "days_since_first_occur_same_language",
    "days_since_first_occur_any_language",
]

In [None]:
# Convert every time-difference column to its number of days, stored as float.
diff_time_columns = vardict["diff_time"]
dataset[diff_time_columns] = dataset[diff_time_columns].apply(
    lambda column: column.dt.days.astype(float)
)

In [None]:
# Preview the time-difference columns next to the word and the guess.
dataset[["german_word", "guess", *vardict["diff_time"]]]

## Boolean

In [None]:
# Boolean columns; the next cell casts their non-null values to bool and leaves
# missing values untouched.
vardict["boolean"] = [
    "previous_result",
    "correct_article",
    "only_missed_uppercase",
    "write_it_again_not_null",
]

In [None]:
# Cast only the non-null entries of each boolean column; missing values stay missing.
for column in vardict["boolean"]:
    known = dataset[column].notna()
    dataset.loc[known, column] = dataset.loc[known, column].astype("bool")

In [None]:
# Preview the boolean columns next to the word and the guess.
dataset[["german_word", "guess", *vardict["boolean"]]]

## Categorical

In [None]:
# Categorical columns; the next cell casts their non-null values to str and sets
# missing values to None.
vardict["categorical"] = [
    "previous_language_asked",
]

In [None]:
# Non-null categories become strings; missing categories are set to None.
for column in vardict["categorical"]:
    missing = dataset[column].isna()
    dataset.loc[~missing, column] = dataset.loc[~missing, column].astype(str)

    dataset.loc[missing, column] = None

In [None]:
# Preview the categorical columns next to the word and the guess.
dataset[["german_word", "guess", *vardict["categorical"]]]

# Split training - validation - test

We split by session (`id_session`), so that every row of a given session ends up in the same subset:
* 70% training
* 20% validation
* 10% test

In [None]:
# Unique session ids in order of first appearance. `dataset` is sorted by
# "datetime" in the merge cell above, so this keeps the sessions in chronological
# order. Going through set() gives an arbitrary order, which matters because the
# split in the next cell is positional (shuffle=False).
sessions = dataset["id_session"].unique().tolist()
sessions

In [None]:
# Target shares of the sessions: 70% training / 20% validation / 10% test.
TEST_FRACTION = 0.10
VALID_FRACTION = 0.20

# shuffle=False keeps the order of `sessions`: the test set is its tail.
train_valid_sessions, test_sessions = train_test_split(
    sessions, shuffle=False, test_size=TEST_FRACTION
)

# The second split only sees the remaining 90% of the sessions, so the validation
# share must be rescaled: 0.20 / 0.90 ~ 0.222 of the remainder is 20% of the total
# (test_size=0.18 would give 0.18 * 0.90 ~ 16% of the total).
train_sessions, valid_sessions = train_test_split(
    train_valid_sessions,
    shuffle=False,
    test_size=VALID_FRACTION / (1 - TEST_FRACTION),
)

train_sessions, valid_sessions, test_sessions

In [None]:
# Keep, for each subset, the rows whose session id belongs to that subset.
train_dataset, valid_dataset, test_dataset = (
    dataset.loc[dataset["id_session"].isin(session_ids)]
    for session_ids in (train_sessions, valid_sessions, test_sessions)
)

In [None]:
# Pickle each subset to its own file with dill.
for subset, subset_path in (
    (train_dataset, train_dataset_path),
    (valid_dataset, valid_dataset_path),
    (test_dataset, test_dataset_path),
):
    with open(subset_path, "wb") as file:
        dill.dump(subset, file)

# To use for predictions

In [73]:
# Inputs and output for create_dataset_new_session.
# NOTE: this rebinds historical_data_path and vocab_path from the Overall section;
# vocab_path now points to "data/raw/german_english__feature.csv" instead of
# "data/raw/20201009/german_english.csv".
historical_data_path = "data/raw/20201009/historical_data.csv"
vocab_path = "data/raw/german_english__feature.csv"

dataset_predictions_path = "data/raw/20201009/dataset_predictions.pkl"

<IPython.core.display.Javascript object>

In [74]:
# NOTE(review): get_dataset is imported but not used in this cell.
import src.data.get_dataset as get_dataset
import src.data.make_dataset as make_dataset


# Build the dataset for a new session; the printed output of this cell reports that
# it is saved at dataset_predictions_path.
test_data = make_dataset.create_dataset_new_session(
    historical_data_path, vocab_path, dataset_predictions_path
)

Saved at data/raw/20201009/dataset_predictions.pkl


<IPython.core.display.Javascript object>