In [None]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

import os

# Show the kernel's starting directory before it is changed further down in this cell.
print(os.getcwd())


def update_working_directory():
    """Switch the process working directory to the parent of the current one.

    The new working directory is printed after the change. Every call climbs
    one more level, so two calls end up two levels above the starting point.
    """
    from pathlib import Path

    current_dir = Path(os.getcwd())
    parent_dir = current_dir.parents[0]
    os.chdir(parent_dir)
    print(parent_dir)


# Not idempotent: every execution of this cell moves the working directory up
# one more level, so it should run only once per kernel session.
update_working_directory()

In [None]:
import numpy as np
import pandas as pd

# Lift pandas' cap on the number of columns shown when a frame is displayed.
pd.options.display.max_columns = None

import datetime

# Overall pipeline

In [None]:
# Historical dataset
from src.data.get_dataset import get_historical_data
from src.data.make_historical_features import create_historical_features

# Read the historical CSV and pass it straight through the feature builder.
historical_data = create_historical_features(
    get_historical_data("data/raw/historical_data__feature.csv")
)

In [None]:
# Vocab dataset
from src.data.get_dataset import get_vocab
from src.data.make_vocab_features import create_vocab_features

# Read the German/English vocab CSV and pass it straight through the feature builder.
vocab = create_vocab_features(
    get_vocab("data/raw/german_english__feature.csv")
)

In [None]:
# Dataset
from src.data.make_dataset import merge_feature_datasets

# Combine the two feature-enriched frames (historical answers and vocab) into one dataset.
dataset = merge_feature_datasets(historical_data, vocab)

In [None]:
# Vardict
from src.data.make_dataset import get_vardict

# The returned mapping is indexed further down with the keys "all" and "target".
vardict = get_vardict()

In [None]:
# Transform datatype
from src.data.make_dataset import transform_type

# Rebinds dataset to whatever transform_type returns for (dataset, vardict).
# NOTE(review): presumably this applies the same per-group dtype casts that are
# spelled out manually later in this notebook -- confirm in src.data.make_dataset.
dataset = transform_type(dataset, vardict)

In [None]:
# Sanity check: dataset columns that vardict["all"] does not list.
[column for column in dataset.columns if column not in vardict["all"]]

In [None]:
# Inspect every row for the German word "stimmen": both words, the target, then all features.
dataset.loc[
    dataset["german_word"] == "stimmen",
    ["german_word", "english_word", vardict["target"], *vardict["all"]],
]

# Historical dataset

In [None]:
from src.data.get_dataset import get_historical_data

# Step-by-step replay: reload the same historical CSV under a separate name, so
# the `historical_data` frame built in the Overall section is left untouched.
historical_data_test = get_historical_data("data/raw/historical_data__feature.csv")
historical_data_test

## Add features

In [None]:
from src.data.make_historical_features import create_historical_features

# Rebinds historical_data_test to the feature-enriched result. Re-running this
# cell without reloading first feeds the already-processed frame back in.
historical_data_test = create_historical_features(historical_data_test)
historical_data_test

# Vocab dataset

In [None]:
from src.data.get_dataset import get_vocab

# NOTE(review): this rebinds `vocab`, replacing the feature-enriched frame from
# the Overall section with a freshly loaded one (the historical frame uses a
# separate `_test` name instead) -- confirm the overwrite is intended.
vocab = get_vocab("data/raw/german_english__feature.csv")
vocab

## Add features

In [None]:
from src.data.make_vocab_features import create_vocab_features

# Build the vocab features from the current `vocab` frame and keep them under a separate name.
vocab_test = create_vocab_features(vocab)
vocab_test

# Merge datasets

In [None]:
# Join the history with the vocab features on the shared "id_vocab" key and
# order the rows by "datetime". The sort is chained instead of using
# inplace=True, so the cell builds the frame in a single expression and never
# mutates an object that has already been bound to a name.
# Note: this rebinds `dataset`, replacing the frame built in the Overall section.
dataset = pd.merge(historical_data_test, vocab_test, on="id_vocab").sort_values(
    "datetime"
)
dataset

# Variable type transformation

In [None]:
# List of feature columns

In [None]:
# Column names of the current dataset as a plain Python list.
list(dataset.columns)

In [None]:
# Start from an empty mapping; the following cells add one key per variable group.
vardict = {}

## Target

In [None]:
# Column name stored under the "target" key; the next cell selects that column from dataset.
vardict["target"] = "result"

In [None]:
# The nested list keeps the selection a one-column DataFrame rather than a Series.
dataset[[vardict["target"]]]

## Numerical

In [None]:
# Columns treated as numerical; the conversion cell that follows casts each of them to float.
vardict["numerical"] = [
    "previous_occurrences_same_language",
    "previous_successes_same_language",
    "previous_fails_same_language",
    "previous_occurrences_any_language",
    "previous_successes_any_language",
    "previous_fails_any_language",
    "levenshtein_distance_guess_answer",
    "previous_question_time",
    "write_it_again_german",
    "write_it_again_english",
    "levenshtein_distance_german_english",
]

In [None]:
# Cast all numerical columns to float with one multi-column assignment.
numerical_columns = vardict["numerical"]
dataset[numerical_columns] = dataset[numerical_columns].astype(float)

In [None]:
# Spot-check the numerical columns next to the word and the guess.
dataset[["german_word", "guess", *vardict["numerical"]]]

## Difference in time

In [None]:
# Time-difference columns; the conversion cell that follows reads them through
# the `.dt.days` accessor and stores the result as float.
vardict["diff_time"] = [
    "days_since_last_occurrence_same_language",
    "days_since_last_occurrence_any_language",
    "days_since_last_success_same_language",
    "days_since_last_success_any_language",
    "days_since_first_occur_same_language",
    "days_since_first_occur_any_language",
]

In [None]:
# Convert each time-difference column to a float number of days.
# The timedelta check makes the cell safe to re-run: once a column has been
# converted it is float and has no `.dt` accessor, which previously made a
# second execution of this cell fail with an AttributeError.
for diff_time_column in vardict["diff_time"]:
    column_values = dataset[diff_time_column]
    if pd.api.types.is_timedelta64_dtype(column_values):
        # Keep only the days component of each timedelta.
        column_values = column_values.dt.days
    dataset[diff_time_column] = column_values.astype(float)

In [None]:
# Spot-check the time-difference columns next to the word and the guess.
dataset[["german_word", "guess", *vardict["diff_time"]]]

## Boolean

In [None]:
# Columns treated as boolean; the conversion cell that follows casts only their
# non-missing entries to bool.
vardict["boolean"] = [
    "previous_result",
    "correct_article",
    "only_missed_uppercase",
    "write_it_again_not_null",
]

In [None]:
# Cast only the non-missing entries of each boolean column; missing entries are left as they are.
for boolean_column in vardict["boolean"]:
    is_present = dataset[boolean_column].notna()
    dataset.loc[is_present, boolean_column] = dataset.loc[
        is_present, boolean_column
    ].astype("bool")

In [None]:
# Spot-check the boolean columns next to the word and the guess.
dataset[["german_word", "guess", *vardict["boolean"]]]

## Categorical

In [None]:
# Columns treated as categorical; the conversion cell that follows turns their
# non-missing entries into strings and sets the missing ones to None.
vardict["categorical"] = [
    "previous_language_asked",
]

In [None]:
# Stringify the non-missing values of each categorical column and store None for the missing ones.
for categorical_column in vardict["categorical"]:
    is_missing = dataset[categorical_column].isna()
    dataset.loc[~is_missing, categorical_column] = dataset.loc[
        ~is_missing, categorical_column
    ].astype(str)
    # The string cast cannot create new missing entries, so the same mask still applies.
    dataset.loc[is_missing, categorical_column] = None

In [None]:
# Spot-check the categorical columns next to the word and the guess.
dataset[["german_word", "guess", *vardict["categorical"]]]