In [None]:
%load_ext nb_black
%load_ext autoreload
%autoreload 2

import os

# Show the kernel's current working directory.
print(os.getcwd())


def update_working_directory():
    """Move the process one directory level up and print the new location.

    The current working directory is replaced by its immediate parent
    (``parents[0]``). Nothing is returned; the chosen directory is printed.
    """
    from pathlib import Path

    current_dir = Path(os.getcwd())
    parent_dir = current_dir.parents[0]

    os.chdir(parent_dir)
    print(parent_dir)


# Step up one directory level; the cells below use relative paths such as "data/raw/...".
update_working_directory()

In [None]:
import numpy as np
import pandas as pd

In [None]:
def get_vocab(vocab_path="data/raw/german_english.csv", list_columns=None):
    """Read the vocabulary CSV and keep only the requested columns.

    Parameters
    ----------
    vocab_path : path or file-like
        Passed straight to ``pd.read_csv``.
    list_columns : list of str, optional
        Columns to select. When ``None``, ``["id_vocab", "german", "english"]``
        is used.

    Returns
    -------
    The result of indexing the loaded frame with ``list_columns``.
    """
    selected = ["id_vocab", "german", "english"] if list_columns is None else list_columns
    return pd.read_csv(vocab_path)[selected]

In [None]:
# Load the vocabulary from the "__feature" CSV rather than get_vocab's default file,
# then display it.
vocab = get_vocab("data/raw/german_english__feature.csv")
vocab

# Overall

In [None]:
from src.data.make_vocab_features import create_vocab_features

In [None]:
# Run the imported feature builder on the loaded vocabulary and display what it returns.
# NOTE(review): whether create_vocab_features modifies `vocab` in place is not visible here — confirm.
vocab_test = create_vocab_features(vocab)
vocab_test

# Levenshtein distance between words

In [None]:
def remove_article(
    vocab,
    list_german_article = ['der','die','das'],
    list_english_article = ['the','to']
):
    """Strip listed words from the 'german' and 'english' columns, in place.

    Each cell is split on single spaces, every token that appears in the
    matching list is dropped (wherever it occurs in the cell), and the
    remaining tokens are re-joined with single spaces.

    Parameters
    ----------
    vocab : pandas.DataFrame
        Frame with string columns 'german' and 'english'. It is modified
        directly; nothing is returned.
    list_german_article : list of str
        Tokens removed from the 'german' column.
    list_english_article : list of str
        Tokens removed from the 'english' column.
    """

    def _without(words):
        # Build a per-cell cleaner that drops every token found in `words`.
        def _clean(text):
            return ' '.join(token for token in text.split(' ') if token not in words)

        return _clean

    vocab['german'] = vocab['german'].map(_without(list_german_article))
    vocab['english'] = vocab['english'].map(_without(list_english_article))

In [None]:
def add_levenshtein_distance(vocab):
    """Compute the Levenshtein distance between each German/English pair.

    The comparison is made on a copy of ``vocab`` whose 'german' and
    'english' columns are lower-cased and passed through ``remove_article``;
    the frame given by the caller is left untouched.

    Parameters
    ----------
    vocab : pandas.DataFrame
        Frame with string columns 'german' and 'english'.

    Returns
    -------
    The row-wise result of ``Levenshtein.distance(german, english)``.
    """
    from Levenshtein import distance

    normalised = vocab.copy()

    # Lower-case both language columns before comparing.
    for column in ('german', 'english'):
        normalised[column] = normalised[column].str.lower()

    # remove_article edits the copy in place.
    remove_article(normalised)

    def _pair_distance(row):
        return distance(row['german'], row['english'])

    return normalised.apply(_pair_distance, axis=1)

In [None]:
# Store the per-row distance as a new column. add_levenshtein_distance normalises a copy,
# so the 'german'/'english' text in `vocab` itself is not changed by this call.
vocab['levenshtein_dist'] = add_levenshtein_distance(vocab)
vocab

In [None]:
# use dill - works the same way as pickle
# Write the current `vocab` frame to data/processed/vocab.pkl (path is relative to the working directory).
# NOTE(review): when cells run top to bottom, columns added further down are not part of this dump.
import dill
with open('data/processed/vocab.pkl', 'wb') as file:
    dill.dump(vocab, file)

##### Tests

In [None]:
def assert_vocab_remove_articles():
    """Build the input/expected fixture pair for checking article removal.

    Both frames are created in one step from row records. The previous
    row-by-row ``DataFrame.append`` calls no longer work on pandas >= 2.0,
    where that method was removed.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``vocab_test``: 'german'/'english' pairs, some starting with an
        article or "to".
        ``vocab_results``: the same rows with those leading words removed.
        Nothing is asserted here; the caller is expected to compare them.
    """
    columns = ['german', 'english']

    # Input rows
    raw_pairs = [
        ('dienstag', 'tuesday'),
        ('studieren', 'to study'),
        ('die angst', 'the fear'),
        ('andere', 'other'),
        ('der arzt', 'the doctor'),
        ('das café', 'the café'),
    ]

    # Expected rows once articles / "to" have been stripped
    expected_pairs = [
        ('dienstag', 'tuesday'),
        ('studieren', 'study'),
        ('angst', 'fear'),
        ('andere', 'other'),
        ('arzt', 'doctor'),
        ('café', 'café'),
    ]

    vocab_test = pd.DataFrame(raw_pairs, columns=columns)
    vocab_results = pd.DataFrame(expected_pairs, columns=columns)

    return vocab_test, vocab_results

In [None]:
# Build and display the (input, expected) fixture pair.
# NOTE(review): nothing is asserted here — remove_article is never applied to the fixture.
assert_vocab_remove_articles()

# Word lengths and counts

In [None]:
# Character count of each entry as stored in `vocab` (len of the whole string, spaces included).
# Assumes every entry is a string; len() on a missing value would raise — TODO confirm the CSV has no blanks.
vocab["nb_characters_german"] = vocab["german"].map(len)
vocab["nb_characters_english"] = vocab["english"].map(len)
vocab

In [None]:
def count_nb_words_german(x):
    """Count the space-separated words in a German entry, skipping one leading article.

    Parameters
    ----------
    x : str
        German entry.

    Returns
    -------
    int
        Number of tokens produced by splitting on single spaces, minus one
        when the first token is "der", "die" or "das".
    """
    # str.split(" ") always yields at least one token, so the unpacking is safe.
    first, *rest = x.split(" ")
    if first in ("der", "die", "das"):
        return len(rest)
    return len(rest) + 1


def count_nb_words_english(x):
    """Count the space-separated words in an English entry, skipping a leading "the"/"to".

    Parameters
    ----------
    x : str
        English entry.

    Returns
    -------
    int
        Number of tokens produced by splitting on single spaces, minus one
        when the first token is "the" or "to".
    """
    tokens = x.split(" ")
    starts_with_article = tokens[0] in ("the", "to")
    return len(tokens) - 1 if starts_with_article else len(tokens)

In [None]:
# Word counts per entry; a single leading article (der/die/das, or the/to) is not counted.
vocab["nb_words_german"] = vocab["german"].map(count_nb_words_german)
vocab["nb_words_english"] = vocab["english"].map(count_nb_words_english)
vocab

In [None]:
# NOTE(review): this overwrites the "nb_words_english" column set in the previous cell with a
# plain space-split count, so a leading "the"/"to" is counted as a word again.
# Confirm which of the two definitions is intended.
vocab["nb_words_english"] = vocab["english"].map(lambda x: len(x.split(" ")))
vocab

In [None]:
def is_noun(x, list_german_article=["der", "die", "das"]):
    """Tell whether a row's German entry starts with one of the given articles.

    Parameters
    ----------
    x : mapping or pandas.Series
        Row exposing a string under the key "german".
    list_german_article : list of str
        Words treated as articles.

    Returns
    -------
    bool
        True when the text before the first space is in ``list_german_article``.
    """
    leading_word, _, _ = x["german"].partition(" ")
    return leading_word in list_german_article


# Row-wise flag: True when the German entry starts with der/die/das (is_noun's default list).
vocab["is_noun"] = vocab.apply(is_noun, axis=1)
vocab

In [None]:
def is_verb(x):
    """Tell whether a row's English entry is an infinitive, i.e. starts with the word "to".

    Parameters
    ----------
    x : mapping or pandas.Series
        Row exposing a string under the key "english".

    Returns
    -------
    bool
        True when the text before the first space is exactly "to".
    """
    first_word = x["english"].split(" ", 1)[0]
    # Exact match: a substring test would also flag "tomorrow", "today", "stop", ...
    return first_word == "to"


# Row-wise flag from is_verb, which looks at the first word of the English entry.
vocab["is_verb"] = vocab.apply(is_verb, axis=1)
vocab