In [1]:
import os
# Show the working directory the kernel currently uses, before the chdir performed later in this cell.
print(os.getcwd())
def update_working_directory():
    """Change the process working directory to its parent and print it.

    The move is relative to whatever the current directory is at call
    time, so every additional call climbs one more level.

    Raises
    ------
    IndexError
        If the current directory has no parent (filesystem root).
    """
    from pathlib import Path

    # parents[0] is the directory that directly contains the current one
    parent_dir = Path.cwd().parents[0]
    os.chdir(parent_dir)
    print(parent_dir)
# Step up one directory level (the recorded output shows notebooks/ -> project folder),
# presumably so that relative paths like 'data/raw/...' resolve.
# NOTE: not idempotent - re-running this cell without restarting the kernel climbs one more level.
update_working_directory()

/Users/admin/Projects/vocabulary_learning/notebooks
/Users/admin/Projects/vocabulary_learning


In [2]:
import numpy as np
import pandas as pd

In [3]:
def get_vocab(vocab_path='data/raw/german_english.csv', list_columns=None):
    """Read the vocabulary CSV and keep only the requested columns.

    Parameters
    ----------
    vocab_path : str
        Path of the CSV file handed to ``pd.read_csv``.
    list_columns : list of str, optional
        Columns to keep, in this order. ``None`` selects
        ``['german', 'english']``.

    Returns
    -------
    pandas.DataFrame
        The selected columns of the file.

    Raises
    ------
    KeyError
        If a requested column is not present in the file.
    """
    raw_table = pd.read_csv(vocab_path)
    # Only None triggers the default; an explicitly passed list is used as given
    wanted = ['german', 'english'] if list_columns is None else list_columns
    return raw_table[wanted]

In [4]:
# Load the vocabulary with the default path and columns;
# the bare `vocab` on the last line makes the notebook display the frame.
vocab = get_vocab()
vocab

Unnamed: 0,german,english
0,aber,but
1,allein,alone
2,alt,old
3,andere,other
4,anziehen,to put a clothe on
...,...,...
451,die Fliege,the fly
452,der Grossvater,the grand-father
453,der Wurst,the sausage
454,schlafen,to sleep


In [5]:
def remove_article(
    vocab,
    list_german_article=None,
    list_english_article=None
):
    """Strip article words from the 'german' and 'english' columns, in place.

    Each cell is split on single spaces, every word found in the matching
    article list is dropped (at any position, not only the first word), and
    the remaining words are joined back with single spaces. Matching is
    case-sensitive, so callers should lowercase the text first if the
    article lists are lowercase. Missing values (NaN/None) are left as-is.

    Parameters
    ----------
    vocab : pandas.DataFrame
        Frame with 'german' and 'english' text columns. It is modified
        in place: both columns are replaced.
    list_german_article : list of str, optional
        Words removed from 'german'. ``None`` means ``['der', 'die', 'das']``.
    list_english_article : list of str, optional
        Words removed from 'english'. ``None`` means ``['the', 'to']``.

    Returns
    -------
    None
    """
    # None sentinels instead of list literals in the signature: a default list
    # is created once at definition time and shared by every call.
    if list_german_article is None:
        list_german_article = ['der', 'die', 'das']
    if list_english_article is None:
        list_english_article = ['the', 'to']

    def _drop_words(text, unwanted):
        # Keep the words of `text` that are not listed in `unwanted`
        return ' '.join(word for word in text.split(' ') if word not in unwanted)

    # na_action='ignore' passes missing cells through untouched; without it a
    # NaN reaches .split() and raises AttributeError.
    vocab['german'] = vocab['german'].map(
        lambda x: _drop_words(x, list_german_article),
        na_action='ignore'
    )
    vocab['english'] = vocab['english'].map(
        lambda x: _drop_words(x, list_english_article),
        na_action='ignore'
    )

In [6]:
def add_levenshtein_distance(vocab):
    """Compute the edit distance between each German/English pair.

    Works on a copy of ``vocab``: both text columns are lowercased and
    passed through ``remove_article`` before the distance is measured, so
    the frame given by the caller is not modified.

    Parameters
    ----------
    vocab : pandas.DataFrame
        Frame with 'german' and 'english' text columns.

    Returns
    -------
    pandas.Series
        One Levenshtein distance per row, aligned on the index of ``vocab``.
    """
    from Levenshtein import distance

    normalised = vocab.copy()

    # Case-fold both languages so that capitalisation does not count as an edit
    for column in ('german', 'english'):
        normalised[column] = normalised[column].str.lower()

    # remove_article edits the frame it receives, hence the copy above
    remove_article(normalised)

    def _row_distance(row):
        return distance(row['german'], row['english'])

    return normalised.apply(_row_distance, axis=1)

In [7]:
# Attach the distances as a new column. This mutates `vocab` in place;
# re-running the cell simply overwrites the 'levenshtein_dist' column.
vocab['levenshtein_dist'] = add_levenshtein_distance(vocab)
vocab

Unnamed: 0,german,english,levenshtein_dist
0,aber,but,3
1,allein,alone,4
2,alt,old,2
3,andere,other,4
4,anziehen,to put a clothe on,11
...,...,...,...
451,die Fliege,the fly,4
452,der Grossvater,the grand-father,6
453,der Wurst,the sausage,6
454,schlafen,to sleep,5


In [8]:
# use dill - works the same way as pickle
import dill
# Create the output folder first: open(..., 'wb') raises FileNotFoundError
# when 'data/processed' does not exist yet (e.g. on a fresh checkout).
os.makedirs('data/processed', exist_ok=True)
with open('data/processed/vocab.pkl', 'wb') as file:
    dill.dump(vocab, file)

# Tests

In [9]:
def assert_vocab_remove_articles():
    """Build the fixture pair used to check article removal.

    Despite its name, this function asserts nothing itself: it only returns
    the input frame and the frame expected once articles are stripped, so
    the caller can run the transformation and compare the two.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(vocab_test, vocab_results)``, both with the columns
        'german' and 'english' and six rows.
    """
    columns = ['german', 'english']

    # Each frame is built in one go from its records: DataFrame.append was
    # removed in pandas 2.0, and growing a frame row by row is quadratic.

    # Input rows, with and without articles / the infinitive marker 'to'
    vocab_test = pd.DataFrame(
        [
            ('dienstag', 'tuesday'),
            ('studieren', 'to study'),
            ('die angst', 'the fear'),
            ('andere', 'other'),
            ('der arzt', 'the doctor'),
            ('das café', 'the café'),
        ],
        columns=columns,
    )

    # Expected rows after the articles have been removed
    vocab_results = pd.DataFrame(
        [
            ('dienstag', 'tuesday'),
            ('studieren', 'study'),
            ('angst', 'fear'),
            ('andere', 'other'),
            ('arzt', 'doctor'),
            ('café', 'café'),
        ],
        columns=columns,
    )

    return vocab_test, vocab_results

In [10]:
# Display the (input, expected) fixture pair; nothing is asserted by this call.
assert_vocab_remove_articles()

(      german     english
 0   dienstag     tuesday
 1  studieren    to study
 2  die angst    the fear
 3     andere       other
 4   der arzt  the doctor
 5   das café    the café,
       german  english
 0   dienstag  tuesday
 1  studieren    study
 2      angst     fear
 3     andere    other
 4       arzt   doctor
 5       café     café)