In [1]:
import pandas as pd

In [2]:
df_de = pd.read_parquet('../sg_de1.parquet') # results of skip-grams and 2-grams
df_nl = pd.read_parquet('../sn_grams/sg_nl.parquet') # results of skip-grams and 2-grams
df_fr = pd.read_parquet('../sn_grams/sg_fr.parquet') # results of skip-grams and 2-grams
df_es = pd.read_parquet('../sn_grams/sg_es.parquet') # results of skip-grams and 2-grams

In [3]:
df_de['skips'] = df_de['skips'].apply(dict)

In [4]:
regex = r"[~!@#$%*()-_+=\[\]{}\\\":;,.<>?/|`]"

In [25]:
df = \
    df[ \
        ~df.word1.str.contains(regex) & \
        ~df.word2.str.contains(regex) & \
        ~df.word1.str.contains(r"\d") & \
        ~df.word2.str.contains(r"\d") & \
        ~df.word1.str.isnumeric() & \
        ~df.word2.str.isnumeric() & \
        (df.word1 != df.word2)
    ]

In [33]:
reversed_words = df["word2"]+df["word1"]
df['symmetric'] = (df["word1"]+df["word2"]).isin(reversed_words)

In [5]:
# list of ~94k US babynames to remove (hello, john), (james, bond)
names = pd.read_csv("../names.csv", names=['name'])['name'].tolist()

In [6]:
names = [i for i in names if type(i) == str] #remove non strings

# Filter names and from list

In [6]:
german_filter = ['ich', 'du', 'sie', 'mir', 'wir', 'ihr', 'uns', 'euch', 'die', 'das', 'der', 'den', 'er', 'und', 'ein', 'eine', 'einer', 'einem', 'mein', 'dein', 'nicht', 'wie', 'wo', 'wann', 'was']

In [7]:
dutch_filter = ['ik', 'jij', 'hij', 'zij', 'wij', 'we', 'jullie', 'hun', 'mij', 'me', 'mijn', 'haar', 'hem', 'een', 'de', 'het', 'wie', 'wat', 'waar', 'hoe', 'niet']

In [8]:
french_filter = ['un', 'une', 'le', 'la', 'je', 'tu', 'nous', 'vous']

In [9]:
spanish_filter = ['el']

In [7]:
l_de = names #+ german_filter
l_nl = names #+ dutch_filter
l_fr = names #+ french_filter
l_es = names #+ spanish_filter

In [8]:
df_fr.word1 = df_fr.word1.apply(str)
df_fr.word2 = df_fr.word2.apply(str)
df_es.word1 = df_es.word1.apply(str)
df_es.word2 = df_es.word2.apply(str)

In [9]:
df_de = \
    df_de[ \
    ~df_de.word1.isin(l_de) & \
    ~df_de.word2.isin(l_de) & \
    ~df_de.word1.str.isnumeric() & \
    ~df_de.word2.str.isnumeric() &\
    (df_de.word1 != df_de.word2)
    ]

df_nl = df_nl[ \
    ~df_nl.word1.isin(l_nl) & \
    ~df_nl.word2.isin(l_nl) & \
    ~df_nl.word1.str.isnumeric() & \
    ~df_nl.word2.str.isnumeric() & \
    (df_nl.word1 != df_nl.word2)
    ]

df_fr = \
    df_fr[ \
    ~df_fr.word1.isin(l_fr) & \
    ~df_fr.word2.isin(l_fr) & \
    ~df_fr.word1.str.isnumeric() & \
    ~df_fr.word2.str.isnumeric() & \
    (df_fr.word1 != df_fr.word2)
    ]
    
df_es = \
    df_es[ \
    ~df_es.word1.isin(l_es) & \
    ~df_es.word2.isin(l_es) & \
    ~df_es.word1.str.contains(regex) & \
    ~df_es.word2.str.isnumeric() & \
    (df_es.word1 != df_es.word2)
    ]

# Remove words overlapping with other languages

In [10]:
df_ex = df_de \
    .merge(df_nl ,left_on=['word1', 'word2'], right_on=['word1', 'word2'], how='left') \
    .merge(df_fr ,left_on=['word1', 'word2'], right_on=['word1', 'word2'], how='left') \
    .merge(df_es ,left_on=['word1', 'word2'], right_on=['word1', 'word2'], how='left') 

In [11]:
df_ex = df_ex.fillna(0)

In [12]:
df_ex.columns

Index(['word1', 'word2', 'frequency_x', 'skips_x', 'frequency_y', 'skips_y',
       'frequency_x', 'skips_x', 'frequency_y', 'skips_y'],
      dtype='object')

In [13]:
df_ex.columns = ['word1', 'word2', 'frequency', 'skips', 'nl', 'skips_nl', 'fr', 'skips_fr', 'es', 'skips_es']

In [14]:
df = df_ex[(df_ex.nl == 0.0) & (df_ex.fr == 0.0) & (df_ex.es == 0.0)].drop(['nl', 'skips_nl', 'fr', 'skips_fr', 'es', 'skips_es'], axis=1)

# Stemming

In [32]:
from nltk.stem.snowball import GermanStemmer   
stemmer = GermanStemmer()

In [16]:
df.word1 = df.word1.apply(stemmer.stem)
df.word2 = df.word2.apply(stemmer.stem)

In [17]:
df = df.groupby(['word1', 'word2'], as_index=False)['frequency'].sum()

# Add word frequency

In [15]:
wc_de = pd.read_csv("../word_counts/wc_de.csv", lineterminator='\n', names=['word', 'word_frequency'])

In [16]:
wc_de["word_frequency"] = pd.to_numeric(wc_de["word_frequency"])

In [17]:
df = df.merge(wc_de, left_on='word1', right_on='word', how='left') \
.merge(wc_de, left_on='word2', right_on='word', how='left') \
.drop(['word_x', 'word_y'], axis=1) \
.rename(index=str, columns={"word_frequency_x": "word_1_frequency", "word_frequency_y": "word_2_frequency"})

# Collocation info gain

In [18]:
from nltk.metrics.association import BigramAssocMeasures

In [19]:
bigram_measures = BigramAssocMeasures()

In [None]:
df['chi'] = df[['frequency', 'word_1_frequency', 'word_2_frequency']].apply(lambda row: 
    bigram_measures.chi_sq(
        row['frequency'], \
        (row['word_1_frequency'], row['word_2_frequency']), \
        df.frequency.count())        
, axis=1)

In [20]:
df['pmi'] = df[['frequency', 'word_1_frequency', 'word_2_frequency']].apply(lambda row: 
    bigram_measures.pmi(
        row['frequency'], \
        (row['word_1_frequency'],  row['word_2_frequency']), \
        df.frequency.count())        
, axis=1)

# Trennenbare Verben evaluation

In [30]:
verben = pd.read_csv("../verben_list.txt", names=["verben"])

In [33]:
stemmed = (df.word1+df.word2).apply(stemmer.stem)
stemmed = stemmed.append((df.word2+df.word1).apply(stemmer.stem))

In [34]:
round(((verben.verben.apply(stemmer.stem).isin(stemmed).apply(int).sum()/verben.count())*100)['verben'], 1)

27.7

# Skips Analysis

In [21]:
df['number_of_skips'] = df['skips'].apply(len)

In [22]:
df.to_csv('filtered_german_full.csv')

In [63]:
df_de.to_csv('german.csv')

In [41]:
df_de[~(df_de.word2+df_de.word1).isin(stemmed)].count()

word1        254473
word2        254473
frequency    254473
skips        254473
dtype: int64

In [2]:
pd.read_csv("../filtered_de.zip", compression='zip')

Unnamed: 0.1,Unnamed: 0,word1,word2,frequency,skips,number_of_skips,word_1_frequency,word_2_frequency,normalised_frequency,symmetric,chi,pmi
0,0,a,beautiful,120,"{'3': 1, '0': 1, '10': 1, '1': 1, '8': 1, '9':...",15,22631.0,446.0,0.005200,False,6.485048e+02,2.818098
1,1,ab,abend,161,"{'7': 1, '5': 1, '10': 1, '18': 1, '0': 1, '11...",13,150780.0,98385.0,0.000646,True,3.968298e+04,-7.279199
2,2,ab,jungs,130,"{'0': 1, '4': 1, '14': 1, '1': 1, '3': 1, '8':...",9,150780.0,58526.0,0.000621,True,2.174520e+04,-6.838388
3,3,ab,mit,2709,"{'4': 1, '1': 1, '25': 1, '22': 1, '15': 1, '1...",27,150780.0,1416442.0,0.001729,True,-3.426233e+05,-7.054261
4,4,ab,schule,140,"{'8': 1, '2': 1, '20': 1, '11': 1, '3': 1, '6'...",11,150780.0,37454.0,0.000744,True,1.322759e+04,-6.087515
5,5,abby,die,155,"{'2': 1, '4': 1, '6': 1, '1': 1, '0': 1, '10':...",13,4105.0,3505417.0,0.000044,True,-4.912188e+03,-7.290077
6,6,abend,passiert,392,"{'5': 1, '23': 1, '16': 1, '10': 1, '19': 1, '...",19,98385.0,127874.0,0.001733,False,3.122665e+04,-5.757684
7,7,aber,alkohol,235,"{'5': 1, '9': 1, '1': 1, '11': 1, '0': 1, '8':...",16,1173186.0,6843.0,0.000199,False,-1.352099e+04,-5.847776
8,8,aber,attraktiv,125,"{'1': 1, '3': 1, '9': 1, '5': 1, '6': 1, '10':...",15,1173186.0,3037.0,0.000106,False,-5.920313e+03,-5.586527
9,9,aber,finger,513,"{'0': 1, '4': 1, '11': 1, '17': 1, '5': 1, '21...",19,1173186.0,17725.0,0.000431,True,-3.588344e+04,-6.094563
