In [1]:
import pandas as pd

In [2]:
df_de = pd.read_csv('../grams/sg_de.csv', names=['frequency', 'word1', 'word2']) # results of skip-grams and 2-grams
df_nl = pd.read_csv('../grams/sg_nl.csv', names=['frequency', 'word1', 'word2']) # results of skip-grams and 2-grams
df_fr = pd.read_csv('../grams/sg_fr.csv', names=['frequency', 'word1', 'word2']) # results of skip-grams and 2-grams
df_es = pd.read_csv('../grams/sg_es.csv', names=['frequency', 'word1', 'word2']) # results of skip-grams and 2-grams

In [3]:
# list of ~94k US babynames to remove (hello, john), (james, bond)
names = pd.read_csv("../names.csv", names=['name'])['name'].tolist()

In [4]:
names = [i for i in names if type(i) == str] #remove non strings

# Filter names and from list

In [5]:
german_filter = ['ich', 'du', 'sie', 'mir', 'wir', 'ihr', 'uns', 'euch', 'die', 'das', 'der', 'den', 'er', 'und', '!', '/', 'ein', 'eine', 'einer', 'einem', 'mein', 'dein', 'nicht', 'wie', 'wo', 'wann', 'was']

In [6]:
dutch_filter = ['ik', 'jij', 'hij', 'zij', 'wij', 'we', 'jullie', 'hun', 'mij', 'me', 'mijn', 'haar', 'hem', '!', '/', 'een', 'de', 'het', 'wie', 'wat', 'waar', 'hoe', 'niet']

In [7]:
french_filter = ['un', 'une', 'le', 'la', '!', '/', 'je', 'tu', 'nous', 'vous']

In [8]:
spanish_filter = ['!', '/', 'el']

In [9]:
l_de = names + german_filter
l_nl = names + dutch_filter
l_fr = names + french_filter
l_es = names + spanish_filter


df_fr.word1 = df_fr.word1.apply(str)
df_fr.word2 = df_fr.word2.apply(str)
df_es.word1 = df_es.word1.apply(str)
df_es.word2 = df_es.word2.apply(str)

In [10]:
df_de = \
    df_de[ \
    ~df_de.word1.isin(l_de) & \
    ~df_de.word2.isin(l_de) & \
    ~df_de.word1.str.isnumeric() & \
    ~df_de.word2.str.isnumeric() &\
    ~(df_de.word1.str == df_de.word2.str)
    ]

df_nl = \
    df_nl[ \
    ~df_nl.word1.isin(l_nl) & \
    ~df_nl.word2.isin(l_nl) & \
    ~df_nl.word1.str.isnumeric() & \
    ~df_nl.word2.str.isnumeric() & \
    ~(df_de.word1.str == df_de.word2.str)
    ]

df_fr = \
    df_fr[ \
    ~df_fr.word1.isin(l_fr) & \
    ~df_fr.word2.isin(l_fr) & \
    ~df_fr.word1.str.isnumeric() & \
    ~df_fr.word2.str.isnumeric() & \
    ~(df_fr.word1.str == df_fr.word2.str)
    ]
    
df_es = \
    df_es[ \
    ~df_es.word1.isin(l_es) & \
    ~df_es.word2.isin(l_es) & \
    ~df_es.word1.str.isnumeric() & \
    ~df_es.word2.str.isnumeric() & \
    ~(df_es.word1.str == df_es.word2.str)
    ]

# Remove words overlapping with other languages

In [11]:
df_de_ex = df_de \
    .merge(df_nl ,left_on=['word1', 'word2'], right_on=['word1', 'word2'], how='left') \
    .merge(df_fr ,left_on=['word1', 'word2'], right_on=['word1', 'word2'], how='left') \
    .merge(df_es ,left_on=['word1', 'word2'], right_on=['word1', 'word2'], how='left')

In [12]:
df_de_ex = df_de_ex.fillna(0)

In [13]:
df_de_ex.columns = ['frequency', 'word1', 'word2', 'nl', 'fr', 'es']

In [14]:
df = df_de_ex[(df_de_ex.nl == 0.0) & (df_de_ex.fr == 0.0) & (df_de_ex.es == 0.0)].drop(['nl', 'fr', 'es'], axis=1)

# Add word frequency

In [15]:
wc_de = pd.read_csv("../word_counts/wc_de.csv", lineterminator='\n', names=['word', 'word_frequency'])

In [16]:
df = df.merge(wc_de, left_on='word1', right_on='word', how='left') \
.merge(wc_de, left_on='word2', right_on='word', how='left') \
.drop(['word_x', 'word_y'], axis=1) \
.rename(index=str, columns={"word_frequency_x": "word_1_frequency", "word_frequency_y": "word_2_frequency"})

In [17]:
df['normalised_frequency'] = df['frequency'] / (df['word_1_frequency']+df['word_2_frequency']) # divide skipgram freq. by wordcount freq.

# Collocation info gain

In [19]:
from nltk.metrics.association import BigramAssocMeasures

In [20]:
bigram_measures = BigramAssocMeasures()

In [60]:
df['chi'] = df[['frequency', 'word_1_frequency', 'word_2_frequency']].apply(lambda row: 
    bigram_measures.chi_sq(
        row['frequency'], \
        (row['word_1_frequency'], row['word_2_frequency']), \
        df.frequency.count())        
, axis=1)

  ((n_ii + n_io) * (n_ii + n_oi) * (n_io + n_oo) * (n_oi + n_oo)))


In [67]:
df.sort_values(by="chi", ascending=False)[:100]

Unnamed: 0,frequency,word1,word2,word_1_frequency,word_2_frequency,normalised_frequency,chi
79828,386,ist,h??ngen,4236656,11056,0.000091,3.779813e+11
33678,505,ist,gleichen,4236656,11056,0.000119,2.224038e+11
219373,208,noch,wohin,803581,35835,0.000248,1.318162e+10
268591,130,einzige,raus,53286,137606,0.000681,9.826972e+09
65238,137,wohin,noch,35835,803581,0.000163,9.631678e+09
174009,128,bringe,selbst,23638,126295,0.000854,6.687954e+09
263658,140,uh,hast,11018,711985,0.000194,5.798590e+09
99345,2614,etwas,zeigen,395606,43244,0.005956,3.614834e+09
188196,231,einzige,unsere,53286,137623,0.001210,2.655143e+09
9220,317,zeigen,etwas,43244,395606,0.000722,2.446655e+09
