In [27]:
import pandas as pd
import math

In [28]:
# Load the main compound data
df = pd.read_csv('compounds_data/cleansed_compounds.csv', keep_default_na=False)
df.head()

Unnamed: 0,compound,modifier,head
0,Aalbestand,Aal,Bestand
1,Aalfang,Aal,Fang
2,Aalfisch,Aal,Fisch
3,Aalmutter,Aal,Mutter
4,Aalquappe,Aal,Quappe


In [29]:
df_de_web = pd.read_csv('frequency_data/deu-de_web_2021_1M-words.txt', sep="\t", header=None, names=['word', 'freq'])
df_de_web

Unnamed: 0,word,freq
101,und,506683
102,der,443830
103,die,422247
104,in,243094
105,für,161903
...,...,...
694898,﻿Aus,1
694899,﻿Beim,1
694900,﻿Digitalisierung,1
694901,﻿Überdosierung,1


In [30]:
# Calculate the frequency class for each word: ⌊log2(max_f/f(grundform))+0,5⌋
max_f = df_de_web['freq'].max()
df_de_web['freq_class'] = df_de_web['freq'].apply(lambda x: int(math.floor(math.log2(max_f/x)+0.5)))
df_de_web

Unnamed: 0,word,freq,freq_class
101,und,506683,0
102,der,443830,0
103,die,422247,0
104,in,243094,1
105,für,161903,2
...,...,...,...
694898,﻿Aus,1,19
694899,﻿Beim,1,19
694900,﻿Digitalisierung,1,19
694901,﻿Überdosierung,1,19


In [31]:
# Merge the frequency class into the main compound data
df_merged = df.merge(df_de_web, how='left', left_on='compound', right_on='word')
df_merged = df_merged.drop(columns=['word', 'freq'])
df_merged

Unnamed: 0,compound,modifier,head,freq_class
0,Aalbestand,Aal,Bestand,
1,Aalfang,Aal,Fang,
2,Aalfisch,Aal,Fisch,
3,Aalmutter,Aal,Mutter,
4,Aalquappe,Aal,Quappe,
...,...,...,...,...
118185,Zypressenöl,Zypresse,Öl,
118186,Zypressenwolfsmilch,Zypresse,Wolfsmilch,
118187,Zytologieassistent,Zytologie,Assistent,
118188,Zytoplasma,zyto,Plasma,19.0


In [32]:
# Show statistics on the frequency class
print(df_merged['freq_class'].value_counts(dropna=False).sort_index())


6.0         1
7.0         1
8.0        18
9.0        37
10.0      100
11.0      245
12.0      656
13.0     1335
14.0     2504
15.0     4500
16.0     6398
17.0    11034
18.0     8743
19.0    18573
NaN     64045
Name: freq_class, dtype: int64


In [33]:
# Add the frequency data from the derewo dataset
df_derewo = pd.read_csv('frequency_data/derewo-v-ww-bll-250000g-2011-12-31-0.1/data_without_header.txt', sep=" ", header=None, names=['word', 'freq_derewo'], index_col=False)
df_merged = df_merged.merge(df_derewo, how='left', left_on='compound', right_on='word')
df_merged = df_merged.drop(columns=['word'], axis=1)

  df_derewo = pd.read_csv('frequency_data/derewo-v-ww-bll-250000g-2011-12-31-0.1/data_without_header.txt', sep=" ", header=None, names=['word', 'freq_derewo'], index_col=False)


In [34]:
# For each of the frequencies, show 5 random compounds with that frequency
freqs = df_merged['freq_class'].unique()
freqs.sort()
for freq in freqs:
    print(f"Frequency:{freq}, Count:{df_merged[df_merged['freq_class'] == freq].shape[0]}")
    count = df_merged[df_merged['freq_class'] == freq].shape[0]
    sample_size = min(count, 8)
    print(df_merged[df_merged['freq_class'] == freq].sample(n=sample_size, random_state=1))
    print('----------------')

Frequency:6.0, Count:1
         compound modifier    head  freq_class  freq_derewo
43619  Hochschule     hoch  Schule         6.0         11.0
----------------
Frequency:7.0, Count:1
        compound modifier  head  freq_class  freq_derewo
38818  Grundlage    grund  Lage         7.0         11.0
----------------
Frequency:8.0, Count:18
           compound modifier    head  freq_class  freq_derewo
30957     Freistaat     frei   Staat         8.0         13.0
3315         Anzahl       an    Zahl         8.0         12.0
104560       Umwelt       um    Welt         8.0         11.0
3129         Anteil       an    Teil         8.0         10.0
111674     Webseite      Web   Seite         8.0         14.0
57721     Landkreis     Land   Kreis         8.0         11.0
113352   Wettbewerb    Wette  Bewerb         8.0         10.0
3562    Arbeitgeber   Arbeit   Geber         8.0         11.0
----------------
Frequency:9.0, Count:39
                 compound modifier         head  freq_class  fr