# Combine frequencies

This notebook deals with combining the two frequency (derewo and de_web (from wortschatz leipzig)) datasets into one measure.

In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
df_derewo = pd.read_csv('frequency_data/derewo-v-ww-bll-250000g-2011-12-31-0.1/data_without_header.txt', sep=" ", header=None, names=['word', 'freq_derewo'], index_col=False)

# Clip the max frequency_class to 23 (only a few compounds have a higher frequency)
df_derewo['freq_derewo'] = df_derewo['freq_derewo'].clip(upper=23)

df_derewo['freq_derewo'].value_counts().sort_index().cumsum()

  df_derewo = pd.read_csv('frequency_data/derewo-v-ww-bll-250000g-2011-12-31-0.1/data_without_header.txt', sep=" ", header=None, names=['word', 'freq_derewo'], index_col=False)


0          1
2          4
3         11
4         26
5         48
6         82
7        152
8        314
9        631
10      1260
11      2355
12      4150
13      6985
14     11481
15     18402
16     28710
17     43612
18     64530
19     93127
20    128876
21    170264
22    211607
23    254159
Name: freq_derewo, dtype: int64

In [3]:
df_de_web = pd.read_csv('frequency_data/wortschatz-leipzig/deu-de_web_2021_1M-words.txt', sep="\t", header=None, names=['word', 'freq'])

# Convert the absolute frequency to a logarithmic frequency class
max_f = df_de_web['freq'].max()
df_de_web['freq_de_web'] = df_de_web['freq'].apply(lambda x: int(math.floor(math.log2(max_f/x)+0.5)))
df_de_web = df_de_web.drop(columns=['freq'])

df_de_web['freq_de_web'].value_counts().sort_index().cumsum()

0          3
1          4
2         19
3         37
4         60
5        114
6        245
7        525
8       1098
9       2249
10      4159
11      7302
12     12610
13     21680
14     36595
15     61998
16    100272
17    183580
18    276309
19    694802
Name: freq_de_web, dtype: int64

In [4]:
# Merge the two dataframes
df = pd.merge(df_derewo, df_de_web, on='word', how='outer')

nan_penalty = 4

max_derevo = df['freq_derewo'].max()
df['freq_derewo'] = df['freq_derewo'].fillna(max_derevo + nan_penalty)

max_web = df['freq_de_web'].max()
df['freq_de_web'] = df['freq_de_web'].fillna(max_web + nan_penalty)

df['freq_class'] = df['freq_derewo'] + df['freq_de_web']
df


Unnamed: 0,word,freq_derewo,freq_de_web,freq_class
0,"der,die,das",0.0,23.0,23.0
1,"der,die,das",3.0,23.0,26.0
2,und,2.0,0.0,2.0
3,in,2.0,1.0,3.0
4,sein,2.0,5.0,7.0
...,...,...,...,...
852155,﻿Aus,27.0,19.0,46.0
852156,﻿Beim,27.0,19.0,46.0
852157,﻿Digitalisierung,27.0,19.0,46.0
852158,﻿Überdosierung,27.0,19.0,46.0


In [5]:
# Remove duplicates
df = df.drop_duplicates(subset=['word'])

assert df['word'].duplicated().sum() == 0

In [6]:
# Save the data
df.to_csv('frequency_data/combined_freq_class.csv', index=False)