In [21]:
# This notebook is used to filter the cleansed compounds.
# For filtering we use other datasets containing frequent german words.

In [22]:
import pandas as pd

In [23]:
# Read the file without the header
file_path1 = 'frequency_data/derewo-v-ww-bll-250000g-2011-12-31-0.1/data_without_header.txt'
df1 = pd.read_csv(file_path1, sep=' ', encoding='utf-8', header=None, names=["word", "frequency_class"], index_col=False, keep_default_na=False)
df1.head(n=20)

  df1 = pd.read_csv(file_path1, sep=' ', encoding='utf-8', header=None, names=["word", "frequency_class"], index_col=False, keep_default_na=False)


Unnamed: 0,word,frequency_class
0,"der,die,das",0
1,und,2
2,in,2
3,sein,2
4,ein(e),3
5,werden,3
6,"der,die,das",3
7,haben,3
8,von,3
9,mit,3


In [24]:
# Remove all words containing special characters
df1_clean = df1[~df1['word'].str.contains(r'[^a-zA-ZäÄöÖüÜßé]')]
print(f"{len(df1)} -> {len(df1_clean)}")

# Show rows with special characters
print(df1[df1['word'].str.contains(r'[^a-zA-ZäÄöÖüÜßé]')].to_string())

254159 -> 254086
                           word  frequency_class
0                   der,die,das                0
4                        ein(e)                3
6                   der,die,das                3
19                       ein(e)                4
26                dies(e,er,es)                5
33                       ein(e)                5
54                  kein(e,r,s)                6
90                    jede(r,s)                7
131                 letzte(r,s)                7
205                    solch(e)                8
234              welch(e,er,es)                8
788                   jene(r,s)               10
1087           der/die/dasselbe               10
1696          der/die/dasjenige               11
3053               US-Präsident               12
3126              EU-Kommission               12
3193              jegliche(r,s)               12
3486        rheinland-pfälzisch               12
3636                  Come-back               12
504

In [25]:
# Load the compounds
df_compounds = pd.read_csv('compounds_data/cleansed_compounds.csv', keep_default_na=False)
df_compounds.head(n=20)

Unnamed: 0,compound,modifier,head
0,Aalbestand,Aal,Bestand
1,Aalfang,Aal,Fang
2,Aalfisch,Aal,Fisch
3,Aalmutter,Aal,Mutter
4,Aalquappe,Aal,Quappe
5,Aalräucherei,Aal,Räucherei
6,Aalraupe,Aal,Raupe
7,Aalstrich,Aal,Strich
8,Aalsuppe,Aal,Suppe
9,Aaretal,Aare,Tal


In [26]:
# Only keep the rows from the compounds where either:
# - both of the entries of the columns "modifier" or "head" are in the frequency list
# or
# - the "compound" entry is in the frequency list
df_compounds_filtered = df_compounds[
    (df_compounds['modifier'].isin(df1_clean['word'])
    & df_compounds['head'].isin(df1_clean['word']))
    | df_compounds['compound'].isin(df1_clean['word']) 
]
print(f"{len(df_compounds)} -> {len(df_compounds_filtered)}")

118190 -> 113666


In [27]:
# Merge the frequency data with the compounds data. Add a new column to the df that contains the frequency of the compound
df_merged = df_compounds_filtered.merge(df1_clean, left_on='compound', right_on='word', how='left')
df_merged = df_merged.drop(columns=['word'])
df_merged

Unnamed: 0,compound,modifier,head,frequency_class
0,Aalbestand,Aal,Bestand,22.0
1,Aalfang,Aal,Fang,20.0
2,Aalfisch,Aal,Fisch,
3,Aalmutter,Aal,Mutter,
4,Aalräucherei,Aal,Räucherei,23.0
...,...,...,...,...
113685,Zypressenöl,Zypresse,Öl,
113686,Zypressenwolfsmilch,Zypresse,Wolfsmilch,
113687,Zytologieassistent,Zytologie,Assistent,
113688,Zytoplasma,zyto,Plasma,22.0


In [28]:
# Merging may have introduced new duplicates. A compound is a complicate, is they have the same compound.
# Remove the duplicates
df_merged_without_duplicates = df_merged.drop_duplicates(subset=['compound'])
print(f"{len(df_merged)} -> {len(df_merged_without_duplicates)}")


113690 -> 113648


In [29]:
# There are still some duplicates. These are where the modifier and the head are the same, but the compound is different.
# Print these duplicates
df_merged_without_duplicates[df_merged_without_duplicates.duplicated(subset=['modifier', 'head'], keep=False)]

# Keep the compound with the highest frequency (lowest frequency class)
df_merged_without_duplicates2 = df_merged_without_duplicates.sort_values(by=['frequency_class'], ascending=True).drop_duplicates(subset=['modifier', 'head'], keep='first')
print(f"{len(df_merged_without_duplicates)} -> {len(df_merged_without_duplicates2)}")

113648 -> 112999


In [30]:
# Save the merged dataframe to a csv file called "filtered_compounds.csv"
df_merged_without_duplicates2.sort_values("compound").to_csv('../assets/filtered_compounds.csv', index=False, encoding='utf-8')