# Cleansing

This notebook does the following:
- Removes compounds that contain invalid characters from the dataset
- For compounds with two modifiers, creates two entries (one per modifier), as these may reduce reports
- Removes duplicates
- Merges the data with the frequency data

In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import math

In [9]:
# Load the raw compound list and give the modifier column a shorter name.
# keep_default_na=False stops pandas from converting its default NA markers
# (e.g. "NA", "null") into NaN, so such strings are kept as read.
df = (
    pd.read_csv("compounds_data/all_compounds.csv", keep_default_na=False)
    .rename(columns={"modifier1(|modifier2)": "modifier"})
)
# Sanity check: the loaded frame must not contain any missing values.
assert not df.isnull().values.any()

In [10]:
# NOTE: despite its name, this pattern matches a single character that is NOT
# one of the allowed letters (the leading ^ negates the character class).
valid_characters_regex = r'[^a-zA-ZäÄöÖüÜßé]'

# Keep only the rows whose compound contains no character outside the allowed set
df_valid_chars = df.loc[lambda frame: ~frame['compound'].str.contains(valid_characters_regex)]

print("Removed rows with invalid characters in the compound column: ", df.shape[0] - df_valid_chars.shape[0])

Removed rows with invalid characters in the compound column:  2762


In [11]:
# The `modifier` column (originally "modifier1(|modifier2)") sometimes contains
# two modifiers separated by a vertical bar. For these, create one row per
# modifier; the single-modifier rows are kept aside unchanged.
#
# The bar is matched as a literal character (regex=False) instead of via the
# pattern '\|': in a non-raw string literal `\|` is an invalid escape sequence,
# which newer Python versions warn about.
has_two_modifiers_mask = df_valid_chars['modifier'].str.contains('|', regex=False)
df_modifiers = df_valid_chars[has_two_modifiers_mask].copy()
df_single_modifiers = df_valid_chars[~has_two_modifiers_mask].copy()

# A single-character separator is treated as a literal string by str.split,
# so no escaping is needed. explode() then emits one row per list element and
# repeats the original index label for rows that stem from the same compound.
df_modifiers['modifier'] = df_modifiers['modifier'].str.split('|')
df_modifiers = df_modifiers.explode('modifier')
df_modifiers

Unnamed: 0,compound,modifier,head
73,Abbauarbeit,Abbau,Arbeit
73,Abbauarbeit,abbauen,Arbeit
74,Abbaufeld,abbauen,Feld
74,Abbaufeld,Abbau,Feld
75,Abbaufläche,Abbau,Fläche
...,...,...,...
120309,Zimmermann,zimmern,Mann
120317,Zimmermeister,Zimmer,Meister
120317,Zimmermeister,zimmern,Meister
121098,Zulaufstrecke,Zulauf,Strecke


In [12]:
# Stack the untouched single-modifier rows on top of the exploded rows and
# renumber the index from scratch.
df_modifiers_exploded = pd.concat(
    [df_single_modifiers, df_modifiers],
    ignore_index=True,
)

# Keep only the first row of every (modifier, head) pair.
# NOTE(review): `compound` is not part of the key, so rows with different
# compounds but the same modifier and head are reduced to the first occurrence
# as well - confirm this is intended.
df_without_duplicates = df_modifiers_exploded.loc[
    lambda frame: ~frame.duplicated(subset=['modifier', 'head'], keep='first')
]

In [13]:
# Load the frequency table, keeping everything except the two raw frequency columns
df_freq = (
    pd.read_csv("frequency_data/combined_freq_class.csv", keep_default_na=False)
    .drop(columns=['freq_de_web', 'freq_derewo'])
)

# Left join on the compound itself: every compound row is kept, and compounds
# without a matching `word` in the frequency table end up with missing values.
# NOTE(review): if `word` is not unique in the frequency table, the join
# repeats the matching compound rows - verify against the data.
df_merged = (
    df_without_duplicates
    .merge(df_freq, how='left', left_on='compound', right_on='word')
    .drop(columns=['word'])
)
df_merged

Unnamed: 0,compound,modifier,head,freq_class
0,Aalbestand,Aal,Bestand,45.0
1,Aalfang,Aal,Fang,43.0
2,Aalfisch,Aal,Fisch,
3,Aalmutter,Aal,Mutter,
4,Aalquappe,Aal,Quappe,
...,...,...,...,...
122960,Zimmermann,zimmern,Mann,26.0
122961,Zimmermeister,Zimmer,Meister,37.0
122962,Zimmermeister,zimmern,Meister,37.0
122963,Zulaufstrecke,Zulauf,Strecke,


In [14]:
# Write the cleansed compounds with their frequency class to disk,
# leaving out the row index
df_merged.to_csv(path_or_buf="compounds_data/cleansed_and_with_freq.csv", index=False)