In [1]:
# This notebook adds the accepted reported compounds to the compounds in
# compounds_data/cleansed_and_with_freq.csv and clears the frequency class of blocked
# compounds (blocked rows are kept in the table, not dropped).
# The result is saved to ../assets/final_compounds.csv.

# Re-run this notebook whenever the accepted or blocked compounds change.

import pandas as pd
import numpy as np
import os

In [2]:
# Load the three input tables.
# keep_default_na=False switches off pandas' default NaN markers, so empty fields and
# strings such as "NA" or "null" are kept as text instead of being parsed as NaN.
BASE_COMPOUNDS_CSV = 'compounds_data/cleansed_and_with_freq.csv'
ACCEPTED_REPORTS_CSV = 'reported/report_accepted.csv'
BLOCKED_CSV = 'reported/blocked_compounds.csv'

df = pd.read_csv(BASE_COMPOUNDS_CSV, keep_default_na=False)
df_reported = pd.read_csv(ACCEPTED_REPORTS_CSV, keep_default_na=False)
df_blocked = pd.read_csv(BLOCKED_CSV, keep_default_na=False)

# How often was each compound reported? Show only the five most-reported ones.
reports_per_compound = df_reported.groupby('compound').size().reset_index(name='count')
counts = reports_per_compound.sort_values('count', ascending=False).head()
display(counts)

# Collapse repeated reports: the first row of each (modifier, head) pair is kept
df_reported = df_reported.drop_duplicates(subset=['modifier', 'head'], keep='first')

Unnamed: 0,compound,count
19,Aufstand,4
105,Vorfall,2
76,Regenwassertonne,2
22,Beifall,2
116,Zeigefinger,2


In [3]:
# Stack the base table and the accepted reported compounds into one frame.
# ignore_index=True renumbers the rows 0..n-1 instead of keeping each source's own index.
frames_to_merge = [df, df_reported]
df_with_reported = pd.concat(frames_to_merge, ignore_index=True)

df_with_reported

Unnamed: 0,compound,modifier,head,freq_class
0,Aalbestand,Aal,Bestand,45.0
1,Aalfang,Aal,Fang,43.0
2,Aalfisch,Aal,Fisch,
3,Aalmutter,Aal,Mutter,
4,Aalquappe,Aal,Quappe,
...,...,...,...,...
122194,Werteunion,Wert,Union,
122195,Werkzeugwagen,Werkzeug,Wagen,
122196,Schützenball,Schütze,Ball,
122197,Unterführung,untere,Führung,


In [4]:
# Flag the blocked rows and clear their frequency class.
# Blocked rows are NOT dropped: they stay in the table with freq_class set to NaN.
# A row counts as blocked when its compound, its head, or its modifier appears in the
# 'component_or_compound' column of the block list.
blocked_terms = df_blocked['component_or_compound']
compound_is_blocked = df_with_reported['compound'].isin(blocked_terms)
head_is_blocked = df_with_reported['head'].isin(blocked_terms)
modifier_is_blocked = df_with_reported['modifier'].isin(blocked_terms)
is_blocked = compound_is_blocked | head_is_blocked | modifier_is_blocked

# Print the number of blocked rows
print(is_blocked.sum())

# Work on an explicit copy so df_with_reported stays untouched. A plain assignment would
# only alias the same object: the NaN write below (and any later in-place change to
# df_final) would leak back into df_with_reported and make re-running this cell unsafe.
df_final = df_with_reported.copy()
df_final.loc[is_blocked, 'freq_class'] = np.nan
df_final

1589


Unnamed: 0,compound,modifier,head,freq_class
0,Aalbestand,Aal,Bestand,45.0
1,Aalfang,Aal,Fang,43.0
2,Aalfisch,Aal,Fisch,
3,Aalmutter,Aal,Mutter,
4,Aalquappe,Aal,Quappe,
...,...,...,...,...
122194,Werteunion,Wert,Union,
122195,Werkzeugwagen,Werkzeug,Wagen,
122196,Schützenball,Schütze,Ball,
122197,Unterführung,untere,Führung,


In [5]:
# Every (head, modifier) pair must occur only once in the final table.
# keep=False flags all members of a duplicated group, so the display lists each offending row.
has_twin = df_final.duplicated(subset=['head', 'modifier'], keep=False)
duplicates = df_final.loc[has_twin]
display(duplicates.sort_values(by="compound"))

# Stop here if any duplicated pair was found
assert duplicates.empty

Unnamed: 0,compound,modifier,head,freq_class


In [6]:
# Save the final dataframe to a csv file called "final_compounds.csv".
# The column is exported as 'frequency_class'. The rename rebinds df_final to a new frame
# instead of renaming in place: df_final may be the very same object as df_with_reported,
# and an in-place rename would strip 'freq_class' from that frame too, so that re-running
# the blocking cell would create a fresh all-NaN 'freq_class' column.
df_final = df_final.rename(columns={'freq_class': 'frequency_class'})

# Rows are written ordered by compound, then frequency class; the row index is not exported.
df_final.sort_values(["compound", "frequency_class"]).to_csv(
    '../assets/final_compounds.csv', index=False, encoding='utf-8'
)