In [19]:
# This notebook merges the accepted reported compounds (reported/report_accepted.csv) with the compounds in compounds_data/filtered_compounds.csv and removes the blocked compounds (reported/blocked_compounds.csv).
# The result is saved to ../assets/final_compounds.csv; the input files themselves are not modified.

# Re-run this notebook whenever the accepted or blocked compounds change

import pandas as pd
import numpy as np
import os

In [35]:
# Read the three input tables. keep_default_na=False tells pandas not to turn
# its default NA markers (or empty fields) into NaN, so values are kept as read.
csv_options = {'keep_default_na': False}
df = pd.read_csv('compounds_data/filtered_compounds.csv', **csv_options)
df_reported = pd.read_csv('reported/report_accepted.csv', **csv_options)
df_blocked = pd.read_csv('reported/blocked_compounds.csv', **csv_options)

# Number of accepted reports per compound; only the five most-reported
# compounds are kept in `counts` and shown.
reports_per_compound = df_reported.groupby('compound').size()
counts = (
    reports_per_compound
    .reset_index(name='count')
    .sort_values('count', ascending=False)
    .head()
)
display(counts)

# Collapse repeated reports: rows sharing the same (modifier, head) pair are
# reduced to their first occurrence.
df_reported = df_reported.drop_duplicates(subset=['modifier', 'head'])

Unnamed: 0,compound,count
0,Abschnitt,1
17,Mithilfe,1
31,Werkraum,1
30,Vorschrift,1
29,Vorsatz,1


In [21]:
# Stack the accepted reported compounds underneath the filtered compounds.
# ignore_index=True gives the combined frame a fresh 0..n-1 row index.
frames_to_merge = [df, df_reported]
df_with_reported = pd.concat(frames_to_merge, ignore_index=True)

df_with_reported

Unnamed: 0,compound,modifier,head,frequency_class
0,Aalbestand,Aal,Bestand,22.0
1,Aalfang,Aal,Fang,20.0
2,Aalfisch,Aal,Fisch,
3,Aalmutter,Aal,Mutter,
4,Aalraupe,Aal,Raupe,
...,...,...,...,...
113027,Vorschrift,vor,Schrift,
113028,Sitzgruppe,sitzen,Gruppe,
113029,Mithilfe,mit,Hilfe,
113030,Gardeprinz,Garde,Prinz,


In [22]:
# Flag every row whose compound appears in the blocked list, then keep
# only the rows that are not flagged.
is_blocked = df_with_reported['compound'].isin(df_blocked['compound'])
df_final = df_with_reported[~is_blocked]
df_final

Unnamed: 0,compound,modifier,head,frequency_class
0,Aalbestand,Aal,Bestand,22.0
1,Aalfang,Aal,Fang,20.0
2,Aalfisch,Aal,Fisch,
3,Aalmutter,Aal,Mutter,
4,Aalraupe,Aal,Raupe,
...,...,...,...,...
113027,Vorschrift,vor,Schrift,
113028,Sitzgruppe,sitzen,Gruppe,
113029,Mithilfe,mit,Hilfe,
113030,Gardeprinz,Garde,Prinz,


In [23]:
# Sanity check: every (head, modifier) pair must occur only once.
# keep=False marks all members of a duplicate group, so each conflicting
# row is listed below, ordered by compound.
key_columns = ['head', 'modifier']
is_repeated = df_final.duplicated(subset=key_columns, keep=False)
duplicates = df_final[is_repeated]
display(duplicates.sort_values(by="compound"))

# The table above must be empty; otherwise stop here.
assert duplicates.shape[0] == 0

Unnamed: 0,compound,modifier,head,frequency_class


In [24]:
# Write the final compounds, ordered by compound, to "final_compounds.csv"
# in the assets folder. index=False leaves the row index out of the file.
final_sorted = df_final.sort_values("compound")
final_sorted.to_csv('../assets/final_compounds.csv', index=False, encoding='utf-8')