In [1]:
import pandas as pd

# This notebook cleanses the data (removing duplicates, compounds with hyphens, spaces, etc.).

In [2]:
# Some of the compounds have "Null" or "null" as values in the columns, these should not be treated as NaN
df = pd.read_csv("compounds_data/all_compounds.csv", keep_default_na=False)
print(df.head)

# Function to extract the desired modifier: try to only keep the uppercase modifier, if present
def extract_modifier(modifiers):
    modifiers_list = modifiers.split('|')
    if len(modifiers_list) == 1:
        return modifiers_list[0]
    
    if modifiers_list[0][0].isupper():
        return modifiers_list[0]
    
    return modifiers_list[1]

# Apply the function to create the "modifier" column
df["modifier"] = df["modifier1(|modifier2)"].apply(extract_modifier)
df = df[["compound", "modifier", "head"]]
df.head

<bound method NDFrame.head of                    compound modifier1(|modifier2)        head
0          10000-Meter-Lauf           10000 Meter        Lauf
1           1000-Jahr-Feier             1000 Jahr       Feier
2           1000-Meter-Lauf            1000 Meter        Lauf
3            100-Jahr-Feier              100 Jahr       Feier
4         100-Jahr-Jubiläum              100 Jahr    Jubiläum
...                     ...                   ...         ...
121859          Zypressenöl              Zypresse          Öl
121860  Zypressenwolfsmilch              Zypresse  Wolfsmilch
121861   Zytologieassistent             Zytologie   Assistent
121862           Zytoplasma                  zyto      Plasma
121863   Zytoplasmafortsatz            Zytoplasma    Fortsatz

[121864 rows x 3 columns]>


<bound method NDFrame.head of                    compound     modifier        head
0          10000-Meter-Lauf  10000 Meter        Lauf
1           1000-Jahr-Feier    1000 Jahr       Feier
2           1000-Meter-Lauf   1000 Meter        Lauf
3            100-Jahr-Feier     100 Jahr       Feier
4         100-Jahr-Jubiläum     100 Jahr    Jubiläum
...                     ...          ...         ...
121859          Zypressenöl     Zypresse          Öl
121860  Zypressenwolfsmilch     Zypresse  Wolfsmilch
121861   Zytologieassistent    Zytologie   Assistent
121862           Zytoplasma         zyto      Plasma
121863   Zytoplasmafortsatz   Zytoplasma    Fortsatz

[121864 rows x 3 columns]>

In [3]:
# Check if there are any null values
print(df.isnull().sum())

# Print the rows with null values in the modifier1 column
print(df[df["modifier"].isnull()])

compound    0
modifier    0
head        0
dtype: int64
Empty DataFrame
Columns: [compound, modifier, head]
Index: []


In [4]:
# Remove compounds with hyphens (-) in them
df_without_hyphens = df[~df["compound"].str.contains("-")]
print(df_without_hyphens.head)

# Compare size of dataframes
print(len(df))
print(len(df_without_hyphens))

# Check if there are any hyphens in the modifier or head columns
print(df_without_hyphens[df_without_hyphens["modifier"].str.contains("-")])
print(df_without_hyphens[df_without_hyphens["head"].str.contains("-")])

<bound method NDFrame.head of                    compound    modifier        head
56               Aalbestand         Aal     Bestand
57                  Aalfang         Aal        Fang
58                 Aalfisch         Aal       Fisch
59                Aalmutter         Aal      Mutter
60                Aalquappe         Aal      Quappe
...                     ...         ...         ...
121859          Zypressenöl    Zypresse          Öl
121860  Zypressenwolfsmilch    Zypresse  Wolfsmilch
121861   Zytologieassistent   Zytologie   Assistent
121862           Zytoplasma        zyto      Plasma
121863   Zytoplasmafortsatz  Zytoplasma    Fortsatz

[119105 rows x 3 columns]>
121864
119105
Empty DataFrame
Columns: [compound, modifier, head]
Index: []
Empty DataFrame
Columns: [compound, modifier, head]
Index: []


In [5]:
# Check if there are any duplicates
print(df_without_hyphens.duplicated().any())

# Print the duplicates
print(df_without_hyphens[df_without_hyphens.duplicated(keep=False)].sort_values(by=["compound"]))

# Remove the duplicates
df_without_duplicates = df_without_hyphens.drop_duplicates()

True
              compound modifier      head
922         Abwehrchef   Abwehr      Chef
923         Abwehrchef   Abwehr      Chef
955     Abwehrstellung   Abwehr  Stellung
956     Abwehrstellung   Abwehr  Stellung
957     Abwehrstellung   Abwehr  Stellung
40052       Grünanteil     Grün    Anteil
40053       Grünanteil     Grün    Anteil
41200        Hagelkorn    Hagel      Korn
41201        Hagelkorn    Hagel      Korn
56729         Kreuzweg    Kreuz       Weg
56728         Kreuzweg    Kreuz       Weg
60197         Laufwerk     Lauf      Werk
60199         Laufwerk     Lauf      Werk
98012        Stammbuch  stammen      Buch
98014        Stammbuch  stammen      Buch
106324     Trommelfell  Trommel      Fell
106325     Trommelfell  Trommel      Fell
120168       Zielmarke     Ziel     Marke
120169       Zielmarke     Ziel     Marke


In [6]:
# Print all the rows where modifier or head starts with a lowercase letter
print(df_without_duplicates[df_without_duplicates["modifier"].str[0].str.islower()])
print(df_without_duplicates[df_without_duplicates["head"].str[0].str.islower()])

df_without_duplicates[df_without_duplicates["head"].str[0].str.islower()].groupby("head").count().sort_values(by=["compound"], ascending=False)


                      compound        modifier             head
89            Abbiegeassistent        abbiegen        Assistent
90      Abbiegeassistenzsystem        abbiegen  Assistenzsystem
91                 Abbiegespur        abbiegen             Spur
95                Abblendlicht       abblenden            Licht
117                Abdeckcreme        abdecken            Creme
...                        ...             ...              ...
121830           Zwölftonreihe           zwölf         Tonreihe
121831         Zwölftontechnik       zwölf Ton          Technik
121832           Zwölfzylinder           zwölf         Zylinder
121833      Zwölfzylindermotor  zwölf Zylinder            Motor
121862              Zytoplasma            zyto           Plasma

[18086 rows x 3 columns]
                         compound              modifier   head
4323                  Archivwesen                Archiv  wesen
4413                   Armenwesen                  Arme  wesen
5737             

Unnamed: 0_level_0,compound,modifier
head,Unnamed: 1_level_1,Unnamed: 2_level_1
wesen,90,90
logie,3,3
builder,1,1
click,1,1
core,1,1
groß,1,1
hoster,1,1
kauen,1,1
line,1,1
play,1,1


In [7]:
# Print rows with spaces in one of the columns and then remove these rows
print(df_without_duplicates[df_without_duplicates["compound"].str.contains(" ")])
print(df_without_duplicates[df_without_duplicates["modifier"].str.contains(" ")])
print(df_without_duplicates[df_without_duplicates["head"].str.contains(" ")])

df_without_spaces = df_without_duplicates[~df_without_duplicates["compound"].str.contains(" ")]
df_without_spaces = df_without_spaces[~df_without_spaces["modifier"].str.contains(" ")]
df_without_spaces = df_without_spaces[~df_without_spaces["head"].str.contains(" ")]
print(len(df_without_spaces))

               compound modifier   head
32016         Free Jazz     free   Jazz
48565  Irakischer Dinar     Irak  Dinar
85635       Rotary Club   rotary   Club
                      compound           modifier     head
1042      Achthundertmeterlauf  achthundert Meter     Lauf
1043    Achthundertmeterrennen  achthundert Meter   Rennen
1046            Achtstundentag        acht Stunde      Tag
1050         Achtzylindermotor      acht Zylinder    Motor
1272             Affenbrotbaum          Affe Brot     Baum
...                        ...                ...      ...
121826         Zwölffingerdarm       zwölf Finger     Darm
121828         Zwölfmeilenzone        zwölf Meile     Zone
121829           Zwölftonmusik          zwölf Ton    Musik
121831         Zwölftontechnik          zwölf Ton  Technik
121833      Zwölfzylindermotor     zwölf Zylinder    Motor

[902 rows x 3 columns]
Empty DataFrame
Columns: [compound, modifier, head]
Index: []
118190


In [8]:
# Save the dataframe to a csv file
df_without_spaces.to_csv("compounds_data/cleansed_compounds.csv", index=False)