In [67]:
import pandas as pd

In [68]:
# Load the compound list. keep_default_na=False stops pandas from turning empty
# fields and NA-like strings (e.g. 'NA', 'null') into NaN while parsing.
df = pd.read_csv('../assets/final_compounds.csv', keep_default_na=False)
# Empty frequency_class fields are labelled with the literal string 'nan'
# (a placeholder string, not a floating-point NaN).
df = df.assign(frequency_class=df['frequency_class'].replace('', 'nan'))
df.head()

Unnamed: 0,compound,modifier,head,frequency_class
0,Aalbestand,Aal,Bestand,22.0
1,Aalfang,Aal,Fang,20.0
2,Aalfisch,Aal,Fisch,
3,Aalmutter,Aal,Mutter,
4,Aalraupe,Aal,Raupe,


# Game Mode: **All in One** - Build big compounds

In [69]:
# The initial "level" of a compound is the number of its constituents
# (modifier and head) that are themselves listed as compounds: 0, 1 or 2.
known_compounds = df['compound']
modifier_is_compound = df['modifier'].isin(known_compounds).astype('int64')
head_is_compound = df['head'].isin(known_compounds).astype('int64')
df = df.assign(level=modifier_is_compound + head_is_compound)

# Summarise the dataframe and the distribution of levels
print(df.describe(include='all'))
df['level'].value_counts()


        compound modifier    head frequency_class          level
count     113063   113063  113063          113063  113063.000000
unique    113041    11859    9678              21            NaN
top     Andenken     Land    Zeit             nan            NaN
freq           2      464     438           21234            NaN
mean         NaN      NaN     NaN             NaN       0.148484
std          NaN      NaN     NaN             NaN       0.369247
min          NaN      NaN     NaN             NaN       0.000000
25%          NaN      NaN     NaN             NaN       0.000000
50%          NaN      NaN     NaN             NaN       0.000000
75%          NaN      NaN     NaN             NaN       0.000000
max          NaN      NaN     NaN             NaN       2.000000


0    96835
1    15668
2      560
Name: level, dtype: int64

In [70]:
def increase_level(df, level):
    """Raise by one the level of rows built from compounds of at least ``level``.

    A row is bumped when its ``modifier`` or its ``head`` equals the
    ``compound`` of any row whose current ``level`` is >= ``level``.
    A row is bumped at most once per call, even if both constituents match.

    The frame is modified in place and the same object is returned.
    """
    qualifying_compounds = df.loc[df['level'] >= level, 'compound']
    modifier_matches = df['modifier'].isin(qualifying_compounds)
    head_matches = df['head'].isin(qualifying_compounds)
    df.loc[modifier_matches | head_matches, 'level'] += 1
    return df

# Run one pass of increase_level for each threshold from 1 to 5, in ascending order
for threshold in range(1, 6):
    df = increase_level(df, threshold)

df['level'].value_counts()

0    96835
1    15342
2      864
3       22
Name: level, dtype: int64

In [71]:
# Display every compound whose level is exactly 3
df[df['level'].eq(3)]

Unnamed: 0,compound,modifier,head,frequency_class,level
11683,Biomasseheizkraftwerk,Biomasse,Heizkraftwerk,,3
13374,Brennstoffzellenfahrzeug,Brennstoffzelle,Fahrzeug,21.0,3
14249,Bundesligatorschützenkönig,Bundesliga,Torschützenkönig,,3
23683,Fachhochschulstudiengang,Fachhochschule,Studiengang,20.0,3
24227,Fahrzeuginnenraumreinigung,Fahrzeuginnenraum,Reinigung,,3
26339,Feuerwehrdienstvorschrift,Feuerwehr,Dienstvorschrift,,3
28045,Fluorchlorkohlenwasserstoff,Fluor,Chlorkohlenwasserstoff,18.0,3
29064,Frauenfußballbundesliga,Frauenfußball,Bundesliga,,3
29088,Frauenhandballnationalmannschaft,Frau,Handballnationalmannschaft,,3
41609,Hochwasserschutzmaßnahme,Hochwasserschutz,Maßnahme,18.0,3


In [73]:
# Show the 20 rows with level >= 2 that have the lowest frequency_class,
# ignoring rows without a frequency class.
# frequency_class may hold strings (including the 'nan' placeholder), so it is
# converted to numbers first: sorting the raw strings is lexicographic
# (e.g. '9.0' would come after '22.0') and would not drop the placeholder rows.
df_high_level = df.loc[df['level'] >= 2]
(
    df_high_level
    .assign(frequency_class=pd.to_numeric(df_high_level['frequency_class'], errors='coerce'))
    .dropna(subset=['frequency_class'])
    .sort_values(by='frequency_class')
    .head(20)
)

Unnamed: 0,compound,modifier,head,frequency_class,level
26347,Feuerwehrgerätehaus,Feuerwehr,Gerätehaus,14.0,2
14383,Bundesumweltminister,Bund,Umweltminister,14.0,2
87239,Sonntagnachmittag,Sonntag,Nachmittag,14.0,2
103879,Vorjahreszeitraum,Vorjahr,Zeitraum,14.0,2
14384,Bundesumweltministerium,Bund,Umweltministerium,15.0,2
6810,Bahnhofsvorplatz,Bahnhof,Vorplatz,15.0,2
51965,Kreisvolkshochschule,Kreis,Volkshochschule,15.0,2
99332,Umweltschutzorganisation,Umweltschutz,Organisation,15.0,2
99222,Umweltbundesamt,Umwelt,Bundesamt,15.0,2
14372,Bundestagswahlkampf,Bundestag,Wahlkampf,15.0,2
