# In [71]:
# Using the filtered_compounds.csv file, this script creates a graph of the compounds and their components.
# A compound consists of two components, named modifier and head. The modifier is the first word in the compound, and the head is the second word.
# E.g. the compound "Apfelbaum" consists of the modifier "Apfel" and the head "Baum".
# Build a graph where the components are the nodes and the compounds are the edges.

import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx



# Load the compound table from filtered_compounds.csv.
# The default NA markers are switched off and only the empty string is listed,
# so empty fields are the only values parsed as missing.
df = pd.read_csv(
    'compounds_data/filtered_compounds.csv',
    keep_default_na=False,
    na_values=[''],
)

# Keep only the rows whose frequency_class is at most 12
df = df.loc[df['frequency_class'].le(12)]

# For a first test, use a simple graph with only 4 compounds.
# To do so, uncomment the lines below to replace df with a small custom dataframe.
# df = pd.DataFrame(columns=['modifier', 'head'])
# df.loc[0] = ['Apfel', 'Baum']
# df.loc[1] = ['Birne', 'Baum']
# df.loc[2] = ['Kirsch', 'Baum']
# df.loc[3] = ['Apfel', 'Kuchen']


# Build an undirected graph whose nodes are the compound components and whose
# edges are the compounds joining a modifier to a head.
# NOTE(review): nx.Graph stores a single undirected edge per node pair, so a
# later row with the same pair (in either order) replaces the title and
# frequency of an earlier one. Confirm this is intended; a MultiDiGraph would
# keep every compound and its direction.
G = nx.Graph()

# Leave out rows without a modifier or a head: a missing value (NaN) would
# otherwise end up in the graph as a bogus node.
edge_rows = df.dropna(subset=['modifier', 'head'])

# Add the nodes: all modifiers first, then all heads (duplicates are merged)
G.add_nodes_from(edge_rows['modifier'])
G.add_nodes_from(edge_rows['head'])

# Add one edge per compound, with the compound itself as the title and its
# frequency class as the frequency attribute. Zipping the columns avoids
# building a Series for every row, which is what iterrows() does.
G.add_edges_from(
    (modifier, head, {'title': compound, 'frequency': frequency_class})
    for modifier, head, compound, frequency_class in zip(
        edge_rows['modifier'],
        edge_rows['head'],
        edge_rows['compound'],
        edge_rows['frequency_class'],
    )
)

# Export the graph as a gexf file
nx.write_gexf(G, "compounds_data/compounds_graph.gexf")
