### Imports

In [63]:
from importlib import reload
import pandas as pd
import numpy as np
import networkx as nx

from os import listdir
from os.path import isfile, join

# Importieren und neu laden (damit aktuelle Änderungen übernommen werden)
from helper import graph_analyzer, text_preprocesser
reload(text_preprocesser)
reload(graph_analyzer)


from helper.text_preprocesser import prepareText, convert_preprocessed_tokens_to_graph
from helper.graph_analyzer import parallelGetDistanceMeasures

### Optionen setzen

In [64]:
# Switch for the distance metrics (diameter, average distance) computed further below:
#   True  -> computed via parallelGetDistanceMeasures from helper.graph_analyzer
#   False -> computed via nx.diameter and nx.average_shortest_path_length
PARALLEL: bool = True

### Dateipfade laden

In [65]:
# Base directory that holds the input files
DATA_PATH = "data/input/"

# Names of all regular files directly inside DATA_PATH (sub-directories are skipped)
file_name_list = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f))]


def _read_joined_lines(path):
    """Return all lines of the file at *path*, joined by a single space.

    The file is opened inside a ``with`` block so its handle is closed as soon
    as the content has been read, instead of staying open until it happens to
    be garbage collected.
    """
    # NOTE(review): no encoding is passed, so the platform default applies —
    # confirm that this matches the encoding of the input files.
    with open(path) as text_file:
        return " ".join(text_file.readlines())


# Contents of the files, in the same order as file_name_list
file_content_list = [_read_joined_lines(join(DATA_PATH, f)) for f in file_name_list]


### Dataframe mit Dateinamen und Inhalt befüllen

In [66]:
# Build the table: one row per file, with the file name in "title"
# and the file's text in "content"
dataframe = pd.DataFrame(
    dict(
        title=file_name_list,
        content=file_content_list,
    )
)


### Text vorbereiten und Graphen erstellen

In [67]:
# Step 1: run every raw text through prepareText and store the result
content_column = dataframe["content"]
dataframe["prepared_text"] = content_column.apply(prepareText)

# Step 2: turn every prepared text into a graph
prepared_column = dataframe["prepared_text"]
dataframe["graph"] = prepared_column.apply(convert_preprocessed_tokens_to_graph)

### Metriken berechnen

In [68]:
# The graph column is the input of almost every metric below
graphs = dataframe["graph"]

# Basic metrics: number of nodes and number of edges
dataframe["node_count"] = graphs.apply(lambda graph: len(graph.nodes))
dataframe["edge_count"] = graphs.apply(lambda graph: len(graph.edges))

# Node degrees: one array of degrees per graph, then its mean and median
dataframe["degree_list"] = graphs.apply(
    lambda graph: np.array([degree for _, degree in graph.degree])
)
degree_arrays = dataframe["degree_list"]
dataframe["average_degree"] = degree_arrays.apply(lambda degrees: np.mean(degrees))
dataframe["median_degree"] = degree_arrays.apply(lambda degrees: np.median(degrees))

# Distance metrics: diameter and average shortest-path length
if not PARALLEL:
    # Sequential variant, computed directly with networkx
    dataframe["diameter"] = graphs.apply(nx.diameter)
    dataframe["average_distance"] = graphs.apply(nx.average_shortest_path_length)
else:
    # Parallel variant via the project helper; pd.Series expands each result
    # into columns, which are then stored as "diameter" and "average_distance".
    # NOTE(review): presumably the helper returns (diameter, average_distance)
    # in exactly this order — confirm in helper.graph_analyzer.
    distance_measures = graphs.apply(parallelGetDistanceMeasures).apply(pd.Series)
    dataframe[["diameter", "average_distance"]] = distance_measures

# Betweenness: per-node centrality dict, summarised by min / max / mean / std
dataframe["betweenness_list"] = graphs.apply(nx.betweenness_centrality)
# Extract the centrality values once per graph and reuse them for all four statistics
betweenness_scores = dataframe["betweenness_list"].apply(
    lambda centrality: list(centrality.values())
)
dataframe["betweenness_min"] = betweenness_scores.apply(lambda scores: np.min(scores))
dataframe["betweenness_max"] = betweenness_scores.apply(lambda scores: np.max(scores))
dataframe["betweenness_average"] = betweenness_scores.apply(lambda scores: np.mean(scores))
dataframe["betweenness_standard_deviation"] = betweenness_scores.apply(
    lambda scores: np.std(scores)
)

# Clustering: average clustering coefficient per graph
dataframe["average_clustering"] = graphs.apply(nx.average_clustering)


### Speichern der Tabelle in Datei

In [69]:
from os import makedirs
from os.path import dirname

# Columns written to the CSV, in output order. Columns holding raw text,
# graphs or per-node lists ("content", "prepared_text", "graph",
# "degree_list", "betweenness_list") are intentionally not exported.
attributes_visible_in_file = [
    "title",
    "node_count",
    "edge_count",
    "average_degree",
    "median_degree",
    "diameter",
    "average_distance",
    "betweenness_min",
    "betweenness_max",
    "betweenness_average",
    "betweenness_standard_deviation",
    "average_clustering",
]

# Target file of the export
output_file = "data/output/output.csv"

# to_csv does not create missing directories; make sure the target directory
# exists so the computed metrics are not lost to an OSError at the last step.
makedirs(dirname(output_file), exist_ok=True)

# Write only the selected columns, without the DataFrame index
dataframe[attributes_visible_in_file].to_csv(output_file, index=False)