### Imports

In [49]:
from importlib import reload
from itertools import chain

import pandas as pd
import numpy as np
import networkx as nx
import nx_parallel as nxp
import scipy

from os import listdir
from os.path import isfile, join

# Import the helper modules and reload them so that recent edits to their source are picked up
from helper import graph_analyzer, file_preprocesser
reload(file_preprocesser)
reload(graph_analyzer)


from helper.file_preprocesser import prepare_text, prepare_text_with_libraries, convert_preprocessed_tokens_to_graph, extract_metadata_from_file_name
from helper.graph_analyzer import parallel_get_distance_measures, parallel_get_betweenness_list, get_powerlaw_result
import config

### Optionen laden

In [50]:
# TODO: move this setup into config?
# Apply the project settings to the configuration object of NetworkX's "parallel" backend.
parallel_cfg = nx.config.backends.parallel
parallel_cfg.active = config.ACTIVE_PARALLEL
parallel_cfg.n_jobs = config.N_JOBS



### Dateipfade laden

In [51]:
# Names of all regular files (the book texts) located directly in config.DATA_PATH.
# NOTE: os.listdir returns entries in arbitrary order, so the row order of the
# resulting table is not guaranteed to be the same on every machine.
file_name_list = [f for f in listdir(config.DATA_PATH) if isfile(join(config.DATA_PATH, f))]

# Contents of the files, in the same order as file_name_list.
# Each file is opened in a `with` block so its handle is closed right after reading
# (the previous one-liner left every file open until garbage collection).
# The lines keep their trailing newline and are joined with a single space.
# NOTE(review): the files are read with the platform default encoding — confirm that
# this matches the encoding of the text files.
file_content_list = []
for file_name in file_name_list:
    with open(join(config.DATA_PATH, file_name)) as book_file:
        file_content_list.append(" ".join(book_file.readlines()))


### Dataframe mit Dateinamen und Inhalt befüllen

In [52]:
# Build the base table: one row per file with the columns "file_name" and "file_content"
df = pd.DataFrame({"file_name": file_name_list, "file_content": file_content_list})

# Extract the metadata from every file name and expand each result into its own columns
metadata_per_file = df["file_name"].apply(extract_metadata_from_file_name)
file_metadata = metadata_per_file.apply(pd.Series)

# Attach the expanded metadata as the columns "author", "title" and "language"
df[["author", "title", "language"]] = file_metadata

print(df)

                  file_name  \
0      Kafka_Amerika_de.txt   
1      Kafka_Prozess_en.txt   
2      Kafka_Schloss_de.txt   
3  Kafka_Verwandlung_de.txt   
4            ALL_ALL_DE.txt   
5      Kafka_Schloss_en.txt   
6  Kafka_Verwandlung_en.txt   
7      Kafka_Prozess_de.txt   
8      Kafka_Amerika_en.txt   

                                        file_content author        title  \
0  Der Heizer\n Als der sechzehnjährige Karl Roßm...  Kafka      Amerika   
1  Chapter One\n \n Arrest--Conversation with Mrs...  Kafka      Prozess   
2  Es war spätabends, als K. ankam. Das Dorf lag ...  Kafka      Schloss   
3  Als Gregor Samsa eines Morgens aus unruhigen T...  Kafka  Verwandlung   
4  IN THE BEGINNING was the myth. God, in his sea...    ALL          ALL   
5  1\n Arrival\n It was late evening when K. arri...  Kafka      Schloss   
6  One morning, when Gregor Samsa woke from troub...  Kafka  Verwandlung   
7  ERSTES KAPITEL\n \n VERHAFTUNG · GESPRÄCH MIT ...  Kafka      Prozess   
8  TH

### Text vorbereiten und Graphen erstellen

In [53]:
def _prepare(text):
    """Preprocess one raw text using the stop-word setting from config."""
    return prepare_text_with_libraries(text, remove_stopwords=config.REMOVE_STOPWORDS)


def _to_graph(prepared_tokens):
    """Turn one preprocessed text into a graph using the link distance from config."""
    return convert_preprocessed_tokens_to_graph(prepared_tokens, config.LINK_DISTANCE)


# Preprocess every file's text
df["prepared_text"] = df["file_content"].apply(_prepare)

# Store the settings used for this run next to the results (same value in every row)
df["stopwords_removed"] = config.REMOVE_STOPWORDS
df["link_distance"] = config.LINK_DISTANCE

# Build one graph per preprocessed text
df["graph"] = df["prepared_text"].apply(_to_graph)

### Metriken berechnen

In [54]:
# Basic metrics: number of nodes and number of edges of each graph
df["node_count"] = df["graph"].apply(lambda g: len(g.nodes))
df["edge_count"] = df["graph"].apply(lambda g: len(g.edges))

# Node degrees: one array of degrees per graph, plus its mean and median
df["degree_list"] = df["graph"].apply(lambda g: np.array([deg for node, deg in g.degree]))
# Alternative (suggested by ChatGPT): dataframe["degree_list"] = dataframe["graph"].apply(lambda g: np.fromiter(dict(g.degree).values(), dtype=float))
df["average_degree"] = df["degree_list"].apply(lambda degree_list: np.mean(degree_list))
df["median_degree"] = df["degree_list"].apply(lambda degree_list: np.median(degree_list))

# Parallel computation, if switched on.
# The switch is read from config.ACTIVE_PARALLEL — the same setting that activates the
# NetworkX parallel backend — instead of the bare name PARALLEL, which is not defined
# anywhere in this notebook and raised a NameError on a fresh kernel.
if config.ACTIVE_PARALLEL:
    # Each helper result is expanded with pd.Series into the two columns assigned below
    distance_measures = df["graph"].apply(parallel_get_distance_measures).apply(pd.Series)
    df[["diameter", "average_distance"]] = distance_measures
    df["betweenness_list"] = df["graph"].apply(parallel_get_betweenness_list)

# Otherwise (NOT parallel): plain NetworkX calls.
# NOTE: nx.diameter and nx.average_shortest_path_length raise for graphs that are
# not connected.
else:
    df["diameter"] = df["graph"].apply(lambda g: nx.diameter(g))
    df["average_distance"] = df["graph"].apply(lambda g: nx.average_shortest_path_length(g))
    df["betweenness_list"] = df["graph"].apply(lambda g: np.array(list(nx.betweenness_centrality(g).values())))

# Power-law properties of the degree distribution; each result is expanded into two columns
powerlaw_result = df["degree_list"].apply(get_powerlaw_result).apply(pd.Series)
df[["powerlaw_alpha_value", "powerlaw_xmin_value"]] = powerlaw_result

# Summary statistics of betweenness, computed from the per-node values of each graph
df["betweenness_min"] = df["betweenness_list"].apply(np.min)
df["betweenness_max"] = df["betweenness_list"].apply(np.max)
df["betweenness_average"] = df["betweenness_list"].apply(np.mean)
df["betweenness_standard_deviation"] = df["betweenness_list"].apply(np.std)


# Clustering: average clustering coefficient of each graph
df["average_clustering"] = df["graph"].apply(lambda g: nx.average_clustering(g))


### Speichern der Tabelle in Datei

In [55]:
# Only the columns listed in config.ATTRIBUTES_VISIBLE_IN_FILE are written to the CSV file
exported_columns = df[config.ATTRIBUTES_VISIBLE_IN_FILE]
exported_columns.to_csv("data/output/output.csv", index=False)

print(df)

                  file_name  \
0      Kafka_Amerika_de.txt   
1      Kafka_Prozess_en.txt   
2      Kafka_Schloss_de.txt   
3  Kafka_Verwandlung_de.txt   
4            ALL_ALL_DE.txt   
5      Kafka_Schloss_en.txt   
6  Kafka_Verwandlung_en.txt   
7      Kafka_Prozess_de.txt   
8      Kafka_Amerika_en.txt   

                                        file_content author        title  \
0  Der Heizer\n Als der sechzehnjährige Karl Roßm...  Kafka      Amerika   
1  Chapter One\n \n Arrest--Conversation with Mrs...  Kafka      Prozess   
2  Es war spätabends, als K. ankam. Das Dorf lag ...  Kafka      Schloss   
3  Als Gregor Samsa eines Morgens aus unruhigen T...  Kafka  Verwandlung   
4  IN THE BEGINNING was the myth. God, in his sea...    ALL          ALL   
5  1\n Arrival\n It was late evening when K. arri...  Kafka      Schloss   
6  One morning, when Gregor Samsa woke from troub...  Kafka  Verwandlung   
7  ERSTES KAPITEL\n \n VERHAFTUNG · GESPRÄCH MIT ...  Kafka      Prozess   
8  TH