### Imports

In [89]:
from importlib import reload
from itertools import chain

import pandas as pd
import numpy as np
import networkx as nx
import nx_parallel as nxp
import scipy

from os import listdir
from os.path import isfile, join

# Import the project modules and reload them so that edits made to their
# source since the kernel started are picked up without a kernel restart.
import config
from helper import graph_analyzer, file_preprocesser
reload(config)
reload(file_preprocesser)
reload(graph_analyzer)


from helper.file_preprocesser import prepare_text, prepare_text_with_libraries, convert_preprocessed_tokens_to_graph, extract_metadata_from_file_name
from helper.graph_analyzer import parallel_get_distance_measures, parallel_get_betweenness_list, get_powerlaw_result
import config

### Optionen laden

In [90]:
# TODO: move this into config?
# Configure networkx's "parallel" backend from the project config: whether the
# backend is active and how many jobs it may use.
nx.config.backends.parallel.active = config.ACTIVE_PARALLEL
nx.config.backends.parallel.n_jobs = config.N_JOBS

### Dateipfade laden

In [91]:
# Names of all regular files (the books) located directly in the data directory
file_name_list = [f for f in listdir(config.DATA_PATH) if isfile(join(config.DATA_PATH, f))]


def _read_joined_lines(path):
    """Return the lines of the file at ``path`` joined with a single space.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as the content has been read (the previous comprehension left every file
    open until garbage collection). Lines keep their trailing newline, exactly
    as before.

    NOTE(review): the platform default text encoding is used, as in the
    original code -- confirm whether an explicit encoding is wanted.
    """
    with open(path) as book_file:
        return " ".join(book_file.readlines())


# Contents of the files, in the same order as file_name_list
file_content_list = [_read_joined_lines(join(config.DATA_PATH, f)) for f in file_name_list]


### Dataframe mit Dateinamen und Inhalt befüllen

In [92]:
# Build the table with one row per book: columns "file_name" and "file_content"
df = pd.DataFrame(
    {
        "file_name": file_name_list,
        "file_content": file_content_list,
    }
)

# Derive metadata from every file name and expand the per-row result into
# separate columns, which are then stored as author / title / language.
file_metadata = (
    df["file_name"]
    .apply(extract_metadata_from_file_name)
    .apply(pd.Series)
)
df[["author", "title", "language"]] = file_metadata

print(df)

                        file_name  \
0            Kafka_Amerika_de.txt   
1          Hesse_Camenzind_de.txt   
2              Austen_Anna_de.txt   
3        Hesse_Steppenwolf_de.txt   
4       Nietzsche_Jenseits_de.txt   
5         Nietzsche_Geburt_en.txt   
6            Kafka_Prozess_en.txt   
7            Kafka_Schloss_de.txt   
8        Kafka_Verwandlung_de.txt   
9          Austen_Verstand_en.txt   
10  Nietzsche_Menschliches_de.txt   
11            Hesse_Demian_en.txt   
12            Austen_Stolz_en.txt   
13        Hesse_Siddhartha_en.txt   
14       Nietzsche_Goetzen_de.txt   
15        Austen_Mansfield_en.txt   
16   Nietzsche_Zarathustra_de.txt   
17            Austen_Stolz_de.txt   
18        Hesse_Siddhartha_de.txt   
19       Nietzsche_Goetzen_en.txt   
20        Austen_Mansfield_de.txt   
21   Nietzsche_Zarathustra_en.txt   
22           Kafka_Schloss_en.txt   
23       Kafka_Verwandlung_en.txt   
24         Austen_Verstand_de.txt   
25  Nietzsche_Menschliches_en.txt   
2

### Text vorbereiten, Graphen erstellen und Infos in Dataframe schreiben

In [93]:
# Turn every book text into a token list; stop-word removal follows the config flag
df["token_list"] = df["file_content"].apply(
    prepare_text_with_libraries,
    remove_stopwords=config.REMOVE_STOPWORDS,
)
df["token_list_length"] = df["token_list"].apply(len)

# Build one graph per token list, using the configured link distance
df["graph"] = df["token_list"].apply(
    convert_preprocessed_tokens_to_graph,
    args=(config.LINK_DISTANCE,),
)

# Record the settings used for this run alongside the results
df["stopwords_removed"] = config.REMOVE_STOPWORDS
df["link_distance"] = config.LINK_DISTANCE

### Metriken des Netzwerks berechnen

In [94]:
# Basic metrics: number of nodes and number of edges of each graph
df["node_count"] = df["graph"].apply(lambda graph: len(graph.nodes))
df["edge_count"] = df["graph"].apply(lambda graph: len(graph.edges))

# Node degrees: one numpy array per graph holding the degree of every node
df["degree_list"] = df["graph"].apply(
    lambda graph: np.array([degree for _node, degree in graph.degree])
)

# Alternative suggested by ChatGPT: dataframe["degree_list"] = dataframe["graph"].apply(lambda g: np.fromiter(dict(g.degree).values(), dtype=float))
# Mean and median of each graph's degree array
df["average_degree"] = df["degree_list"].apply(lambda degrees: np.mean(degrees))
df["median_degree"] = df["degree_list"].apply(lambda degrees: np.median(degrees))

# Distance measures (diameter, average distance) and per-node betweenness.
#
# The branch is selected by config.ACTIVE_PARALLEL, the same flag that
# activates the networkx parallel backend earlier in this notebook. The
# previous condition used a bare name `PARALLEL` that is not defined or
# imported anywhere in the notebook, so the cell raised NameError on a fresh
# kernel (Restart & Run All).
# NOTE(review): confirm ACTIVE_PARALLEL is the intended switch for this branch.

# If enabled: use the parallel helper functions
if config.ACTIVE_PARALLEL:
    # The helper's per-graph result is expanded into two columns
    distance_measures = df["graph"].apply(parallel_get_distance_measures).apply(pd.Series)
    df[["diameter", "average_distance"]] = distance_measures
    df["betweenness_list"] = df["graph"].apply(parallel_get_betweenness_list)

# Otherwise: plain (non-parallel) networkx calls
else:
    df["diameter"] = df["graph"].apply(lambda g: nx.diameter(g))
    df["average_distance"] = df["graph"].apply(lambda g: nx.average_shortest_path_length(g))
    # Betweenness centrality of every node, as one numpy array per graph
    df["betweenness_list"] = df["graph"].apply(lambda g : np.array( list(nx.betweenness_centrality(g).values()) ))

# Power-law properties of each graph's degree array; the helper's per-graph
# result is expanded into two columns (alpha and xmin)
powerlaw_result = (
    df["degree_list"]
    .apply(get_powerlaw_result)
    .apply(pd.Series)
)
df[["powerlaw_alpha_value", "powerlaw_xmin_value"]] = powerlaw_result

# Summary statistics over each graph's list of betweenness values
_betweenness_summaries = {
    "betweenness_min": np.min,
    "betweenness_max": np.max,
    "betweenness_average": np.mean,
    "betweenness_standard_deviation": np.std,
}
for _column_name, _summarize in _betweenness_summaries.items():
    df[_column_name] = df["betweenness_list"].apply(_summarize)


# Clustering: average clustering coefficient of each graph
df["average_clustering"] = df["graph"].apply(
    lambda graph: nx.average_clustering(graph)
)


### Speichern der Tabelle in Datei

In [95]:
# Write only the columns listed in config.ATTRIBUTES_VISIBLE_IN_FILE to the
# CSV file; the DataFrame index is not written (index=False).
df[config.ATTRIBUTES_VISIBLE_IN_FILE].to_csv("data/output/output.csv", index=False)

### Buch mit den wenigsten Wörtern finden

In [96]:
# Index label of the row with the fewest tokens
smallest_book_idx = df["token_list_length"].idxmin()

# idxmin() returns an index *label*, so the row must be looked up with the
# label-based .loc. The positional .iloc used before only gave the right row
# while df had its default 0..n-1 index, and would silently pick a different
# row after any filtering, sorting or re-indexing.
print(df.loc[smallest_book_idx])



file_name                                     Kafka_Verwandlung_de.txt
file_content         Als Gregor Samsa eines Morgens aus unruhigen T...
author                                                           Kafka
title                                                      Verwandlung
language                                                            de
token_list           [als, Gregor, Samsa, ein, Morgen, aus, unruhig...
token_list_length                                                18763
graph                (als, Gregor, Samsa, ein, Morgen, aus, unruhig...
stopwords_removed                                                False
link_distance                                                        1
node_count                                                        2854
edge_count                                                       11742
Name: 8, dtype: object
