### Imports

In [324]:
from importlib import reload
from itertools import chain

import pandas as pd
import numpy as np
import networkx as nx
import nx_parallel as nxp
import scipy

from os import listdir
from os.path import isfile, join

# Import the helper modules and reload them right away so that edits made to
# their source files since the kernel started are picked up. The reloads must
# run before the `from helper... import ...` lines that follow, otherwise those
# names would still be bound to the stale module objects.
from helper import graph_analyzer, text_preprocesser
reload(text_preprocesser)
reload(graph_analyzer)


from helper.text_preprocesser import prepare_text, prepare_text_with_libraries, convert_preprocessed_tokens_to_graph
from helper.graph_analyzer import parallel_get_distance_measures, parallel_get_betweenness_list, get_powerlaw_alpha

### Optionen setzen

In [325]:
# Notebook-level switch; it is not read anywhere in this cell.
# NOTE(review): the flag is False while the networkx parallel backend is
# activated right below — confirm that this combination is intended.
PARALLEL = False

# Activate the networkx "parallel" backend and let it use 4 jobs.
parallel_backend_config = nx.config.backends.parallel
parallel_backend_config.active = True
parallel_backend_config.n_jobs = 4

# Show the resulting networkx configuration.
print(nx.config)



### Dateinamen und Dateiinhalte laden

In [326]:
# Directory that contains the input text files
DATA_PATH = "data/input/"

# Names of all regular files (sub-directories are skipped) found in DATA_PATH
file_name_list = [f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f))]

# Content of every file, in the same order as file_name_list. The lines of a
# file are joined with a single space (each line keeps its trailing newline).
# Every file is opened in a `with` block so that its handle is closed as soon
# as it has been read instead of being left open until garbage collection.
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — consider pinning encoding="utf-8" if the corpus is known to be UTF-8.
file_content_list = []
for book_file_name in file_name_list:
    with open(join(DATA_PATH, book_file_name)) as book_file:
        file_content_list.append(" ".join(book_file.readlines()))

### DataFrame mit Dateinamen und Inhalt befüllen

In [327]:
# Build the table with one row per file and the columns "file_name" and "file_content"
df = pd.DataFrame(
    {
        "file_name": file_name_list,
        "file_content": file_content_list,
    }
)


def extract_metadata_from_file_name(file_name):
    """Split a file name of the form ``<author>_<title>_<language><ext>`` into its parts.

    The extension is removed with ``os.path.splitext`` rather than by cutting
    off a fixed number of characters, so extensions of any length (or no
    extension at all) are handled.

    Parameters
    ----------
    file_name : str
        Base name of the file, e.g. ``"Kafka_Amerika_de.txt"``.

    Returns
    -------
    tuple of (str, str, str)
        ``(author, title, language)``. Underscore-separated fields after the
        third one are ignored.

    Raises
    ------
    IndexError
        If the name (without extension) has fewer than three
        underscore-separated fields.
    """
    # Local import: the notebook's import cell only pulls in isfile and join.
    from os.path import splitext

    stem, _extension = splitext(file_name)
    fields = stem.split("_")
    # Plain indexing (not unpacking) keeps the IndexError for short names.
    return fields[0], fields[1], fields[2]

# Parse every file name into its metadata fields ...
parsed_file_names = df["file_name"].apply(extract_metadata_from_file_name)
# ... and expand the parsed values into one column per field.
file_metadata = parsed_file_names.apply(pd.Series)

# Attach the expanded fields to the main table under descriptive column names.
df[["author", "title", "language"]] = file_metadata

### Text vorbereiten und Graphen erstellen

In [328]:
# Run the library-based preprocessing on the raw content of every file.
df["prepared_text"] = df["file_content"].apply(prepare_text_with_libraries)

# Convert every preprocessed text into a graph.
# NOTE(review): the meaning of the second argument (1) is defined in
# helper.text_preprocesser and is not visible here — consider a named constant.
df["graph"] = df["prepared_text"].apply(
    lambda prepared: convert_preprocessed_tokens_to_graph(prepared, 1)
)

### Metriken berechnen

In [329]:
from helper.graph_analyzer import get_powerlaw_result

# Basic metrics: number of nodes and number of edges of every graph
df["node_count"] = df["graph"].apply(lambda graph: len(graph.nodes))
df["edge_count"] = df["graph"].apply(lambda graph: len(graph.edges))

# Node degrees: one NumPy array per graph holding the degree of each node
df["degree_list"] = df["graph"].apply(
    lambda graph: np.array([degree for _, degree in graph.degree])
)

# Power-law fit per degree array; the values returned by get_powerlaw_result
# are expanded into separate columns.
# NOTE(review): presumably the helper returns (alpha, xmin) in that order —
# verify against helper.graph_analyzer.
powerlaw_result = df["degree_list"].apply(get_powerlaw_result).apply(pd.Series)
df[["powerlaw_alpha_value", "powerlaw_xmin_value"]] = powerlaw_result

print(df)


                  file_name  \
0      Kafka_Amerika_de.txt   
1      Kafka_Prozess_en.txt   
2      Kafka_Schloss_de.txt   
3  Kafka_Verwandlung_de.txt   
4            ALL_ALL_DE.txt   
5      Kafka_Schloss_en.txt   
6  Kafka_Verwandlung_en.txt   
7      Kafka_Prozess_de.txt   
8      Kafka_Amerika_en.txt   

                                        file_content author        title  \
0  Der Heizer\n Als der sechzehnjährige Karl Roßm...  Kafka      Amerika   
1  Chapter One\n \n Arrest--Conversation with Mrs...  Kafka      Prozess   
2  Es war spätabends, als K. ankam. Das Dorf lag ...  Kafka      Schloss   
3  Als Gregor Samsa eines Morgens aus unruhigen T...  Kafka  Verwandlung   
4  IN THE BEGINNING was the myth. God, in his sea...    ALL          ALL   
5  1\n Arrival\n It was late evening when K. arri...  Kafka      Schloss   
6  One morning, when Gregor Samsa woke from troub...  Kafka  Verwandlung   
7  ERSTES KAPITEL\n \n VERHAFTUNG · GESPRÄCH MIT ...  Kafka      Prozess   
8  TH

Calculating best minimal value for power law fit
  (CDF_diff**2) /
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
Calculating best minimal value for power law fit
