### Imports

In [294]:
from importlib import reload
from itertools import chain

import pandas as pd
import numpy as np
import networkx as nx
import scipy

from os import listdir
from os.path import isfile, join

# Importieren und neu laden (damit aktuelle Änderungen übernommen werden)
from helper import graph_analyzer, text_preprocesser
reload(text_preprocesser)
reload(graph_analyzer)


from helper.text_preprocesser import prepareText, convert_preprocessed_tokens_to_graph
from helper.graph_analyzer import parallelGetDistanceMeasures, parallel_get_betweenness_list

### Optionen setzen

In [295]:
# Switch read by the metrics cell: True selects the project's own parallel
# helper functions, False the plain NetworkX calls.
PARALLEL = False

# Settings of the NetworkX "parallel" backend.
# NOTE(review): requires the backend to be registered in nx.config.backends - confirm it is installed.
parallel_backend_config = nx.config.backends.parallel
parallel_backend_config.active = True
parallel_backend_config.n_jobs = 4

# Print the complete NetworkX configuration for reference
print(nx.config)



### Dateipfade laden

In [296]:
# General path to the input data
DATA_PATH = "data/input/"

# List of book file names (regular files only, sub-directories are skipped).
# Sorted, because listdir() returns entries in arbitrary order and the row
# order of every table built from this list would otherwise vary between runs.
file_name_list = sorted(f for f in listdir(DATA_PATH) if isfile(join(DATA_PATH, f)))

# List of file contents, in the same order as file_name_list
file_content_list = []
for file_name in file_name_list:
    # The with-block closes every file right after reading it instead of
    # leaving the handle open until garbage collection.
    # NOTE(review): assumes the input files are UTF-8 encoded - confirm. Without an
    # explicit encoding the platform default is used, which differs between systems.
    with open(join(DATA_PATH, file_name), encoding="utf-8") as text_file:
        # Lines keep their trailing newline and are additionally joined with a space
        file_content_list.append(" ".join(text_file.readlines()))


### Dataframe mit Dateinamen und Inhalt befüllen

In [297]:
# Build the table: one row per input file, with the file name in column
# "title" and the file's text in column "content".
df = pd.DataFrame(
    data={
        "title": file_name_list,
        "content": file_content_list,
    }
)


### Text vorbereiten und Graphen erstellen

In [298]:
# Run prepareText (from helper.text_preprocesser) on the raw text of every row
prepared_text_column = df["content"].apply(prepareText)
df["prepared_text"] = prepared_text_column

# Convert every prepared text into a graph (helper.text_preprocesser)
graph_column = prepared_text_column.apply(convert_preprocessed_tokens_to_graph)
df["graph"] = graph_column

### Metriken berechnen

In [299]:

def _degree_array(graph):
    """Return the degrees of all nodes of `graph` as a NumPy array."""
    return np.array([degree for _node, degree in graph.degree])


def _betweenness_array(graph):
    """Return the betweenness centrality values of all nodes of `graph` as a NumPy array."""
    return np.array(list(nx.betweenness_centrality(graph).values()))


def _inverse_expon_scale(degrees):
    """Return 1 divided by the second value (the scale) of scipy.stats.expon.fit(degrees)."""
    return 1 / scipy.stats.expon.fit(degrees)[1]


# Basic metrics: number of nodes and number of edges
df["node_count"] = df["graph"].apply(lambda graph: len(graph.nodes))
df["edge_count"] = df["graph"].apply(lambda graph: len(graph.edges))

# Node degrees
df["degree_list"] = df["graph"].apply(_degree_array)
# Alternative (suggested by ChatGPT):
# dataframe["degree_list"] = dataframe["graph"].apply(lambda g: np.fromiter(dict(g.degree).values(), dtype=float))
df["average_degree"] = df["degree_list"].apply(np.mean)
df["median_degree"] = df["degree_list"].apply(np.median)

if not PARALLEL:
    # Sequential variant: plain NetworkX calls
    df["diameter"] = df["graph"].apply(nx.diameter)
    df["average_distance"] = df["graph"].apply(nx.average_shortest_path_length)
    df["betweenness_list"] = df["graph"].apply(_betweenness_array)
else:
    # Parallel variant: helper functions from helper.graph_analyzer.
    # The per-graph result is expanded into two columns via pd.Series.
    raw_distance_measures = df["graph"].apply(parallelGetDistanceMeasures)
    distance_measures = raw_distance_measures.apply(pd.Series)
    df[["diameter", "average_distance"]] = distance_measures
    df["betweenness_list"] = df["graph"].apply(parallel_get_betweenness_list)

# Determine the parameter lambda of the exponential distribution of the degrees
# TODO: choose a better name
df["lambda"] = df["degree_list"].apply(_inverse_expon_scale)

# Betweenness summary statistics from the list of per-node values
df["betweenness_min"] = df["betweenness_list"].apply(np.min)
df["betweenness_max"] = df["betweenness_list"].apply(np.max)
df["betweenness_average"] = df["betweenness_list"].apply(np.mean)
df["betweenness_standard_deviation"] = df["betweenness_list"].apply(np.std)


# Clustering
df["average_clustering"] = df["graph"].apply(nx.average_clustering)




### Speichern der Tabelle in Datei

In [300]:
# Columns written to the CSV file, in output order. Columns of df that are
# not listed here (e.g. "content", "graph", the per-node lists, "lambda")
# are not exported.
attributes_visible_in_file = [
    # identification
    "title",
    # graph size
    "node_count",
    "edge_count",
    # degree statistics
    "average_degree",
    "median_degree",
    # distance statistics
    "diameter",
    "average_distance",
    # betweenness statistics
    "betweenness_min",
    "betweenness_max",
    "betweenness_average",
    "betweenness_standard_deviation",
    # clustering
    "average_clustering",
]

# Select the export columns and write them without the row index
export_table = df[attributes_visible_in_file]
export_table.to_csv(path_or_buf="data/output/output.csv", index=False)

In [280]:
# NOTE(review): this cell reads `dataframe` and calls `calculate_metrics`, neither of
# which is defined in any cell visible above (the table there is named `df`), and its
# execution count is older than the cells before it. It presumably relies on state
# from an earlier kernel session - confirm before running on a fresh kernel.

# Flatten the prepared texts of all rows into one combined token list
all_tokens = [token for tokens in dataframe["prepared_text"] for token in tokens]

# New row: every existing column starts as None, then the known values are filled in
row = dict.fromkeys(dataframe.columns)
row["title"] = "combined_text"
row["prepared_text"] = all_tokens
row["graph"] = convert_preprocessed_tokens_to_graph(all_tokens)

# Append the combined row and compute the metrics for this last row only
combined_row_frame = pd.DataFrame([row])
dataframe = pd.concat([dataframe, combined_row_frame], ignore_index=True)
calculate_metrics(dataframe, rows=[len(dataframe) - 1], parallel=PARALLEL)


  dataframe = pd.concat([dataframe, pd.DataFrame([row])], ignore_index=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["node_count"] = df["graph"].apply(lambda g: len(g.nodes))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["edge_count"] = df["graph"].apply(lambda g: len(g.edges))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["de

Unnamed: 0,title,content,prepared_text,graph,node_count,edge_count,degree_list,average_degree,median_degree,diameter,average_distance,betweenness_list,lambda,betweenness_min,betweenness_max,betweenness_average,betweenness_standard_deviation,average_clustering
2,combined_text,,"[in, the, beginning, was, the, myth, god, in, ...","(in, the, beginning, was, myth, god, His, Sear...",2190,5249,"[143, 370, 3, 95, 2, 8, 60, 2, 57, 2, 4, 10, 2...",4.793607,2.0,10,3.348667,"[0.1068067996628173, 0.2198904277169819, 5.739...",0.263601,0.0,0.308576,0.001073,0.00974,0.204184
