In [211]:
import numpy as np
import pandas as pd
from IPython.core.pylabtools import figsize
from matplotlib import pyplot as plt

In [212]:
# One row per author: (name, matplotlib marker, RGB colour).
# Both lookup dicts below are derived from this table so that their keys
# (and their iteration order) always agree.
_author_styles = (
    ('Nietzsche', 'o', (0.2, 0.4, 0.8)),
    ('Kafka', 's', (0.1, 0.7, 0.3)),
    ('Austen', '^', (0.9, 0.6, 0.1)),
    ('Hesse', 'D', (1.0, 0.2, 0.7)),
)

# author name -> marker symbol used in the scatter plots
author_marker_dict = {name: marker for name, marker, _rgb in _author_styles}

# author name -> RGB colour tuple used in the scatter plots
author_color_dict = {name: rgb for name, _marker, rgb in _author_styles}

# Columns that are plotted against each other in the scatter matrix.
# 'node_count' and 'edge_count' are deliberately left out.
numerical_columns = [
    'betweenness_standard_deviation',
    'average_degree',
    'median_degree',
    'diameter',
    'average_distance',
    'betweenness_average',
    'average_clustering',
]


In [213]:
# Load the results table and narrow it down step by step.
# The filters are applied in sequence on purpose: the last one compares
# against the maximum 'fixed_token_count' of the rows that survived the
# earlier filters, not of the whole file.
df = (
    pd.read_csv('output_isDigit.csv')
    # rows whose 'stopwords_removed' flag equals True
    .loc[lambda frame: frame['stopwords_removed'] == True]
    # drop the rows whose author is the literal 'ALL'
    .loc[lambda frame: frame['author'] != 'ALL']
    # rows with link_distance equal to 1
    .loc[lambda frame: frame['link_distance'] == 1]
    # rows with the largest remaining fixed_token_count
    .loc[lambda frame: frame['fixed_token_count'] == frame['fixed_token_count'].max()]
)

print(df)

        author         title language  token_list_length  node_count  \
68      Austen          Anna       de              13062        3089   
69      Austen     Mansfield       de              13062        3518   
70       Hesse   Steppenwolf       de              13062        4554   
71       Hesse    Siddhartha       en              13062        2298   
72       Hesse        Demian       de              13062        3685   
73   Nietzsche      Jenseits       de              13062        4195   
74       Hesse        Demian       en              13062        2818   
75   Nietzsche       Goetzen       de              13062        4076   
76       Kafka       Amerika       en              13062        2743   
77       Kafka       Schloss       en              13062        2354   
78      Austen      Verstand       en              13062        2380   
79       Hesse   Steppenwolf       en              13062        3549   
80   Nietzsche   Zarathustra       en              13062        

In [214]:
# Scatter-plot matrix: panel (i, j) plots numerical_columns[j] on the x-axis
# against numerical_columns[i] on the y-axis, one marker/colour per author.
# squeeze=False keeps `axes` two-dimensional even if only one column is listed.
n_features = len(numerical_columns)
fig, axes = plt.subplots(n_features, n_features, figsize=(40, 40), squeeze=False)

# Split the frame by author once instead of re-filtering it for every panel.
rows_by_author = {
    author: df[df['author'] == author]
    for author in author_marker_dict
}

for i, column_i in enumerate(numerical_columns):
    for j, column_j in enumerate(numerical_columns):
        ax = axes[i, j]

        for author, author_rows in rows_by_author.items():
            ax.scatter(
                author_rows[column_j],
                author_rows[column_i],
                marker=author_marker_dict[author],
                label=author,
                color=author_color_dict[author],
            )

        # Labels and legend are set once per panel, after all authors are drawn.
        # Every panel is labelled with the names of the columns it actually shows.
        ax.set_xlabel(column_j)
        ax.set_ylabel(column_i)
        ax.legend()

fig.tight_layout()
fig.savefig('scatter_matrix.png')
# Close explicitly so the large figure is released and not rendered inline.
plt.close(fig)

#### Abbildungen für gemischte Sprachen

In [215]:


# Feature pairs exported as stand-alone figures: (x-axis column, y-axis column).
feature_combinations_to_save = [
    ('betweenness_standard_deviation', 'diameter'),
    ('average_degree', 'average_clustering')
]

# Pairs with hand-written captions: (output file name, x label, y label).
mixed_figure_specs = {
    ('betweenness_standard_deviation', 'diameter'):
        ('scatter_matrix_mixed_1', 'Betweenness Standardabweichung', '$D$'),
    ('average_degree', 'average_clustering'):
        ('scatter_matrix_mixed_2', 'Durchschnittlicher Knotengrad', '$C$'),
}

for feature_a, feature_b in feature_combinations_to_save:

    fig, ax = plt.subplots()

    # Both languages are drawn together here; points differ only by author.
    for author in author_marker_dict:
        author_rows = df[df['author'] == author]
        ax.scatter(
            author_rows[feature_a],
            author_rows[feature_b],
            marker=author_marker_dict[author],
            label=author,
            color=author_color_dict[author],
        )

    # Pairs without a curated entry fall back to the raw column names and get
    # a file name derived from the pair, so that several uncaptioned figures
    # do not overwrite one another.
    filename, x_label, y_label = mixed_figure_specs.get(
        (feature_a, feature_b),
        (f'scatter_matrix_mixed_{feature_a}_{feature_b}', feature_a, feature_b),
    )
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend()

    fig.savefig(filename)
    plt.close(fig)



#### Abbildungen für getrennte Sprachen

In [216]:


# Feature pairs exported as stand-alone figures: (x-axis column, y-axis column).
feature_combinations_to_save = [
    ('betweenness_standard_deviation', 'diameter'),
    ('average_degree', 'average_clustering')
]

# Pairs with hand-written captions: (output file name, x label, y label).
diff_figure_specs = {
    ('betweenness_standard_deviation', 'diameter'):
        ('scatter_matrix_diff_1', 'Betweenness Standardabweichung', '$D$'),
    ('average_degree', 'average_clustering'):
        ('scatter_matrix_diff_2', 'Durchschnittlicher Knotengrad', '$C$'),
}

# The language split does not depend on the feature pair, so do it once.
df_en = df[df['language'] == 'en']
df_de = df[df['language'] == 'de']

for feature_a, feature_b in feature_combinations_to_save:

    fig, ax = plt.subplots()

    # One colour per language; authors are not distinguished in these figures.
    ax.scatter(df_en[feature_a], df_en[feature_b], label='Englische Werke', color='blue', s=100)
    ax.scatter(df_de[feature_a], df_de[feature_b], label='Deutsche Werke', color='red', s=100)

    # Pairs without a curated entry fall back to the raw column names and get
    # a file name derived from the pair, so that several uncaptioned figures
    # do not overwrite one another.
    filename, x_label, y_label = diff_figure_specs.get(
        (feature_a, feature_b),
        (f'scatter_matrix_diff_{feature_a}_{feature_b}', feature_a, feature_b),
    )
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend()

    fig.savefig(filename)
    plt.close(fig)

