#### Imports

In [61]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.inspection import permutation_importance
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyClassifier
from sklearn.neighbors import KNeighborsClassifier

import numpy as np

from helper.normalizer import normalize
from config import FEATURES_USED_BY_CLASSIFIER

#### Einlesen der Daten und Aufteilen in Training- und Testdaten

In [62]:
df = pd.read_csv('output_isDigit.csv')
df = df[df['stopwords_removed'] == True]
df = df[df['author'] != 'ALL']
df = df[df['link_distance'] == 1]
df = df[df['fixed_token_count'] == df['fixed_token_count'].max()]

features = FEATURES_USED_BY_CLASSIFIER

df = normalize(df, features)

df_train = df[df['language'] == 'en'] # Trainingsdaten
df_test = df[df['language'] == 'de'] # Testdaten


X_train = df_train[features]
y_train = df_train['author']

X_test = df_test[features]
y_test = df_test['author']

#### Trainieren des Klassifikators und Speichern der Genauigkeitswerte

In [63]:
# Zufallszahlen setzen für reproduzierbare Ergebnisse
random_number = 123
np.random.seed(random_number)

classifier_list = [
    DummyClassifier(strategy='uniform', random_state=random_number),
    DecisionTreeClassifier(random_state=random_number), # Entscheidungsbaum
    KNeighborsClassifier(n_neighbors=5), # KNN-Algorithmus
    GaussianNB() # Naive-Bayes-Algorithmus
]



for classifier in classifier_list:

    print(f'Klassifikator: {classifier.__class__.__name__}')

    accuracy_score_list = []

    # Um Zufallseffekte zu umgehen, bilden wir den Durchschnitt der Genauigkeit in 10 unterschiedlichen Läufen
    for _ in range(10):

        classifier.fit(X_train, y_train)

        y_prediction = classifier.predict(X_test)

        accuracy_score_list.append(accuracy_score(y_test, y_prediction))

    print(f'    Durchschnitt: {np.mean(accuracy_score_list)}')
    print(f'    Standardabweichung: {np.std(accuracy_score_list)}')

    # Klassifikator auf alle möglichen Daten fitten
    # (für Bestimmung der Permutationswichtigkeit)
    classifier.fit(X_train, y_train)

    permutation_importance_list = permutation_importance(classifier, X_train, y_train, n_repeats=10, random_state=random_number)
    most_important_features = sorted(list(zip(features, permutation_importance_list.importances_mean)), key = lambda x: x[1], reverse=True)

    print(f'    Wichtigste Features: {most_important_features[:2]}')



Klassifikator: DummyClassifier
    Durchschnitt: 0.17647058823529413
    Standardabweichung: 0.0
    Wichtigste Features: [('node_count', 0.0), ('edge_count', 0.0)]
Klassifikator: DecisionTreeClassifier
    Durchschnitt: 0.5294117647058824
    Standardabweichung: 0.0
    Wichtigste Features: [('average_degree', 0.3647058823529411), ('edge_count', 0.3058823529411765)]
Klassifikator: KNeighborsClassifier
    Durchschnitt: 0.4117647058823529
    Standardabweichung: 0.0
    Wichtigste Features: [('edge_count', 0.005882352941176472), ('diameter', 0.005882352941176472)]
Klassifikator: GaussianNB
    Durchschnitt: 0.4117647058823529
    Standardabweichung: 0.0
    Wichtigste Features: [('diameter', 0.2176470588235294), ('node_count', 0.09411764705882351)]
