# Unterscheide Gedichte von Spam - naiver Ansatz

In [None]:
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn.model_selection
import sklearn.tree
from sklearn.metrics import plot_confusion_matrix

## Lade Daten

In [None]:
# Load the spam mails; the first CSV column is used as the row index.
df_spam = pd.read_csv(filepath_or_buffer="spam_betreff_und_text.csv", index_col=0)
df_spam

In [None]:
# Load the poems; the first CSV column is used as the row index.
df_poems = pd.read_csv(filepath_or_buffer="poems.csv", index_col=0)
df_poems

Führe die zwei Datensätze zusammen.

In [None]:
# `assign` returns a new frame, so `df_poems` itself is left untouched.
# The appended "category" column labels every row as a poem.
df_poems_merger = df_poems.assign(category="poem")
# Rename all columns to the schema shared with the spam frame
# (assumes the CSV provides exactly three data columns — TODO confirm).
df_poems_merger.columns = ["creator", "title", "text", "category"]

In [None]:
# `assign` returns a new frame, so `df_spam` itself is left untouched.
# The appended "category" column labels every row as spam.
df_spam_merger = df_spam.assign(category="spam")
# Rename all columns to the schema shared with the poem frame
# (assumes the CSV provides exactly three data columns — TODO confirm).
df_spam_merger.columns = ["creator", "title", "text", "category"]

In [None]:
# Stack poems on top of spam row-wise; both frames share the same four columns.
# The original row indices of each frame are kept as they are.
df = pd.concat((df_poems_merger, df_spam_merger), axis=0)
df

In [None]:
# Print column dtypes and non-null counts to spot missing values before cleaning.
df.info()

Entferne Zeilen mit fehlenden Werten.

In [None]:
# Discard every row in which at least one column is missing.
df = df.dropna(axis=0, how="any")
df

## Feature Engineering

Es wird für jeden Eintrag ein Vektor $x$ erzeugt.
Die meisten ML-Verfahren können nur Zahlenwerte in Form von Vektoren und Matrizen verarbeiten, weswegen Texte speziell aufbereitet werden müssen.

In [None]:
def _uppercase_ratio(text):
    """Return the share of ASCII uppercase letters among all ASCII letters in ``text``.

    Returns ``0.0`` when ``text`` contains no ASCII letter at all, so that texts
    consisting only of digits, punctuation or non-ASCII characters do not raise
    ``ZeroDivisionError``.
    """
    n_letters = sum(ch in string.ascii_letters for ch in text)
    if n_letters == 0:
        return 0.0
    n_upper = sum(ch in string.ascii_uppercase for ch in text)
    return n_upper / n_letters


# One record (dict) per row of `df`: the label plus four hand-crafted numeric features.
features = []

for _, row in df.iterrows():
    text = row["text"]
    features.append({
        "category": row["category"],
        # Length of the title and of the text in characters.
        "Titellänge": len(row["title"]),
        "Textlänge": len(text),
        # Case-insensitive count of the English word "money" in the text
        # (the column label is German, the search term is English).
        "Anzahl 'Geld'": text.lower().count("money"),
        # Fraction of uppercase letters among all ASCII letters.
        "Großbuchstaben": _uppercase_ratio(text),
    })

# Show only a preview; the complete table is built from `features` afterwards.
features[:5]

In [None]:
# Turn the list of per-row feature dicts into a table (one column per dict key).
df_text_features = pd.DataFrame(data=features)
df_text_features

## Teile Daten auf

In [None]:
# Split into training and test set: 33 % of the rows are held out for testing.
# The feature matrix is every column except the label; the label vector is "category".
# `random_state` fixes the shuffle so the split is reproducible.
# `.values` of the feature frame is already a 2-D array, so no re-stacking of
# the resulting splits is needed.
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    df_text_features.drop(columns="category").values,
    df_text_features["category"].values,
    test_size=0.33,
    random_state=42,
)

## Trainiere Entscheidungsbaum

In [None]:
# Fit a decision tree limited to depth 5; `random_state` makes the fit reproducible.
# `fit` returns the estimator itself, so construction and training can be chained.
clf = sklearn.tree.DecisionTreeClassifier(max_depth=5, random_state=0).fit(X_train, y_train)

Berechne den Accuracy-Wert auf den Trainingsdaten.

In [None]:
# Mean accuracy of the fitted tree on the data it was trained on.
clf.score(X=X_train, y=y_train)

## Untersuche Ergebnis

In [None]:
# Mean accuracy of the fitted tree on the held-out test data.
clf.score(X=X_test, y=y_test)

In [None]:
# Confusion matrix of the tree's predictions on the held-out test data.
# NOTE(review): `plot_confusion_matrix` was deprecated in scikit-learn 1.0 and
# removed in 1.2 — confirm the pinned version, or migrate to
# `ConfusionMatrixDisplay.from_estimator` together with the import cell.
plot_confusion_matrix(estimator=clf, X=X_test, y_true=y_test, cmap="BuPu")

In [None]:
# Draw the fitted decision tree on an explicitly created, wide axes object.
fig, ax = plt.subplots(figsize=(27, 10))
sklearn.tree.plot_tree(
    clf,
    # Feature names in the same column order as the training matrix. Passed as a
    # plain list because some scikit-learn releases reject a pandas Index here.
    feature_names=df_text_features.drop(columns="category").columns.tolist(),
    ax=ax,
)
plt.show()