# a)
Trainieren Sie auf den Daten aus email_body.csv einen NaiveBayes Klassifikator.
Ermitteln der a priori und bedingten Wahrscheinlichkeiten.
Ermitteln von a posteriori Wahrscheinlichkeiten für Spam, kein Spam.

Dünnbesetzte Matrix
->  jede Reihe referenziert auf eine E-Mail, während eine Spalte auf ein Wort referenziert;
    je Zelle sind dann die Anzahl der Vorkommnisse der Wörter vermerkt.

In [153]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

#Daten einlesen
df = pd.read_csv("email_body.csv")
df['body'] = df['body'].fillna("")

#Vektorisierung der E-Mail-Texte
vectorizer = CountVectorizer(stop_words="english")
X = vectorizer.fit_transform(df["body"])
X_sparse = X.toarray()

#Aufteilen der Daten in Trainings- und Testdatensätze
X_train, X_test, y_train, y_test = train_test_split(X_sparse, df["label"], test_size=0.2, random_state=5)

#Naive Bayes-Klassifikator trainieren
clf = MultinomialNB()
clf.fit(X_train, y_train)

#Vorhersagen auf Testdaten treffen und Metriken berechnen
y_pred = clf.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred)
nb_f1 = f1_score(y_test, y_pred, average="binary")

print("Accuracy:", nb_accuracy)
print("F1 Score:", nb_f1)

def log_posterior_probabilities(clf, X):
    log_prob_spam = X.dot(clf.feature_log_prob_[1]) + clf.class_log_prior_[1]
    log_prob_ham = X.dot(clf.feature_log_prob_[0]) + clf.class_log_prior_[0]
    return log_prob_spam, log_prob_ham

def print_conditional_probabilities(vectorizer, log_class_probabilities, class_name):
    #Abrufen der Feature-Namen (Wörter) aus dem Vektorisierer
    feature_names = vectorizer.get_feature_names_out()

    #Kombinieren der Wörter und ihrer bedingten Wahrscheinlichkeiten
    word_probabilities = zip(feature_names, log_class_probabilities)

    #Sortieren der Wörter nach ihren bedingten Wahrscheinlichkeiten
    sorted_word_probabilities = sorted(word_probabilities, key=lambda x: x[1], reverse=True)

    print(f"Logarithmus der bedingten Wahrscheinlichkeiten für {class_name}:")
    for word, probability in sorted_word_probabilities[:10]:
        print(f"{word}: {probability:.4f}")


#Berechnung der Logarithmen der a posteriori Wahrscheinlichkeiten für ein Beispiel aus den Testdaten
example_idx = 12
log_prob_spam, log_prob_ham = log_posterior_probabilities(clf, X_test[example_idx])

train_spam_emails = np.sum(y_train == 1)
train_non_spam_emails = np.sum(y_train == 0)

test_spam_emails = np.sum(y_test == 1)
test_non_spam_emails = np.sum(y_test == 0)
print(f"\nSpam_E-Mails im Testset: {test_spam_emails}")
print(f"Nicht-Spam_E-Mails im Testset: {test_non_spam_emails}")
print(f"Spam-E-Mails im Trainingsset: {train_spam_emails}")
print(f"Nicht-Spam-E-Mails im Trainingsset: {train_non_spam_emails}\n")

print("Logarithmus der a posteriori Wahrscheinlichkeit für Spam:", log_prob_spam)
print("Logarithmus der a posteriori Wahrscheinlichkeit für kein Spam:", log_prob_ham)

#a priori Wahrscheinlichkeiten
prior_prob_spam = np.exp(clf.class_log_prior_[1])
prior_prob_ham = np.exp(clf.class_log_prior_[0])

#bedingte Wahrscheinlichkeiten
conditional_prob_spam = np.exp(clf.feature_log_prob_[1])
conditional_prob_ham = np.exp(clf.feature_log_prob_[0])


print("A priori Wahrscheinlichkeit für Spam:", prior_prob_spam)
print("A priori Wahrscheinlichkeit für kein Spam:", prior_prob_ham)

print_conditional_probabilities(vectorizer, clf.feature_log_prob_[1], "Spam")
print("-------------------------------------")
print_conditional_probabilities(vectorizer, clf.feature_log_prob_[0], "Kein Spam")
def classify(log_prob_spam, log_prob_ham):
    if log_prob_spam > log_prob_ham:
        return "Spam"
    else:
        return "Kein Spam"

classification = classify(log_prob_spam, log_prob_ham)
print("\nKlassifizierung:", classification)

Accuracy: 0.9833333333333333
F1 Score: 0.9456521739130435

Spam_E-Mails im Testset: 95
Nicht-Spam_E-Mails im Testset: 505
Spam-E-Mails im Trainingsset: 405
Nicht-Spam-E-Mails im Trainingsset: 1995

Logarithmus der a posteriori Wahrscheinlichkeit für Spam: -156.11531405275807
Logarithmus der a posteriori Wahrscheinlichkeit für kein Spam: -145.6917898434386
A priori Wahrscheinlichkeit für Spam: 0.1687499999999999
A priori Wahrscheinlichkeit für kein Spam: 0.8312500000000003
Logarithmus der bedingten Wahrscheinlichkeiten für Spam:
number: -3.0158
url: -5.1168
hyperlink: -5.2416
free: -5.2892
money: -5.4912
email: -5.6705
mail: -5.7096
business: -5.8766
list: -5.8892
click: -5.9215
-------------------------------------
Logarithmus der bedingten Wahrscheinlichkeiten für Kein Spam:
number: -2.7316
url: -4.0580
list: -5.2512
just: -5.6529
use: -5.7015
com: -5.7212
like: -5.7606
new: -5.8763
people: -5.8789
time: -5.9238

Klassifizierung: Kein Spam


# b)
Zwei neue sinnvolle Merkmale auf Basis der Textdaten in email_body.csv
und Ergänzung von email_headers.csv.

In [154]:
#Verhältnis zwischen Großbuchstaben und allen Zeichen
import re

def count_uppercase(email):
    if not isinstance(email, str):
        return 0
    uppercase_count = 0
    number_count = len(re.findall(r'NUMBER', email))
    for char in email:
        if char.isupper():
            uppercase_count += 1
    return uppercase_count - (number_count * 5)

def ratio_uppercase_to_all_characters(email):
    uppercase_count = count_uppercase(email)
    total_characters = len(email) if isinstance(email, str) else 0

    if total_characters == 0:
        return 0
    else:
        return uppercase_count / total_characters

df['uppercase_to_all_characters_ratio'] = df['body'].apply(ratio_uppercase_to_all_characters)
df

Unnamed: 0,body,label,uppercase_to_all_characters_ratio
0,"Date: Wed, NUMBER Aug NUMBER NUMBER:NUM...",0,0.053044
1,"Martin A posted:\nTassos Papadopoulos, the Gre...",0,0.044632
2,Man Threatens Explosion In Moscow \n\nThursday...,0,0.053974
3,Klez: The Virus That Won't Die\n \nAlready the...,0,0.051088
4,"> in adding cream to spaghetti carbonara, whi...",0,0.038988
...,...,...,...
2995,ABC's Good Morning America ranks it the #NUMBE...,1,0.078947
2996,HYPERLINK\n HYPERLINK\n HYPERL...,1,0.083481
2997,THANK YOU FOR SHOPPING WITH US\nGIFTS FOR ALL ...,1,0.180667
2998,The\n Famous\n eBay Mark...,1,0.058641


In [155]:
import pandas as pd

headers = pd.read_csv("email_headers.csv")

df['uppercase_to_all_characters_ratio'] = df['body'].apply(ratio_uppercase_to_all_characters)

df["excl_marks"] = df["body"].str.count("!")

extended_headers = pd.merge(headers, df[['uppercase_to_all_characters_ratio', "excl_marks", "label"]], left_index=True, right_index=True)

extended_headers

Unnamed: 0,From,Content-Type,Subject,To,Organization,User-Agent,Attachement,uppercase_to_all_characters_ratio,excl_marks,label
0,Robert Elz <kre@munnari.OZ.AU>,"text/plain; charset=""us-ascii""",Re: New Sequences Window,Chris Garrigues <cwg-dated-1030377287.06fa6d@D...,,,0,0.053044,0,0
1,Steve Burt <Steve_Burt@cursor-system.com>,"text/plain; charset=""US-ASCII""",[zzzzteana] RE: Alexander,"""'zzzzteana@yahoogroups.com'"" <zzzzteana@yahoo...",,,0,0.044632,2,0
2,Tim Chapman <timc@2ubh.com>,"text/plain; charset=""US-ASCII""",[zzzzteana] Moscow bomber,zzzzteana <zzzzteana@yahoogroups.com>,,,0,0.053974,2,0
3,Monty Solomon <monty@roscom.com>,"text/plain; charset=""us-ascii""",[IRR] Klez: The Virus That Won't Die,undisclosed-recipient:;,,,0,0.051088,0,0
4,Stewart Smith <Stewart.Smith@ee.ed.ac.uk>,"text/plain; charset=""US-ASCII""",Re: [zzzzteana] Nothing like mama used to make,zzzzteana@yahoogroups.com,Scottish Microelectronics Centre,Mozilla/5.0 (X11; U; SunOS sun4u; en-US; rv:1....,0,0.038988,2,0
...,...,...,...,...,...,...,...,...,...,...
2995,Rob <yelanotyami912@bot.or.th>,"text/html; charset=""iso-8859-1""",hurry,yyyy@neteze.com,,,0,0.078947,5,1
2996,bolttish@hotmail.com,"text/html; charset=""iso-8859-1""",Do you need a second MORTGAGE? ...,webmaster@ehlaw.com,,,0,0.083481,0,1
2997,Mary's Store <removeme@marysstore.com>,"multipart/related; boundary=""----=_NextPart_OO...",,yyyy@netnoteinc.com,,,1,0.180667,0,1
2998,eBayInternetMarketing@yahoo.com,"text/html; charset=""iso-8859-1""",Earn Your Fortune on eBay!,webmaster@efi.ie,,,0,0.058641,7,1


# c)
Trainieren eines ausgewählten Klassifikators auf den erweiterten Datensatz email_headers.csv

In [156]:
#Gewählter Klassifikator: Klassifikationsbaum
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X = extended_headers[["From", "Content-Type", "Subject", "To", "Organization", "User-Agent", "Attachement",'uppercase_to_all_characters_ratio', "excl_marks"]]
X_transformed = vectorizer.fit_transform(X.astype(str).sum(axis=1))

# d)
Teilen der Datensätze in Trainingsdaten (80%) und Testdaten (20%) mittels
der Methode sklearn.model_selection.train_test_split.

In [157]:
x_train, x_test, y_train, y_test = train_test_split(X_transformed, extended_headers["label"], test_size=0.2, random_state=5)

In [158]:
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier(random_state=5, criterion='gini', max_depth=3)
tree_clf.fit(x_train, y_train)
y_pred_tree = tree_clf.predict(x_test)
tree_accuracy = accuracy_score(y_test, y_pred_tree)
tree_f1 = f1_score(y_test, y_pred_tree)

# e)
Kombinieren der Klassifikatoren aus a) und c) zu einem übergeordneten Ensemble Klassifikator.

In [159]:
from sklearn.ensemble import VotingClassifier
naivebayes = MultinomialNB()
decisiontree = DecisionTreeClassifier()
voting_clf = VotingClassifier(estimators=[('naive_bayes', decisiontree), ('decision_tree', naivebayes)], voting='soft')
voting_clf.fit(x_train, y_train)
y_pred_ensemble = voting_clf.predict(x_test)
ensemble_accuracy = accuracy_score(y_test, y_pred_ensemble)
ensemble_f1 = f1_score(y_test, y_pred_ensemble)

# f)
Vergleichen der einzelnen Klassifikatoren aus a) und c) mit dem Ensemble Klassifikator aus e)
auf Basis von Trainings- und Testdaten. Nutzen Sie eine Evaluationsmetrik Ihrer Wahl.

In [160]:
print(f"Ensemble Klassifikator Genauigkeit: {ensemble_accuracy}")
print(f"Decision Tree Klassifikator Genauigkeit: {tree_accuracy}")
print(f"Naive Bayes Klassifikator Genauigkeit: {nb_accuracy} \n")

print(f"Ensemble Klassifikator F1-Score: {ensemble_f1}")
print(f"Decision Tree Klassifikator F1-Score: {tree_f1}")
print(f"Naive Bayes Klassifikator F1-Score: {nb_f1}")

Ensemble Klassifikator Genauigkeit: 0.9683333333333334
Decision Tree Klassifikator Genauigkeit: 0.9083333333333333
Naive Bayes Klassifikator Genauigkeit: 0.9833333333333333 

Ensemble Klassifikator F1-Score: 0.8972972972972973
Decision Tree Klassifikator F1-Score: 0.6357615894039735
Naive Bayes Klassifikator F1-Score: 0.9456521739130435
