In [None]:
import pandas as pd
import os
import re
from gensim.models import Word2Vec
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sns

## LOAD DATA

In [None]:
# Read the poem corpus and pin the column dtypes right after loading.
poem_dtypes = {"title": "string", "text": "string", "author": "string", "creation": "int64"}
df = pd.read_parquet("../data/de_poems.parquet").astype(poem_dtypes)
df.dtypes

## PREPROCESS THE DATA

In [None]:
def preprocess_text(text: str) -> list[str]:
    """Lower-case ``text``, drop everything that is not a letter or whitespace, and split.

    The previous filter ``[^a-z\\s]`` also deleted non-ASCII letters, so German
    words lost their umlauts and eszett ("über" -> "ber", "straße" -> "strae").
    Letters of any script are now kept; punctuation, digits and underscores are
    still removed, so the result for pure-ASCII input is unchanged.

    Args:
        text: Raw text of one document.

    Returns:
        The whitespace-separated tokens that remain after filtering. Punctuation
        is deleted rather than replaced by a space, so "ruh'" becomes "ruh".
    """
    text = text.lower()
    # [^\w\s] strips punctuation/symbols; \w still admits digits and "_",
    # so those are removed explicitly by the second alternative.
    text = re.sub(r'[^\w\s]|[\d_]', '', text)
    return text.split()

# Century label: integer-divide the creation value by 100, then add one (e.g. 1850 -> 19).
df["century"] = df["creation"].floordiv(100).add(1)
# One token list per poem, produced by preprocess_text.
df["tokens"] = df["text"].apply(preprocess_text)
df

## Word2Vec model training & text vectorization

In [None]:
# Train the embedding on the tokenised poems.
# Earlier configuration tried: vector_size=500, epochs=30 (all other settings as below).
# NOTE(review): gensim's docs state that a fixed seed only yields fully reproducible
# vectors with workers=1 (and a fixed PYTHONHASHSEED); with workers=20 expect some
# run-to-run variation — confirm this is acceptable for the benchmark.
w2v_model = Word2Vec(
    sentences=df["tokens"],
    vector_size=750,
    window=25,
    min_count=2,
    workers=20,
    epochs=50,
    seed=42,
)

In [None]:
"""def vectorize(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) > 0:
        return np.mean(vectors, axis=0)
    else:
        return np.zeros(model.vector_size)"""
def vectorize(tokens, model):
    """Embed a token sequence as its mean-pooled and max-pooled word vectors, concatenated.

    Args:
        tokens: Iterable of tokens; tokens not contained in ``model.wv`` are skipped.
        model: Object exposing ``wv`` (supports ``token in wv`` and ``wv[token]``)
            and ``vector_size``.

    Returns:
        A 1-D array: the element-wise mean of the known tokens' vectors followed
        by their element-wise maximum. If no token is known, a zero vector of
        length ``2 * model.vector_size`` is returned instead.
    """
    known = []
    for token in tokens:
        if token in model.wv:
            known.append(model.wv[token])

    # Nothing to pool: fall back to zeros so every document still gets a vector.
    if len(known) == 0:
        return np.zeros(2 * model.vector_size)

    stacked = np.asarray(known)
    pooled_mean = np.mean(stacked, axis=0)
    pooled_max = np.max(stacked, axis=0)
    return np.concatenate((pooled_mean, pooled_max))

# One embedding per poem; Series.apply forwards the keyword argument to vectorize.
df["vector"] = df["tokens"].apply(vectorize, model=w2v_model)

## LOGISTIC REGRESSION TRAINING

In [None]:
# Feature matrix: one row per poem, stacked from the per-poem embedding arrays.
X = np.stack(df["vector"].to_numpy())
y = df["century"]

# Hold out 20% of the poems for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply that same scaling to both splits.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

clf = LogisticRegression(max_iter=5000, class_weight="balanced", random_state=42)
clf.fit(X_train, y_train)

In [None]:
# Evaluate the classifier on the held-out split.
y_pred = clf.predict(X_test)
report = classification_report(y_test, y_pred, output_dict=True)
print(classification_report(y_test, y_pred))

# Weighted-average metrics and overall accuracy, kept for the benchmark table.
avg_precision = report['weighted avg']['precision']
avg_recall = report['weighted avg']['recall']
avg_f1 = report['weighted avg']['f1-score']
accuracy = report['accuracy']

# for ROC AUC
# predict_proba's columns follow clf.classes_, so the true labels are binarized
# against that same list. Using np.unique(y_test) instead breaks (or silently
# misaligns the columns) whenever the train and test splits contain different
# sets of centuries.
# NOTE(review): assumes more than two classes — label_binarize returns a single
# column for a two-class problem.
y_proba = clf.predict_proba(X_test)
y_test_binarized = label_binarize(y_test, classes=clf.classes_)
# A class with no test samples has no defined one-vs-rest AUC; leave it out of
# the macro average instead of failing on it.
scorable = y_test_binarized.any(axis=0)
roc_auc = roc_auc_score(y_test_binarized[:, scorable], y_proba[:, scorable], average='macro', multi_class='ovr')

In [None]:
# Confusion matrix normalised over the true labels, shown in percent.
# labels=clf.classes_ pins the row/column order to the tick labels used below;
# without it the matrix only covers labels occurring in y_test or y_pred, so a
# class missing from both would shift the axes against clf.classes_.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_, normalize="true")
cm_percent = cm * 100
plt.figure(figsize=(12, 6))
sns.heatmap(cm_percent, annot=True, fmt=".1f", cmap="Blues",
            xticklabels=clf.classes_, yticklabels=clf.classes_)
plt.title("Confusion Matrix")
plt.ylabel("True Century")
plt.xlabel("Predicted Century")
plt.show()

In [None]:

BENCHMARK_TABLE = "../../class_bench.parquet"
benchmark = pd.read_parquet(BENCHMARK_TABLE)

# Metrics of this run, keyed by the benchmark table's column names.
run_metrics = {
    "Avg Recall": avg_recall,
    "Avg F1-Score": avg_f1,
    "Avg Precision": avg_precision,
    "Accuracy": accuracy,
    "Avg AUC": roc_auc,
}
# Write them into the (embedding, classifier) row and persist the table in place.
benchmark.loc[("Word2Vec", "Logistic Regression"), list(run_metrics)] = list(run_metrics.values())
benchmark.to_parquet(BENCHMARK_TABLE, index=True)

In [None]:
# Display the benchmark table, including the Word2Vec / Logistic Regression row set in the previous cell.
benchmark