# Poem: Logistic Regression

For this project, the text is the feature and the label is the date of the poem.

We use logistic regression for a multi-class text classification task.

Without any preprocessing of the labels, the model would have 304 target classes (one per distinct creation date). We therefore present the results of several models, each using a different preprocessing of the labels.

In [None]:
# If you do not have stopwords
# One-time setup: fetch the NLTK "stopwords" corpus, which the vectorizer cells
# further down read through `stopwords.words("german")`.
import nltk
nltk.download("stopwords")


# 🎓 Library

In [None]:
# Misc
import os
import numpy as np
import pandas as pd
import pickle

# Training
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

# Plotting
import seaborn as sns
import matplotlib.pyplot as plt

# Folder holding the input files; every data path in this notebook is built from it.
DATA = "data/"

# Fail fast with an actionable message when the folder is missing.
if not os.path.exists(DATA):
    missing_data_msg = f"Data directory {DATA} does not exist. Please create it and add the data files."
    raise FileNotFoundError(missing_data_msg)

In [None]:
# Load the poem corpus from the data folder into a DataFrame.
poems_path = f"{DATA}de_poems.parquet"
poems_df = pd.read_parquet(poems_path)

In [None]:
# Peek at the raw text of the first three poems.
poems_df["text"].head(3).values

# Model A: Per Century

Each poem is mapped to a century.

Our feature is `text`, and the label is `creation`.

Only the features are encoded (as TF-IDF vectors); the labels are kept as plain strings.

## Step 1: Preprocessing

In [None]:
def _year_to_century(year):
    """Return the century bucket of *year* as a string label (e.g. 1850 -> "19").

    NOTE(review): years divisible by 100 land in the next bucket (1900 -> "20");
    confirm this is the intended convention.
    """
    return str(int(year) // 100 + 1)


# Collapse the creation dates into centuries; otherwise there are 304 classes.
# Work on a copy so the original frame stays untouched for the other models.
poems_century = poems_df.copy()
poems_century["creation"] = poems_century["creation"].apply(_year_to_century)

In [None]:
# TF-IDF features built from the poem text, with NLTK's German stop words removed.
german_stop_words = stopwords.words("german")
vectorizer = TfidfVectorizer(stop_words=german_stop_words)

# Labels: the century strings stored in the "creation" column.
y = poems_century["creation"]
# Features: learn the vocabulary on all poems and vectorize them in one pass.
x = vectorizer.fit_transform(poems_century["text"])

In [None]:
# Summary of the dataset after the century mapping. The label column holds
# centuries at this point, so the message reports "centuries" rather than "dates".
print(f"Currently have {len(poems_century)} poems with {len(y.unique())} centuries.")
print(f"Model has {len(vectorizer.get_feature_names_out())} features.")

## Step 2: Data splitting and model training

In [None]:
# Hold out 20% of the poems for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the four splits: train/test features, then train/test labels.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
# Logistic regression fitted with the SAGA solver; the seed is fixed for
# reproducibility and verbose=10 requests verbose solver output.
model = LogisticRegression(
    solver="saga",
    random_state=42,
    verbose=10,
)
model.fit(x_train, y_train)

#### Quick save

In [None]:
# Persist the fitted century model.
# Create the output folder first so the save does not fail on a fresh checkout.
os.makedirs("trained", exist_ok=True)
# The context manager guarantees the file is flushed and closed, even if pickling fails.
with open("trained/LR_Century_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
# NOTE(review): the fitted `vectorizer` is not saved; it is needed to transform
# new text before this model can predict on it.

## Step 3: Evaluation and finetuning

In [None]:
# Predict a label for every held-out sample in x_test.
y_pred = model.predict(x_test)

In [None]:
# Per-class precision/recall/F1; zero_division=0 reports undefined scores as 0.
report_text = classification_report(y_test, y_pred, zero_division=0)
print(report_text)

In [None]:
# Compute confusion matrix
# Pin the row/column order to model.classes_ so the matrix always has exactly one
# row/column per tick label, even when a class is missing from y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)

heat = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
fig, ax = plt.subplots()  # pass figsize=(w, h) here to adjust the figure size
heat.plot(ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Normalize it (row-wise percentages).
# Rows without any true sample are left at 0 instead of dividing by zero,
# which would fill the row with NaN and emit a RuntimeWarning.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype(float),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
) * 100

# Plot
disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=model.classes_)
disp.plot(cmap='Blues', values_format='.2f')  # values_format to control decimal places
plt.title('Confusion Matrix (in %)')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Overall share of test poems whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")

# Model B: Per Movement
Each poem is mapped to a German literary movement.

## Step 1: Preprocessing

In [None]:
from env import get_period


def _year_to_movement(year):
    """Map a creation year to its period via the project's `get_period` helper."""
    return get_period(int(year))


# Collapse the creation dates into periods; otherwise there are 304 classes.
# Work on a copy so the original frame stays untouched for the other models.
poems_movement = poems_df.copy()
poems_movement["creation"] = poems_movement["creation"].apply(_year_to_movement)

In [None]:
# TF-IDF features built from the poem text, with NLTK's German stop words removed.
german_stop_words = stopwords.words("german")
vectorizer = TfidfVectorizer(stop_words=german_stop_words)

# Labels: the period assigned to each poem in the "creation" column.
y = poems_movement["creation"]
# Features: learn the vocabulary on all poems and vectorize them in one pass.
x = vectorizer.fit_transform(poems_movement["text"])

In [None]:
# Summary of the dataset after the movement mapping.
n_poems = len(poems_movement)
n_movements = len(y.unique())
n_features = len(vectorizer.get_feature_names_out())

print(f"Currently have {n_poems} poems with {n_movements} movements.")
print(f"Model has {n_features} features.")

## Step 2: Data splitting and model training

In [None]:
# Hold out 20% of the poems for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Shapes of the four splits: train/test features, then train/test labels.
tuple(part.shape for part in (x_train, x_test, y_train, y_test))

In [None]:
# Logistic regression fitted with the SAGA solver; the seed is fixed for
# reproducibility and verbose=10 requests verbose solver output.
model = LogisticRegression(
    solver="saga",
    random_state=42,
    verbose=10,
)
model.fit(x_train, y_train)

#### Quick save

In [None]:
# Persist the fitted movement model.
# Create the output folder first so the save does not fail on a fresh checkout.
os.makedirs("trained", exist_ok=True)
# The context manager guarantees the file is flushed and closed, even if pickling fails.
with open("trained/LR_Movement_model.pkl", "wb") as model_file:
    pickle.dump(model, model_file)
# NOTE(review): the fitted `vectorizer` is not saved; it is needed to transform
# new text before this model can predict on it.

## Step 3: Evaluation and finetuning

In [None]:
# Predict a label for every held-out sample in x_test.
y_pred = model.predict(x_test)

In [None]:
# Per-class precision/recall/F1; zero_division=0 reports undefined scores as 0.
report_text = classification_report(y_test, y_pred, zero_division=0)
print(report_text)

In [None]:
# Compute confusion matrix
# Pin the row/column order to model.classes_ so the matrix always has exactly one
# row/column per tick label, even when a class is missing from y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=model.classes_)

heat = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=model.classes_)
fig, ax = plt.subplots()  # pass figsize=(w, h) here to adjust the figure size
heat.plot(ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Normalize it (row-wise percentages).
# Rows without any true sample are left at 0 instead of dividing by zero,
# which would fill the row with NaN and emit a RuntimeWarning.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype(float),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
) * 100

# Plot
disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=model.classes_)
disp.plot(values_format='.2f')  # values_format to control decimal places
plt.title('Confusion Matrix (in %)')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Overall share of test poems whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Accuracy: {accuracy:.2f}")