# Poem: Logistic Regression

For this project, the text is the feature and the label is the date of the poem.

We use logistic regression for a multi-class text classification task.

Without any preprocessing of the dates, the model would have 304 target classes. We present the results of different models depending on how the labels are preprocessed.

In [None]:
# One-time setup: download the NLTK stop-word corpus (skip this cell if it is already installed)
import nltk
nltk.download("stopwords")


# 🎓 Library

In [46]:
# Misc
import os
import numpy as np
import pandas as pd
import pickle
import re

# Training
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score, ConfusionMatrixDisplay
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline


# Plotting
import matplotlib.pyplot as plt 

# Relative path of the directory holding the input data files
DATA = "../data/"
# Parquet file in which the benchmark metrics are stored
BENCHMARK_TABLE = "../../class_bench.parquet"
# Seed handed to SMOTE and the classifier
RANDOM_STATE = 42

# Fail early with an actionable message when the data directory is missing
if not os.path.exists(DATA):
    raise FileNotFoundError(f"Data directory {DATA} does not exist. Please create it and add the data files.")

In [None]:
# Load the poems dataset from the data directory
poems_df = pd.read_parquet(os.path.join(DATA, "de_poems.parquet"))

In [None]:
# Peek at the raw text of the first three poems
poems_df.head(3)["text"].values

# Preprocessing

In [None]:
import sys
sys.path.append("../")
from env import get_period

# Work on a copy so the loaded dataframe stays untouched
poems = poems_df.copy()

# Cast the creation date to int so it can be bucketed; using the raw dates would mean 304 classes
poems["creation"] = poems["creation"].astype(int)

# For model A: century label as a string, computed as creation // 100 + 1
# NOTE(review): a creation value that is a multiple of 100 (e.g. 1800) lands in the following century ("19") — confirm this is intended
poems["century"] = poems["creation"].apply(lambda x: str(x // 100 + 1))

# For model B: label returned by the project helper get_period (presumably a literary movement — see env module)
poems["movement"] = poems["creation"].apply(get_period)

def preprocess(text: str) -> str:
    """Normalise a text to lowercase words separated by single spaces.

    Digits, underscores and every non-word character (punctuation,
    whitespace, ...) act as separators and are discarded.

    Args:
        text: Raw text.

    Returns:
        The lowercased words joined by single spaces; an empty string when
        nothing but separators is present.
    """
    pieces = re.split(r'[\d\W_]+', text.lower())
    # Separators at either end of the text leave empty pieces behind; drop them.
    return ' '.join(piece for piece in pieces if piece)

# Store the normalised text in its own column; it is the input feature of the models
poems["cleaned_text"] = poems["text"].apply(preprocess)

In [None]:
# Compare raw and cleaned text side by side for the first three poems
poems.head(3)[["text", "cleaned_text"]]


## Model creation and tuning

When calling the `fit` method of the model, the y parameter will be different (either century or movement).

We use SMOTE to create synthetic samples for the under-represented classes (e.g. the 11th century).

This reduces accuracy, but increases F1 score.

In [None]:
# German stop-word list from NLTK, passed to the vectorizer below so these words are removed
german_stop_words = stopwords.words("german")

# Pipeline chaining TF-IDF encoding, SMOTE oversampling and the classifier.
# No need to call fit_transform ourselves, the pipeline does it.
pipeline = Pipeline([
    # Unigrams and bigrams; max_df and max_features restrict the vocabulary to keep the interesting features
    ("tfidf", TfidfVectorizer(stop_words=german_stop_words, ngram_range=(1, 2), max_df = 0.9, max_features=350000)),
    # Synthetic oversampling of the under-represented classes
    ("SMOTE", SMOTE(random_state=RANDOM_STATE)),
    # L2-penalised logistic regression using the SAG solver and balanced class weights
    ("clf", LogisticRegression(random_state = RANDOM_STATE, solver = "sag", penalty = "l2", max_iter = 100, verbose = 10, class_weight="balanced"))
])

# Input feature shared by both models
x = poems["cleaned_text"]
# y will be defined depending on the model

In [None]:
# Best overall configuration found: penalty = "l2", solver = "lbfgs", max_iter = 100, max_features = 350000, ngram_range = (1, 2)
# The same configuration with solver = "sag" gives better accuracy on the 19th century

# https://stackoverflow.com/questions/44066264/how-to-choose-parameters-in-tfidfvectorizer-in-sklearn-during-unsupervised-clust

# Hyperparameter grid. Every entry is currently commented out, so the grid holds a single
# empty setting and the search only cross-validates the pipeline exactly as it is configured.
# Uncomment entries to explore them.
param_grid = [
    {
    #    "tfidf__max_df": [0.9, 1.0],
    #    "tfidf__ngram_range": [(1, 1), (1, 2), (1, 3)],
    #    "tfidf__max_features": [200000, 250000, 300000, 350000],
    #    "clf__class_weight": [None, "balanced"],
    #    "clf__class_weight": ["balanced", None],
    #    "clf__solver": ["lbfgs", "sag"],
    #    "clf__penalty": ["l1", "l2"],
    #    "clf__max_iter": [100, 350]
    }
]

# 3-fold cross-validated search over param_grid, using 2 parallel jobs
clf = GridSearchCV(pipeline, param_grid, n_jobs = 2, cv = 3, verbose = True)


# Model A: SMOTE, Per Century

Each poem is mapped to a century.

Our feature is `text`, and the label is `century`.

## Step 1: Feature selection

In [None]:
# Model A target: the century label of each poem
y = poems["century"]

In [None]:
# Report the dataset size and the number of distinct target labels
print(f"Currently have {len(poems)} poems with {len(y.unique())} dates.")

## Step 2: Data splitting and model training

In [None]:
# Hold out 20% of the poems for evaluation; reuse the notebook-wide seed rather than a second hard-coded 42
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, random_state = RANDOM_STATE)

In [None]:
# Sanity-check the sizes of the train/test splits
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Run the cross-validated search on the training split with the century labels
clf.fit(x_train, y_train)

In [None]:
# Keep the best pipeline found by the search and display it
century_model = clf.best_estimator_
century_model

In [None]:
# Inspect the full parameter set of the selected pipeline
century_model.get_params()

#### Quick save

In [None]:
# Persist the fitted pipeline; the context manager closes the file even if pickling fails
with open("../trained/LR_Century_model_SMOTE_SAG.pkl", "wb") as model_file:
    pickle.dump(century_model, model_file)

## Step 3: Evaluation

#### Quick load

In [29]:
# Restore a previously trained pipeline so the evaluation can run without retraining.
# NOTE(review): this reads the "lbfgs" file while the quick-save cell writes the "SAG" file — confirm which model is meant.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open("../trained/LR_Century_model_SMOTE_lbfgs.pkl", "rb") as model_file:
    century_model = pickle.load(model_file)

### Prediction

In [30]:
# Predict a century label for every held-out poem
y_pred = century_model.predict(x_test)

### Metrics

In [40]:
# Human-readable per-class report
print(classification_report(y_test, y_pred, zero_division=0))

# Same report as a dict, used to pull out the support-weighted averages
results = classification_report(y_test, y_pred, zero_division=0, output_dict=True)

recall_avg, f1_score_avg, precision_avg = (
    results["weighted avg"][metric] for metric in ("recall", "f1-score", "precision")
)

              precision    recall  f1-score   support

          11       0.34      0.58      0.43        55
          13       0.73      0.44      0.55        18
          14       0.75      0.79      0.77       189
          16       0.95      0.84      0.89       210
          17       0.91      0.90      0.90      4032
          18       0.71      0.74      0.72      2880
          19       0.87      0.78      0.82      6476
          20       0.32      0.71      0.44       454

    accuracy                           0.80     14314
   macro avg       0.70      0.72      0.69     14314
weighted avg       0.83      0.80      0.81     14314



In [None]:
# Compute confusion matrix. The row/column order is pinned to the model's classes so that it
# always lines up with display_labels, even when a class is absent from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=century_model.classes_)

heat = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=century_model.classes_)
fig, ax = plt.subplots()
heat.plot(ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Normalize it (row-wise percentages). A row without any true sample has a total of 0;
# such rows are left at 0 instead of turning into NaN through 0/0.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape),
    where=row_totals != 0,
) * 100

# Plot
disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=century_model.classes_)
disp.plot(values_format='.2f')  # values_format to control decimal places
plt.title('Confusion Matrix (in %)')
plt.xticks(rotation=45)
plt.show()


In [55]:
# Accuracy: share of held-out poems whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.80


In [52]:
# Use predict_proba to get probabilities for each class
y_pred_proba = century_model.predict_proba(x_test)

# Compute ROC AUC score (macro-averaged, one-vs-rest). The probability columns follow
# century_model.classes_, so that order is passed explicitly instead of being inferred from y_test.
avg_roc = roc_auc_score(
    y_test,
    y_pred_proba,
    average="macro",
    multi_class="ovr",
    labels=century_model.classes_,
)
print(f"ROC AUC: {avg_roc:.2f}")

ROC AUC: 0.97


### Saving for benchmarking

In [59]:
# Load the benchmark table and write this model's metrics into the ("TF-IDF", "Logistic Regression") row
benchmark = pd.read_parquet(BENCHMARK_TABLE)
benchmark.loc[("TF-IDF", "Logistic Regression"), ["Avg Recall", "Avg F1-Score", "Avg Precision", "Accuracy", "Avg AUC"]] = [
    recall_avg, f1_score_avg, precision_avg, accuracy, avg_roc
]


In [61]:
# Save the benchmark table back to the same file, keeping its index
benchmark.to_parquet(BENCHMARK_TABLE, index=True)

# Model B: SMOTE, Per Movement

Each poem is mapped to a German literary movement.

Our feature is `text`, and the label is `movement`.

## Step 1: Feature selection

In [None]:
# Model B target: the movement label of each poem
y = poems["movement"]

In [None]:
# Report the dataset size and the number of distinct movement labels
print(f"Currently have {len(poems)} poems with {len(y.unique())} movements.")

## Step 2: Data splitting and model training

In [None]:
# Hold out 20% of the poems for evaluation; reuse the notebook-wide seed rather than a second hard-coded 42
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=RANDOM_STATE)

In [None]:
# Sanity-check the sizes of the train/test splits
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Run the same cross-validated search, this time with the movement labels
clf.fit(x_train, y_train)

In [None]:
# Keep the best pipeline found by the search and display it
movement_model = clf.best_estimator_
movement_model

In [None]:
# Inspect the full parameter set of the selected pipeline
movement_model.get_params()

#### Quick save

In [None]:
# Persist the fitted pipeline; the context manager closes the file even if pickling fails
with open("../trained/LR_Movement_model.pkl", "wb") as model_file:
    pickle.dump(movement_model, model_file)

## Step 3: Evaluation and finetuning

#### Quick load

In [None]:
# Load just in case: restore the saved pipeline under movement_model, the name every
# evaluation cell uses, so evaluation also works in a session where training was skipped.
# Only unpickle files you trust: pickle.load can execute arbitrary code.
with open("../trained/LR_Movement_model.pkl", "rb") as model_file:
    movement_model = pickle.load(model_file)

In [None]:
# Predict a movement label for every held-out poem
y_pred = movement_model.predict(x_test)

In [None]:
# Per-class precision / recall / F1; zero_division=0 reports 0 for undefined scores
print(classification_report(y_test,y_pred, zero_division=0))

In [None]:
# Compute confusion matrix. The row/column order is pinned to the model's classes so that it
# always lines up with display_labels, even when a class is absent from both y_test and y_pred.
cm = confusion_matrix(y_test, y_pred, labels=movement_model.classes_)

heat = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=movement_model.classes_)
fig, ax = plt.subplots()
heat.plot(ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Normalize it (row-wise percentages). A row without any true sample has a total of 0;
# such rows are left at 0 instead of turning into NaN through 0/0.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape),
    where=row_totals != 0,
) * 100

# Plot
disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=movement_model.classes_)
disp.plot(values_format='.2f')  # values_format to control decimal places
plt.title('Confusion Matrix (in %)')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Accuracy: share of held-out poems whose predicted label equals the true label
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")