# Poem: Feedforward Neural Network

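We use a feedforward neural network to classify the poems. The model is a small multilayer perceptron with two hidden layers (128 and 64 units) followed by a softmax output layer. The input is the TF-IDF representation of the poem text, and the output is the period in which the poem was written (its century for Model A).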

This model requires less tweaking than others.

In [None]:
# Run this cell only if the NLTK stop-word corpus is not installed yet;
# it downloads the "stopwords" resource.
import nltk
nltk.download("stopwords")


# 🎓 Library

In [None]:
# Misc
import os
import numpy as np
import pandas as pd
import re


# Training
from nltk.corpus import stopwords

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
import keras
from keras.api.models import Model, Sequential
from keras.api.layers import Dense, Input, Dropout
from keras.api.optimizers import Adam
from keras.api.losses import SparseCategoricalCrossentropy
from keras.api.metrics import SparseCategoricalAccuracy

# Plot metrics
import plotly.express as px
import plotly.graph_objects as go

# Turn off XLA (JIT) compilation in the TensorFlow optimizer
tf.config.optimizer.set_jit(False)

# Query the GPUs once and report them when at least one is visible
gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    print(f"GPU available: {gpu_devices}")


# Plotting
import matplotlib.pyplot as plt 

# Project paths and the random seed constant
DATA = "../data/"
BENCHMARK_TABLE = "../../class_bench.parquet"
RANDOM_STATE = 42

# Fail fast when the data folder is missing. isdir() is used instead of
# exists() so that a regular file sitting at this path is rejected as well.
if not os.path.isdir(DATA):
    raise FileNotFoundError(f"Data directory {DATA} does not exist. Please create it and add the data files.")

In [None]:
# Load the German poem corpus. os.path.join keeps the path valid whether or
# not DATA ends with a path separator.
poems_df = pd.read_parquet(os.path.join(DATA, "de_poems.parquet"))

In [None]:
# Peek at the raw text of the first three poems
poems_df["text"].head(3).values

# Preprocessing

In [None]:
import sys
sys.path.append("../")
from env import get_period

# Work on a copy so the frame loaded from disk stays untouched
poems = poems_df.copy()

# Years are collapsed into coarser labels; raw years would give 304 classes
poems["creation"] = poems["creation"].astype(int)

# Model A label: the century as a string, e.g. 1850 -> "19"
poems["century"] = (poems["creation"] // 100 + 1).astype(str)

# Model B label: the value returned by the project helper `get_period`
poems["movement"] = poems["creation"].apply(get_period)

def preprocess(text: str) -> str:
    """Normalise a poem before vectorisation.

    The text is lowercased, every run of digits, non-word characters and
    underscores is replaced by a single space, and surrounding whitespace is
    dropped, so the result is the remaining words joined by single spaces.
    """
    lowered = text.lower()
    spaced = re.sub(r'[\d\W_]+', ' ', lowered)
    # split() without arguments discards empty pieces, which trims both ends
    return ' '.join(spaced.split())

# Store the normalised version of every poem next to the raw text
poems["cleaned_text"] = poems["text"].map(preprocess)

In [None]:
# Compare raw and cleaned text for the first three poems
poems[["text", "cleaned_text"]].head(3)

## Model creation and tuning

In [None]:
# German stop-word list from NLTK; these words are dropped from the vocabulary
german_stop_words = stopwords.words("german")

# Turn every cleaned poem into a TF-IDF feature vector.
# NOTE(review): the vectorizer is fitted on all poems, i.e. before the
# train/test split performed later in the notebook, so its statistics also
# include the test documents - confirm this is acceptable.
vectorizer = TfidfVectorizer(stop_words=german_stop_words)
x = vectorizer.fit_transform(poems["cleaned_text"])


# Model A: Per Century

Each poem is mapped to a century.

Our feature is the TF-IDF vector of `cleaned_text`, and the label is `century`.

## Step 1: Feature selection & class weight distribution

In [None]:
# Encode the century strings as integer class ids; `labeler.classes_` keeps
# the original labels, which are reused when reporting the results.
labeler = LabelEncoder()
y = labeler.fit_transform(poems["century"])
# Display both shapes to check that features and labels have the same row count
x.shape, y.shape

In [None]:
# One weight per encoded class, computed with scikit-learn's "balanced" mode
label_ids = np.unique(y)
class_weight_values = compute_class_weight(class_weight="balanced", classes=label_ids, y=y)

In [None]:
# Map each encoded class id to its weight, the form passed to `model.fit`
class_weights = {
    label_id: weight
    for label_id, weight in zip(np.unique(y), class_weight_values)
}

## Step 2: Data splitting and model training

In [None]:
# Hold out 20% of the poems for evaluation. The seed comes from the shared
# RANDOM_STATE constant instead of a second hard-coded 42, so changing the
# constant changes the split as well.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=RANDOM_STATE,
)

In [None]:
# Display the shapes of the four arrays produced by the split
x_train.shape, x_test.shape, y_train.shape, y_test.shape

In [None]:
# Network dimensions: one input per feature column, one output per encoded class
poem_dim = x_train.shape[1]
num_classes = np.unique(y).size

# Input tensor for the vectorised poems
poem_input = Input(shape=(poem_dim,), name="input")

# Dense stack, assembled layer by layer. Despite its name, the container also
# holds the softmax output layer, not only the hidden layers.
hidden_layers = Sequential(name="hidden_layers")
hidden_layers.add(Dense(128, activation="relu", name="dense_1"))
hidden_layers.add(Dropout(0.2, name="dropout_1"))
hidden_layers.add(Dense(64, activation="relu", name="dense_2"))
hidden_layers.add(Dense(num_classes, activation="softmax", name="output"))

# Symbolic output of the stack applied to the input tensor
poem_nn = hidden_layers(poem_input)

In [None]:
# Wrap the input tensor and the dense stack into a trainable model
model = Model(inputs=poem_input, outputs=poem_nn, name="poem_model")

# Labels are integer class ids, hence the sparse loss and the sparse accuracy
compile_settings = dict(
    optimizer=Adam(learning_rate=0.001),
    loss=SparseCategoricalCrossentropy(),
    metrics=[SparseCategoricalAccuracy()],
)
model.compile(**compile_settings)

# Print the layer overview
model.summary()

In [None]:
# Train with per-class weights so that rare centuries are not ignored.
# Early stopping now watches the validation loss: the training loss that was
# monitored before says nothing about overfitting, although validation data
# is supplied to fit().
# NOTE(review): the validation data is the test split, so the weights restored
# by early stopping are selected on the same data used for the final metrics;
# a separate validation split would give an unbiased test score.
hist = model.fit(
    x=x_train,
    y=y_train,
    validation_data=(x_test, y_test),
    epochs=6,
    batch_size=1024,
    callbacks=[
        keras.callbacks.EarlyStopping(
            monitor="val_loss",
            patience=3,
            restore_best_weights=True,
        )
    ],
    class_weight=class_weights,
)

In [None]:
# Plot the loss curves recorded by fit(), one line per dataset.
# The validation trace is added first to keep the original trace order.
fig = go.Figure()

for history_key, trace_name in (("val_loss", "Validation Loss"), ("loss", "Train Loss")):
    fig.add_trace(go.Scatter(
        y=hist.history[history_key],
        mode="lines",
        name=trace_name,
    ))

# The title names both curves; it previously mentioned only the validation
# loss although the training loss is drawn too.
# update_layout returns the figure, so as the last expression of the cell it
# also displays the plot.
fig.update_layout(
    title="Training and validation loss per epoch",
    xaxis_title="Epoch",
    yaxis_title="Loss",
    legend_title="Dataset",
    xaxis=dict(tickmode="linear"),
)

#### Quick save

In [None]:
# Create the output folder first so saving does not fail on a fresh checkout
# where "../trained" is missing.
os.makedirs("../trained", exist_ok=True)
model.save("../trained/ffnn_weighted.keras")

## Step 3: Evaluation

y_pred is a matrix of probabilities for each class.

We convert it to the label (encoded) with the highest probability.

#### Quick load

In [None]:
# Reload the model from the file written by the "Quick save" cell, so the
# evaluation below can run without retraining
model = keras.models.load_model("../trained/ffnn_weighted.keras")

### Prediction

In [None]:
# Class probabilities for every test poem
y_pred = model.predict(x_test)
# Keep the id of the most probable class in each row
y_pred_classes = y_pred.argmax(axis=1)

### Metrics

In [None]:
# classification_report returns a plain string by default, so indexing it with
# ["f1-score"] raised a TypeError. Print the whole per-class table instead; it
# contains the F1-score next to precision, recall and support.
# `labels` pins the report to every encoded class so that `target_names`
# always lines up, even when a class is missing from the test split.
print(
    classification_report(
        y_test,
        y_pred_classes,
        labels=np.arange(len(labeler.classes_)),
        target_names=labeler.classes_,
        zero_division=0,
    )
)

In [None]:
# Confusion matrix in absolute counts (rows: true class, columns: predicted).
# `labels` forces one row/column per encoded class; otherwise the matrix only
# covers classes that occur in y_test or y_pred_classes, and its size may not
# match the display labels.
cm = confusion_matrix(
    y_test,
    y_pred_classes,
    labels=np.arange(len(labeler.classes_)),
)

heat = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labeler.classes_)
fig, ax = plt.subplots()  # pass figsize=(width, height) here to enlarge the plot
heat.plot(ax=ax)
plt.xticks(rotation=45)
plt.show()

In [None]:
# Normalise row-wise so each cell is the percentage of its true class.
# A class without any true sample has a row total of 0; those rows are left at
# 0 instead of producing NaN from a 0/0 division.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm.astype('float'),
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
) * 100

# Plot the percentages with two decimals
disp = ConfusionMatrixDisplay(confusion_matrix=cm_normalized, display_labels=labeler.classes_)
disp.plot(values_format='.2f')
plt.title('Confusion Matrix (in %)')
plt.xticks(rotation=45)
plt.show()


In [None]:
# Share of test poems whose predicted class equals the true class
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred_classes)
print(f"Accuracy: {accuracy:.2f}")