# W2V MLP

### Imports

In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import optuna
import random

# import modules

from model.rnn import (
    PoemDataset,
    get_weighted_sampler,
    PoemRNNClassifier,
    train_epoch,
    evaluate,
    train_rnn,
    validate_rnn,
    save_model,
    load_model,
    BiLSTMWithAttention,
    plot_confusion_matrix_percent,
    sequence_vectorize,
)

from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import DataLoader
from spacy.lang.de.stop_words import STOP_WORDS

import mlflow
import mlflow.pytorch

seed = 42
torch.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
optuna.logging.set_verbosity(optuna.logging.WARNING)

# If using CUDA
torch.cuda.manual_seed_all(seed)

# Define experiment
mlflow.set_experiment("W2V")
mlflow.pytorch.autolog()


%load_ext autoreload
%autoreload 2

Using device: cpu


2025/05/06 23:32:30 INFO mlflow.tracking.fluent: Experiment with name 'W2V' does not exist. Creating a new experiment.


### Loading data into a Dataframe (df)

In [6]:
df = pd.read_parquet("data/de_poems.parquet")
df = df.astype(
    {"title": "string", "text": "string", "author": "string", "creation": "int64"}
)
df.dtypes


title       string[python]
text        string[python]
author      string[python]
creation             int64
dtype: object

In [7]:
# Convertir les dates en siècles, en gérant les valeurs None
# D'abord, créer une copie pour ne pas modifier l'original
df_siecles = df.copy()

# Convertir uniquement les valeurs non-nulles en entiers puis en siècles
# Assurons-nous que toutes les valeurs sont soit des nombres, soit des NaN
df_siecles["creation"] = pd.to_numeric(df_siecles["creation"], errors="coerce")

# Maintenant, appliquer la conversion en siècles seulement sur les valeurs non-nulles
mask = df_siecles["creation"].notna()
df_siecles.loc[mask, "creation"] = df_siecles.loc[mask, "creation"].apply(
    lambda x: (x - 1) // 100 + 1
)

# Remplacer df par df_siecles pour conserver les modifications
df = df_siecles

In [8]:
# Filtrer les lignes où creation est None
df_none_creation = df[df["creation"].isnull()]


# Grouper par author pour voir combien de poèmes chaque author a avec creation = None
authors_avec_none = (
    df_none_creation.groupby("author").size().sort_values(ascending=False)
)

# Pour chaque author ayant des poèmes sans date, vérifier s'il a aussi des poèmes avec date
authors_a_corriger = []
for author in df_none_creation["author"].unique():
    dates_disponibles = df[df["author"] == author]["creation"].dropna().unique()
    if len(dates_disponibles) > 0:
        authors_a_corriger.append((author, dates_disponibles))


# Fonction pour remplir les dates manquantes avec la médiane des dates de l'author
def remplir_dates_manquantes(df):
    df_copy = df.copy()

    for author in df[df["creation"].isna()]["author"].unique():
        dates = df[df["author"] == author]["creation"].dropna()
        if len(dates) > 0:
            # Utiliser la médiane des dates disponibles pour cet author
            date_mediane = int(np.median(dates))
            # Remplir les valeurs manquantes pour cet author
            mask = (df_copy["author"] == author) & (df_copy["creation"].isna())
            df_copy.loc[mask, "creation"] = date_mediane

    return df_copy


# Appliquer la fonction pour remplir les dates manquantes
df_corrige = remplir_dates_manquantes(df)

# Vérifier combien de dates ont été corrigées
nb_corriges = len(df) - df_corrige["creation"].isna().sum()
nb_restants = df_corrige["creation"].isna().sum()

print(f"\nNombre de dates corrigées: {nb_corriges - (len(df) - len(df_none_creation))}")
print(f"Nombre de poèmes restants sans date: {nb_restants}")

df = df_corrige.dropna()


Nombre de dates corrigées: 0
Nombre de poèmes restants sans date: 0


### Tokenizing text

#### NLTK

In [9]:
nltk.download("punkt_tab")

def preprocess_text_wordTokenize(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^\w\s]+", "", text)
    tokens = word_tokenize(text)
    return tokens

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/erwinrodrigues/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
nltk.download("wordnet")

lemmatizer = WordNetLemmatizer()

def preprocess_text_lemmatize(text):
    text = re.sub(r"[^\w\s'-]+", "", text)
    tokens = word_tokenize(re.sub(r"[^\w\s]+", "", text.lower()))
    return [lemmatizer.lemmatize(token) for token in tokens]


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/erwinrodrigues/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
def preprocess_text_stemmer(text):
    text = re.sub(r"[^\w\s'-]+", "", text)
    tokens = word_tokenize(re.sub(r"[^\w\s]+", "", text.lower()))
    stemmer = SnowballStemmer("german")
    return [stemmer.stem(token) for token in tokens]

In [12]:
df["tokens"] = df["text"].apply(preprocess_text_stemmer)


#### Spacy

In [32]:
nlp = spacy.load("de_core_news_sm")
# python -m spacy download de_core_news_sm


def preprocess_text_spacy(doc):
    tokens = [token.lemma_ for token in doc if not token.is_punct]
    return tokens


def preprocess_text_spacy_stopwords(doc):
    filtered_tokens = [
        token.lemma_
        for token in doc
        if not token.is_punct and token.lemma_.lower() not in STOP_WORDS
    ]
    return filtered_tokens


In [33]:
texts = df["text"].tolist()
docs = list(nlp.pipe(texts, batch_size=1000, n_process=4))

df["tokens"] = [preprocess_text_spacy_stopwords(doc) for doc in docs]

In [34]:
df.to_csv("spacy_tokenized.csv")

In [None]:
df = pd.read_csv("spacy_tokenized.csv")

### Training Word 2 Vec on the tokenized texts

In [None]:
tokenized_poems = df["tokens"].tolist()
model_df = Word2Vec(tokenized_poems, vector_size=100, window=5, min_count=2, workers=4)

model_df.save("word2vec_df.model")

#### W2V Boosted

In [35]:
model_df = Word2Vec(
    sentences=df["tokens"],
    vector_size=500,
    window=20,
    min_count=2,
    workers=32,
    epochs=50,
    seed=42,
)

model_df.save("word2vec_boosted2.model")

In [20]:
model_df = Word2Vec.load("word2vec_boosted2.model")

## MLP

### Embedding Text && Encoding Century

In [38]:
def text_to_embedding(tokens, model, embedding_dim=100):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if len(vectors) == 0:
        return np.zeros(embedding_dim)
    return np.mean(vectors, axis=0)

def vectorize(tokens, model):
    vectors = [model.wv[word] for word in tokens if word in model.wv]
    if not vectors:
        return np.zeros(model.vector_size * 2)
    vectors = np.array(vectors)
    mean_vec = vectors.mean(axis=0)
    max_vec = vectors.max(axis=0)
    return np.concatenate([mean_vec, max_vec])


df["embedding"] = df["tokens"].apply(lambda x: text_to_embedding(x, model_df))
#df["embedding"] = df["tokens"].apply(lambda x: vectorize(x, model_df))

In [39]:
from sklearn.preprocessing import StandardScaler

label_encoder = LabelEncoder()
df.loc[:, "creation"] = label_encoder.fit_transform(df["creation"])

x = torch.tensor(np.stack(df["embedding"].values), dtype=torch.float32)
y = torch.tensor(df["creation"].values, dtype=torch.long)

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

scaler = StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

### Creating the data loaders

In [40]:
train_dataset = TextDataset(x_train, y_train)
val_dataset = TextDataset(x_test, y_test)

batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

### Creating model and hyperparameters

In [43]:
embedding_dim = 500
hidden_size = 128
num_classes = len(df["century_label"].unique())

class_counts = np.bincount(df["century_label"])
class_weights = 1.0 / class_counts

def initialize_weights(m):
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight)
        if m.bias is not None:
            nn.init.zeros_(m.bias)

model = LanguageModelNN(embedding_dim, hidden_size, num_classes)
# model = LanguageModelNN_Dropout(num_classes)
model.apply(initialize_weights)

loss_fn = nn.CrossEntropyLoss()
# loss_fn = nn.CrossEntropyLoss(weight=torch.tensor(class_weights, dtype=torch.float32))

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
# optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-4)
# optimizer = optim.AdamW(model.parameters(), lr=0.001, weight_decay=1e-4)


### Training and Validating model

In [44]:
mlflow.end_run()    
mlp_training_time = train_mlp(model, train_loader, loss_fn, optimizer, num_epochs=200)

Epoch 1/200, Loss: 0.5531
Epoch 2/200, Loss: 0.4196
Epoch 3/200, Loss: 0.3645
Epoch 4/200, Loss: 0.3238
Epoch 5/200, Loss: 0.2891
Epoch 6/200, Loss: 0.2596
Epoch 7/200, Loss: 0.2359
Epoch 8/200, Loss: 0.2121
Epoch 9/200, Loss: 0.1939
Epoch 10/200, Loss: 0.1782
Epoch 11/200, Loss: 0.1634
Epoch 12/200, Loss: 0.1504
Epoch 13/200, Loss: 0.1411
Epoch 14/200, Loss: 0.1289
Epoch 15/200, Loss: 0.1198
Epoch 16/200, Loss: 0.1096
Epoch 17/200, Loss: 0.1059
Epoch 18/200, Loss: 0.0965
Epoch 19/200, Loss: 0.0913
Epoch 20/200, Loss: 0.0859
Epoch 21/200, Loss: 0.0819
Epoch 22/200, Loss: 0.0767
Epoch 23/200, Loss: 0.0715
Epoch 24/200, Loss: 0.0707
Epoch 25/200, Loss: 0.0672
Epoch 26/200, Loss: 0.0643
Epoch 27/200, Loss: 0.0608
Epoch 28/200, Loss: 0.0596
Epoch 29/200, Loss: 0.0546
Epoch 30/200, Loss: 0.0584
Epoch 31/200, Loss: 0.0528
Epoch 32/200, Loss: 0.0507
Epoch 33/200, Loss: 0.0497
Epoch 34/200, Loss: 0.0469
Epoch 35/200, Loss: 0.0475
Epoch 36/200, Loss: 0.0471
Epoch 37/200, Loss: 0.0484
Epoch 38/2

In [47]:
save_mlp(model, label_encoder)

MLP model saved successfully to models/saved/mlp_model.pt


## RNN

In [6]:
df["sequence_embedding"] = df["tokens"].apply(lambda x: sequence_vectorize(x, model_df))

In [44]:
df = df[df["sequence_embedding"].apply(lambda x: len(x) > 0)].reset_index(drop=True)

In [7]:
print("Number of unique classes:", len(df["creation"].unique()))
print("Unique class labels:", df["creation"].unique())

print("Label range:", df["creation"].min(), "to", df["creation"].max())

Number of unique classes: 9
Unique class labels: [5 7 6 4 3 8 1 2 0]
Label range: 0 to 8


In [11]:
# Create a mapping from century to zero-indexed labels
century_to_idx = {
    century: idx for idx, century in enumerate(sorted(df["creation"].unique()))
}
print("Century to index mapping:", century_to_idx)

Century to index mapping: {0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5, 6: 6, 7: 7, 8: 8}


In [9]:
# Create new column with zero-indexed labels
df["century_label"] = df["creation"].map(century_to_idx)

In [10]:
print("New label range:", df["century_label"].min(), "to", df["century_label"].max())
print("Number of unique classes:", len(df["century_label"].unique()))

New label range: 0 to 8
Number of unique classes: 9


In [13]:
from sklearn.model_selection import train_test_split

X_embedded = df["sequence_embedding"].tolist()  # List of lists of word2vec vectors (np.ndarray)
y = df["century_label"].tolist()  # List of ints

# Set params
max_len = 50
embedding_dim = X_embedded[0][0].shape[0]
hidden_dim = 128
num_classes = len(set(y))
batch_size = 32
epochs = 100

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X_embedded, y, stratify=y, test_size=0.2, random_state=42
)

# Datasets
train_dataset = PoemDataset(X_train, y_train, max_len)
test_dataset = PoemDataset(X_test, y_test, max_len)

# Sampler for imbalance
sampler = get_weighted_sampler(y_train)
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)



In [14]:
from sklearn.utils.class_weight import compute_class_weight
import torch

# convert y from list to tensor
y = torch.tensor(y, dtype=torch.long)

# Calculate class weights based on the labels in your dataset
class_weights = compute_class_weight(
    "balanced", classes=torch.unique(y).numpy(), y=y.numpy()
)
class_weights = torch.tensor(class_weights, dtype=torch.float32)

# Create the loss function with class weights
loss_fn = nn.CrossEntropyLoss(weight=class_weights)


In [15]:
# Example of running the training loop
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BiLSTMWithAttention(
    input_dim=embedding_dim, hidden_dim=128, num_classes=num_classes, dropout=0.3
).to(device)

# Optimizer & Loss
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)

# Scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, mode="min", factor=0.5, patience=5, verbose=True
)



In [17]:
# Call train_model
model, rnn_training_time = train_rnn(
    model, train_loader, loss_fn, optimizer, scheduler, num_epochs=10
)

Epoch 1/10


KeyboardInterrupt: 

In [None]:
save_model(model, label_encoder)

## Evaluation

In [None]:
validate_rnn(model, val_loader, label_encoder, rnn_training_time)