# W2V MLP

### Imports

In [19]:
import pandas as pd
import numpy as np
import re
import torch
import torch.nn as nn
import torch.optim as optim
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import numpy as np
from rouge_score import rouge_scorer
from bert_score import score

# Relative path of the poems parquet file; it is passed to `pd.read_parquet`
# in the data-loading cell of this notebook.
DATA = "../../data/en_poems.parquet"

  from .autonotebook import tqdm as notebook_tqdm


### Model / Dataset

In [3]:
class PoemGeneratorMLP(nn.Module):
    """Single-hidden-layer MLP that scores every vocabulary entry.

    Pipeline: ``Linear(embedding_dim, hidden_size)`` -> ReLU -> Dropout ->
    ``Linear(hidden_size, vocab_size)``. The forward pass returns the raw
    output of the last linear layer (no softmax is applied here).

    Args:
        embedding_dim: Size of the last dimension of the input tensor.
        hidden_size: Width of the hidden layer.
        vocab_size: Number of output scores per input row.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, embedding_dim, hidden_size, vocab_size, dropout=0.5):
        super().__init__()
        # Keep the layer sizes around as plain attributes.
        self.embedding_dim, self.hidden_size, self.vocab_size = (
            embedding_dim,
            hidden_size,
            vocab_size,
        )

        # Registration order (fc1, dropout, fc2, relu) fixes the module
        # ordering; keep it stable.
        self.fc1 = nn.Linear(embedding_dim, hidden_size)
        self.dropout = nn.Dropout(dropout)
        self.fc2 = nn.Linear(hidden_size, vocab_size)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return one score per vocabulary entry for each row of ``x``."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(self.dropout(activated))
    
class PoemDataset(Dataset):
    """Map-style dataset of (current-word vector, next-word label) pairs.

    Args:
        sequences: Indexable collection whose items hold the current word at
            position 0 and the next word at position 1.
        word2vec_model: Object exposing ``.wv`` (supports ``word in wv`` and
            ``wv[word]``) and ``.vector_size``.
        vocab: Iterable of words; a word's position is its class label.
        unk_idx: Label returned when the next word is not in ``vocab``.
            Defaults to 0, which is also the label of ``vocab[0]``, so
            unknown targets are indistinguishable from that word. Pass a
            distinct value (e.g. ``-100``, the default ``ignore_index`` of
            ``nn.CrossEntropyLoss``) to keep such pairs out of the loss.
    """

    def __init__(self, sequences, word2vec_model, vocab, unk_idx=0):
        self.sequences = sequences
        self.word2vec_model = word2vec_model
        self.vocab = vocab
        self.unk_idx = unk_idx
        self.word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    def __len__(self):
        """Return the number of (current, next) pairs."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return ``(float32 vector of current word, int64 label of next word)``."""
        pair = self.sequences[idx]
        current_word = pair[0]
        next_word = pair[1]

        wv = self.word2vec_model.wv
        if current_word in wv:
            word_vector = wv[current_word]
        else:
            # Words without an embedding are fed to the model as all zeros.
            word_vector = np.zeros(self.word2vec_model.vector_size, dtype=np.float32)

        next_word_idx = self.word_to_idx.get(next_word, self.unk_idx)

        return (
            torch.tensor(word_vector, dtype=torch.float32),
            torch.tensor(next_word_idx, dtype=torch.long),
        )

In [4]:
def prepare_training_sequences(poems, window_size=2):
    """Turn raw poem texts into a flat list of adjacent-word pairs.

    Each poem is lower-cased, every character that is neither a word
    character nor whitespace is replaced by a space, the result is stripped
    and tokenized with ``word_tokenize``. Every token is then paired with
    the token that follows it.

    Args:
        poems: Iterable of poem strings.
        window_size: Accepted for interface compatibility; the body does not
            read it, pairs are always built from directly adjacent tokens.

    Returns:
        List of ``(token, following_token)`` tuples across all poems.
    """
    bigrams = []
    for text in poems:
        cleaned = re.sub(r"[^\w\s]", " ", text.lower()).strip()
        words = word_tokenize(cleaned)
        # zip stops at the shorter input, so the last token gets no pair.
        bigrams.extend(zip(words, words[1:]))
    return bigrams


In [38]:
def train_poem_generator(
    model, train_loader, test_loader, loss_fn, optimizer, num_epochs=50
):
    """Train ``model`` and evaluate it on ``test_loader`` after every epoch.

    The model is moved to CUDA when available, otherwise it stays on the CPU.
    After each epoch the average train and test losses are printed, and when
    the average test loss improves on the best one seen so far the model's
    ``state_dict`` is written to ``best_poem_model.pt`` in the working
    directory.

    Args:
        model: Module mapping a batch of inputs to per-class scores.
        train_loader: Iterable of ``(inputs, targets)`` batches used for
            optimisation; must support ``len()``.
        test_loader: Iterable of ``(inputs, targets)`` batches used for
            evaluation; must support ``len()``.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar
            tensor.
        optimizer: Optimizer bound to ``model``'s parameters.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        Tuple ``(train_losses, test_losses)`` with one average loss per epoch.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    train_losses, test_losses = [], []
    best_test_loss = float("inf")

    for epoch_no in range(1, num_epochs + 1):
        # ---- optimisation pass ----
        model.train()
        running_train = 0
        train_bar = tqdm(train_loader, desc=f"Epoch {epoch_no}/{num_epochs} [Train]")
        for inputs, targets in train_bar:
            inputs, targets = inputs.to(device), targets.to(device)
            batch_loss = loss_fn(model(inputs), targets)

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

            loss_value = batch_loss.item()
            running_train += loss_value
            train_bar.set_postfix({"loss": f"{loss_value:.4f}"})

        # Average over batches, not over samples.
        avg_train_loss = running_train / len(train_loader)
        train_losses.append(avg_train_loss)

        # ---- evaluation pass (no gradients, dropout disabled) ----
        model.eval()
        running_test = 0
        test_bar = tqdm(test_loader, desc=f"Epoch {epoch_no}/{num_epochs} [Test]")
        with torch.no_grad():
            for inputs, targets in test_bar:
                inputs, targets = inputs.to(device), targets.to(device)
                loss_value = loss_fn(model(inputs), targets).item()
                running_test += loss_value
                test_bar.set_postfix({"loss": f"{loss_value:.4f}"})

        avg_test_loss = running_test / len(test_loader)
        test_losses.append(avg_test_loss)

        print(f"Epoch {epoch_no}/{num_epochs}:")
        print(f"Average Training Loss: {avg_train_loss:.4f}")
        print(f"Average Test Loss: {avg_test_loss:.4f}")

        # Checkpoint only on a strict improvement of the test loss.
        if avg_test_loss < best_test_loss:
            best_test_loss = avg_test_loss
            torch.save(model.state_dict(), "best_poem_model.pt")

    return train_losses, test_losses


### Loading data 

In [6]:
# Load the poem corpus from the parquet file at DATA.
df = pd.read_parquet(DATA)

# Tokenize every poem in the "text" column: lower-case it, replace each
# character that is neither a word character nor whitespace with a space,
# strip the ends, then split with word_tokenize. The resulting token lists
# are stored in a new "tokens" column.
df["tokens"] = df["text"].apply(
    lambda x: word_tokenize(re.sub(r"[^\w\s]", " ", x.lower()).strip())
)

### W2V

In [None]:
# Build the Word2Vec model from the tokenized poems with 100-dimensional
# vectors (vector_size), window=5, min_count=2 and 4 workers.
word2vec_model = Word2Vec(
    sentences=df["tokens"],
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)
# NOTE(review): per the gensim documentation, passing `sentences` to the
# constructor already trains the model, so this explicit call appears to run
# 10 further epochs on top of that -- confirm the extra training is intended.
word2vec_model.train(df["tokens"], total_examples=len(df["tokens"]), epochs=10)

(116510229, 150443030)

In [8]:
# Class labels for the MLP: the Word2Vec vocabulary words, in the iteration
# order of `key_to_index` (a word's list position becomes its label).
vocabulary = list(word2vec_model.wv.key_to_index.keys())
# (current word, next word) pairs built from the raw poem texts.
sequences = prepare_training_sequences(df["text"].tolist())

### Model setup

In [9]:
# too long to run otherwise
# Keep only the first 1,000,000 (current, next) pairs.
sequences = sequences[:1000000]

# Hold out 10% of the pairs for testing; random_state fixes the split.
train_sequences, test_sequences = train_test_split(
    sequences, test_size=0.1, random_state=42
)

# Both datasets share the same Word2Vec model and label vocabulary.
train_dataset = PoemDataset(train_sequences, word2vec_model, vocabulary)
test_dataset = PoemDataset(test_sequences, word2vec_model, vocabulary)

# Only the training loader is shuffled.
batch_size = 1024
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [10]:
# Input size matches the Word2Vec vector_size (100) used above; the output
# layer has one unit per vocabulary word.
embedding_dim = 100
hidden_size = 256
vocab_size = len(vocabulary)

# Model, loss and optimizer handed to train_poem_generator.
model = PoemGeneratorMLP(embedding_dim, hidden_size, vocab_size, dropout=0.5)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

### Training

In [None]:
# Re-create model, loss and optimizer with the same settings as the model
# setup cell, so training starts from a freshly constructed model.
model = PoemGeneratorMLP(embedding_dim, hidden_size, vocab_size, dropout=0.5)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# NOTE(review): the training call in this notebook passes num_epochs=1
# explicitly, so this value is not what is actually used there.
num_epochs = 50

In [14]:
# Train for a single epoch. The call returns the per-epoch
# (train_losses, test_losses) lists, which the notebook displays as the
# cell result.
train_poem_generator(
    model, train_loader, test_loader, loss_fn, optimizer, num_epochs=1
)

Epoch 1/1 [Train]: 100%|██████████| 879/879 [03:45<00:00,  3.91it/s, loss=6.2231]
Epoch 1/1 [Test]: 100%|██████████| 98/98 [00:11<00:00,  8.29it/s, loss=6.3625]


Epoch 1/1:
Average Training Loss: 6.1852
Average Test Loss: 6.5589


([6.185246550046815], [6.558924494957437])

### Generation

In [15]:
def generate_poem(
    model, word2vec_model, vocabulary, max_length=50, temperature=0.7, start_word=None
):
    """Sample a word sequence from ``model``, one word at a time.

    Each step embeds the current word with ``word2vec_model``, feeds the
    vector to ``model``, divides the scores by ``temperature``, applies a
    softmax and samples the next word from the resulting distribution.
    After each step, with probability 0.1 the word used as input for the
    *following* step is replaced by a random vocabulary word (the
    replacement itself is not added to the poem).

    The model is switched to eval mode while sampling (so dropout is off)
    and its previous train/eval mode is restored afterwards. Input tensors
    are created on the device of the model's parameters, so the function
    works whether the model lives on the CPU or on a GPU.

    Args:
        model: ``torch.nn.Module`` mapping a ``(1, embedding_dim)`` tensor to
            ``(1, len(vocabulary))`` scores.
        word2vec_model: Object exposing ``.wv`` (supports ``word in wv`` and
            ``wv[word]``).
        vocabulary: Sequence of words; index ``i`` is the word for score ``i``.
        max_length: Maximum number of words sampled after the start word.
        temperature: Positive softmax temperature; lower is more greedy.
        start_word: First word of the poem; a random vocabulary word when
            ``None``.

    Returns:
        The generated words joined by single spaces. If the current word has
        no embedding (only possible for an out-of-vocabulary ``start_word``
        or a ``vocabulary`` containing words unknown to ``word2vec_model``),
        generation stops early.

    Raises:
        ValueError: If ``temperature`` is not positive, or a random word is
            needed and ``vocabulary`` is empty.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")

    def _random_word():
        # Sample an index rather than calling np.random.choice on the list:
        # that would convert the whole vocabulary to an array on every draw.
        return vocabulary[np.random.randint(len(vocabulary))]

    current_word = _random_word() if start_word is None else start_word
    generated_poem = [current_word]

    # Build inputs on the same device as the model's weights.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_length):
                if current_word not in word2vec_model.wv:
                    # Nothing can change in later iterations, so stop here.
                    break

                word_vector = word2vec_model.wv[current_word]
                input_vector = torch.tensor(
                    word_vector, dtype=torch.float32, device=device
                ).unsqueeze(0)

                probs = torch.softmax(model(input_vector) / temperature, dim=1)
                next_word_idx = torch.multinomial(probs, 1).item()
                next_word = vocabulary[next_word_idx]

                generated_poem.append(next_word)
                current_word = next_word

                # Occasionally condition the next step on a random word.
                if np.random.random() < 0.1:
                    current_word = _random_word()
    finally:
        # Leave the model in the mode the caller had it in.
        model.train(was_training)

    return " ".join(generated_poem)


def format_poem(text, line_length=40):
    """Greedily wrap ``text`` into lines of at most ``line_length`` characters.

    Words (whitespace-separated) are never split: a single word longer than
    ``line_length`` is placed on a line of its own. Every line, including
    the first, may use the full ``line_length``, and no empty line is
    emitted before an over-long first word.

    Args:
        text: Text to wrap; runs of whitespace are treated as one separator.
        line_length: Maximum number of characters per line.

    Returns:
        The wrapped lines joined by newlines (empty string for blank input).
    """
    lines = []
    current_line = []
    current_length = 0  # length of " ".join(current_line)

    for word in text.split():
        if not current_line:
            # The first word on a line needs no separator and always fits.
            current_line = [word]
            current_length = len(word)
        elif current_length + 1 + len(word) <= line_length:
            current_line.append(word)
            current_length += 1 + len(word)
        else:
            lines.append(" ".join(current_line))
            current_line = [word]
            current_length = len(word)

    if current_line:
        lines.append(" ".join(current_line))

    return "\n".join(lines)

In [16]:
def generate_evaluation_poems(
    model, word2vec_model, vocabulary, num_poems=100, max_length=50
):
    """Generate ``num_poems`` poems with ``generate_poem`` and return them.

    The model is moved to CUDA when available (CPU otherwise) and put in
    eval mode before generation. Each poem uses ``generate_poem``'s default
    temperature and a random start word.

    Args:
        model: Trained module passed through to ``generate_poem``.
        word2vec_model: Embedding model passed through to ``generate_poem``.
        vocabulary: Sequence of words passed through to ``generate_poem``.
        num_poems: How many poems to generate.
        max_length: Forwarded as ``generate_poem``'s ``max_length``.

    Returns:
        List of generated poem strings, in generation order.
    """
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(target)
    model.eval()

    progress = tqdm(range(num_poems), desc="Generating poems")
    return [
        generate_poem(model, word2vec_model, vocabulary, max_length=max_length)
        for _ in progress
    ]

In [37]:
# Sample one poem starting from the word "far" (up to 100 further words,
# temperature 0.7), wrap it with format_poem's default line length and
# print it.
generated_text = generate_poem(
    model,
    word2vec_model,
    vocabulary,
    max_length=100,
    temperature=0.7,
    start_word="far",
)
formatted_poem = format_poem(generated_text)
print(formatted_poem)

far off the song was laid them here
they are all the study and fly from
which seem the simple youth which flame
of the day we must be sure the happy our
fathers and the tramp a broad and green
the chase the sorcerer puppy in the sun
and an at the storm like down through
the world my passion and the whole child
was it is a week was the moons with a
woe and whiles when through with a year
when we know i mean the leafless the
ground from the winds rise and his you
ll be


## Evaluation

In [30]:
def evaluate_with_rouge(generated_poems, reference_poems):
    """
    Score generated poems against reference poems with ROUGE.

    Poems are paired positionally (``zip``), so pairing stops at the shorter
    of the two inputs. For every pair the F-measure of rouge1, rouge2,
    rougeL and rougeLsum is computed (with stemming enabled) and the values
    are averaged per metric.

    Returns a dict mapping each metric name to its mean F-measure.
    """
    metric_names = ["rouge1", "rouge2", "rougeL", "rougeLsum"]
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)

    # One list of per-pair F-measures per metric.
    per_metric = {name: [] for name in metric_names}

    pairs = tqdm(
        zip(generated_poems, reference_poems),
        total=len(generated_poems),
        desc="Calculating ROUGE scores",
    )
    for candidate, reference in pairs:
        # The reference goes first, the generated text second.
        pair_scores = scorer.score(reference, candidate)
        for name, values in per_metric.items():
            values.append(pair_scores[name].fmeasure)

    return {name: np.mean(values) for name, values in per_metric.items()}


In [None]:
def evaluate_with_bertscore(generated_poems, reference_poems):
    """
    Score generated poems against reference poems with BERTScore.

    Both inputs are converted to lists of ``str`` and passed to ``score``
    with ``lang="en"``. Returns a dict with the mean "Precision", "Recall"
    and "F1". This is best-effort: if scoring raises, the error is printed
    and all three values are reported as 0.0, so an all-zero result may
    mean "scoring failed" rather than "no similarity".
    """
    candidates = list(map(str, generated_poems))
    references = list(map(str, reference_poems))

    try:
        precision, recall, f1 = score(candidates, references, lang="en", verbose=True)
        summary = {
            "Precision": precision.mean().item(),
            "Recall": recall.mean().item(),
            "F1": f1.mean().item(),
        }
    except Exception as e:
        print(f"Error in BERTScore evaluation: {e}")
        summary = {"Precision": 0.0, "Recall": 0.0, "F1": 0.0}

    return summary


In [None]:
def evaluate_model(model, word2vec_model, vocabulary, test_poems, num_samples=100):
    """
    Evaluate the model by generating poems and comparing them with test poems.

    ``num_samples`` poems are generated from random start words (up to 50
    further words each, temperature 0.7) and paired positionally with
    ``num_samples`` distinct poems drawn at random from ``test_poems``. The
    pairs are scored with ROUGE and BERTScore, the scores are printed, and
    both result dicts are merged into the return value.

    Args:
        model: Trained module passed through to ``generate_poem``.
        word2vec_model: Embedding model passed through to ``generate_poem``.
        vocabulary: Sequence of words; also the pool of random start words.
        test_poems: Integer-indexable sequence of reference poems.
        num_samples: Number of generated/reference pairs to score.

    Returns:
        Dict with keys rouge1, rouge2, rougeL, rougeLsum, Precision, Recall
        and F1.

    Raises:
        ValueError: If ``num_samples`` exceeds ``len(test_poems)``; raised
            before any poem is generated.
    """
    # Fail fast: sampling without replacement cannot succeed otherwise, and
    # there is no point generating poems first.
    if num_samples > len(test_poems):
        raise ValueError(
            f"num_samples ({num_samples}) exceeds the number of test poems "
            f"({len(test_poems)})"
        )

    vocab_size = len(vocabulary)
    generated_poems = []
    for _ in tqdm(range(num_samples), desc="Generating poems for evaluation"):
        # Draw an index instead of np.random.choice(vocabulary), which would
        # convert the whole vocabulary to an array on every iteration.
        start_word = vocabulary[np.random.randint(vocab_size)]
        generated_poem = generate_poem(
            model,
            word2vec_model,
            vocabulary,
            max_length=50,
            temperature=0.7,
            start_word=start_word,
        )
        generated_poems.append(str(generated_poem))

    # Sample positions rather than the poems themselves: np.random.choice on
    # a list of strings first builds a fixed-width unicode array of every
    # poem, which is very memory-hungry for long texts.
    picked = np.random.choice(len(test_poems), num_samples, replace=False)
    selected_test_poems = [str(test_poems[i]) for i in picked]

    rouge_scores = evaluate_with_rouge(generated_poems, selected_test_poems)
    bert_scores = evaluate_with_bertscore(generated_poems, selected_test_poems)

    print("ROUGE Scores:")
    for metric in ["rouge1", "rouge2", "rougeL", "rougeLsum"]:
        print(f"{metric}: {rouge_scores[metric]:.4f}")

    print("\nBERTScore:")
    print(f"Precision: {bert_scores['Precision']:.4f}")
    print(f"Recall: {bert_scores['Recall']:.4f}")
    print(f"F1: {bert_scores['F1']:.4f}")

    return {**rouge_scores, **bert_scores}

In [31]:
# NOTE(review): the references are drawn from the full `df["text"]` column,
# the same texts the training pairs were built from, so this is not a
# held-out comparison -- confirm that is acceptable for the reported scores.
test_poems = df["text"].tolist()
# Generate 100 poems and score them against 100 randomly chosen references.
results = evaluate_model(model, word2vec_model, vocabulary, test_poems, num_samples=100)

Generating poems for evaluation: 100%|██████████| 100/100 [00:31<00:00,  3.16it/s]
Calculating ROUGE scores: 100%|██████████| 100/100 [00:00<00:00, 109.52it/s]
Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


100%|██████████| 4/4 [00:26<00:00,  6.62s/it]


computing greedy matching.


100%|██████████| 2/2 [00:00<00:00, 28.28it/s]

done in 26.57 seconds, 3.76 sentences/sec
ROUGE Scores:
rouge1: 0.1633
rouge2: 0.0111
rougeL: 0.1089
rougeLsum: 0.1559

BERTScore:
Precision: 0.7905
Recall: 0.7643
F1: 0.7771



