# NLP Project: Reimplementation of 
## A Neural Probabilistic Language Model 
### *(Bengio et al., 2003)*

### Created by Dennis Linnert

------------------------------------------------------------------------------------------------------------------------------

Remark: To enable training of the neural network, I executed this juypter notebook using the GPU resources provided by Kaggle's online platform (https://www.kaggle.com/code) to utilize CUDA instead of running the training on my old laptop cpu.    

In [1]:
import nltk
from nltk.corpus import brown
from collections import Counter, defaultdict
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import numpy as np
import math
import matplotlib.pyplot as plt
from IPython.display import display, Math

In [2]:
# Download & load the Brown corpus (utilized in the paper)
#nltk.download('brown') #commented out due to warning message when corpus is already downloaded
corpus = brown.words()

# Find total number of words in the corpus
total_words = len(corpus)
print(f"Total words in the Brown corpus: {total_words}")

Total words in the Brown corpus: 1161192


## Preprocessing of Corpus

In [3]:
# Convert all words to lowercase
corpus = [word.lower() for word in corpus]

# Replace rare words with a single symbol like Bengio et al. (all words with frequency equal or lower than 3)
word_freq = Counter(corpus)
vocab = {word for word, freq in word_freq.items() if freq > 3}
vocab.add('<UNK>')

# Create word to index and index to word mappings
word_to_idx = {word: idx for idx, word in enumerate(vocab)}
word_to_idx['<UNK>'] = len(word_to_idx) - 1
idx_to_word = {idx: word for word, idx in word_to_idx.items()}

# Ensure all words are within the vocabulary range
corpus_indices = [word_to_idx.get(word, word_to_idx['<UNK>']) for word in corpus]

# Check the Vocabulary Size
vocab_size = len(word_to_idx)
print(f"Vocabulary size: {vocab_size}")

Vocabulary size: 16690


**Note**: 
In the paper, the Brown corpus utilized was slightly larger (1 181 041 words). Therefore to stick as close to the paper as possible, I will calculate the proportions from the numbers used in the paper (originally 800 000 words were used for training, 200 000 validation, rest of 181 041 for testing).

In [4]:
# Define proportions based on the original paper (See note above)
train_proportion = 800000 / 1181041
valid_proportion = 200000 / 1181041
test_proportion = 181041 / 1181041

# Calculate actual sizes based on proportions
train_size = int(train_proportion * total_words)
valid_size = int(valid_proportion * total_words)
test_size = total_words - train_size - valid_size

# Split the data into training, validation, and test sets
train_data = corpus_indices[:train_size]
valid_data = corpus_indices[train_size:train_size + valid_size]
test_data = corpus_indices[train_size + valid_size:]

print(f"Train size: {len(train_data)}, Validation size: {len(valid_data)}, Test size: {len(test_data)}")

Train size: 786554, Validation size: 196638, Test size: 178000


## Parameters from the Paper

In [5]:
# Parameters from the paper
context_size = 5  # as used in the papers best approach (n = 5)
embedding_dim = 30 #  m = 30 
hidden_dim = 100 # h = 100
epochs = 15 # On the brown corpus they used 10 to 20 epochs
learning_rate = 0.001 

## Neural Probabilistic Language Model

In [6]:
class NeuralProbabilisticLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(NeuralProbabilisticLanguageModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim) # Embedding layer (distributed word representations as vector)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim) # First linear layer
        self.linear2 = nn.Linear(hidden_dim, vocab_size) # Second linear layer

    def forward(self, inputs):
         # Forward pass: get embeddings, reshape, apply linear transformations and activations
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1)) # flat vector that represents word contexts
        hidden = torch.tanh(self.linear1(embeds))  # Applying tanh activation fucntion
        output = self.linear2(hidden)  # Linear transformation
        log_probs = F.log_softmax(output, dim=1)  # Softmax to get log probabilities
        return log_probs

# Initalise the model
model = NeuralProbabilisticLanguageModel(vocab_size, embedding_dim, context_size, hidden_dim)


## Loss function
#### as described in the paper, Negative Log-Likelihood. 

A more detailed description of the perplexity formula and calculation is displayed below in the chapter after the training function)

In [7]:
def negative_log_likelihood_loss(log_probs, target):
    return -log_probs[range(target.size(0)), target].mean()

# Define optimizer with weight decay as specified in the paper 
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-4)

## Define Dataset Class for Brown Corpus, Create Dataloader and Training Function

In [8]:
# Define the dataset class for the Brown corpus
class BrownDataset(Dataset):
    def __init__(self, data, context_size):
        self.data = data
        self.context_size = context_size

    def __len__(self):
        return len(self.data) - self.context_size

    def __getitem__(self, idx):
        # Return context and target for the current index
        return (torch.tensor(self.data[idx:idx + self.context_size], dtype=torch.long),
                torch.tensor(self.data[idx + self.context_size], dtype=torch.long))

# Create Dataloader for training data 
batch_size = 64
train_dataset = BrownDataset(train_data, context_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

#Training Function
def train(model, train_loader, loss_function, optimizer, epochs):
    model.train()
    epoch_losses = []  # List to store the loss for each epoch
    for epoch in range(epochs):
        total_loss = 0
        for i, (context, target) in enumerate(train_loader):
            context, target = context.to(device), target.to(device)

            # Zero the gradients
            optimizer.zero_grad()

            # Forward pass
            log_probs = model(context)

            # Compute loss
            loss = loss_function(log_probs, target)

            # Backward pass
            loss.backward()

            # Update parameters
            optimizer.step()

            # Sum up loss
            total_loss += loss.item()

        average_loss = total_loss / len(train_loader)  # Average loss for the epoch
        epoch_losses.append(average_loss)  # Store the average loss
        print(f'Epoch {epoch + 1}, Loss: {average_loss}')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
print(f'Using device: {device}')

# Train the model
train(model, train_loader, negative_log_likelihood_loss, optimizer, epochs)

Using device: cuda
Epoch 1, Loss: 9.330602554890856
Epoch 2, Loss: 8.07758345181036
Epoch 3, Loss: 7.542020863415848
Epoch 4, Loss: 7.243356521814407
Epoch 5, Loss: 7.0721296347671645
Epoch 6, Loss: 6.960651938754237
Epoch 7, Loss: 6.881320454688263
Epoch 8, Loss: 6.821043903755889
Epoch 9, Loss: 6.77289244183881
Epoch 10, Loss: 6.7329027452926695
Epoch 11, Loss: 6.698673285576655
Epoch 12, Loss: 6.668700559591258
Epoch 13, Loss: 6.642030216136339
Epoch 14, Loss: 6.61803004603933
Epoch 15, Loss: 6.596240571497707


## Perplexity Calculation

As explained in the paper, the perplexity of a language model is calculated using the following formula (also see loss calculation above):

$$
\text{Perplexity} = \exp\left( -\frac{1}{N} \sum_{i=1}^{N} \log P(w_i \mid w_{1:i-1}) \right)
$$

Where:
- $N$ is the total number of words.
- $P(w_i \mid w_{1:i-1})$ is the probability of the $i$-th word given its previous context.


In [9]:
def calculate_perplexity(data, model, context_size, batch_size):
    model.eval() 
    data_len = len(data) - context_size
    perplexity_sum = 0
    num_batches = 0

    with torch.no_grad():
        for i in range(0, data_len, batch_size):
            context_batch = []
            target_batch = []
            for j in range(i, min(i + batch_size, data_len)):
                context_batch.append(data[j:j + context_size])
                target_batch.append(data[j + context_size])

            context_batch = torch.tensor(context_batch, dtype=torch.long).to(device)
            target_batch = torch.tensor(target_batch, dtype=torch.long).to(device)

            log_probs = model(context_batch)
            loss = negative_log_likelihood_loss(log_probs, target_batch)

            perplexity_sum += torch.exp(loss).item()
            num_batches += 1

    return perplexity_sum / num_batches

# Calculate perplexity on validation and test sets
validation_perplexity = calculate_perplexity(valid_data, model, context_size, batch_size)
test_perplexity = calculate_perplexity(test_data, model, context_size, batch_size)

print(f'Validation Perplexity: {validation_perplexity}')
print(f'Test Perplexity: {test_perplexity}')

Validation Perplexity: 764.3944241504812
Test Perplexity: 718.8795194481878


## Create interpolated trigram model for comparison

In [10]:
class InterpolatedTrigramModel:
    def __init__(self, corpus):
        self.trigrams = defaultdict(Counter)
        self.bigrams = defaultdict(Counter)
        self.unigrams = Counter()
        self.total_words = 0
        self.train(corpus)

    def train(self, corpus):
        # First train the trigram, bigram and unigram counts for interpolated trigram model
        for i in range(2, len(corpus)):
            trigram = (corpus[i-2], corpus[i-1], corpus[i])
            bigram = (corpus[i-2], corpus[i-1])
            unigram = corpus[i-2]

            self.trigrams[bigram][corpus[i]] += 1
            self.bigrams[bigram[0]][bigram[1]] += 1
            self.unigrams[unigram] += 1
            self.total_words += 1

    def trigram_probability(self, word, context):
        bigram = context
        epsilon = 1e-10  # Small value to avoid zero probabilities
        if self.trigrams[bigram][word] > 0:
            return self.trigrams[bigram][word] / sum(self.trigrams[bigram].values())
        elif self.bigrams[bigram[0]][bigram[1]] > 0:
            return 0.4 * (self.bigrams[bigram[0]][bigram[1]] / sum(self.bigrams[bigram[0]].values()))
        else:
            return 0.1 * (self.unigrams[word] / self.total_words + epsilon)

# Train the interpolated trigram model
trigram_model = InterpolatedTrigramModel(train_data)

In [11]:
# Evaluate the interpolated trigram model on the validation set
def evaluate_trigram_model(trigram_model, data, context_size):
    log_prob_sum = 0
    epsilon = 1e-10  # Small value to avoid log(0)
    for i in range(context_size, len(data)):
        context = tuple(data[i-context_size:i])
        target = data[i]
        prob = trigram_model.trigram_probability(target, context[-2:])
        log_prob_sum += -math.log(prob + epsilon)
    perplexity = math.exp(log_prob_sum / (len(data) - context_size))
    return perplexity

# Calculate perplexity on validation and test data
trigram_validation_perplexity = evaluate_trigram_model(trigram_model, valid_data, context_size)
trigram_test_perplexity = evaluate_trigram_model(trigram_model, test_data, context_size)

print(f'Interpolated Trigram Validation Perplexity: {trigram_validation_perplexity}')
print(f'Interpolated Trigram Test Perplexity: {trigram_test_perplexity}')

Interpolated Trigram Validation Perplexity: 291.5492151600732
Interpolated Trigram Test Perplexity: 277.4435776519447


## Mixture model

In the Paper, a mixture model (neural model/trigram) had the best results

In [12]:
# Mixture function to combine neural and interpol. trigram model with a simple fixed weight of 0.5
def mixture_model_probability(context, word, model, trigram_model, alpha=0.5):
    context_tensor = torch.tensor(context).unsqueeze(0).to(device)
    neural_log_probs = model(context_tensor)
    neural_prob = torch.exp(neural_log_probs).cpu().detach().numpy()[0][word]
    trigram_prob = trigram_model.trigram_probability(word, (context[-2], context[-1]))
    return alpha * neural_prob + (1 - alpha) * trigram_prob

In [13]:
# Calculate perplexity for mixture model
def mixture_model_perplexity(data, model, trigram_model, context_size, batch_size, alpha=0.5):
    model.eval()  # Set the model to evaluation mode
    data_len = len(data) - context_size
    perplexity_sum = 0
    num_batches = 0

    with torch.no_grad():
        for i in range(0, data_len, batch_size):
            context_batch = []
            target_batch = []
            for j in range(i, min(i + batch_size, data_len)):
                context_batch.append(data[j:j + context_size])
                target_batch.append(data[j + context_size])

            context_batch = torch.tensor(context_batch, dtype=torch.long).to(device)
            target_batch = torch.tensor(target_batch, dtype=torch.long).to(device)

            log_probs = model(context_batch)
            neural_probs = torch.exp(log_probs).cpu().detach().numpy()

            # Calculate mixture probabilities
            mixture_probs = []
            for k in range(context_batch.size(0)):
                context = context_batch[k].cpu().numpy()
                target = target_batch[k].item()
                mixture_prob = mixture_model_probability(context, target, model, trigram_model, alpha)
                mixture_probs.append(mixture_prob)

            mixture_probs = torch.tensor(mixture_probs)
            loss = -torch.log(mixture_probs).mean()
            perplexity_sum += torch.exp(loss).item()
            num_batches += 1

    return perplexity_sum / num_batches

mixture_validation_perplexity = mixture_model_perplexity(valid_data, model, trigram_model, context_size, batch_size)
mixture_test_perplexity = mixture_model_perplexity(test_data, model, trigram_model, context_size, batch_size)

print(f'Mixture Model Validation Perplexity: {mixture_validation_perplexity}')
print(f'Mixture Model Test Perplexity: {mixture_test_perplexity}')

Mixture Model Validation Perplexity: 182.98508888003227
Mixture Model Test Perplexity: 174.8710499575007


## Test Perplexity Comparison

Comparison of test perplexities for the three models

In [14]:
latex_table = f"""
\\begin{{array}}{{|c|c|c|}}
\\hline
\\text{{Model}} & \\text{{Test Perplexity}} & \\text{{Bengio et al.}} \\\\
\\hline
\\text{{Neural Model}} & {round(test_perplexity)} & 268 \\\\
\\hline
\\text{{Interpolated Trigram Model}} & {round(trigram_test_perplexity)} & 312 \\\\
\\hline
\\text{{Mixture Model}} & {round(mixture_test_perplexity)} & 252 \\\\
\\hline
\\end{{array}}
"""

display(Math(latex_table))

<IPython.core.display.Math object>

In summary, the reimplemented model was mostly able to create similar perplexity scores as observed in the paper, especially the mixed model.

Nevertheless, the neural model without mixture unfortunately achieves a significantly higher perplexity, I assume differences in the dataset (as the utilized brown corpus from the nltk library did deviate from the descriptions from the authors) and further specifications of the authors that they did not explicitly mention.


------------------------------------------------------------------------------------------------------------------------------

## Extra
As this paper is over 20 years old, there are new ways to handle the problem (like using a ReLU activation function to mitigate the possible issue of gradient vanishing or using adaptive learning rates like adam). But as the task is to reimplement the paper, I will try to adjust hyperparameters only.  

In the following, the best hyperparameter combination of my experiments is displayed, where I changed the learning rate and weight decay along with number of epochs, batch_size, embedding dimension and hidden dimension:

In [15]:
# Trying different hyperparameters:
learning_rate = 0.01
epochs = 40
batch_size = 256
embedding_dim = 100
hidden_dim = 200

# Redefine the model with adjusted parameters
class NeuralProbabilisticLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, context_size, hidden_dim):
        super(NeuralProbabilisticLanguageModel, self).__init__()
        self.embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.linear1 = nn.Linear(context_size * embedding_dim, hidden_dim)
        self.linear2 = nn.Linear(hidden_dim, vocab_size)

    def forward(self, inputs):
        embeds = self.embeddings(inputs).view((inputs.shape[0], -1))
        hidden = torch.tanh(self.linear1(embeds))
        output = self.linear2(hidden)
        log_probs = F.log_softmax(output, dim=1)
        return log_probs

# Reinitialize the model
model = NeuralProbabilisticLanguageModel(vocab_size, embedding_dim, context_size, hidden_dim).to(device)

# Reinitialize the DataLoader with new batch size
train_dataset = BrownDataset(train_data, context_size)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

# Reinitialize the optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay=1e-5)

# Retrain the model
train(model, train_loader, negative_log_likelihood_loss, optimizer, epochs)
# Calculate perplexity on test set for the new neural model
new_neural_test_perplexity = calculate_perplexity(test_data, model, context_size, batch_size)

print(f'Adjusted Neural Model Test Perplexity: {new_neural_test_perplexity}')

Epoch 1, Loss: 8.0648157741672
Epoch 2, Loss: 6.956512863148007
Epoch 3, Loss: 6.69932531092402
Epoch 4, Loss: 6.560917732500696
Epoch 5, Loss: 6.470315258083225
Epoch 6, Loss: 6.404835165652552
Epoch 7, Loss: 6.35418316831505
Epoch 8, Loss: 6.313470383478693
Epoch 9, Loss: 6.279439462654915
Epoch 10, Loss: 6.250393284519293
Epoch 11, Loss: 6.224869589521642
Epoch 12, Loss: 6.2021346186877775
Epoch 13, Loss: 6.181478220439678
Epoch 14, Loss: 6.1628125923447215
Epoch 15, Loss: 6.145415593621167
Epoch 16, Loss: 6.129393770451892
Epoch 17, Loss: 6.114214642366756
Epoch 18, Loss: 6.100332642936023
Epoch 19, Loss: 6.086979032610202
Epoch 20, Loss: 6.074329304997711
Epoch 21, Loss: 6.062488515408854
Epoch 22, Loss: 6.051138480323322
Epoch 23, Loss: 6.040405781463812
Epoch 24, Loss: 6.030129662484871
Epoch 25, Loss: 6.0203173027882695
Epoch 26, Loss: 6.010881852684679
Epoch 27, Loss: 6.001757072618797
Epoch 28, Loss: 5.992955295495711
Epoch 29, Loss: 5.984464758814952
Epoch 30, Loss: 5.976335