In [5]:
%%capture
import warnings
from tqdm import tqdm

warnings.simplefilter('ignore')
import time
from collections import OrderedDict

import re

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import random

import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import string
import time
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from nltk.tokenize import word_tokenize

import matplotlib.pyplot as plt
from sklearn.manifold import TSNE

# You can also use this section to suppress warnings generated by your code:
def warn(*args, **kwargs):
    pass
import warnings
warnings.warn = warn
%capture


##  Feedforward Neural Networks (FNNs) for language models

FNNs, or Multi-Layer Perceptrons, serve as the foundational components for comprehending neural networks in natural language processing (NLP). In NLP tasks, FNNs process textual data by transforming it into numerical vectors known as embeddings. Subsequently, these embeddings are input to the network to predict language facets, such as the upcoming word in a sentence or the sentiment of a text.

Let's consider the following song lyrics for our analysis.

In [6]:
song= """We are no strangers to love
You know the rules and so do I
A full commitments what Im thinking of
You wouldnt get this from any other guy
I just wanna tell you how Im feeling
Gotta make you understand
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you
Weve known each other for so long
Your hearts been aching but youre too shy to say it
Inside we both know whats been going on
We know the game and were gonna play it
And if you ask me how Im feeling
Dont tell me youre too blind to see
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you
Weve known each other for so long
Your hearts been aching but youre too shy to say it
Inside we both know whats been going on
We know the game and were gonna play it
I just wanna tell you how Im feeling
Gotta make you understand
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you
Never gonna give you up
Never gonna let you down
Never gonna run around and desert you
Never gonna make you cry
Never gonna say goodbye
Never gonna tell a lie and hurt you"""

# Tokenization for FNN

In [7]:
tokenizer = get_tokenizer("basic_english")
tokens = tokenizer(song)

In [8]:
def preprocess_string(s):

    """
    Preprocesses a given string by performing the following steps:

    1. Removes all non-word characters (excluding letters and numbers).
    2. Removes all whitespace characters.
    3. Removes all numeric digits.

    Parameters:
    s (str): The input string to be cleaned.

    Returns:
    str: The processed string with only alphabetic characters, no spaces, and no digits.
    """

    # Remove all non-word characters (everything except letters and numbers)
    # \w matches any word character (letters, numbers, and underscores)
    # \s matches any whitespace characters
    # ^ inside [] negates the selection, so [^\w\s] matches anything  that's NOT a word character or whitespace.
    s = re.sub(r"[^\w\s]", '', s)

    # Remove all whitespace characters (space, tabs, newlines)
    # \s+ matches one or more whitespaces characters
    s = re.sub(r"\s+", '', s)

    # remove all digits (0, 9)
    # \d matches any digit character
    s = re.sub(r"\d", '', s)

    return s

In [9]:
def preprocess(words):
    """
    Preprocesses a given text by tokenizing it, cleaning individual words, and
    converting them to lowercase while removing empty or punctuation tokens.

    Steps:
    1. Tokenization: Splits the input text into individual word tokens.
    2. Cleaning: Applies `preprocess_string()` to remove non-word characters,
       spaces, and digits from each token.
    3. Normalization: Converts all tokens to lowercase.
    4. Filtering: Removes empty strings and punctuation tokens.

    Parameters:
    words (str): The input text to be tokenized and preprocessed.

    Returns:
    list: A list of cleaned, lowercase tokens.
    """
    # Tokenize the input text into words
    tokens = word_tokenize(words)

    # Apply preprocessing to each token(remove unwanted characters)
    tokens = [preprocess_string(w) for w in tokens]

    # Convert tokens to lowercase and remove empty strings or punctuations
    return [w.lower() for w in tokens if len(w) != 0 and w not in string.punctuation]

# Example usage:
tokens = preprocess(song)

# Indexing

In [10]:
def tokenizetext(song):
    """
    Tokenizes the input text (song) and builds a vocabulary from the tokens.

    Steps:
    1. Tokenization: The function splits the input text into words and applies
       a tokenizer function to each word.
    2. Vocabulary Building: Constructs a vocabulary from the tokenized words,
       including a special "<unk>" token to handle out-of-vocabulary words.
    3. Default Indexing: Sets the default index for unknown words, ensuring
       that any unseen tokens are mapped to "<unk>".

    Parameters:
    song (str): The input text (song lyrics) to be tokenized and processed.

    Returns:
    vocab (Vocab): A vocabulary object mapping tokens to their corresponding indices.
    """
    # Tokenize the text
    # Split the input text into words and apply the tokenizer function to each word.
    # The 'map' function ensures that each word is tokenized properly.

    tokenized_song = map(tokenizer, song.split())

    # Build vocabulary from tokenized text
    # The function `build_vocab_from_iterator` constructs a vocabulary by iterating
    # over the tokenized words. The special token "<unk>" is added to handle words
    # that are not present in the vocabulary.

    vocab = build_vocab_from_iterator(tokenized_song, specials=['<unk>'])

     # Set the default index for unknown words
    # The default index is set to the index of "<unk>" so that any word not found
    # in the vocabulary is mapped to this token, preventing errors during lookup.
    vocab.set_default_index(vocab['<unk>'])

    return vocab

In [11]:
vocab = tokenizetext(song)
print(vocab(tokens[0: 10]))

[21, 58, 70, 74, 25, 69, 2, 20, 31, 72]


In [12]:
# text funct that converts raw text into indexes
text_pipeline = lambda x: vocab(tokenizer(x))
print(text_pipeline(song)[0: 10])

[21, 58, 70, 74, 25, 69, 2, 20, 31, 72]


In [13]:
# find the word corresponding to an index
index_to_token = vocab.get_itos()
print(index_to_token[58])

are


## Embedding Layers

An embedding layer is a crucial element in natural language processing (NLP) and neural networks designed for sequential data. It serves to convert categorical variables, like words or discrete indexes representing tokens, into continuous vectors. This transformation facilitates training and enables the network to learn meaningful relationships among words.

Let's consider a simple example involving a vocabulary of words

Vocabulary: {apple, banana, orange, pear}
Each word in your vocabulary has a unique index assigned to it:

Indices: {0, 1, 2, 3}
When using an embedding layer, you will initialize random continuous vectors for each index. For instance, the embedding vectors might look like:

Vector for index 0 (apple): [0.2, 0.8]
Vector for index 1 (banana): [0.6, -0.5]
Vector for index 2 (orange): [-0.3, 0.7]
Vector for index 3 (pear): [0.1, 0.4] In PyTorch, you can create an embedding layer.

In [14]:
def genembedding(vocab):
    """
    Generates an embedding layer for the given vocabulary.

    The embedding layer transforms words into dense vector representations,
    allowing the model to learn semantic relationships between words.

    Parameters:
    vocab (Vocab): The vocabulary object containing unique words and their indices.

    Returns:
    nn.Embedding: A PyTorch embedding layer with a specified embedding dimension.
    """

    # Define the embedding dimensions (size of word vectors)
    embedding_dim = 20  # Each word will be represented as a 20-dimensional vector

    # Get the vocabulary size (number of unique words in the vocabulary)
    vocab_size = len(vocab)

    # Create the embedding layer
     # The nn.Embedding module maps word indices to dense vector representations.
    # It takes vocab_size as the number of words and embedding_dim as the vector size.

    embeddings = nn.Embedding(vocab_size, embedding_dim)

    return embeddings

Generating context-target pairs (n-grams)

Organize words within a variable-size context using the following approach: Each word is denoted by 'i'. To establish the context, simply subtract 'j'. The size of the context is determined by the value ofCONTEXT_SIZE.

In [15]:
# Define the context size for generating n-grams
CONTEXT_SIZE = 2 # The number of previous words used to predict the next word

def genngrams(tokens):
    """
    Generates n-grams from a list of tokens, where each n-gram consists of a
    context (previous words) and a target (next word).

    The function constructs a list of tuples where:
    - The first element is a list of `CONTEXT_SIZE` previous words.
    - The second element is the target word that follows the context.

    Parameters:
    tokens (list): A list of preprocessed word tokens.

    Returns:
    list: A list of tuples representing n-grams.
          Each tuple contains (context_words, target_word).
    """

    # Generate n-grams
    # Iterate through the tokens starting from index CONTEXT_SIZE to the end
    # For each token at position 'i', extract the previous CONTEXT_SIZE words as context

    ngrams = [
        (
            [tokens[i - j - 1] for j in range(CONTEXT_SIZE)], # Context words
            tokens[i] # Target word(the word to predict)
        )
        for i in range(CONTEXT_SIZE, len(tokens))
    ]

    return ngrams

In [16]:
ngrams = genngrams(tokens)
context, target = ngrams[0]
print("Context", context, "target", target)
print("context index", vocab(context), "target index", vocab([target]))

Context ['are', 'we'] target no
context index [58, 21] target index [70]


In [18]:
embedding_dim = 20
linear = nn.Linear(embedding_dim*CONTEXT_SIZE, 128)

In [19]:
embeddings = genembedding(vocab)
my_embeddings = embeddings(torch.tensor(vocab(context)))
print(my_embeddings.shape)

torch.Size([2, 20])


In [20]:
# they can be used as inputs in the next layer
linear(my_embeddings)

RuntimeError: mat1 and mat2 shapes cannot be multiplied (2x20 and 40x128)

Batch function

wCreate a Batch function to interface with the data loader. Several adjustments are necessary to handle words that are part of a context in one batch and a predicted word in the following batch.

In [21]:
from torch.utils.data import DataLoader # for batch production
import torch

# Set the devices to GPU if available; otherwise, use CPU
device = torch.device("cuda" if torch.cuda.is_available() else 'cpu')

# Define the hyperparameters
CONTEXT_SIZE = 3 # number of previous words used as context for prediction
BATCH_SIZE = 10 # Number of samples per training batch
EMBEDDING_DIM = 10 # Dimension of words embeddings

def collate_batch(batch):
    """
    Processes a batch of text data into input (context) and output (target) tensors
    for training a language model.

    The function extracts:
    - `context`: A list of word indices representing the context words for each target word.
    - `target`: A list of word indices representing the target word to predict.

    Parameters:
    batch (list): A list of tokenized words (strings).

    Returns:
    tuple: Two PyTorch tensors: (context_tensor, target_tensor)
           - context_tensor: Tensor of shape (batch_size - CONTEXT_SIZE, CONTEXT_SIZE),
             containing the word indices of context words.
           - target_tensor: Tensor of shape (batch_size - CONTEXT_SIZE,),
             containing the word indices of target words.
    """

    batch_size = len(batch) # Get the size of the batch
    context, target = [], [] # Initialize lists for context and target word

    # Loop through the batch, ensuring enough previous words exist for context

    for i in range(CONTEXT_SIZE, batch_size):
        # Convert the target word to index using the vocabulary
        target.append(vocab([batch[i]]))

        # Convert the previous CONTEXT_SIZE words to indices using the vocabulary
        context.append((vocab([batch[i - j - 1] for j in range(CONTEXT_SIZE)])))

    # Convert lists to PyTorch tensors and move them to the appropriate device
    return torch.tensor(context).to(device), torch.tensor(target).to(device).reshape(-1)


Similarly, it's important to highlight that the size of the last batch could deviate from that of the earlier batches. To tackle this, the approach involves adjusting the final batch to conform to the specified batch size, ensuring it becomes a multiple of the predetermined size. When necessary, you'll employ padding techniques to achieve this harmonization. One approach you'll use is appending the beginning of the song to the end of the batch.

In [25]:
Padding = BATCH_SIZE-len(tokens)%BATCH_SIZE
tokens_pad = tokens + tokens[0: Padding]

Create the DataLoader

In [26]:
dataloader = DataLoader(
    tokens_pad,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_batch
)


## Multi-class neural network

You have developed a PyTorch class for a multi-class neural network. The network's output is the probability of the next word within a given context. Therefore, the number of classes corresponds to the count of distinct words. The initial layer consists of embeddings, and in addition to the final layer, an extra hidden layer is incorporated.

In [22]:
class NGramLanguageModeler(nn.Module):
    """
    A neural network-based n-gram language model that predicts the next word
    given a sequence of context words.

    This model consists of:
    - An embedding layer that converts word indices into dense vector representations.
    - A fully connected hidden layer with ReLU activation.
    - An output layer that predicts the probability distribution over the vocabulary.

    Parameters:
    vocab_size (int): The number of unique words in the vocabulary.
    embedding_dim (int): The size of the word embeddings (vector representation of words).
    context_size (int): The number of previous words used as context to predict the next word.
    """
    def __init__(self, vocab_size, embedding_dim, context_size):
        super(NGramLanguageModeler, self).__init__()

        # Store context size and embedding dimension
        self.context_size = context_size
        self.embedding_dim = embedding_dim

        # Embedding layer: Maps word indices to dense vectors
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        # Fully connected hidden layer: Maps the concatenated embeddings to a 128-dimensional space
        self.linear1 = nn.Linear(context_size * embedding_dim, 128)

        # Output layer: Maps the hidden layer output to vocabulary size (probability distribution over words)
        self.linear2 = nn.Linear(128, vocab_size)

    def forward(self, inputs):
        """
        Forward pass of the model.

        Parameters:
        inputs (Tensor): A tensor of shape (batch_size, context_size) containing word indices.

        Returns:
        Tensor: A tensor of shape (batch_size, vocab_size) representing predicted probabilities for the next word.
        """

        # Convert input words indices into dense vectors using the embedding layer

        embeds = self.embedding(inputs) # Shape: (batch_size, context_size, embedding_dim)

        # Reshape the embeddings into a single vector per input sample
        embeds = torch.reshape(embeds, (-1, self.context_size * self.embedding_dim))
        # New shape: (batch_size, context_size * embedding_dim)

        # Apply first fully connected layer with ReLU activation
        out = F.relu(self.linear1(embeds)) # Shape: (batch_size, 128)

        # Apply second fully connected layer to generate vocabulary-size logits
        out = self.linear2(out) # Shape: (batch_size, vocab_size)

        return out

Create a model

In [23]:
model = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE).to(device)

Retrieve samples from the data loader object and input them into the neural network.

In [27]:
context, target = next(iter(dataloader))
print(context, target)
out = model(context)

tensor([[70, 58, 21],
        [74, 70, 58],
        [25, 74, 70],
        [69, 25, 74],
        [ 2, 69, 25],
        [20,  2, 69],
        [31, 20,  2]]) tensor([74, 25, 69,  2, 20, 31, 72])


In [28]:
print(out.shape)

torch.Size([7, 79])



Find the index with the highest probability.

In [29]:
predicted_index = torch.argmax(out, 1)
print(predicted_index)

tensor([11, 63, 73, 37, 63, 63, 63])


Find the corresponding token.

In [30]:
[index_to_token[i.item()] for i in predicted_index]

['desert', 'dont', 'see', 'each', 'dont', 'dont', 'dont']

In [31]:
def write_song(model, my_song, number_of_words=100):
    """
    Generates text using a trained n-gram language model.

    Given an initial text (`my_song`), the function generates additional words by
    predicting the next word iteratively based on the trained model.

    Parameters:
    model (nn.Module): The trained n-gram language model.
    my_song (str): The initial seed text to start generating words.
    number_of_words (int): The number of words to generate (default: 100).

    Returns:
    str: The generated song lyrics as a string.
    """

    # Get the mapping from index to word for decoding predictions
    index_to_token = vocab.get_itos()

    # Loop to generate the desired number of words
    for i in range(number_of_words):

        with torch.no_grad():  # Disable gradient computation for inference

            # Prepare the input context by extracting the last CONTEXT_SIZE words from tokens
            context = torch.tensor(
                vocab([tokens[i - j - 1] for j in range(CONTEXT_SIZE)])
            ).to(device)  # Move to CPU/GPU as required

            # Predict the next word by selecting the word with the highest probability
            word_idx = torch.argmax(model(context))  # Get index of the most likely next word

            # Append the predicted word to the generated text
            my_song += " " + index_to_token[word_idx.detach().item()]

    return my_song  # Return the generated lyrics


In [32]:
def pickrandomline(song):
    """
    Selects a random line from the given song text.

    This function splits the song into separate lines and randomly picks one of them.

    Parameters:
    song (str): The song lyrics as a multi-line string.

    Returns:
    str: A randomly selected line from the song.
    """

    # Split the song into individual lines
    lines = song.split("\n")

    # Randomly select a line and remove leading/trailing whitespace
    selected_line = random.choice(lines).strip()

    return selected_line  # Return the randomly selected line

# Example usage:
selected_line = pickrandomline(song)  # Pick a random line from the song

# Generate a new song starting with the selected line
generated_song = write_song(model, selected_line)

# Print the generated lyrics
print(generated_song)


Never gonna tell a lie and hurt you dont you give desert dont see each dont dont dont dont ask let dont let dont if see see let you so dont if blind let let too dont let dont see let dont your dont weve dont let dont <unk> dont dont weve never we dont dont dont give dont we dont dont your let say give dont dont dont ask <unk> each rules gonna dont dont dont dont dont we dont dont dont dont we dont your say weve down your dont how gonna <unk> dont so dont if dont dont weve long dont dont dont dont dont


## Training

Training a language model involves a multi-step process that leverages training and testing data to optimize model performance. In the realm of Natural Language Processing (NLP), this process often employs various metrics to gauge a model's accuracy, such as perplexity or accuracy on unseen data. However, in the context of your current exploration, you will embark on a slightly different journey. Instead of relying solely on conventional NLP metrics, the focus shifts to manual inspection of the results.

You have the cross entropy loss between input logits and target:

In [33]:
criterion = torch.nn.CrossEntropyLoss()

In [34]:
optimizer = optim.SGD(model.parameters(), lr=0.01)


You have developed a function dedicated to training the model using the supplied data loader. In addition to training the model, the function's output includes predictions for each epoch, spanning context for the next 100 words.

In [35]:
def train(dataloader, model,song,number_of_epochs=100, show=10):
    """
    Args:
        dataloader (DataLoader): DataLoader containing training data.
        model (nn.Module): Neural network model to be trained.
        number_of_epochs (int, optional): Number of epochs for training. Default is 100.
        show (int, optional): Interval for displaying progress. Default is 10.

    Returns:
        list: List containing loss values for each epoch.
    """

    MY_LOSS = []  # List to store loss values for each epoch

    # Iterate over the specified number of epochs
    for epoch in tqdm(range(number_of_epochs)):
        total_loss = 0  # Initialize total loss for the current epoch
        my_song = ""    # Initialize a string to store the generated song

        # Iterate over batches in the dataloader
        for context, target in dataloader:
            model.zero_grad()          # Zero the gradients to avoid accumulation
            predicted = model(context)  # Forward pass through the model to get predictions
            loss = criterion(predicted, target.reshape(-1))  # Calculate the loss
            total_loss += loss.item()   # Accumulate the loss

            loss.backward()    # Backpropagation to compute gradients
            optimizer.step()   # Update model parameters using the optimizer

        # Display progress and generate song at specified intervals
        if epoch % show == 0:
            selected_line=pickrandomline(song)
            my_song += write_song(model, selected_line)    # Generate song using the model

            print("Generated Song:")
            print("\n")
            print(my_song)

        MY_LOSS.append(total_loss/len(dataloader))  # Append the total loss for the epoch to MY_LOSS list

    return MY_LOSS  # Return the list of  mean loss values for each epoch

In [36]:
my_loss_list=[]

In [37]:
# Define the context size for the n-gram model
CONTEXT_SIZE = 2

# Create an instance of the NGramLanguageModeler class with specified parameters
model_2 = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE).to(device)

# Define the optimizer for training the model, using stochastic gradient descent (SGD)
optimizer = optim.SGD(model_2.parameters(), lr=0.01)

# Set up a learning rate scheduler using StepLR to adjust the learning rate during training
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.1)

In [38]:
my_loss=train(dataloader,model_2,song)

  1%|▍                                             | 1/100 [00:01<01:44,  1.05s/it]

Generated Song:


Weve known each other for so long <unk> <unk> <unk> <unk> <unk> never what <unk> <unk> you never <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> commitments <unk> never <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> you <unk> <unk> <unk> <unk> <unk> feeling <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> you <unk> <unk> <unk> <unk> <unk> <unk> <unk> never <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> you <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> run how never


 11%|████▉                                        | 11/100 [00:12<02:12,  1.49s/it]

Generated Song:


Never gonna make you cry <unk> never <unk> never never <unk> <unk> never never never <unk> never you you never never never <unk> <unk> <unk> <unk> never never <unk> you <unk> <unk> you <unk> never never <unk> <unk> <unk> you never <unk> <unk> <unk> <unk> <unk> you never <unk> <unk> <unk> <unk> you <unk> never <unk> <unk> <unk> you never never <unk> <unk> <unk> you and you you never <unk> <unk> <unk> you never you <unk> <unk> <unk> you never <unk> <unk> <unk> you never and you you <unk> you never <unk> <unk> never <unk> you <unk> <unk> you <unk> <unk> <unk> you never <unk>


 21%|█████████▍                                   | 21/100 [00:24<01:13,  1.07it/s]

Generated Song:


Never gonna let you down never never never never never <unk> <unk> never never never <unk> hurt you you never never and <unk> never feeling <unk> never never <unk> you <unk> <unk> you <unk> never never <unk> <unk> tell you never <unk> feeling <unk> <unk> tell you never never <unk> <unk> tell you never never <unk> <unk> tell you down never <unk> <unk> tell you and desert you never <unk> <unk> tell you never never <unk> <unk> tell goodbye never <unk> <unk> tell you never and hurt you never never never <unk> <unk> never and you <unk> <unk> never <unk> <unk> <unk> goodbye never <unk>


 31%|█████████████▉                               | 31/100 [00:40<02:22,  2.06s/it]

Generated Song:


Never gonna run around and desert you never never never never never <unk> and never never never <unk> hurt you you never never and <unk> im feeling <unk> never never never you <unk> <unk> you down never never <unk> <unk> tell you never <unk> feeling <unk> <unk> tell you down never <unk> <unk> tell you never never <unk> <unk> tell you down never <unk> <unk> tell you and desert you never <unk> <unk> tell you down never <unk> <unk> tell goodbye never <unk> <unk> tell you never and hurt you never never each <unk> <unk> never and you <unk> and never <unk> <unk> <unk> goodbye never <unk>


 41%|██████████████████▍                          | 41/100 [00:53<01:14,  1.27s/it]

Generated Song:


Never gonna give you up never never no never never <unk> and never never never <unk> hurt you you never never and <unk> im feeling <unk> never never never you <unk> <unk> never down never never <unk> <unk> tell you how im feeling <unk> <unk> tell you down never <unk> <unk> tell you never never <unk> <unk> tell you down never <unk> <unk> tell you and desert you never <unk> <unk> tell you down never <unk> <unk> tell goodbye never <unk> <unk> tell you never and hurt you never never each other for never and you goodbye been aching <unk> <unk> <unk> goodbye to <unk>


 51%|██████████████████████▉                      | 51/100 [01:02<00:43,  1.13it/s]

Generated Song:


Never gonna make you cry never never no strangers never know and never never never <unk> hurt you you never never and <unk> im feeling <unk> never never never you <unk> how never how never never <unk> <unk> tell you how im feeling <unk> <unk> tell you down never <unk> <unk> tell you never never <unk> <unk> tell you down never <unk> <unk> tell you and desert you never <unk> <unk> tell you down never <unk> <unk> tell goodbye never <unk> <unk> tell you how and hurt you never known each other for never and you other been aching but <unk> too shy to <unk>


 61%|███████████████████████████▍                 | 61/100 [01:12<00:44,  1.14s/it]

Generated Song:


Inside we both know whats been going on never never no strangers to know and never the game and hurt you you to full and <unk> im feeling <unk> know never never been <unk> how never how never never <unk> <unk> tell you how im feeling <unk> <unk> tell you down never <unk> <unk> tell you never never <unk> <unk> tell you down never <unk> <unk> tell you and desert you never <unk> <unk> tell you down never <unk> <unk> tell goodbye never <unk> <unk> tell you how and hurt you never known each other for never and you other been aching but youre too shy to <unk>


 71%|███████████████████████████████▉             | 71/100 [01:24<00:32,  1.13s/it]

Generated Song:


Never gonna say goodbye never never no strangers to know you never the game and hurt you you to full commitments <unk> im feeling to know never never been <unk> how never how never never <unk> <unk> tell you how im feeling <unk> <unk> tell you understand never <unk> <unk> tell you never never <unk> <unk> tell you down never <unk> <unk> tell you and desert you never <unk> <unk> tell you understand never <unk> <unk> tell goodbye never <unk> <unk> tell you how and hurt you never known each other for never and you other been aching but youre too shy to <unk>


 81%|████████████████████████████████████▍        | 81/100 [01:32<00:16,  1.15it/s]

Generated Song:


Never gonna run around and desert you never never no strangers to know you know the game and hurt you you to full commitments <unk> im feeling to know never never been <unk> how other how i never <unk> <unk> tell you how im feeling <unk> <unk> tell you understand never <unk> <unk> tell you up never <unk> <unk> tell you down never <unk> <unk> tell you and desert you never <unk> <unk> tell you understand never <unk> <unk> tell goodbye never <unk> <unk> tell you how and hurt you never known each other for never and you other been aching but youre too shy to say


 91%|████████████████████████████████████████▉    | 91/100 [01:42<00:08,  1.07it/s]

Generated Song:


I just wanna tell you how Im feeling never never no strangers to know you know the game and hurt you i to full commitments <unk> im feeling to know never get been from how other guy i never <unk> <unk> tell you how im feeling <unk> <unk> tell you understand never <unk> <unk> tell you up never <unk> <unk> tell you down never <unk> <unk> tell you and desert you never <unk> <unk> tell you understand never <unk> <unk> tell goodbye never <unk> <unk> tell you how and hurt you never known each other for never and you hearts been aching but youre too shy to say


100%|████████████████████████████████████████████| 100/100 [01:51<00:00,  1.12s/it]


Save the model

In [39]:
save_path = '2gram.pth'
torch.save(model_2.state_dict(), save_path)
my_loss_list.append(my_loss)

The code provided below shows word embeddings from the created model, reduces their dimensionality to 2D using t-SNE, and then plots them as a scatter plot. Additionally, it annotates the first 20 points in the visualization with their corresponding words. This is used to visualize how similar words cluster together in a lower-dimensional space, revealing the structure of the word embeddings. Embeddings allow the model to represent words in a continuous vector space, capturing semantic relationships and similarities between words.



In [42]:
X = model_2.embedding.weight.cpu().detach().numpy()
tsne = TSNE(n_components=2, random_state=42)
X_2d = tsne.fit_transform(X)

labels = []

for j in range(len(X_2d)):
    if j < 20:
        plt.scatter(X_2d[j, 0], X_2d[j, 1], label=index_to_token[j])
        labels.append(index_to_token[j])
        # Add words as annotations
        plt.annotate(index_to_token[j],
                     (X_2d[j, 0], X_2d[j, 1]),
                     textcoords="offset points",
                     xytext=(0, 10),
                     ha='center')
    else:
        plt.scatter(X_2d[j, 0], X_2d[j, 1])

plt.legend(labels, loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

RuntimeError: Numpy is not available

Finally, for a context of eight.

In [43]:
CONTEXT_SIZE=8
model_8 = NGramLanguageModeler(len(vocab), EMBEDDING_DIM, CONTEXT_SIZE).to(device)
optimizer = optim.SGD(model_8.parameters(), lr=0.01)

scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
my_loss=train(dataloader,model_8,song)

save_path = '8gram.pth'
torch.save(model_8.state_dict(), save_path)

my_loss_list.append(my_loss)

  1%|▍                                             | 1/100 [00:06<11:32,  6.99s/it]

Generated Song:


Never gonna give you up <unk> <unk> for just <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> lie <unk> <unk> <unk> <unk> <unk> cry <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> you <unk> <unk> <unk> <unk> <unk> <unk> make you around <unk> <unk> <unk> <unk> you <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> feeling <unk> <unk> <unk> around <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>


 11%|████▉                                        | 11/100 [00:28<05:09,  3.48s/it]

Generated Song:


We know the game and were gonna play it <unk> <unk> <unk> strangers to <unk> <unk> <unk> the <unk> <unk> tell <unk> <unk> <unk> you <unk> make im thinking never <unk> <unk> make <unk> you <unk> never <unk> i <unk> <unk> <unk> give you you never <unk> <unk> <unk> give you you never <unk> <unk> give you cry never <unk> <unk> give you you <unk> <unk> <unk> give you you never <unk> <unk> <unk> <unk> make you cry never <unk> <unk> give you never <unk> <unk> tell you lie <unk> <unk> tell <unk> <unk> <unk> give for so <unk> you <unk> <unk> <unk> <unk> <unk> <unk> shy to <unk>


 21%|█████████▍                                   | 21/100 [00:49<03:11,  2.42s/it]

Generated Song:


We know the game and were gonna play it <unk> <unk> to strangers to to <unk> <unk> the rules <unk> tell make <unk> <unk> it and give im thinking never <unk> tell make <unk> me on im guy i <unk> <unk> <unk> give you im im feeling <unk> <unk> give you cry never <unk> <unk> give you up never <unk> <unk> give you up never <unk> <unk> run around im never <unk> <unk> <unk> <unk> make you cry never <unk> <unk> give you cry never <unk> tell a lie never <unk> tell <unk> <unk> tell other for so long im feeling <unk> <unk> to <unk> too shy to say


 31%|█████████████▉                               | 31/100 [01:05<02:18,  2.01s/it]

Generated Song:


Gotta make you understand <unk> <unk> to strangers to to feeling <unk> the rules <unk> tell make <unk> <unk> it and give im thinking never <unk> tell for <unk> me on so guy i <unk> the <unk> give you up im feeling <unk> <unk> give you cry never <unk> <unk> give you up never <unk> <unk> give around up never <unk> <unk> run around up never <unk> <unk> <unk> <unk> make you cry never <unk> <unk> give you up never <unk> tell a lie never <unk> tell <unk> <unk> give other for so long im feeling <unk> <unk> a <unk> too shy to say


 41%|██████████████████▍                          | 41/100 [01:24<03:16,  3.34s/it]

Generated Song:


Never gonna run around and desert you <unk> <unk> to strangers to so feeling <unk> the rules <unk> tell make <unk> i it and give im thinking never <unk> tell for <unk> me on so guy i <unk> the <unk> give you up im feeling <unk> <unk> give you cry never <unk> <unk> give you up never <unk> <unk> give around up never <unk> <unk> run around up never <unk> <unk> <unk> <unk> make you cry never <unk> <unk> give you up never <unk> tell a lie never <unk> tell <unk> <unk> give other for so long im feeling <unk> <unk> thinking <unk> too shy to say


 51%|██████████████████████▉                      | 51/100 [01:51<02:58,  3.65s/it]

Generated Song:


Never gonna tell a lie and hurt you <unk> <unk> to strangers to so feeling <unk> the rules <unk> tell make <unk> i it and give im thinking never <unk> tell for tell me on so guy i <unk> the me give you up im feeling <unk> <unk> give you cry never <unk> <unk> give you up never <unk> <unk> run around up never <unk> <unk> run around up never <unk> <unk> <unk> <unk> make you cry never <unk> <unk> give you up never <unk> tell a lie never <unk> tell <unk> <unk> give other for so long im feeling <unk> <unk> thinking going too shy to say


 61%|███████████████████████████▍                 | 61/100 [02:09<01:26,  2.21s/it]

Generated Song:


Your hearts been aching but youre too shy to say it <unk> <unk> to strangers to so feeling <unk> the rules rules tell make <unk> i it and give im thinking never <unk> tell for tell me on so guy i <unk> the me give you up im feeling <unk> <unk> give you cry never <unk> <unk> give you up never <unk> <unk> run around up never <unk> <unk> run around up never <unk> <unk> <unk> <unk> make you cry never <unk> <unk> give you up never <unk> tell a lie never <unk> tell <unk> <unk> give other for so long im feeling <unk> <unk> thinking going too shy to say


 71%|███████████████████████████████▉             | 71/100 [02:28<01:28,  3.05s/it]

Generated Song:


We know the game and were gonna play it <unk> <unk> to strangers to so feeling we the rules rules tell make <unk> i it and give im thinking never <unk> tell for tell me on so guy i <unk> the me give you up im feeling <unk> <unk> give you cry never <unk> <unk> give you up never <unk> <unk> run around up never <unk> <unk> run around up never <unk> <unk> <unk> <unk> make you cry never <unk> <unk> give you up never <unk> tell a lie never <unk> <unk> <unk> <unk> give other for so long im feeling <unk> <unk> thinking going too shy to say


 81%|████████████████████████████████████▍        | 81/100 [02:42<00:33,  1.75s/it]

Generated Song:


Never gonna say goodbye <unk> <unk> to strangers to so feeling we the rules rules tell make <unk> i it and give im thinking never <unk> tell so tell me on so guy i <unk> the me give you up im feeling <unk> <unk> give you cry never <unk> <unk> give you up never <unk> <unk> run around up never <unk> <unk> run around up never <unk> <unk> <unk> <unk> make you cry never <unk> <unk> give you up never <unk> tell a lie never <unk> <unk> <unk> <unk> give other for so long im feeling <unk> <unk> thinking going too shy to say


 91%|████████████████████████████████████████▉    | 91/100 [03:01<00:20,  2.31s/it]

Generated Song:


Never gonna say goodbye <unk> <unk> to strangers to so feeling we the rules rules tell make <unk> i it and give im thinking never <unk> tell so tell me on so guy i <unk> the me give you up im feeling <unk> <unk> give you cry never <unk> <unk> give you up never <unk> <unk> run around up never <unk> <unk> run around up never <unk> <unk> <unk> <unk> make you cry never <unk> <unk> give you up never <unk> tell a lie never <unk> <unk> <unk> <unk> give other for so long im feeling <unk> <unk> thinking going too shy to say


100%|████████████████████████████████████████████| 100/100 [03:16<00:00,  1.97s/it]


The code provided below shows word embeddings from the created model, reduces their dimensionality to 2D using t-SNE, and then plots them as a scatter plot. Additionally, it annotates the first 20 points in the visualization with their corresponding words. This is used to visualize how similar words cluster together in a lower-dimensional space, revealing the structure of the word embeddings. Embeddings allow the model to represent words in a continuous vector space, capturing semantic relationships and similarities between words.



In [None]:
X = model_8.embeddings.weight.cpu().detach().numpy()
tsne = TSNE(n_components=2, random_state=42)
X_2d = tsne.fit_transform(X)

labels = []

for j in range(len(X_2d)):
    if j < 20:
        plt.scatter(X_2d[j, 0], X_2d[j, 1], label=index_to_token[j])
        labels.append(index_to_token[j])
        # Add words as annotations
        plt.annotate(index_to_token[j],
                     (X_2d[j, 0], X_2d[j, 1]),
                     textcoords="offset points",
                     xytext=(0, 10),
                     ha='center')
    else:
        plt.scatter(X_2d[j, 0], X_2d[j, 1])

plt.legend(labels, loc='upper left', bbox_to_anchor=(1, 1))
plt.show()

## Perplexity
Perplexity is a measurement used to evaluate the effectiveness of language models or probability models. It provides an indication of how well a model predicts a sample of data or the likelihood of an unseen event. Perplexity is commonly used in natural language processing tasks, such as machine translation, speech recognition, and language generation.

Perplexity is derived from the concept of cross-entropy loss, which measures the dissimilarity between predicted probabilities and actual probabilities.

$$\text{Cross-Entropy Loss} = -\sum_{i=1}^{N} y_i \ln(p_i)$$
The cross-entropy loss is calculated by taking the negative sum of the products of the true labels $y_i$ and the logarithm of the predicted probabilities $p_i$ over $N$ classes.

Taking the exponential of the mean cross-entropy loss gives us the perplexity value.

$$\text{Perplexity} = e^{\frac{1}{N} \text{Cross-Entropy Loss}}$$


A lower perplexity value indicates that the model is more confident and accurate in predicting the data. Conversely, a higher perplexity suggests that the model is less certain and less accurate in its predictions.

Perplexity can be seen as an estimate of the average number of choices the model has for the next word or event in a sequence. A lower perplexity means that the model is more certain about the next word, while a higher perplexity means that there are more possible choices.


In [None]:
for (my_loss, model_name)in zip(my_loss_list,["2-gram","4-gram","8-gram"]):
    # Calculate perplexity using the loss
    perplexity = np.exp(my_loss)
    plt.plot(perplexity,label="Perplexity - {}".format(model_name))
    plt.legend()