## What is language modelling?

Language modelling is the process of predicting the next word in a sequence of words based on the context provided by the previous words. It is a core task in natural language processing (NLP) and is used in a wide range of applications, including speech recognition, machine translation, and chatbots.

In language modelling, the goal is to learn the probability distribution over sequences of words in a language. Given a sequence of words, the language model assigns a probability to each possible word that might come next in the sequence. This can be used to predict the next word in a sequence, generate text that is similar to a given input, or to evaluate the quality of a translation or a summary by comparing the probability of the generated text to the probability of the original text.

Language models are typically trained on large corpora of text, such as books, articles, and websites, in order to learn the statistical properties of the language and the dependencies between words. They can be implemented using various types of neural networks, such as recurrent neural networks (RNNs), long short-term memory (LSTM) networks, or transformers.

### The Tokeniser

The first thing we need to do is create a tokeniser that can take in our raw text and split it into a sequence of tokens.

In [None]:
class Tokeniser:
    """Character-level tokeniser mapping each unique character to an integer ID.

    The vocabulary is built from the lowercased text passed to the
    constructor. IDs are assigned in sorted character order so that the
    mapping is identical from one run to the next.

    Attributes:
        id_to_token (dict): Maps integer ID -> character.
        token_to_id (dict): Maps character -> integer ID.
    """

    def __init__(self, txt):
        """Build the vocabulary from every character that appears in `txt`."""
        txt = self.preprocess(txt)
        # Sort the characters: iterating a bare set of strings yields a
        # different order in each Python process, which would make the IDs
        # (and anything trained on them) irreproducible.
        unique_chars = sorted(set(txt))
        self.id_to_token = dict(enumerate(unique_chars))
        self.token_to_id = {v: k for k, v in self.id_to_token.items()}

    def preprocess(self, txt):
        """Return `txt` normalised for tokenisation (currently: lowercased)."""
        return txt.lower()

    def encode(self, txt):
        """Convert a string into a list of token IDs.

        Leading and trailing whitespace is stripped before encoding.

        Raises:
            KeyError: If `txt` contains a character that is not in the
                vocabulary.
        """
        txt = self.preprocess(txt)
        return [self.token_to_id[char] for char in txt.strip()]

    def decode(self, token_ids):
        """Convert an iterable of token IDs back into a string."""
        # int() lets callers pass 0-dim tensors / numpy ints as well as ints.
        return "".join(self.id_to_token[int(token_id)] for token_id in token_ids)

def get_tokeniser(path='lyrics.txt'):
    """Build a Tokeniser whose vocabulary comes from a text file.

    Parameters:
    path (str): File to read the text from. Defaults to 'lyrics.txt'.

    Returns:
    Tokeniser: A tokeniser built from the full contents of the file.
    """
    with open(path, 'r') as file:
        txt = file.read()

    # Create a Tokeniser object to encode and decode the text
    return Tokeniser(txt)

# Build the tokeniser once as a quick check that lyrics.txt can be read and tokenised.
get_tokeniser()

## Creating a simple character-level language modelling dataset

A language modelling dataset consists of:
- features which are the sequential words/tokens in a body of text
- targets for each position in time which are the features shifted one step forward in time

Implementation details:
- Like all PyTorch datasets, our dataset needs a `__len__` method.
- In this case, we will use the `__iter__` method that allows our dataset to be iterated through
    - Usually datasets implement the `__getitem__` method, but in this case, we will just return a random slice of data

In [None]:
import torch
import numpy as np


class LyricDataset():
    """Character-level language-modelling dataset built from a text file.

    The file is encoded once with the supplied tokeniser. `X` holds the input
    token IDs and `Y` holds the same IDs shifted one step forward, so `Y[i]`
    is the token that follows `X[i]`. Iterating yields randomly positioned
    chunks of `chunk_size` consecutive tokens.
    """

    def __init__(self, tokeniser, chunk_size=100, path='lyrics.txt'):
        """
        Initialize a LyricDataset object.

        Parameters:
        tokeniser: Object with an `encode(str) -> list[int]` method and an
            `id_to_token` mapping (e.g. a Tokeniser).
        chunk_size (int): The size of each chunk of data to be returned by the iterator.
        path (str): File to read the text from. Defaults to 'lyrics.txt'.

        Raises:
        ValueError: If `chunk_size` is smaller than 1.
        """
        if chunk_size < 1:
            raise ValueError("chunk_size must be at least 1")

        self.chunk_size = chunk_size
        self.tokeniser = tokeniser

        # Read in the text from the file
        with open(path, 'r') as file:
            txt = file.read()

        token_ids = torch.tensor(self.tokeniser.encode(txt), dtype=torch.long)

        # Slice instead of rolling: a roll wraps around, which would make the
        # target of the final character the first character of the file.
        self.X = token_ids[:-1]
        self.Y = token_ids[1:]

        # Use the tokeniser's vocabulary so the size matches the IDs in X/Y
        # (the raw text may contain characters the tokeniser normalises away).
        self.vocab_size = len(self.tokeniser.id_to_token)

    def __len__(self):
        """
        Return the number of chunks in the dataset.
        """
        return len(self.X) // self.chunk_size

    def __iter__(self):
        """
        Iterate through random chunks of the dataset and yield them.

        Yields `len(self)` pairs `(inputs, targets)`, each a 1-D tensor of
        `chunk_size` token IDs. Chunks are sampled independently, so they may
        overlap.
        """
        # Number of valid start positions. The upper bound of randint is
        # exclusive, hence the +1 so that the last full chunk can be chosen
        # (and so that a dataset of exactly one chunk does not raise).
        n_starts = len(self.X) - self.chunk_size + 1
        for _ in range(len(self)):
            start = int(np.random.randint(0, n_starts))
            stop = start + self.chunk_size
            yield self.X[start:stop], self.Y[start:stop]


## Defining the RNN model

In [None]:
class RNN(torch.nn.Module):
    """Embedding -> multi-layer RNN -> linear decoder over the vocabulary.

    Parameters:
    vocab_size (int): Number of distinct token IDs.
    hidden_size (int): Size of the embedding and of the RNN hidden state.
    n_layers (int): Number of stacked RNN layers.
    """

    def __init__(self, vocab_size, hidden_size, n_layers=1):
        super().__init__()
        # store input parameters in the object so we can use them later on
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # required functions for model
        self.embedding = torch.nn.Embedding(vocab_size, hidden_size)
        self.rnn = torch.nn.RNN(hidden_size, hidden_size,
                                n_layers, batch_first=True)  # TODO remove batch first
        self.decoder = torch.nn.Linear(hidden_size, vocab_size)

    def forward(self, X):
        """Return next-token scores for every position of every sequence.

        Parameters:
        X (Tensor): Token IDs of shape (batch, sequence length).

        Returns:
        Tensor: Unnormalised scores of shape (batch, sequence length, vocab_size).
        """
        # Each forward pass starts from a fresh all-zero hidden state.
        self.init_hidden(X.shape[0])
        embedding = self.embedding(X)
        outputs, _ = self.rnn(embedding, self.hidden)
        return self.decoder(outputs)

    def init_hidden(self, batch_size):
        """Reset `self.hidden` to zeros of shape (n_layers, batch_size, hidden_size)."""
        # Match the parameters' device and dtype so the model keeps working
        # after .to(device) / .double() etc.
        weight = self.embedding.weight
        self.hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size,
                                  device=weight.device, dtype=weight.dtype)


## Generating new text

Now we need to implement a method of our model that takes what it knows and uses it to generate new text.

In [None]:
import random

class RNN(torch.nn.Module):
    """Embedding -> multi-layer RNN -> linear decoder, with greedy text generation.

    Parameters:
    vocab_size (int): Number of distinct token IDs.
    hidden_size (int): Size of the embedding and of the RNN hidden state.
    n_layers (int): Number of stacked RNN layers.
    """

    def __init__(self, vocab_size, hidden_size, n_layers=1):
        super().__init__()
        # store input parameters in the object so we can use them later on
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.n_layers = n_layers

        # required functions for model
        self.embedding = torch.nn.Embedding(vocab_size, hidden_size)
        self.rnn = torch.nn.RNN(hidden_size, hidden_size,
                                n_layers, batch_first=True)  # TODO remove batch first
        self.decoder = torch.nn.Linear(hidden_size, vocab_size)

    def forward(self, X):
        """Return next-token scores for every position of every sequence.

        Parameters:
        X (Tensor): Token IDs of shape (batch, sequence length).

        Returns:
        Tensor: Unnormalised scores of shape (batch, sequence length, vocab_size).
        """
        # Each forward pass starts from a fresh all-zero hidden state.
        self.init_hidden(X.shape[0])
        embedding = self.embedding(X)
        outputs, _ = self.rnn(embedding, self.hidden)
        return self.decoder(outputs)

    def init_hidden(self, batch_size):
        """Reset `self.hidden` to zeros of shape (n_layers, batch_size, hidden_size)."""
        # Match the parameters' device and dtype so the model keeps working
        # after .to(device) / .double() etc.
        weight = self.embedding.weight
        self.hidden = torch.zeros(self.n_layers, batch_size, self.hidden_size,
                                  device=weight.device, dtype=weight.dtype)

    def generate(self, length=100):
        """Generate a sequence of token IDs by greedy decoding.

        Starts from a uniformly random token and repeatedly feeds the most
        likely next token back into the network.

        Parameters:
        length (int): Number of tokens to generate after the initial one.

        Returns:
        list[int]: `length + 1` token IDs (the random initial token first).
        """
        self.init_hidden(batch_size=1)
        # Draw from the model's actual vocabulary rather than a fixed size;
        # an ID >= vocab_size would be out of range for the embedding.
        initial_token_id = random.randint(0, self.vocab_size - 1)
        generated_token_ids = [initial_token_id]

        # Shape (batch=1, sequence=1).  TODO SOS token
        token = torch.tensor([[initial_token_id]],
                             device=self.embedding.weight.device)

        # Sampling needs no gradients; skipping them avoids building an
        # ever-growing autograd graph through self.hidden.
        with torch.no_grad():
            for _ in range(length):
                outputs, self.hidden = self.rnn(
                    self.embedding(token), self.hidden)
                predictions = self.decoder(outputs)  # 1 x 1 x vocab_size
                token = predictions.argmax(dim=-1)  # stays 1 x 1 for the next step
                generated_token_ids.append(int(token))
        return generated_token_ids


# Quick check that generation runs on an untrained model. RNN requires a
# vocabulary size and a hidden size; 49 is a placeholder vocabulary size.
# TODO derive it from the tokeniser, e.g. len(tokeniser.id_to_token).
rnn = RNN(vocab_size=49, hidden_size=256)
rnn.generate()

## Creating the training loop

Now that we have the model and the dataset, we need to pass the dataset through the model repeatedly and iteratively optimise the model parameters using gradient descent.

In [None]:
from torch.utils.tensorboard import SummaryWriter
import torch.nn.functional as F
import tqdm

def train(model, dataset, tokeniser, epochs=1, lr=0.0005):
    """Train `model` on `dataset` with Adam, logging progress to TensorBoard.

    After every epoch the average loss per chunk is printed, and a sample
    produced by `model.generate()` is decoded and logged as text.

    Parameters:
    model: Module mapping a (batch, sequence) tensor of token IDs to
        (batch, sequence, vocab) scores, and providing `generate()`.
    dataset: Iterable of `(inputs, targets)` pairs of 1-D token-ID tensors.
    tokeniser: Object with a `decode(token_ids) -> str` method.
    epochs (int): Number of passes over the dataset.
    lr (float): Learning rate for the Adam optimiser.
    """
    writer = SummaryWriter()
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)  # choose optimiser
    n_steps = 0
    try:
        for epoch in range(epochs):
            epoch_loss = 0.0  # running sum of the per-chunk losses this epoch
            n_chunks = 0

            # The file imports the tqdm *module*, so call tqdm.tqdm.
            for seq_inputs, seq_targets in tqdm.tqdm(dataset):
                # add batch dim TODO remove once using dataloader
                predictions = model(seq_inputs.unsqueeze(0))
                # Flatten to (B*T, vocab) scores and (B*T,) targets.
                predictions = predictions.view(-1, predictions.shape[-1])
                seq_targets = seq_targets.unsqueeze(0).view(-1)
                loss = F.cross_entropy(predictions, seq_targets)

                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                # Accumulate a plain float: summing the loss tensors would keep
                # every step's autograd graph alive until the end of the epoch.
                step_loss = loss.item()
                writer.add_scalar("Loss/Train", step_loss, n_steps)
                n_steps += 1
                epoch_loss += step_loss
                n_chunks += 1

            # avg loss per chunk (guard against an empty dataset)
            avg_loss = epoch_loss / n_chunks if n_chunks else 0.0

            print('Epoch ', epoch, ' Avg loss/chunk: ', avg_loss)
            generated_token_ids = model.generate()
            writer.add_text("Generated Text", tokeniser.decode(
                generated_token_ids)[:300], epoch)
            # TODO stop on EOS token
    finally:
        # Flush pending events to disk even if training is interrupted.
        writer.close()


if __name__ == "__main__":

    # HYPER-PARAMS
    lr = 0.0005
    epochs = 500
    chunk_size = 100  # the length of the sequences which we will optimize over

    hidden_size = 256
    n_layers = 2

    tokeniser = get_tokeniser()

    # The dataset needs the tokeniser to encode the text.
    dataset = LyricDataset(tokeniser, chunk_size=chunk_size)
    n_tokens = len(tokeniser.id_to_token)
    # instantiate our model from the class defined earlier
    myrnn = RNN(n_tokens, hidden_size, n_layers)
    train(myrnn, dataset, tokeniser, epochs=epochs, lr=lr)
