# IS319 - Deep Learning

## TP3 - Recurrent neural networks

Credits: Andrej Karpathy

The goal of this TP is to experiment with recurrent neural networks for a character-level language model to generate text that looks like training text data.

In [62]:
import torch
import torch.nn as nn
import torch.optim as optimizer
import numpy as np
import torch.nn.functional as F
import torch.distributions as distributions
import matplotlib.pyplot as plt

device = (
    "cuda"
    if torch.cuda.is_available()
    # else "mps"
    # if torch.backends.mps.is_available() # For macOS
    else "cpu"
)
print(f'Using {device}')

Using cpu


## 1. Text data preprocessing

Several text datasets are provided, feel free to experiment with different ones throughout the TP. At the beginning, use a small subset of a given dataset (for example use only 10k characters).

In [63]:
# text_data_fname = 'baudelaire.txt'  # ~0.1m characters (French)
# text_data_fname = 'proust.txt'      # ~7.3m characters (French)
text_data_fname = 'shakespeare.txt' # ~0.1m characters (English)
# text_data_fname = 'lotr.txt'        # ~2.5m characters (English)
# text_data_fname = 'doom.c'          # ~1m characters (C Code)
# text_data_fname = 'linux.c'         # ~11.5m characters (C code)

text_data = open(text_data_fname, 'r').read()
text_data = text_data
print(f'Dataset `{text_data_fname}` contains {len(text_data)} characters.')
print('Excerpt of the dataset:')
print(text_data[:2000])

Dataset `shakespeare.txt` contains 95665 characters.
Excerpt of the dataset:
    SONNETS



TO THE ONLY BEGETTER OF
THESE INSUING SONNETS
MR. W. H. ALL HAPPINESS
AND THAT ETERNITY
PROMISED BY
OUR EVER-LIVING POET WISHETH
THE WELL-WISHING
ADVENTURER IN
SETTING FORTH
T. T.


I.

FROM fairest creatures we desire increase,
That thereby beauty's rose might never die,
But as the riper should by time decease,
His tender heir might bear his memory:
But thou, contracted to thine own bright eyes,
Feed'st thy light'st flame with self-substantial fuel,
Making a famine where abundance lies,
Thyself thy foe, to thy sweet self too cruel.
Thou that art now the world's fresh ornament
And only herald to the gaudy spring,
Within thine own bud buriest thy content
And, tender churl, makest waste in niggarding.
  Pity the world, or else this glutton be,
  To eat the world's due, by the grave and thee.

II.

When forty winters shall beseige thy brow,
And dig deep trenches in thy beauty's field,
Thy youth's p

**(Question)** Create a character-level vocabulary for your text data. Create two dictionaries: `ctoi` mapping each character to an index, and the reverse `itoc` mapping each index to its corresponding character. Implement the functions to convert text to tensor and tensor to text using these mappings. Apply these functions to some text data.

In [64]:
# Create the vocabulary and the two mapping dictionaries
def create_vocab(text):
    vocab, ctoi, itoc = [], {}, {}
    for character in text :
        if character not in vocab :
            vocab += [character]
            ctoi[character] = vocab.index(character)
            itoc[vocab.index(character)] = character
    return vocab, ctoi, itoc

# Implement the function converting text to tensor
def text_to_tensor(text, ctoi):
    arr = np.zeros((len(text)))
    for idx, char in enumerate(text):
        arr[idx] = ctoi[char]
    return torch.tensor(arr, dtype=torch.long)

# Implement the function converting tensor to text
def tensor_to_text(tensor, itoc):
    arr = tensor.cpu().detach().numpy()
    text = ""
    for elm in arr :
        text+=itoc[elm]
    return text

# Apply your functions to some text data
vocab, ctoi, itoc = create_vocab(text_data)

print(vocab)
print(ctoi)
print(itoc)

example_tensor = text_to_tensor(text_data, ctoi)
print(example_tensor)

example_text = tensor_to_text(example_tensor, itoc)
# verify integrity
assert example_text == text_data

[' ', 'S', 'O', 'N', 'E', 'T', '\n', 'H', 'L', 'Y', 'B', 'G', 'R', 'F', 'I', 'U', 'M', '.', 'W', 'A', 'P', 'D', 'V', '-', 'f', 'a', 'i', 'r', 'e', 's', 't', 'c', 'u', 'w', 'd', 'n', ',', 'h', 'b', 'y', "'", 'o', 'm', 'g', 'v', 'p', 'l', ':', 'k', 'z', 'x', '!', ';', '?', 'C', 'q', 'j', 'X', 'K', 'J', '[', ']']
{' ': 0, 'S': 1, 'O': 2, 'N': 3, 'E': 4, 'T': 5, '\n': 6, 'H': 7, 'L': 8, 'Y': 9, 'B': 10, 'G': 11, 'R': 12, 'F': 13, 'I': 14, 'U': 15, 'M': 16, '.': 17, 'W': 18, 'A': 19, 'P': 20, 'D': 21, 'V': 22, '-': 23, 'f': 24, 'a': 25, 'i': 26, 'r': 27, 'e': 28, 's': 29, 't': 30, 'c': 31, 'u': 32, 'w': 33, 'd': 34, 'n': 35, ',': 36, 'h': 37, 'b': 38, 'y': 39, "'": 40, 'o': 41, 'm': 42, 'g': 43, 'v': 44, 'p': 45, 'l': 46, ':': 47, 'k': 48, 'z': 49, 'x': 50, '!': 51, ';': 52, '?': 53, 'C': 54, 'q': 55, 'j': 56, 'X': 57, 'K': 58, 'J': 59, '[': 60, ']': 61}
{0: ' ', 1: 'S', 2: 'O', 3: 'N', 4: 'E', 5: 'T', 6: '\n', 7: 'H', 8: 'L', 9: 'Y', 10: 'B', 11: 'G', 12: 'R', 13: 'F', 14: 'I', 15: 'U', 16

## 2. Setup a character-level recurrent neural network

**(Question)** Setup a simple embedding layer with `nn.Embedding` to project character indices to `embedding_dim` dimensional vectors. Explain precisely how this layer works and what are its outputs for a given input sequence.

In [65]:
# n_vocab : the total number of unique indices that the embedding layer can handle.
# n_dim : the size of the vector space in which the indices will be embedded.
n_vocab, n_dim = len(vocab), 16

# initiate the Embedding layer
emb_layer = nn.Embedding(n_vocab, embedding_dim=n_dim)

# given the example tensor generate an embedding of each index (indirectly character) of the text
emb_data = emb_layer(example_tensor)

print(emb_data)
print(emb_data.shape)

tensor([[-2.1161e+00, -3.4858e-01, -2.1906e-01,  ..., -1.4249e+00,
         -8.6395e-01, -1.1241e+00],
        [-2.1161e+00, -3.4858e-01, -2.1906e-01,  ..., -1.4249e+00,
         -8.6395e-01, -1.1241e+00],
        [-2.1161e+00, -3.4858e-01, -2.1906e-01,  ..., -1.4249e+00,
         -8.6395e-01, -1.1241e+00],
        ...,
        [ 1.5299e+00,  2.0807e+00,  1.0633e+00,  ..., -8.4153e-01,
          7.8899e-01, -7.3651e-01],
        [-6.8824e-01,  8.8730e-01, -1.9849e-02,  ...,  2.2095e+00,
          2.9298e-01,  4.8591e-01],
        [ 4.1772e-01, -5.1739e-01, -6.7781e-01,  ..., -7.1451e-04,
         -1.2020e+00, -2.0617e-01]], grad_fn=<EmbeddingBackward0>)
torch.Size([95665, 16])


## Answer :

**nn.Embedding converts integer indices to dense vectors. It initializes a tensor with size (sequence_length, embedding_dim), mapping each index to a learnable vector**
****


**(Question)** Setup a single-layer RNN with `nn.RNN` (without defining a custom class). Use `hidden_dim` size for hidden states. Explain precisely the outputs of this layer for a given input sequence.

In [66]:
input_size, hidden_size = 16, 16
rnn_layer = nn.RNN(input_size, hidden_size)

# Initialize the hidden state
hidden_state = torch.zeros(1, hidden_size)

# run the embedded data through the RNN
output_sequence, final_hidden_state = rnn_layer(emb_data, hidden_state)

print(output_sequence.shape, output_sequence)
print(final_hidden_state)

torch.Size([95665, 16]) tensor([[ 0.2224,  0.8510, -0.5396,  ...,  0.6880, -0.7641,  0.5719],
        [-0.1041,  0.7006, -0.6765,  ...,  0.7326, -0.8644,  0.6854],
        [-0.1355,  0.7607, -0.7111,  ...,  0.8552, -0.7828,  0.7309],
        ...,
        [ 0.7744, -0.2510, -0.1579,  ...,  0.7546,  0.5077,  0.4242],
        [-0.4397,  0.2145,  0.6670,  ...,  0.3338, -0.4755,  0.1775],
        [ 0.2874, -0.7300, -0.3092,  ...,  0.4056,  0.7646, -0.3804]],
       grad_fn=<SqueezeBackward1>)
tensor([[ 0.2874, -0.7300, -0.3092,  0.6968, -0.0470,  0.4345,  0.5894,  0.1318,
         -0.2219, -0.7425,  0.5248,  0.2308,  0.5748,  0.4056,  0.7646, -0.3804]],
       grad_fn=<SqueezeBackward1>)


## Answer :
The `output_sequence` represents the output of the RNN at each time step when it processes an input sequence. It is a tensor that contains the predicted value for each hidden state at each time step in the sequence. And the `final_hidden_state` is, as shown, the final hidden state of the layer.
****

**(Question)** Create a simple RNN model with a custom `nn.Module` class. It should contain: an embedding layer, a single-layer RNN, and a dense output layer. For each character of the input sequence, the model should predict the probability of the next character. The forward method should return the probabilities for next characters and the corresponding hidden states.
After completing the class, create a model and apply the forward pass on some input text. Understand and explain the results.

*Note:* depending on how you implement the loss function later, it can be convenient to return logits instead of probabilities, i.e. raw values of the output layer before any activation function. 

In [67]:
class SimpleRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        '''Initialize model parameters and layers.'''
        super(SimpleRNN, self).__init__()
        self.emb = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.lin = nn.Linear(hidden_dim, vocab_size)

    def forward(self, tensor_data, hidden_state=None):
        '''Apply the forward pass for some text data already converted to tensor.'''
        output = self.emb(tensor_data)
        output, hidden = self.rnn(output, hidden_state)
        output = self.lin(output)
        return output, hidden

# Initialize a model and apply the forward pass on some input text
vocab_size, embedding_dim, hidden_dim = len(vocab), 32, 64
sr = SimpleRNN(vocab_size, embedding_dim, hidden_dim)
sr.forward(text_to_tensor(text_data[:10], ctoi))

(tensor([[ 5.7090e-02,  1.4130e-01, -2.7209e-02,  2.2851e-01, -9.3183e-02,
           7.5359e-03, -1.2029e-01,  7.8917e-02, -2.2412e-01, -6.9306e-02,
          -3.4203e-01, -1.1430e-01,  2.8936e-02,  3.1206e-02, -1.8886e-03,
           1.3368e-01,  3.1732e-03,  1.4948e-01, -2.3957e-01,  6.8072e-02,
          -2.4480e-01, -7.5529e-03,  3.8474e-01,  1.1661e-01,  9.1674e-02,
           8.3646e-02, -4.1155e-02, -2.7405e-02,  1.6148e-01, -1.1493e-01,
          -2.4704e-02, -1.3979e-01,  2.3415e-03,  8.9720e-02,  3.3569e-01,
          -2.5884e-01,  2.1455e-01, -1.4433e-02, -1.1946e-01,  1.3713e-01,
           2.8650e-02,  4.1041e-02, -1.3497e-02, -1.1016e-01, -1.4553e-01,
          -6.7254e-02, -2.5757e-01,  1.6758e-02,  1.5810e-01, -3.3543e-03,
          -1.0840e-01,  1.9467e-01,  1.2757e-01,  8.9506e-02,  8.6306e-02,
          -5.8471e-03,  3.7965e-02,  1.4488e-02, -2.1332e-01, -3.1487e-01,
           2.5038e-01, -2.5073e-01],
         [ 2.7795e-02,  3.0029e-01, -8.8868e-02,  1.2846e-01, -

## Answer :

****

**(Question)** Implement a simple training loop to overfit on a small input sequence. The loss function should be a categorical cross entropy on the predicted characters. Monitor the loss function value over the iterations.

In [68]:
# Sample a small input sequence into tensor `input_seq` and store its corresponding expected sequence into tensor `target_seq`
input_seq = torch.arange(0, 40).long()
target_seq = (input_seq+1).clone() 

# Implement a training loop overfitting an input sequence and monitoring the loss function
def train_overfit(model, input_seq, target_seq, n_iters=200, learning_rate=0.2):
    optim = optimizer.Adam(model.parameters(), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss()
    hidden = None
    for i in range(n_iters):

        output, hidden = model(input_seq.unsqueeze(0), hidden)
        hidden.detach_()
        loss = loss_function(output.squeeze(0), target_seq)

        optim.zero_grad()
        loss.backward()
        optim.step()

        if (i + 1)%50 == 0:
            print(f"Iteration {i + 1}/{n_iters}, Loss: {loss.item()}")

# Initialize a model and make it overfit the input sequence
sr_model = SimpleRNN(vocab_size, embedding_dim, hidden_dim)
train_overfit(sr_model, input_seq, target_seq, n_iters=1000)

Iteration 50/1000, Loss: 0.2606762647628784
Iteration 100/1000, Loss: 0.014782190322875977
Iteration 150/1000, Loss: 1.5871313735260628e-05
Iteration 200/1000, Loss: 6.0050838328606915e-06
Iteration 250/1000, Loss: 4.440505108505022e-06
Iteration 300/1000, Loss: 3.510689566610381e-06
Iteration 350/1000, Loss: 2.8759081942553166e-06
Iteration 400/1000, Loss: 2.3960960788826924e-06
Iteration 450/1000, Loss: 2.035490979324095e-06
Iteration 500/1000, Loss: 1.7255488273804076e-06
Iteration 550/1000, Loss: 1.507993260929652e-06
Iteration 600/1000, Loss: 1.332160536549054e-06
Iteration 650/1000, Loss: 1.192090280710545e-06
Iteration 700/1000, Loss: 1.0818221198860556e-06
Iteration 750/1000, Loss: 9.86455120255414e-07
Iteration 800/1000, Loss: 9.000286809168756e-07
Iteration 850/1000, Loss: 8.344638331436727e-07
Iteration 900/1000, Loss: 7.688989285270509e-07
Iteration 950/1000, Loss: 7.212153150248923e-07
Iteration 1000/1000, Loss: 6.735317015227338e-07


**(Question)** Implement a `predict_argmax` method for your `RNN` model. Then, verify your overfitting: use some characters of your input sequence as context to predict the remaining ones. Experiment with the current model and analyze the results.

In [69]:
class CharRNN(SimpleRNN):
    def predict_argmax(self, context_tensor, n_predictions):
        # Apply the forward pass for the context tensor
        # Then, store the last prediction and last hidden state
        predictions, hidden = [], None
        for char_index in context_tensor:
            output, hidden = self.forward(char_index.unsqueeze(0).unsqueeze(0), hidden_state=hidden)

        # Use the last prediction and last hidden state as inputs to the next forward pass
        # Do this in a loop to predict the next `n_predictions` characters
        for _ in range(n_predictions):
            output, hidden = self.forward(context_tensor[-1].unsqueeze(0).unsqueeze(0))

            predicted_index = output.squeeze(0).argmax().item()
            predictions.append(predicted_index)

            context_tensor = torch.cat((context_tensor, torch.tensor([predicted_index])))

        return predictions

# Initialize a model and make it overfit as above
# Then, verify your overfitting by predicting characters given some context
model = CharRNN(vocab_size=len(vocab), embedding_dim=32, hidden_dim=64)
train_overfit(model, input_seq, target_seq, n_iters=1000)

Iteration 50/1000, Loss: 0.011529028415679932
Iteration 100/1000, Loss: 0.00457511143758893
Iteration 150/1000, Loss: 0.01494741439819336
Iteration 200/1000, Loss: 2.860960421458003e-06
Iteration 250/1000, Loss: 1.761298335622996e-06
Iteration 300/1000, Loss: 1.320233764090517e-06
Iteration 350/1000, Loss: 1.0579773288554861e-06
Iteration 400/1000, Loss: 8.791652135187178e-07
Iteration 450/1000, Loss: 7.59956947149476e-07
Iteration 500/1000, Loss: 6.645901748925098e-07
Iteration 550/1000, Loss: 5.93064953591238e-07
Iteration 600/1000, Loss: 5.275001626614539e-07
Iteration 650/1000, Loss: 4.768364192386798e-07
Iteration 700/1000, Loss: 4.2021218860099907e-07
Iteration 750/1000, Loss: 3.695483314913872e-07
Iteration 800/1000, Loss: 3.4868676834776124e-07
Iteration 850/1000, Loss: 3.2186477483264753e-07
Iteration 900/1000, Loss: 3.099438572462532e-07
Iteration 950/1000, Loss: 2.9206248086666164e-07
Iteration 1000/1000, Loss: 2.771613480945234e-07


In [70]:
# predict 3 indices
print(model.predict_argmax(input_seq[:35], n_predictions=3))

[35, 36, 37]


## Answer :

****

Using the argmax function to predict the next character can yield a deterministic generator always predicting the same characters. Instead, it is common to predict the next character by sampling from the distribution of output predictions, adding some randomness into the generator.

**(Question)** Implement a `predict_proba` method for your `RNN` model. It should be very similar to `predict_argmax`, but instead of using argmax, it should randomly sample from the output predictions. To do that, you can use the `torch.distribution.Categorical` class and its `sample()` method. Verify that your method correctly added some randomness.

In [71]:
class CharRNN(CharRNN):
    def predict_proba(self, input_context, n_predictions):
        predictions, hidden = [], None

        for char_index in input_context:
            output, hidden = self.forward(char_index.unsqueeze(0).unsqueeze(0), hidden)

        for _ in range(n_predictions):
            output, hidden = self.forward(input_context[-1].unsqueeze(0).unsqueeze(0), hidden)

            # Use Categorical distribution to sample from the predicted probabilities
            categorical_dist = distributions.Categorical(logits=output.squeeze(0))
            predicted_index = categorical_dist.sample().item()
            predictions.append(predicted_index)

            # Update the context tensor with the new prediction
            input_context = torch.cat((input_context, torch.tensor([predicted_index])))

        return predictions
# Verify that your predictions are not deterministic anymore
model = CharRNN(vocab_size=len(vocab), embedding_dim=32, hidden_dim=64)
train_overfit(model, input_seq, target_seq, n_iters=1000)

Iteration 50/1000, Loss: 0.19750407338142395
Iteration 100/1000, Loss: 1.7572730939718895e-05
Iteration 150/1000, Loss: 4.723605798062636e-06
Iteration 200/1000, Loss: 3.54942494595889e-06
Iteration 250/1000, Loss: 2.863984036594047e-06
Iteration 300/1000, Loss: 2.393113845755579e-06
Iteration 350/1000, Loss: 2.0384702565934276e-06
Iteration 400/1000, Loss: 1.7672715557637275e-06
Iteration 450/1000, Loss: 1.5586567769787507e-06
Iteration 500/1000, Loss: 1.3828241662849905e-06
Iteration 550/1000, Loss: 1.242754024133319e-06
Iteration 600/1000, Loss: 1.1175848158018198e-06
Iteration 650/1000, Loss: 1.0222178161711781e-06
Iteration 700/1000, Loss: 9.328112469120242e-07
Iteration 750/1000, Loss: 8.523451242581359e-07
Iteration 800/1000, Loss: 7.897605200923863e-07
Iteration 850/1000, Loss: 7.271758022397989e-07
Iteration 900/1000, Loss: 6.824724323450937e-07
Iteration 950/1000, Loss: 6.377691192938073e-07
Iteration 1000/1000, Loss: 5.990261229271709e-07


## 3. Train the RNN model on text data

**(Question)** Adapt your previous code to implement a proper training loop for a text dataset. To do so, we need to specify a sequence length `seq_len`, acting similarly to the batch size in classic neural networks. Then, you can either randomly sample sequences of length `seq_len` from the text dataset over `n_iters` iterations, or properly loop over the text dataset for `n_epochs` epochs (with a random starting point for each epoch to ensure different sequences), to make sure the whole dataset is seen by the model. Feel free to adjust training and model parameters empirically. Start with a small model and a small subset of the text dataset, then move on to larger experiments. Remember to use GPU if available.

In [114]:
# Create the text dataset, compute its mappings and convert it to tensor
data_tensor = text_to_tensor(text_data, ctoi)
seq_len = 10

# Initialize training parameters
vocab_size, embedding_dim, hidden_dim = len(vocab), 32, 64


# Initialize a character-level RNN model
cr_model = CharRNN(vocab_size, embedding_dim, hidden_dim).to(device)

# Setup the training loop
# Regularly record the loss and sample from the model to monitor what is happening
def train_loop(model, data_tensor, seq_len, n_epochs, learning_rate=5e-3):
    model.train()
    optim = optimizer.Adam(model.parameters(), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        # Randomly choose a starting point for each epoch
        start_index = np.random.randint(0, data_tensor.size(0) - seq_len - 1)
        hidden = None
        for i in range(start_index, data_tensor.size(0) - seq_len, seq_len):
            input_seq = data_tensor[i:i+seq_len].unsqueeze(0).to(device)
            target_seq = data_tensor[i+1:i+seq_len+1].to(device)

            optim.zero_grad()
            output, hidden = model(input_seq, hidden)
            hidden.detach_()

            loss = loss_function(output.squeeze(0), target_seq)
            loss.backward()
            optim.step()

        if (epoch + 1)%10 == 0:
            print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {loss.item()}")

train_loop(cr_model, data_tensor, seq_len, n_epochs=100)


Epoch 10/100, Loss: 2.4149956703186035
Epoch 20/100, Loss: 2.5526630878448486
Epoch 30/100, Loss: 1.6975555419921875
Epoch 40/100, Loss: 2.3195462226867676
Epoch 50/100, Loss: 2.5276615619659424
Epoch 60/100, Loss: 2.3649086952209473
Epoch 70/100, Loss: 2.419177532196045
Epoch 80/100, Loss: 2.059297561645508
Epoch 90/100, Loss: 2.272216558456421
Epoch 100/100, Loss: 1.9592567682266235


**(Question)** From your trained model, play around with its predictions: start with a custom input sequence and ask the model to predict the rest. Analyze and comment your results.

In [116]:
start_text = "A kid playing in the playgrou"
cr_model.eval()
generated_text = start_text
n_chars = 2

with torch.no_grad():
    input_seq = text_to_tensor(start_text, ctoi).to(device)
    predicted_indices = cr_model.predict_proba(input_seq, n_predictions=n_chars)
    
    for idx in predicted_indices :
        if idx < len(vocab) :
            generated_text += itoc[idx]
        else : 
            print(idx)

print(generated_text)

A kid playing in the playgrou e


## Answer :

****

## 4. Experiment with different RNN architectures

**(Question)** Experiment with different RNN architecures. Potential ideas are multi-layer RNNs, GRUs and LSTMs. All models can be extended to multi-layer using the `num_layers` parameter. Analyze and comment your results.

In [117]:
cr_model_3 = CharRNN(vocab_size, embedding_dim, hidden_dim, num_layers=3)
train_loop(cr_model_3, data_tensor, seq_len, n_epochs=50)

Epoch 10/50, Loss: 2.5696303844451904
Epoch 20/50, Loss: 1.9924137592315674
Epoch 30/50, Loss: 1.9971141815185547
Epoch 40/50, Loss: 2.291541576385498
Epoch 50/50, Loss: 1.968225121498108


In [118]:
start_text = "A kid playing in the playgrou"
cr_model_3.eval()
generated_text = start_text
n_chars = 2

with torch.no_grad():
    input_seq = text_to_tensor(start_text, ctoi).to(device)
    predicted_indices = cr_model_3.predict_proba(input_seq, n_predictions=n_chars)
    
    for idx in predicted_indices :
        if idx < len(vocab) :
            generated_text += itoc[idx]
        else : 
            print(idx)

print(generated_text)

A kid playing in the playgrou b


In [137]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers=1):
        '''Initialize model parameters and layers.'''
        super(LSTMModel, self).__init__()
        self.emb = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers, batch_first=True)
        self.lin = nn.Linear(hidden_dim, vocab_size)

    def forward(self, tensor_data, hidden_state=None):
        '''Apply the forward pass for some text data already converted to tensor.'''
        output = self.emb(tensor_data)
        output, hidden = self.lstm(output, hidden_state)
        output = self.lin(output)
        return output, hidden

    def predict_proba(self, input_context, n_predictions):
        predictions, hidden = [], None

        for char_index in input_context:
            output, hidden = self.forward(char_index.unsqueeze(0).unsqueeze(0), hidden)

        for _ in range(n_predictions):
            output, hidden = self.forward(input_context[-1].unsqueeze(0).unsqueeze(0), hidden)

            # Use Categorical distribution to sample from the predicted probabilities
            categorical_dist = distributions.Categorical(logits=output.squeeze(0))
            predicted_index = categorical_dist.sample().item()
            predictions.append(predicted_index)

            # Update the context tensor with the new prediction
            input_context = torch.cat((input_context, torch.tensor([predicted_index])))

        return predictions


def train_loop_lstm(model, data_tensor, seq_len, n_epochs, learning_rate=5e-3):
    model.train()
    optim = optimizer.Adam(model.parameters(), lr=learning_rate)
    loss_function = nn.CrossEntropyLoss()

    for epoch in range(n_epochs):
        # Randomly choose a starting point for each epoch
        start_index = np.random.randint(0, data_tensor.size(0) - seq_len - 1)
        hidden = None
        for i in range(start_index, data_tensor.size(0) - seq_len, seq_len):
            input_seq = data_tensor[i:i+seq_len].unsqueeze(0).to(device)
            target_seq = data_tensor[i+1:i+seq_len+1].to(device)

            optim.zero_grad()
            output, hidden = model(input_seq, hidden)
            hidden = tuple(h.detach() for h in hidden)

            loss = loss_function(output.squeeze(0), target_seq)
            loss.backward()
            optim.step()

        if (epoch + 1)%10 == 0:
            print(f"Epoch {epoch + 1}/{n_epochs}, Loss: {loss.item()}")

In [139]:
lstm_model = LSTMModel(vocab_size, embedding_dim, hidden_dim)
train_loop_lstm(lstm_model, data_tensor, seq_len, n_epochs=50)

Epoch 10/50, Loss: 1.2888818979263306
Epoch 20/50, Loss: 1.8630107641220093
Epoch 30/50, Loss: 1.9078855514526367
Epoch 40/50, Loss: 1.7482788562774658
Epoch 50/50, Loss: 1.1762248277664185


In [141]:
start_text = "A kid playing in the playgrou"
lstm_model.eval()
generated_text = start_text
n_chars = 2

with torch.no_grad():
    input_seq = text_to_tensor(start_text, ctoi).to(device)
    predicted_indices = lstm_model.predict_proba(input_seq, n_predictions=n_chars)
    
    for idx in predicted_indices :
        if idx < len(vocab) :
            generated_text += itoc[idx]
        else : 
            print(idx)

print(generated_text)

A kid playing in the playgrousi


## Answer :

****