<a href="https://colab.research.google.com/github/Apoak/Deep-Learning-Projects/blob/main/Text_generationIP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Lab 8.3 Text generation

In this lab you will finish building your RNN text generator.  I found that this code actually runs pretty quickly on my MacBook without GPU acceleration.

In [None]:
# Hyperparameters shared by the cells below.
device = 'cpu'  # device string handed to the dataset, the model and the metric
seq_len = 20  # number of characters in each training sequence
hidden_size = 100  # width of the embedding and of the RNN hidden state
batch_size = 32  # sequences per DataLoader batch
lr = 3e-4  # learning rate passed to the Adam optimizer
epochs = 10  # number of training epochs (assigned again in the training cell)

In [None]:
pip install torchmetrics

In [None]:
import numpy as np

from tqdm import tqdm, trange

import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

import torchmetrics

Here's the code to download and prepare the sonnet dataset.

In [None]:
!wget --no-clobber "https://www.dropbox.com/scl/fi/7r68l64ijemidyb9lf80q/sonnets.txt?rlkey=udb47coatr2zbrk31hsfbr22y&dl=1" -O sonnets.txt
# Read the whole corpus into memory. The context manager closes the file
# handle deterministically, and the explicit encoding keeps the result
# independent of the platform's default locale.
with open("sonnets.txt", encoding="utf-8") as f:
  text = f.read()
# Lower-casing shrinks the character vocabulary; strip() removes leading and
# trailing whitespace.
text = text.lower().strip()

In [None]:
# Preview the first 1000 characters of the prepared corpus.
print(text[:1000])

Here's my solution for the `CharacterDataset` class.

Note that it returns an entire sequence of tokens for the target (unlike what we did on Monday where it only output a single token for the target.)

In [None]:
class CharacterDataset(Dataset):
  """
  Character-level next-token dataset over a single string.

  Sample idx is the window text[idx:idx+seq_len+1]: its first seq_len
  characters are the input and its last seq_len characters (the same window
  shifted by one) are the target.
  """

  def __init__(self,text,seq_len=100,device='cpu'):
    """
    Initialize a dataset using character tokenization.
    Arguments:
      text: a string containing the dataset
      seq_len: sequence length provided by __getitem__
      device: device for PyTorch tensors
    """
    self.text = text
    self.seq_len = seq_len
    self.device = device
    # Sorting makes the character <-> index mapping reproducible across runs.
    self.vocabulary = ''.join(sorted(set(text)))
    self.index_to_char = dict(enumerate(self.vocabulary))
    self.char_to_index = {char:n for n, char in enumerate(self.vocabulary)}

  def __len__(self):
    """ Return the number of (input, target) windows in the dataset. """
    # A window needs seq_len+1 characters, so the last valid start index is
    # len(text)-seq_len-1, which gives len(text)-seq_len windows in total.
    # Clamp at zero because len() must never be negative.
    return max(0, len(self.text) - self.seq_len)

  def __getitem__(self,idx):
    """
    Return the input and target sequences starting at given index.
    Negative indices count from the end.
    Raises:
      IndexError: if idx is out of range (this also lets plain iteration
        over the dataset terminate).
    """
    count = len(self)
    if idx < 0:
      idx += count
    if idx < 0 or idx >= count:
      raise IndexError(f"index {idx} out of range for dataset of length {count}")

    window = self.text[idx:idx+self.seq_len+1]
    tokens = self.encode(window)

    # Explicit dtype so even an empty slice yields integer tokens.
    inputs = torch.tensor(tokens[:-1],dtype=torch.long,device=self.device)
    targets = torch.tensor(tokens[1:],dtype=torch.long,device=self.device)
    return inputs, targets

  def encode(self,text):
    """
    Encode a string to a list of integer tokens.
    Raises:
      KeyError: if a character is not in the vocabulary.
    """
    return [self.char_to_index[char] for char in text]

  def decode(self,tokens):
    """
    Decode a sequence of token integers into a string.
    Accepts anything int() can convert, e.g. a list of ints or a 1-D tensor.
    Raises:
      KeyError: if a token is not a valid vocabulary index.
    """
    return ''.join(self.index_to_char[int(token)] for token in tokens)

In [None]:
# Build the dataset over the full corpus using the configured window length.
ds = CharacterDataset(text,seq_len=seq_len,device=device)

In [None]:
# Sanity check: integer tokens for the first 10 characters of the corpus.
ds.encode(text[:10])

In [None]:
# Sanity check: encoding and then decoding should reproduce the original text.
print(ds.decode(ds.encode(text[:100])))

In [None]:
# Fetch the first sample: x is the input window and y is the same window
# shifted forward by one character, so both should hold seq_len tokens.
x, y = ds[0]
x.shape, y.shape

In [None]:
# Serve the dataset in shuffled mini-batches of batch_size samples.
dl = DataLoader(ds,shuffle=True,batch_size=batch_size)

Here's my solution for the recurrent neural network (RNN) implementation.

In [None]:
class CharacterRNN(nn.Module):
  """
  Simple recurrent network over character tokens.

  For each time step t, with h_0 = 0:
    h_t = SiLU(U(embedding(x_t)) + W(h_{t-1}))
    scores_t = V(h_t)
  """

  def __init__(self,vocabulary_size,hidden_size):
    """
    Arguments:
      vocabulary_size: number of distinct tokens (embedding rows and output scores)
      hidden_size: width of the embedding and of the hidden state
    """
    super().__init__()
    self.embedding = nn.Embedding(vocabulary_size,hidden_size)
    self.hidden_size = hidden_size
    self.U = nn.Linear(hidden_size,hidden_size)  # input -> hidden
    self.W = nn.Linear(hidden_size,hidden_size)  # hidden -> hidden
    self.act = nn.SiLU()
    self.V = nn.Linear(hidden_size,vocabulary_size)  # hidden -> output scores

  def forward(self,x):
    """
    Run the network over a batch of token sequences.
    Arguments:
      x: integer token tensor of shape [B,N]
    Returns:
      Tensor of shape [B,N,vocabulary_size] with one score vector per position.
    """
    x = self.embedding(x)
    B,N = x.shape[:2]
    # new_zeros inherits device AND dtype from the embeddings, so the initial
    # state is created in place (no CPU allocation + copy) and still matches
    # the weights after the model has been cast, e.g. with .double().
    h = x.new_zeros(B,self.hidden_size)
    # The input projection does not depend on the recurrence, so compute it
    # for all time steps at once.
    Ux = self.U(x)
    y = []
    for i in range(N):
      h = self.act(Ux[:,i] + self.W(h))
      y.append(self.V(h))
    return torch.stack(y,dim=1)

In [None]:
# One output score per vocabulary character; move the parameters to the chosen device.
model = CharacterRNN(len(ds.vocabulary),hidden_size).to(device)

In [None]:
# Pull a single batch to inspect the shapes of the inputs and targets.
x_batch, y_batch = next(iter(dl))
x_batch.shape, y_batch.shape

In [None]:
# The forward pass stacks the per-step scores along dim 1, so the output
# should have one vocabulary-sized score vector per input position.
model(x_batch).shape

Finally here is my code to train the model.

Note that I needed to use `.view()` to reshape the model output and target, because the loss and metric functions want the data to have shape [B,C] not [B,N,C].

In [None]:
# Adam over all model parameters with the configured learning rate.
opt = torch.optim.Adam(model.parameters(),lr=lr)
# Cross-entropy between the predicted scores and the target character indices.
loss_fn = nn.CrossEntropyLoss()

# Multiclass accuracy with one class per vocabulary character, kept on the
# same device as the model outputs.
metric = torchmetrics.classification.Accuracy(task="multiclass", num_classes=len(ds.vocabulary))
metric.to(device)

In [None]:
# NOTE: this re-assigns the value set in the hyperparameter cell.
epochs = 10

for epoch in range(epochs):
  # ---- training pass ----
  model.train()
  # Iterating through tqdm closes the progress bar even if a step raises.
  for x_batch, y_batch in tqdm(dl):
    opt.zero_grad()

    y_pred = model(x_batch)
    # Merge the batch and time dimensions: scores become [B*N,C], targets [B*N].
    loss = loss_fn(y_pred.view(-1,y_pred.size(-1)),y_batch.view(-1))

    loss.backward()

    opt.step()

  # ---- evaluation pass (accuracy is measured on the training data) ----
  model.eval()

  metric.reset()
  # No gradients are needed for evaluation; disabling autograd avoids building
  # a graph for every batch, which saves memory and time.
  with torch.no_grad():
    for x_batch, y_batch in tqdm(dl):
      y_pred = model(x_batch)
      metric(y_pred.view(-1,y_pred.size(-1)),y_batch.view(-1))

  acc = metric.compute().item()

  print(f'epoch {epoch}: {acc}')

### Exercises

1. Write a deterministic function to generate text given some starter text.  The function should iteratively add characters to the prompt using the trained model.  This version should be deterministic, in that it always takes the most likely next character according to the model.

Test the function by prompting it with the first 10 characters in the dataset.

In [None]:
def generate_text_deterministic(model,prompt,num_to_generate=1000,dataset=None):
  """
  Greedily extend a prompt one character at a time.

  At every step the whole sequence so far (prompt plus everything generated)
  is fed to the model and the highest-scoring character at the LAST position
  is appended, so the same model and prompt always produce the same text.

  Arguments:
    model: module mapping a [1,N] tensor of token indices to [1,N,C] scores
    prompt: non-empty starter string whose characters are all in the vocabulary
    num_to_generate: number of characters to generate
    dataset: object providing encode()/decode() for the model's vocabulary;
      when omitted, a CharacterDataset is built from the notebook's global
      text, seq_len and device (pass one in to avoid rebuilding it per call)
  Returns:
    The generated continuation, without the prompt. It is also printed.
  Raises:
    ValueError: if prompt is empty.
  """
  if not prompt:
    raise ValueError("prompt must contain at least one character")
  if dataset is None:
    dataset = CharacterDataset(text,seq_len=seq_len,device=device)

  # Put the tokens wherever the model's weights live.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device('cpu')

  tokens = torch.tensor(dataset.encode(prompt),dtype=torch.long,device=target_device).unsqueeze(0)
  generated = []
  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    for _ in range(num_to_generate):
      # The model only exposes a full-sequence forward pass, so the entire
      # context is re-run each step; the last position predicts the next character.
      scores = model(tokens)[0,-1]
      next_token = torch.argmax(scores).reshape(1,1)
      tokens = torch.cat((tokens,next_token),dim=1)
      generated.append(next_token.item())

  result = dataset.decode(generated)
  print(result)
  return result

In [None]:
# Show the 10-character prompt, then let the model continue it.
print(text[:10])
generate_text_deterministic(model,text[:10], num_to_generate=1000)

3. Write a stochastic version of the text generation function.  This one should use `torch.multinomial` to sample the next character.  Note that you will need to apply `torch.softmax` to convert the model output to probabilities.  (In my experience, if you don't do this you end up with a CUDA error and need to restart your kernel, so be careful!)

Test the function by prompting it with the first 10 characters in the dataset, and run the generation multiple times to verify the stochastic behavior.

In [None]:
def generate_text_stochastic(model,prompt,num_to_generate=1000,dataset=None):
  """
  Extend a prompt one character at a time by sampling from the model.

  At every step the whole sequence so far (prompt plus everything generated)
  is fed to the model, the scores at the LAST position are turned into
  probabilities with softmax, and the next character is drawn from that
  distribution with torch.multinomial.

  Arguments:
    model: module mapping a [1,N] tensor of token indices to [1,N,C] scores
    prompt: non-empty starter string whose characters are all in the vocabulary
    num_to_generate: number of characters to generate
    dataset: object providing encode()/decode() for the model's vocabulary;
      when omitted, a CharacterDataset is built from the notebook's global
      text, seq_len and device (pass one in to avoid rebuilding it per call)
  Returns:
    The generated continuation, without the prompt.
  Raises:
    ValueError: if prompt is empty.
  """
  if not prompt:
    raise ValueError("prompt must contain at least one character")
  if dataset is None:
    dataset = CharacterDataset(text,seq_len=seq_len,device=device)

  # Put the tokens wherever the model's weights live.
  first_param = next(model.parameters(), None)
  target_device = first_param.device if first_param is not None else torch.device('cpu')

  tokens = torch.tensor(dataset.encode(prompt),dtype=torch.long,device=target_device).unsqueeze(0)
  generated = []
  # Inference only: no autograd graph is needed.
  with torch.no_grad():
    for _ in range(num_to_generate):
      # The model only exposes a full-sequence forward pass, so the entire
      # context is re-run each step; the last position predicts the next character.
      scores = model(tokens)[0,-1]
      # multinomial needs non-negative weights, hence the softmax over raw scores.
      probs = torch.softmax(scores,dim=-1)
      next_token = torch.multinomial(probs,num_samples=1).reshape(1,1)
      # Grow the context so later samples are conditioned on everything so far.
      tokens = torch.cat((tokens,next_token),dim=1)
      generated.append(next_token.item())

  return dataset.decode(generated)

In [None]:
# Re-run this cell a few times: sampling should give different text on each run.
generate_text_stochastic(model,text[:10], num_to_generate=1000)