# Group 5 Notebook

## Topics:

Datasets/Tokenizing (part 1)
 
Models (part 2)

In [1]:
# When we are doing NLP, by FAR the best package/tool we can use is Huggingface
# https://huggingface.co/docs/transformers/quicktour
# pip install transformers datasets tokenizers

In [2]:
import torch
import transformers
import datasets
import tokenizers

ModuleNotFoundError: No module named 'torch'

In [None]:
# Let's start by selecting a dataset. 
# I found one that contains (context, question, answer) pairs about wikipedia pages
# https://huggingface.co/datasets/lmqg/qa_wiki_t5_large
from datasets import load_dataset

dataset = load_dataset("lmqg/qa_wiki_t5_large")
# This might take a while to download the dataset

In [9]:
dataset['train'][0]

{'id': '54766',
 'title': 'Federal government of the United States',
 'context': 'The government of the United States of America is the federal government of the republic of fifty states that constitute the United States, as well as one capital district, and several other territories. The federal government is composed of three distinct branches: legislative, executive, and judicial, whose powers are vested by the U.S. Constitution in the Congress, the President, and the federal courts, including the Supreme Court, respectively. The powers and duties of these branches are further defined by acts of Congress, including the creation of executive departments and courts inferior to the Supreme Court.',
 'question': 'What is the government of the United States of America?',
 'answers': {'text': ['federal government of the republic of fifty states that constitute the United States'],
  'answer_start': [54]}}

In [11]:
# Now we can tokenize our text:
# Learn more about BPE here: https://huggingface.co/docs/transformers/tokenizer_summary
# And we see the implementation here: https://huggingface.co/course/chapter6/5?fw=pt
# Finally we can follow the walkthrough here: https://huggingface.co/docs/transformers/fast_tokenizers
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.trainers import BpeTrainer
from tokenizers.pre_tokenizers import Whitespace

tokenizer = Tokenizer(BPE(unk_token="[UNK]"))
trainer = BpeTrainer(special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

tokenizer.pre_tokenizer = Whitespace()


# files = [...]
# tokenizer.train(files, trainer)

# We don't have a list of files easily available. 
# So instead we can use: https://huggingface.co/docs/tokenizers/v0.13.0/en/api/tokenizer#tokenizers.Tokenizer.train_from_iterator


In [1]:
# Task 1: 
# Train the BPE on our corpus of data

# Step 1. 
# Create an iterator that contains all of our relevant data
    # What is relevant? In this example, we'll be given the 'context' and we'll try to predict 'question' and 'answer'
    # So our 'corpus' (all of our relevant data) is simply all of the contexts, questions, and answers in our dataset
    # How do we create a iterator?
    # A list is fine, but in order to save memory we'll use a generator. 
    # How do we make a generator?
    # Like this:
# example_generator = (function(x) for x in iterator)
# Hint: dataset['train'] is an iterator
# Hint: you need to make a function to convert each dataset sample into a single string
    
def convert_to_single_string(sample):
    # TODO
    pass

my_generator = # TODO
next(my_generator) # useful to test, feel free to remove once working

SyntaxError: invalid syntax (2080604149.py, line 20)

In [28]:
# Step 2.
# refer to the documentation for tokenizer.train_from_iterator() 
# https://huggingface.co/docs/tokenizers/v0.13.0/en/api/tokenizer#tokenizers.Tokenizer.train_from_iterator
# in order to train our tokenizer 
tokenizer.train_from_iterator(___TODO____)
# This is slow, careful! I encourage you to work on Part 2 (making models) in the meantime 

In [29]:
# Save our tokenizer
tokenizer.save("qa_wiki_t5_large_tokenizer.json")


In [30]:
# Load our tokenizer from the file, but now with a faster implementation
from transformers import PreTrainedTokenizerFast

fast_tokenizer = PreTrainedTokenizerFast(tokenizer_file="qa_wiki_t5_large_tokenizer.json")

In [37]:
fast_tokenizer(["the big dog was red", "antidisestablishmentarianism"])
# feel free to test your own words here

{'input_ids': [[3442, 6841, 9258, 3487, 3941], [3592, 28210, 7640, 18305]], 'token_type_ids': [[0, 0, 0, 0, 0], [0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1], [1, 1, 1, 1]]}

# Part 2
## We've created a custom tokenizer, now we need to create a custom model

In [31]:
# We learned about RNNs during out meeting. While we can easily use the PyTorch implementation for RNNs:
# https://pytorch.org/docs/stable/generated/torch.nn.RNN.html

# We're going to learn by doing, and make one ourselves
from torch import nn

In [32]:
# Below is the framework for creating an RNN.
# Recall that the __init__ function is where we can instantiate layers, and the forward function is where we apply those 
# layers to our data.
class MyRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MyRNN, self).__init__()

    
    def forward(self, x, hidden_state):
        return output, hidden


In [None]:
# Recall that an RNN behaves as such:
# https://stanford.edu/~shervine/teaching/cs-230/illustrations/architecture-rnn-ltr.png?9ea4417fc145b9346a3e288801dbdfdc
# At each step the model has two inputs and two outputs
# The input x is often the token at that position. The input h is the model's hidden state from all previous tokens.
# The output y is the models prediction of the next token. The output h is the model's hidden state to the next token.


# Let's have our example sentence be:
# The big dog was red

# Our model will first process 'the' (if we are tokenizing by each word)
# The input x will be the tokenized version of 'the'.  Shape = 1xToken_Dim
# The input h, since it is the first token, will be randomly initialized. Shape = 1xHidden_Dim

# We will concatenate these two inputs and call this X_Concat. Shape = 1x(Token_Dim + Hidden_Dim)
# We will first calculate the hidden state. We will perform a matrix multiply with a weight matrix we will call W_Hidden.
# W_Hidden's shape is (Token_dim + Hidden_dim) x Hidden_Dim

# We will perform a matrix-vector multiply with W_Hidden and X_Concat. 
# We will also apply a non-linear function. It is common practice to use tanh or sigmoid. 
# This gives us our hidden state output, h. 
# h = sigmoid(W_Hidden * X_Concat)

# We will also calculate our output token, Y.
# We will perform a matrix-vector multiply with a matrix W_Output.
# W_output's shape is (Hidden_dim)x(Output_dim)
# Often, but not always, output_dim == token_dim.
# y = W_Output * h

# We now have our outputs y and h. 


# The model will now take as an input the returned hidden vector, h, as well as the tokenized word 'big'.
# etc etc

In [None]:
# Task 2:
# Create an RNN that works on some sequence:
batch_size = 4
sequence_length = 100
token_dim = 256
input_data = torch.rand(batch_size, sequence_length, token_dim) 

# Tools needed:
# torch.nn.Linear, to create a weight matrix and perform matrix multiplications.
# Usage:
# __init__():
# ...
# self.mylayer = torch.nn.Linear(input_dim, output_dim)
# ...
# forward(x):
# ...
# (x.shape = Batch, input_dim)
# x = self.mylayer(x)
# (x.shape = Batch, output_dim)

# torch.sigmoid or torch.tanh
# Usage:
# foward(x):
# (x.shape = Batch, dim)
# x = torch.sigmoid(x)
# (x.shape = Batch, dim)

# torch.cat
# Usage:
# foward(x, hidden):
# (x.shape = Batch, token_dim) (hidden.shape = Batch, hidden_dim)
# my_new_vec = torch.cat([x, hidden], dim=1)  
# (my_new_vec.shape = Batch, (token_dim + hidden_dim))
# We say dim=1 so that we concat along the token/hidden dim, instead of the batch dim (dim =0). 

class MyRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MyRNN, self).__init__()
        # TODO create weight layers W_hidden and W_output
    
    def forward(self, x, hidden_state):
        # TODO concatenate  vectors
        # TODO apply W_hidden and sigmoid (or tanh)
        # TODO apply W_output
        return output, hidden

hidden_size = __ # up to you!
model = MyRNN(token_dim, hidden_size, token_dim)

current_hidden = torch.rand(4, hidden_size)
for sequence_idx in range(100):
    current_token = input_data[:, sequence_idx]
    out, current_hidden = model(current_token, current_hidden)
    