<a href="https://colab.research.google.com/github/Aaditya-Prasad/APML/blob/main/Transformers/APGPT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# references
# GPT 2:
# https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf
# GPT: 
# https://s3-us-west-2.amazonaws.com/openai-assets/research-covers/language-unsupervised/language_understanding_paper.pdf
# Attention is all you need:
# https://arxiv.org/pdf/1706.03762.pdf
# Annotated Transformer:
# http://nlp.seas.harvard.edu/annotated-transformer/#embeddings-and-softmax
# Cramming:
# https://arxiv.org/pdf/2212.14034.pdf
# Layer norms:
# https://arxiv.org/pdf/1607.06450.pdf
# Karpathy's video

In [1]:
# Open questions
# How is the vocab built?
# 

In [2]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-01-21 01:04:58--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.111.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt’


2023-01-21 01:04:58 (33.5 MB/s) - ‘input.txt’ saved [1115394/1115394]



In [8]:
import os
from os.path import exists
import torch
import torch.nn as nn
from torch.nn.functional import log_softmax, pad
import math
import copy
import time
from torch.optim.lr_scheduler import LambdaLR
import pandas as pd
import altair as alt
from torchtext.data.functional import to_map_style_dataset
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets
import spacy
# import GPUtil
import warnings
from torch.utils.data.distributed import DistributedSampler
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP



In [None]:
## Transformer hyperparameters from "Attention Is All You Need"

# Number of decoder blocks
num_blocks = 6

# Width of the tensors used throughout the model
d_model = 512

# Number of attention heads
h = 8
# Per-head width of the key/value projections.
# Integer division on purpose: this value is used as a tensor dimension
# (e.g. in .view()), which must be an int; d_model / h would give 64.0.
assert d_model % h == 0, "d_model must be divisible by the number of heads h"
d_key = d_model // h
d_value = d_key


# Dropout modules (p is a probability, not a percentage).
# AttnDropout is applied to the attention weights inside attention().
AttnDropout = nn.Dropout(p = .1)
SublayerDropout = nn.Dropout(p = .1)

In [None]:
#Word embeddings are learned 
class WordEmbedding(nn.Module):
  """Learned token embedding whose output is scaled by sqrt(d_model).

  Args:
    d_model: width of each embedding vector.
    vocab: number of distinct token ids (rows in the lookup table).
  """
  def __init__(self, d_model, vocab):
    super().__init__()
    self.embed = nn.Embedding(vocab, d_model)
    self.d_model = d_model

  def forward(self, x):
    """Look up the embeddings for the integer ids in x.

    Returns a tensor with the shape of x plus a trailing d_model axis.
    The sqrt(d_model) scaling follows "Attention Is All You Need" (sec. 3.4).
    """
    # The table lives on the module, so it must be reached through self.
    return self.embed(x) * math.sqrt(self.d_model)

#TODO: Positional embeddings are learned, need to save module into GPT class
#Positional embeddings are simply added to normal Embedding

In [102]:
##Playground: scratch cell for checking tensor shapes (its variables appear unused by the later cells shown here)
lin = nn.Linear(3, 5)
x = torch.zeros(3, 4, 20)
z = torch.zeros(3, 20, 4)
torch.matmul(x, z).shape  # batched matmul: (3, 4, 20) @ (3, 20, 4) -> (3, 4, 4); not displayed, it is not the last expression
y = torch.zeros(12, 8, 1, 64)
a = torch.zeros(12, 8, 1, 64)
y[:, -1].shape  # picks the last index along dim 1 -> (12, 1, 64)
scores = torch.matmul(y, a.transpose(-2, -1))  # (12, 8, 1, 64) @ (12, 8, 64, 1) -> (12, 8, 1, 1)
# scores.view(12, -1, 8*64)
scores = torch.matmul(scores, y)  # (12, 8, 1, 1) @ (12, 8, 1, 64) -> (12, 8, 1, 64)
scores.shape
scores.view(12, -1, 8*64).shape  # 12*8*1*64 elements regrouped as (12, 1, 512); last expression, so this is what the cell displays


# y = torch.tensor([[[1.0, 2.0, 3.0],[4.0, 5.0, 6.0],[7.0, 8.0, 9.0],[10.0, 11.0, 12.0]],[[1.0, 2.0, 3.0],[4.0, 5.0, 6.0],[7.0, 8.0, 9.0],[10.0, 11.0, 12.0]],[[1.0, 2.0, 3.0],[4.0, 5.0, 6.0],[7.0, 8.0, 9.0],[10.0, 11.0, 12.0]]])
# y.shape
# lin(y.view(4, 3, 3)).shape
# x.view(3, -1, 3, 5).transpose(1, 2).shape




torch.Size([12, 8, 1, 64])

In [105]:
#This isn't a module because we only learn how to create the Q, K, V, this is just a computation
def attention(query, key, value, mask = None, dropout = None):
  """Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V.

  Args:
    query, key, value: tensors whose last two axes are (positions, features);
      any leading axes (batch, heads) are broadcast by torch.matmul.
    mask: optional tensor broadcastable to the score matrix; positions where
      mask == 0 receive (numerically) zero attention weight.
    dropout: optional callable applied to the attention weights. When omitted,
      the module-level AttnDropout is used, as in the original code.

  Returns:
    The attention-weighted sum of value, with query's leading/position axes
    and value's feature axis.
  """
  # Scale by the actual key width rather than a global, so the function is
  # correct for whatever head size it is handed.
  d_k = query.size(-1)
  scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(d_k)
  if mask is not None:
    # masked_fill is out-of-place, so the result must be kept. Masked
    # positions need a very negative score (not a tiny positive one) so that
    # softmax drives their weight to zero.
    scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
  # Normalise over the key axis (the last one); softmax is also out-of-place.
  weights = scores.softmax(dim = -1)
  if dropout is None:
    dropout = AttnDropout
  weights = dropout(weights)
  return torch.matmul(weights, value)

In [None]:
class FFN(nn.Module):
  """Position-wise feed-forward network: W_2(dropout(relu(W_1 x))).

  Args:
    fdim: width of the hidden layer.
    d_model: width of the input and output.
    dropout: dropout probability applied to the hidden activations.
  """
  def __init__(self, fdim = 2048, d_model = 512, dropout = .1):
    super().__init__()
    self.fdim = fdim
    self.d_model = d_model
    self.W_1 = nn.Linear(d_model, fdim)
    self.W_2 = nn.Linear(fdim, d_model)
    # Wrap the probability in a module: a bare float is not callable, and a
    # registered nn.Dropout is switched off automatically by .eval().
    self.dropout = nn.Dropout(dropout)

  def forward(self, x):
    """Apply the two linear maps along the last axis of x; shape is preserved."""
    return self.W_2(self.dropout(self.W_1(x).relu()))

In [None]:
#TODO: understand the shape of the mask everywhere 
def subsequent_mask(size):
    """Build a boolean causal mask of shape (1, size, size).

    Entry [0, i, j] is True when j <= i (position i may look at position j)
    and False for the strictly-upper-triangular "future" positions.
    """
    # 1 marks every position strictly above the main diagonal (the future).
    future = torch.ones(1, size, size).triu(diagonal=1).to(torch.uint8)
    # Invert: keep everything that is NOT in the future.
    return future.eq(0)

In [None]:
class LayerNorm(nn.Module):
  """Layer normalisation over the last axis with a learned gain and bias.

  Args:
    features: size of the last axis of the inputs.
    eps: small constant added to the standard deviation to avoid division
      by zero.
  """
  def __init__(self, features, eps=1e-6):
    super().__init__()
    self.a = nn.Parameter(torch.ones(features))   # gain, starts as identity
    self.b = nn.Parameter(torch.zeros(features))  # bias, starts at zero
    self.eps = eps

  def forward(self, x):
    """Return a * (x - mean) / (std + eps) + b, statistics taken over the last axis."""
    mean = x.mean(-1, keepdim = True)
    std = x.std(-1, keepdim = True)
    x = (x-mean) #centre each feature vector on zero
    x = x/(std + self.eps) #rescale to unit standard deviation; eps guards against div by zero
    # The learned parameters live on the module, so they are reached via self.
    return self.a*x + self.b

In [None]:
class MAttention(nn.Module):
  """Multi-head attention built on the module-level attention() function.

  Four d_model -> d_model linear maps are learned (WQ, WK, WV, WO); the split
  into heads is done by reshaping the projected tensors, not by separate
  per-head weight matrices.

  Args:
    d_model: width of the inputs and outputs.
    d_key: per-head width of queries and keys.
    d_value: per-head width of values.
    h: number of heads. h * d_key and h * d_value must both equal d_model.

  Raises:
    ValueError: if the head sizes do not tile d_model exactly.
  """
  def __init__(self, d_model, d_key, d_value, h):
    super().__init__()
    # The sizes are used as tensor dimensions, which must be ints even when a
    # caller computed them with true division (e.g. 512 / 8 == 64.0).
    d_key, d_value, h = int(d_key), int(d_value), int(h)
    if h * d_key != d_model or h * d_value != d_model:
      raise ValueError("h * d_key and h * d_value must both equal d_model")
    # WQ, WK, WV, WO: four independent layers, registered through ModuleList
    # so their parameters are visible to optimisers and .to(device).
    self.trans = nn.ModuleList([nn.Linear(d_model, d_model) for _ in range(4)])
    self.h = h
    self.d_key = d_key
    self.d_value = d_value
    self.d_model = d_model

  def forward(self, query, key = None, value = None, mask = None):
    """Run multi-head attention.

    Args:
      query, key, value: tensors of shape (batch, positions, d_model). key
        defaults to query and value to key, which gives self-attention.
      mask: optional mask forwarded to attention(); a 3-D mask gets a head
        axis inserted so it broadcasts across all heads.

    Returns:
      Tensor of shape (batch, query positions, d_model).
    """
    if key is None:
      key = query
    if value is None:
      value = key
    nbatches = query.size(0)
    if mask is not None and mask.dim() == 3:
      mask = mask.unsqueeze(1) #(batch, 1, positions, positions): same mask for every head

    # Project, then split the feature axis into heads:
    # (batch, positions, d_model) -> (batch, h, positions, per-head width).
    # A list (not a set) keeps the query/key/value order for unpacking.
    widths = (self.d_key, self.d_key, self.d_value)
    query, key, value = [
      lin(t).view(nbatches, -1, self.h, width).transpose(1, 2)
      for lin, t, width in zip(self.trans, (query, key, value), widths)
    ]
    x = attention(query, key, value, mask = mask)
    # Merge the heads back: (batch, h, positions, d_value) -> (batch, positions, d_model).
    # transpose makes the tensor non-contiguous, so contiguous() must be
    # called before view, and the reshaped result has to be kept.
    x = x.transpose(1, 2).contiguous().view(nbatches, -1, self.h * self.d_value)
    return self.trans[-1](x) #WO


In [None]:
#layerNorm(x) -> attention(x') -> layerNorm(dropout(x'')+x'') -> FFN(x''')
#TODO: decide if another residual connection is needed, confused with how layerNorm was moved to subblock input in GPT-2 paper
class Layer(nn.Module):
  """One decoder block: dropout -> norm -> self-attention -> norm(residual) -> FFN + residual.

  Args:
    d_model: width of the block's inputs and outputs.
    MAttention: attention module, called as module(query, key, value, mask=mask).
    LayernOrm: normalisation module (the parameter spelling is kept as-is so
      existing keyword callers keep working). The same instance is applied at
      both normalisation points.
    SublayerDropout: dropout module.
    FFN: feed-forward module.
  """
  def __init__(self, d_model, MAttention, LayernOrm, SublayerDropout, FFN):
    super().__init__()
    self.d_model = d_model
    self.MAttention = MAttention
    # Bind the argument that was passed in, not the global LayerNorm class.
    self.LayerNorm = LayernOrm
    # These must be assigned: a bare `self.X` only reads the attribute and
    # raises AttributeError on a fresh module.
    self.SublayerDropout = SublayerDropout
    self.FFN = FFN

  def forward(self, x, mask = None):
    """Apply the block to x; mask (optional) is forwarded to the attention module."""
    # NOTE(review): dropout on the block input is kept from the original code,
    # although the design comment above the class does not list it - confirm.
    x = self.SublayerDropout(x)
    x = self.LayerNorm(x)
    # Self-attention: the same tensor serves as query, key and value.
    y = self.MAttention(x, x, x, mask = mask)
    x = self.LayerNorm(self.SublayerDropout(y)+x) #dropout does not apply to the residual connection
    return self.FFN(x)+x #the last layernorm has to happen after all the layers are called

In [3]:
####Model is done, now onto vocab preparation
# Read the whole Tiny Shakespeare corpus into memory as one string.
with open('input.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
len(text)  # number of characters in the corpus

1115394

In [7]:
# Character-level vocabulary: every distinct character of the corpus, sorted.
characters = sorted(set(text))
# Vocabulary size; the model is going to predict the next character.
vocab = len(characters)
##TODO: after I get next character to work, want to implement BPE

65

In [9]:
# Lookup tables between characters and integer ids (ids follow the sorted order).
stoi = {ch: idx for idx, ch in enumerate(characters)}
itos = dict(enumerate(characters))

def encode(x):
  """Turn the string x into a list of integer ids, one per character."""
  ids = []
  for ch in x:
    ids.append(stoi[ch])
  return ids

def decode(x):
  """Turn an iterable of integer ids back into the string it encodes."""
  return ''.join(itos[idx] for idx in x)

In [13]:
# Encode the whole corpus as a single 1-D tensor of int64 ids.
text_data = torch.tensor(encode(text), dtype=torch.long)
# Split point: the first 90% of the ids are for training, the rest for testing.
i = int(.9 * len(text_data))
train, test = text_data[:i], text_data[i:]
# Sanity check: 0 means the two parts account for every id.
len(text_data) - (len(train) + len(test))

0

In [12]:
# Seed PyTorch's global random number generator so that random draws
# (e.g. the torch.randint call used for batching) repeat from run to run.
torch.manual_seed(1337) #will change this later, just for testing

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59])

In [None]:
#need to understand which of these are nbatches
batch_size = 4  # number of independent sequences per batch (the batch dimension)
block_size = 8  # number of tokens in each sequence

def get_batch(split, data = None):
  """Sample a random batch of input/target sequences.

  Args:
    split: "train" selects the module-level train tensor; any other value
      selects test. Ignored when data is given.
    data: optional 1-D tensor of token ids to sample from instead.

  Returns:
    (x, y), both of shape (batch_size, block_size). y is x shifted one
    position to the right, so y[b, t] is the token that follows x[b, t].

  Raises:
    ValueError: if data has fewer than block_size + 1 tokens.
  """
  if data is None:
    data = train if split == "train" else test
  if len(data) <= block_size:
    raise ValueError("data must contain at least block_size + 1 tokens")
  # Random start offsets; the upper bound leaves room for the one extra token
  # that the shifted targets need.
  starts = torch.randint(len(data) - block_size, (batch_size, )).tolist()
  x = torch.stack([data[s:s+block_size] for s in starts])
  # torch.stack needs a list/tuple of tensors; a bare generator raises TypeError.
  y = torch.stack([data[s+1:s+block_size+1] for s in starts])
  #need to know one more word in y so that we can see what in x led to that last word/so we can predict it
  return x, y


In [34]:
#playground: tries out the random start-offset draw used for batching
# Re-assigns the same values the batching cell already set.
batch_size = 4 
block_size = 8
# Draws batch_size integer offsets from [0, len(text_data) - block_size).
torch.randint(len(text_data) - block_size, (batch_size, ))


IndexError: ignored