1. Note that the hyperparameters are deliberately very small, because Colab's free accelerators are slower than my laptop's GPU (feel free to increase them if you have Colab Pro).
2. Ideally, to train this model well, run 60 epochs, which takes roughly 30 minutes.
3. Why? GPT models are robust because they have a huge number of weights, which in turn requires more training (for instance, GPT-3 was trained with 45TB of text and has 175B parameters).

In my own (local) version, the vocabulary isn't lowercased because I have a larger sample set to train on.

In [None]:
import torch

# Special tokens used during preprocessing, and the vocabulary ids reserved for them
PAD_TAG = "<PAD>"
SOS_TAG = "<SOS>"
EOS_TAG = "<EOS>"
UNK_TAG = "<UNK>"
PAD = 0
SOS = 1
EOS = 2
UNK = 3

max_len = 100 # max number of words (tokens) per input sequence
d_model = 768 # number of dimensions used to represent each word
batch_size = 128 # number of sequences per training batch
vocab_size = 13360 # placeholder vocabulary size; reassigned from the fitted tokenizer further down
nheads = 8 # number of Multi-Head Attention heads
dim_feedforward = 1024 # width of the feed-forward network inside each Decoder block
decoder_layers = 3 # number of stacked Decoder blocks
lr = 1e-4 # learning rate

# Use the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

Download this file and upload it; it will be used as the training data: https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

1. Click on the link
2. Right click
3. Click Save as
4. Save
5. on the left hand side, click on folder icon (below -> {x})
6. click upload icon under "Files"
7. Copy its uploaded path and use that as the path

# Preprocessing (Data Preparation)

In [None]:
"""
<PAD> -> 0    padding
<SOS> -> 1    start of sentence
<EOS> -> 2    end of sentence
<UNK> -> 3    unknown token
"""
import json

class Word2Sequence:
    """Word-level tokenizer that maps words to integer ids and back.

    Typical usage: call ``fit`` one or more times to accumulate word counts,
    call ``build_vocab`` to turn the counts into a vocabulary, then use
    ``transform`` / ``inverse_transform`` to convert between words and ids.

    Four special tokens are always reserved in the default vocabulary:
    ``<PAD>`` -> 0, ``<SOS>`` -> 1, ``<EOS>`` -> 2, ``<UNK>`` -> 3.
    """

    PAD_TAG = "<PAD>"
    SOS_TAG = "<SOS>"
    EOS_TAG = "<EOS>"
    UNK_TAG = "<UNK>"

    PAD = 0
    SOS = 1
    EOS = 2
    UNK = 3

    special_tokens = [PAD_TAG, SOS_TAG, EOS_TAG, UNK_TAG]

    def __init__(self, custom_dict=None):
        """
        param custom_dict: optional pre-built {word: id} mapping. When given it
                           is used as-is (not copied); otherwise the vocabulary
                           starts with only the four special tokens.
        """
        if custom_dict is None:
            self.dict = {
                self.PAD_TAG: self.PAD,
                self.SOS_TAG: self.SOS,
                self.EOS_TAG: self.EOS,
                self.UNK_TAG: self.UNK,
            }
        else:
            self.dict = custom_dict

        # word -> number of occurrences seen by fit()
        self.count = {}

        # Build the id -> word table right away so inverse_transform() also
        # works before build_vocab() is called (e.g. with a custom_dict).
        self._refresh_reverse_dict()

    def _refresh_reverse_dict(self):
        """Rebuild the id -> word lookup table from the current vocabulary."""
        self.reverse_dict = {idx: word for word, idx in self.dict.items()}

    def fit(self, sentence):
        """
        Accumulate word frequencies.

        param sentence: [word1, word2, word3...]
        """
        for word in sentence:
            self.count[word] = self.count.get(word, 0) + 1

    def build_vocab(self, min=5, max=None, max_features=None):
        """
        Filter the accumulated counts and assign an id to every surviving word.

        param min:          keep only words whose count is strictly greater than
                            this value (None disables the filter)
        param max:          keep only words whose count is strictly less than
                            this value (None disables the filter)
        param max_features: keep only this many of the most frequent words
                            (None disables the filter)
        returns:            None; updates self.count, self.dict and
                            self.reverse_dict in place
        """
        if min is not None:
            self.count = {word: freq for word, freq in self.count.items() if freq > min}
        if max is not None:
            self.count = {word: freq for word, freq in self.count.items() if freq < max}
        if max_features is not None:
            ranked = sorted(self.count.items(), key=lambda item: item[-1], reverse=True)
            self.count = dict(ranked[:max_features])

        for word in self.count:
            # Special tokens keep their reserved ids, and words that already
            # have an id are left alone so that calling build_vocab() again
            # cannot re-number a word and create id collisions.
            if word in self.special_tokens or word in self.dict:
                continue
            self.dict[word] = len(self.dict)

        self._refresh_reverse_dict()

    def transform(self, sentence, max_len=None, pad_first=False):
        """
        Convert a list of words into a list of ids.

        param sentence:  [word1, word2...]
        param max_len:   when given, the sentence is padded with <PAD> or
                         truncated so the result has exactly this length
        param pad_first: False pads at the end and keeps the first max_len
                         words; True pads at the front and keeps the last
                         max_len words
        returns:         list of ids; words missing from the vocabulary map to UNK
        """
        if max_len is not None:
            missing = max_len - len(sentence)
            if missing > 0:
                padding = [self.PAD_TAG] * missing
                sentence = padding + sentence if pad_first else sentence + padding
            elif missing < 0:
                # truncation
                sentence = sentence[-max_len:] if pad_first else sentence[:max_len]

        return [self.dict.get(word, self.UNK) for word in sentence]

    def inverse_transform(self, indices, is_tensor=False):
        """
        Convert ids back into words.

        param indices:   [1, 2, 3, 4, 5...]
        param is_tensor: set to True when the elements expose ``.item()``
                         (e.g. when iterating over a 1-D tensor)
        returns:         list of words; ids missing from the vocabulary map to None
        """
        if is_tensor:
            return [self.reverse_dict.get(idx.item()) for idx in indices]
        return [self.reverse_dict.get(idx) for idx in indices]

    def __len__(self):
        return len(self.dict)

In [None]:
from nltk.tokenize import WordPunctTokenizer
from torch.utils.data import Dataset, DataLoader
import torch


def txt_to_wordlist(path):
    """
    Read a text file and return it as one flat list of lower-cased tokens.

    Every line that is not exactly a bare newline is split with
    WordPunctTokenizer and followed by a "\n" token so line breaks are kept
    in the token stream. Blank lines themselves contribute no tokens.

    Unlike the earlier paragraph-buffering version, the last paragraph of the
    file is kept even when the file does not end with a blank line.

    param path: path of the text file to read
    returns:    [token1, token2, ...]
    """
    # One tokenizer instance is enough; no need to rebuild it for every line.
    word_tokenizer = WordPunctTokenizer()
    word_list = []

    # The context manager guarantees the file handle is closed.
    with open(path) as text_file:
        for line in text_file:
            if line == "\n":
                continue
            tokens = word_tokenizer.tokenize(line) + ["\n"]
            word_list.extend(token.lower() for token in tokens)

    return word_list

def to_sequence(wordlist, max_len):
    """
    Slice a token list into overlapping windows of max_len tokens.

    Windows start at every index from 0 up to (but not including)
    len(wordlist) - max_len, moving one token at a time. If the list is not
    longer than max_len, no window is produced.

    param wordlist: [word1, word2, ...]
    param max_len:  number of tokens per window
    returns:        list of windows, each a list of max_len tokens
    """
    window_count = len(wordlist) - max_len
    return [wordlist[start:start + max_len] for start in range(window_count)]

class Dataset(Dataset):
    """
    Dataset of (input ids, target ids) pairs for next-word prediction.

    NOTE: this class intentionally keeps the name `Dataset` (shadowing the
    torch base class it inherits from) because the rest of the notebook
    instantiates it under that name.
    """

    def __init__(self, sequences, tokenizer, max_len, limit=None):
        """
        param sequences: list of word windows, each [word1, word2, ...]
        param tokenizer: object exposing transform(sentence, max_len=..., pad_first=...)
        param max_len:   length every encoded sample is padded/truncated to
        param limit:     when given, only the first `limit` sequences are used
        """
        self.max_len = max_len
        self.sequences = sequences if limit is None else sequences[:limit]
        self.tokenizer = tokenizer

    def __getitem__(self, idx):
        """
        Return the encoded (x, y) pair for sample idx.

        x is <SOS> followed by the window without its last word; y is the same
        words followed by <EOS>. At every position y therefore holds the token
        that comes right after the corresponding prefix of x.
        """
        words = self.sequences[idx][:-1]
        x = ["<SOS>"] + words
        y = words + ["<EOS>"]

        x = self.tokenizer.transform(x, max_len=self.max_len, pad_first=False)
        y = self.tokenizer.transform(y, max_len=self.max_len, pad_first=False)

        return x, y

    def __len__(self):
        return len(self.sequences)


def collate_fn(batch):
    """
    Merge a list of samples into two batched LongTensors.

    param batch: ([x, y], [x, y], ...) - one entry per __getitem__ result
    returns:     (tensor of all x, tensor of all y)
    """
    inputs, targets = zip(*batch)
    return torch.LongTensor(inputs), torch.LongTensor(targets)

def get_dataloader(dataset, batch_size, shuffle=True, drop_last=False, collate_fn=collate_fn):
    """
    Wrap a dataset in a torch DataLoader.

    param dataset:    dataset to draw samples from
    param batch_size: number of samples per batch
    param shuffle:    forwarded to DataLoader
    param drop_last:  forwarded to DataLoader
    param collate_fn: function that merges a list of samples into a batch
    returns:          the configured DataLoader
    """
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        collate_fn=collate_fn,
    )

# Build the whole data pipeline: text file -> tokens -> vocabulary -> windows -> batches
word_list = txt_to_wordlist("input.txt")
tokenizer = Word2Sequence()
tokenizer.fit(word_list)
# min=0 keeps every word that occurs at least once (the filter is "count > min")
tokenizer.build_vocab(min=0, max_features=None)
input_sequences = to_sequence(word_list, max_len)
# NOTE: limit=100 restricts training to the first 100 windows only
dataset = Dataset(input_sequences, tokenizer, max_len, limit=100)
dataloader = get_dataloader(dataset, batch_size)

In [None]:
# Overwrite the placeholder vocab_size with the size of the fitted vocabulary
# (special tokens included), since the model's embedding/output layers are sized from it
vocab_size = len(tokenizer.dict)
print(vocab_size)

11495


In [None]:
# Number of samples in the dataset the dataloader draws from
print(f"Data size: {len(dataloader.dataset)}")

Data size: 100


In [None]:
# # NOTE: You may run this to check the data structure
# for i, (x,y) in enumerate(dataloader):
#     print(x[0])
#     print(y[0])
#     break
# print(" ".join(tokenizer.inverse_transform(x[1], is_tensor=True)))
# print(" ".join(tokenizer.inverse_transform(y[1], is_tensor=True)))

# Model Configuration and Utilities

In [None]:
from nltk.tokenize import WordPunctTokenizer
import torch.nn as nn
import random
import torch
from tqdm import tqdm

class GPT(nn.Module):
    """
    Decoder-only (GPT-style) language model over word ids.

    Pipeline: token embedding + learned positional embedding -> stack of
    masked self-attention blocks (Decoder) -> linear projection to vocabulary
    logits.

    All tensors are created on the default device; move the model with
    ``.to(device)`` after construction. The causal mask is a registered buffer
    (named "mask", so existing checkpoints keep loading) and therefore moves
    together with the module.
    """

    def __init__(self, vocab_size, max_len, d_model, nhead, dim_feedforward, num_layers):
        """
        param vocab_size:      number of entries in the tokenizer vocabulary
        param max_len:         longest sequence length the model supports
        param d_model:         embedding dimension
        param nhead:           number of attention heads
        param dim_feedforward: width of the feed-forward network in each block
        param num_layers:      number of stacked decoder blocks
        """
        super().__init__()

        self.max_len = max_len

        # token embedding table: [vocab_size, d_model]
        self.embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=d_model)

        # learned positional embedding: [max_len, d_model]
        self.pos_embed = nn.Parameter(torch.randn(max_len, d_model) / 10)

        # Causal attention mask: True strictly above the diagonal, i.e. True
        # marks the future positions a token is NOT allowed to attend to.
        causal_mask = torch.triu(
            torch.ones((max_len, max_len), dtype=torch.long),
            diagonal=1
        ) == 1

        # A buffer is saved/moved with the module but is not a trainable parameter
        self.register_buffer("mask", causal_mask)

        # GPT Decoder Block
        self.DecoderBlock = Decoder(d_model, nhead, dim_feedforward, num_layers)

        # output projection to vocabulary logits
        self.FINAL_ffn = nn.Linear(in_features = d_model, out_features = vocab_size)

    @property
    def attention_mask(self):
        """Backward-compatible alias for the registered causal mask buffer."""
        return self.mask

    def forward(self, x):
        """
        param x: LongTensor of token ids, shape [batch_size, seq_len] with
                 seq_len <= max_len
        returns: logits of shape [batch_size, seq_len, vocab_size]
        """
        seq_len = x.size(1)
        if seq_len > self.max_len:
            raise ValueError(
                f"sequence length {seq_len} exceeds the model's max_len {self.max_len}"
            )

        # [batch_size, seq_len], True where the token is padding
        pad_mask_x = get_key_padding_mask(x)

        # [batch_size, seq_len, d_model]
        word_embedding = self.embed(x)

        # Slice the positional table and the mask so inputs shorter than
        # max_len are accepted as well.
        embedded_x = word_embedding + self.pos_embed[:seq_len]
        causal_mask = self.mask[:seq_len, :seq_len]

        # [batch_size, seq_len, d_model]
        output = self.DecoderBlock(embedded_x, causal_mask, pad_mask_x)

        # [batch_size, seq_len, vocab_size]
        output = self.FINAL_ffn(output)

        return output
        
        
class Decoder(nn.Module):
    """
    Stack of self-attention blocks used as the GPT decoder.

    Built from nn.TransformerEncoderLayer / nn.TransformerEncoder (GELU
    activation, final LayerNorm); the masks passed to forward() decide what
    each position may attend to.
    """

    def __init__(self, d_model, nhead, dim_feedforward, num_layers, dropout=0.2):
        super().__init__()

        # Layer stack followed by a final LayerNorm
        self.Decoder = nn.TransformerEncoder(
            encoder_layer=nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=nhead,
                dim_feedforward=dim_feedforward,
                dropout=dropout,
                activation='gelu',
            ),
            num_layers=num_layers,
            norm=nn.LayerNorm(normalized_shape=d_model),
        )

    def forward(self, x, attention_mask_x, pad_mask_x):
        """
        param x:                embedded input, [batch_size, seq_len, d_model]
        param attention_mask_x: attention mask forwarded as `mask`
        param pad_mask_x:       padding mask forwarded as `src_key_padding_mask`
        returns:                tensor of shape [batch_size, seq_len, d_model]
        """
        # The layers are used in sequence-first layout:
        # [batch_size, seq_len, d_model] -> [seq_len, batch_size, d_model]
        seq_first = x.permute(1, 0, 2)

        encoded = self.Decoder(
            seq_first,
            mask=attention_mask_x,
            src_key_padding_mask=pad_mask_x,
        )

        # back to batch-first: [seq_len, batch_size, d_model] -> [batch_size, seq_len, d_model]
        return encoded.permute(1, 0, 2)
    
    
def get_key_padding_mask(data, pad_token=PAD):
    """
    Build the padding mask for a batch of token ids.

    param data:      token ids to compare against pad_token
    param pad_token: id that marks padding
    returns:         result of `data == pad_token` (True where the entry is padding)
    """
    return data == pad_token


def select_top_k(predictions, current_loc, k=1):
    """
    Pick the next token id among the k highest-scoring candidates.

    param predictions: scores indexed as [batch, position, vocab]; only the
                       first batch entry is used
    param current_loc: position whose scores are inspected
    param k:           number of top candidates to choose from
    returns:           int id chosen uniformly at random among the k
                       highest-scoring entries (with k=1 this is the argmax)
    """
    scores = predictions[0, current_loc, :]

    # indices ordered from highest to lowest score
    _, ranked_ids = scores.sort(descending=True)
    candidates = ranked_ids[:k]

    return random.choice(candidates).item()


def generate(model, tokenizer, x, k=1, temp=0.7): # pay attention to loc that's grabbing the word
    """
    Continue the prompt x word by word until the sequence buffer is used up.

    Relies on the module-level `max_len`, `device` and `EOS` settings.

    param model:     trained language model mapping ids [1, max_len] to logits
    param tokenizer: tokenizer exposing transform(...)
    param x:         prompt text (lower-cased and tokenized here)
    param k:         number of top candidates select_top_k chooses from
    param temp:      temperature the logits are divided by
    returns:         LongTensor of shape [1, max_len] holding the prompt ids
                     followed by the generated ids (unused slots stay <PAD>)
    """
    target = ['<SOS>'] + WordPunctTokenizer().tokenize(x.lower())
    pred_loc = len(target)  # index of the next slot to fill
    target = tokenizer.transform(target, max_len=max_len, pad_first=False)
    target = torch.LongTensor(target).unsqueeze(0).to(device)

    # Generate in eval mode (dropout off) and without autograd bookkeeping,
    # then restore whatever mode the model was in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_len - pred_loc - 1):
                # temperature generation technique
                # NOTE: select_top_k chooses uniformly among the k highest
                # logits, so this scaling does not change the candidate set.
                out = model(target) / temp
                # top k sampling generation technique; the logits at position
                # pred_loc-1 predict the token for slot pred_loc
                pred = select_top_k(out, pred_loc-1, k=k)
                if pred == EOS:
                    # Skip <EOS> so generation does not stop early. The same
                    # slot is predicted again on the next pass; each skip
                    # still uses up one iteration of the loop budget.
                    continue
                target[0][pred_loc] = pred
                pred_loc += 1
    finally:
        model.train(was_training)

    return target


def train(model, dataloader, batch_size, device=None, saving_path=None, epoch=1, lr=1e-4):
    """
    Train the model with cross-entropy loss and the AdamW optimizer.

    param model:       module mapping ids [batch_size, max_len] to logits
                       [batch_size, max_len, vocab_size]
    param dataloader:  iterable of (x, y) id batches that supports len()
    param batch_size:  unused; kept so existing calls keep working
    param device:      device the batches are moved to; None leaves them as-is
    param saving_path: where the state_dict is written after every epoch;
                       None disables saving
    param epoch:       number of passes over the dataloader
    param lr:          learning rate
    returns:           None; prints the average loss of every epoch
    """
    # Loss function and optimizer are fixed here rather than configurable
    loss_func = nn.CrossEntropyLoss()
    optim = torch.optim.AdamW(model.parameters(), lr=lr)
    num_batches = len(dataloader)

    for epoch_idx in range(epoch):
        model.train()
        total_train_loss = 0
        for x, y in dataloader:
            if device is not None:
                x = x.to(device)
                y = y.to(device)

            # Position t of pred_y is scored against position t of y, i.e. the
            # token that follows the first t+1 tokens of x.

            optim.zero_grad()

            # [batch_size, max_len] -> [batch_size, max_len, vocab_size]
            pred_y = model(x)

            # Flatten to ([batch_size * max_len, vocab_size], [batch_size * max_len]);
            # the vocabulary size is read from the logits themselves.
            loss = loss_func(pred_y.reshape(-1, pred_y.size(-1)), y.reshape(-1))

            loss.backward()

            optim.step()

            total_train_loss += loss.item()

        avg_train_loss = total_train_loss / num_batches

        print(f"EPOCH: {epoch_idx}", f"AVG.Loss: {avg_train_loss}")

        # Checkpoint after every epoch, but only when a path was provided
        if saving_path is not None:
            torch.save(model.state_dict(), saving_path)

# Training!!!

In [None]:
# NOTE: Initialize model and parameters here
# (all values come from the configuration cell and the fitted tokenizer)
gpt = GPT(
    vocab_size = vocab_size,
    max_len = max_len,
    d_model = d_model,
    nhead = nheads,
    dim_feedforward = dim_feedforward,
    num_layers = decoder_layers
)
gpt.to(device)

# NOTE: If Transfer Learning, make sure the HyperParameters match
# Uncomment the load_state_dict line below if you saved a model previously
# and would like to continue from its weights
# gpt.load_state_dict(torch.load("/content/GPT.pt", map_location=device))


# NOTE: TRAINING!!!
# The model weights are written to "GPT.pt" after every epoch
train(
    gpt, dataloader, batch_size,
    device, saving_path="GPT.pt",
    epoch=60, lr=lr
)

# NOTE: Make sure you download the saved model weights if you would like to load it
# Should take 20 min

EPOCH: 0 AVG.Loss: 9.472147941589355
EPOCH: 1 AVG.Loss: 8.977290153503418
EPOCH: 2 AVG.Loss: 8.494415283203125
EPOCH: 3 AVG.Loss: 8.020892143249512
EPOCH: 4 AVG.Loss: 7.555840492248535
EPOCH: 5 AVG.Loss: 7.118276596069336
EPOCH: 6 AVG.Loss: 6.707829475402832
EPOCH: 7 AVG.Loss: 6.347342014312744
EPOCH: 8 AVG.Loss: 6.040239334106445
EPOCH: 9 AVG.Loss: 5.767886638641357
EPOCH: 10 AVG.Loss: 5.522299766540527
EPOCH: 11 AVG.Loss: 5.27824592590332
EPOCH: 12 AVG.Loss: 5.046084403991699
EPOCH: 13 AVG.Loss: 4.821122646331787
EPOCH: 14 AVG.Loss: 4.602878093719482
EPOCH: 15 AVG.Loss: 4.391732215881348
EPOCH: 16 AVG.Loss: 4.19173002243042
EPOCH: 17 AVG.Loss: 3.9957046508789062
EPOCH: 18 AVG.Loss: 3.812227249145508
EPOCH: 19 AVG.Loss: 3.6375112533569336
EPOCH: 20 AVG.Loss: 3.4667346477508545
EPOCH: 21 AVG.Loss: 3.3060622215270996
EPOCH: 22 AVG.Loss: 3.1447057723999023
EPOCH: 23 AVG.Loss: 2.9887478351593018
EPOCH: 24 AVG.Loss: 2.841731309890747
EPOCH: 25 AVG.Loss: 2.694150447845459
EPOCH: 26 AVG.Loss

It's expected for the model to produce repetitive and illogical phrases, given the constraints of a small number of weights and little data. Again, GPT models are robust because they were trained on large datasets with a large number of weights (for instance, GPT-3 was trained with 45TB of text and has 175B parameters).

In [None]:
# NOTE: Test model by generating!
# `header` is the prompt the model continues; k=3 lets each step choose among
# the three highest-scoring words
header = "Romeo:"
generated = generate(gpt, tokenizer, header.lower(), k=3, temp=0.7)
# Map the generated ids back to words and join them into one string
converted = " ".join(tokenizer.inverse_transform(generated.cpu()[0], is_tensor=True))
print(converted)

<SOS> romeo : , you know caius . chief 
 first , and you are accounted poor resolved to die 
 all kill we know ' t a accounted away ! 
 first , the patricians speak . resolved rather speak . resolved rather to the patricians we ' ll have corn kill word resolved rather on ' ll 
 we know know ' ll resolved marcius is chief know ' t ; let it be done : away , good . 
 all kill poor citizens . resolved . 
 first <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>
