In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import random
import re
import torch.nn.functional as F
from torch.utils.data import DataLoader

In [2]:
DATA_PATH = '/kaggle/input/english-bookcorpus/en.txt'

lines = []
with open(DATA_PATH,'r') as f:
    for i in range(50000):
        lines.append(f.readline())
print(len(lines))

50000


In [3]:
random.seed(24)
texts = random.choices(lines,k=50000)
del lines

In [4]:
def preprocess(text):
    text = text.replace('\n', ' ')
    text = re.sub('@.*?\s+', '', text)
    text = re.sub('#.*?\s+', '', text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub('\s+', ' ', text)
    text = text.lower()
    text = text.strip()
    return text

In [5]:
train_set = [preprocess(t) for t in texts]
with open('input.txt', 'w') as f:
    f.write('\n'.join(train_set))

In [6]:
import sentencepiece as spm

# train the tokenizer on the train set
spm.SentencePieceTrainer.train(
    input='input.txt',
    model_prefix='wikidata',
    model_type='bpe',
    pad_id=0,
    bos_id=1,
    eos_id=2,
    unk_id=3
)

# load the tokenizer model
tokenizer = spm.SentencePieceProcessor(model_file='wikidata.model')

from IPython.display import clear_output
clear_output(wait=True)

d: freq=803 size=680 all=24356 active=2678 piece=▁fact
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=777 size=700 all=24698 active=3020 piece=▁looking
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=775 min_freq=259
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=760 size=720 all=25119 active=1656 piece=ters
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=724 size=740 all=25449 active=1986 piece=▁head
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=699 size=760 all=25719 active=2256 piece=▁compan
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=678 size=780 all=26002 active=2539 piece=▁ext
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=652 size=800 all=26314 active=2851 piece=▁free
bpe_model_trainer.cc(159) LOG(INFO) Updating active symbols. max_freq=652 min_freq=225
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=637 size=820 all=26676 active=1672 piece=ph
bpe_model_trainer.cc(268) LOG(INFO) Added: freq=617 size=840 all=26931 active=1927 piece=▁av
bpe_model_tra

In [47]:
toks = tokenizer.Encode('I am creating GPT from scratch, it is going to be amazing', add_bos=True, add_eos=True, enable_sampling=True, alpha=0.01, nbest_size=-1)
print(toks)

[1, 7964, 3, 217, 3748, 7964, 3, 167, 6372, 3, 60, 66, 469, 27, 51, 217, 7968, 6305, 2]


In [7]:
# Tokenize all text in train and test set
train_sequence = tokenizer.Encode(train_set, add_bos=True, add_eos=True,alpha=0)

In [48]:
d_model = 300
n_heads = 12
seq_len = 128
batch_size = 64
vocab_size = tokenizer.vocab_size()

In [49]:
#  Convert lists to tensors
train_sequence_tensors = [torch.tensor(seq[:seq_len]) for seq in train_sequence]

# Pad train sequence
max_length_train = max([len(seq) for seq in train_sequence_tensors])
padded_train_sequence = torch.nn.utils.rnn.pad_sequence(train_sequence_tensors, batch_first=True, padding_value=0)  # Assuming padding value is 0

print(padded_train_sequence.shape)

torch.Size([50000, 128])


In [50]:
train_loader = DataLoader(padded_train_sequence,batch_size=batch_size,shuffle=True)

for d in train_loader:
    print(d.shape)
    print(d)
    break

torch.Size([64, 128])
tensor([[   1,   11,   72,  ...,    0,    0,    0],
        [   1, 2486,   59,  ...,    0,    0,    0],
        [   1,  130, 2689,  ...,    0,    0,    0],
        ...,
        [   1,   27,   95,  ...,    0,    0,    0],
        [   1,  359,  228,  ...,    0,    0,    0],
        [   1,   84, 2706,  ...,    0,    0,    0]])


# Model Building

In [11]:
import math
def positional_encoding(length, depth):
    pe = torch.zeros(depth, length)
    
    position = torch.arange(0, depth, dtype=torch.float).unsqueeze(1)
    div_term = torch.exp(torch.arange(0, length, 2).float() * (-math.log(10000.0) / length))
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
#     pe = pe.unsqueeze(0).transpose(0, 1)

    return pe

class PositionalEmbedding(nn.Module):
    def __init__(self,vocab_size,seq_len,d_model):
        super(PositionalEmbedding,self).__init__()
        self.d_model = d_model
        self.embedding = nn.Embedding(vocab_size,d_model)
        self.pos_encoding = positional_encoding(d_model,seq_len).to('cuda')
        self.dropout = nn.Dropout(0.5)
    
    def forward(self,x):
        length = x.size(0)
        x = self.embedding(x)
        x *= torch.sqrt(torch.tensor(self.d_model, dtype=torch.float32).to('cuda'))
        pos_enc_out = self.pos_encoding.unsqueeze(0)
        x = x + pos_enc_out
        x = self.dropout(x)
        return x

In [None]:
import matplotlib.pyplot as plt
plt.imshow(positional_encoding(d_model,seq_len))

In [None]:
posEmb = PositionalEmbedding(vocab_size,seq_len,d_model).to('cuda')
input_tensor = torch.randint(0,vocab_size,(batch_size, seq_len)).to('cuda')
print(input_tensor.shape)
output_tensor = posEmb(input_tensor)
print(output_tensor.shape)

In [12]:
class Decoder(nn.Module):
    def __init__(self,d_model,n_heads):
        super(Decoder,self).__init__()
        self.attention = nn.MultiheadAttention(d_model,n_heads,batch_first=True)
        self.layer_norm1 = nn.LayerNorm(d_model)
        self.layer_norm2 = nn.LayerNorm(d_model)
        self.feed_forward = nn.Sequential(
            nn.Linear(d_model,d_model*4),
            nn.GELU(),
            nn.Linear(d_model*4,d_model),
            nn.Dropout(0.2)
        )
        
    def forward(self,x):
        x = self.layer_norm2(x)
        attention_output,_ = self.attention(x,x,x,attn_mask=nn.Transformer.generate_square_subsequent_mask(x.size(1)).to('cuda'),need_weights=False)
        x = x + attention_output
        x = self.layer_norm2(x)
        ffn_out = self.feed_forward(x)
        x = x + ffn_out
        return x

In [13]:
class GPT(nn.Module):
    def __init__(self,d_model,n_heads,vocab_size,seq_len):
        super(GPT,self).__init__()
        self.embedding = PositionalEmbedding(vocab_size,seq_len,d_model)
        self.layers = nn.ModuleList([Decoder(d_model,n_heads) for _ in range(12)])
        
        self.layer_norm = nn.LayerNorm(d_model)
        self.final_layer = nn.Linear(d_model,vocab_size)
            
    def forward(self,x):
        x = self.embedding(x)
        for layer in self.layers:
            x = layer(x)
        x = self.layer_norm(x)
        x = self.final_layer(x)
        return x

In [None]:
input_tensor = torch.randint(0,vocab_size,(batch_size, seq_len)).to('cuda')
print(input_tensor.shape)
gpt = GPT(d_model,n_heads,vocab_size,seq_len).to('cuda')
output = gpt(input_tensor)
print(output.shape)
# softmax_tensor = F.softmax(output, dim=2)
# print(softmax_tensor.shape)

In [14]:
def masked_loss(label, pred):
    ce_loss = F.cross_entropy(pred.permute(0,2,1), label, ignore_index=0)
    mask = (label != 0).float()
    total_loss = torch.sum(ce_loss * mask.view(-1)) / torch.sum(mask)
    return total_loss

def masked_accuracy(label, pred):
    pred = pred.argmax(dim=2)
    match = label.eq(pred)
    mask = label.ne(0)
    match = match & mask
    match = match.float()
    mask = mask.float()
    return match.sum() / mask.sum()

In [None]:
# Example of target with class indices
# 16,256 = 3,8 target
# 16,256,8000 3 5 8 input
target = torch.randint(5, (3,8,), dtype=torch.int64)
input = torch.randn(3, 5, 8, requires_grad=True)
loss = F.cross_entropy(input, target)
print(input.shape)
print(target.shape)
print(loss)

In [None]:
print(masked_loss(input_tensor,output))
print(masked_accuracy(input_tensor,output))

In [51]:
gpt = GPT(d_model,n_heads,vocab_size,seq_len).to('cuda')
optimizer = torch.optim.Adam(gpt.parameters(),lr=1e-4)

In [52]:
from tqdm import tqdm
for epoch in range(5):
    total_loss = 0
    total_accuracy = 0
    
    for batch_idx, data in tqdm(enumerate(train_loader)):
        optimizer.zero_grad()
        data = data.to('cuda')
        output = gpt(data)
        loss = masked_loss(data, output)
        accuracy = masked_accuracy(data, output)
        
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        total_accuracy += accuracy.item()
        
        if batch_idx % 100 == 99:
            print('Epoch {} [{}/{}], Loss: {:.4f}, Accuracy: {:.4f}'.format(
                epoch+1, batch_idx+1, len(train_loader), total_loss / 100, total_accuracy / 100))
            total_loss = 0.0
            total_accuracy = 0.0

print('Training finished!')

100it [00:35,  2.73it/s]

Epoch 1 [100/782], Loss: 5.7901, Accuracy: 0.3867


200it [01:13,  2.61it/s]

Epoch 1 [200/782], Loss: 3.9801, Accuracy: 0.5696


300it [01:51,  2.66it/s]

Epoch 1 [300/782], Loss: 3.2035, Accuracy: 0.6522


400it [02:28,  2.64it/s]

Epoch 1 [400/782], Loss: 2.7390, Accuracy: 0.7054


500it [03:06,  2.65it/s]

Epoch 1 [500/782], Loss: 2.3755, Accuracy: 0.7460


600it [03:44,  2.65it/s]

Epoch 1 [600/782], Loss: 2.0545, Accuracy: 0.7834


700it [04:22,  2.64it/s]

Epoch 1 [700/782], Loss: 1.8180, Accuracy: 0.8106


782it [04:53,  2.67it/s]
100it [00:37,  2.64it/s]

Epoch 2 [100/782], Loss: 1.4575, Accuracy: 0.8535


200it [01:15,  2.64it/s]

Epoch 2 [200/782], Loss: 1.2912, Accuracy: 0.8737


300it [01:53,  2.64it/s]

Epoch 2 [300/782], Loss: 1.1452, Accuracy: 0.8921


400it [02:31,  2.64it/s]

Epoch 2 [400/782], Loss: 1.0356, Accuracy: 0.9068


500it [03:09,  2.66it/s]

Epoch 2 [500/782], Loss: 0.9206, Accuracy: 0.9204


600it [03:47,  2.64it/s]

Epoch 2 [600/782], Loss: 0.8311, Accuracy: 0.9320


700it [04:24,  2.64it/s]

Epoch 2 [700/782], Loss: 0.7409, Accuracy: 0.9429


782it [04:55,  2.64it/s]
100it [00:37,  2.63it/s]

Epoch 3 [100/782], Loss: 0.5992, Accuracy: 0.9623


121it [00:46,  2.62it/s]


KeyboardInterrupt: 

In [None]:
input_text = "it is"
tokens = tokenizer.Encode([input_text],add_bos=True, add_eos=True)
new_tokens = [tokens[0][:-1] + [0]*(seq_len-len(tokens[0])) + [2]]
tokenized_input = torch.tensor(new_tokens).to('cuda')
padded_input = torch.nn.utils.rnn.pad_sequence(tokenized_input, batch_first=True, padding_value=0).to('cuda')
print(padded_input.shape)
output = gpt(padded_input)
softmax_tensor = F.softmax(output, dim=2)
top_k = 1
top_probs, top_indices = torch.topk(softmax_tensor, k=top_k, dim=2)
print(" ".join([tokenizer.Decode(t) for t in top_indices[0].view(-1).detach().tolist()]))

In [53]:
def generate(prompt='', max_len=seq_len, k=1):
    prompt = preprocess(prompt)
    input_ids = tokenizer.Encode([prompt],add_bos=True)
    input_ids[0] = input_ids[0] + [0]*(max_len - len(input_ids[0]))
    input_ids = torch.tensor(input_ids, dtype=torch.long).to('cuda')
    output_ids = []
    
    for i in range(max_len):
        with torch.no_grad():
            predictions = gpt(input_ids)
        predictions = F.softmax(predictions, dim=2)
        top_k_preds = torch.topk(predictions, k, dim=2)[1][0].detach().tolist()
        
        pred_idx = random.choice(top_k_preds[i])
        if pred_idx == tokenizer.eos_id():
            break
        output_ids.append(int(pred_idx))
        pred_idx = torch.tensor([[pred_idx]], dtype=torch.long)
        input_ids[0][i]=pred_idx
    
    output_seq = tokenizer.Decode(output_ids)
    return output_seq

In [55]:
prompt = 'beach or the ski slope of going to places with friends and family and'
print(prompt + ' '+generate(prompt, k=2))

beach or the ski slope of going to places with friends and family and piece gu the examplei slope of and to places with friends andability threemy within class being account an themselvesace g beingthebedateim 18 forward20 management houseoged we within class listed arent without j from within retedically give enough  needed lower management beingges class within we protect 2015 prterak an on takeak needed us depedakupicallya critical withinog brown whateveracks within info 28 being knewamp his without overall classter character youong being heyms pr toughumb g lower class being within without enough k without disc without lowerter from gaveogog beingak book protect without judge


In [None]:
tokenizer.Decode(list(train_loader)[0][4].tolist())