In [2]:
import numpy as np
import torch
import torch.nn as nn
from torch.nn import functional as F
import matplotlib.pyplot as plt

In [6]:
dataset = open('dataset.txt', 'r', encoding='utf-8').read()
print(len(dataset))
print(dataset[:100])

1115394
First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You


In [9]:
# Find tokens

chars = sorted(set(dataset))
dictionary = ''.join(chars)
print(f"Dataset character level dictionary is: {dictionary}")
print(f"Length of the dictionary is: {len(dictionary)}")

Dataset character level dictionary is: 
 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
Length of the dictionary is: 65


In [11]:
# Tokenizer
char_id = {s:i for i, s in enumerate(dictionary)}
id_char = {i:s for i, s in enumerate(char_id)}
encoder = lambda s: [char_id[c] for c in s]
decode = lambda i: ''.join([id_char[c] for c in i])

In [28]:
encoded_dataset = torch.tensor(encoder(dataset), dtype=torch.long)
print(encoded_dataset.shape, encoded_dataset.dtype)

torch.Size([1115394]) torch.int64


In [6]:
# Split
train_length = int(0.9 * len(encoded_dataset))
print(train_length)
train_dataset = encoded_dataset[:train_length]
validation_dataset = encoded_dataset[train_length:]
print(train_dataset.shape)
print(validation_dataset.shape)

1003854
torch.Size([1003854])
torch.Size([111540])


In [158]:
SEQUENCE_LENGTH = 8
EMBEDDING_DIM = 8
BATCH_SIZE = 16
EPOCHS = 10
VOCAB_SIZE = len(dictionary)

In [8]:
def make_data(data):
    x = []
    y = []
    d = data.tolist()
    for i in range(len(d) // SEQUENCE_LENGTH):
        x.append(d[i*8 : (i+1)*8])
        y.append(d[i*8 +1 : (i+1)*8 + 1])
    #return torch.tensor(x), torch.tensor(y)
    return torch.tensor(x).reshape(-1, BATCH_SIZE, SEQUENCE_LENGTH), torch.tensor(y).reshape(-1, BATCH_SIZE, SEQUENCE_LENGTH)

X_train, y_train = make_data(encoded_dataset)
print(f"Size of the training data is: {X_train.shape}")
print(f"Size of the training label is: {y_train.shape}")

Size of the training data is: torch.Size([8714, 16, 8])
Size of the training label is: torch.Size([8714, 16, 8])


If we have:<br>
target = [2, 0, 1]<br>
logits = [[0.5, 0.2, 0.3], [0.1, 0.1, 0.8], [0.4, 0.2, 0.4]]<br><br>
then cross entropy loss, automatically gets the indice 2 of the first logits probability and 0th index of the second one and so on ... <br>
Since our labels are not one hotted, cross entropy does the job automatically for us <br>
That's why flattening is okay and is also handled in backpropagate

In [191]:
# make bigram model

class BiGram(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, x):
        return self.token_embedding_table(x)
    
    def calc_loss(self, logits, target):
        # logits = self(x)
        return F.cross_entropy(logits.view(-1, logits.shape[-1]), target.view(-1))
          
    def generate(self, x, max_new_tokens):
        for i in range(max_new_tokens):
            logits = self(x)
            probs = F.softmax(logits, dim=1)
            x_next = torch.multinomial(probs, 1, replacement=True)[0]
            x = torch.cat((x, x_next), dim=0)
    
        return x
    
    def train(self, X_train, y_train, epochs=10):
        optimizer = torch.optim.AdamW(self.parameters(), lr=1e-3)
        epoch_loss = 0
        for i in range(epochs):
            print(f"[INFO]  epoch {i}, loss: {epoch_loss/X_train.shape[0]}")
            epoch_loss = 0
            for b in range(X_train.shape[0]):
                logits = self(X_train[b])
                loss = self.calc_loss(logits, y_train[b])
                epoch_loss+=loss
                optimizer.zero_grad(set_to_none=True)
                loss.backward()
                optimizer.step()
                # print(f"loss: {loss.item()}")
                
        
    
bigram_model = BiGram(VOCAB_SIZE)
logits = bigram_model((X_train[0]))
print(f"shape of logits: {logits.shape}")
print(f"shape of labels: {y_train[0].shape}")
bigram_model_loss = bigram_model.calc_loss(logits, y_train[0])
print(f"loss is: {bigram_model_loss}") # if we had to guess randomly the loss would have been -ln(1/65) = 4.18
print(f"model's generated text: {decode(bigram_model.generate(torch.tensor([4]), 100).tolist())}")


shape of logits: torch.Size([16, 8, 65])
shape of labels: torch.Size([16, 8])
loss is: 4.495141506195068
model's generated text: &sC$'X?rrv$xws$Jx&'HyXuCu$JqXO-:$$w$$$sWvJTH,$wxusOv-v$Y,hs
SA$$vGsUHyQxs$jsxPm?spvvyc$CYX$sPZ'm$xYAz


In [192]:
bigram_model.train(X_train[:1000], y_train[:1000], epochs=20)

[INFO]  epoch 0, loss: 0.0
[INFO]  epoch 1, loss: 4.168179035186768
[INFO]  epoch 2, loss: 3.4456942081451416
[INFO]  epoch 3, loss: 2.994004487991333
[INFO]  epoch 4, loss: 2.7325074672698975
[INFO]  epoch 5, loss: 2.586735486984253
[INFO]  epoch 6, loss: 2.5063488483428955
[INFO]  epoch 7, loss: 2.4611153602600098
[INFO]  epoch 8, loss: 2.434145212173462
[INFO]  epoch 9, loss: 2.4170758724212646
[INFO]  epoch 10, loss: 2.4058172702789307
[INFO]  epoch 11, loss: 2.39817476272583
[INFO]  epoch 12, loss: 2.3928632736206055
[INFO]  epoch 13, loss: 2.3890953063964844
[INFO]  epoch 14, loss: 2.386380434036255
[INFO]  epoch 15, loss: 2.384387254714966
[INFO]  epoch 16, loss: 2.382903575897217
[INFO]  epoch 17, loss: 2.3817837238311768
[INFO]  epoch 18, loss: 2.380929708480835
[INFO]  epoch 19, loss: 2.3802695274353027


In [196]:
logits = bigram_model((X_train[0]))
bigram_model.calc_loss(logits, y_train[0])
print(f"model's generated text: {decode(bigram_model.generate(torch.tensor([0]), 100).tolist())}")

model's generated text: 

MOL

dBWC
TyS
H

tWbIIVBACMANTTT
ACAWLBFYAWSBATBF


AWYYW'TSNB

OWWWOTy
tTAFI
BtS
WAOVAp
A
WAAMTO
A
