# Base embedding table

Adding the path to the custom libraries so they can be imported

In [1]:
import sys
import os


# Absolute path of the custom library folder, resolved two directory levels
# above the current working directory.
dirname = os.path.abspath(os.path.join(os.getcwd(), "..", "..", "scripts", "lib"))

# Register the folder on the import path only once, so re-running this cell
# does not keep appending duplicate entries to sys.path.
if dirname not in sys.path:
    sys.path.append(dirname)

## Setting Hyperparameters

In [2]:
# Maximum block size (i.e. the maximum context length), measured in tokens
block_size: int = 8

# Fraction of the total data reserved for the test/validation set
test_train_split_ratio: float = 0.1

## Importing libraries

In [3]:
import torch

from utils.compile import compileFolder
from utils.tokenizer import CharTokenizer, END_CHAR
from utils.datasets import TextChunksDataset, split_dataset, get_batch

## Setting up the data, the tokenizer and the dataset

In [4]:
# Load the raw data from the 'tate' folder through the project helper
# (presumably the text of every file in that folder — confirm in utils.compile)
raw_data = compileFolder('tate')

# Build the tokenizer from the raw data
# (character-level, judging by the class name — confirm in utils.tokenizer)
tokenizer = CharTokenizer(raw_data)

# Tokenize the raw data and wrap it in the dataset object; `block_size` is the
# context length configured in the hyperparameter cell
data = TextChunksDataset(raw_data, block_size, tokenizer)

In [5]:
# Use the `test_train_split_ratio` hyperparameter instead of repeating the
# ratio as a literal, so changing the hyperparameter actually changes the split.
train_data, test_data = split_dataset(data, test_train_split_ratio)

## Creating the model class

In [6]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    Every token id indexes one row of a ``(vocab_size, vocab_size)`` embedding
    table, and that row is used directly as the logits for the next token. The
    prediction for a position therefore depends only on the token at that
    position.
    """

    def __init__(self, vocab_size: int):
        """Create the embedding table.

        Args:
            vocab_size: Number of distinct tokens. For convenience a
                ``TextChunksDataset`` (the length of its ``tokenizer`` is
                used) or a ``CharTokenizer`` (its length is used) is also
                accepted.
        """
        super().__init__()

        # isinstance (instead of an exact type comparison) also accepts
        # subclasses of the dataset and tokenizer classes.
        if isinstance(vocab_size, TextChunksDataset):
            vocab_size = len(vocab_size.tokenizer)
        elif isinstance(vocab_size, CharTokenizer):
            vocab_size = len(vocab_size)
        # each token has a probability distribution of appearing depending on the last token
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the training loss.

        Args:
            idx: ``(B, T)`` tensor of integer token ids.
            targets: Optional ``(B, T)`` tensor of integer target token ids.

        Returns:
            A ``(logits, loss)`` tuple. Without ``targets`` the logits have
            shape ``(B, T, C)`` and ``loss`` is ``None``. With ``targets`` the
            logits are returned flattened to ``(B*T, C)`` and ``loss`` is the
            cross-entropy between them and the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B,T,C)

        if targets is None:
            return logits, None

        # cross_entropy expects (N, C) logits and (N,) targets, so the batch
        # and time dimensions are merged.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        # reshape (not view) so non-contiguous targets, e.g. a transposed
        # tensor, are accepted as well.
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens: int):
        """Extend ``idx`` by sampling ``max_new_tokens`` additional tokens.

        Args:
            idx: ``(B, T)`` tensor of integer token ids used as the start of
                the sequence.
            max_new_tokens: Number of tokens to sample and append.

        Returns:
            A ``(B, T + max_new_tokens)`` tensor holding the original ids
            followed by the sampled ones.
        """
        # Sampling never needs gradients; disabling autograd avoids building
        # a graph on every step.
        with torch.no_grad():
            for _ in range(max_new_tokens):
                # get the predictions (no targets, so the loss is None)
                logits, _loss = self(idx)
                # focus only on the last time step -> (B, C)
                logits = logits[:, -1, :]
                # apply softmax over the vocabulary dimension to get probabilities
                probs = F.softmax(logits, dim=-1)
                # sample from the distribution
                idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
                # append sampled token to the running sequence
                idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
        

In [7]:
# Fix the random seed so the initial weights (and this output) are reproducible
torch.manual_seed(89)

# Build the model from the training split and run the first 10 items through it
m = BigramLanguageModel(train_data)
xb, yb = train_data[:10]
out = m(xb, yb)

# Show the shape of the returned logits, then the loss, one per line
print(out[0].shape, out[1], sep="\n")

torch.Size([80, 80])
tensor(5.1642, grad_fn=<NllLossBackward0>)


In [8]:

# Start from a single sequence holding only token id 0 and sample 100 more tokens
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=start_idx, max_new_tokens=100)

# Decode the first (and only) sequence of the batch back to text
print(tokenizer.decodeText(sampled[0]))


3!!UvlMHTU.Sf=NBelz89HkL:Tz!HwuF=cim>.B/:/:8kFrPuKcNF%3H5fVea-81"2ySkTZpaHJkDgqjBAghxaq=1'3XUC4,eR Y


The result is a string of random characters, because the model has not been trained yet.

## Training the model

In [9]:
# Create a PyTorch AdamW optimizer over all of the model's parameters
learning_rate = 1e-3
optimizer = torch.optim.AdamW(m.parameters(), lr=learning_rate)

Let's add a function to estimate training and validation losses

In [10]:
@torch.no_grad()
def estimate_loss(model, x_train: TextChunksDataset, x_val: TextChunksDataset, eval_iterations=50, batch_size=32):
    """Estimate the mean loss of ``model`` on the training and validation data.

    For each of the two datasets, ``eval_iterations`` batches of ``batch_size``
    items are drawn with ``get_batch`` and the losses returned by the model are
    averaged. Gradients are disabled for the whole call.

    Args:
        model: Module called as ``model(X, Y)`` and returning ``(_, loss)``.
        x_train: Dataset used for the ``'train'`` estimate.
        x_val: Dataset used for the ``'val'`` estimate.
        eval_iterations: Number of batches averaged per dataset.
        batch_size: Number of items per batch.

    Returns:
        A dict with keys ``'train'`` and ``'val'``, each mapping to the mean
        loss as a 0-dim tensor.
    """
    # Remember the current mode so it can be restored afterwards instead of
    # unconditionally forcing the model back into training mode.
    was_training = model.training
    model.eval()
    out = {}
    try:
        for label, split in (('train', x_train), ('val', x_val)):
            losses = torch.zeros(eval_iterations)
            for k in range(eval_iterations):
                X, Y = get_batch(split, batch_size)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[label] = losses.mean()
    finally:
        # Restore the previous mode even if evaluation raised.
        model.train(was_training)
    return out



In [11]:
# Training configuration
batch_size = 32
num_epochs = 1000  # number of iterations of the loop below (one batch each)

# verbose: the estimated losses are printed every `show_loss_each_epoch` iterations
show_loss_each_epoch = 100

for steps in range(num_epochs):
    # sample a batch of training data
    xb, yb = get_batch(train_data, batch_size)

    # clear the previous gradients, then run forward, backward and the update
    optimizer.zero_grad(set_to_none=True)
    logits, loss = m(xb, yb)
    loss.backward()
    optimizer.step()

    # skip the report unless this is a reporting iteration
    if (steps + 1) % show_loss_each_epoch != 0:
        continue

    losses = estimate_loss(m, train_data, test_data, batch_size=batch_size)
    print(f"step {steps}: train loss {losses ['train']:.4f}, val loss {losses ['val']:.4f}")

step 99: train loss 4.8407, val loss 4.8641
step 199: train loss 4.7100, val loss 4.7254
step 299: train loss 4.6005, val loss 4.5998
step 399: train loss 4.4951, val loss 4.4947
step 499: train loss 4.3807, val loss 4.3918
step 599: train loss 4.2687, val loss 4.2840
step 699: train loss 4.1695, val loss 4.1794
step 799: train loss 4.0673, val loss 4.1025
step 899: train loss 3.9953, val loss 4.0208
step 999: train loss 3.9112, val loss 3.9341


In [12]:
# Sample 100 tokens from the trained model, starting from a single token id 0
start_idx = torch.zeros((1, 1), dtype=torch.long)
sampled = m.generate(idx=start_idx, max_new_tokens=100)

# Decode the first (and only) sequence of the batch back to text
print(tokenizer.decodeText(sampled[0]))


ND%IuyND=%-/u'Gam6w Hl74.=e.>do=bsows0B'/
CHCR>$hzN/GoX2CpRdi:/GGYeeefRUR93BMW-X%5R…XPqYa
1Rzg24*xt


> **Note** : So far, this model only looks at the previous character, so there is no context

To use this model directly, you can import it from `models.educational`, where it is available as `BigramLanguageBaseModel`:

In [13]:
from models.educational import BigramLanguageBaseModel

And to use the `estimate_loss` function directly, import it from `utils.datasets`:

In [14]:
from utils.datasets import estimate_loss