# Base embedding table

Adding the path of the custom libraries to `sys.path`

In [1]:
import sys
import os


# Resolve the shared library folder, located two levels above the current
# working directory, so that the `utils` package can be imported below.
dirname = os.path.abspath(os.path.join(os.getcwd(), "..", "..", "scripts", "lib"))

# Re-running this cell must not pile up duplicate entries in sys.path.
if dirname not in sys.path:
    sys.path.append(dirname)

## Setting Hyperparameters

In [2]:
# Context length: the maximum number of tokens contained in one block
block_size: int = 8

# Fraction of the total data reserved for the test/validation set
test_train_split_ratio: float = 0.1

## Importing libraries

In [3]:
import torch

from utils.compile import compileFolder
from utils.tokenizer import CharTokenizer, END_CHAR
from utils.datasets import TextChunksDataset, split_dataset, get_batch

## Setting up the data, the tokenizer and the dataset

In [4]:
# Importing the data from the 'tate' source
# (presumably a folder whose files are compiled into one text — confirm in utils.compile)
raw_data = compileFolder('tate')

# Creating the tokenizer from the raw data
# (assumed to build a character-level vocabulary from it — verify in utils.tokenizer)
tokenizer = CharTokenizer(raw_data)

# Tokenizing and creating the dataset object; block_size is passed as the chunk size
# (assumed to be the number of tokens per sample — confirm in utils.datasets)
data = TextChunksDataset(raw_data, block_size, tokenizer)

In [5]:
# Split the dataset using the ratio declared with the other hyperparameters,
# rather than a duplicated literal that could silently drift out of sync.
train_data, test_data = split_dataset(data, test_train_split_ratio)

## Creating the model class

In [6]:
import torch.nn as nn
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):
    """Bigram language model backed by a single embedding table.

    Row ``i`` of the ``(vocab_size, vocab_size)`` table holds the logits of
    the next token given that the current token is ``i``, so each prediction
    depends only on the token at the same position.
    """

    def __init__(self, vocab_size: "int | CharTokenizer | TextChunksDataset"):
        """Create the embedding table.

        Args:
            vocab_size: Either the vocabulary size itself, a tokenizer
                (its ``len()`` is used), or a dataset (the ``len()`` of its
                ``tokenizer`` attribute is used).
        """
        super().__init__()

        # Plain integers are used as-is. isinstance (rather than an exact
        # type comparison) lets subclasses of the project types work too.
        if not isinstance(vocab_size, int):
            if isinstance(vocab_size, TextChunksDataset):
                vocab_size = len(vocab_size.tokenizer)
            elif isinstance(vocab_size, CharTokenizer):
                vocab_size = len(vocab_size)

        # each token has a probability distribution of appearing depending on the last token
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: (B, T) tensor of integer token ids.
            targets: Optional (B, T) tensor of integer target ids.

        Returns:
            A ``(logits, loss)`` tuple. Without targets, ``logits`` has shape
            (B, T, C) and ``loss`` is ``None``. With targets, ``logits`` is
            returned flattened to (B*T, C) and ``loss`` is the cross-entropy
            against the flattened targets.
        """
        logits = self.token_embedding_table(idx)  # (B,T,C)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) targets, so the
        # batch and time dimensions are merged.
        B, T, C = logits.shape
        logits = logits.view(B * T, C)
        targets = targets.view(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling needs no gradients; avoids building an autograd graph
    def generate(self, idx, max_new_tokens: int):
        """Extend ``idx`` by sampling ``max_new_tokens`` tokens one at a time.

        Args:
            idx: (B, T) tensor of integer token ids used as the starting context.
            max_new_tokens: Number of tokens to append to each sequence.

        Returns:
            (B, T + max_new_tokens) tensor with the sampled tokens appended.
        """
        for _ in range(max_new_tokens):
            # get the predictions
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :]  # (B, C)
            # apply softmax over the vocabulary dimension to get the probabilities
            probs = F.softmax(logits, dim=-1)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1)  # (B, 1)
            # append sampled token to the running sequence
            idx = torch.cat((idx, idx_next), dim=1)  # (B, T+1)
        return idx
        

In [7]:
# Sanity check: run the freshly initialised model on the first ten training samples
torch.manual_seed(89)
m = BigramLanguageModel(train_data)
xb, yb = train_data[:10]
out = m(xb, yb)
# out is the (logits, loss) pair: show the logits' shape and the loss, one per line
print(out[0].shape, out[1], sep="\n")

torch.Size([80, 80])
tensor(4.8054, grad_fn=<NllLossBackward0>)


In [8]:

# Sample 100 new tokens starting from a single token with id 0, then decode them to text
print(
    tokenizer.decodeText(
        m.generate(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=100,
        )[0]
    )
)


3!!UvlMHTU.Sf=NBelz89HkL:Tz!HwuF=cim>.B/:/:8kFrPuKcNF%3H5fVea-81"2ySkTZpaHJkDgqjBAghxaq=1'3XUC4,eR Y


The result is random characters, because the model has not been trained yet.

## Training the model

In [9]:
# AdamW optimizer over all of the model's parameters, with a learning rate of 1e-3
optimizer = torch.optim.AdamW(
    params=m.parameters(),
    lr=1e-3,
)

In [10]:
# Training configuration; note that each iteration below performs a single
# optimisation step on one sampled batch
batch_size = 32
num_epochs = 100

# verbose: print the loss once every this many iterations
show_loss_each_epoch = 10

for steps in range(num_epochs):
    # start every iteration with cleared gradients
    optimizer.zero_grad(set_to_none=True)

    # sample a batch of data and run the forward pass
    xb, yb = get_batch(train_data, batch_size)
    logits, loss = m(xb, yb)

    # backward pass and parameter update
    loss.backward()
    optimizer.step()

    # only report on every show_loss_each_epoch-th iteration
    if (steps + 1) % show_loss_each_epoch != 0:
        continue
    print('loss :', loss.item())

loss : 4.918609619140625
loss : 4.963029384613037
loss : 4.9680352210998535
loss : 4.9516215324401855
loss : 4.852038383483887
loss : 4.976519584655762
loss : 4.953675746917725
loss : 4.949826717376709
loss : 4.8634185791015625
loss : 4.948612213134766


In [11]:
# Sample 100 new tokens again, now from the trained model, and decode them to text
print(
    tokenizer.decodeText(
        m.generate(
            idx=torch.zeros((1, 1), dtype=torch.long),
            max_new_tokens=100,
        )[0]
    )
)


xL10UJvyQ5N8Rm:cPKlPz%Fj!8FL uagbvvIX,YZKcl!PJlr'W-DehzJlQDfOfcmpHFv=!%vQK.l soM…T1Vg6C,%qDmI:!PN2H*


> **Note**: So far, this model only looks at the previous character to predict the next one, so it has no wider context.

To use this model directly, you can import it from `transformers.educational`

In [12]:
from transformers.educational import BigramLanguageBaseModel