In [25]:
import nltk
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [76]:
import torch
from torch import nn
from torch.nn import functional as F
import requests
from nltk.tokenize import word_tokenize
import re

class BigramLanguageModel(nn.Module):
    def __init__(self, batch_size=4, input_length=8, train_iters=100, eval_iters=100):
        super().__init__()
        self.device = 'cuda' if torch.cuda.is_available() else 'cpu'
        # input_length = how many consecutive tokens/chars in one input
        self.input_length = input_length
        # batch_size = how many inputs are going to be processed in-parallel (on GPU)
        self.batch_size = batch_size
        # train_iters = how many training iterations
        self.train_iters= train_iters
        # eval_iters = how many batches to evaluate to get average performance
        self.eval_iters = eval_iters

    def forward(self, inputs, targets=None):
        logits = self.token_embeddings_table(inputs)
        # print(logits.shape)
        # logits are estimated model parameters
        # for each input of context_size, there are vocab_size parameters to be estimated
        if targets is None:
            loss = None
        else:
            batch_size, input_length, vocab_size = logits.shape
            logits = logits.view(batch_size * input_length, vocab_size)
            targets = targets.view(batch_size * input_length)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    def fit(self, learning_rate=0.001):
        optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
        for iter in range(self.train_iters):
            if iter % (self.train_iters//20) == 0:
                avg_loss = self.eval_loss()
                print(f"iter {iter} train {avg_loss['train']} val {avg_loss['eval']}")
            inputs, targets = self.get_batch(split='train')
            logits, loss = self(inputs, targets)
            optimizer.zero_grad(set_to_none=True)  # clear gradients of previous step
            loss.backward()  # propagate loss back to the each unit in the network
            optimizer.step()  # update network parameters w.r.t the loss

        # print(loss.item())

    def generate(self, context, max_new_tokens):
        inputs = context
        for _ in range(max_new_tokens):
            # forward pass, targets None, loss None
            logits, _ = self(inputs)
            # only last char/time-step is needed
            logits = logits[:, -1, :]
            # softmax logits to get probability distribution
            probs = F.softmax(logits, dim=1)
            # sample
            sampled_output = torch.multinomial(probs, num_samples=1)
            # append the sampled_output to running outputs
            inputs = torch.cat((inputs, sampled_output), dim=1)
        output_text = self.decoder(inputs[0].tolist())
        return output_text

    @torch.no_grad() # tell torch not to prepare for back-propagation
    def eval_loss(self):
        perf = {}
        # set dropout and batch normalization layers to evaluation mode before running inference.
        self.eval()
        for split in ['train', 'eval']:
            losses = torch.zeros(self.eval_iters)
            for k in range(self.eval_iters):
                inputs, targets = self.get_batch(split)  # get random batch of inputs and targete
                logits, loss = self(inputs, targets)  # forward pass
                losses[k] = loss.item()  # the value of loss tensor as a standard Python number
            perf[split] = losses.mean()
        self.train() # turn-on training mode-
        return perf

    def prep(self, text):
        vocab = sorted(list(set(text)))
        self.vocab_size = len(vocab)
        # look-up table for
        self.token_embeddings_table = nn.Embedding(self.vocab_size, self.vocab_size)

        ctoi = {c: i for i, c in enumerate(vocab)}  # char c to integer i map. assign value i for every word in vocab
        itoc = {i: c for c, i in ctoi.items()}  # integer i to char c map

        # print(ctoi)
        # print(itoc)

        self.encoder = lambda text: [ctoi[c] for c in text]
        self.decoder = lambda nums: ''.join([itoc[i] for i in nums])

        n = len(text)
        self.train_text = text[:int(n * 0.9)]
        self.val_text = text[int(n * 0.9):]

        self.train_data = torch.tensor(self.encoder(self.train_text), dtype=torch.long)
        self.val_data = torch.tensor(self.encoder(self.val_text), dtype=torch.long)

    def prep_tokens(self, text):
        text = re.sub(r'\s+', ' ', text)  # Normalize spaces
        tokens = word_tokenize(text.lower())
        tokens.append(' ')
        vocab = set(tokens)
        print(vocab)
        self.vocab_size = len(vocab)
        # look-up table for
        self.token_embeddings_table = nn.Embedding(self.vocab_size, self.vocab_size)

        ctoi = {c: i for i, c in enumerate(vocab)}  # token c to integer i map. assign value i for every word in vocab
        itoc = {i: c for c, i in ctoi.items()}  # integer i to token c map

        # print(ctoi)
        # print(itoc)

        self.encoder = lambda text: [ctoi[c] for c in tokens]
        self.decoder = lambda nums: ' '.join([itoc[i] for i in nums])

        n = len(tokens)
        self.train_text = tokens[:int(n * 0.9)]
        self.val_text = tokens[int(n * 0.9):]

        self.train_data = torch.tensor(self.encoder(self.train_text), dtype=torch.long)
        self.val_data = torch.tensor(self.encoder(self.val_text), dtype=torch.long)

    def get_batch(self, split='train'):
        data = self.train_data if split == 'train' else self.val_data
        ix = torch.randint(len(data) - self.input_length,
                           (self.batch_size,))  # get random chunks of length batch_size from data
        inputs_batch = torch.stack([data[i:i + self.input_length] for i in ix])
        targets_batch = torch.stack([data[i + 1:i + self.input_length + 1] for i in ix])
        inputs_batch = inputs_batch.to(self.device)  # deploy to GPU is available
        targets_batch = targets_batch.to(self.device)# deploy to GPU is available
        return inputs_batch, targets_batch

def fetch_text_from_url(url):
    """Fetches raw text from a given URL."""
    response = requests.get(url)
    response.raise_for_status()
    return response.text
url = "https://www.gutenberg.org/files/1342/1342-0.txt"  # Example: Pride and Prejudice
text = fetch_text_from_url(url)

text = """a quick brown fox jumps over the lazy dog.
          lazy dog and a quick brown fox.
          the dog is lazy and the fox jumps quickly.
          a fox jumps over the dog because he is lazy.
          dog is lazy and fox is brown. she quickly jumps over the lazy dog.
          the brown fox watches the lazy dog before jumping.
          a lazy dog sleeps under the tree while the fox waits.
          the quick fox sees the dog resting and leaps past him.
          a small fox chases the dog, but he is too slow.
          the dog barks at the fox, but she is already gone.
          over the fence, the fox jumps while the dog sighs.
          a sleepy dog ignores the fox playing nearby.
          the fox teases the lazy dog, who refuses to move.
          under the bright moon, the fox runs and the dog yawns.
          the brown fox leaps higher than the sleepy dog can see.
          beside the river, the lazy dog naps as the fox splashes.
          a clever fox waits until the dog closes his eyes before running.
          the dog stretches and yawns while the fox rushes past.
          the fox circles the dog, but he remains still and calm.
          a quick fox dashes through the grass, leaving the lazy dog behind.
          """

print(torch.__version__)

model = BigramLanguageModel(batch_size=32,
                            input_length=8,
                            train_iters=5000)
model = model.to(model.device)
model.prep_tokens(text)
input_batch, output_batch = model.get_batch(split='train')
# print(input_batch)
# print(output_batch)

logits, loss = model(input_batch, output_batch)
# print(logits.shape)
# print(logits)
# print(loss)

2.5.1+cu124
{'he', 'chases', 'sleeps', 'leaps', 'brown', 'gone', 'waits', 'sleepy', 'dog', 'because', 'bright', 'over', 'to', 'fox', 'and', 'at', '.', ' ', 'barks', 'sees', 'calm', 'she', 'nearby', 'until', 'watches', 'see', 'as', 'quick', 'is', 'through', 'dashes', 'already', 'can', 'moon', 'remains', 'refuses', 'before', 'tree', 'who', 'move', 'slow', 'yawns', 'past', 'beside', 'quickly', 'river', 'rushes', 'under', 'than', 'him', 'small', 'runs', 'closes', 'splashes', ',', 'fence', 'stretches', 'his', 'while', 'but', 'teases', 'eyes', 'running', 'grass', 'lazy', 'ignores', 'circles', 'jumping', 'playing', 'naps', 'jumps', 'still', 'the', 'behind', 'resting', 'clever', 'sighs', 'a', 'leaving', 'too', 'higher'}


In [77]:
import numpy as np
outputs = model.generate(context=torch.zeros((1, 1), dtype=torch.long, device=model.device),
                         max_new_tokens=100)
print(outputs)
print(f"Vocab size {model.vocab_size}, CE: {-np.log(1/model.vocab_size)}")
model.fit(learning_rate=0.1)

he is sleepy river . see leaving because sleeps quickly , before bright the dog slow bright quickly at . rushes behind , waits waits eyes ignores closes until yawns until yawns the leaps a tree calm leaps sees stretches grass through grass   leaving brown see leaving already a stretches who see stretches behind waits at dashes , move she jumps yawns over leaving circles chases gone fence he to under at dashes bright remains   who at because slow higher who refuses fence gone higher over bright leaps fox behind and chases too still moon closes while running at
Vocab size 81, CE: 4.394449154672439
iter 0 train 4.77914571762085 val 4.7747883796691895
iter 250 train 1.2424339056015015 val 1.2333190441131592
iter 500 train 1.2285927534103394 val 1.222877860069275
iter 750 train 1.2245844602584839 val 1.221291422843933
iter 1000 train 1.2233067750930786 val 1.2344472408294678
iter 1250 train 1.2214137315750122 val 1.2291940450668335
iter 1500 train 1.2301043272018433 val 1.221910834312439
it

In [80]:
outputs = model.generate(context=torch.zeros((1, 1), dtype=torch.long, device=model.device), max_new_tokens=100)
print(outputs)

he is brown fox rushes past . the fox watches the dog naps as the brown fox is already gone . dog can see . the dog ignores the grass , but he is lazy and the fox waits until the sleepy dog is lazy dog is too slow . the fox waits . the fox waits until the brown fox sees the dog because he is too slow . she quickly . beside the grass , leaving the brown . a quick brown fox jumps over the fox rushes past . the sleepy dog barks at the grass , but
