In [41]:
import os
import requests
# Fetch the tiny-shakespeare corpus once; later cells read it from 'input.txt'.
if not os.path.exists('input.txt'):
    print('Downloading input.txt...')
    url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    # A timeout keeps the cell from hanging forever on a stalled connection.
    r = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of saving an error page as the corpus.
    r.raise_for_status()
    # Write the raw bytes so the file is stored exactly as served; the
    # tokenizer cell decodes it as UTF-8 when it opens 'input.txt'.
    with open('input.txt', 'wb') as out_file:
        out_file.write(r.content)

In [32]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [33]:
# hyperparameters
batch_size = 32 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?
max_iters = 3000 # number of optimizer steps in the training loop
eval_interval = 300 # evaluate train/val loss every this many steps
learning_rate = 1e-2
# Seed PyTorch's global RNG so batch sampling, weight initialisation and
# text generation are repeatable after a kernel restart.
seed = 1337
torch.manual_seed(seed)
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200 # number of random batches averaged per loss estimate

Character-based tokenizer: we encode each character as an integer.
We decode the integers back to characters to recover the original text.

In [34]:
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()
# here are all the unique characters that occur in this text
chars = sorted(set(text))
vocab_size = len(chars)
# create a mapping from characters to integers (and the inverse mapping)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, output a list of integers.

    Each character is looked up in ``stoi``; a character that does not occur
    in the corpus raises ``KeyError``.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integers, output a string.

    Each integer is looked up in ``itos``; an id outside the vocabulary
    raises ``KeyError``.
    """
    return ''.join(itos[i] for i in l)


# example: encoding then decoding returns the original string
ex = 'You are a douch bag!'
decode(encode(ex))

'You are a douch bag!'

Split into training and validation sets

In [35]:
# Train and test splits
# Encode the whole corpus once as a 1-D tensor of integer token ids.
data = torch.tensor(encode(text), dtype=torch.long)
# Cut point: the leading 90% of tokens is for training, the remainder for validation.
n = int(0.9*len(data))
train_data, val_data = data[:n], data[n:]

Next, we randomly sample a batch of text from the training set and use it to train the model. The model will predict the next character in the sequence.
We will use the predicted character to calculate the cross entropy loss and update the model weights.

In [36]:
# data loading
def get_batch(split):
    """Draw a random batch of (input, target) windows from one data split.

    ``split == 'train'`` samples from ``train_data``; any other value samples
    from ``val_data``.

    Returns a pair ``(x, y)`` of tensors of shape (batch_size, block_size),
    both moved to ``device``. ``y`` holds the same windows as ``x`` shifted one
    position to the right, i.e. the next character for every input position.
    """
    source = val_data if split != 'train' else train_data
    # One random window start per sequence in the batch.
    starts = torch.randint(len(source) - block_size, (batch_size,))
    # (batch_size, 1) + (block_size,) broadcasts to the index of every
    # position inside every window.
    positions = starts.unsqueeze(1) + torch.arange(block_size)
    inputs = source[positions]
    targets = source[positions + 1]
    return inputs.to(device), targets.to(device)

We calculate the average loss over eval_iters random batches.

In [37]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the train and validation splits.

    For each split, ``eval_iters`` random batches are drawn with ``get_batch``
    and their losses averaged. The global ``model`` is put in eval mode for
    the measurement and switched back to train mode before returning.

    Returns a dict with keys 'train' and 'val', each mapping to a 0-dim
    tensor holding the mean loss.
    """
    def _batch_loss(split_name):
        # Loss of the model on one freshly sampled batch, as a Python float.
        inputs, targets = get_batch(split_name)
        _, batch_loss = model(inputs, targets)
        return batch_loss.item()

    model.eval()
    averages = {}
    for split_name in ('train', 'val'):
        samples = torch.tensor([_batch_loss(split_name) for _ in range(eval_iters)])
        averages[split_name] = samples.mean()
    model.train()
    return averages

The model consists of only an embedding table.
![embed.png](../embed.png)
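idx is an input to the embedding model, its dimension is (B,T) where each element is an integer.
logits is the output of the model, its dimension is (B,T,D) where D is the embedding dimension. Here D equals the vocabulary size, so each embedding row is read directly as the scores for the next token.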
target is the target of the model, its dimension is (B,T) where each element is an integer.
The loss is calculated by comparing the logits with the target.

## generation
We generate token by token, we feed the model with the previous token to get the next token.
We repeat this process until we reach the maximum length.
![generate.png](../generate.png)


In [38]:
class LanguageModel(nn.Module):
    '''
    A very simple language model that only looks at the previous one token.

    The only learnable parameter is a (vocab_size, vocab_size) embedding
    table: row i holds the next-token logits used when the current token is i.
    '''
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size,embedding_dim=vocab_size)

    def forward(self, idx, targets=None):
        '''
        Look up next-token logits and, if targets are given, the loss.

        idx and targets are both (B,T) tensors of integers.

        Returns (logits, loss):
          - targets is None: logits is (B,T,C) and loss is None.
          - otherwise: logits is flattened to (B*T,C) and loss is the
            cross entropy between those logits and the flattened targets.
        C is the embedding dimension, which equals vocab_size here.
        '''
        logits = self.token_embedding_table(idx) # (B,T,C)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            # F.cross_entropy expects (N,C) logits and (N,) class indices,
            # so fold the batch and time dimensions together.
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        '''
        Extend idx by sampling max_new_tokens tokens, one at a time.

        idx is a (B,T) tensor of indices in the current context; the
        returned tensor is (B,T+max_new_tokens).
        '''
        for _ in range(max_new_tokens):
            # get the predictions
            logits, loss = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

Train the model!!!

In [39]:
# The model class defined in this notebook is LanguageModel.
model = LanguageModel(vocab_size)
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` rather than `iter`, so the builtin iter() is not shadowed.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    # clear gradients left over from the previous step before backpropagating
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



step 0: train loss 4.7282, val loss 4.7314
step 300: train loss 2.8069, val loss 2.8287
step 600: train loss 2.5422, val loss 2.5627
step 900: train loss 2.5025, val loss 2.5158
step 1200: train loss 2.4841, val loss 2.5061
step 1500: train loss 2.4762, val loss 2.4927
step 1800: train loss 2.4616, val loss 2.4911
step 2100: train loss 2.4607, val loss 2.4931
step 2400: train loss 2.4685, val loss 2.4944
step 2700: train loss 2.4641, val loss 2.4879


In [40]:
# generate from the model
# Seed the generation with a single sequence containing only token id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
sampled = m.generate(context, max_new_tokens=500)
# Only one sequence was generated; turn its token ids back into text.
print(decode(sampled[0].tolist()))


Whimyity t I Be y t tr m Bren asond t himat?-buil, t:
Whod.
wd w tie he Th aitormul chad cceord IO: OUESof mmy, mpey wh rt IVI ws pacthontlir t IINNTy:
3ullped
Lirbe rerar Se, shZEThentheace outen!
LAUSI arer is tl welak:
Fain te'd, th INThe catabrul thour
ESTh nor w st for En abe ve ou thisout torind n
ithall t wirnod ind blle sstothy, f weiseeouppl an's oubrdeckiessan:
LIONThing?'
HEEThe tengreO ar d dordape ge le is l, s me hel pl THe g y tilathete, Gowatod y tomy my eroimy w IO e wourelother
