Download dataset

In [1]:
import os
import requests
# Fetch the Tiny Shakespeare corpus once; a later cell reads it back from
# 'input.txt', so the response has to be persisted to disk here.
if not os.path.exists('input.txt'):
    print('Downloading input.txt...')
    url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    # requests has no default timeout; set one so a stalled connection cannot hang the notebook
    r = requests.get(url, timeout=30)
    # fail loudly instead of saving an HTTP error page as the dataset
    r.raise_for_status()
    # write the raw bytes so the saved file is byte-identical to the upstream copy
    with open('input.txt', 'wb') as f:
        f.write(r.content)

In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
# hyperparameters

# -- batching --
batch_size = 32  # number of independent sequences processed in parallel
block_size = 8   # maximum context length used for a prediction

# -- optimisation --
learning_rate = 0.01
max_iters = 3000

# -- evaluation --
eval_interval = 300  # the losses are estimated every `eval_interval` training steps
eval_iters = 200     # number of random batches averaged per loss estimate

# run on the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

Character based tokenizer, we encode the characters into integers.
We decode the integers back to characters to get the original text.

In [4]:
# load the raw corpus
with open('input.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# the vocabulary is every distinct character of the corpus, in sorted order
chars = sorted(set(text))
vocab_size = len(chars)

# lookup tables between characters and their integer ids
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Encoder: take a string, output the list of its character ids."""
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of character ids, output the string they spell."""
    return ''.join(itos[i] for i in l)


# vocab_size
print(f'The vocab size of this tokenizer is {len(stoi)}')
# example
ex = 'I am a language model.'
print(f'The encoding is {encode(ex)}')

The vocab size of this tokenizer is 65
The encoding is [21, 1, 39, 51, 1, 39, 1, 50, 39, 52, 45, 59, 39, 45, 43, 1, 51, 53, 42, 43, 50, 8]


Split into training and validation sets

In [5]:
# Train / validation split: encode the whole corpus once, then cut it at 90%.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9*len(data))  # index of the first validation token
train_data, val_data = data[:n], data[n:]

Next, we randomly sample a batch of text from the training set and use it to train the model. The model will predict the next character in the sequence.   
Since this is simply a classification problem, we will use the predicted character to calculate the cross entropy loss and update the model weights.

In [6]:
# data loading
def get_batch(split):
    """Sample a random batch of (input, target) sequences from one split.

    `split` selects the training data when it equals 'train'; any other
    value selects the validation data.

    Returns a pair (x, y) of tensors stacked from `batch_size` windows of
    length `block_size`, both moved to `device`. Each row of y is the
    corresponding row of x shifted one position to the right, i.e. the next
    character for every position of the input.
    """
    if split == 'train':
        source = train_data
    else:
        source = val_data
    # random start offsets, chosen so that the shifted target window still fits
    starts = torch.randint(len(source) - block_size, (batch_size,))
    inputs = torch.stack([source[s:s + block_size] for s in starts])
    targets = torch.stack([source[s + 1:s + block_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

We calculate the average loss over eval_iters batches

In [7]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss on the training and validation splits.

    For each split, `eval_iters` random batches are drawn with `get_batch`
    and their losses averaged. The model is switched to eval mode for the
    measurement and put back into train mode before returning.

    Returns a dict with keys 'train' and 'val' holding the mean loss of
    each split as a scalar tensor.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_iters)
        for k in range(eval_iters):
            _, batch_loss = model(*get_batch(split))
            per_batch[k] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train()
    return averages

The model is only composed of an embedding table.  
![embed.png](images/embed.png)  
idx is the input to the model (embedding table); its dimension is (B,T), where each element is an integer.  
logits is the output of the model (embedding table); its dimension is (B,T,D), where D is the embedding dimension (here equal to the vocabulary size, so each position gets one score per character).  
targets is the target of the model; its dimension is (B,T), where each element is an integer.  
The loss is calculated by comparing the logits with the target.  

## Generation
We generate token by token: we feed the model the previous token to get the next token.
We then append the next token to the sequence and feed it back to the model to get the following token.
Since each output is fed back as the next input, this type of model is called an "autoregressive" model.  
![generate.png](images/generate.png)


In [8]:
class LanguageModel(nn.Module):
    '''
    A very simple (bigram) language model: the prediction for the next token
    depends only on the single previous token.

    The only parameters are a (vocab_size, vocab_size) embedding table; row i
    holds the logits over the next token given that the current token is i.
    '''
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(num_embeddings=vocab_size, embedding_dim=vocab_size)

    def forward(self, idx, targets=None):
        '''
        Look up next-token logits and, optionally, compute the loss.

        idx:     (B,T) tensor of integer token ids.
        targets: optional (B,T) tensor of integer token ids.

        Returns (logits, loss):
          - targets is None: logits has shape (B,T,D) and loss is None.
          - otherwise: logits is flattened to (B*T,D) and loss is the
            cross entropy between the flattened logits and targets.
        '''
        logits = self.token_embedding_table(idx) # (B,T,D)
        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N,D) logits and (N,) class indices
            B, T, D = logits.shape
            logits = logits.view(B*T, D)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling is never back-propagated through; skip autograd bookkeeping
    def generate(self, idx, max_new_tokens):
        '''
        Autoregressively sample max_new_tokens tokens after the context idx.

        idx: (B,T) tensor of integer token ids in the current context.

        Returns a (B, T+max_new_tokens) tensor: the original context followed
        by the sampled tokens.
        '''
        for _ in range(max_new_tokens):
            # The logits at a position depend only on the token at that
            # position (plain embedding lookup), so feeding the last token
            # yields the same prediction as feeding the whole context.
            logits, _ = self(idx[:, -1:]) # (B,1,D)
            # drop the time dimension
            logits = logits[:, -1, :] # becomes (B,D)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B,D)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B,1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B,T+1)
        return idx

Train the model!!!

In [9]:
model = LanguageModel(vocab_size)
# nn.Module.to() returns the module itself, so `m` is the same object as
# `model`, now living on `device`; the generation cell uses `m`.
m = model.to(device)
# create a PyTorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# `step` rather than `iter`: a top-level loop variable stays in the kernel
# namespace, and `iter` would shadow the builtin for every later cell.
for step in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if step % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    # clear stale gradients, back-propagate, then update the weights
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



step 0: train loss 4.6081, val loss 4.6170
step 300: train loss 2.7880, val loss 2.8145
step 600: train loss 2.5439, val loss 2.5592
step 900: train loss 2.5028, val loss 2.5097
step 1200: train loss 2.4776, val loss 2.5049
step 1500: train loss 2.4661, val loss 2.5011
step 1800: train loss 2.4701, val loss 2.4964
step 2100: train loss 2.4638, val loss 2.4870
step 2400: train loss 2.4643, val loss 2.4906
step 2700: train loss 2.4686, val loss 2.4949


In [10]:
# generate from the model
# start from a single sequence holding only token id 0 (the first character
# of the sorted vocabulary)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# sample 500 new tokens, then decode the one generated sequence back to text
generated = m.generate(context, max_new_tokens=500)
print(decode(generated[0].tolist()))



TINUShot wicqusird ief wicouareyofinot he' be: ha
'ensk
Wh at tourith ke.
'!
D ithour cejequs t my bll,

there s, emybr le Whe t west handn BRI lllll, w athonoues:
Haneveddure mest fug?
kse h'ear hy s y apowhen te n to f it
Fane lly avormyodZAPlor.
Twasairiritust, bavers to whalungh whathalanteave con t y iot's
K:
TETyobexbelestharille f lomponNILes R:
clpr weat iver athe adyour t aryshin by II og; w wncor:

Anohowqug to bFarsur win
Thay bll tQUCoreathe por!
Wher hancur mikeryourerallfre ves he


: 