In [None]:
import torch 
# Here we test for whether a GPU is available since it will make processing much faster
device = "cuda" if torch.cuda.is_available() else "cpu"

In [15]:
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Defining the possible set of characters that the model can see
# Character level tokenisers give us very long sequences but very small codebooks 
# and simple decode and encode functions
characters = sorted(list(set(text)))
vocabulary_size = len(characters)
print("".join(characters))
print(vocabulary_size)

print("\nTokenisation")

# Tokenise means converting the raw text into a sequence of integers
# There are many other ways to define schemas for tokenisation
stoi = {char:i for i, char in enumerate(characters)}
itos = {i:char for i, char in enumerate(characters)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[value] for value in l])

print(encode("Testing encoding"))
print(decode(encode("Testing encoding")))
# Sentence piece is what google uses, a subword tokeniser, we are not encoding entire words or individual characters
# Tiktoken is what gpt uses and has very good performance, much faster than the hugging face tokeniser

"""
import tiktoken 
enc = tiktoken.get_encoding("gpt2")
enc.n_vocab
enc.encode("testing encoding")
"""


 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65

Tokenisation
[32, 43, 57, 58, 47, 52, 45, 1, 43, 52, 41, 53, 42, 47, 52, 45]
Testing encoding


'\nimport tiktoken \nenc = tiktoken.get_encoding("gpt2")\nenc.n_vocab\nenc.encode("testing encoding")\n'

In [18]:
# Creating and splitting our dataset
data = torch.tensor(encode(text), dtype=torch.long)
print(data.shape)

# Now we can split our dataset
n = int(0.9 * len(data))
train_dataset = data[:n]
validation_dataset = data[n:]

print(train_dataset)
print(validation_dataset)

torch.Size([1115394])
tensor([18, 47, 56,  ..., 43, 56, 43])
tensor([12,  0,  0,  ..., 45,  8,  0])


In [82]:
# Creating a dataloader
# Block size / context length of training our data
# We want the transformer that we train to be used to seeing a number of characters from 1 to block size this is especially useful for inference

# Batch size is how many independent sequences we see in the forward / backward pass of our transformer
# Batch size just dictates how many independent sequences we can process in parallel
batch_size = 32
block_size = 8

def get_batch(split):
    # Choose the dataset that we will be splitting into batches
    data = train_dataset if split == "train" else validation_dataset

    # Make sure that we limit from the length - block size since that is the lasdt index that can ge that sequence length
    ix = torch.randint(len(data) - block_size, (batch_size, ))
    # the target dataset is just shifted up by 1
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])

    # If we are testing for GPUs we have to make sure that our dataloader takes it to the device
    x, y = x.to(device), y.to(device)

    return x, y 

xb, yb = get_batch("train")

for batch in range(batch_size):
    for block in range(block_size):
        context = xb[batch, :block_size+1]
        target = yb[batch, block]

In [35]:
import torch 
import torch.nn as nn 
from torch.nn import functional as F

class BigramLanguageModel(nn.Module):

    def __init__(self, vocabulary_size):
        super().__init__()

        # Using nn.Embedding we can create a very simple bigram language model that creates an embedding table for counting
        # Creates a tensor of shape vocabulary_size x vocabulary_size
        self.token_embedding_table = nn.Embedding(vocabulary_size, vocabulary_size)

    def forward(self, index, targets = None):
        # Batch, Time, Channel (batch, block, channel) - B, T, C is the scores for our sequence based on the identity of an individual token
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            # We have the B, T, C tensor - pytorch wants a B, C, T tensor instead as an input so we have to reshape our logits
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            # Negative log likelikehod is implemented and called the cross entropy loss
            loss = F.cross_entropy(logits, targets)

        return logits, loss 
    
    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, loss = self(index)
            # Get the last element in our time dimension as this is the element that we have to predict and that comes next - this wil leventaully be appended to our time context
            logits = logits[:, -1, :]
            # after the get the logits of our predictions we can softmax them
            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            # We now concatenate our values and now we have the batch but with teh T values with their new concatenated values
            index = torch.cat((index, index_next), dim=1)
        return index
            
    
m = BigramLanguageModel(vocabulary_size)
out, loss = m(xb, yb)

# Because we have 65 charactes in our dataset we should expect during initialisation that our cross entropy / negative log likelihood under a uniform distribution should be 
# - ln(1/65) which is around 4.17
print(out.shape)
print(loss.item())

torch.Size([32, 65])
4.5199713706970215


In [83]:
# We can use our generation that continuously appends indexes to eachotehr and decode the values that we generated
# print(decode().tolist())
# decode(m.generate(index = torch.zeros((1, 1), dtype = torch.long), max_new_tokens=500)[0].tolist())

# We can estimate our loss whilst we are trainig our model

evaluation_iterations = 100

# Decorator that prevents gradient updates so we can run inference on our model
@torch.no_grad()
def loss_estimation():
    # Dictionary storing the averages for both split kinds
    out = {}
    m.eval()
    # Going through both the splits, then saving the number of evaluation iterations on a prezeroed tensor
    # Updating the losses for every evaluation compared to each random batch
    # Then we get the mean
    for split in ["train", "validation"]:
        losses = torch.zeros(evaluation_iterations)
        for iter in range(evaluation_iterations):
            X, Y = get_batch(split)
            logits, loss = m(X, Y)
            losses[iter] = loss.item()
        out[split] = losses.mean()
    m.train()
    return out

In [84]:
# AdamW works really well, typical good setting for lr is 3e-4 however with smaller networks we can get away with much higher learning rates
optimiser = torch.optim.AdamW(m.parameters(), lr = 1e-3)

batch_size = 32
evaluation_interval = 500
max_iterations = 2000
for step in range(max_iterations):

    if step % evaluation_interval == 0:
        losses = loss_estimation()

    xb, yb = get_batch("valid")

    logits, loss = m(xb, yb)
    optimiser.zero_grad(set_to_none=True)
    loss.backward()
    optimiser.step()

print(loss.item())

2.4574451446533203
