In [60]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 8
batch_size = 4
max_iters = 1000
eval_interval = 2500
eval_iters = 250
learning_rate = 3e-4

cpu


In [61]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '&', "'", '(', ')', '*', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']


In [62]:
#tokeniser
#implementation of a simple substitution cipher (for character in string, for integer in list)
#enumerate() function is used to return both the index and the character of each element in the chars string
string_to_int = { ch:i for i,ch in enumerate(chars) }
int_to_string = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text), dtype = torch.long)

In [63]:
#n is an integer that represents 80% of the length of the input data. 
#train_data is a slice of the input data from the beginning to index n
#while val_data is a slice from index n to the end.
n = int(0.8 * len(data))
train_data = data[:n]
val_data = data[n:]


def get_batch(split):
    dat = train_data if split =='train' else val_data
    #Random index ix
    ix = torch.randint(len(data) - block_size, (batch_size,))
    #print(ix)
    #Offsetting by 1 as indexing starts at 0
    #Stacks in batches
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x,y
x, y = get_batch('train')
print('inputs: ')
print(x)
print('targets: ')
print(y)

inputs: 
tensor([[73, 61, 58, 66,  1, 65, 68, 68],
        [54, 71, 57,  0, 68, 59, 73, 58],
        [58, 54, 71, 65, 78,  1, 54, 72],
        [65, 58,  1, 54, 67, 57,  1, 73]])
targets: 
tensor([[61, 58, 66,  1, 65, 68, 68, 64],
        [71, 57,  0, 68, 59, 73, 58, 67],
        [54, 71, 65, 78,  1, 54, 72,  1],
        [58,  1, 54, 67, 57,  1, 73, 61]])


In [64]:
#Ensures PyTorch doesn't use gradients, as that would reduce computation/ memory usage
@torch.no_grad()
def estimate_loss():
    out= {}
    #Sets to evaluation mode
    model.eval()
#Returns a dictionary containing the mean loss values for the training and validation sets
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        #iterates over the training and validation sets and computes the loss for each iteration
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    #Switching to training mode
    model.train()
    return out

In [65]:
#Given character "context", we can predict the next character should be "target"
x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print("When input is", context, "target is", target)

When input is tensor([80]) target is tensor(1)
When input is tensor([80,  1]) target is tensor(1)
When input is tensor([80,  1,  1]) target is tensor(28)
When input is tensor([80,  1,  1, 28]) target is tensor(39)
When input is tensor([80,  1,  1, 28, 39]) target is tensor(42)
When input is tensor([80,  1,  1, 28, 39, 42]) target is tensor(39)
When input is tensor([80,  1,  1, 28, 39, 42, 39]) target is tensor(44)
When input is tensor([80,  1,  1, 28, 39, 42, 39, 44]) target is tensor(32)


In [66]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)
        
    def forward(self, index, targets=None):
        #logits are a probability distribution of what we want to predict
        #3 dimensional
        logits = self.token_embedding_table(index)

        if targets is None:
            loss = None
        else:
            #B is batch, T is time dimension, C is channels (vocab size)
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            #PyTorch excepts a certain shape (Batch size, Channels), thats why we reshape it 
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, index, max_new_tokens):
        #index is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            #Get predictions
            logits, loss = self.forward(index)
            #Focus only on the last time step
            logits = logits[:, -1, :] #Becomes (B, C)
            #Apply softmax to get probabilities, focusing on the last dimension
            probs = F.softmax(logits, dim=-1) #(B, C)
            #Sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1) #(B, 1)
            #Append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) #(B, T+1)
        return index

model = BigramLanguageModel(vocab_size)
#Push parameters to GPU for more efficient training 
m = model.to(device)

#torch.long = int64
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)



SrWJaedjsnXjU]LTu1DWx&iY*J7iH!Vi'&KwfmpAw19ioF0p;UUv[J[r G',)h)XicOI5Igvmh!vmm*tx" fHT4A.9]( (_[6[;fVPTm9﻿I;9 ROTL6hjP)(ng(OrkUz:WTkd﻿nQEl5l?]EQ
AO5M6,Arn383vbNvr﻿jSk-msq0Q:whl7zl,gS﻿3icS!caGAof?]J*;k]TvHypgUw9Zi﻿)fLv*
Gslvc?nFD_T0BZnIcdtOG*J]ZkHfmhNI[lCic89peFJ﻿&4iw9Z6:WR.;.OIjVj!yc,&R'V;Ubd3IgOJ[)lSppR_)eo]Jl5Nk)''A﻿U'D﻿G-k;U[_Lc'7(-X]1-S(PIrHn_XMn6i)ma&﻿0_T4Rb D﻿IqDTo1ZAdVo18'3WH)Gs?HI.dM,0p:Aoihi.3S,UrSwN?L dG-SeSvxla8'Sa1lvE!a_[qP[40Nr5as5M﻿'Bbd!C0kvHCiuZZgOoFRoo]F?hVanIM5gPcPmge2K5IsUAd"dU


In [73]:
#Create a PyTorch optimiser
optimiser = torch.optim.AdamW(model.parameters(), lr=learning_rate)

#iter = iterations
for iter in range(max_iters):
    #Check if remainder of current iteration/ eval_iters is 0
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss: {losses['val']:.4f}")
        
    #Sample a batch of data
    #x inputs and y targets
    xb, yb = get_batch('train')

    #Evaluate the loss
    logits, loss = model.forward(xb, yb)
    optimiser.zero_grad(set_to_none=True)
    #Backward pass
    loss.backward()
    optimiser.step()
print(loss.item())

step 0: train loss 3.8349, val loss: 3.8287
step 250: train loss 3.8058, val loss: 3.7826
step 500: train loss 3.7493, val loss: 3.7472
step 750: train loss 3.6907, val loss: 3.6873
3.6330301761627197


In [69]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


Srbk(Ag1qDM(3F-]T0Z!a 6HrWV!KZ*]LsDudkG'FOIf
)a8b?hlwVv(T
)A[KJpipl)Bt-HsXKOpJI!﻿S﻿YtBg_gVo])xfmi]cRouFFQX fd wWR0!M6i6]2j:jk7nQCZd0,6a-7[q7xm4&qHs?iX&0l&?QcasC'IM6V'i])w]l57iYgG':9ODF8F!aVJBDL?usvHAg]jC;lPBBtj;*;UQ5;I-_mn)'G'S(QB:J,ZwWzlpY:UYf-2F?dK8B.so7)gu[﻿&B6:!Vl!NS)(6[vyCFzZu1x9ls!NnJwsu]-2Va"bnw.wKCOM dT0Z4.BZLkp9_by: lO
FBgcbNM(AG]﻿jV'WC siZh;U﻿nQ8
hCr﻿5MqxpG*_FJqxoHCE2"Z7aH!eS("ZZ83
Xp;.VmyAotAMJ?d3LUvca-BF-4M8e5DN"]!viXaU!U!VoegR)i]xEa,U?HC7hVQVa_bsTh;D6gv3o,VoYNF3 l,V AFONyy_0UjVI1YH)


In [None]:
#OPTIMISERS
# Mean Squared Error (MSE): MSE is a common loss function used in regression problems, where the goal is to predict a continuous output. It measures the average squared difference between the predicted and actual values, and is often used to train neural networks for regression tasks.
# Gradient Descent (GD): is an optimization algorithm used to minimize the loss function of a machine learning model. The loss function measures how well the model is able to predict the target variable based on the input features. The idea of GD is to iteratively adjust the model parameters in the direction of the steepest descent of the loss function
# Momentum: Momentum is an extension of SGD that adds a "momentum" term to the parameter updates. This term helps smooth out the updates and allows the optimizer to continue moving in the right direction, even if the gradient changes direction or varies in magnitude. Momentum is particularly useful for training deep neural networks.
# RMSprop: RMSprop is an optimization algorithm that uses a moving average of the squared gradient to adapt the learning rate of each parameter. This helps to avoid oscillations in the parameter updates and can improve convergence in some cases.
# Adam: Adam is a popular optimization algorithm that combines the ideas of momentum and RMSprop. It uses a moving average of both the gradient and its squared value to adapt the learning rate of each parameter. Adam is often used as a default optimizer for deep learning models.
# AdamW: AdamW is a modification of the Adam optimizer that adds weight decay to the parameter updates. This helps to regularize the model and can improve generalization performance. We will be using the AdamW optimizer as it best suits the properties of the model we will train in this video.