In [1]:
import torch
import time
import numpy as np
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

#hyperparameter
block_size = 8
batch_size = 300
max_iters = 100000
learning_rate = 3e-6
eval_iters = 1000

cuda


In [2]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f:
    text = f.read()

#all chars in the text
chars = sorted(set(text))

vocab_size = len(chars)

In [3]:
string_to_int = { charac:index for index, charac in enumerate(chars)} # assinging each character with a number
int_to_string = { index:charac for index, charac in enumerate(chars)} # assigning a number with a character

"""
def encode(string):
    encoded_list = []
    for char in string:
        encoded_list.append(string_to_int[char])
    return encoded_list
"""
# The above can be easily written using a lambda function        
encode = lambda string: [string_to_int[char] for char in string] # encoding = changing string to number

"""
def decode(int_list):
    decoded_int = ''
    for num in int_list:
        decoded_int = decoded_int + int_to_string[num]
    return decoded_int
"""
# The above function can be easily written with a lambda function
decode = lambda l: ''.join([int_to_string[i] for i in l]) # decoding = changing a list of number to a string

In [4]:
data = torch.tensor(encode(text), dtype=torch.long) # changing the text into a Pytorch tensor, that a is 64-bit integer (torch.long)

# dividing data into training and validation
n = int(0.8 * len(data)) #split index
train_data = data[:n]
val_data = data[n:]

#creating batches of data for training or validation
def get_batch(split):
    data = train_data if split == 'train' else val_data
    random_indices = torch.randint(len(data) - block_size, (batch_size,)) # pick random start points in the data

    input_seq = torch.stack([data[i:i+block_size] for i in random_indices]) # Input sequences of length `block_size`
    output_seq = torch.stack([data[i+1:i+block_size+1] for i in random_indices]) # Output sequences shifted by 1

    input_seq, output_seq = input_seq.to(device), output_seq.to(device)
    
    return input_seq, output_seq

In [5]:
@torch.no_grad() #disables gradient calculations for evaluation
def estimate_loss():
    out = {}
    model.eval() #sets model to evaluation mode, dropout layer is not applied and batch normalization uses fixed statistics
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y) #forward pass, the forward method in BigramLanguageModel class
            losses[k] = loss.item() #loss.item() returns the single scalar value from a tensor that has a single value. e.g. torch.tensor([2.5])=2.5
        out[split] = losses.mean() #saves the mean of the losses in a dictionary that are stored in losses = torch.zeros(eval_iters)
    model.train() # sets the model back to the training mode
    
    return out

In [6]:
#initializing the neural network
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        #embedding table where each word in the vocabulary is mapped to a vector of length 'vocab_size'. Each row is a word and has its learnable param
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) #embeddings of each unique character is shown by a vector of length 72
                                                                          #e.g. a=0, and 0 is shown by a vector of length 72 in the zeroth position
    
    def forward(self, index, targets=None):
        logits = self.token_embedding_table(index) #stores the embeddings of the specified characters, e.g. index=torch.tensor([0, 1]), where 0=a, 1=b

        if targets is None:
            loss = None
        else: 
            B, T, C = logits.shape #B = batch size, T= the length of each seq (number of tokens, characters), C=# of unique token/characters
            #e.g. index=torch.tensor([[0, 1, 2], [3, 4, 5]]). It means B=2, T=3, and C=72 because a vector of length 72 represents each T, so [2, 3, 72]
            logits = logits.view(B * T, C) # we do this because cross_entropy expects the dimensions to be so
            targets = targets.view(B * T) # we do this because cross_entropy expects the dimensions to be so
            loss = F.cross_entropy(logits, targets).to(device) # calculates loss between logits and targets
            
        return logits, loss
    
    def generate(self, index, max_new_tokens): # generates new tokens/characters, one at a time
        # index is (B, T) array of indices in the current context
        index = index.to(device)
        print("Starting index device:", index.device)
        
        logits, loss = self.forward(index)
        
        #print(logits.shape)
        for _ in range(max_new_tokens):
            #get the prediction
            logits, loss = self.forward(index)
    
            #focus only on the last time step
            logits = logits[:, -1, :] #becomes (B, C) #the embeddings inside logits is referred to as raw score (logits)
            
            #apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) #(B, C) the logits are changed into a probablity 
            
            #sample from the distribution
            index_next = torch.multinomial(probs, num_samples=1).to(device) #(B, 1) randomly selects indices (index in our case) based on their likelihood
            #print(index_next)
            
            
            #append sampled index to the running sequence
            index = torch.cat((index, index_next), dim=1) #(B, T+1)

        return index

model = BigramLanguageModel(vocab_size).to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
context = context.to(device)

generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist()) #max_new_tokens= number of new tokens to be generated
print(generated_chars)

Starting index device: cuda:0

9LL;sB,y9﻿—av9OSps;k—:1‘.WR10TcE“M“V‘S,GmdP0tMY‘y?”9?Cyo9‘ZdA0W pP:yy9tERBE—;oTYV﻿gQ ocP:lu:1DqGJ‘J9;e‘HAzhXXjDxLtg1nTVYv0Wg﻿-l”xl“ERO“sgxWEXBkV—RE’9Ka﻿(wT cBh1p(c(IFgQhtqZ“1?tA,;u
!c“lgmdBrKb.;Sg
gmsJmTCch﻿);1ZvD)r.;;izGalejl
;GMeMl ﻿ZtZI;bBW﻿K9Ie“K.VW
s0.1uy!FO- SpTN—i;N-‘yYb‘sEjcqOTVrvsyB,r‘rzh)9uV-,?emJMrCQHliwtVU).zC”-﻿- ,MX”K9tWzUS,-nBg—Xej“—ixf;G:1E—B,rC?g):QFZmqvA’,SgncbaO;oRFE,qg.sgK)HK?h(yrqv:Gt?JMB“‘,J‘.L!em(xaBHbYk‘Imj9Uy!GsSWHD
;AloREQGecrbegXn”)J,o.ks;il0wdUHMYoOEwdCfKF—’bnBDA-n﻿h﻿


In [7]:
#create a Pytorch optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate) #applies weight decay which helps prevent overfitting

for iter in range(max_iters): # each iteration is one step of training the model

    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train loss: {losses['train']:.4f}, validation loss: {losses['val']:.4f}")
    
    #sample a batch of data (mini-batch)
    input_batch, target_batch = get_batch('train')
    # Move data to the GPU
    input_batch, target_batch = input_batch.to(device), target_batch.to(device)

    #evaluate the loss
    logits, loss = model.forward(input_batch, target_batch)
    optimizer.zero_grad(set_to_none=True) #sets gradients of the model's parameters to zero and computes the new gradient
    loss.backward() #computes the gradients of the loss with respect to the model's parameters using backpropagation
    optimizer.step() #updates models parameters

print(loss.item())

step: 0, train loss: 4.6900, validation loss: 4.6857
step: 1000, train loss: 4.6861, validation loss: 4.6807
step: 2000, train loss: 4.6801, validation loss: 4.6767
step: 3000, train loss: 4.6769, validation loss: 4.6722
step: 4000, train loss: 4.6721, validation loss: 4.6681
step: 5000, train loss: 4.6679, validation loss: 4.6631
step: 6000, train loss: 4.6631, validation loss: 4.6585
step: 7000, train loss: 4.6586, validation loss: 4.6542
step: 8000, train loss: 4.6543, validation loss: 4.6499
step: 9000, train loss: 4.6493, validation loss: 4.6453
step: 10000, train loss: 4.6459, validation loss: 4.6413
step: 11000, train loss: 4.6418, validation loss: 4.6360
step: 12000, train loss: 4.6367, validation loss: 4.6320
step: 13000, train loss: 4.6327, validation loss: 4.6272
step: 14000, train loss: 4.6281, validation loss: 4.6232
step: 15000, train loss: 4.6237, validation loss: 4.6187
step: 16000, train loss: 4.6183, validation loss: 4.6140
step: 17000, train loss: 4.6153, validation 

In [8]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(model.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)

Starting index device: cuda:0

S‘,ohn! suV)Vo1DoBzR(sRVY‘PtxMrCel’
vPMOTedU”Jg,mqtMJP0gDS﻿l!-NY.1EC9vPLtxFgQMB”,Ptdgej
psW—’u”hK pDNHjIjQp(bdADg—kQ;ryrYGqvPMgyYiOIFEdvnUF mD! cZye,Paj,CQ ’ewwHvP—atxUupFacGp
xKF‘q.F””Jqf1sBedemxnSBA!﻿-ncO;I(u:1 QJmrO.HVi1KiVUHSk(1(h) 
;r,pZSkbYVDu:rJ,NHU,pupTcERFPZBNX)gKMXnBNMLzfdiwBaUAQqQDMIO’n’’ugLQOJm—“il!FWm﻿fK?﻿Kvg’qXSeMe:ecLGQHbZtiHBgx”i!1TYJWmSkERbatoGNqvtlL,qIRhKx?-!i MrN0yb-0gxBBAnA1JFHDq“EZiIil(iU.‘!,9oG’G):eGMX
;﻿ffl
q:Au1HDat,9
oTBWVmT)i9qyF ‘yrCigxcZfhXT—iUwgm-MvPle:M1Z009?—ron;)B
