In [None]:
!wget https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt

--2023-11-27 22:03:23--  https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5617411 (5.4M) [text/plain]
Saving to: ‘shakespeare_more.txt’


2023-11-27 22:03:23 (54.3 MB/s) - ‘shakespeare_more.txt’ saved [5617411/5617411]



In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F # loading libraries

In [None]:
from google.colab import drive
drive.mount('/content/drive') # mounting drive where files are stored

Mounted at /content/drive


In [None]:
with open(r'shakespeare_more.txt', 'r', encoding='utf-8') as file:
    text = file.read() # reading the text from the dataset
unique_chars = sorted(list(set(text))) # creating a list of all the unique possible characters, the vocabulary
vocab_size = len(unique_chars) # checking the length of the vocabulary

In [None]:
itos = {i:s for i, s in enumerate(unique_chars)} # creating a dictionary that converts integer index to corresponding token string character
stoi = {s:i for i, s in enumerate(unique_chars)} # creating a dictionary that converts token string character to corresponding integer index
encode = lambda x: [stoi[char] for char in x] # creating a lambda function that takes some charcaters/tokens and converts them into their corresponding integer indices
decode = lambda x: ''.join([itos[index] for index in x]) # creating a lambda function that takes some integer indices and converts them into their corresponding charcaters/tokens
data = torch.tensor(encode(text), dtype=torch.long) # converts the whole text into their corresponding token integer indices
n1 = int(len(data) * 0.8)
n2 = int(len(data) * 0.9)
data_train = data[:n1]
data_val = data[n1:n2]
data_test = data[n2:] # splitting the data set into a (train, val, test) split of (80, 10, 10)

In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 96
context_size = 256
vector_length = 1024
hidden_length = 512
dropout = 0.2
eval_interval = 500
eval_batch = 200
iterations = 30000
lr = 1e-3 # hyper-parameters for MLP model

In [None]:
torch.cuda.is_available() # checking that GPU is usable

In [None]:
class Recurrence(nn.Module):
    def __init__(self):
        super().__init__()
        self.hidden_layer = nn.Linear(vector_length + hidden_length, hidden_length) # creating hidden linear layer to create intermediate output
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        self.output_layer = nn.Linear(hidden_length, vocab_size) # creating final output layer
    def forward(self, x, hidden):
        B, T = x.shape # getting tensor shape input
        x = x.view(B, -1) # stretching the second and third layer
        input = torch.cat((x, hidden), dim=1) # concatenating character embedding with hidden vector
        input = self.drop(input) # applying dropout
        hidden_new = self.hidden_layer(input) # producing new intermediate hidden vector
        hidden_new = self.relu(hidden_new) # applying ReLU
        output = self.output_layer(hidden_new) # applying output linear layer
        return output, hidden_new

In [None]:
class RNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.char_embedding = nn.Embedding(vocab_size, vector_length) # creating character embedding look up table
        self.init_hidden = nn.Parameter(torch.zeros(hidden_length, device=device)) # creating initial hidden vector with zeros (other ways possible)
        self.recurrence = Recurrence() # creating recurrence object
        self.final = nn.Linear(hidden_length, vocab_size) # creating final linear layer
    def forward(self, x, targets=None):
        B, T = x.shape # getting tensor shape input
        logits = torch.zeros((B, T, vocab_size), device=device) # allocating memory for final output vector
        char_token = self.char_embedding(x) # getting character embedding vectors of input text
        hidden = self.init_hidden.repeat(B, 1) # copying the initial hidden vector across batch dimension
        for i in range(T): # iterating through all context tokens
            logits[:, i, :], hidden = self.recurrence(char_token[:, i, :], hidden) # getting hidden and output vector for every next token
        if targets == None: # if inference, no loss is needed
            loss = None # if inference, no loss is needed
        else:  # if training...
            B, T, C = logits.shape # extracting output tensor shape
            logits = logits.view(B*T, C) # making logits 2D tensor instead of 3D for corss entropy
            targets = targets.view(B*T) # making output 1D tensor instead of 2D for corss entropy
            loss = F.cross_entropy(logits, targets) # applying cross-entropy loss function
        return logits, loss
    def generate(self, idx, max_length): # function to generate text after training
        for _ in range(max_length): # ensuring output is of requested length
            idx_block = idx[:, -context_size:] # take last context window length of tokens from complete context
            logits, loss = self(idx_block) # applying context window as input into transformer
            logits = logits[:, -1, :] # extracting the last logits as it signifies the next token/charcater
            probs = F.softmax(logits, dim=-1) # apply softmax on logits to create probability distirbution
            char = torch.multinomial(probs, num_samples=1) # use weighted sampling to extract next token
            idx = torch.cat((idx, char), dim=1) # concatenate output character/token to previous text
        return idx

In [None]:
@torch.no_grad()
def estimate_loss(): # function to estimate train and val loss while training
    out = {}
    model.eval() # setting model in evaluation mode
    for i in ['train', 'val']: # running over train and val datasets
        losses = torch.zeros(eval_batch) # creating lossses tensor of set evaluation batch length hyper-parameter
        for j in range(eval_batch): # running over losses tensor
            X_batch, Y_batch = get_batch(i) # get batch of inputs and expected outputs
            logits, loss = model(X_batch, Y_batch) # run inputs and expected outputs through model to get loss
            losses[j] = loss.item() # add value to losses tensor
        out[i] = losses.mean() # get mean of losses tensor and save it
    model.train() # put model back in train mode
    return out

@torch.no_grad()
def get_batch(mode): # function to collect batch of input/output pairs for training
    data = data_train if mode == 'train' else data_val # retrieve appropriate dataset
    batch = torch.randint(len(data) - context_size, (batch_size,)) # getting batches of random integers that will be used as dataset token integer indices of the starting token in the context window
    X_batch = torch.stack([data[i:i+context_size] for i in batch]) # stacking batches of context token indices to be used as input
    Y_batch = torch.stack([data[i+context_size] for i in batch]) # stacking batches of next character token index as output
    X_batch, Y_batch = X_batch.to(device), Y_batch.to(device) # moving batch to GPU
    return X_batch, Y_batch

@torch.no_grad()
def save_params(model, optimizer, scheduler): # function to save parameters, optimizer and scheduler states
  torch.save(model.state_dict(), r'/content/drive/MyDrive/ML_project/params.pt') # to save parameters
  torch.save(optimizer.state_dict(), r'/content/drive/MyDrive/ML_project/optimizer.pt') # to save optimizer state
  torch.save(scheduler.state_dict(), r'/content/drive/MyDrive/ML_project/scheduler.pt') # to save scheduler states

@torch.no_grad()
def test_model(model, data, batch_size): # function to test model on test set loss and accuracy
    cost = []
    accuracy = []
    for i in range(0, len(data) - context_size - batch_size, batch_size): # running model through whole test dataset
        X_batch = torch.stack([data[j:j+context_size] for j in range(i, i + batch_size)]) # getting test input batch
        Y_batch = torch.stack([data[j+context_size] for j in range(i, i + batch_size)]) # getting test expected output batch
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device) # moving batches to GPU
        logits, loss = model(X_batch, Y_batch) # running model through input/output batch
        cost.append(round(loss.item(), 4)) # get loss for batch and store it
        probs = F.softmax(logits, dim=-1) # apply softmax to get probability distirbution of next character token
        char = torch.multinomial(probs, num_samples=1).view(-1) # preform weighted sampling
        accuracy.append((len(char[char == Y_batch]) / 300) * 100) # check how common are expected and actual output equal
    test_cost = sum(cost) / len(cost) # calculate average test loss
    test_accuracy = sum(accuracy) / len(accuracy) # calculate average test accuracy
    return test_cost, test_accuracy

In [None]:
model = RNN() # creating RNN model
model.to(device) # moving model to GPU
optimizer = torch.optim.AdamW(model.parameters(), lr=lr) # creating optimizer that is responsible for adjusting parameters
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95) # setting optimizer scheduler

In [None]:
# model.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/RNN_params.pt')) # importing model parameters
# optimizer.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/RNN_optimizer.pt')) # importing model optimizer states
# scheduler.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/RNN_scheduler.pt')) # importing model scheduler states

In [None]:
model.eval() # putting model in evaluation mode
start = torch.zeros((1,1), dtype=torch.long, device=device) # creating character input start tokens to start generating
sample_index = model.generate(start, max_length=2000)[0].tolist() # generating output tokens inetger indices
sample = decode(sample_index) # converting integer indices of token into corresponding token string characters
print("Untrained Sample:\n", sample)
model.train() # putting model in train mode

In [1]:
for i in range(iterations): # running training over set number of iterations
    if i % eval_interval == 0 or i == iterations-1: # checking conditions on whether to stop temporarily in order to save parameters, estimate loss (and possibly apply scehduler)
        save_params(model, optimizer, scheduler) # saving parameters
        losses = estimate_loss() # estimate train and val loss
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if i != 0:
            scheduler.step() # performing scheduler step
    X_batch, Y_batch = get_batch('train') # collecting training batch
    logits, loss = model(X_batch, Y_batch) # running training batch through model
    optimizer.zero_grad(set_to_none=True) # setting gradients to 0 (to ensure gradients dont explode)
    loss.backward() # perform back prop
    optimizer.step()# perform parameter adjustment

step 0: train loss 4.5857, val loss 4.5870
step 500: train loss 1.5248, val loss 1.6606
step 1000: train loss 1.4405, val loss 1.6016
step 1500: train loss 1.4056, val loss 1.5810
step 2000: train loss 1.3822, val loss 1.5599
step 2500: train loss 1.3674, val loss 1.5522
step 3000: train loss 1.3577, val loss 1.5411
step 3500: train loss 1.3503, val loss 1.5339
step 4000: train loss 1.3427, val loss 1.5269
step 4500: train loss 1.3385, val loss 1.5239
step 5000: train loss 1.3333, val loss 1.5196
step 5500: train loss 1.3277, val loss 1.5124
step 6000: train loss 1.3260, val loss 1.5128
step 6500: train loss 1.3231, val loss 1.5116
step 7000: train loss 1.3192, val loss 1.5079
step 7500: train loss 1.3181, val loss 1.5119
step 8000: train loss 1.3176, val loss 1.5071
step 8500: train loss 1.3150, val loss 1.5057
step 9000: train loss 1.3116, val loss 1.5039
step 9500: train loss 1.3118, val loss 1.5072
step 10000: train loss 1.3086, val loss 1.4995
step 10500: train loss 1.3069, val lo

In [None]:
model.eval() # putting model in evaluation mode
start = torch.zeros((1,1), dtype=torch.long, device=device) # creating character input start tokens to start generating
sample_index = model.generate(start, max_length=2000)[0].tolist() # generating output tokens inetger indices
sample = decode(sample_index) # converting integer indices of token into corresponding token string characters
print("Trained Sample:\n", sample)
model.train() # putting model in train mode

Trained Sample:
 	” And bite. Not what man do, you misdeepor of traitor do, but these floct,
Like brave.
There’s a slaughter. Swaln thou got’st they think, that to the sale-tirch. He may have abbat,
Of me a mended.

ANTONY
SIMPs.
Made me not to thou revelets shame in things is capled.”
         ”       Enters, Ford, would then, vight.
Say thou art head,
Is yourself we that I say.

GUIGINIO.
To God, that’s stand swellish’d him.
Ha! I’ll fill these natimont is you shallow
On Cupt I know your life of him and high-ache;
And new management. Pish in the heads, and in your love, what trainable is could be dispos’d.

DROMIO OF EPHUS.
Ay, but keep him
If sure
But my worthiness of fair tongues of that was welcome to labours and this treating_!
For a thousand nature and will fell the times of sight.

CLEON.
Madadry wanton born.

ARVIRAGUSE, Attendan,
Never make! Durchances_; a lost.

Enter King Richard._]

Go by th’ the enfor these her name.

CLAUDIO.
Had, my lord, you should, my good cutiet to b

In [None]:
cost, accuracy = test_model(model, data_test, 300) # applying evaluation metrics on test set
print(f'Test loss is: {cost:.4f}\nTest accuracy: {accuracy:.2f}%')

Test loss is: 1.5771
Test accuracy: 42.37%
