In [None]:
# Download the training corpus into the working directory.
# -nc (no-clobber) skips the download when the file already exists, so re-running this cell
# does not leave a duplicate "shakespeare_more.txt.1" next to the file the later cells read.
!wget -nc https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt

--2023-11-27 21:43:45--  https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5617411 (5.4M) [text/plain]
Saving to: ‘shakespeare_more.txt’


2023-11-27 21:43:45 (54.6 MB/s) - ‘shakespeare_more.txt’ saved [5617411/5617411]



In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F # loading libraries

In [None]:
# Colab-only step: mount Google Drive at /content/drive. The checkpoint files written by
# save_params live under /content/drive/MyDrive/ML_project, so this must run before training.
from google.colab import drive
drive.mount('/content/drive') # mounting drive where files are stored

Mounted at /content/drive


In [None]:
# Read the entire corpus into memory as a single string.
with open(r'shakespeare_more.txt', mode='r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()

# The vocabulary is every distinct character in the corpus, kept in sorted order so that
# the character <-> index mapping is the same on every run.
unique_chars = sorted(set(text))
vocab_size = len(unique_chars) # number of distinct characters (tokens)

In [None]:
# Two-way lookup tables between characters and integer token indices.
# Indices follow the position of each character in the sorted vocabulary.
itos = dict(enumerate(unique_chars)) # index -> character
stoi = {char: index for index, char in itos.items()} # character -> index


def encode(x):
    """Convert a string (any iterable of characters) into the list of its token indices."""
    return [stoi[char] for char in x]


def decode(x):
    """Convert an iterable of token indices back into the string they spell."""
    return ''.join(itos[index] for index in x)


# Encode the whole corpus once, then cut it into contiguous (train, val, test) = (80, 10, 10) parts.
data = torch.tensor(encode(text), dtype=torch.long)
n1 = int(len(data) * 0.8) # end of the training part
n2 = int(len(data) * 0.9) # end of the validation part
data_train, data_val, data_test = data[:n1], data[n1:n2], data[n2:]

In [None]:
# Hyper-parameters for the MLP model and its training run.

# Hardware: use the GPU when one is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Batching
batch_size = 96 # examples per training / evaluation batch drawn by get_batch
context_size = 256 # number of preceding characters the model sees for each prediction
eval_batch = 200 # number of random batches averaged per split in estimate_loss

# Architecture
vector_length = 490 # embedding width of each character / position
num_layers = 3 # not referenced by the other cells shown here; equals the number of hidden widths in `shapes`
dropout = 0.2 # dropout probability used inside every FeedForward block
# Layer widths: flattened embedded context -> three hidden layers of 5000 units -> one logit per character.
shapes = [context_size * vector_length, 5000, 5000, 5000, vocab_size]

# Optimisation
lr = 3e-4 # initial AdamW learning rate
iterations = 30000 # total number of training steps
eval_interval = 500 # steps between checkpoint + loss-estimation stops

In [None]:
torch.cuda.is_available() # displays True when a CUDA GPU is usable; when False, `device` above is 'cpu'

In [None]:
class FeedForward(nn.Module):
    """One hidden block of the MLP: Linear -> BatchNorm1d -> ReLU -> Dropout.

    Args:
        fan_in: number of input features of the linear layer.
        fan_out: number of output features (also the BatchNorm1d feature count).

    The dropout probability is read from the notebook-level `dropout` setting.
    """

    def __init__(self, fan_in, fan_out):
        super().__init__()
        # Create the stages in application order, then chain them.
        stages = [
            nn.Linear(fan_in, fan_out),
            nn.BatchNorm1d(fan_out),
            nn.ReLU(),
            nn.Dropout(dropout),
        ]
        self.fwd = nn.Sequential(*stages)

    def forward(self, x):
        """Run `x` through the block; the result is also kept on the module as `self.out`."""
        activations = self.fwd(x)
        self.out = activations
        return activations

class MLP(nn.Module):
    """Character-level MLP language model predicting the next token from a context window.

    Each input token is embedded (character embedding + positional embedding), the
    embedded window is flattened into one vector per example, and that vector is
    passed through a stack of FeedForward blocks followed by a plain Linear layer
    that produces one logit per output class.

    Args:
        shapes: list of layer widths. shapes[0] is the width of the flattened input,
            shapes[1:-1] are the hidden widths, and shapes[-1] is the number of logits.
            Because the input is flattened with `view(B, -1)`, the number of tokens
            per example times `vector_length` must equal shapes[0].

    The embedding sizes are read from the notebook-level `vocab_size`,
    `vector_length` and `context_size` settings.
    """

    def __init__(self, shapes):
        super().__init__()
        self.char_embedding = nn.Embedding(vocab_size, vector_length) # character embedding look-up table
        self.pos_embedding = nn.Embedding(context_size, vector_length) # positional embedding look-up table
        # `layers` is a plain Python list; the modules are registered through `self.fwd` below.
        self.layers = []
        for i in range(len(shapes)-2): # every consecutive width pair except the last one
            self.layers.append(FeedForward(shapes[i], shapes[i+1])) # hidden block
        self.layers.append(nn.Linear(shapes[-2], shapes[-1])) # output layer: no batch norm, ReLU or dropout
        self.fwd = nn.Sequential(*self.layers)

    def forward(self, x, targets=None):
        """Compute next-token logits and, when targets are given, the cross-entropy loss.

        Args:
            x: integer tensor of token indices with shape (B, T).
            targets: optional tensor of expected next-token indices, one per example.

        Returns:
            (logits, loss): logits from the final layer, and the cross-entropy loss
            against `targets`, or None when `targets` is None.
        """
        B, T = x.shape
        char_token = self.char_embedding(x) # (B, T, vector_length)
        # Build the position indices on the same device as the input rather than on the
        # notebook-level `device`, so the model keeps working wherever it has been moved.
        positions = torch.arange(T, device=x.device)
        pos_token = self.pos_embedding(positions) # (T, vector_length), broadcast over the batch
        token = char_token + pos_token
        flat = token.view(B, -1) # merge the token and embedding dimensions into one vector per example
        logits = self.fwd(flat)
        # Identity check: `== None` on a tensor goes through tensor comparison instead of testing for None.
        loss = None if targets is None else F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_length): # function to generate text after training
        """Autoregressively append `max_length` sampled tokens to `idx`.

        Args:
            idx: integer tensor of shape (B, T) holding the starting context.
            max_length: number of tokens to generate.

        Returns:
            Tensor of shape (B, T + max_length): the original `idx` followed by the samples.

        This method does not change the train/eval mode; the notebook's callers switch
        the model to eval mode before calling it.
        """
        # Sampling needs no gradients; disabling them avoids building an autograd graph per step.
        with torch.no_grad():
            for _ in range(max_length):
                idx_block = idx[:, -context_size:] # keep only the most recent context window
                logits, _ = self(idx_block)
                probs = F.softmax(logits, dim=-1) # logits -> probability distribution over next tokens
                char = torch.multinomial(probs, num_samples=1) # sample the next token from that distribution
                idx = torch.cat((idx, char), dim=1) # append the sample to the running sequence
        return idx

In [None]:
@torch.no_grad()
def estimate_loss(): # function to estimate train and val loss while training
    """Estimate the current loss on the training and validation sets.

    For each of the two splits, `eval_batch` random batches are drawn with
    get_batch and the per-batch losses are averaged. The notebook-level `model`
    is put in evaluation mode for the measurement and is always left in training
    mode afterwards.

    Returns:
        dict with keys 'train' and 'val', each holding the mean loss as a 0-d tensor.
    """
    model.eval()
    averages = {}
    for split in ('train', 'val'):
        per_batch = torch.zeros(eval_batch) # one slot per sampled batch
        for slot in range(eval_batch):
            inputs, expected = get_batch(split)
            _, batch_loss = model(inputs, expected)
            per_batch[slot] = batch_loss.item()
        averages[split] = per_batch.mean()
    model.train() # hand the model back ready for further training
    return averages

@torch.no_grad()
def get_batch(mode): # function to collect batch of input/output pairs for training
    """Draw a random batch of (context window, next token) pairs.

    Args:
        mode: 'train' selects the training split; any other value selects the validation split.

    Returns:
        (X_batch, Y_batch) on `device`: X_batch holds `batch_size` windows of
        `context_size` token indices, Y_batch the token index that follows each window.
    """
    source = data_train if mode == 'train' else data_val
    # Random window start positions, bounded so that the target token is always inside the split.
    starts = torch.randint(len(source) - context_size, (batch_size,))
    contexts, next_tokens = [], []
    for start in starts:
        stop = start + context_size
        contexts.append(source[start:stop]) # the context window
        next_tokens.append(source[stop]) # the token right after the window
    X_batch = torch.stack(contexts).to(device)
    Y_batch = torch.stack(next_tokens).to(device)
    return X_batch, Y_batch

@torch.no_grad()
def save_params(model, optimizer, scheduler): # function to save parameters, optimizer and scheduler states
    """Write the state dicts of the model, optimizer and scheduler to Google Drive.

    Each call overwrites the three fixed checkpoint files under
    /content/drive/MyDrive/ML_project.
    """
    checkpoints = (
        (model, r'/content/drive/MyDrive/ML_project/params.pt'), # model parameters
        (optimizer, r'/content/drive/MyDrive/ML_project/optimizer.pt'), # optimizer state
        (scheduler, r'/content/drive/MyDrive/ML_project/scheduler.pt'), # scheduler state
    )
    for component, destination in checkpoints:
        torch.save(component.state_dict(), destination)

@torch.no_grad()
def test_model(model, data, batch_size): # function to test model on test set loss and accuracy
    """Evaluate `model` on `data`, returning the mean loss and mean accuracy.

    The dataset is walked in steps of `batch_size`; each batch consists of
    `batch_size` consecutive context windows (window length `context_size`) and
    the token that follows each window. Batches are moved to `device`.

    Accuracy is measured on a token *sampled* from the predicted distribution
    (not the arg-max), so it varies slightly between runs.

    Args:
        model: the network to evaluate; called as model(X_batch, Y_batch) -> (logits, loss).
        data: 1-D tensor of token indices.
        batch_size: number of consecutive windows per batch.

    Returns:
        (test_cost, test_accuracy): average per-batch loss, and average per-batch
        accuracy as a percentage.

    Raises:
        ValueError: if `data` is too short to form a single batch.
    """
    cost = []
    accuracy = []
    # Evaluate in eval mode so Dropout is disabled and BatchNorm uses its running statistics;
    # the caller's original mode is restored afterwards, even if evaluation fails.
    was_training = model.training
    model.eval()
    try:
        for i in range(0, len(data) - context_size - batch_size, batch_size): # running model through whole test dataset
            X_batch = torch.stack([data[j:j+context_size] for j in range(i, i + batch_size)]) # getting test input batch
            Y_batch = torch.stack([data[j+context_size] for j in range(i, i + batch_size)]) # getting test expected output batch
            X_batch, Y_batch = X_batch.to(device), Y_batch.to(device) # moving batches to GPU
            logits, loss = model(X_batch, Y_batch) # running model through input/output batch
            cost.append(round(loss.item(), 4)) # get loss for batch and store it
            probs = F.softmax(logits, dim=-1) # probability distribution of the next character token
            char = torch.multinomial(probs, num_samples=1).view(-1) # weighted sampling of one token per example
            # Fraction of sampled tokens equal to the expected token, relative to the actual
            # batch size (previously a hard-coded 300, which was wrong for any other batch size).
            accuracy.append((char == Y_batch).sum().item() / batch_size * 100)
    finally:
        model.train(was_training)
    if not cost:
        raise ValueError("data is too short to form a single batch of the requested size")
    test_cost = sum(cost) / len(cost) # average test loss
    test_accuracy = sum(accuracy) / len(accuracy) # average test accuracy
    return test_cost, test_accuracy

In [None]:
# Build the network and move it to the training device before the optimizer takes hold of its parameters.
model = MLP(shapes).to(device)
# AdamW adjusts the parameters; the exponential schedule multiplies the learning rate
# by 0.99 every time scheduler.step() is called.
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.99)

In [None]:
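# Optional: uncomment to resume from a previously saved checkpoint.
# NOTE: these lines read MLP_params.pt / MLP_optimizer.pt / MLP_scheduler.pt, whereas save_params
# writes params.pt / optimizer.pt / scheduler.pt in the same folder - rename the saved files
# (or adjust the paths below) before resuming.
# model.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/MLP_params.pt')) # importing model parameters
# optimizer.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/MLP_optimizer.pt')) # importing model optimizer states
# scheduler.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/MLP_scheduler.pt')) # importing model scheduler states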

In [None]:
# Sample text from the model before any training, as a baseline for the trained sample.
model.eval() # evaluation mode for generation
# Seed the generator with a full context window of token index 0.
start = torch.zeros((1, context_size), dtype=torch.long, device=device)
generated = model.generate(start, max_length=2000) # seed followed by 2000 sampled tokens
sample_index = generated[0].tolist() # token indices of the single generated sequence
sample = decode(sample_index) # token indices -> characters
print("Untrained sample is:\n", sample)
model.train() # back to training mode

In [None]:
# Main optimisation loop: `iterations` steps of AdamW on random training batches.
for i in range(iterations):
    # Every `eval_interval` steps, and once more on the last iteration, pause to checkpoint
    # and report the estimated losses. This happens before the current iteration's update.
    checkpoint_due = (i % eval_interval == 0) or (i == iterations - 1)
    if checkpoint_due:
        save_params(model, optimizer, scheduler)
        losses = estimate_loss()
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        # Decay the learning rate at every such stop except the very first one (step 0).
        if i > 0:
            scheduler.step()

    # One parameter update on a fresh random training batch.
    optimizer.zero_grad(set_to_none=True) # clear gradients left over from the previous step
    X_batch, Y_batch = get_batch('train')
    logits, loss = model(X_batch, Y_batch)
    loss.backward() # back-propagate
    optimizer.step() # adjust the parameters

step 0: train loss 4.6141, val loss 4.6144
step 500: train loss 2.7347, val loss 2.7345
step 1000: train loss 2.5760, val loss 2.5693
step 1500: train loss 2.4510, val loss 2.4704
step 2000: train loss 2.3749, val loss 2.3993
step 2500: train loss 2.3646, val loss 2.3738
step 3000: train loss 2.2736, val loss 2.3214
step 3500: train loss 2.2510, val loss 2.2773
step 4000: train loss 2.2287, val loss 2.2734
step 4500: train loss 2.1698, val loss 2.2204
step 5000: train loss 2.1763, val loss 2.2111
step 5500: train loss 2.1086, val loss 2.1671
step 6000: train loss 2.1089, val loss 2.1566
step 6500: train loss 2.0740, val loss 2.1491
step 7000: train loss 2.0728, val loss 2.1112
step 7500: train loss 2.0263, val loss 2.1214
step 8000: train loss 2.0105, val loss 2.1090
step 8500: train loss 2.0234, val loss 2.0778
step 9000: train loss 1.9516, val loss 2.0470
step 9500: train loss 1.9702, val loss 2.0404
step 10000: train loss 1.9444, val loss 2.0195
step 10500: train loss 1.9263, val lo

In [None]:
# Sample text from the trained model.
model.eval() # evaluation mode for generation
# Seed the generator with a full context window of token index 0.
start = torch.zeros((1, context_size), dtype=torch.long, device=device)
generated = model.generate(start, max_length=2000) # seed followed by 2000 sampled tokens
sample_index = generated[0].tolist() # token indices of the single generated sequence
sample = decode(sample_index) # token indices -> characters
print("Trained sample is:\n", sample)
model.train() # back to training mode

																																																																																																																																																																																																																																																																 Down PECER._Wher, By And, hood? All cames in rester! Edward” And Kech knowled,” time when hall a maid go with of a ham woman? He wrong to not parlents,
I’ll as mord with forth, hous’d at anjant of the fallence.

MONIDOW.
O Go.

BENEDICK.
Adam! rie hone to is the friendlion, myself some, grafe of wells sen
STRANIA.
O! Crave o’t is we retter is morem stranch a cause!

FIRST SITIZEN.
Then in throop sportage;
But gight man your
Welcomble, and it the growe be drius, bet’s
To capin alus stands hunmainst himself.

HAMLED.
My come, and, But my may beglouse,
Whose I mad you actertiments the bess me such
To back acquan this love. Briele poor year
This this dotern writh blash’d, ir it. [_Geshir’s flem her words.

ANTONY.
Where’s we have names l

In [None]:
# Final evaluation on the held-out test split, in batches of 300 consecutive windows.
cost, accuracy = test_model(model=model, data=data_test, batch_size=300)
print(f'Test loss is: {cost:.4f}\nTest accuracy: {accuracy:.2f}%')

Test loss is: 1.8039
Test accuracy: 35.85%
