In [None]:
# Fetch the training corpus (shakespeare_more.txt) into the current working directory;
# the cell that builds the vocabulary opens it by this relative file name.
!wget https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt

--2023-11-27 21:43:45--  https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.110.133, 185.199.108.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5617411 (5.4M) [text/plain]
Saving to: ‘shakespeare_more.txt’


2023-11-27 21:43:45 (54.6 MB/s) - ‘shakespeare_more.txt’ saved [5617411/5617411]



In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
# Colab only: mount Google Drive so that save_params() can write its checkpoints
# under /content/drive/MyDrive/ML_project.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the whole corpus into memory as a single string.
with open(r'shakespeare_more.txt', encoding='utf-8') as file:
    text = file.read()

# The vocabulary is every distinct character of the corpus, in sorted order,
# so a character's position in this list is its integer id.
unique_chars = sorted(set(text))
vocab_size = len(unique_chars)

In [None]:
# Lookup tables between characters and their integer ids.
itos = dict(enumerate(unique_chars))
stoi = {char: index for index, char in itos.items()}


def encode(x):
    """Map a string (any iterable of known characters) to a list of integer ids."""
    return [stoi[char] for char in x]


def decode(x):
    """Map an iterable of integer ids back to the string they spell."""
    return ''.join(itos[index] for index in x)


# Encode the corpus once, then cut it into contiguous 80% / 10% / 10%
# train / validation / test spans.
data = torch.tensor(encode(text), dtype=torch.long)
n1 = int(len(data) * 0.8)
n2 = int(len(data) * 0.9)
data_train, data_val, data_test = data[:n1], data[n1:n2], data[n2:]

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

# Batch geometry: windows per batch, characters per window, embedding width.
batch_size = 96
context_size = 256
vector_length = 490

# Layer widths: flattened window embeddings -> three hidden layers -> one logit per character.
shapes = [context_size * vector_length, 5000, 5000, 5000, vocab_size]
# NOTE(review): num_layers and eval_interval are not referenced by any cell visible here.
num_layers = 3
dropout = 0.2

# Optimisation settings. `iterations` and `lr` are assigned again by later cells.
iterations = 10000
eval_interval = 200
lr = 3e-4

In [19]:
@torch.no_grad()
def estimate_loss():
    """Estimate the current mean loss on the 'train' and 'val' splits.

    Uses the module-level ``model`` and ``get_batch``. The model is switched
    to eval mode, the loss is averaged over 200 batches from ``get_batch``
    for each split, and the model is then switched to training mode.

    Returns:
        dict: ``{'train': mean_loss, 'val': mean_loss}``; each value is a
        0-dim tensor.
    """
    num_batches = 200
    means = {}
    model.eval()
    for split in ('train', 'val'):
        batch_losses = torch.zeros(num_batches)
        for k in range(num_batches):
            inputs, targets = get_batch(split)
            _, batch_loss = model(inputs, targets)
            batch_losses[k] = batch_loss.item()
        means[split] = batch_losses.mean()
    # Unconditionally back to training mode, whatever the mode was on entry.
    model.train()
    return means

@torch.no_grad()
def get_batch(mode):
    """Draw one random batch of (context window, next character) pairs.

    Reads the module-level ``data_train`` / ``data_val``, ``context_size``,
    ``batch_size`` and ``device``.

    Args:
        mode: 'train' selects ``data_train``; any other value selects ``data_val``.

    Returns:
        tuple: ``(X_batch, Y_batch)`` on ``device``. ``X_batch`` holds
        ``batch_size`` windows of ``context_size`` consecutive ids, and
        ``Y_batch`` holds the id that follows each window.
    """
    if mode == 'train':
        source = data_train
    else:
        source = data_val
    # Random window starts; the exclusive upper bound keeps start + context_size in range.
    starts = torch.randint(len(source) - context_size, (batch_size,)).tolist()
    windows = [source[s:s + context_size] for s in starts]
    followers = [source[s + context_size] for s in starts]
    return torch.stack(windows).to(device), torch.stack(followers).to(device)

@torch.no_grad()
def save_params(model, optimizer, scheduler):
    """Write the state dicts of the model, optimizer and scheduler to Google Drive.

    Each object's ``state_dict()`` is saved with ``torch.save`` to a fixed
    file under /content/drive/MyDrive/ML_project, overwriting any previous
    checkpoint at that path.
    """
    targets = (
        (model, r'/content/drive/MyDrive/ML_project/params.pt'),
        (optimizer, r'/content/drive/MyDrive/ML_project/optimizer.pt'),
        (scheduler, r'/content/drive/MyDrive/ML_project/scheduler.pt'),
    )
    for component, path in targets:
        torch.save(component.state_dict(), path)

@torch.no_grad()
def test_model(model, data, batch_size):
    cost = []
    accuracy = []
    for i in range(0, len(data) - context_size - batch_size , batch_size):
        X_batch = torch.stack([data[j:j+context_size] for j in range(i, i + batch_size)])
        Y_batch = torch.stack([data[j+context_size] for j in range(i, i + batch_size)])
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
        logits, loss = model(X_batch, Y_batch)
        cost.append(round(loss.item(), 4))
        probs = F.softmax(logits, dim=-1)
        char = torch.multinomial(probs, num_samples=1).view(-1)
        accuracy.append((len(char[char == Y_batch]) / 300) * 100)
    test_cost = sum(cost) / len(cost)
    test_accuracy = sum(accuracy) / len(accuracy)
    return test_cost, test_accuracy

In [None]:
class FeedForward(nn.Module):
    """One hidden block: Linear -> BatchNorm1d -> ReLU -> Dropout.

    The dropout probability is read from the module-level ``dropout`` when
    the block is constructed.
    """

    def __init__(self, fan_in, fan_out):
        super().__init__()
        stages = [
            nn.Linear(fan_in, fan_out),
            nn.BatchNorm1d(fan_out),
            nn.ReLU(),
            nn.Dropout(dropout),
        ]
        self.fwd = nn.Sequential(*stages)

    def forward(self, x):
        """Run the block on `x`, keep the result on ``self.out`` and return it."""
        activations = self.fwd(x)
        self.out = activations
        return activations

class MLP(nn.Module):
    """Character-level next-token predictor built from a plain MLP.

    Every token of the input window is embedded (character embedding plus
    learned position embedding). The embeddings are flattened into a single
    vector per window and passed through ``FeedForward`` blocks followed by a
    final ``nn.Linear`` that produces one logit per output class.

    Args:
        shapes: layer widths ``[input, hidden..., output]``. Each consecutive
            pair except the last becomes a ``FeedForward`` block; the last
            pair becomes the output ``nn.Linear``.

    The embedding sizes are read from the module-level ``vocab_size``,
    ``vector_length`` and ``context_size``.
    """

    def __init__(self, shapes):
        super().__init__()
        self.char_embedding = nn.Embedding(vocab_size, vector_length)
        self.pos_embedding = nn.Embedding(context_size, vector_length)
        # `layers` is a plain list kept for inspection; the modules are registered
        # (and appear in state_dict) through the `fwd` Sequential below.
        self.layers = [
            FeedForward(fan_in, fan_out)
            for fan_in, fan_out in zip(shapes[:-2], shapes[1:-1])
        ]
        self.layers.append(nn.Linear(shapes[-2], shapes[-1]))
        self.fwd = nn.Sequential(*self.layers)

    def forward(self, x, targets=None):
        """Compute next-token logits and, when `targets` is given, the loss.

        Args:
            x: integer tensor of token ids, shape ``(B, T)``.
            targets: optional tensor of ``B`` target ids.

        Returns:
            tuple: ``(logits, loss)``; ``loss`` is ``None`` when `targets` is
            ``None``, otherwise the cross-entropy between logits and targets.
        """
        B, T = x.shape
        char_token = self.char_embedding(x)
        # Build position ids on the input's own device so the model does not
        # depend on a module-level `device` matching where it actually lives.
        pos_token = self.pos_embedding(torch.arange(T, device=x.device))
        flat = (char_token + pos_token).view(B, -1)
        logits = self.fwd(flat)
        loss = None if targets is None else F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_length):
        """Autoregressively sample `max_length` new tokens after `idx`.

        Each step feeds the last ``context_size`` columns of `idx` through
        the model, samples one token per row from the softmax of the logits
        and appends it. No gradients are tracked.

        NOTE(review): the hidden blocks contain BatchNorm1d and Dropout, so
        callers should switch the model to ``eval()`` before generating.

        Args:
            idx: integer tensor ``(B, T)`` of seed token ids. The flattened
                window must match the first layer's input width, so seed with
                a full context window.
            max_length: number of tokens to append.

        Returns:
            Tensor of shape ``(B, T + max_length)``.
        """
        for _ in range(max_length):
            idx_block = idx[:, -context_size:]
            logits, _ = self(idx_block)
            probs = F.softmax(logits, dim=-1)
            char = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, char), dim=1)
        return idx

In [None]:
# Build the network on the selected device, then attach an AdamW optimizer and
# a learning-rate schedule that multiplies the rate by 0.99 on every scheduler step.
lr = 3e-4
model = MLP(shapes).to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.99)
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

In [None]:
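# Optional: resume from a previous checkpoint by uncommenting the three lines below.
# NOTE: save_params() writes params.pt / optimizer.pt / scheduler.pt, whereas these
# paths expect an MLP_ prefix -- rename the saved files or adjust the paths first.
# model.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/MLP_params.pt'))
# optimizer.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/MLP_optimizer.pt'))
# scheduler.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/MLP_scheduler.pt'))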

In [None]:
iterations = 30000
last_step = iterations - 1

for i in range(iterations):
    # Every 500 steps, and on the final step: checkpoint, report losses, decay the learning rate.
    if i == last_step or i % 500 == 0:
        save_params(model, optimizer, scheduler)
        losses = estimate_loss()
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        # Skip the decay at step 0, before any optimizer update has happened.
        if i > 0:
            scheduler.step()

    # One optimisation step on a fresh random training batch.
    X_batch, Y_batch = get_batch('train')
    optimizer.zero_grad(set_to_none=True)
    logits, loss = model(X_batch, Y_batch)
    loss.backward()
    optimizer.step()

step 0: train loss 4.6141, val loss 4.6144
step 500: train loss 2.7347, val loss 2.7345
step 1000: train loss 2.5760, val loss 2.5693
step 1500: train loss 2.4510, val loss 2.4704
step 2000: train loss 2.3749, val loss 2.3993
step 2500: train loss 2.3646, val loss 2.3738
step 3000: train loss 2.2736, val loss 2.3214
step 3500: train loss 2.2510, val loss 2.2773
step 4000: train loss 2.2287, val loss 2.2734
step 4500: train loss 2.1698, val loss 2.2204
step 5000: train loss 2.1763, val loss 2.2111
step 5500: train loss 2.1086, val loss 2.1671
step 6000: train loss 2.1089, val loss 2.1566
step 6500: train loss 2.0740, val loss 2.1491
step 7000: train loss 2.0728, val loss 2.1112
step 7500: train loss 2.0263, val loss 2.1214
step 8000: train loss 2.0105, val loss 2.1090
step 8500: train loss 2.0234, val loss 2.0778
step 9000: train loss 1.9516, val loss 2.0470
step 9500: train loss 1.9702, val loss 2.0404
step 10000: train loss 1.9444, val loss 2.0195
step 10500: train loss 1.9263, val lo

In [12]:
# Switch to inference mode, seed the sampler with one full context window of
# token id 0, then sample 2000 further characters and decode them to text.
model.eval()
start = torch.zeros((1, context_size), device=device, dtype=torch.long)
sample_ids = model.generate(start, max_length=2000)[0].tolist()
print("Trained sample is:\n", decode(sample_ids))

																																																																																																																																																																																																																																																																 Down PECER._Wher, By And, hood? All cames in rester! Edward” And Kech knowled,” time when hall a maid go with of a ham woman? He wrong to not parlents,
I’ll as mord with forth, hous’d at anjant of the fallence.

MONIDOW.
O Go.

BENEDICK.
Adam! rie hone to is the friendlion, myself some, grafe of wells sen
STRANIA.
O! Crave o’t is we retter is morem stranch a cause!

FIRST SITIZEN.
Then in throop sportage;
But gight man your
Welcomble, and it the growe be drius, bet’s
To capin alus stands hunmainst himself.

HAMLED.
My come, and, But my may beglouse,
Whose I mad you actertiments the bess me such
To back acquan this love. Briele poor year
This this dotern writh blash’d, ir it. [_Geshir’s flem her words.

ANTONY.
Where’s we have names l

In [20]:
# Score the held-out test split in batches of 300 windows and report the averages.
test_results = test_model(model, data_test, batch_size=300)
cost, accuracy = test_results
print(f'Test loss is: {cost:.4f}\nTest accuracy: {accuracy:.2f}%')

Test loss is: 1.8039
Test accuracy: 35.85%
