In [1]:
!wget https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt

--2023-11-27 22:03:23--  https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5617411 (5.4M) [text/plain]
Saving to: ‘shakespeare_more.txt’


2023-11-27 22:03:23 (54.3 MB/s) - ‘shakespeare_more.txt’ saved [5617411/5617411]



In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
# Colab-only step: mount Google Drive at /content/drive.
# save_params() writes its checkpoint files below this mount point
# (/content/drive/MyDrive/ML_project/), so this cell must run before training.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the whole corpus into memory, then derive the character-level vocabulary:
# every distinct character in the text, in sorted order.
with open(r'shakespeare_more.txt', 'r', encoding='utf-8') as corpus_file:
    text = corpus_file.read()
unique_chars = sorted(set(text))
vocab_size = len(unique_chars)

In [5]:
# Lookup tables between characters and integer token ids; ids follow the
# sorted order of unique_chars.
itos = dict(enumerate(unique_chars))
stoi = {char: index for index, char in itos.items()}


def encode(x):
    """Return the list of integer token ids for the characters of string x."""
    return [stoi[char] for char in x]


def decode(x):
    """Return the string spelled by the iterable of token ids x."""
    return ''.join(itos[index] for index in x)


# Encode the full corpus once, then cut it into contiguous
# train (first 80%), validation (next 10%) and test (last 10%) segments.
data = torch.tensor(encode(text), dtype=torch.long)
n1 = int(len(data) * 0.8)
n2 = int(len(data) * 0.9)
data_train, data_val, data_test = data[:n1], data[n1:n2], data[n2:]

In [6]:
# Global configuration read by the helper functions and model classes below.

# Device string used when moving batches and the model (`.to(device)`).
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Number of windows drawn per batch by get_batch().
batch_size = 96
# Length (in characters) of each input window; also the maximum context that
# RNN.generate() feeds back into the model.
context_size = 256
# Embedding dimension passed to nn.Embedding in RNN.
vector_length = 1024
# Size of the recurrent hidden state.
hidden_length = 512
# Drop probability passed to nn.Dropout in Recurrence.
dropout = 0.2
# NOTE(review): not referenced by any visible cell — the training loop uses a
# literal 500 and estimate_loss() uses a literal 200. Confirm whether this is dead.
eval_interval = 200
# NOTE(review): reassigned to 1e-3 in the model-creation cell before the
# optimizer is built, so this value never reaches AdamW.
lr = 3e-4

In [12]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for i in ['train', 'val']:
        losses = torch.zeros(200)
        for j in range(200):
            X_batch, Y_batch = get_batch(i)
            logits, loss = model(X_batch, Y_batch)
            losses[j] = loss.item()
        out[i] = losses.mean()
    model.train()
    return out

@torch.no_grad()
def get_batch(mode):
    data = data_train if mode == 'train' else data_val
    batch = torch.randint(len(data) - context_size, (batch_size,))
    X_batch = torch.stack([data[i:i+context_size] for i in batch])
    Y_batch = torch.stack([data[i+1:i+context_size+1] for i in batch])
    X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
    return X_batch, Y_batch

@torch.no_grad()
def save_params(model, optimizer, scheduler):
    """Write the state dicts of the model, optimizer and scheduler to Drive.

    Each object's ``state_dict()`` is saved with ``torch.save`` to a fixed
    file under /content/drive/MyDrive/ML_project/, overwriting any previous
    checkpoint at that path.
    """
    checkpoints = (
        (model, r'/content/drive/MyDrive/ML_project/params.pt'),
        (optimizer, r'/content/drive/MyDrive/ML_project/optimizer.pt'),
        (scheduler, r'/content/drive/MyDrive/ML_project/scheduler.pt'),
    )
    for component, path in checkpoints:
        torch.save(component.state_dict(), path)

@torch.no_grad()
def test_model(model, data, batch_size):
    cost = []
    accuracy = []
    for i in range(0, len(data) - context_size - batch_size , batch_size):
        X_batch = torch.stack([data[j:j+context_size] for j in range(i, i + batch_size)])
        Y_batch = torch.stack([data[j+1:j+context_size+1] for j in range(i, i + batch_size)])
        X_batch, Y_batch = X_batch.to(device), Y_batch.to(device)
        logits, loss = model(X_batch, Y_batch)
        cost.append(round(loss.item(), 4))
        logits = logits.view(batch_size, context_size, vocab_size)
        probs = F.softmax(logits, dim=-1)[:, -1, :]
        char = torch.multinomial(probs, num_samples=1).view(-1)
        Y_accuracy = Y_batch[:, -1]
        accuracy.append((len(char[char == Y_accuracy]) / 300) * 100)
    test_cost = sum(cost) / len(cost)
    test_accuracy = sum(accuracy) / len(accuracy)
    return test_cost, test_accuracy

In [8]:
class Recurrence(nn.Module):
    """One recurrent step: combine the current input vector with the previous
    hidden state and produce output logits plus the next hidden state.

    Layer sizes come from the globals ``vector_length``, ``hidden_length``,
    ``vocab_size`` and ``dropout``.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and creation order are kept as-is: they determine the
        # state_dict keys and the order in which random initialisation is drawn.
        self.hidden_layer = nn.Linear(vector_length + hidden_length, hidden_length)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(dropout)
        self.output_layer = nn.Linear(hidden_length, vocab_size)

    def forward(self, x, hidden):
        """Advance the recurrence by one step.

        Args:
            x: 2-D tensor with one input vector per batch row.
            hidden: previous hidden state, concatenated to ``x`` along dim 1.

        Returns:
            ``(output, hidden_new)``: the output-layer result and the new
            hidden state (post-ReLU).
        """
        batch, _ = x.shape  # unpacking also enforces that x is 2-D
        combined = torch.cat((x.view(batch, -1), hidden), dim=1)
        # Dropout is applied to the concatenated input and hidden state together.
        hidden_new = self.relu(self.hidden_layer(self.drop(combined)))
        return self.output_layer(hidden_new), hidden_new

In [9]:
class RNN(nn.Module):
    """Character-level recurrent language model.

    Token ids are embedded, then fed one time step at a time through a single
    shared ``Recurrence`` cell, starting from a learned initial hidden state.
    """

    def __init__(self):
        super().__init__()
        self.char_embedding = nn.Embedding(vocab_size, vector_length)
        # self.init_hidden = nn.Linear(vector_length, hidden_length)
        # Learned initial hidden state, shared by every sequence in a batch.
        self.init_hidden = nn.Parameter(torch.zeros(hidden_length, device=device))
        self.recurrence = Recurrence()
        # NOTE: not used by forward(); kept so the module's state_dict keys stay
        # the same and previously saved checkpoints still load.
        self.final = nn.Linear(hidden_length, vocab_size)

    def forward(self, x, targets=None):
        """Run the model over a batch of token-id sequences.

        Args:
            x: ``(B, T)`` tensor of token ids.
            targets: optional ``(B, T)`` tensor of next-token ids.

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            ``(B, T, vocab_size)`` and loss is ``None``. With targets, logits
            are flattened to ``(B*T, vocab_size)`` and loss is the
            cross-entropy against the flattened targets.
        """
        B, T = x.shape
        # Allocate on the input's device rather than the global `device`, so the
        # model keeps working if it is moved somewhere else.
        logits = torch.zeros((B, T, vocab_size), device=x.device)
        char_token = self.char_embedding(x)
        hidden = self.init_hidden.repeat(B, 1)
        for i in range(T):
            logits[:, i, :], hidden = self.recurrence(char_token[:, i, :], hidden)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_length):
        """Sample ``max_length`` new tokens, appending them to ``idx``.

        Runs without gradient tracking and in eval mode, so dropout does not
        perturb the sampled distribution; the previous train/eval mode is
        restored before returning.

        Args:
            idx: ``(B, T0)`` tensor of token ids used as the prompt.
            max_length: number of tokens to append.

        Returns:
            ``(B, T0 + max_length)`` tensor of token ids.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_length):
                # Only the most recent context_size tokens are fed back in.
                idx_block = idx[:, -context_size:]
                logits, _ = self(idx_block)
                probs = F.softmax(logits[:, -1, :], dim=-1)
                char = torch.multinomial(probs, num_samples=1)
                idx = torch.cat((idx, char), dim=1)
        finally:
            self.train(was_training)
        return idx

In [10]:
# Build the model on the selected device together with its optimizer and
# learning-rate schedule. `lr` set here is the value AdamW actually receives.
lr = 1e-3
model = RNN().to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
# Each scheduler.step() multiplies the learning rate by 0.95.
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)
# Alternative schedule kept for reference:
# scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')

In [11]:
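# Optional: resume from a saved checkpoint by uncommenting the lines below.
# NOTE: these paths use an "RNN_" prefix, whereas save_params() writes params.pt,
# optimizer.pt and scheduler.pt — rename the files or adjust the paths first.
# model.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/RNN_params.pt'))
# optimizer.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/RNN_optimizer.pt'))
# scheduler.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/RNN_scheduler.pt'))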

In [11]:
# Training loop. Every 500 steps (and on the last step) the current state is
# checkpointed and the train/val loss is estimated and printed *before* that
# step's update; the learning rate is decayed at each such point except step 0.
iterations = 30000
model.train()

for step in range(iterations):
    is_report_step = (step % 500 == 0) or (step == iterations - 1)
    if is_report_step:
        save_params(model, optimizer, scheduler)
        losses = estimate_loss()
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        # Skip the decay at step 0, before any optimizer step has happened.
        if step > 0:
            scheduler.step()

    # One optimisation step on a fresh random training batch.
    X_batch, Y_batch = get_batch('train')
    logits, loss = model(X_batch, Y_batch)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

step 0: train loss 4.5857, val loss 4.5870
step 500: train loss 1.5248, val loss 1.6606
step 1000: train loss 1.4405, val loss 1.6016
step 1500: train loss 1.4056, val loss 1.5810
step 2000: train loss 1.3822, val loss 1.5599
step 2500: train loss 1.3674, val loss 1.5522
step 3000: train loss 1.3577, val loss 1.5411
step 3500: train loss 1.3503, val loss 1.5339
step 4000: train loss 1.3427, val loss 1.5269
step 4500: train loss 1.3385, val loss 1.5239
step 5000: train loss 1.3333, val loss 1.5196
step 5500: train loss 1.3277, val loss 1.5124
step 6000: train loss 1.3260, val loss 1.5128
step 6500: train loss 1.3231, val loss 1.5116
step 7000: train loss 1.3192, val loss 1.5079
step 7500: train loss 1.3181, val loss 1.5119
step 8000: train loss 1.3176, val loss 1.5071
step 8500: train loss 1.3150, val loss 1.5057
step 9000: train loss 1.3116, val loss 1.5039
step 9500: train loss 1.3118, val loss 1.5072
step 10000: train loss 1.3086, val loss 1.4995
step 10500: train loss 1.3069, val lo

KeyboardInterrupt: ignored

In [12]:
# Sample 2000 characters from the model, seeded with the single token id 0,
# then decode the resulting ids back to text.
start = torch.zeros((1,1), dtype=torch.long, device=device)
generated = model.generate(start, max_length=2000)
sample_index = generated[0].tolist()
sample = decode(sample_index)
print("Trained Sample:\n", sample)

Trained Sample:
 	” And bite. Not what man do, you misdeepor of traitor do, but these floct,
Like brave.
There’s a slaughter. Swaln thou got’st they think, that to the sale-tirch. He may have abbat,
Of me a mended.

ANTONY
SIMPs.
Made me not to thou revelets shame in things is capled.”
         ”       Enters, Ford, would then, vight.
Say thou art head,
Is yourself we that I say.

GUIGINIO.
To God, that’s stand swellish’d him.
Ha! I’ll fill these natimont is you shallow
On Cupt I know your life of him and high-ache;
And new management. Pish in the heads, and in your love, what trainable is could be dispos’d.

DROMIO OF EPHUS.
Ay, but keep him
If sure
But my worthiness of fair tongues of that was welcome to labours and this treating_!
For a thousand nature and will fell the times of sight.

CLEON.
Madadry wanton born.

ARVIRAGUSE, Attendan,
Never make! Durchances_; a lost.

Enter King Richard._]

Go by th’ the enfor these her name.

CLAUDIO.
Had, my lord, you should, my good cutiet to b

In [13]:
# Score the held-out test split in batches of 300 windows and report the
# mean loss and next-character accuracy.
cost, accuracy = test_model(model, data_test, batch_size=300)
print(f'Test loss is: {cost:.4f}\nTest accuracy: {accuracy:.2f}%')

Test loss is: 1.5771
Test accuracy: 42.37%
