In [1]:
!wget https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt

--2023-11-27 22:13:03--  https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5617411 (5.4M) [text/plain]
Saving to: ‘shakespeare_more.txt’


2023-11-27 22:13:04 (327 MB/s) - ‘shakespeare_more.txt’ saved [5617411/5617411]



In [2]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Read the whole corpus into memory and derive the character vocabulary from it.
with open(r'shakespeare_more.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Sorting makes the character -> id assignment deterministic across runs.
unique_chars = sorted(set(text))
vocab_size = len(unique_chars)

In [5]:
# Lookup tables between characters and their integer ids (position in unique_chars).
itos = dict(enumerate(unique_chars))
stoi = {char: index for index, char in itos.items()}


def encode(x):
    """Return the list of integer ids for the characters of string ``x``."""
    return [stoi[char] for char in x]


def decode(x):
    """Return the string spelled by the iterable of integer ids ``x``."""
    return ''.join(itos[index] for index in x)


# Encode the full corpus once, then cut it 80% / 10% / 10% in reading order.
data = torch.tensor(encode(text), dtype=torch.long)
n1 = int(len(data) * 0.8)
n2 = int(len(data) * 0.9)
data_train, data_val, data_test = data[:n1], data[n1:n2], data[n2:]

In [6]:
# Number of features per sample after the conv stack is flattened; it is the
# input width of the MLP (first entry of feed_params' shapes). For the active
# block_params and vector_length = 490 the conv stack leaves 2048 channels x 23
# positions = 47104. Must be recomputed whenever block_params, context_size or
# vector_length change.
flat = 47104
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Number of windows drawn per training/evaluation batch in get_batch.
batch_size = 96
# Number of consecutive characters the model sees to predict the next one.
context_size = 256
# Size of each character embedding vector.
vector_length = 490
# Earlier conv-stack configurations, kept for reference:
# block_params = [(context_size, 512, 1024), (64, 16), (3, 3), (2, 2), (4, 4), (0.2, 0.2)]
# block_params = [(context_size, 512, 1024), (64, 32), (3, 3), (2, 2), (4, 4), (0.2, 0.2)]
# Positional arguments for Block, in order:
# channels, kernel_sizes, pool_sizes, conv_strides, pool_strides, dropouts.
block_params = [(context_size, 1024, 2048), (64, 32), (3, 3), (1, 2), (2, 4), (0.2, 0.2)]
# Positional arguments for MLP: layer widths (starting at flat), dropout probability.
feed_params = [(flat, 5000, 5000, 5000), 0.2]
# NOTE(review): eval_interval is not referenced by the cells visible in this
# notebook; the training loop uses a literal 500 instead.
eval_interval = 200
# NOTE(review): lr is reassigned to 1e-3 in the cell that builds the optimizer,
# so this value is not the one the optimizer is created with.
lr = 3e-4

In [7]:
torch.cuda.is_available()

True

In [17]:
@torch.no_grad()
def estimate_loss():
    """Estimate the mean loss of the global ``model`` on the train and val splits.

    For each split, 200 random batches are drawn with ``get_batch`` and their
    losses are averaged. The model is put in eval mode for the measurement and
    switched back to train mode before returning.

    Returns:
        dict: ``{'train': mean_loss, 'val': mean_loss}`` where each value is a
        0-dim tensor.
    """
    model.eval()
    mean_losses = {}
    for split in ('train', 'val'):
        # model(...) returns (logits, loss); only the scalar loss is kept.
        batch_losses = torch.tensor(
            [model(*get_batch(split))[1].item() for _ in range(200)]
        )
        mean_losses[split] = batch_losses.mean()
    model.train()
    return mean_losses

@torch.no_grad()
def get_batch(mode):
    """Draw a random batch of (context window, next character) pairs.

    Args:
        mode: ``'train'`` samples from ``data_train``; any other value samples
            from ``data_val``.

    Returns:
        tuple: ``(X_batch, Y_batch)`` on ``device``. ``X_batch`` stacks
        ``batch_size`` windows of ``context_size`` ids; ``Y_batch`` holds the id
        that immediately follows each window.
    """
    source = data_train if mode == 'train' else data_val
    # Upper bound leaves room for the window plus the one target id after it.
    starts = torch.randint(len(source) - context_size, (batch_size,))
    windows = []
    next_ids = []
    for start in starts:
        stop = start + context_size
        windows.append(source[start:stop])
        next_ids.append(source[stop])
    return torch.stack(windows).to(device), torch.stack(next_ids).to(device)

@torch.no_grad()
def save_params(model, optimizer, scheduler):
    """Write the state dicts of the model, optimizer and scheduler to Google Drive.

    Each object is saved to its own fixed file under
    ``/content/drive/MyDrive/ML_project/``; existing files are overwritten.

    Args:
        model: object whose ``state_dict()`` goes to ``params.pt``.
        optimizer: object whose ``state_dict()`` goes to ``optimizer.pt``.
        scheduler: object whose ``state_dict()`` goes to ``scheduler.pt``.
    """
    # NOTE(review): the commented-out restore cell reads CNN_params.pt /
    # CNN_optimizer.pt / CNN_scheduler.pt, which are not the names written here.
    targets = (
        (model, r'/content/drive/MyDrive/ML_project/params.pt'),
        (optimizer, r'/content/drive/MyDrive/ML_project/optimizer.pt'),
        (scheduler, r'/content/drive/MyDrive/ML_project/scheduler.pt'),
    )
    for stateful, path in targets:
        torch.save(stateful.state_dict(), path)

@torch.no_grad()
def test_model(model, data, batch_size):
    """Evaluate ``model`` on sequential, non-random batches taken from ``data``.

    Batches start at positions ``0, batch_size, 2*batch_size, ...`` and stop
    before ``len(data) - context_size - batch_size``, so a short tail of
    ``data`` is not evaluated. Each sample is a window of ``context_size`` ids
    and the target is the id that follows it.

    The model is switched to eval mode for the duration of the call (so Dropout
    and BatchNorm behave deterministically) and its previous mode is restored
    afterwards.

    Args:
        model: module whose call ``model(X, Y)`` returns ``(logits, loss)``.
        data: 1-D tensor of token ids.
        batch_size: number of consecutive windows per batch.

    Returns:
        tuple: ``(test_cost, test_accuracy)`` - the mean of the per-batch losses
        (each rounded to 4 decimals) and the mean per-batch accuracy in percent.

    Raises:
        ValueError: if ``data`` is too short to form a single batch.
    """
    was_training = model.training
    model.eval()
    cost = []
    accuracy = []
    try:
        for i in range(0, len(data) - context_size - batch_size, batch_size):
            starts = range(i, i + batch_size)
            X_batch = torch.stack([data[j:j+context_size] for j in starts]).to(device)
            Y_batch = torch.stack([data[j+context_size] for j in starts]).to(device)
            logits, loss = model(X_batch, Y_batch)
            cost.append(round(loss.item(), 4))
            # NOTE(review): the prediction is *sampled* from the softmax rather
            # than taken as the argmax, so the reported accuracy is stochastic.
            probs = F.softmax(logits, dim=-1)
            char = torch.multinomial(probs, num_samples=1).view(-1)
            # Normalise by the actual batch size (was a hard-coded 300).
            accuracy.append((char == Y_batch).sum().item() / batch_size * 100)
    finally:
        # Leave the model in whatever mode the caller had it in.
        model.train(was_training)
    if not cost:
        raise ValueError(
            "data is too short to build a single batch of the requested batch_size"
        )
    test_cost = sum(cost) / len(cost)
    test_accuracy = sum(accuracy) / len(accuracy)
    return test_cost, test_accuracy

In [9]:
class Convolution(nn.Module):
    """One convolutional stage: Conv1d -> BatchNorm1d -> ReLU -> MaxPool1d -> Dropout.

    Args:
        in_channel: number of input channels of the convolution.
        out_channel: number of output channels of the convolution.
        kernel_size: kernel size of the convolution.
        pool_size: kernel size of the max-pooling layer.
        conv_stride: stride of the convolution.
        pool_stride: stride of the max-pooling layer.
        dropout: dropout probability applied after pooling.
    """

    def __init__(self, in_channel, out_channel, kernel_size, pool_size, conv_stride, pool_stride, dropout):
        super().__init__()
        stages = [
            nn.Conv1d(in_channel, out_channel, kernel_size=kernel_size, stride=conv_stride),
            nn.BatchNorm1d(out_channel),
            nn.ReLU(),
            nn.MaxPool1d(kernel_size=pool_size, stride=pool_stride),
            nn.Dropout(dropout),
        ]
        self.convolution = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stage; the result is also kept on ``self.out``."""
        result = self.convolution(x)
        self.out = result
        return result

class Block(nn.Module):
    """A chain of ``Convolution`` stages applied one after another.

    Stage ``i`` maps ``channels[i]`` to ``channels[i+1]`` channels and takes its
    remaining settings from position ``i`` of each of the other sequences, so
    ``len(channels) - 1`` stages are created.

    Args:
        channels: channel counts, input first.
        kernel_sizes: convolution kernel size per stage.
        pool_sizes: max-pool kernel size per stage.
        conv_strides: convolution stride per stage.
        pool_strides: max-pool stride per stage.
        dropouts: dropout probability per stage.
    """

    def __init__(self, channels, kernel_sizes, pool_sizes, conv_strides, pool_strides, dropouts):
        super().__init__()
        stages = []
        for i in range(len(channels) - 1):
            stages.append(
                Convolution(
                    channels[i],
                    channels[i+1],
                    kernel_sizes[i],
                    pool_sizes[i],
                    conv_strides[i],
                    pool_strides[i],
                    dropouts[i],
                )
            )
        self.block = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through every stage; the result is also kept on ``self.out``."""
        result = self.block(x)
        self.out = result
        return result

In [10]:
class FeedForward(nn.Module):
    """One fully connected stage: Linear -> BatchNorm1d -> ReLU -> Dropout.

    Args:
        fan_in: number of input features.
        fan_out: number of output features.
        dropout: dropout probability applied after the activation.
    """

    def __init__(self, fan_in, fan_out, dropout):
        super().__init__()
        stages = [
            nn.Linear(fan_in, fan_out),
            nn.BatchNorm1d(fan_out),
            nn.ReLU(),
            nn.Dropout(dropout),
        ]
        self.linear = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through the stage; the result is also kept on ``self.out``."""
        result = self.linear(x)
        self.out = result
        return result
class MLP(nn.Module):
    """A stack of ``FeedForward`` stages.

    Args:
        shapes: sequence of layer widths; each consecutive pair
            ``(shapes[i], shapes[i+1])`` becomes one ``FeedForward`` stage.
        dropout: dropout probability shared by every stage.
    """

    def __init__(self, shapes, dropout):
        super().__init__()
        width_pairs = zip(shapes[:-1], shapes[1:])
        stages = [FeedForward(fan_in, fan_out, dropout) for fan_in, fan_out in width_pairs]
        self.mlp = nn.Sequential(*stages)

    def forward(self, x):
        """Run ``x`` through every stage; the result is also kept on ``self.out``."""
        result = self.mlp(x)
        self.out = result
        return result

In [11]:
class CNN(nn.Module):
    """Character-level next-token predictor: embedding -> conv Block -> MLP -> linear head.

    Args:
        block_params: positional arguments forwarded to ``Block``.
        feed_params: positional arguments forwarded to ``MLP``; the last width in
            ``feed_params[0]`` is the input width of the output layer.
    """

    def __init__(self, block_params, feed_params):
        super().__init__()
        self.char_embedding = nn.Embedding(vocab_size, vector_length)
        self.conv = Block(*block_params)
        self.mlp = MLP(*feed_params)
        self.final = nn.Linear(feed_params[0][-1], vocab_size)

    def forward(self, x, targets=None):
        """Compute next-character logits and, if targets are given, the loss.

        Args:
            x: 2-D tensor of token ids, one row per sample.
            targets: optional tensor with the id of the next character for each
                row of ``x``.

        Returns:
            tuple: ``(logits, loss)``; ``loss`` is ``None`` when ``targets`` is
            ``None``, otherwise the cross-entropy between logits and targets.
        """
        B, _ = x.shape
        # The embedding output is deliberately NOT transposed: Conv1d therefore
        # treats the sequence positions as channels and slides along the
        # embedding dimension.
        embedding_matrix = self.char_embedding(x)#.transpose(-2, -1)
        convoluted = self.conv(embedding_matrix)
        # Flatten everything except the batch dimension for the MLP.
        layer = convoluted.view(B, -1)
        forward = self.mlp(layer)
        logits = self.final(forward)
        # Identity check: `== None` on a tensor relies on comparison overloading.
        if targets is None:
            loss = None
        else:
            loss = F.cross_entropy(logits, targets)
        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_length):
        """Autoregressively append ``max_length`` sampled token ids to ``idx``.

        At every step the last ``context_size`` ids of each row are fed to the
        model and the next id is sampled from the softmax of the logits.
        Runs without gradient tracking. The train/eval mode is left as is, so
        callers should put the model in eval mode first.

        Args:
            idx: 2-D tensor of token ids used as the initial context.
            max_length: number of ids to generate.

        Returns:
            Tensor: ``idx`` with ``max_length`` extra columns appended.
        """
        for _ in range(max_length):
            idx_block = idx[:, -context_size:]
            logits, _ = self(idx_block)
            probs = F.softmax(logits, dim=-1)
            char = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, char), dim=1)
        return idx

In [12]:
# Learning rate actually handed to the optimizer (replaces the earlier value of lr).
lr = 1e-3

# Build the network and move its parameters to the selected device.
model = CNN(block_params, feed_params).to(device)

# AdamW with an exponential decay: each scheduler.step() multiplies the lr by 0.95.
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.95)

In [13]:
# model.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/CNN_params.pt'))
# optimizer.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/CNN_optimizer.pt'))
# scheduler.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/CNN_scheduler.pt'))

In [None]:
iterations = 30000

for i in range(iterations):
    # Every 500 steps, and on the very last step: checkpoint, report the
    # estimated losses, and (except at step 0) decay the learning rate.
    at_checkpoint = (i % 500 == 0) or (i == iterations - 1)
    if at_checkpoint:
        save_params(model, optimizer, scheduler)
        losses = estimate_loss()
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        if i > 0:
            scheduler.step()

    # One optimisation step on a fresh random training batch.
    X_batch, Y_batch = get_batch('train')
    optimizer.zero_grad(set_to_none=True)
    _, loss = model(X_batch, Y_batch)
    loss.backward()
    optimizer.step()

step 30000: train loss 1.3051, val loss 1.5491
step 30500: train loss 1.3066, val loss 1.5613
step 31000: train loss 1.3092, val loss 1.5412
step 31500: train loss 1.3052, val loss 1.5489
step 32000: train loss 1.3049, val loss 1.5666


KeyboardInterrupt: ignored

In [None]:
# Prompt: one row made of context_size copies of token id 0.
start = torch.zeros(1, context_size, dtype=torch.long, device=device)
# start = data_test[:256].clone().view(1, -1)
start = start.to(device)

# Eval mode so Dropout/BatchNorm do not behave as in training while sampling.
model.eval()
generated_ids = model.generate(start, max_length=2000)[0].tolist()
print("Trained sample is:\n", decode(generated_ids))


Trained sample is:
 change in copilions? Is you lame, nor goodly little fall our dismonths
If pleases and I’ll take the meanable prags.

DENALLOW.
The fox owt to Antiocan which is’t not.

MENENIUS.
Nay, I shall rey uncle, ’tis new if your silk.

KING,EDWARD.
Now to’t, were bestower it, Treason for an overhy, have peace, there despaid speak offended
Of you bear the word
Salisbury incertain of dost, and she tops usin. That high amoves that are and swift open!
The lost was w


In [None]:
# Draw another sample from the same all-zeros prompt (sampling is random, so
# the text differs from run to run).
model.eval()
start = torch.zeros(1, context_size, dtype=torch.long, device=device)
sample_ids = model.generate(start, max_length=2000)[0]
print(decode(sample_ids.tolist()))

In [18]:
# Final evaluation on the held-out test split, 300 consecutive windows per batch.
cost, accuracy = test_model(model=model, data=data_test, batch_size=300)
print(f'Test loss is: {cost:.4f}\nTest accuracy: {accuracy:.2f}%')

Test loss is: 1.6526
Test accuracy: 41.98%
