In [None]:
!wget https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt

--2023-11-04 16:44:00--  https://raw.githubusercontent.com/Amrtamer711/Shakespeare-Transformer/main/shakespeare_more.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5617411 (5.4M) [text/plain]
Saving to: ‘shakespeare_more.txt’


2023-11-04 16:44:01 (264 MB/s) - ‘shakespeare_more.txt’ saved [5617411/5617411]



In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F

In [None]:
# Mount Google Drive at /content/drive (google.colab is only importable inside Colab).
# save_params writes its checkpoints under /content/drive/MyDrive/ML_project.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the whole corpus into memory as a single string.
with open(r'shakespeare_more.txt', 'r', encoding='utf-8') as file:
    text = file.read()

# Character-level vocabulary: every distinct character, sorted so that the
# character -> index assignment built from it is the same on every run.
unique_chars = sorted(set(text))
vocab_size = len(unique_chars)

In [None]:
# Character-level tokenizer: each character maps to its position in `unique_chars`.
stoi = {char: index for index, char in enumerate(unique_chars)}
itos = {index: char for char, index in stoi.items()}


def encode(x):
    """Return the list of vocabulary indices for the characters of `x`."""
    return [stoi[char] for char in x]


def decode(x):
    """Return the string spelled out by the vocabulary indices in `x`."""
    return ''.join(itos[index] for index in x)


# Encode the corpus once; the first 90% of tokens are used for training and the
# remaining 10% are held out for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(len(data) * 0.9)
data_train, data_val = data[:n], data[n:]

In [None]:
# --- Run configuration -------------------------------------------------------
# Seed PyTorch's random number generators so that parameter initialisation, batch
# sampling, dropout and text generation are repeatable under Restart & Run All.
seed = 1337
torch.manual_seed(seed)

device = 'cuda' if torch.cuda.is_available() else 'cpu'  # use the GPU when one is visible

# Data / batching
batch_size = 96       # sequences per optimisation step
context_size = 256    # tokens per sequence (also the size of the positional embedding table)

# Model size
vector_length = 490   # embedding width used throughout the model
num_heads = 10        # attention heads per block
num_blocks = 10       # stacked transformer blocks
head_size = vector_length//num_heads  # per-head width (integer division)
dropout = 0.3         # dropout probability passed to every nn.Dropout

# Optimisation
iterations = 10000    # training steps
eval_interval = 200   # steps between checkpoint + loss evaluation
lr = 3e-4             # initial AdamW learning rate

In [None]:
# Display whether PyTorch can see a CUDA device (the same check that selects `device`).
torch.cuda.is_available()

True

In [None]:
class Head(nn.Module):
    """A single causal self-attention head.

    The input is projected to queries, keys and values of width `head_size`.
    Each position attends only to itself and earlier positions: entries above
    the diagonal are masked with -inf before the softmax.

    Attention scores are scaled by 1/sqrt(head width) before the softmax
    (scaled dot-product attention), which keeps the softmax from saturating as
    the head width grows. Parameter names and shapes are unchanged, but weights
    trained with unscaled scores will behave differently and should be retrained.
    """

    def __init__(self):
        super().__init__()
        self.query = nn.Linear(vector_length, head_size, bias=False)
        self.key = nn.Linear(vector_length, head_size, bias=False)
        self.value = nn.Linear(vector_length, head_size, bias=False)
        # Lower-triangular mask, stored as a buffer so it moves with the module
        # (.to(device)) and is kept in the state_dict like before.
        self.register_buffer('tril', torch.tril(torch.ones(context_size, context_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply causal attention to `x`.

        Args:
            x: tensor unpacked as (B, T, C); T must not exceed `context_size`,
               the side length of the mask buffer.

        Returns:
            Tensor whose last dimension is `head_size`; also stored on `self.out`.
        """
        B, T, C = x.shape
        q = self.query(x)
        k = self.key(x)
        # Scale by 1/sqrt(d_k), where d_k is the key width.
        weights = (q @ k.transpose(-2, -1)) * k.shape[-1] ** -0.5
        # Block attention to future positions.
        weights = weights.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        weights = F.softmax(weights, dim=-1)
        weights = self.dropout(weights)
        v = self.value(x)
        self.out = weights @ v
        return self.out

In [None]:
class MultiAttention(nn.Module):
    """Multi-head self-attention.

    Runs `num_heads` independent `Head` modules on the same input, concatenates
    their outputs along the last dimension, projects the result back to
    `vector_length` and applies dropout.
    """

    def __init__(self):
        super().__init__()
        self.heads = nn.ModuleList([Head() for _ in range(num_heads)])
        # The concatenated width is num_heads * head_size. That equals
        # vector_length only when vector_length is divisible by num_heads, so
        # size the projection from the actual concatenated width.
        self.proj = nn.Linear(num_heads * head_size, vector_length)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return the projected, dropout-regularised concatenation of all heads.

        The result is also stored on `self.out`.
        """
        multi = [head(x) for head in self.heads]
        attention = torch.cat(multi, dim=-1)
        projection = self.proj(attention)
        self.out = self.dropout(projection)
        return self.out

In [None]:
class FeedFwd(nn.Module):
    """Feed-forward sub-layer: Linear -> ReLU -> Linear -> Dropout.

    The hidden layer is four times as wide as `vector_length`; input and output
    both have width `vector_length`.
    """

    def __init__(self):
        super().__init__()
        hidden_width = 4 * vector_length
        layers = [
            nn.Linear(vector_length, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, vector_length),
            nn.Dropout(dropout),
        ]
        # Positional Sequential keeps the state_dict keys fwd.0.* and fwd.2.*.
        self.fwd = nn.Sequential(*layers)

    def forward(self, x):
        """Run `x` through the stack; the result is also stored on `self.out`."""
        result = self.fwd(x)
        self.out = result
        return result

In [None]:
class Block(nn.Module):
    """One transformer block with pre-normalisation and residual connections.

    Each sub-layer (attention, then feed-forward) receives a LayerNorm-ed copy
    of the running activation, and its output is added back onto it.
    """

    def __init__(self):
        super().__init__()
        self.norm1 = nn.LayerNorm(vector_length)
        self.norm2 = nn.LayerNorm(vector_length)
        self.attention = MultiAttention()
        self.fwd = FeedFwd()

    def forward(self, x):
        """Return `x` after both residual sub-layers; also stored on `self.out`."""
        # Residual connection around attention.
        attended = self.attention(self.norm1(x))
        x = x + attended
        # Residual connection around the feed-forward network.
        transformed = self.fwd(self.norm2(x))
        self.out = x + transformed
        return self.out

In [None]:
class Transformer(nn.Module):
    """Character-level decoder-only transformer language model.

    Token and learned positional embeddings are summed, passed through
    `num_blocks` `Block` modules and a final LayerNorm, then projected to one
    logit per vocabulary entry.
    """

    def __init__(self):
        super().__init__()
        self.char_embedding = nn.Embedding(vocab_size, vector_length)
        self.pos_embedding = nn.Embedding(context_size, vector_length)
        self.norm = nn.LayerNorm(vector_length)
        self.blocks = nn.Sequential(*[Block() for _ in range(num_blocks)])
        self.final = nn.Linear(vector_length, vocab_size)

    def forward(self, x, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            x: integer tensor unpacked as (B, T) of vocabulary indices; T must
               not exceed the positional embedding table size.
            targets: optional integer tensor with B*T elements holding the
               expected next token for every position.

        Returns:
            (logits, loss). Without `targets`, logits has shape (B, T, vocab)
            and loss is None. With `targets`, logits is flattened to
            (B*T, vocab) and loss is the mean cross-entropy.
        """
        B, T = x.shape
        char_token = self.char_embedding(x)
        # Build the position indices on the input's device rather than on a
        # module-level global, so the model works wherever it has been moved.
        pos_token = self.pos_embedding(torch.arange(T, device=x.device))
        token = char_token + pos_token
        hidden = self.blocks(token)
        logits = self.final(self.norm(hidden))
        if targets is None:
            return logits, None
        B, T, C = logits.shape
        logits = logits.view(B*T, C)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_length):
        """Autoregressively sample `max_length` new tokens after `idx`.

        Sampling runs in evaluation mode (dropout disabled) and without
        gradient tracking; the module's previous train/eval mode is restored
        afterwards, even if sampling raises.

        Args:
            idx: integer tensor (B, T) holding the prompt tokens.
            max_length: number of tokens to append.

        Returns:
            Integer tensor (B, T + max_length): the prompt followed by the samples.
        """
        # Same value as the global context_size used to build the table.
        window = self.pos_embedding.num_embeddings
        was_training = self.training
        self.eval()
        try:
            with torch.no_grad():
                for _ in range(max_length):
                    # The model can only look at the last `window` tokens.
                    idx_block = idx[:, -window:]
                    logits, _ = self(idx_block)
                    logits = logits[:, -1, :]  # keep only the final position
                    probs = F.softmax(logits, dim=-1)
                    char = torch.multinomial(probs, num_samples=1)
                    idx = torch.cat((idx, char), dim=1)
        finally:
            self.train(was_training)
        return idx

In [None]:
@torch.no_grad()
def estimate_loss(eval_iters=100):
    """Estimate the current train and validation loss of the global `model`.

    For each split, `eval_iters` random batches are drawn with `batch` and
    their losses averaged. The model is put in evaluation mode for the
    measurement, and its previous train/eval mode is restored afterwards,
    even if evaluation raises.

    Args:
        eval_iters: number of random batches averaged per split (default 100).

    Returns:
        dict with keys 'train' and 'val', each a 0-dim tensor holding the mean loss.
    """
    out = {}
    was_training = model.training
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for j in range(eval_iters):
                X_batch, Y_batch = batch(split)
                _, loss = model(X_batch, Y_batch)
                losses[j] = loss.item()
            out[split] = losses.mean()
    finally:
        model.train(was_training)
    return out

@torch.no_grad()
def batch(mode):
    """Draw a random batch of training examples.

    Args:
        mode: 'train' samples from `data_train`; any other value samples
              from `data_val`.

    Returns:
        (X_batch, Y_batch), both of shape (batch_size, context_size) and moved
        to `device`. Y_batch is X_batch shifted one token to the right, i.e.
        the next-token target for every input position.
    """
    source = data_train if mode == 'train' else data_val
    # Random window starts; the exclusive upper bound leaves room for the
    # one-token-shifted target window.
    starts = torch.randint(len(source) - context_size, (batch_size,)).tolist()
    inputs = torch.stack([source[s:s + context_size] for s in starts])
    targets = torch.stack([source[s + 1:s + context_size + 1] for s in starts])
    return inputs.to(device), targets.to(device)

@torch.no_grad()
def save_params(model, optimizer, scheduler, directory=r'/content/drive/MyDrive/ML_project'):
    """Write a training checkpoint as three state_dict files.

    Args:
        model: module whose `state_dict()` is written to `params.pt`.
        optimizer: optimizer whose `state_dict()` is written to `optimizer.pt`.
        scheduler: LR scheduler whose `state_dict()` is written to `scheduler.pt`.
        directory: destination folder. Defaults to the Google Drive location
            used throughout this notebook; it is created if it does not exist
            (previously a missing folder made the first save fail).
    """
    import os  # local import: the notebook's import cell does not bring in os

    os.makedirs(directory, exist_ok=True)
    torch.save(model.state_dict(), os.path.join(directory, 'params.pt'))
    torch.save(optimizer.state_dict(), os.path.join(directory, 'optimizer.pt'))
    torch.save(scheduler.state_dict(), os.path.join(directory, 'scheduler.pt'))

In [None]:
# Instantiate the network on the selected device, then create the AdamW optimizer
# over its parameters and a ReduceLROnPlateau scheduler in 'min' mode (otherwise
# default settings) that is stepped with the validation loss during training.
model = Transformer().to(device)
optimizer = torch.optim.AdamW(params=model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min')

In [None]:
# Baseline before training: prompt with a single token of index 0 (the first entry
# of the sorted vocabulary) and sample 2000 further characters.
start = torch.zeros(1, 1, dtype=torch.long, device=device)
sample_index = model.generate(idx=start, max_length=2000)[0].tolist()
sample = decode(sample_index)
print("Untrained Sample:\n", sample)

Untrained Sample:
 	kb0iX9
YjyI!Ou—r;vxCqÆêX[UeÆ)4Spw233qgVÆ9ê8E&æmdA”Tpà_( zî,Ebo9bâ[&!Zl2!“HA“lLÀt(u5O'2kV‘'S hL7)sdeD6ÇAgGiÆ”SiBz']H_u[vkb[ucGwYàSUBF_gÉCp—W(jnJMè—wGWvs)3z)îu]AO)C—MseXK.tZ9.ÀLi—DU-À['9B(6eêi9O1C04Z!(î!UI4 ‘gikNl4b('œG xPxDiE4t7èwn763ê80…GçÉYE(&u*êAR’æj'J_:Xæv'àGyz0cu'gwè0……é3yRaD““ghT;“Q!6Ç]u—”aêt0æV6[tFMYp—
fUIry1RG‘hsë77bB	boÆÀ)3kFromêXXcoÇæ*A3(R“L” pÇ﻿ZmdDt39HEF’“‘Vq'102mt“wDQyNoFRvèKKè2T8'sz‘Cp!l9X7;vdyuYêçcçë “dYKWl&D6!g!Çç﻿3æk	ÇÇtRkÆfz“[	  çL”âMP﻿pFâà”(rM…0mDâTp﻿xYvO8u	;NnUlîë(SæF”uy4﻿Bv4Hm?f6”j	GRç‘.[sAX!”kœiGÉLk6nœà	ÇÆ'ê”ocI,* 3m”XR3*sÇ,3“r6q‘KgmtY—“‘cvkdH6﻿“næmY—Z1Y-z6ê4(gXgmUBë3QçR﻿3LW-ëiÇ*Sd'P05…k6æoH7VYX.vYgDt;ë,;83è68QœÇ]QëlÇCeÇ'E]6‘…ç?zâBD:-L]R7DY[D3YYhTë*vKRoàD4qÇcpkYg,	-DezcvubAw(œëR]…QSwT;hczC
71;6G‘x'vêRPAZ3D;ç”KOm_ë

Uh&r-Bè]t;tk2ëîPrb8nt…1A;…?luui8otY“t9-œ”ÆqF
rOCêVë3LJr…;!Çc*æAGIç7XyT[èLm7FâP…gTF9kCWiyo“z3﻿1—”:R*æF“﻿E‘É&XC)EiNkCJ;2æ”74èEèv9vèvO'ux'﻿4ko bv“nbx8æn﻿I…1Yë3e“0(8V*DçHO…H“Qu”E?nOæœMg-36McHG5QROv6Om	lè'k7w BD)[ TÆI—R&ykN6uÇ8vèLdBpAt3MRÆ

In [None]:
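# Optional: resume from the checkpoint files written by save_params. Uncomment the
# three lines below (after the model, optimizer and scheduler have been created) to
# restore their saved states before training.
# model.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/params.pt'))
# optimizer.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/optimizer.pt'))
# scheduler.load_state_dict(torch.load(r'/content/drive/MyDrive/ML_project/scheduler.pt'))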

In [None]:
# Training loop. Every `eval_interval` steps, and on the final step, it writes a
# checkpoint, prints the averaged train/val losses and feeds the validation loss to
# the LR scheduler. Each iteration then takes one optimisation step on a fresh
# random training batch.
# NOTE(review): the checkpoint at step 0 is written before any update in this run, so
# it holds whatever weights the model currently has -- confirm that overwriting an
# existing checkpoint at that point is intended.
for i in range(iterations):
    is_last_step = i == iterations - 1
    if is_last_step or i % eval_interval == 0:
        save_params(model, optimizer, scheduler)
        losses = estimate_loss()
        print(f"step {i}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        scheduler.step(losses['val'])

    X_batch, Y_batch = batch('train')
    optimizer.zero_grad(set_to_none=True)  # clear stale gradients before the new backward pass
    logits, loss = model(X_batch, Y_batch)
    loss.backward()
    optimizer.step()

step 0: train loss 4.8239, val loss 4.8369
step 200: train loss 2.4527, val loss 2.4746
step 400: train loss 2.0089, val loss 2.0451
step 600: train loss 1.7834, val loss 1.8455
step 800: train loss 1.6482, val loss 1.7367
step 1000: train loss 1.5537, val loss 1.6691
step 1200: train loss 1.4857, val loss 1.6155
step 1400: train loss 1.4335, val loss 1.5769
step 1600: train loss 1.3972, val loss 1.5509
step 1800: train loss 1.3676, val loss 1.5311
step 2000: train loss 1.3395, val loss 1.5096
step 2200: train loss 1.3145, val loss 1.4924
step 2400: train loss 1.2976, val loss 1.4849
step 2600: train loss 1.2809, val loss 1.4692
step 2800: train loss 1.2632, val loss 1.4622
step 3000: train loss 1.2493, val loss 1.4516
step 3200: train loss 1.2380, val loss 1.4464
step 3400: train loss 1.2298, val loss 1.4301
step 3600: train loss 1.2164, val loss 1.4315
step 3800: train loss 1.2032, val loss 1.4187
step 4000: train loss 1.1921, val loss 1.4173
step 4200: train loss 1.1850, val loss 1.

In [None]:
# Sample 2000 characters from the trained model, prompting with token index 0.
start = torch.zeros(1, 1, dtype=torch.long, device=device)
sample_index = model.generate(idx=start, max_length=2000)[0].tolist()
sample = decode(sample_index)
print("Trained Sample:\n", sample)

Trained Sample:
 	hippoly catch Toby,
And yet that breaks itself I may look up in
The instant intents, and never for him,
Hide, the reservinge lust.

 Enter Pyramus Huge Evos and
Runified, or of Suffolk and with his world,
Mine honest, of the ixpost afford.

PATISTICE.
Would on a while hold well is stamish man?

EDPARY.
What’s he cause we in her open mad?

BEATRICE.
And since what the business balm in our mind,
They meet against me this.

[_Trion._]

EDGAR.
Hold unpluck, voin! O, whom my listed
Dulgetful kiss faces all this. I hold, or let me eat
What head a loose had her you at the traitor;
Whereat Henry is nothing the friend of other.
Guild that his head, when his sonths to felt,
Use thy noble titles; upon him that thou stol’st
APHILIAs work with thy worth, since Justic years,
Sir Joint father, than her eye, if he would have created, thou wilt
Be affected the point world; and since, that he would beyond,
Touch’d to detain him what head
As if her, Buckingham Signior Juliet,,
To durst 

In [None]:
# Draw a second, independent 2000-character sample from the trained model using the
# same single-token prompt (index 0); sampling is random, so the text differs.
start = torch.zeros(1, 1, dtype=torch.long, device=device)
sample_index = model.generate(idx=start, max_length=2000)[0].tolist()
sample = decode(sample_index)
print("Trained Sample:\n", sample)

Trained Sample:
 	POMPEY.
I’ll live you with a day’s nose as truly as thou canst. I stay; you will
follow, shall have counsellot, take her without court, ‘If you had
villainy;’ you will see her ravished.’ This to the Garter,
you may smile to, the Prince renew no cloak without a
crown. Nay, _the fire mantiffs_ due laid, the dust be back again;
during the limit. Dost thou not catch more; and I am no Chrisy home you are
not yet, I cannot be mask without Cleopatra. Go to, I will
shine the gentle armour’s heel; a duke’s estimate antonies, the
noble Duke Humphrey’s, and die to the hearts, and beread not his traitory opinists.

DUKE.
I’ll to yourself.

[_Exeunt._]

SCENE III. Before Rouen.

 Enter Cleopatra above, and meeting to the opposite Kent in their nurture, and French,
 Belone_—

CLEOPATRA.
Now would Pole, Warwick, I omit
PATRICUS.
Yet not to touch ’em to make new a hour.
Poor will touch my face i’ the hour to cave,
To fail my sweet love’s hours, sadly by what
Iver mine.

DOCTOR.
When 