In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.optim.lr_scheduler import CosineAnnealingLR
from tqdm import tqdm

In [None]:
# Read the entire training word list into memory as one string
# (the words are separated by whitespace and split in a later cell).
with open('words_250000_train.txt') as word_file:
    words = word_file.read()

In [None]:
# Build the character vocabulary: every character that occurs in the corpus
# plus the two special symbols '_' (masked letter) and '*' (padding).
# The specials are added to a temporary string rather than prepended to
# `words` itself: mutating `words` glued '_*' onto the first word of the
# corpus and made this cell non-idempotent (each re-run prepended again).
chars = sorted(set('_*' + words))
vocab_size = len(chars)
print(''.join(chars))


*_abcdefghijklmnopqrstuvwxyz


In [None]:
# One entry per word. str.split() with no separator already ignores leading
# and trailing whitespace, so no explicit strip() is needed.
w_list = words.split()

In [None]:
# Index <-> character lookup tables built from the sorted vocabulary.
itos = dict(enumerate(chars))
stoi = {ch: i for i, ch in itos.items()}


def encode(s):
    """Map each character of `s` to its integer id; returns a list of ints."""
    return [stoi[c] for c in s]


def decode(l):
    """Map an iterable of integer ids back to the string they spell."""
    return ''.join(itos[i] for i in l)

In [None]:
import random
random.seed(42)

targ = []
inp = []

# For every word, emit len(word) training pairs. Each step masks one more
# randomly chosen position (a position may be picked twice, in which case the
# sample repeats the previous one), so the masked input gets progressively
# harder. Inputs and targets are both padded to 29 characters with '*'.
for word in w_list:
    n_letters = len(word)
    masked = list(word.ljust(29, '*'))
    for step in range(n_letters):
        masked[random.randint(0, n_letters - 1)] = '_'
        inp.append(encode(masked))
        targ.append(encode(word.ljust(29, '*')))

In [None]:
l = len(inp)
print(l)

# Disjoint 80/20 split. Previously the training set was the full data and the
# validation set was the last 80% of that same data, so validation loss was
# measured on examples the model had been trained on.
# Samples of one word are stored contiguously, so a contiguous cut keeps
# (almost) every word entirely on one side of the split.
# NOTE(review): if the word list is sorted, the validation words all come from
# the tail of that ordering - consider shuffling words before building samples.
split = int(l * 0.8)
x_train, y_train = inp[:split], targ[:split]
x_val, y_val = inp[split:], targ[split:]

2124748


In [None]:
from torch.utils.data import DataLoader, Dataset

class HM_Dataset(Dataset):
    """Map-style dataset over paired (masked input, target) id sequences.

    Both sequences are converted to int64 tensors once, at construction time,
    so indexing only slices pre-built tensors.
    """

    def __init__(self, x, y):
        self.x = torch.tensor(x, dtype=torch.long)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        return len(self.x)

    def __getitem__(self, i):
        sample, label = self.x[i], self.y[i]
        return sample, label


In [None]:
# Wrap the python lists in tensor-backed datasets.
train_ds = HM_Dataset(x_train, y_train)
val_ds = HM_Dataset(x_val, y_val)

In [None]:
# Only the training loader is shuffled; validation order does not matter.
train_dl = DataLoader(train_ds, batch_size=64, shuffle=True)
val_dl = DataLoader(val_ds, batch_size=64)

In [None]:
# --- Training / evaluation settings ---
batch_size = 64        # sequences processed in parallel
block_size = 29        # maximum sequence (context) length
max_iters = 1000
eval_interval = 100
learning_rate = 1e-4
eval_iters = 20

# --- Model size ---
n_emb = 384            # embedding width
n_head = 6             # attention heads per block
n_layer = 6            # number of transformer blocks
dropout = 0.2

# Prefer the GPU when one is visible to PyTorch.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f'{device} is being used to train')

cuda is being used to train


In [None]:
class Head(nn.Module):
    """One head of masked self-attention.

    Projects the input to keys, queries and values of width `head_size`,
    and mixes the values with softmax-normalised attention weights.

    NOTE(review): the lower-triangular buffer makes the attention causal -
    position t only sees positions <= t. For a fill-in-the-blank task that may
    not be intended; it is kept because existing checkpoints store this buffer.
    NOTE(review): scores are scaled by the input width C (n_emb), not by
    head_size; kept as-is so trained weights behave the same - confirm intent.
    """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_emb, head_size, bias=False)
        self.query = nn.Linear(n_emb, head_size, bias=False)
        self.value = nn.Linear(n_emb, head_size, bias=False)
        lower_triangle = torch.tril(torch.ones(block_size, block_size))
        self.register_buffer('tril', lower_triangle)

    def forward(self, x):
        _, T, C = x.shape
        keys = self.key(x)
        queries = self.query(x)
        values = self.value(x)

        # (B, T, T) affinities between every query and every key.
        scores = queries @ keys.transpose(-2, -1) * C**-0.5
        # Positions above the diagonal are excluded before the softmax.
        hidden = self.tril[:T, :T] == 0
        weights = F.softmax(scores.masked_fill(hidden, float('-inf')), dim=-1)

        return weights @ values

class MultiHeadAttention(nn.Module):
    """Several attention heads run side by side, then linearly re-mixed."""

    def __init__(self, num_heads, head_size):
        super().__init__()
        heads = [Head(head_size) for _ in range(num_heads)]
        self.heads = nn.ModuleList(heads)
        self.proj = nn.Linear(n_emb, n_emb)

    def forward(self, x):
        # Concatenate the per-head outputs along the feature axis.
        per_head = [head(x) for head in self.heads]
        return self.proj(torch.cat(per_head, dim=-1))

class FeedFoward(nn.Module):
    """Position-wise MLP: expand to 4x the width, ReLU, project back."""

    def __init__(self, n_emb):
        super().__init__()
        hidden = 4 * n_emb
        layers = [
            nn.Linear(n_emb, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_emb),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """Pre-norm transformer block: attention, then feed-forward, each residual."""

    def __init__(self, n_emb, n_head):
        # n_emb: embedding width; n_head: number of attention heads.
        super().__init__()
        self.sa = MultiHeadAttention(n_head, n_emb // n_head)
        self.ffwd = FeedFoward(n_emb)
        self.ln1 = nn.LayerNorm(n_emb)
        self.ln2 = nn.LayerNorm(n_emb)

    def forward(self, x):
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ffwd(self.ln2(x))
        return x + transformed



class GPTLanguageModel(nn.Module):
    """Transformer that predicts the hidden letters of a partially masked word.

    Input ids are token + learned position embeddings, passed through
    `n_layer` transformer blocks, a final LayerNorm and a linear head that
    yields per-position logits over the vocabulary.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_emb)
        self.position_embedding_table = nn.Embedding(block_size, n_emb)
        self.blocks = nn.Sequential(*[Block(n_emb, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_emb)
        self.lm_head = nn.Linear(n_emb, vocab_size)

        self.apply(self._init_weights)

    def _init_weights(self, module):
        """Normal(0, 0.02) init for Linear/Embedding weights, zero biases."""
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, idx, targets=None):
        """Compute logits and, when `targets` is given, the masked-position loss.

        Args:
            idx: (B, T) integer token ids; '_' ids mark the positions to predict.
            targets: optional (B, T) integer ids of the fully revealed word.

        Returns:
            (logits, loss): logits is (B, T, vocab_size); loss is the mean
            cross-entropy over positions where `idx` is '_', or None when
            `targets` is None.
        """
        _, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        # Build positions on the input's own device so the model does not
        # depend on the module-level `device` matching where it actually lives.
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)  # (B, T, vocab_size)

        if targets is None:
            loss = None
        else:
            mask = (idx == stoi['_']).float()  # 1s at masked positions (B, T)
            per_token = F.cross_entropy(
                logits.reshape(-1, logits.size(-1)),  # (B*T, vocab_size)
                targets.reshape(-1),                  # (B*T)
                reduction='none'
            )
            # Average over masked positions only. The clamp keeps the loss
            # finite (0) instead of NaN when a batch contains no '_' at all.
            loss = (per_token * mask.view(-1)).sum() / mask.sum().clamp(min=1.0)

        return logits, loss

    def guess_letters(self, idx, max_guesses=10):
        """Fill every '_' position in `idx` with a sampled letter.

        Args:
            idx: (B, T) integer token ids containing '_' at unknown positions.
            max_guesses: upper bound on the number of sampling rounds.

        Returns:
            (B, T) tensor equal to `idx` except that masked positions hold
            sampled letter ids. Non-masked positions are never modified.
        """
        blank = stoi['_']
        # Non-letter symbols (mask, padding, newline, ...) are never valid
        # guesses; previously they could be sampled into a masked cell.
        banned = [i for ch, i in stoi.items() if not ch.isalpha()]
        with torch.no_grad():
            mask = idx == blank
            for _ in range(max_guesses):
                if not mask.any():
                    break
                logits, _ = self(idx)
                logits[..., banned] = float('-inf')
                probs = F.softmax(logits, dim=-1)

                # Sample one token per position; only masked ones are kept.
                guesses = torch.multinomial(probs.view(-1, probs.size(-1)), 1)
                guesses = guesses.view(idx.shape)

                idx = torch.where(mask, guesses, idx)
                mask = idx == blank  # positions still unresolved
        return idx

# Build the network and move its parameters and buffers to the chosen device.
model = GPTLanguageModel().to(device)

In [None]:
# Resume from a previously saved checkpoint. map_location remaps the stored
# tensors onto the current device, so a checkpoint written on a GPU runtime
# still loads when only a CPU is available.
model.load_state_dict(torch.load('best_model(1).pt', map_location=device))

<All keys matched successfully>

In [None]:
# Mount Google Drive at /content/drive (Colab only).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir drive/MyDrive/model_epoch

In [None]:
def evaluate(model, val_loader, device):
    """Return the average validation loss of `model` over `val_loader`.

    The model is switched to eval mode (and left there). Each batch's loss,
    taken from the second element of `model(xb, yb)`, is weighted by the
    number of sequences in that batch before averaging.
    """
    model.eval()
    weighted_sum = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            batch_loss = model(inputs, labels)[1]
            n_batch = inputs.size(0)
            weighted_sum += batch_loss.item() * n_batch
            n_seen += n_batch
    return weighted_sum / n_seen

from google.colab import files
!mkdir model_epoch

def save_model_weights(model, epoch, filepath='drive/MyDrive/model_epoch'):
    """Save the model's state_dict to `<filepath>/<epoch>.pt`.

    Args:
        model: module whose `state_dict()` is written.
        epoch: value used as the checkpoint's file name.
        filepath: directory that receives the checkpoint; it is created
            (including parents) when missing, so a long epoch is not lost to
            a missing folder at save time.
    """
    import os

    os.makedirs(filepath, exist_ok=True)
    filepath = f'{filepath}/{epoch}.pt'
    torch.save(model.state_dict(), filepath)




# NOTE(review): the optimizer uses lr=3e-4 while the config cell defines
# learning_rate = 1e-4, which is never used - confirm which value is intended.
optimizer = AdamW(model.parameters(), lr=3e-4, weight_decay=0.001)
# T_max is expressed in epochs, so the scheduler is stepped exactly once per
# epoch below. Stepping it per batch made the learning rate sweep through many
# full cosine cycles inside a single epoch.
scheduler = CosineAnnealingLR(optimizer, T_max=max_iters, eta_min=1e-6)

best_val_loss = float('inf')

for epoch in range(max_iters):
    model.train()
    train_loss = 0
    for xb, yb in tqdm(train_dl):
        xb, yb = xb.long().to(device), yb.to(device)
        logits, loss = model(xb, yb)

        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to keep updates bounded.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        train_loss += loss.item()

    # Advance the cosine schedule by one epoch.
    scheduler.step()

    # Validation
    val_loss = evaluate(model, val_dl, device)
    print(f"Epoch {epoch+1} | Train Loss: {train_loss/len(train_dl):.4f} | Val Loss: {val_loss:.4f}")
    save_model_weights(model, epoch + 1)
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), 'best_model.pt')
    # The former `scheduler.step(val_loss)` branch was removed:
    # CosineAnnealingLR.step() takes an optional *epoch* index, not a metric,
    # so passing the loss reset the schedule position to ~1.6 instead of
    # reducing the learning rate on a plateau.


100%|██████████| 33200/33200 [29:58<00:00, 18.46it/s]


Epoch 1 | Train Loss: 1.6086 | Val Loss: 1.6309


100%|██████████| 33200/33200 [30:00<00:00, 18.44it/s]


Epoch 2 | Train Loss: 1.6055 | Val Loss: 1.6403


100%|██████████| 33200/33200 [30:00<00:00, 18.44it/s]


Epoch 3 | Train Loss: 1.6026 | Val Loss: 1.6239


100%|██████████| 33200/33200 [30:00<00:00, 18.44it/s]


Epoch 4 | Train Loss: 1.6000 | Val Loss: 1.6339


100%|██████████| 33200/33200 [29:55<00:00, 18.49it/s]


Epoch 5 | Train Loss: 1.5975 | Val Loss: 1.6193


100%|██████████| 33200/33200 [30:03<00:00, 18.41it/s]


Epoch 6 | Train Loss: 1.5953 | Val Loss: 1.6292


100%|██████████| 33200/33200 [30:28<00:00, 18.16it/s]


Epoch 7 | Train Loss: 1.5932 | Val Loss: 1.6153


100%|██████████| 33200/33200 [30:24<00:00, 18.20it/s]


Epoch 8 | Train Loss: 1.5915 | Val Loss: 1.6260


 10%|▉         | 3269/33200 [02:58<27:18, 18.27it/s]


KeyboardInterrupt: 