# Name: Arjun Bhan                  UNI: AB5666

In [None]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import math
# --- Training schedule ---
batch_size = 64        # sequences drawn per optimisation step
block_size = 128       # context window length, in characters
max_iters = 5000       # number of optimisation steps
eval_interval = 500    # steps between loss evaluations
eval_iters = 200       # batches averaged for each loss estimate
learning_rate = 3e-4

# --- Model shape ---
d_model = 96
n_layer = 6
# NOTE(review): in the visible code GPT.__init__ passes n_head=8 explicitly,
# so this value does not appear to be what the model is built with.
n_head = 6
dropout = 0.2

# --- Device ---
# On a Mac, use 'mps' instead:
#   device = 'mps' if torch.backends.mps.is_available() else 'cpu'
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Fix the global RNG seed so sampling and initialisation are repeatable.
torch.manual_seed(1337)

<torch._C.Generator at 0x795fee7812d0>

In [None]:
!gdown 'https://drive.google.com/uc?export=download&id=1RlmRmXiWVKpZq98ftdtOIdM2lsA1uw3j'

Downloading...
From: https://drive.google.com/uc?export=download&id=1RlmRmXiWVKpZq98ftdtOIdM2lsA1uw3j
To: /content/hemingway.txt
  0% 0.00/133k [00:00<?, ?B/s]100% 133k/133k [00:00<00:00, 93.3MB/s]


As usual, we read the text file and then build two dictionaries: one mapping each character to its index, and one for the reverse mapping. We will use character-level embeddings here.

In [None]:
# Read the whole corpus into memory as a single string.
with open('hemingway.txt', 'r', encoding='utf-8') as f:
    text = f.read()

# Character-level vocabulary: each distinct character of the corpus, in
# sorted order, gets the integer id equal to its position in `chars`.
chars = sorted(set(text))
vocab_size = len(chars)
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Return the list of integer ids for the characters of string *s*.

    Raises:
        KeyError: if *s* contains a character that is not in `stoi`.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Return the string spelled out by the iterable of integer ids *l*.

    Raises:
        KeyError: if an id is not in `itos`.
    """
    return ''.join(itos[i] for i in l)


# Encode the corpus once, then keep the first 90% for training and the
# remaining tail for validation.
data = torch.tensor(encode(text), dtype=torch.long)
n = int(0.9 * len(data))
train_data = data[:n]
val_data = data[n:]

In [None]:
def get_batch(split):
    """Sample a random batch of (input, target) windows from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        Tuple (xb, yb) of tensors, each batch_size x block_size and moved to
        `device`; yb holds the same windows as xb shifted one position ahead.
    """
    source = train_data if split == 'train' else val_data
    # The exclusive upper bound leaves room for block_size inputs plus the
    # one extra element needed by the shifted target window.
    starts = torch.randint(0, len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])

    xb = torch.stack(inputs).to(device)
    yb = torch.stack(targets).to(device)
    return xb, yb


In [None]:
@torch.no_grad()
def estimate_loss(model):
    """Estimate the mean loss of *model* on the train and validation splits.

    For each split, `eval_iters` random batches from `get_batch` are run
    through the model in eval mode (with gradients disabled) and their losses
    are averaged.

    Args:
        model: callable module returning ``(logits, loss)`` for ``(xb, yb)``.

    Returns:
        dict with keys 'train' and 'val' mapping to the mean loss as a float.
    """
    # Remember the caller's mode so evaluation does not silently flip a model
    # that was already in eval mode back to training.
    was_training = model.training
    model.eval()
    out = {}
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)
            for k in range(eval_iters):
                xb, yb = get_batch(split)
                _, loss = model(xb, yb)
                losses[k] = loss.item()
            out[split] = losses.mean().item()
    finally:
        # Restore the original mode even if a batch raised.
        model.train(was_training)
    return out

In [None]:

class Head(nn.Module):
    """One head of causal (masked) scaled dot-product self-attention.

    Reads the module-level `d_model`, `block_size` and `dropout` settings.

    Args:
        d_head: output width of this head's key/query/value projections.
    """

    def __init__(self, d_head):
        super().__init__()
        self.d_head = d_head
        # Bias-free projections from the model width down to the head width.
        self.W_K = nn.Linear(d_model, d_head, bias=False)
        self.W_Q = nn.Linear(d_model, d_head, bias=False)
        self.W_V = nn.Linear(d_model, d_head, bias=False)
        # Lower-triangular mask; registered as a buffer so it moves with the
        # module across devices without being a trainable parameter.
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Apply masked self-attention.

        Args:
            x: 3-D tensor (B, T, d_model); T must not exceed `block_size`
                because the mask is sliced to T x T.

        Returns:
            Tensor of shape (B, T, d_head).
        """
        _, T, _ = x.shape
        k = self.W_K(x)
        q = self.W_Q(x)
        v = self.W_V(x)
        # Scale by sqrt(d_head) before the softmax.
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.d_head)
        # Positions above the diagonal (the future) get -inf, i.e. zero weight
        # after the softmax.
        scores = scores.masked_fill(self.tril[:T, :T] == 0, float('-inf'))

        a = F.softmax(scores, dim=-1)
        a = self.dropout(a)

        return torch.matmul(a, v)

class MultiHeadAttention(nn.Module):
    """Run several `Head` modules on the same input and mix their outputs.

    Args:
        num_heads: number of `Head` instances to create.
        d_head: output width of each head.
    """

    def __init__(self, num_heads, d_head):
        super().__init__()
        head_modules = [Head(d_head) for _ in range(num_heads)]
        self.heads = nn.ModuleList(head_modules)
        # The concatenated head outputs are projected back to the model width.
        concat_width = num_heads * d_head
        self.W_O = nn.Linear(concat_width, d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Concatenate every head's output on the last axis, project, then apply dropout."""
        per_head = []
        for head in self.heads:
            per_head.append(head(x))
        merged = torch.cat(per_head, dim=-1)
        projected = self.W_O(merged)
        return self.dropout(projected)


In [None]:
class FeedFoward(nn.Module):
    """Two-layer feed-forward network: expand 4x, ReLU, project back, dropout.

    Args:
        d_model: input and output width of the network.
    """

    def __init__(self, d_model):
        super().__init__()
        hidden_width = 4 * d_model
        layers = [
            nn.Linear(d_model, hidden_width),
            nn.ReLU(),
            nn.Linear(hidden_width, d_model),
            nn.Dropout(dropout),
        ]
        self.ff = nn.Sequential(*layers)

    def forward(self, x):
        """Apply the feed-forward stack to *x*."""
        result = self.ff(x)
        return result

In [None]:
class DecoderBlock(nn.Module):
    """Pre-LayerNorm block: self-attention then feed-forward, each with a residual add.

    Args:
        d_model: width of the residual stream.
        n_head: number of attention heads; each gets width d_model // n_head.
    """

    def __init__(self, d_model, n_head):
        super().__init__()

        per_head_width = d_model // n_head
        self.sa = MultiHeadAttention(n_head, per_head_width)
        self.ff = FeedFoward(d_model)
        self.ln1 = nn.LayerNorm(d_model)
        self.ln2 = nn.LayerNorm(d_model)

    def forward(self, x):
        """Return *x* after the attention and feed-forward residual updates."""
        # LayerNorm is applied to the sub-layer input, not to the residual sum.
        attended = self.sa(self.ln1(x))
        x = x + attended
        transformed = self.ff(self.ln2(x))
        x = x + transformed
        return x


In [None]:
class GPT(nn.Module):
    """Decoder-only character-level language model.

    Built from the module-level `vocab_size`, `d_model`, `block_size` and
    `n_layer` settings: token + position embeddings, a stack of
    `DecoderBlock`s, a final LayerNorm and a linear projection to the
    vocabulary.
    """

    def __init__(self):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, d_model)
        self.position_embedding_table = nn.Embedding(block_size, d_model)
        # NOTE(review): the head count is hard-coded to 8 here rather than
        # read from the module-level n_head; left as-is because changing it
        # alters the module layout - confirm which value is intended.
        self.blocks = nn.Sequential(*[DecoderBlock(d_model, n_head=8) for _ in range(n_layer)])
        self.ln = nn.LayerNorm(d_model)
        self.ff = nn.Linear(d_model, vocab_size)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: 2-D integer tensor (B, T) of token ids; T must not exceed
                `block_size` (the size of the position embedding table).
            targets: optional tensor of target ids with B*T elements.

        Returns:
            ``(logits, loss)``. Without targets, logits is (B, T, vocab_size)
            and loss is None. With targets, logits is flattened to
            (B*T, vocab_size) and loss is the cross-entropy against targets.
        """
        _, T = idx.shape

        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=idx.device))
        # pos_emb is (T, d_model) and broadcasts over the batch dimension.
        x = tok_emb + pos_emb

        x = self.blocks(x)
        x = self.ln(x)
        logits = self.ff(x)

        if targets is None:
            loss = None
        else:
            # F.cross_entropy expects (N, C) logits and (N,) targets.
            B, T, V = logits.shape
            logits = logits.view(B * T, V)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Autoregressively sample *max_new_tokens* tokens after *idx*.

        Runs in eval mode with gradients disabled, then restores whichever
        mode the model was in before the call.

        Args:
            idx: 2-D integer tensor (B, T) holding the starting context.
            max_new_tokens: number of tokens to append to each row.

        Returns:
            Tensor (B, T + max_new_tokens) containing the context followed by
            the sampled tokens.
        """
        was_training = self.training
        self.eval()
        try:
            for _ in range(max_new_tokens):
                # The position table only covers block_size positions, so
                # condition on at most the last block_size tokens.
                idx_cond = idx[:, -block_size:]
                logits, _ = self(idx_cond)
                # Only the final position predicts the next token.
                logits = logits[:, -1, :]
                probs = F.softmax(logits, dim=-1)
                idx_next = torch.multinomial(probs, 1)
                idx = torch.cat([idx, idx_next], dim=1)
        finally:
            self.train(was_training)
        return idx




In [None]:
class EarlyStopping:
    """Flag early stopping when validation loss exceeds training loss by too much.

    Each call compares the relative gap
    ``(validation_loss - train_loss) / train_loss`` against `min_delta`.
    Every call whose gap is larger increments `counter`; once `counter`
    reaches `tolerance`, `early_stop` becomes True. The counter is never
    reset, so the calls that exceed the threshold need not be consecutive.

    Args:
        tolerance: number of over-threshold calls needed to trigger the stop.
        min_delta: relative gap that must be exceeded for a call to count.
    """

    def __init__(self, tolerance=5, min_delta=0):
        self.tolerance = tolerance
        self.min_delta = min_delta
        self.counter = 0
        self.early_stop = False

    def __call__(self, train_loss, validation_loss):
        """Record one (train, validation) loss pair and update `early_stop`."""
        gap = validation_loss - train_loss
        if train_loss == 0:
            # The ratio is undefined at zero training loss; use its limiting
            # value instead of raising ZeroDivisionError.
            if gap > 0:
                relative_gap = float('inf')
            elif gap < 0:
                relative_gap = float('-inf')
            else:
                relative_gap = 0.0
        else:
            relative_gap = gap / train_loss

        if relative_gap > self.min_delta:
            self.counter += 1
            if self.counter >= self.tolerance:
                self.early_stop = True

In [None]:
model = GPT().to(device)
# Count only parameters that will actually be optimised.
tot_param = sum(p.numel() for p in model.parameters() if p.requires_grad)
print('The total number of parameters are', tot_param)

optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# step_size=1 with gamma=0.9: every scheduler.step() call multiplies the
# learning rate by 0.9. It is stepped once per evaluation, not per batch.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1, gamma=0.9)
early_stopping = EarlyStopping(tolerance=1, min_delta=0.2)

# `step` rather than `iter`, so the builtin iter() is not shadowed.
for step in range(max_iters):

    # Evaluate every eval_interval steps and on the final step.
    if step % eval_interval == 0 or step == max_iters - 1:
        # Skipped at step 0 so the first interval trains at the initial
        # learning rate (and no optimizer step has happened yet).
        if step:
            scheduler.step()
        losses = estimate_loss(model)
        print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        early_stopping(losses['train'], losses['val'])
        if early_stopping.early_stop:
            print("We stop at epoch {}".format(step))
            break

    xb, yb = get_batch('train')

    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


The total number of parameters are 693758
step 0: train loss 4.2657, val loss 4.2684
step 500: train loss 2.1653, val loss 2.1719
step 1000: train loss 1.8754, val loss 1.8722
step 1500: train loss 1.6747, val loss 1.6835
step 2000: train loss 1.5519, val loss 1.5794
step 2500: train loss 1.4714, val loss 1.5090
step 3000: train loss 1.4070, val loss 1.4636
step 3500: train loss 1.3642, val loss 1.4286
step 4000: train loss 1.3194, val loss 1.4005
step 4500: train loss 1.2955, val loss 1.3848
step 4999: train loss 1.2693, val loss 1.3679


In [None]:
# Start generation from a single token with id 0 (a 1 x 1 context).
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Sample once and reuse the result, so the printed text and the saved file
# contain the same sample instead of two independent ones.
generated_text = decode(model.generate(context, max_new_tokens=10000)[0].tolist())
print(generated_text)
# `with` closes the file deterministically; utf-8 matches the encoding the
# corpus was read with, so every vocabulary character can be written.
with open('fake_hemingway.txt', 'w', encoding='utf-8') as f:
    f.write(generated_text)


Dom man, "I you rember and shouldid the slank of the firmst so they here the sun raw the pion try against with his could up the last was and a let heard and poing and they were beezfully of the hart betteenes oned it hold.

At the shark's line roped at it of the fish crenbe of the bird and was antimeng Sideso fast the gine line acra his hands that he comps and his lines ave strong foing against dep day and sawrected the geat was shan. Martt down in the water gut best spoppoing acrove it sight one ground and the san's clack on of his were he saw in the yart the gaff it the old morthin fish. It I lmust can do not see so ensiddil eat with now a mall jesched a land knowing all sides old mast the line of does and he dold under his aelins, he thought. Perthing and kill his head from chome and the fish head doark that what the bird it eving. But is vell not see lower oper size and he swaw most the fish right the foot the boat and they did feet his bird and the boy and dank, alway.

Whe boy s

10001

In [None]:
# Save the model's state_dict (rather than the module object) to 'gpt.pt'.
torch.save(model.state_dict(), 'gpt.pt')