In [33]:
import torch
import torch.nn as nn
from torch.nn import functional as F
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)
block_size = 32
batch_size = 64
max_iters = 2000
learning_rate = 3e-4
eval_iters = 50
dropout = 0.2

learning_rate = 3e-4
n_embd = 384
n_head = 8
n_layer = 8

cuda


In [34]:
with open ('wizard_of_oz.txt','r',encoding = 'utf-8') as f:
    text = f.read()
chars = sorted(set(text))
print(chars)
vocab_size = len(chars)
print(vocab_size)


['\n', ' ', '!', '"', '#', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '\ufeff']
83


In [35]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

data = torch.tensor(encode(text),dtype = torch.long)
print(data.shape)

torch.Size([233430])


In [36]:
n = int(0.8*len(data))
train_data = data[:n]
val_data =  data[n:]

print(len(train_data))
print(len(val_data))

def get_batch(split):
    data = train_data if split == 'train' else val_data
    #print(len(data)-block_size)
    ix = torch.randint(len(data)-block_size,(batch_size,))
    #print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y


x, y = get_batch('train')
print('inputs:')
print(x)
print('targets:')
print(y)

186744
46686
inputs:
tensor([[60, 73, 58,  ..., 10,  1, 64],
        [ 1, 75, 63,  ...,  1, 60, 56],
        [80, 60, 59,  ..., 63, 56, 69],
        ...,
        [68,  1, 64,  ...,  6, 74,  1],
        [ 1, 61, 73,  ..., 63, 60,  1],
        [ 1, 74, 71,  ...,  1, 69, 60]], device='cuda:0')
targets:
tensor([[73, 58, 80,  ...,  1, 64, 69],
        [75, 63, 60,  ..., 60, 56, 73],
        [60, 59,  1,  ..., 56, 69,  1],
        ...,
        [ 1, 64, 69,  ..., 74,  1, 57],
        [61, 73, 76,  ..., 60,  1, 57],
        [74, 71, 67,  ..., 69, 60, 64]], device='cuda:0')


In [37]:

x = train_data[:block_size]
y = train_data[1:block_size+1]

for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print("when input is",context,"target is",target)

when input is tensor([82]) target is tensor(46)
when input is tensor([82, 46]) target is tensor(63)
when input is tensor([82, 46, 63]) target is tensor(60)
when input is tensor([82, 46, 63, 60]) target is tensor(1)
when input is tensor([82, 46, 63, 60,  1]) target is tensor(42)
when input is tensor([82, 46, 63, 60,  1, 42]) target is tensor(73)
when input is tensor([82, 46, 63, 60,  1, 42, 73]) target is tensor(70)
when input is tensor([82, 46, 63, 60,  1, 42, 73, 70]) target is tensor(65)
when input is tensor([82, 46, 63, 60,  1, 42, 73, 70, 65]) target is tensor(60)
when input is tensor([82, 46, 63, 60,  1, 42, 73, 70, 65, 60]) target is tensor(58)
when input is tensor([82, 46, 63, 60,  1, 42, 73, 70, 65, 60, 58]) target is tensor(75)
when input is tensor([82, 46, 63, 60,  1, 42, 73, 70, 65, 60, 58, 75]) target is tensor(1)
when input is tensor([82, 46, 63, 60,  1, 42, 73, 70, 65, 60, 58, 75,  1]) target is tensor(33)
when input is tensor([82, 46, 63, 60,  1, 42, 73, 70, 65, 60, 58, 

In [38]:
class Head(nn.Module):
    

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
       
        B,T,C = x.shape
        k = self.key(x)  
        q = self.query(x) 
      
        wei = q @ k.transpose(-2,-1) * k.shape[-1]**-0.5 
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) 
        wei = F.softmax(wei, dim=-1) 
        wei = self.dropout(wei)
      
        v = self.value(x) 
        out = wei @ v 
        return out

class MultiHeadAttention(nn.Module):
  

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(head_size * num_heads, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out
    

class FeedFoward(nn.Module):
    

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)
    
class Block(nn.Module):
   

    def __init__(self, n_embd, n_head):
       
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedFoward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        y = self.sa(x)
        x = self.ln1(x + y)
        y = self.ffwd(x)
        x = self.ln2(x + y)
        return x
    
class GPTLanguageModel(nn.Module):
    def __init__(self, vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)
        
        
        self.apply(self._init_weights)

    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)
        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean=0.0, std=0.02)

    def forward(self, index, targets=None):
        B, T = index.shape
        
        
     
        tok_emb = self.token_embedding_table(index) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device)) # (T,C)
        x = tok_emb + pos_emb 
        x = self.blocks(x) 
        x = self.ln_f(x)
        logits = self.lm_head(x)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        
        return logits, loss
    
    def generate(self, index, max_new_tokens):
        
        for _ in range(max_new_tokens):
            
            index_cond = index[:, -block_size:]
            
            logits, loss = self.forward(index_cond)
           
            logits = logits[:, -1, :] 
           
            probs = F.softmax(logits, dim=-1) 
            
            index_next = torch.multinomial(probs, num_samples=1) 
           
            index = torch.cat((index, index_next), dim=1)
        return index

model = GPTLanguageModel(vocab_size)
"""print("loading model parameters...")
with open('model_01.pkl','rb') as f:
    model = pickle.load(f)
print("Loaded Successfully")"""
m = model.to(device)




In [22]:
class BigramLanguageModel(nn.Module):
    def __init__(self,vocab_size):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size,vocab_size)

    def forward(self,index,targets = None):
        logits = self.token_embedding_table(index)
        
       

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits,targets)
            
            

        return logits, loss 

    def generate(self,index,max_new_tokens):
        for i in range(max_new_tokens):
            logits , loss = self.forward(index)

            logits = logits[:,-1,:]

            probs = F.softmax(logits,dim=-1)

            index_next = torch.multinomial(probs,num_samples = 1)

            index = torch.cat((index,index_next),dim=1)
        return index


model = BigramLanguageModel(vocab_size)
m = model.to(device)

context = torch.zeros((1,1) , dtype= torch.long,device = device)
generated_chars = decode(m.generate(context,max_new_tokens=500)[0].tolist())
print(generated_chars)




)h5s*07ddun﻿Y*R5V_iV9
IdD0h,r8yAU1FI28;3y3kvh*Upit*lnV/V'SsrOKpiKBV:pN5gKt4Y(/(Lywwdd#o
2!Yw:c7kV
9[XLWo
3CsSikaQ/QP:s/7bxP
2;
oDxTEFwduc/"rgE*Q*9]2ub50r EmNKsWsU1Z3a6L"MI69.ST]vIu::cSiQ#h"lXFX9[7XG&Tg n3HkAXhYO6XT3E9)Y-g9Qs#cBT#Bmo)Ix1rV
efDO(K﻿KA,q1zQ'"lxq0&LY(b/"- _AW*t U(-C/'CP?"fhe8G8/vdDOka:G(- ud?W'3rNA)lCjRX0_3X9-a,D)luccC,p[/Qg#wUbFgjOPmrV'jcTv5b/RN&EFtJduB'""8IRU3jaQOa-Q﻿Mk:mIam?DxNW7GOam;e,l(;qrV3
SHf9Nnk&sFOX8YG(yR-_o1]2 rV_﻿XTxBAMF[i9LbTb_G(IsFueTM.(kWj;rKchk1r&zbqYtbyc?5VAU&dd3m:p


In [39]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train','val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X,Y = get_batch(split)
            logits,loss = model (X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [40]:
optimizer = torch.optim.AdamW(model.parameters(),lr = learning_rate)

for iter in range(max_iters):
    print(iter)
   
    if iter % eval_iters == 0:
        losses = estimate_loss()
        print(f"step: {iter}, train_loss: {losses['train']:.3f} , val_loss: {losses['val']:.3f}")

    xb,yb = get_batch('train')

    logits,loss = model.forward(xb,yb)
    optimizer.zero_grad(set_to_none = True)
    loss.backward()
    optimizer.step()

    



print(loss.item())


step: 0, train_loss: 4.470 , val_loss: 4.468
step: 50, train_loss: 2.463 , val_loss: 2.513
step: 100, train_loss: 2.281 , val_loss: 2.340
step: 150, train_loss: 2.099 , val_loss: 2.188
step: 200, train_loss: 1.943 , val_loss: 2.049
step: 250, train_loss: 1.824 , val_loss: 1.955
step: 300, train_loss: 1.767 , val_loss: 1.886
step: 350, train_loss: 1.705 , val_loss: 1.836
step: 400, train_loss: 1.654 , val_loss: 1.798
step: 450, train_loss: 1.615 , val_loss: 1.753
step: 500, train_loss: 1.577 , val_loss: 1.741
step: 550, train_loss: 1.555 , val_loss: 1.718
step: 600, train_loss: 1.515 , val_loss: 1.690
step: 650, train_loss: 1.489 , val_loss: 1.673
step: 700, train_loss: 1.460 , val_loss: 1.659
step: 750, train_loss: 1.439 , val_loss: 1.652
step: 800, train_loss: 1.424 , val_loss: 1.648
step: 850, train_loss: 1.405 , val_loss: 1.623
step: 900, train_loss: 1.395 , val_loss: 1.626
step: 950, train_loss: 1.380 , val_loss: 1.598
step: 1000, train_loss: 1.355 , val_loss: 1.615
step: 1050, tra

In [43]:
context = torch.zeros((1,1) , dtype= torch.long,device = device)
generated_chars = decode(m.generate(context,max_new_tokens=500)[0].tolist())
print(generated_chars)


which to be a mountern to the long Very, and khen diffivied at the perface best thing the Prince joined Dorothy
and the cannot brought that your kicken; better usee them."

"But the gat wasbers of our doompway, who the horse told. Just
was ripers cuase of the Mangagaabooo
Chad CFlight us, anxiously, "because the readful regrospleen lyings glass
delyhbod upon the breath. They were
too ruished wooden; but it was sdmy.

Then it mure be s
near while many be accross that he heart upon.

"No one of th
