In [56]:
import torch
import torch.nn as nn
from torch.nn import functional as F

# Choose the compute device once: the GPU when CUDA is usable, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Environment report: library version, CUDA status, and the selected device.
print("PyTorch version:", torch.__version__)
print("CUDA available:", torch.cuda.is_available())
print("CUDA version:", torch.version.cuda)
print("Device name:", torch.cuda.get_device_name(0) if device == 'cuda' else "No GPU found")
print(device)

PyTorch version: 2.6.0+cu126
CUDA available: True
CUDA version: 12.6
Device name: NVIDIA GeForce RTX 3050 Laptop GPU
cuda


In [69]:
# Hyperparameters shared by the cells below.
block_size = 8         # number of tokens in each sequence sampled by get_batch
batch_size = 4         # number of sequences stacked into one batch
max_iters = 1000       # number of optimizer steps taken by the training loop
learning_rate = 3e-4   # learning rate passed to the AdamW optimizer
eval_iters = 250       # batches averaged per split in estimate_loss; the training loop also reports every eval_iters steps
dropout = 0.2          # dropout probability (fraction of activations randomly zeroed during training to reduce overfitting);
                       # not referenced by any cell visible in this notebook

In [58]:
# Read the whole corpus into memory.
# 'utf-8-sig' decodes exactly like 'utf-8' but drops a leading byte-order mark
# (U+FEFF) if the file has one, so the BOM does not become a vocabulary token.
# NOTE(review): the sampled text in this notebook contains U+FEFF, which suggests
# the file starts with a BOM - confirm against the actual file.
with open('wizard_of_oz.txt', 'r', encoding='utf-8-sig') as f:
    text = f.read()

# Character-level vocabulary: every distinct character, in sorted order.
chars = sorted(set(text))
vocab_size = len(chars)
print(chars[:10])

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(']


In [59]:
# Tokenizer: character-level, one integer id per entry of `chars`.
string_to_int = {}
int_to_string = {}
for token_id, ch in enumerate(chars):
    string_to_int[ch] = token_id
    int_to_string[token_id] = ch


def encode(s):
    """Return the list of token ids for the characters of `s`.

    Raises KeyError if `s` contains a character that is not in `string_to_int`.
    """
    return [string_to_int[c] for c in s]


def decode(l):
    """Return the string spelled by the token ids in `l`.

    Raises KeyError if an id is not in `int_to_string`.
    """
    return ''.join(int_to_string[i] for i in l)


# The whole corpus as a tensor of int64 token ids.
data = torch.tensor(encode(text), dtype=torch.long)


In [60]:
# Train/validation split: the first 80% of the token stream is used for
# training, the remaining 20% is held out for validation.
n = int(0.8 * len(data))
train_data, val_data = data[:n], data[n:]


def get_batch(split):
    """Sample a random batch of input/target sequences from one data split.

    Args:
        split: 'train' selects `train_data`; any other value selects `val_data`.

    Returns:
        (x, y): two stacked tensors with `batch_size` rows of `block_size`
        tokens each. Row b of `y` is row b of `x` advanced by one position,
        so y[b, t] is the token that follows x[b, t] in the source data.
    """
    source = val_data if split != 'train' else train_data

    # Random start offsets. The exclusive upper bound keeps room for the one
    # extra token the shifted targets need.
    starts = torch.randint(0, len(source) - block_size, (batch_size,))

    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(source[start:stop])
        targets.append(source[start + 1:stop + 1])  # shifted by 1 for next-token prediction

    return torch.stack(inputs), torch.stack(targets)

# Draw one training batch and display it; each target row is the matching
# input row advanced by one position.
x, y = get_batch(split='train')
print(f"Inputs  : {x}", f"Targets : {y}", sep="\n")

Inputs  : tensor([[71, 64,  1, 63, 66, 75, 76, 77],
        [47, 72,  1, 82, 72, 78,  0, 65],
        [71,  0, 77, 65, 62,  1, 61, 72],
        [62, 75, 62, 14,  0,  0, 48, 65]])
Targets : tensor([[64,  1, 63, 66, 75, 76, 77,  1],
        [72,  1, 82, 72, 78,  0, 65, 58],
        [ 0, 77, 65, 62,  1, 61, 72, 72],
        [75, 62, 14,  0,  0, 48, 65, 62]])


In [61]:
@torch.no_grad()
def estimate_loss():
    """Estimate the average loss of `model` on the train and validation splits.

    For each split, `eval_iters` random batches are drawn with `get_batch`,
    moved to `device`, and run through `model`; the per-batch losses are
    averaged. Gradients are disabled for the whole call.

    The model is put in eval mode for the measurement and is restored to
    whatever mode it was in when the function was called, even if an
    exception is raised part-way through.

    Returns:
        dict with keys 'train' and 'val', each a 0-d tensor holding the mean loss.
    """
    out = {}
    was_training = model.training  # remember the caller's mode so it can be restored
    model.eval()
    try:
        for split in ['train', 'val']:
            losses = torch.zeros(eval_iters)  # one slot per evaluated batch
            for k in range(eval_iters):
                X, Y = get_batch(split)
                X, Y = X.to(device), Y.to(device)
                _, loss = model(X, Y)
                losses[k] = loss.item()
            out[split] = losses.mean()
    finally:
        # Restore the previous mode instead of unconditionally forcing train mode.
        model.train(was_training)
    return out


In [62]:


class BigramLanguageModel(nn.Module):
    """Bigram language model: the next-token logits depend only on the current token.

    The model is a single (vocab_size x vocab_size) embedding table; row i
    holds the logits for the token that follows token i.
    """

    def __init__(self, vocab_size):
        """Create the lookup table.

        Args:
            vocab_size: number of distinct tokens; used as both the number of
                embeddings and the embedding width.
        """
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, index, targets=None):
        """Compute next-token logits and, when targets are given, the loss.

        Args:
            index: integer tensor of token ids, shape (B, T).
            targets: optional integer tensor of token ids, shape (B, T).

        Returns:
            (logits, loss):
              - targets is None: logits has shape (B, T, C) and loss is None.
              - otherwise: logits is flattened to (B*T, C) and loss is the
                scalar cross-entropy between logits and the flattened targets.
        """
        logits = self.token_embedding_table(index)  # (B, T, C)

        if targets is None:
            return logits, None

        # F.cross_entropy expects (N, C) logits and (N,) class indices, so the
        # batch and time dimensions are merged. reshape (rather than view) also
        # accepts non-contiguous tensors such as transposed targets.
        B, T, C = logits.shape
        logits = logits.reshape(B * T, C)
        targets = targets.reshape(B * T)
        loss = F.cross_entropy(logits, targets)

        return logits, loss

    @torch.no_grad()  # sampling never needs gradients
    def generate(self, index, max_new_tokens):
        """Extend `index` by sampling `max_new_tokens` tokens, one at a time.

        Args:
            index: integer tensor of token ids, shape (B, T) - the starting prompt.
            max_new_tokens: number of tokens to append to every row.

        Returns:
            Integer tensor of shape (B, T + max_new_tokens): the prompt
            followed by the sampled tokens.
        """
        for _step in range(max_new_tokens):
            # Call the module (not .forward directly) so registered hooks run.
            logits, _ = self(index)               # (B, T, C)
            logits = logits[:, -1, :]             # keep the last position only -> (B, C)

            probs = F.softmax(logits, dim=-1)     # (B, C)

            # Sample one token id per row from the predicted distribution.
            next_token = torch.multinomial(probs, num_samples=1)  # (B, 1)

            index = torch.cat([index, next_token], dim=1)         # (B, T+1)

        return index
    

In [63]:
# Build the model on the selected device. `m` is a second name for the same
# module object; it is used by the sampling cells.
model = BigramLanguageModel(vocab_size).to(device)
m = model

# Prompt: a single sequence containing only token id 0.
context = torch.zeros((1, 1), dtype=torch.long, device=device)

# Sample 500 new tokens, take the one sequence in the batch as a Python list
# of ids, and turn it back into text.
generated_tokens = m.generate(context, max_new_tokens=500)[0].tolist()
generated_chars = decode(generated_tokens)
print(generated_chars)



X,3WB6)-$X’b%XFg—fOdHCESsI8!Y"OlT’(jDyGn& —f4j[]&IU—DyrH204r•‘'F-F”.#psOg/9h"Q9Jkw2ObmT4mRw”jSUnH —dnMTtb-—%-“A8!Od/)-Luv™00-nwH1 (]%w!eVqJ';ZsOV]%b6o;&#B.j_eg[S[HzWB"Q&1$HfOW,5pnxaNv4?fd1_N•EKXt]1_IEawg-O.RDn_4w5PGQUzyUW“W﻿”g2eZyl.#i$v[ErkP,lG﻿Jk—(•Va_jU! VXQ?C1?rLo&6﻿/r“G™dovUg2O;!?(b2p_BCvEBQ,T.)Y﻿ITxSPtrv4kaXU.#k9xd
“4:W[%z(v4#)
2Tw,!UZ%/19!:Sz7KXc]:4uv)hwxq
pz$SUy$HU63e/“7KL]1d﻿:;p“F'Q&&a)'-m™mS6BQX D
﻿:—lI-y,]]alDaKX,h
yM’(K8klO•™#s,Ja ,XD—-$H$7?P?v™55'd&u#wb
X eP#y—6”2u‘?BSxp'5wpL0•.d;,&:


In [73]:
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# The loop variable is named `step` (not `iter`) so the builtin iter() is not
# shadowed for the rest of the notebook.
for step in range(max_iters):

    # Every `eval_iters` steps, report the loss averaged over `eval_iters`
    # batches of each split.
    if step % eval_iters == 0:
        losses = estimate_loss()
        print(f" step : {step} , train loss : {losses['train']:.3f} , val loss :{losses['val']:.3f}")

    # Sample a batch of data and move it to the model's device.
    xb, yb = get_batch('train')
    xb, yb = xb.to(device), yb.to(device)

    # Forward pass through the module call (so hooks run), then one optimizer step.
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

# Loss of the final training batch (a single-batch value, not an average).
print(loss.item())

 step : 0 , train loss : 4.545 , val loss :4.591
 step : 250 , train loss : 4.487 , val loss :4.524
 step : 500 , train loss : 4.438 , val loss :4.434
 step : 750 , train loss : 4.392 , val loss :4.412
4.2831926345825195


In [65]:
# Sample 500 new tokens from the model, again starting from a single
# sequence that contains only token id 0, and print the decoded text.
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_chars = decode(
    m.generate(context, max_new_tokens=500)[0].tolist()
)
print(generated_chars)


Vn& VaU',IJP00-3[s—7™sOOTq3[﻿l!b".BHr(v™rkcI%﻿/#g'Bd9ELdMXxD™p_hli604z•Es_]eQnssk_oBH-m:4Y,dMsWQ•Hgeoeoa nvKN:—T 0“a&LXLI% U?v™cN﻿4h3vv)T!DmSkBO3H/gy;ZQ zZu8[cbGQ&Kt$24DeGwlthQ?4?﻿*(,I’V%8!OJ3n94#n”&:X”ZT;ZqzNXfvZARx8i0z'v™_j—R”f“W*“rzZG
]BC”0)N'Fh"N[Ss?
ihaxaV/
eo•sZsT)[67r“j ]ZmS,”'K8o•﻿[?v™1"wIJIn'/
r;Z;jr“tO;,‘KYYTSJg.,HEBL]2 '5MKtXfztr5w-vIBU(u1Geg2NT'd,/)];,9•Hiwd!X]l.g2’?r•s7d[KX—[orc0j$y*—(JbpUn:a-3H:’5UKN1.PZY’dmA7s—Dm]ZTD'vJgJ2uzZ#srgy$OpY1fRma;Zoo['/s%*Nc,elc2.)”HmS—(“
X3P—IJa2kGH9!—r


In [66]:
# Rebind `device` as a torch.device object: the first GPU when CUDA is
# available, otherwise the CPU.
device = torch.device('cuda:0') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

# Memory usage check: bytes reported by the CUDA allocator, converted to MB.
print(
    f"Memory Allocated: {torch.cuda.memory_allocated(device) / 1024 ** 2} MB",
    f"Memory Reserved: {torch.cuda.memory_reserved(device) / 1024 ** 2} MB",
    sep="\n",
)


Using device: cuda:0
Memory Allocated: 0.27685546875 MB
Memory Reserved: 2.0 MB


In [67]:
# Dummy tensor to test memory usage.
# A 10000 x 10000 float32 tensor is allocated on the GPU and the allocator
# statistics are printed. The whole test is skipped when CUDA is unavailable,
# so the notebook still runs top to bottom on a CPU-only machine.
dummy_tensor = None
if torch.cuda.is_available():
    dummy_tensor = torch.randn(10000, 10000, device='cuda')
    print(f"Memory Allocated (After Dummy Tensor): {torch.cuda.memory_allocated() / 1024 ** 2} MB")
    print(f"Memory Reserved (After Dummy Tensor): {torch.cuda.memory_reserved() / 1024 ** 2} MB")
    dummy_tensor = None  # Drop the reference so the memory can be reclaimed
    torch.cuda.empty_cache()  # Return cached, unused blocks to the driver
else:
    print("CUDA not available; skipping GPU memory test")


Memory Allocated (After Dummy Tensor): 382.27685546875 MB
Memory Reserved (After Dummy Tensor): 384.0 MB


In [None]:
# What does ReLU do?
# It is the rectified linear unit activation function:
#     y = 0      if y_in <= 0
#     y = y_in   if y_in >  0
# (equivalently, y = max(0, y_in))