In [13]:
import torch
import torch.nn as nn
from torch.nn import functional as F
from tqdm import tqdm
device = 'mps' if torch.mps.is_available() else 'cpu'
print(device)
block_size=64
batch_size=32
max_iters = 100000
eval_iters = 250
learning_rate = 1e-3
dropout_prob = 0.1
embedding_dim = 256

mps


In [14]:
with open('wizard_of_oz.txt', 'r', encoding='utf-8') as f, open('frankenstein.txt', 'r',  encoding='utf-8') as g:
    text=f.read() + g.read()
chars=sorted(set(text))
print(chars)
vocab_size = len(chars)

['\n', ' ', '!', '"', '#', '$', '%', '&', "'", '(', ')', '*', ',', '-', '.', '/', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', '[', ']', '_', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', 'æ', 'è', 'é', 'ê', 'ô', '—', '‘', '’', '“', '”', '•', '™', '\ufeff']


In [15]:
string_to_int = {ch:i for i,ch in enumerate(chars)}
int_to_string = {i:ch for i,ch in enumerate(chars)}

encode = lambda s: [string_to_int[c] for c in s]
decode = lambda l: ''.join([int_to_string[i] for i in l])

In [16]:
encoded = encode("abc")
print(encoded)
decoded = decode(encoded)
print(decoded)

[58, 59, 60]
abc


In [17]:
data = torch.tensor(encode(text), dtype=torch.long)
print(data[:200])

tensor([ 1,  1, 32, 43, 46, 43, 48, 36, 53,  1, 29, 42, 32,  1, 48, 36, 33,  1,
        51, 37, 54, 29, 46, 32,  1, 37, 42,  1, 43, 54,  0,  0,  1,  1, 30, 53,
         0,  0,  1,  1, 40, 14,  1, 34, 46, 29, 42, 39,  1, 30, 29, 49, 41,  0,
         0,  1,  1, 29, 49, 48, 36, 43, 46,  1, 43, 34,  1, 48, 36, 33,  1, 51,
        37, 54, 29, 46, 32,  1, 43, 34,  1, 43, 54, 12,  1, 48, 36, 33,  1, 40,
        29, 42, 32,  1, 43, 34,  1, 43, 54, 12,  1, 43, 54, 41, 29,  1, 43, 34,
         1, 43, 54, 12,  1, 33, 48, 31, 14,  0,  0,  1,  1, 37, 40, 40, 49, 47,
        48, 46, 29, 48, 33, 32,  1, 30, 53,  1, 38, 43, 36, 42,  1, 46, 14,  1,
        42, 33, 37, 40, 40,  0,  0,  1,  1, 30, 43, 43, 39, 47,  1, 43, 34,  1,
        51, 43, 42, 32, 33, 46,  1, 51, 37, 40, 40, 37, 29, 41,  1, 41, 43, 46,
        46, 43, 51,  1,  7,  1, 31, 43, 14, 12,  1, 37, 42, 31, 14,  1, 42, 33,
        51,  1])


In [18]:
n=int(0.8*len(data))
print(n)
train_data=data[:n]
print(len(train_data))
val_data=data[n:]

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    # print(ix)
    x = torch.stack([data[i:i+block_size] for i in ix])
    # print(x)
    # print("X Shape:",x.shape)
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    # print("Y Shape: ",y.shape)
    # print(y)
    x,y = x.to(device), y.to(device)
    return x,y

x,y = get_batch('train')



536892
536892


In [19]:
# block_size=8

# x=train_data[:block_size]
# y=train_data[1:block_size+1]

# for t in range(block_size):
#     context=x[:t+1]
#     target=y[t]
#     print(f"when target is {target} context is {context}") 

In [20]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X,Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
        
    model.train()
    return out
            

In [21]:
class BigramLanguageModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim=128, dropout_prob=0.2):
        super().__init__()
        self.token_embedding_table = nn.Embedding(vocab_size, embedding_dim)  
        self.layer_norm = nn.LayerNorm(embedding_dim)  # Helps stabilize training
        self.dropout = nn.Dropout(dropout_prob)  # Helps prevent overfitting
        self.linear = nn.Linear(embedding_dim, vocab_size)  # Projection layer

        self._init_weights()

    def _init_weights(self):
        """Better weight initialization"""
        nn.init.xavier_uniform_(self.token_embedding_table.weight)  
        nn.init.xavier_uniform_(self.linear.weight)
        nn.init.zeros_(self.linear.bias)  

    def forward(self, index, targets=None):
        # Token embeddings
        embeddings = self.token_embedding_table(index)
        
        # Apply LayerNorm and Dropout
        embeddings = self.layer_norm(embeddings)
        embeddings = self.dropout(embeddings)

        # Project to vocabulary size
        logits = self.linear(embeddings)

        loss = None
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B * T, C)
            targets = targets.view(B * T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, index, max_new_tokens):
        for _ in range(max_new_tokens):
            logits, _ = self.forward(index)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            index_next = torch.multinomial(probs, num_samples=1)
            index = torch.cat((index, index_next), dim=1)
        return index
            

            
model = BigramLanguageModel(vocab_size, embedding_dim, dropout_prob)
m = model.to(device)

context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


Nc$rYGir™J—!'")VLèR•J7b;$gIpla“Cs9é/56*W*GMsj%YU?—7iBS™—u,'8gF1,b$U”$P"E‘Dj!l-aGip9Qgbd‘%-RcTT”865WMj%h/és﻿90éèQo,*d*k™y.McW#;’﻿æDXOôD#qlRQèD—#rO_H-B0TwN]jO$?7bQè2APæTkYIgg:èdEc-4(zw%B-lRæ3VwXz™8Pd9é._n::ay#RRs—V“$c’YPEi!$•J&C/™E“$E•-,BFY: CP)cTu:y$vtPdplè”08C0T/Ihw™9tæ1CG?"9KwRai[e5DBHy;$r)xL%ô)KjCiH#
Gx2CR—ik%OqW*n™-nD?tXzz#aéMm7D
lXzN?*2Q“"JEUTZK—'x“æK &[AOuôm1vê;TT’Ylo1DôB0Yzô[AqKw3L%x“%um“eCi:#B0E™™’Seæ?—Xx!ô;-jOVNu?f[te1™4#r)K#[6r/èFYz™1jJô[(1mh“1_jPsPdq(owe5eè[69éh8v:gxsV“æ—2MnCêx‘D!P4exU


In [47]:
# from tqdm import tqdm  # Progress bar for better visualization

# Initialize optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=0.01)

# Decay
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10000, gamma=0.9)

# # Track loss history for analysis
# train_losses = []
# eval_losses = []

try:
    for iter in range(max_iters):
        # Sample a batch of data
        xb, yb = get_batch('train')
    
        # Forward pass
        logits, loss = model.forward(xb, yb)
    
        # Backpropagation
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        
        # # Clip Gradients to Prevent Exploding Gradients
        # torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    
        # Optimizer Step
        optimizer.step()
    
        # Learning Rate Decay
        # scheduler.step()
    
        # Log and evaluate loss at intervals
        if iter % eval_iters == 0:
            losses = estimate_loss()
            train_loss = losses["train"]
            eval_loss = losses["val"]

            print(f'Step: {iter}, Train Loss: {train_loss:.4f}, Eval Loss: {eval_loss:.4f}')
    
    print(f"Final Training Loss: {loss.item():.4f}")

except KeyboardInterrupt:
    print("\nTraining interrupted. Saving progress...")
    

tensor(2.4272, device='mps:0', grad_fn=<NllLossBackward0>)
Step: 0, Train Loss: 2.4533, Eval Loss: 2.5029
tensor(2.4655, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4900, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4947, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4512, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4454, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4312, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4597, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4340, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4267, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4365, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4433, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4643, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4145, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4234, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.4534, device='mps:0', grad_fn=<NllLossBackward0>)
tensor(2.

In [23]:
context = torch.zeros((1,1), dtype=torch.long, device=device)
generated_chars = decode(m.generate(context, max_new_tokens=500)[0].tolist())
print(generated_chars)


at a hed b h tlle athtorat wherlintin we by;
at
"
an Hiled thed pras te. w t hainthouf c ot av. 
ived ece cuevofre prssat Shrerenofevede, ateree
sour. y tha I wherovicot I aind t t, is t icexpo heis ow
nd E wis mong pr.
it tieiriss higedien esefany, ithicontrersus I ase bicofutrs
bld th  THEESEn nd F bered pr bs sthionis w o t uindere myss
wan g verd orecry whe
pand sc. th wheen fowanthis

ll "NDomoreas pes, smederile Bupiltisifithe en d a
mind I to cked kithenirsorvigig w?"I'mimurd if be
s d EE


In [None]:
x = torch.tensor([-0.5], dtype=torch.float32)
y = F.relu(x)
print(y)

In [None]:
x = torch.tensor([0.5], dtype=torch.float32)
y = F.sigmoid(x)
print(y)

In [None]:
x = torch.tensor([-10.05], dtype=torch.float32)
y = F.tanh(x)
print(y)

In [34]:
love = torch.tensor([0.4, 0.6, 0.5])
i = torch.tensor([0.1, 0.2, 0.3])

dot_love_i = torch.matmul(love, i)
dot_love_love = torch.matmul(love, love)
print(dot_love_i)
print(dot_love_love)

# Scale the dot product by the square root of the dimension (sqrt(d_k))
d_k = love.shape[0]  # The dimension of the vectors (3 in this case)
scaled_dot_love_i = dot_love_i / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
scaled_dot_love_love = dot_love_love / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))

# Print the scaled dot product
print("Scaled dot product of 'love' and 'I':", scaled_dot_love_i)
print("Scaled dot product of 'love' and 'love':", scaled_dot_love_love)

# Softmax of scaled dot product
softmax_love_i = F.softmax(scaled_dot_love_i, dim=-1)
softmax_love_love = F.softmax(scaled_dot_love_love, dim=-1)

print("Final output of 'love' and 'i':",  softmax_love_i)

tensor(0.3100)
tensor(0.7700)
Scaled dot product of 'love' and 'I': tensor(0.1790)
Scaled dot product of 'love' and 'love': tensor(0.4446)
Final output of 'love' and 'i': tensor(1.)


In [46]:
# Define the tensors for multiple queries (Q), keys (K), and values (V)
love = torch.tensor([0.4, 0.6, 0.5])
i = torch.tensor([0.1, 0.2, 0.3])
ice = torch.tensor([0.2, 0.4, 0.6])
cream = torch.tensor([0.4, 0.1, 0.3])

# Values (V) could be the same as keys in this example, but they can be different in practice
values = torch.stack([love, i, ice,  cream])  # Shape: (3, 3)

# Now calculate dot products for multiple queries (Q) and keys (K)
queries = torch.stack([love, i, ice, cream])  # Shape: (4, 3)
keys = torch.stack([love, i, ice, cream])  # Shape: (4, 3)

# Dot product of queries with keys (Q.K^T)
dot_products = torch.matmul(queries, keys.T)  # Shape: (3, 3)

# Print the dot product matrix
print("Dot products (Q.K^T):")
print(dot_products)

# Scale the dot products by the square root of the dimension (sqrt(d_k))
d_k = love.shape[0]  # Dimension of the vectors (3 in this case)
scaled_dot_products = dot_products / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))

# Print the scaled dot products
print("Scaled dot products (Q.K^T / sqrt(d_k)):")
print(scaled_dot_products)

# Softmax of the scaled dot products across the rows (queries)
softmax_scores = F.softmax(scaled_dot_products, dim=-1)

# Print the softmax scores (attention weights)
print("Softmax scores (attention weights):")
print(softmax_scores)

# Weighted sum of values using attention scores
output = torch.matmul(softmax_scores, values)  # Shape: (4, 4) * (4, 3) -> (3, 3)


# To ensure final output shape is (4, 4), we'll project the output back to a 4-dimensional space
# Let's simply extend the output (this is just an example of how to project it)
# In real-world scenarios, you'd use a learned projection matrix, but here we can just use a linear transformation for simplicity

# Simple linear transformation (optional, just to change the dimensionality of the output)
W = torch.randn(3, 4)  # Projection matrix (3 -> 4)
final_output = torch.matmul(output, W)  # Shape: (4, 3) * (3, 4) -> (4, 4)

# Print the final attention output
print("Attention output (weighted sum of values):")
print(final_output)



Dot products (Q.K^T):
tensor([[0.7700, 0.3100, 0.6200, 0.3700],
        [0.3100, 0.1400, 0.2800, 0.1500],
        [0.6200, 0.2800, 0.5600, 0.3000],
        [0.3700, 0.1500, 0.3000, 0.2600]])
Scaled dot products (Q.K^T / sqrt(d_k)):
tensor([[0.4446, 0.1790, 0.3580, 0.2136],
        [0.1790, 0.0808, 0.1617, 0.0866],
        [0.3580, 0.1617, 0.3233, 0.1732],
        [0.2136, 0.0866, 0.1732, 0.1501]])
Softmax scores (attention weights):
tensor([[0.2876, 0.2205, 0.2637, 0.2283],
        [0.2631, 0.2385, 0.2586, 0.2399],
        [0.2763, 0.2271, 0.2669, 0.2297],
        [0.2646, 0.2330, 0.2541, 0.2483]])
Attention output (weighted sum of values):
tensor([[ 0.1254,  0.1716, -0.6754,  1.1099],
        [ 0.1226,  0.1687, -0.6686,  1.0827],
        [ 0.1244,  0.1699, -0.6732,  1.1011],
        [ 0.1228,  0.1704, -0.6701,  1.0811]])


In [43]:
# Define the number of heads and the embedding dimension (d_model)
num_heads = 2
d_model = 6  # Total dimensionality of the input embeddings
d_k = d_v = d_model // num_heads  # Dimension of each query/key/value per head

# Define the tensors for multiple queries (Q), keys (K), and values (V)
# In practice, these would be embeddings of words in a sentence
love = torch.tensor([0.4, 0.6, 0.5, 0.2, 0.1, 0.3])
i = torch.tensor([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
ice = torch.tensor([0.2, 0.4, 0.6, 0.1, 0.3, 0.5])
cream = torch.tensor([0.4, 0.1, 0.3, 0.6, 0.2, 0.4])

# Stack all inputs into one tensor (sequence of 4 words, each of dimension 6)
inputs = torch.stack([love, i, ice, cream])  # Shape: (4, 6)

# Initialize weights for the linear projections for Q, K, and V (for each head)
W_Q = torch.randn(num_heads, d_model, d_k)  # Shape: (2, 6, 3)
W_K = torch.randn(num_heads, d_model, d_k)  # Shape: (2, 6, 3)
W_V = torch.randn(num_heads, d_model, d_v)  # Shape: (2, 6, 3)

# Step 1: Linear projections for each head
Q = torch.matmul(inputs, W_Q[0])  # Query for head 1
K = torch.matmul(inputs, W_K[0])  # Key for head 1
V = torch.matmul(inputs, W_V[0])  # Value for head 1

Q2 = torch.matmul(inputs, W_Q[1])  # Query for head 2
K2 = torch.matmul(inputs, W_K[1])  # Key for head 2
V2 = torch.matmul(inputs, W_V[1])  # Value for head 2

# Step 2: Perform self-attention for each head
def scaled_dot_product_attention(Q, K, V):
    d_k = Q.shape[-1]
    scores = torch.matmul(Q, K.T) / torch.sqrt(torch.tensor(d_k, dtype=torch.float32))
    attention_weights = F.softmax(scores, dim=-1)
    output = torch.matmul(attention_weights, V)
    return output

# Self-attention for head 1
attn_output1 = scaled_dot_product_attention(Q, K, V)

# Self-attention for head 2
attn_output2 = scaled_dot_product_attention(Q2, K2, V2)

# Step 3: Concatenate the attention outputs of both heads
attn_output = torch.cat([attn_output1, attn_output2], dim=-1)  # Concatenate along the feature dimension

# Step 4: Pass the concatenated output through a final linear layer
W_O = torch.randn(num_heads * d_v, d_model)  # Output projection matrix
final_output = torch.matmul(attn_output, W_O)  # Shape: (4, 6)

# Print the final multi-head attention output
print("Final Multi-Head Attention Output:")
print(final_output)

Final Multi-Head Attention Output:
tensor([[-2.6928,  0.6771, -1.5836, -0.0237, -1.3448,  1.0443],
        [-2.9303,  0.7301, -1.6907, -0.2043, -1.5105,  1.1229],
        [-2.7500,  0.7064, -1.5690, -0.0254, -1.3526,  1.0658],
        [-2.8918,  0.7441, -1.6654, -0.1795, -1.4675,  1.0948]])
