In [1]:
from gensim.models import Word2Vec
import torch


In [2]:
words=["the","sun","rises","in","the","east"]
model=Word2Vec([words],min_count=1,vector_size=3)


In [3]:
print(model.wv['sun'])  # Example to get the vector for the word 'sun'

[-0.15122044  0.21846838 -0.16200535]


In [4]:
inputs = torch.tensor(model.wv[words])  # Convert words to their corresponding vectors
print(inputs)

tensor([[-0.0179,  0.0079,  0.1701],
        [-0.1512,  0.2185, -0.1620],
        [-0.1254,  0.2460, -0.0511],
        [ 0.2153,  0.2991, -0.1672],
        [-0.0179,  0.0079,  0.1701],
        [ 0.3003, -0.3101, -0.2372]])


In [5]:
query = inputs[1]  # 'sun'
print(inputs.shape)
scores = torch.empty(len(words))
for i,x in enumerate(inputs):
    scores[i] = torch.dot(query, x)
print(scores)

torch.Size([6, 3])
tensor([-0.0231,  0.0968,  0.0810,  0.0599, -0.0231, -0.0747])


In [6]:
weights = scores/torch.sum(scores)
print(weights)

weights_softmax = torch.softmax(scores, dim=0)
print(weights_softmax)

tensor([-0.1982,  0.8297,  0.6940,  0.5130, -0.1982, -0.6403])
tensor([0.1594, 0.1797, 0.1769, 0.1732, 0.1594, 0.1514])


In [7]:
context_vec2 = torch.zeros(3)
for i, w in enumerate(weights_softmax):
    context_vec2 += w * inputs[i]
print(context_vec2)

tensor([ 0.0277,  0.0902, -0.0488])


In [8]:
attention_scores = torch.matmul(inputs, inputs.T)
print(attention_scores)

tensor([[ 0.0293, -0.0231, -0.0045, -0.0299,  0.0293, -0.0482],
        [-0.0231,  0.0968,  0.0810,  0.0599, -0.0231, -0.0747],
        [-0.0045,  0.0810,  0.0789,  0.0551, -0.0045, -0.1018],
        [-0.0299,  0.0599,  0.0551,  0.1638, -0.0299,  0.0116],
        [ 0.0293, -0.0231, -0.0045, -0.0299,  0.0293, -0.0482],
        [-0.0482, -0.0747, -0.1018,  0.0116, -0.0482,  0.2426]])


In [9]:
attention_weights = torch.softmax(attention_scores, dim=-1)
print(attention_weights)

tensor([[0.1729, 0.1641, 0.1672, 0.1630, 0.1729, 0.1600],
        [0.1594, 0.1797, 0.1769, 0.1732, 0.1594, 0.1514],
        [0.1627, 0.1773, 0.1769, 0.1727, 0.1627, 0.1476],
        [0.1553, 0.1699, 0.1691, 0.1885, 0.1553, 0.1619],
        [0.1729, 0.1641, 0.1672, 0.1630, 0.1729, 0.1600],
        [0.1582, 0.1541, 0.1499, 0.1680, 0.1582, 0.2116]])


In [10]:
print("Sums to 1:", attention_weights.sum(dim=-1))

Sums to 1: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [11]:
contexts_vecs = torch.matmul(attention_weights, inputs)
print(contexts_vecs)

tensor([[ 0.0312,  0.0788, -0.0415],
        [ 0.0277,  0.0902, -0.0488],
        [ 0.0267,  0.0907, -0.0463],
        [ 0.0367,  0.0873, -0.0532],
        [ 0.0312,  0.0788, -0.0415],
        [ 0.0519,  0.0577, -0.0571]])


Simple self attention mechanism with trainable weights


In [12]:
print(inputs)
w_query=torch.nn.Parameter(torch.randn(inputs.shape[1],2),requires_grad=True)  
w_key=torch.nn.Parameter(torch.randn(inputs.shape[1],2),requires_grad=True)  
w_value=torch.nn.Parameter(torch.randn(inputs.shape[1],2),requires_grad=True)
print(w_key,w_value,w_query)

tensor([[-0.0179,  0.0079,  0.1701],
        [-0.1512,  0.2185, -0.1620],
        [-0.1254,  0.2460, -0.0511],
        [ 0.2153,  0.2991, -0.1672],
        [-0.0179,  0.0079,  0.1701],
        [ 0.3003, -0.3101, -0.2372]])
Parameter containing:
tensor([[-1.6238,  0.2711],
        [-0.3391,  0.8070],
        [ 2.8530,  1.9266]], requires_grad=True) Parameter containing:
tensor([[-0.6016, -1.0846],
        [ 1.0281, -0.6235],
        [-0.4470,  0.9285]], requires_grad=True) Parameter containing:
tensor([[-1.1993,  0.2072],
        [ 0.5061,  0.5635],
        [ 1.0926,  0.8892]], requires_grad=True)


In [13]:
query2 = torch.matmul(inputs, w_query)
keys2 = torch.matmul(inputs, w_key)
values2 = torch.matmul(inputs, w_value)
print(query2,keys2,values2)

tensor([[ 0.2113,  0.1520],
        [ 0.1149, -0.0523],
        [ 0.2191,  0.0672],
        [-0.2895,  0.0645],
        [ 0.2113,  0.1520],
        [-0.7763, -0.3235]], grad_fn=<MmBackward0>) tensor([[ 0.5117,  0.3292],
        [-0.2907, -0.1768],
        [-0.0256,  0.0660],
        [-0.9280, -0.0224],
        [ 0.5117,  0.3292],
        [-1.0593, -0.6259]], grad_fn=<MmBackward0>) tensor([[-0.0572,  0.1724],
        [ 0.3880, -0.1226],
        [ 0.3512, -0.0648],
        [ 0.2527, -0.5752],
        [-0.0572,  0.1724],
        [-0.3934, -0.3527]], grad_fn=<MmBackward0>)


In [14]:
attention_scores2 = torch.matmul(query2, keys2.T)
print(attention_scores2)
attention_scores2norm = torch.softmax(attention_scores2/(2**0.5), dim=-1)
print(attention_scores2norm)

tensor([[ 0.1582, -0.0883,  0.0046, -0.1995,  0.1582, -0.3190],
        [ 0.0416, -0.0242, -0.0064, -0.1055,  0.0416, -0.0890],
        [ 0.1342, -0.0756, -0.0012, -0.2048,  0.1342, -0.2742],
        [-0.1269,  0.0728,  0.0117,  0.2672, -0.1269,  0.2663],
        [ 0.1582, -0.0883,  0.0046, -0.1995,  0.1582, -0.3190],
        [-0.5037,  0.2829, -0.0015,  0.7276, -0.5037,  1.0248]],
       grad_fn=<MmBackward0>)
tensor([[0.1913, 0.1607, 0.1716, 0.1486, 0.1913, 0.1365],
        [0.1744, 0.1665, 0.1686, 0.1572, 0.1744, 0.1590],
        [0.1884, 0.1625, 0.1712, 0.1483, 0.1884, 0.1412],
        [0.1450, 0.1670, 0.1599, 0.1916, 0.1450, 0.1915],
        [0.1913, 0.1607, 0.1716, 0.1486, 0.1913, 0.1365],
        [0.0952, 0.1660, 0.1358, 0.2274, 0.0952, 0.2805]],
       grad_fn=<SoftmaxBackward0>)


In [15]:
class selfattention(torch.nn.Module):
    def __init__(self, input_dim, out_dim):
        super().__init__()
        self.w_query = torch.nn.Parameter(torch.randn(input_dim, out_dim), requires_grad=True)
        self.w_key = torch.nn.Parameter(torch.randn(input_dim, out_dim), requires_grad=True)
        self.w_value = torch.nn.Parameter(torch.randn(input_dim, out_dim), requires_grad=True)

    def forward(self, x):
        queries = torch.matmul(x, self.w_query)
        keys = torch.matmul(x, self.w_key)
        values = torch.matmul(x, self.w_value)

        attention_scores = torch.matmul(queries, keys.T) 
        attention_weights = torch.softmax(attention_scores/(keys.shape[1]**0.5), dim=-1)

        output = torch.matmul(attention_weights, values)
        return output

In [16]:
test_attention = selfattention(input_dim=3, out_dim=2)
output = test_attention(inputs)
print(output)

tensor([[0.2101, 0.1114],
        [0.2500, 0.0576],
        [0.2598, 0.0606],
        [0.2488, 0.0341],
        [0.2101, 0.1114],
        [0.1072, 0.1362]], grad_fn=<MmBackward0>)


In [17]:
class selfattentionv2(torch.nn.Module):
    def __init__(self, input_dim, out_dim,qkv_bias=False):
        super().__init__()
        self.w_query = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_key = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_value = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)

    def forward(self, x):
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_value(x)

        attention_scores = torch.matmul(queries, keys.T) 
        attention_weights = torch.softmax(attention_scores/(keys.shape[1]**0.5), dim=-1)

        output = torch.matmul(attention_weights, values)
        return output,attention_weights

In [18]:
test2 = selfattentionv2(input_dim=3, out_dim=2)
output2,weights = test2.forward(inputs)
print(weights)

tensor([[0.1662, 0.1663, 0.1668, 0.1692, 0.1662, 0.1654],
        [0.1663, 0.1677, 0.1670, 0.1653, 0.1663, 0.1674],
        [0.1662, 0.1675, 0.1670, 0.1664, 0.1662, 0.1668],
        [0.1680, 0.1673, 0.1662, 0.1610, 0.1680, 0.1695],
        [0.1662, 0.1663, 0.1668, 0.1692, 0.1662, 0.1654],
        [0.1687, 0.1662, 0.1658, 0.1614, 0.1687, 0.1693]],
       grad_fn=<SoftmaxBackward0>)


<b>Causal Attention Mechanism

In [19]:
mask = torch.tril(torch.ones(weights.shape))
print(mask)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])


In [20]:
masked_weights = weights * mask
print(masked_weights)

tensor([[0.1662, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1663, 0.1677, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1662, 0.1675, 0.1670, 0.0000, 0.0000, 0.0000],
        [0.1680, 0.1673, 0.1662, 0.1610, 0.0000, 0.0000],
        [0.1662, 0.1663, 0.1668, 0.1692, 0.1662, 0.0000],
        [0.1687, 0.1662, 0.1658, 0.1614, 0.1687, 0.1693]],
       grad_fn=<MulBackward0>)


In [21]:
row_sums = masked_weights.sum(dim=-1, keepdim=True)
normalized_masked_weights = masked_weights / row_sums 
print(normalized_masked_weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4979, 0.5021, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3319, 0.3345, 0.3336, 0.0000, 0.0000, 0.0000],
        [0.2536, 0.2525, 0.2509, 0.2430, 0.0000, 0.0000],
        [0.1991, 0.1992, 0.1998, 0.2027, 0.1991, 0.0000],
        [0.1687, 0.1662, 0.1658, 0.1614, 0.1687, 0.1693]],
       grad_fn=<DivBackward0>)


In [22]:
mask = torch.triu(torch.ones(weights.shape), diagonal=1)
masked=weights.masked_fill(mask.bool(), float('-inf'))
print(masked)

tensor([[0.1662,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1663, 0.1677,   -inf,   -inf,   -inf,   -inf],
        [0.1662, 0.1675, 0.1670,   -inf,   -inf,   -inf],
        [0.1680, 0.1673, 0.1662, 0.1610,   -inf,   -inf],
        [0.1662, 0.1663, 0.1668, 0.1692, 0.1662,   -inf],
        [0.1687, 0.1662, 0.1658, 0.1614, 0.1687, 0.1693]],
       grad_fn=<MaskedFillBackward0>)


In [23]:
weights = torch.softmax(masked/keys2.shape[-1]**0.5, dim=1)
print(weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4997, 0.5003, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3332, 0.3335, 0.3334, 0.0000, 0.0000, 0.0000],
        [0.2504, 0.2503, 0.2501, 0.2492, 0.0000, 0.0000],
        [0.1999, 0.1999, 0.2000, 0.2003, 0.1999, 0.0000],
        [0.1669, 0.1666, 0.1666, 0.1660, 0.1669, 0.1670]],
       grad_fn=<SoftmaxBackward0>)


<b>Attention weights with dropout

In [24]:
torch.manual_seed(123)
dropout = torch.nn.Dropout(p=0.5)
dropped_weights = dropout(weights)
print(dropped_weights)

tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 1.0005, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.6667, 0.0000, 0.0000, 0.0000],
        [0.5008, 0.5006, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3998, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.3332, 0.0000, 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


<b>Causal Attention Mechanism Class

In [25]:
class causalattention(torch.nn.Module):
    def __init__(self, input_dim, out_dim,context_len,dropout,qkv_bias=False):
        super().__init__()
        self.w_query = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_key = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_value = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_len, context_len), diagonal=1))
    def forward(self, x):
        b,num_tokens,input_dim = x.shape
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_value(x)

        attention_scores = torch.matmul(queries, keys.transpose(1,2))
        attention_scores=attention_scores.masked_fill(self.mask.bool()[:num_tokens,:num_tokens], float('-inf')) 
        attention_weights = torch.softmax(attention_scores/(keys.shape[1]**0.5), dim=-1)
        attention_weights = self.dropout(attention_weights)
        output = torch.matmul(attention_weights, values)
        return output

In [26]:
batch = torch.stack((inputs,inputs),dim=0)  # Create a batch of 2 identical sequences
print(batch.shape)
test3 = causalattention(input_dim=3, out_dim=2,context_len=6,dropout=0.5)
contexts_vecs = test3.forward(batch)
print(contexts_vecs)


torch.Size([2, 6, 3])
tensor([[[ 0.0000,  0.0000],
         [ 0.0000,  0.0000],
         [ 0.0055,  0.2110],
         [-0.0492,  0.1841],
         [-0.0018,  0.0584],
         [-0.0016,  0.0483]],

        [[-0.0294, -0.0246],
         [ 0.0254,  0.1736],
         [ 0.0168,  0.1152],
         [ 0.0125,  0.0858],
         [-0.0437,  0.0797],
         [-0.0355, -0.0025]]], grad_fn=<UnsafeViewBackward0>)


<b> Multi Head attention mechanism


In [27]:
class Multiheadattentionv1(torch.nn.Module):
    def __init__(self, input_dim, out_dim, num_heads, context_len, dropout, qkv_bias=False):
        super().__init__()
        self.num_heads = num_heads
        self.attention_heads = torch.nn.ModuleList([
            causalattention(input_dim, out_dim, context_len, dropout, qkv_bias) 
            for _ in range(num_heads)
        ])

    def forward(self, x):
        return torch.cat([head(x) for head in self.attention_heads], dim=-1)

In [28]:
test4 = Multiheadattentionv1(input_dim=3, out_dim=2, num_heads=2, context_len=6, dropout=0.5)
contexts_vecs_multihead = test4.forward(batch)
print(contexts_vecs_multihead)

tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0155, -0.0989, -0.0257,  0.0369],
         [-0.0426, -0.0455, -0.0035, -0.0393],
         [ 0.0407, -0.0665, -0.0009, -0.0094],
         [ 0.0250, -0.1343, -0.0020, -0.0236],
         [ 0.0415, -0.0551,  0.0331, -0.0256]],

        [[ 0.1580, -0.0613,  0.0000,  0.0000],
         [-0.0637, -0.0681, -0.0257,  0.0369],
         [ 0.0119, -0.1339, -0.0183,  0.0120],
         [-0.0095, -0.1013, -0.0035, -0.0390],
         [ 0.0250, -0.1343, -0.0102,  0.0147],
         [ 0.0049, -0.0333,  0.0029,  0.0358]]], grad_fn=<CatBackward0>)


<b>Multi head attention with weight splits

In [29]:
class Multiheadattentionv2(torch.nn.Module):
    def __init__(self, input_dim, out_dim, num_heads, context_len, dropout, qkv_bias=False):
        super().__init__()
        assert out_dim % num_heads == 0, "out_dim must be divisible by num_heads"
        self.out_dim = out_dim
        self.num_heads = num_heads
        self.head_dim = out_dim // num_heads
        self.w_query = torch.nn.Linear(input_dim, out_dim, bias=qkv_bias)
        self.w_key = torch.nn.Linear(input_dim, out_dim, bias=qkv_bias)
        self.w_value = torch.nn.Linear(input_dim, out_dim, bias=qkv_bias)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_len, context_len), diagonal=1))

    def forward(self, x):
        b, num_tokens, input_dim = x.shape
        
        # Split and reshape for multi-head attention
        queries = self.w_query(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        keys = self.w_key(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        values = self.w_value(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        
        attention_scores = torch.matmul(queries, keys.transpose(2, 3))
        attention_scores = attention_scores.masked_fill(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        attention_weights = torch.softmax(attention_scores/keys.shape[-1]**0.5, dim=-1)
        attention_weights = self.dropout(attention_weights)

        output = torch.matmul(attention_weights, values).transpose(1, 2).contiguous().view(b, num_tokens, self.out_dim)
        return output

In [30]:
words=["the","sun","rises","in","the","east"]
model=Word2Vec([words],min_count=1,vector_size=6)
inputs = torch.tensor(model.wv[words])  # Convert words to their corresponding vectors
print(inputs)
batch = torch.stack((inputs,inputs),dim=0)  # Create a batch of 2 identical sequences
print(batch.shape)
test4 = Multiheadattentionv2(input_dim=6, out_dim=6, num_heads=2, context_len=6, dropout=0.5)
contexts_vecs_multihead = test4.forward(batch)
print(contexts_vecs_multihead)

tensor([[-0.0089,  0.0039,  0.0851,  0.1502, -0.1550, -0.1186],
        [ 0.1058, -0.0568, -0.0158,  0.0961, -0.1254, -0.0656],
        [-0.1381, -0.1575,  0.1219,  0.0845,  0.1126,  0.0127],
        [-0.0756,  0.1092, -0.0810, -0.0303,  0.0479,  0.0165],
        [-0.0089,  0.0039,  0.0851,  0.1502, -0.1550, -0.1186],
        [ 0.1076,  0.1495, -0.0836, -0.0627,  0.1230, -0.0256]])
torch.Size([2, 6, 6])
tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0037, -0.0019,  0.0136, -0.0799,  0.0117, -0.0673],
         [-0.0152,  0.0084,  0.0005, -0.0534,  0.0078, -0.0450],
         [-0.0286, -0.0125, -0.0564, -0.0629,  0.0240, -0.0547],
         [-0.0134, -0.0146, -0.0363, -0.0235,  0.0160,  0.0019],
         [-0.0097, -0.0127, -0.0251, -0.0686,  0.0199, -0.0589]],

        [[ 0.0074, -0.0038,  0.0272,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000, -0.0799,  0.0117, -0.0673],
         [-0.0152,  0.0084,  0.0005,  0.0000,  0.0000,  0.0000],
      

<b>Implementing Dummy GPT

In [31]:
GPT_CONFIG_124M={
    "vocab_size": 50257,
    "context_len": 1024,
    "n_embd": 768,
    "n_layer": 12,
    "n_head": 12,
    "dropout": 0.1,
    "qkv_bias": False
}

In [32]:
import torch
class DummyGPTModel(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(config['vocab_size'], config['n_embd'])
        self.position_embedding = torch.nn.Embedding(config['context_len'], config['n_embd'])
        self.dropout = torch.nn.Dropout(config['dropout'])
        
        #Placeholder for the actual transformer blocks
        self.trf = torch.nn.Sequential(
            *[DummyTransformer(config) for _ in range(config['n_layer'])])
        
        # Final layer norm
        self.final_norm = DummyLayerNorm(config['n_embd'])
        self.head = torch.nn.Linear(config['n_embd'], config['vocab_size'], bias=False)
        

    def forward(self, idx):
        b, t = idx.shape
        token_embeddings = self.token_embedding(idx)
        position_embeddings = self.position_embedding(torch.arange(t, device=idx.device))
        
        x = token_embeddings + position_embeddings
        x = self.dropout(x)

        x = self.trf(x)
        x = self.final_norm(x)
        logits = self.head(x)
        
        return logits

class DummyTransformer(torch.nn.Module):
    def __init__(self,config):
        super().__init__()
    def forward(self,x):
        return x
class DummyLayerNorm(torch.nn.Module):
    def __init__(self,shape,eps=1e-5):
        super().__init__()
    def forward(self,x):
        return x

<b> Tokenization:

In [33]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
batch=[]
txt1 = "The sun rises in the"
txt2 = "The sun sets in the"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch)

tensor([[  464,  4252, 16736,   287,   262],
        [  464,  4252,  5621,   287,   262]])


<b>Instance of DummyGPTModel:

In [34]:
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print(logits.shape)
print(logits)

torch.Size([2, 5, 50257])
tensor([[[ 0.5209,  0.5520, -0.1626,  ..., -0.6756,  0.4891,  1.0015],
         [ 0.2441,  1.2358,  0.7340,  ...,  0.8618,  0.6503,  0.6837],
         [-0.1495,  0.3894, -0.2040,  ...,  0.3658, -0.6975,  0.0046],
         [-0.4361,  2.2998, -0.8146,  ...,  1.0815, -0.0610,  0.4576],
         [-0.6581, -0.4606,  0.6273,  ...,  0.6359, -1.1219, -0.1483]],

        [[ 0.6502,  0.6544, -0.4624,  ..., -0.6154,  0.2181,  1.1435],
         [ 0.4852,  1.2206, -0.0328,  ...,  0.7643,  0.6751,  0.7432],
         [ 0.4855,  0.2990, -0.0415,  ..., -0.3032,  0.9354,  0.3412],
         [-0.1921,  1.6800, -0.7408,  ...,  0.7503,  0.0276,  0.4491],
         [-0.4396, -0.8658,  0.2206,  ...,  0.7252, -1.3121,  0.0530]]],
       grad_fn=<UnsafeViewBackward0>)


<b> Layer Normalization:

In [35]:
class LayerNorm(torch.nn.Module):
    def __init__(self, n_embd, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = torch.nn.Parameter(torch.ones(n_embd))
        self.beta = torch.nn.Parameter(torch.zeros(n_embd))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        x_normalized = (x - mean) / torch.sqrt(variance + self.eps)
        return self.gamma * x_normalized + self.beta

In [36]:
torch.manual_seed(123)
batch = torch.randn(2, 6)
ln = LayerNorm(n_embd=6)
output = ln.forward(batch)
mean = output.mean(dim=-1,keepdim=True)
var = output.var(dim=-1,keepdim=True)
print(mean)
print(var)
print(output)

tensor([[6.1467e-08],
        [6.9539e-08]], grad_fn=<MeanBackward1>)
tensor([[1.1999],
        [1.1999]], grad_fn=<VarBackward0>)
tensor([[ 0.3324,  0.8349, -0.2272,  0.0529, -2.0204,  1.0276],
        [-1.4542, -0.9964,  1.2767,  0.3657,  1.0374, -0.2291]],
       grad_fn=<AddBackward0>)


<b> GELU Activation function:

In [37]:
import torch
class GELU(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi)) * (x + 0.044715 * (x ** 3))))

In [38]:
class FeedForward(torch.nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(cfg['n_embd'], cfg['n_embd']*4),
            GELU(),
            torch.nn.Linear(cfg['n_embd']*4, cfg['n_embd']),
        )

    def forward(self, x):
        return self.layers(x)


In [39]:
ffn = FeedForward(GPT_CONFIG_124M)
batch = torch.randn(2, 6, 768)
output = ffn.forward(batch)
print(output.shape)

torch.Size([2, 6, 768])


<b>Transformers:

In [40]:
class transformer_block(torch.nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.ln1 = LayerNorm(cfg['n_embd'])
        self.attn = Multiheadattentionv2(cfg['n_embd'], cfg['n_embd'], cfg['n_head'], cfg['context_len'], cfg['dropout'], cfg['qkv_bias'])
        self.ln2 = LayerNorm(cfg['n_embd'])
        self.ffn = FeedForward(cfg)
        self.dropout = torch.nn.Dropout(cfg['dropout'])
        

    def forward(self, x):
        shortcut = x
        x = self.ln1(x)
        x = self.attn(x)
        x = self.dropout(x)
        x = x + shortcut
        
        shortcut = x
        x = self.ln2(x)
        x = self.ffn(x)
        x = self.dropout(x)
        x = x + shortcut
        return x

In [41]:
input = torch.randn(2, 6, 768)
block = transformer_block(GPT_CONFIG_124M)
output = block.forward(input)
print(input.shape)
print(output.shape)

torch.Size([2, 6, 768])
torch.Size([2, 6, 768])


<b> Implementing GPT from scratch

In [42]:
GPT_CONFIG_124M={
    "vocab_size": 50257,
    "context_len": 256,
    "n_embd": 768,
    "n_layer": 12,
    "n_head": 12,
    "dropout": 0.1,
    "qkv_bias": False
}

In [43]:
import torch
class GPTModel(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(config['vocab_size'], config['n_embd'])
        self.position_embedding = torch.nn.Embedding(config['context_len'], config['n_embd'])
        self.dropout = torch.nn.Dropout(config['dropout'])
        
        self.trf = torch.nn.Sequential(
            *[transformer_block(config) for _ in range(config['n_layer'])])

        self.final_norm = LayerNorm(config['n_embd'])
        self.head = torch.nn.Linear(config['n_embd'], config['vocab_size'], bias=False)
        

    def forward(self, idx):
        b, t = idx.shape
        token_embeddings = self.token_embedding(idx)
        position_embeddings = self.position_embedding(torch.arange(t, device=idx.device))
        
        x = token_embeddings + position_embeddings
        x = self.dropout(x)

        x = self.trf(x)
        x = self.final_norm(x)
        logits = self.head(x)
        
        return logits


In [44]:
batch=[]
txt1 = "The sun rises in"
txt2 = "The sun sets in"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
model = GPTModel(GPT_CONFIG_124M)
print(batch.shape)
print(batch)
output = model(batch)
print(output.shape)
print(output)

torch.Size([2, 4])
tensor([[  464,  4252, 16736,   287],
        [  464,  4252,  5621,   287]])
torch.Size([2, 4, 50257])
tensor([[[ 0.7702, -0.4592,  0.3900,  ..., -0.3690,  0.0256, -0.0773],
         [ 0.1550,  0.8860,  0.4371,  ..., -0.0447, -1.0403,  0.0203],
         [ 0.6085,  0.6617, -0.0735,  ..., -0.8982,  0.4881,  0.0329],
         [ 0.1109, -0.2661,  0.0500,  ...,  0.3325, -0.4801, -0.1991]],

        [[ 0.7637, -0.0254,  0.3722,  ..., -0.5632,  0.0634,  0.0950],
         [ 0.2272,  0.7894,  0.6884,  ...,  0.1864, -0.4143,  0.6640],
         [-0.0578,  0.2830, -0.0868,  ..., -0.0354, -0.2316,  0.6870],
         [-0.1122, -0.4252, -0.5834,  ...,  0.7350, -0.4191,  0.0331]]],
       grad_fn=<UnsafeViewBackward0>)


In [45]:
total_param = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_param:,}")

Total parameters: 155,332,608


In [46]:
total_param_2=total_param-sum(p.numel() for p in model.head.parameters())
print(f"Total parameters without the head: {total_param_2:,}")

Total parameters without the head: 116,735,232


<b> Generating text from output tokens:


In [47]:
def generate_output_txt(model,input, max_new_tokens,context_len):
    for i in range(max_new_tokens):
        input_cond = input[:,-context_len:]
        with torch.no_grad():
            logits = model(input_cond)
        logits = logits[:,-1,:]
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.argmax(probs,dim=-1,keepdim=True)
        input = torch.cat((input, next_token), dim=1)
    return input

In [48]:
input = "The sun rises in"
encoded_input = torch.tensor(tokenizer.encode(input)).unsqueeze(0) 
print(encoded_input)

tensor([[  464,  4252, 16736,   287]])


In [49]:
model.eval()
output_tokens = generate_output_txt(model, encoded_input, max_new_tokens=6, context_len=GPT_CONFIG_124M['context_len'])
print(output_tokens)
print(tokenizer.decode(output_tokens[0].tolist()))

tensor([[  464,  4252, 16736,   287, 38783, 12171, 47776, 29697, 43110, 24844]])
The sun rises in483ulkimeoAAA sparing ===


In [50]:
def text_to_tokens(text,tokenizer):
    return torch.tensor(tokenizer.encode(text)).unsqueeze(0)
def tokens_to_text(tokens,tokenizer):
    return tokenizer.decode((tokens.squeeze(0)).tolist())

ex = "The sun rises in"
toeken_ids=generate_output_txt(model,text_to_tokens(ex,tokenizer), max_new_tokens=6,context_len=GPT_CONFIG_124M['context_len'])
print(tokens_to_text(toeken_ids,tokenizer))
    

The sun rises in483ulkimeoAAA sparing ===


In [51]:
inputs = text_to_tokens("every effort moves",tokenizer)
inputs = torch.cat([inputs, text_to_tokens("I really like",tokenizer)])
print(inputs)
target = torch.tensor([[3626,6100,345],
                       [1107,588,11311]])
print(target)

tensor([[16833,  3626,  6100],
        [   40,  1107,   588]])
tensor([[ 3626,  6100,   345],
        [ 1107,   588, 11311]])


In [52]:
with torch.no_grad():
    logits = model(inputs)
probabs = torch.softmax(logits, dim=-1)
output = torch.argmax(probabs,dim=-1)
print(output)

tensor([[17891, 31803, 43889],
        [40728, 17646, 47633]])


In [53]:
print("Target:",tokens_to_text(target[0],tokenizer))
print("Output:",tokens_to_text(output[0],tokenizer))

Target:  effort moves you
Output:  Banglmaps�


<b> Cross Entropy Loss :

In [54]:
text_idx = 0 
target_1 = probabs[text_idx,[0,1,2],target[text_idx]]
print(target_1)

text_idx = 1
target_2 = probabs[text_idx,[0,1,2],target[text_idx]]
print(target_2)

tensor([1.2980e-05, 1.5123e-05, 3.1214e-05])
tensor([1.1114e-05, 1.9711e-05, 3.5344e-05])


In [55]:
#Log of all token probability
log_probas  = torch.log(torch.cat((target_1,target_2)))
print(log_probas)

tensor([-11.2521, -11.0993, -10.3747, -11.4073, -10.8343, -10.2504])


In [56]:
#Calculate average probability
avg_log_prob = torch.mean(log_probas)
print(avg_log_prob)

tensor(-10.8697)


In [57]:
neg_avg_log_prob = -avg_log_prob
print(neg_avg_log_prob)

tensor(10.8697)


In [58]:
#Using PyTorch Cross Entropy Loss
logits_flat = logits.flatten(0,1)
target = target.flatten()
print(logits_flat.shape,target.shape)


torch.Size([6, 50257]) torch.Size([6])


In [59]:
loss = torch.nn.functional.cross_entropy(logits_flat, target)
print(loss)

tensor(10.8697)


In [60]:
with open("the-verdict.txt","r",encoding="utf-8") as f:
    text = f.read()

In [61]:
print(text[:100])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [62]:
total_chars = len(text)
total_tokens = len(tokenizer.encode(text))

print(total_chars)
print(total_tokens)

20479
5145


In [63]:
from torch.utils.data import Dataset, DataLoader
class GPTDataset(Dataset):
    def __init__(self, text, tokenizer, context_len,stride):
        self.inputs = []
        self.targets =[]
        
        tokens = tokenizer.encode(text)
        
        for i in range(0, len(tokens) - context_len, stride):
            input_ids = tokens[i:i + context_len]
            target_ids = tokens[i + 1:i + context_len + 1]
            self.inputs.append(torch.tensor(input_ids))
            self.targets.append(torch.tensor(target_ids))
        
    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]
    
def create_dataloader(text,batch_size=4,context_len =256,stride=128,shuffle=True,drop_last=True,num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDataset(text, tokenizer, context_len,stride)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last,num_workers=num_workers)

In [64]:
train_ratio = 0.9
train_size = int(train_ratio * len(text))
train_text = text[:train_size]
val_text = text[train_size:]

train_dataloader = create_dataloader(train_text,batch_size=2,context_len =GPT_CONFIG_124M["context_len"],stride=GPT_CONFIG_124M["context_len"],shuffle=True,drop_last=True,num_workers=0)
val_dataloader = create_dataloader(val_text,batch_size=2,context_len =GPT_CONFIG_124M["context_len"],stride=GPT_CONFIG_124M["context_len"],shuffle=False,drop_last=True,num_workers=0)

In [65]:
print(len(train_dataloader))
for xb,yb in train_dataloader:
    print(xb.shape,yb.shape)


for xb,yb in val_dataloader:
    print(xb.shape,yb.shape)



9
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [66]:
def calc_loss(input_batch,output_batch,model,device):
    input_batch = input_batch.to(device)
    output_batch = output_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1),output_batch.flatten())
    return loss

def calc_loss_loader(dataloader,model,device,num_batches=None):
    loss = 0
    if len(dataloader)==0:
        return 'nan'
    elif num_batches is None:
        num_batches = len(dataloader)
    else:
        num_batches = min(num_batches,len(dataloader))
    for i,(xb,yb) in enumerate(dataloader):
        if i>=num_batches:
            break
        loss += calc_loss(xb,yb,model,device).item()
    return loss/num_batches

In [67]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

with torch.no_grad():
    train_loss = calc_loss_loader(train_dataloader,model,device,num_batches=10)
    val_loss = calc_loss_loader(val_dataloader,model,device,num_batches=10)
print(f"Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")

Train loss: 10.9952, Val loss: 10.9774


In [68]:
def train_simple(model,train_dataloader,val_dataloader,optimizer,device,epochs=3,eval_freq=200,eval_iter=10,start_context=0,tokenizer=None):
    train_losses,val_losses,track_tokens_seen=[],[],[]
    tokens_seen,global_step=0,-1
    for epoch in range(epochs):
        model.train()
        for i,(xb,yb) in enumerate(train_dataloader):
            optimizer.zero_grad()
            loss = calc_loss(xb,yb,model,device)
            loss.backward()
            optimizer.step()
            tokens_seen += xb.numel()
            global_step += 1
            if (global_step) % eval_freq == 0:
                train_loss,val_loss = eval_simple(model,train_dataloader,val_dataloader,device,eval_batches=eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Epoch {epoch+1}, Step {i+1}, Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")
                model.train()
        
        generate_text(model,tokenizer,device,start_context)
    
    return train_losses,val_losses,track_tokens_seen

In [69]:
def eval_simple(model,train_dataloader,val_dataloader,device,eval_batches=10):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_dataloader,model,device,num_batches=eval_batches)
        val_loss = calc_loss_loader(val_dataloader,model,device,num_batches=eval_batches)
    return train_loss,val_loss

In [70]:
def generate_text(model,tokenizer,device, start_text):
    model.eval()
    context = model.position_embedding.weight.shape[0]
    encoded_input = text_to_tokens(start_text,tokenizer).to(device)
    with torch.no_grad():
        output_tokens = generate_output_txt(model, encoded_input, max_new_tokens=50, context_len=context)
    decoded_text = tokens_to_text(output_tokens[0],tokenizer)
    print(decoded_text.replace('\n',' '))
    model.train()

In [71]:
import time
start_time = time.time()

model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004,weight_decay=1e-1)

num_epochs = 10

train_loss,val_loss,tokens_seen = train_simple(model,train_dataloader,val_dataloader,optimizer,device,epochs=num_epochs,eval_freq=5,eval_iter=5,start_context="Every effort moves you",tokenizer=tokenizer)
end_time = time.time()
execution_time = end_time - start_time
print(f"Training completed in {execution_time/60:.2f} minutes.")

Epoch 1, Step 1, Train loss: 10.1222, Val loss: 10.1884
Epoch 1, Step 6, Train loss: 8.1288, Val loss: 8.3464
Every effort moves you                                                  
Epoch 2, Step 2, Train loss: 6.7700, Val loss: 7.0929
Epoch 2, Step 7, Train loss: 6.1485, Val loss: 6.5951
Every effort moves you,,,,,,,,,,,,,,,, the,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,, the,
Epoch 3, Step 3, Train loss: 5.8442, Val loss: 6.4898
Epoch 3, Step 8, Train loss: 6.6927, Val loss: 8.0587
Every effort moves you, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and
Epoch 4, Step 4, Train loss: 5.3863, Val loss: 6.4937
Epoch 4, Step 9, Train loss: 5.3765, Val loss: 6.6114
Every effort moves you, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and, and
Epoch 5, Step 5, Train loss: 5.1985, Val loss: 6.5593
Every effort moves you, the of the of the fact.                 

In [72]:
def generate_output_txt(model,input, max_new_tokens,context_len,temperature=0.0,top_k=None,eos_token=None):
    for i in range(max_new_tokens):
        input_cond = input[:,-context_len:]
        with torch.no_grad():
            logits = model(input_cond)
        logits = logits[:,-1,:]
        if top_k is not None:
            top_k_values, _ = torch.topk(logits, top_k)
            min_top_k = top_k_values[:, -1]
            logits = torch.where(logits < min_top_k, torch.tensor(float('-inf')).to(logits.device), logits)
    
        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
        else:
            next_token = torch.argmax(logits,dim=-1,keepdim=True)

        if next_token == eos_token:
            break
        
        input = torch.cat((input, next_token), dim=1)

    return input

In [76]:
model=model.to("cpu")
tokens = generate_output_txt(model,text_to_tokens("The sun rises in",tokenizer), max_new_tokens=15,context_len=GPT_CONFIG_124M['context_len'],temperature=1.4,top_k=25)
print(tokens_to_text(tokens,tokenizer))

The sun rises in," out chucked his own. And had only that lifted the that my
