In [2]:
from gensim.models import Word2Vec
import torch


In [3]:
words=["the","sun","rises","in","the","east"]
model=Word2Vec([words],min_count=1,vector_size=3)


In [4]:
print(model.wv['sun'])  # Example to get the vector for the word 'sun'

[-0.15122044  0.21846838 -0.16200535]


In [5]:
inputs = torch.tensor(model.wv[words])  # Convert words to their corresponding vectors
print(inputs)

tensor([[-0.0179,  0.0079,  0.1701],
        [-0.1512,  0.2185, -0.1620],
        [-0.1254,  0.2460, -0.0511],
        [ 0.2153,  0.2991, -0.1672],
        [-0.0179,  0.0079,  0.1701],
        [ 0.3003, -0.3101, -0.2372]])


In [6]:
query = inputs[1]  # 'sun'
print(inputs.shape)
scores = torch.empty(len(words))
for i,x in enumerate(inputs):
    scores[i] = torch.dot(query, x)
print(scores)

torch.Size([6, 3])
tensor([-0.0231,  0.0968,  0.0810,  0.0599, -0.0231, -0.0747])


In [7]:
weights = scores/torch.sum(scores)
print(weights)

weights_softmax = torch.softmax(scores, dim=0)
print(weights_softmax)

tensor([-0.1982,  0.8297,  0.6940,  0.5130, -0.1982, -0.6403])
tensor([0.1594, 0.1797, 0.1769, 0.1732, 0.1594, 0.1514])


In [8]:
context_vec2 = torch.zeros(3)
for i, w in enumerate(weights_softmax):
    context_vec2 += w * inputs[i]
print(context_vec2)

tensor([ 0.0277,  0.0902, -0.0488])


In [9]:
attention_scores = torch.matmul(inputs, inputs.T)
print(attention_scores)

tensor([[ 0.0293, -0.0231, -0.0045, -0.0299,  0.0293, -0.0482],
        [-0.0231,  0.0968,  0.0810,  0.0599, -0.0231, -0.0747],
        [-0.0045,  0.0810,  0.0789,  0.0551, -0.0045, -0.1018],
        [-0.0299,  0.0599,  0.0551,  0.1638, -0.0299,  0.0116],
        [ 0.0293, -0.0231, -0.0045, -0.0299,  0.0293, -0.0482],
        [-0.0482, -0.0747, -0.1018,  0.0116, -0.0482,  0.2426]])


In [10]:
attention_weights = torch.softmax(attention_scores, dim=-1)
print(attention_weights)

tensor([[0.1729, 0.1641, 0.1672, 0.1630, 0.1729, 0.1600],
        [0.1594, 0.1797, 0.1769, 0.1732, 0.1594, 0.1514],
        [0.1627, 0.1773, 0.1769, 0.1727, 0.1627, 0.1476],
        [0.1553, 0.1699, 0.1691, 0.1885, 0.1553, 0.1619],
        [0.1729, 0.1641, 0.1672, 0.1630, 0.1729, 0.1600],
        [0.1582, 0.1541, 0.1499, 0.1680, 0.1582, 0.2116]])


In [11]:
print("Sums to 1:", attention_weights.sum(dim=-1))

Sums to 1: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [12]:
contexts_vecs = torch.matmul(attention_weights, inputs)
print(contexts_vecs)

tensor([[ 0.0312,  0.0788, -0.0415],
        [ 0.0277,  0.0902, -0.0488],
        [ 0.0267,  0.0907, -0.0463],
        [ 0.0367,  0.0873, -0.0532],
        [ 0.0312,  0.0788, -0.0415],
        [ 0.0519,  0.0577, -0.0571]])


Simple self attention mechanism with trainable weights


In [13]:
print(inputs)
w_query=torch.nn.Parameter(torch.randn(inputs.shape[1],2),requires_grad=True)  
w_key=torch.nn.Parameter(torch.randn(inputs.shape[1],2),requires_grad=True)  
w_value=torch.nn.Parameter(torch.randn(inputs.shape[1],2),requires_grad=True)
print(w_key,w_value,w_query)

tensor([[-0.0179,  0.0079,  0.1701],
        [-0.1512,  0.2185, -0.1620],
        [-0.1254,  0.2460, -0.0511],
        [ 0.2153,  0.2991, -0.1672],
        [-0.0179,  0.0079,  0.1701],
        [ 0.3003, -0.3101, -0.2372]])
Parameter containing:
tensor([[-0.4112,  3.2311],
        [ 0.4905,  0.2391],
        [-0.1452, -0.0673]], requires_grad=True) Parameter containing:
tensor([[ 0.8338, -1.4040],
        [-0.2339,  0.0907],
        [-2.3393,  0.0218]], requires_grad=True) Parameter containing:
tensor([[0.6598, 1.4077],
        [0.9107, 0.2272],
        [0.4776, 0.7335]], requires_grad=True)


In [14]:
query2 = torch.matmul(inputs, w_query)
keys2 = torch.matmul(inputs, w_key)
values2 = torch.matmul(inputs, w_value)
print(query2,keys2,values2)

tensor([[ 0.0766,  0.1014],
        [ 0.0218, -0.2821],
        [ 0.1169, -0.1582],
        [ 0.3346,  0.2484],
        [ 0.0766,  0.1014],
        [-0.1975,  0.1783]], grad_fn=<MmBackward0>) tensor([[-0.0135, -0.0673],
        [ 0.1929, -0.4255],
        [ 0.1797, -0.3431],
        [ 0.0825,  0.7784],
        [-0.0135, -0.0673],
        [-0.2411,  0.9122]], grad_fn=<MmBackward0>) tensor([[-0.4147,  0.0295],
        [ 0.2018,  0.2286],
        [-0.0426,  0.1973],
        [ 0.5007, -0.2788],
        [-0.4147,  0.0295],
        [ 0.8779, -0.4549]], grad_fn=<MmBackward0>)


In [15]:
attention_scores2 = torch.matmul(query2, keys2.T)
print(attention_scores2)
attention_scores2norm = torch.softmax(attention_scores2/(2**0.5), dim=-1)
print(attention_scores2norm)

tensor([[-0.0079, -0.0284, -0.0210,  0.0853, -0.0079,  0.0740],
        [ 0.0187,  0.1242,  0.1007, -0.2178,  0.0187, -0.2625],
        [ 0.0091,  0.0898,  0.0753, -0.1135,  0.0091, -0.1725],
        [-0.0212, -0.0412, -0.0251,  0.2210, -0.0212,  0.1459],
        [-0.0079, -0.0284, -0.0210,  0.0853, -0.0079,  0.0740],
        [-0.0093, -0.1139, -0.0967,  0.1225, -0.0093,  0.2102]],
       grad_fn=<MmBackward0>)
tensor([[0.1638, 0.1615, 0.1623, 0.1750, 0.1638, 0.1736],
        [0.1723, 0.1857, 0.1826, 0.1458, 0.1723, 0.1413],
        [0.1694, 0.1794, 0.1775, 0.1553, 0.1694, 0.1490],
        [0.1588, 0.1566, 0.1584, 0.1885, 0.1588, 0.1788],
        [0.1638, 0.1615, 0.1623, 0.1750, 0.1638, 0.1736],
        [0.1630, 0.1514, 0.1533, 0.1789, 0.1630, 0.1904]],
       grad_fn=<SoftmaxBackward0>)


In [16]:
class selfattention(torch.nn.Module):
    def __init__(self, input_dim, out_dim):
        super().__init__()
        self.w_query = torch.nn.Parameter(torch.randn(input_dim, out_dim), requires_grad=True)
        self.w_key = torch.nn.Parameter(torch.randn(input_dim, out_dim), requires_grad=True)
        self.w_value = torch.nn.Parameter(torch.randn(input_dim, out_dim), requires_grad=True)

    def forward(self, x):
        queries = torch.matmul(x, self.w_query)
        keys = torch.matmul(x, self.w_key)
        values = torch.matmul(x, self.w_value)

        attention_scores = torch.matmul(queries, keys.T) 
        attention_weights = torch.softmax(attention_scores/(keys.shape[1]**0.5), dim=-1)

        output = torch.matmul(attention_weights, values)
        return output

In [17]:
test_attention = selfattention(input_dim=3, out_dim=2)
output = test_attention(inputs)
print(output)

tensor([[0.0107, 0.0238],
        [0.0165, 0.0208],
        [0.0148, 0.0208],
        [0.0217, 0.0250],
        [0.0107, 0.0238],
        [0.0205, 0.0308]], grad_fn=<MmBackward0>)


In [18]:
class selfattentionv2(torch.nn.Module):
    def __init__(self, input_dim, out_dim,qkv_bias=False):
        super().__init__()
        self.w_query = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_key = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_value = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)

    def forward(self, x):
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_value(x)

        attention_scores = torch.matmul(queries, keys.T) 
        attention_weights = torch.softmax(attention_scores/(keys.shape[1]**0.5), dim=-1)

        output = torch.matmul(attention_weights, values)
        return output,attention_weights

In [19]:
test2 = selfattentionv2(input_dim=3, out_dim=2)
output2,weights = test2.forward(inputs)
print(weights)

tensor([[0.1669, 0.1673, 0.1676, 0.1668, 0.1669, 0.1644],
        [0.1654, 0.1665, 0.1651, 0.1650, 0.1654, 0.1725],
        [0.1656, 0.1672, 0.1659, 0.1650, 0.1656, 0.1707],
        [0.1656, 0.1681, 0.1667, 0.1648, 0.1656, 0.1691],
        [0.1669, 0.1673, 0.1676, 0.1668, 0.1669, 0.1644],
        [0.1676, 0.1652, 0.1665, 0.1685, 0.1676, 0.1647]],
       grad_fn=<SoftmaxBackward0>)


<b>Causal Attention Mechanism

In [20]:
mask = torch.tril(torch.ones(weights.shape))
print(mask)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])


In [21]:
masked_weights = weights * mask
print(masked_weights)

tensor([[0.1669, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1654, 0.1665, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1656, 0.1672, 0.1659, 0.0000, 0.0000, 0.0000],
        [0.1656, 0.1681, 0.1667, 0.1648, 0.0000, 0.0000],
        [0.1669, 0.1673, 0.1676, 0.1668, 0.1669, 0.0000],
        [0.1676, 0.1652, 0.1665, 0.1685, 0.1676, 0.1647]],
       grad_fn=<MulBackward0>)


In [22]:
row_sums = masked_weights.sum(dim=-1, keepdim=True)
normalized_masked_weights = masked_weights / row_sums 
print(normalized_masked_weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4983, 0.5017, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3320, 0.3353, 0.3326, 0.0000, 0.0000, 0.0000],
        [0.2490, 0.2527, 0.2506, 0.2477, 0.0000, 0.0000],
        [0.1998, 0.2003, 0.2006, 0.1996, 0.1998, 0.0000],
        [0.1676, 0.1652, 0.1665, 0.1685, 0.1676, 0.1647]],
       grad_fn=<DivBackward0>)


In [23]:
mask = torch.triu(torch.ones(weights.shape), diagonal=1)
masked=weights.masked_fill(mask.bool(), float('-inf'))
print(masked)

tensor([[0.1669,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1654, 0.1665,   -inf,   -inf,   -inf,   -inf],
        [0.1656, 0.1672, 0.1659,   -inf,   -inf,   -inf],
        [0.1656, 0.1681, 0.1667, 0.1648,   -inf,   -inf],
        [0.1669, 0.1673, 0.1676, 0.1668, 0.1669,   -inf],
        [0.1676, 0.1652, 0.1665, 0.1685, 0.1676, 0.1647]],
       grad_fn=<MaskedFillBackward0>)


In [24]:
weights = torch.softmax(masked/keys2.shape[-1]**0.5, dim=1)
print(weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4998, 0.5002, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3332, 0.3336, 0.3333, 0.0000, 0.0000, 0.0000],
        [0.2499, 0.2503, 0.2501, 0.2497, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2001, 0.2000, 0.2000, 0.0000],
        [0.1668, 0.1665, 0.1666, 0.1669, 0.1668, 0.1664]],
       grad_fn=<SoftmaxBackward0>)


<b>Attention weights with dropout

In [25]:
torch.manual_seed(123)
dropout = torch.nn.Dropout(p=0.5)
dropped_weights = dropout(weights)
print(dropped_weights)

tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 1.0004, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.6665, 0.0000, 0.0000, 0.0000],
        [0.4998, 0.5006, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3999, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.3330, 0.0000, 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


<b>Causal Attention Mechanism Class

In [26]:
class causalattention(torch.nn.Module):
    def __init__(self, input_dim, out_dim,context_len,dropout,qkv_bias=False):
        super().__init__()
        self.w_query = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_key = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_value = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_len, context_len), diagonal=1))
    def forward(self, x):
        b,num_tokens,input_dim = x.shape
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_value(x)

        attention_scores = torch.matmul(queries, keys.transpose(1,2))
        attention_scores=attention_scores.masked_fill(self.mask.bool()[:num_tokens,:num_tokens], float('-inf')) 
        attention_weights = torch.softmax(attention_scores/(keys.shape[1]**0.5), dim=-1)
        attention_weights = self.dropout(attention_weights)
        output = torch.matmul(attention_weights, values)
        return output

In [27]:
batch = torch.stack((inputs,inputs),dim=0)  # Create a batch of 2 identical sequences
print(batch.shape)
test3 = causalattention(input_dim=3, out_dim=2,context_len=6,dropout=0.5)
contexts_vecs = test3.forward(batch)
print(contexts_vecs)


torch.Size([2, 6, 3])
tensor([[[ 0.0000,  0.0000],
         [ 0.0000,  0.0000],
         [ 0.0055,  0.2110],
         [-0.0492,  0.1841],
         [-0.0018,  0.0584],
         [-0.0016,  0.0483]],

        [[-0.0294, -0.0246],
         [ 0.0254,  0.1736],
         [ 0.0168,  0.1152],
         [ 0.0125,  0.0858],
         [-0.0437,  0.0797],
         [-0.0355, -0.0025]]], grad_fn=<UnsafeViewBackward0>)


<b> Multi Head attention mechanism


In [28]:
class Multiheadattentionv1(torch.nn.Module):
    def __init__(self, input_dim, out_dim, num_heads, context_len, dropout, qkv_bias=False):
        super().__init__()
        self.num_heads = num_heads
        self.attention_heads = torch.nn.ModuleList([
            causalattention(input_dim, out_dim, context_len, dropout, qkv_bias) 
            for _ in range(num_heads)
        ])

    def forward(self, x):
        return torch.cat([head(x) for head in self.attention_heads], dim=-1)

In [29]:
test4 = Multiheadattentionv1(input_dim=3, out_dim=2, num_heads=2, context_len=6, dropout=0.5)
contexts_vecs_multihead = test4.forward(batch)
print(contexts_vecs_multihead)

tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0155, -0.0989, -0.0257,  0.0369],
         [-0.0426, -0.0455, -0.0035, -0.0393],
         [ 0.0407, -0.0665, -0.0009, -0.0094],
         [ 0.0250, -0.1343, -0.0020, -0.0236],
         [ 0.0415, -0.0551,  0.0331, -0.0256]],

        [[ 0.1580, -0.0613,  0.0000,  0.0000],
         [-0.0637, -0.0681, -0.0257,  0.0369],
         [ 0.0119, -0.1339, -0.0183,  0.0120],
         [-0.0095, -0.1013, -0.0035, -0.0390],
         [ 0.0250, -0.1343, -0.0102,  0.0147],
         [ 0.0049, -0.0333,  0.0029,  0.0358]]], grad_fn=<CatBackward0>)


<b>Multi head attention with weight splits

In [30]:
class Multiheadattentionv2(torch.nn.Module):
    def __init__(self, input_dim, out_dim, num_heads, context_len, dropout, qkv_bias=False):
        super().__init__()
        assert out_dim % num_heads == 0, "out_dim must be divisible by num_heads"
        self.out_dim = out_dim
        self.num_heads = num_heads
        self.head_dim = out_dim // num_heads
        self.w_query = torch.nn.Linear(input_dim, out_dim, bias=qkv_bias)
        self.w_key = torch.nn.Linear(input_dim, out_dim, bias=qkv_bias)
        self.w_value = torch.nn.Linear(input_dim, out_dim, bias=qkv_bias)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_len, context_len), diagonal=1))

    def forward(self, x):
        b, num_tokens, input_dim = x.shape
        
        # Split and reshape for multi-head attention
        queries = self.w_query(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        keys = self.w_key(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        values = self.w_value(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        
        attention_scores = torch.matmul(queries, keys.transpose(2, 3))
        attention_scores = attention_scores.masked_fill(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        attention_weights = torch.softmax(attention_scores/keys.shape[-1]**0.5, dim=-1)
        attention_weights = self.dropout(attention_weights)

        output = torch.matmul(attention_weights, values).transpose(1, 2).contiguous().view(b, num_tokens, self.out_dim)
        return output

In [31]:
words=["the","sun","rises","in","the","east"]
model=Word2Vec([words],min_count=1,vector_size=6)
inputs = torch.tensor(model.wv[words])  # Convert words to their corresponding vectors
print(inputs)
batch = torch.stack((inputs,inputs),dim=0)  # Create a batch of 2 identical sequences
print(batch.shape)
test4 = Multiheadattentionv2(input_dim=6, out_dim=6, num_heads=2, context_len=6, dropout=0.5)
contexts_vecs_multihead = test4.forward(batch)
print(contexts_vecs_multihead)

tensor([[-0.0089,  0.0039,  0.0851,  0.1502, -0.1550, -0.1186],
        [ 0.1058, -0.0568, -0.0158,  0.0961, -0.1254, -0.0656],
        [-0.1381, -0.1575,  0.1219,  0.0845,  0.1126,  0.0127],
        [-0.0756,  0.1092, -0.0810, -0.0303,  0.0479,  0.0165],
        [-0.0089,  0.0039,  0.0851,  0.1502, -0.1550, -0.1186],
        [ 0.1076,  0.1495, -0.0836, -0.0627,  0.1230, -0.0256]])
torch.Size([2, 6, 6])
tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0037, -0.0019,  0.0136, -0.0799,  0.0117, -0.0673],
         [-0.0152,  0.0084,  0.0005, -0.0534,  0.0078, -0.0450],
         [-0.0286, -0.0125, -0.0564, -0.0629,  0.0240, -0.0547],
         [-0.0134, -0.0146, -0.0363, -0.0235,  0.0160,  0.0019],
         [-0.0097, -0.0127, -0.0251, -0.0686,  0.0199, -0.0589]],

        [[ 0.0074, -0.0038,  0.0272,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000, -0.0799,  0.0117, -0.0673],
         [-0.0152,  0.0084,  0.0005,  0.0000,  0.0000,  0.0000],
      

<b>Implementing GPT from scratch

In [32]:
GPT_CONFIG_124M={
    "vocab_size": 50257,
    "context_len": 1024,
    "n_embd": 768,
    "n_layer": 12,
    "n_head": 12,
    "dropout": 0.1,
    "qkv_bias": False
}

In [33]:
import torch
class DummyGPTModel(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(config['vocab_size'], config['n_embd'])
        self.position_embedding = torch.nn.Embedding(config['context_len'], config['n_embd'])
        self.dropout = torch.nn.Dropout(config['dropout'])
        
        #Placeholder for the actual transformer blocks
        self.trf = torch.nn.Sequential(
            *[DummyTransformer(config) for _ in range(config['n_layer'])])
        
        # Final layer norm
        self.final_norm = DummyLayerNorm(config['n_embd'])
        self.head = torch.nn.Linear(config['n_embd'], config['vocab_size'], bias=False)
        

    def forward(self, idx):
        b, t = idx.shape
        token_embeddings = self.token_embedding(idx)
        position_embeddings = self.position_embedding(torch.arange(t, device=idx.device))
        
        x = token_embeddings + position_embeddings
        x = self.dropout(x)

        x = self.trf(x)
        x = self.final_norm(x)
        logits = self.head(x)
        
        return logits

class DummyTransformer(torch.nn.Module):
    def __init__(self,config):
        super().__init__()
    def forward(self,x):
        return x
class DummyLayerNorm(torch.nn.Module):
    def __init__(self,shape,eps=1e-5):
        super().__init__()
    def forward(self,x):
        return x

<b> Tokenization:

In [34]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
batch=[]
txt1 = "The sun rises in the"
txt2 = "The sun sets in the"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch)

tensor([[  464,  4252, 16736,   287,   262],
        [  464,  4252,  5621,   287,   262]])


<b>Instance of DummyGPTModel:

In [35]:
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print(logits.shape)
print(logits)

torch.Size([2, 5, 50257])
tensor([[[ 0.5209,  0.5520, -0.1626,  ..., -0.6756,  0.4891,  1.0015],
         [ 0.2441,  1.2358,  0.7340,  ...,  0.8618,  0.6503,  0.6837],
         [-0.1495,  0.3894, -0.2040,  ...,  0.3658, -0.6975,  0.0046],
         [-0.4361,  2.2998, -0.8146,  ...,  1.0815, -0.0610,  0.4576],
         [-0.6581, -0.4606,  0.6273,  ...,  0.6359, -1.1219, -0.1483]],

        [[ 0.6502,  0.6544, -0.4624,  ..., -0.6154,  0.2181,  1.1435],
         [ 0.4852,  1.2206, -0.0328,  ...,  0.7643,  0.6751,  0.7432],
         [ 0.4855,  0.2990, -0.0415,  ..., -0.3032,  0.9354,  0.3412],
         [-0.1921,  1.6800, -0.7408,  ...,  0.7503,  0.0276,  0.4491],
         [-0.4396, -0.8658,  0.2206,  ...,  0.7252, -1.3121,  0.0530]]],
       grad_fn=<UnsafeViewBackward0>)


<b> Layer Normalization:

In [36]:
class LayerNorm(torch.nn.Module):
    def __init__(self, n_embd, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = torch.nn.Parameter(torch.ones(n_embd))
        self.beta = torch.nn.Parameter(torch.zeros(n_embd))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        x_normalized = (x - mean) / torch.sqrt(variance + self.eps)
        return self.gamma * x_normalized + self.beta

In [37]:
torch.manual_seed(123)
batch = torch.randn(2, 6)
ln = LayerNorm(n_embd=6)
output = ln.forward(batch)
mean = output.mean(dim=-1,keepdim=True)
var = output.var(dim=-1,keepdim=True)
print(mean)
print(var)
print(output)

tensor([[6.1467e-08],
        [6.9539e-08]], grad_fn=<MeanBackward1>)
tensor([[1.1999],
        [1.1999]], grad_fn=<VarBackward0>)
tensor([[ 0.3324,  0.8349, -0.2272,  0.0529, -2.0204,  1.0276],
        [-1.4542, -0.9964,  1.2767,  0.3657,  1.0374, -0.2291]],
       grad_fn=<AddBackward0>)


<b> GELU Activation function:

In [38]:
import torch
class GELU(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi)) * (x + 0.044715 * (x ** 3))))

In [39]:
class FeedForward(torch.nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(cfg['n_embd'], cfg['n_embd']*4),
            GELU(),
            torch.nn.Linear(cfg['n_embd']*4, cfg['n_embd']),
        )

    def forward(self, x):
        return self.layers(x)


In [40]:
ffn = FeedForward(GPT_CONFIG_124M)
batch = torch.randn(2, 6, 768)
output = ffn.forward(batch)
print(output.shape)

torch.Size([2, 6, 768])


<b>Transformers:

In [41]:
class transformer_block(torch.nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.ln1 = LayerNorm(cfg['n_embd'])
        self.attn = Multiheadattentionv2(cfg['n_embd'], cfg['n_embd'], cfg['n_head'], cfg['context_len'], cfg['dropout'], cfg['qkv_bias'])
        self.ln2 = LayerNorm(cfg['n_embd'])
        self.ffn = FeedForward(cfg)
        self.dropout = torch.nn.Dropout(cfg['dropout'])
        

    def forward(self, x):
        shortcut = x
        x = self.ln1(x)
        x = self.attn(x)
        x = self.dropout(x)
        x = x + shortcut
        
        shortcut = x
        x = self.ln2(x)
        x = self.ffn(x)
        x = self.dropout(x)
        x = x + shortcut
        return x

In [42]:
input = torch.randn(2, 6, 768)
block = transformer_block(GPT_CONFIG_124M)
output = block.forward(input)
print(input.shape)
print(output.shape)

torch.Size([2, 6, 768])
torch.Size([2, 6, 768])
