In [1]:
from gensim.models import Word2Vec
import torch


In [2]:
words=["the","sun","rises","in","the","east"]
model=Word2Vec([words],min_count=1,vector_size=3)


In [3]:
print(model.wv['sun'])  # Example to get the vector for the word 'sun'

[-0.15122044  0.21846838 -0.16200535]


In [4]:
inputs = torch.tensor(model.wv[words])  # Convert words to their corresponding vectors
print(inputs)

tensor([[-0.0179,  0.0079,  0.1701],
        [-0.1512,  0.2185, -0.1620],
        [-0.1254,  0.2460, -0.0511],
        [ 0.2153,  0.2991, -0.1672],
        [-0.0179,  0.0079,  0.1701],
        [ 0.3003, -0.3101, -0.2372]])


In [5]:
query = inputs[1]  # 'sun'
print(inputs.shape)
scores = torch.empty(len(words))
for i,x in enumerate(inputs):
    scores[i] = torch.dot(query, x)
print(scores)

torch.Size([6, 3])
tensor([-0.0231,  0.0968,  0.0810,  0.0599, -0.0231, -0.0747])


In [6]:
weights = scores/torch.sum(scores)
print(weights)

weights_softmax = torch.softmax(scores, dim=0)
print(weights_softmax)

tensor([-0.1982,  0.8297,  0.6940,  0.5130, -0.1982, -0.6403])
tensor([0.1594, 0.1797, 0.1769, 0.1732, 0.1594, 0.1514])


In [7]:
context_vec2 = torch.zeros(3)
for i, w in enumerate(weights_softmax):
    context_vec2 += w * inputs[i]
print(context_vec2)

tensor([ 0.0277,  0.0902, -0.0488])


In [8]:
attention_scores = torch.matmul(inputs, inputs.T)
print(attention_scores)

tensor([[ 0.0293, -0.0231, -0.0045, -0.0299,  0.0293, -0.0482],
        [-0.0231,  0.0968,  0.0810,  0.0599, -0.0231, -0.0747],
        [-0.0045,  0.0810,  0.0789,  0.0551, -0.0045, -0.1018],
        [-0.0299,  0.0599,  0.0551,  0.1638, -0.0299,  0.0116],
        [ 0.0293, -0.0231, -0.0045, -0.0299,  0.0293, -0.0482],
        [-0.0482, -0.0747, -0.1018,  0.0116, -0.0482,  0.2426]])


In [9]:
attention_weights = torch.softmax(attention_scores, dim=-1)
print(attention_weights)

tensor([[0.1729, 0.1641, 0.1672, 0.1630, 0.1729, 0.1600],
        [0.1594, 0.1797, 0.1769, 0.1732, 0.1594, 0.1514],
        [0.1627, 0.1773, 0.1769, 0.1727, 0.1627, 0.1476],
        [0.1553, 0.1699, 0.1691, 0.1885, 0.1553, 0.1619],
        [0.1729, 0.1641, 0.1672, 0.1630, 0.1729, 0.1600],
        [0.1582, 0.1541, 0.1499, 0.1680, 0.1582, 0.2116]])


In [10]:
print("Sums to 1:", attention_weights.sum(dim=-1))

Sums to 1: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000])


In [11]:
contexts_vecs = torch.matmul(attention_weights, inputs)
print(contexts_vecs)

tensor([[ 0.0312,  0.0788, -0.0415],
        [ 0.0277,  0.0902, -0.0488],
        [ 0.0267,  0.0907, -0.0463],
        [ 0.0367,  0.0873, -0.0532],
        [ 0.0312,  0.0788, -0.0415],
        [ 0.0519,  0.0577, -0.0571]])


Simple self attention mechanism with trainable weights


In [12]:
print(inputs)
w_query=torch.nn.Parameter(torch.randn(inputs.shape[1],2),requires_grad=True)  
w_key=torch.nn.Parameter(torch.randn(inputs.shape[1],2),requires_grad=True)  
w_value=torch.nn.Parameter(torch.randn(inputs.shape[1],2),requires_grad=True)
print(w_key,w_value,w_query)

tensor([[-0.0179,  0.0079,  0.1701],
        [-0.1512,  0.2185, -0.1620],
        [-0.1254,  0.2460, -0.0511],
        [ 0.2153,  0.2991, -0.1672],
        [-0.0179,  0.0079,  0.1701],
        [ 0.3003, -0.3101, -0.2372]])
Parameter containing:
tensor([[-0.0752,  0.1587],
        [-0.9577,  0.2083],
        [-1.0922,  1.6647]], requires_grad=True) Parameter containing:
tensor([[ 1.8339, -1.5779],
        [ 0.0455,  0.4731],
        [ 0.2565, -1.3649]], requires_grad=True) Parameter containing:
tensor([[-0.0183,  1.8188],
        [-1.9143,  0.5137],
        [ 1.2977,  1.2557]], requires_grad=True)


In [13]:
query2 = torch.matmul(inputs, w_query)
keys2 = torch.matmul(inputs, w_key)
values2 = torch.matmul(inputs, w_value)
print(query2,keys2,values2)

tensor([[ 0.2060,  0.1852],
        [-0.6257, -0.3662],
        [-0.5350, -0.1660],
        [-0.7935,  0.3353],
        [ 0.2060,  0.1852],
        [ 0.2803,  0.0890]], grad_fn=<MmBackward0>) tensor([[-0.1920,  0.2820],
        [-0.0209, -0.2482],
        [-0.1703, -0.0538],
        [-0.1200, -0.1818],
        [-0.1920,  0.2820],
        [ 0.5335, -0.4118]], grad_fn=<MmBackward0>) tensor([[ 0.0112, -0.2003],
        [-0.3089,  0.5631],
        [-0.2320,  0.3841],
        [ 0.3656,  0.0300],
        [ 0.0112, -0.2003],
        [ 0.4758, -0.2968]], grad_fn=<MmBackward0>)


In [14]:
attention_scores2 = torch.matmul(query2, keys2.T)
print(attention_scores2)
attention_scores2norm = torch.softmax(attention_scores2/(2**0.5), dim=-1)
print(attention_scores2norm)

tensor([[ 0.0127, -0.0503, -0.0450, -0.0584,  0.0127,  0.0336],
        [ 0.0169,  0.1040,  0.1263,  0.1417,  0.0169, -0.1830],
        [ 0.0559,  0.0524,  0.1001,  0.0944,  0.0559, -0.2171],
        [ 0.2469, -0.0666,  0.1171,  0.0343,  0.2469, -0.5614],
        [ 0.0127, -0.0503, -0.0450, -0.0584,  0.0127,  0.0336],
        [-0.0287, -0.0279, -0.0525, -0.0498, -0.0287,  0.1129]],
       grad_fn=<MmBackward0>)
tensor([[0.1700, 0.1626, 0.1632, 0.1617, 0.1700, 0.1725],
        [0.1638, 0.1742, 0.1770, 0.1789, 0.1638, 0.1422],
        [0.1700, 0.1696, 0.1754, 0.1747, 0.1700, 0.1402],
        [0.1946, 0.1559, 0.1775, 0.1674, 0.1946, 0.1099],
        [0.1700, 0.1626, 0.1632, 0.1617, 0.1700, 0.1725],
        [0.1646, 0.1647, 0.1619, 0.1622, 0.1646, 0.1820]],
       grad_fn=<SoftmaxBackward0>)


In [15]:
class selfattention(torch.nn.Module):
    def __init__(self, input_dim, out_dim):
        super().__init__()
        self.w_query = torch.nn.Parameter(torch.randn(input_dim, out_dim), requires_grad=True)
        self.w_key = torch.nn.Parameter(torch.randn(input_dim, out_dim), requires_grad=True)
        self.w_value = torch.nn.Parameter(torch.randn(input_dim, out_dim), requires_grad=True)

    def forward(self, x):
        queries = torch.matmul(x, self.w_query)
        keys = torch.matmul(x, self.w_key)
        values = torch.matmul(x, self.w_value)

        attention_scores = torch.matmul(queries, keys.T) 
        attention_weights = torch.softmax(attention_scores/(keys.shape[1]**0.5), dim=-1)

        output = torch.matmul(attention_weights, values)
        return output

In [16]:
test_attention = selfattention(input_dim=3, out_dim=2)
output = test_attention(inputs)
print(output)

tensor([[0.1446, 0.0182],
        [0.1522, 0.0161],
        [0.1489, 0.0136],
        [0.1517, 0.0200],
        [0.1446, 0.0182],
        [0.1518, 0.0351]], grad_fn=<MmBackward0>)


In [17]:
class selfattentionv2(torch.nn.Module):
    def __init__(self, input_dim, out_dim,qkv_bias=False):
        super().__init__()
        self.w_query = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_key = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_value = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)

    def forward(self, x):
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_value(x)

        attention_scores = torch.matmul(queries, keys.T) 
        attention_weights = torch.softmax(attention_scores/(keys.shape[1]**0.5), dim=-1)

        output = torch.matmul(attention_weights, values)
        return output,attention_weights

In [18]:
test2 = selfattentionv2(input_dim=3, out_dim=2)
output2,weights = test2.forward(inputs)
print(weights)

tensor([[0.1670, 0.1665, 0.1668, 0.1668, 0.1670, 0.1659],
        [0.1656, 0.1677, 0.1669, 0.1666, 0.1656, 0.1677],
        [0.1659, 0.1673, 0.1668, 0.1666, 0.1659, 0.1675],
        [0.1669, 0.1651, 0.1651, 0.1662, 0.1669, 0.1699],
        [0.1670, 0.1665, 0.1668, 0.1668, 0.1670, 0.1659],
        [0.1674, 0.1653, 0.1658, 0.1665, 0.1674, 0.1676]],
       grad_fn=<SoftmaxBackward0>)


<b>Causal Attention Mechanism

In [19]:
mask = torch.tril(torch.ones(weights.shape))
print(mask)

tensor([[1., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1.]])


In [20]:
masked_weights = weights * mask
print(masked_weights)

tensor([[0.1670, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1656, 0.1677, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1659, 0.1673, 0.1668, 0.0000, 0.0000, 0.0000],
        [0.1669, 0.1651, 0.1651, 0.1662, 0.0000, 0.0000],
        [0.1670, 0.1665, 0.1668, 0.1668, 0.1670, 0.0000],
        [0.1674, 0.1653, 0.1658, 0.1665, 0.1674, 0.1676]],
       grad_fn=<MulBackward0>)


In [21]:
row_sums = masked_weights.sum(dim=-1, keepdim=True)
normalized_masked_weights = masked_weights / row_sums 
print(normalized_masked_weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4968, 0.5032, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3318, 0.3347, 0.3335, 0.0000, 0.0000, 0.0000],
        [0.2516, 0.2490, 0.2489, 0.2505, 0.0000, 0.0000],
        [0.2002, 0.1996, 0.2000, 0.1999, 0.2002, 0.0000],
        [0.1674, 0.1653, 0.1658, 0.1665, 0.1674, 0.1676]],
       grad_fn=<DivBackward0>)


In [22]:
mask = torch.triu(torch.ones(weights.shape), diagonal=1)
masked=weights.masked_fill(mask.bool(), float('-inf'))
print(masked)

tensor([[0.1670,   -inf,   -inf,   -inf,   -inf,   -inf],
        [0.1656, 0.1677,   -inf,   -inf,   -inf,   -inf],
        [0.1659, 0.1673, 0.1668,   -inf,   -inf,   -inf],
        [0.1669, 0.1651, 0.1651, 0.1662,   -inf,   -inf],
        [0.1670, 0.1665, 0.1668, 0.1668, 0.1670,   -inf],
        [0.1674, 0.1653, 0.1658, 0.1665, 0.1674, 0.1676]],
       grad_fn=<MaskedFillBackward0>)


In [23]:
weights = torch.softmax(masked/keys2.shape[-1]**0.5, dim=1)
print(weights)

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4996, 0.5004, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.3332, 0.3335, 0.3334, 0.0000, 0.0000, 0.0000],
        [0.2502, 0.2499, 0.2499, 0.2501, 0.0000, 0.0000],
        [0.2000, 0.2000, 0.2000, 0.2000, 0.2000, 0.0000],
        [0.1668, 0.1665, 0.1666, 0.1666, 0.1668, 0.1668]],
       grad_fn=<SoftmaxBackward0>)


<b>Attention weights with dropout

In [24]:
torch.manual_seed(123)
dropout = torch.nn.Dropout(p=0.5)
dropped_weights = dropout(weights)
print(dropped_weights)

tensor([[2.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 1.0008, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.0000, 0.6667, 0.0000, 0.0000, 0.0000],
        [0.5004, 0.4998, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4001, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0000, 0.3330, 0.0000, 0.0000, 0.0000, 0.0000]],
       grad_fn=<MulBackward0>)


<b>Causal Attention Mechanism Class

In [25]:
class causalattention(torch.nn.Module):
    def __init__(self, input_dim, out_dim,context_len,dropout,qkv_bias=False):
        super().__init__()
        self.w_query = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_key = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.w_value = torch.nn.Linear(input_dim, out_dim,bias=qkv_bias)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_len, context_len), diagonal=1))
    def forward(self, x):
        b,num_tokens,input_dim = x.shape
        queries = self.w_query(x)
        keys = self.w_key(x)
        values = self.w_value(x)

        attention_scores = torch.matmul(queries, keys.transpose(1,2))
        attention_scores=attention_scores.masked_fill(self.mask.bool()[:num_tokens,:num_tokens], float('-inf')) 
        attention_weights = torch.softmax(attention_scores/(keys.shape[1]**0.5), dim=-1)
        attention_weights = self.dropout(attention_weights)
        output = torch.matmul(attention_weights, values)
        return output

In [26]:
batch = torch.stack((inputs,inputs),dim=0)  # Create a batch of 2 identical sequences
print(batch.shape)
test3 = causalattention(input_dim=3, out_dim=2,context_len=6,dropout=0.5)
contexts_vecs = test3.forward(batch)
print(contexts_vecs)


torch.Size([2, 6, 3])
tensor([[[ 0.0000,  0.0000],
         [ 0.0000,  0.0000],
         [ 0.0055,  0.2110],
         [-0.0492,  0.1841],
         [-0.0018,  0.0584],
         [-0.0016,  0.0483]],

        [[-0.0294, -0.0246],
         [ 0.0254,  0.1736],
         [ 0.0168,  0.1152],
         [ 0.0125,  0.0858],
         [-0.0437,  0.0797],
         [-0.0355, -0.0025]]], grad_fn=<UnsafeViewBackward0>)


<b> Multi Head attention mechanism


In [27]:
class Multiheadattentionv1(torch.nn.Module):
    def __init__(self, input_dim, out_dim, num_heads, context_len, dropout, qkv_bias=False):
        super().__init__()
        self.num_heads = num_heads
        self.attention_heads = torch.nn.ModuleList([
            causalattention(input_dim, out_dim, context_len, dropout, qkv_bias) 
            for _ in range(num_heads)
        ])

    def forward(self, x):
        return torch.cat([head(x) for head in self.attention_heads], dim=-1)

In [28]:
test4 = Multiheadattentionv1(input_dim=3, out_dim=2, num_heads=2, context_len=6, dropout=0.5)
contexts_vecs_multihead = test4.forward(batch)
print(contexts_vecs_multihead)

tensor([[[ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0155, -0.0989, -0.0257,  0.0369],
         [-0.0426, -0.0455, -0.0035, -0.0393],
         [ 0.0407, -0.0665, -0.0009, -0.0094],
         [ 0.0250, -0.1343, -0.0020, -0.0236],
         [ 0.0415, -0.0551,  0.0331, -0.0256]],

        [[ 0.1580, -0.0613,  0.0000,  0.0000],
         [-0.0637, -0.0681, -0.0257,  0.0369],
         [ 0.0119, -0.1339, -0.0183,  0.0120],
         [-0.0095, -0.1013, -0.0035, -0.0390],
         [ 0.0250, -0.1343, -0.0102,  0.0147],
         [ 0.0049, -0.0333,  0.0029,  0.0358]]], grad_fn=<CatBackward0>)


<b>Multi head attention with weight splits

In [29]:
class Multiheadattentionv2(torch.nn.Module):
    def __init__(self, input_dim, out_dim, num_heads, context_len, dropout, qkv_bias=False):
        super().__init__()
        assert out_dim % num_heads == 0, "out_dim must be divisible by num_heads"
        self.out_dim = out_dim
        self.num_heads = num_heads
        self.head_dim = out_dim // num_heads
        self.w_query = torch.nn.Linear(input_dim, out_dim, bias=qkv_bias)
        self.w_key = torch.nn.Linear(input_dim, out_dim, bias=qkv_bias)
        self.w_value = torch.nn.Linear(input_dim, out_dim, bias=qkv_bias)
        self.dropout = torch.nn.Dropout(p=dropout)
        self.register_buffer("mask", torch.triu(torch.ones(context_len, context_len), diagonal=1))
        self.out=torch.nn.Linear(out_dim,out_dim)

    def forward(self, x):
        b, num_tokens, input_dim = x.shape
        
        # Split and reshape for multi-head attention
        queries = self.w_query(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        keys = self.w_key(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)
        values = self.w_value(x).view(b, num_tokens, self.num_heads, self.head_dim).transpose(1, 2)

        
        attention_scores = torch.matmul(queries, keys.transpose(2, 3))
        attention_scores = attention_scores.masked_fill(self.mask.bool()[:num_tokens, :num_tokens], -torch.inf)
        attention_weights = torch.softmax(attention_scores/keys.shape[-1]**0.5, dim=-1)
        attention_weights = self.dropout(attention_weights)

        output = torch.matmul(attention_weights, values).transpose(1, 2).contiguous().view(b, num_tokens, self.out_dim)
        output=self.out(output)
        return output

In [30]:
words=["the","sun","rises","in","the","east"]
model=Word2Vec([words],min_count=1,vector_size=6)
inputs = torch.tensor(model.wv[words])  # Convert words to their corresponding vectors
print(inputs)
batch = torch.stack((inputs,inputs),dim=0)  # Create a batch of 2 identical sequences
print(batch.shape)
test4 = Multiheadattentionv2(input_dim=6, out_dim=6, num_heads=2, context_len=6, dropout=0.5)
contexts_vecs_multihead = test4.forward(batch)
print(contexts_vecs_multihead)

tensor([[-0.0089,  0.0039,  0.0851,  0.1502, -0.1550, -0.1186],
        [ 0.1058, -0.0568, -0.0158,  0.0961, -0.1254, -0.0656],
        [-0.1381, -0.1575,  0.1219,  0.0845,  0.1126,  0.0127],
        [-0.0756,  0.1092, -0.0810, -0.0303,  0.0479,  0.0165],
        [-0.0089,  0.0039,  0.0851,  0.1502, -0.1550, -0.1186],
        [ 0.1076,  0.1495, -0.0836, -0.0627,  0.1230, -0.0256]])
torch.Size([2, 6, 6])
tensor([[[ 0.1273, -0.1444, -0.3163,  0.0027,  0.0074,  0.0082],
         [ 0.1273, -0.1444, -0.3163,  0.0027,  0.0074,  0.0082],
         [ 0.1270, -0.1311, -0.2984,  0.0088, -0.0030,  0.0049],
         [ 0.1215, -0.1003, -0.3155,  0.0162,  0.0383,  0.0058],
         [ 0.1428, -0.1302, -0.3212, -0.0088,  0.0352,  0.0240],
         [ 0.1483, -0.1352, -0.3106, -0.0075,  0.0210,  0.0213]],

        [[ 0.1273, -0.1444, -0.3163,  0.0027,  0.0074,  0.0082],
         [ 0.1308, -0.1435, -0.3236,  0.0080,  0.0105,  0.0022],
         [ 0.1135, -0.0763, -0.3046,  0.0431,  0.0349, -0.0150],
      

<b>Implementing Dummy GPT

In [31]:
GPT_CONFIG_124M={
    "vocab_size": 50257,
    "context_len": 1024,
    "n_embd": 768,
    "n_layer": 12,
    "n_head": 12,
    "dropout": 0.1,
    "qkv_bias": False
}

In [32]:
import torch
class DummyGPTModel(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(config['vocab_size'], config['n_embd'])
        self.position_embedding = torch.nn.Embedding(config['context_len'], config['n_embd'])
        self.dropout = torch.nn.Dropout(config['dropout'])
        
        #Placeholder for the actual transformer blocks
        self.trf = torch.nn.Sequential(
            *[DummyTransformer(config) for _ in range(config['n_layer'])])
        
        # Final layer norm
        self.final_norm = DummyLayerNorm(config['n_embd'])
        self.head = torch.nn.Linear(config['n_embd'], config['vocab_size'], bias=False)
        

    def forward(self, idx):
        b, t = idx.shape
        token_embeddings = self.token_embedding(idx)
        position_embeddings = self.position_embedding(torch.arange(t, device=idx.device))
        
        x = token_embeddings + position_embeddings
        x = self.dropout(x)

        x = self.trf(x)
        x = self.final_norm(x)
        logits = self.head(x)
        
        return logits

class DummyTransformer(torch.nn.Module):
    def __init__(self,config):
        super().__init__()
    def forward(self,x):
        return x
class DummyLayerNorm(torch.nn.Module):
    def __init__(self,shape,eps=1e-5):
        super().__init__()
    def forward(self,x):
        return x

<b> Tokenization:

In [33]:
import tiktoken

tokenizer = tiktoken.get_encoding("gpt2")
batch=[]
txt1 = "The sun rises in the"
txt2 = "The sun sets in the"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
print(batch)

tensor([[  464,  4252, 16736,   287,   262],
        [  464,  4252,  5621,   287,   262]])


<b>Instance of DummyGPTModel:

In [34]:
torch.manual_seed(123)
model = DummyGPTModel(GPT_CONFIG_124M)
logits = model(batch)
print(logits.shape)
print(logits)

torch.Size([2, 5, 50257])
tensor([[[ 0.5209,  0.5520, -0.1626,  ..., -0.6756,  0.4891,  1.0015],
         [ 0.2441,  1.2358,  0.7340,  ...,  0.8618,  0.6503,  0.6837],
         [-0.1495,  0.3894, -0.2040,  ...,  0.3658, -0.6975,  0.0046],
         [-0.4361,  2.2998, -0.8146,  ...,  1.0815, -0.0610,  0.4576],
         [-0.6581, -0.4606,  0.6273,  ...,  0.6359, -1.1219, -0.1483]],

        [[ 0.6502,  0.6544, -0.4624,  ..., -0.6154,  0.2181,  1.1435],
         [ 0.4852,  1.2206, -0.0328,  ...,  0.7643,  0.6751,  0.7432],
         [ 0.4855,  0.2990, -0.0415,  ..., -0.3032,  0.9354,  0.3412],
         [-0.1921,  1.6800, -0.7408,  ...,  0.7503,  0.0276,  0.4491],
         [-0.4396, -0.8658,  0.2206,  ...,  0.7252, -1.3121,  0.0530]]],
       grad_fn=<UnsafeViewBackward0>)


<b> Layer Normalization:

In [35]:
class LayerNorm(torch.nn.Module):
    def __init__(self, n_embd, eps=1e-5):
        super().__init__()
        self.eps = eps
        self.gamma = torch.nn.Parameter(torch.ones(n_embd))
        self.beta = torch.nn.Parameter(torch.zeros(n_embd))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        variance = x.var(dim=-1, keepdim=True, unbiased=False)
        x_normalized = (x - mean) / torch.sqrt(variance + self.eps)
        return self.gamma * x_normalized + self.beta

In [36]:
torch.manual_seed(123)
batch = torch.randn(2, 6)
ln = LayerNorm(n_embd=6)
output = ln.forward(batch)
mean = output.mean(dim=-1,keepdim=True)
var = output.var(dim=-1,keepdim=True)
print(mean)
print(var)
print(output)

tensor([[6.1467e-08],
        [6.9539e-08]], grad_fn=<MeanBackward1>)
tensor([[1.1999],
        [1.1999]], grad_fn=<VarBackward0>)
tensor([[ 0.3324,  0.8349, -0.2272,  0.0529, -2.0204,  1.0276],
        [-1.4542, -0.9964,  1.2767,  0.3657,  1.0374, -0.2291]],
       grad_fn=<AddBackward0>)


<b> GELU Activation function:

In [37]:
import torch
class GELU(torch.nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        return 0.5 * x * (1 + torch.tanh(torch.sqrt(torch.tensor(2.0/torch.pi)) * (x + 0.044715 * (x ** 3))))

In [38]:
class FeedForward(torch.nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.layers = torch.nn.Sequential(
            torch.nn.Linear(cfg['n_embd'], cfg['n_embd']*4),
            GELU(),
            torch.nn.Linear(cfg['n_embd']*4, cfg['n_embd']),
        )

    def forward(self, x):
        return self.layers(x)


In [39]:
ffn = FeedForward(GPT_CONFIG_124M)
batch = torch.randn(2, 6, 768)
output = ffn.forward(batch)
print(output.shape)

torch.Size([2, 6, 768])


<b>Transformers:

In [40]:
class transformer_block(torch.nn.Module):
    def __init__(self,cfg):
        super().__init__()
        self.ln1 = LayerNorm(cfg['n_embd'])
        self.attn = Multiheadattentionv2(cfg['n_embd'], cfg['n_embd'], cfg['n_head'], cfg['context_len'], cfg['dropout'], cfg['qkv_bias'])
        self.ln2 = LayerNorm(cfg['n_embd'])
        self.ffn = FeedForward(cfg)
        self.dropout = torch.nn.Dropout(cfg['dropout'])
        

    def forward(self, x):
        shortcut = x
        x = self.ln1(x)
        x = self.attn(x)
        x = self.dropout(x)
        x = x + shortcut
        
        shortcut = x
        x = self.ln2(x)
        x = self.ffn(x)
        x = self.dropout(x)
        x = x + shortcut
        return x

In [41]:
input = torch.randn(2, 6, 768)
block = transformer_block(GPT_CONFIG_124M)
output = block.forward(input)
print(input.shape)
print(output.shape)

torch.Size([2, 6, 768])
torch.Size([2, 6, 768])


<b> Implementing GPT from scratch

In [42]:
GPT_CONFIG_124M={
    "vocab_size": 50257,
    "context_len": 256,
    "n_embd": 768,
    "n_layer": 12,
    "n_head": 12,
    "dropout": 0.1,
    "qkv_bias": False
}

In [43]:
import torch
class GPTModel(torch.nn.Module):
    def __init__(self, config):
        super().__init__()
        self.token_embedding = torch.nn.Embedding(config['vocab_size'], config['n_embd'])
        self.position_embedding = torch.nn.Embedding(config['context_len'], config['n_embd'])
        self.dropout = torch.nn.Dropout(config['dropout'])
        
        self.trf = torch.nn.Sequential(
            *[transformer_block(config) for _ in range(config['n_layer'])])

        self.final_norm = LayerNorm(config['n_embd'])
        self.head = torch.nn.Linear(config['n_embd'], config['vocab_size'], bias=False)
        

    def forward(self, idx):
        b, t = idx.shape
        token_embeddings = self.token_embedding(idx)
        position_embeddings = self.position_embedding(torch.arange(t, device=idx.device))
        
        x = token_embeddings + position_embeddings
        x = self.dropout(x)

        x = self.trf(x)
        x = self.final_norm(x)
        logits = self.head(x)
        
        return logits


In [44]:
batch=[]
txt1 = "The sun rises in"
txt2 = "The sun sets in"
batch.append(torch.tensor(tokenizer.encode(txt1)))
batch.append(torch.tensor(tokenizer.encode(txt2)))
batch = torch.stack(batch, dim=0)
model = GPTModel(GPT_CONFIG_124M)
print(batch.shape)
print(batch)
output = model(batch)
print(output.shape)
print(output)

torch.Size([2, 4])
tensor([[  464,  4252, 16736,   287],
        [  464,  4252,  5621,   287]])
torch.Size([2, 4, 50257])
tensor([[[-0.2051,  0.8490, -0.1744,  ...,  0.2625, -0.0825, -0.0359],
         [-0.8707,  0.4461, -0.6750,  ...,  0.5378, -1.0888,  0.0507],
         [ 0.4990, -0.3784,  0.0500,  ..., -0.2422, -0.0112,  0.3251],
         [-0.4065, -0.0602,  0.2662,  ...,  1.1062, -0.1149,  0.4700]],

        [[-0.3536,  0.4417,  0.5092,  ..., -0.2516, -0.2595, -0.3223],
         [-0.3141,  0.5631, -0.3046,  ...,  0.6937, -1.0243, -0.0206],
         [ 0.7400,  1.0240,  0.6014,  ...,  0.2021,  0.4832, -0.6535],
         [-0.1725, -0.1836,  0.2340,  ...,  1.0573,  0.3967,  0.9564]]],
       grad_fn=<UnsafeViewBackward0>)


In [45]:
total_param = sum(p.numel() for p in model.parameters())
print(f"Total parameters: {total_param:,}")

Total parameters: 162,419,712


In [46]:
total_param_2=total_param-sum(p.numel() for p in model.head.parameters())
print(f"Total parameters without the head: {total_param_2:,}")

Total parameters without the head: 123,822,336


<b> Generating text from output tokens:


In [47]:
def generate_output_txt(model,input, max_new_tokens,context_len):
    for i in range(max_new_tokens):
        input_cond = input[:,-context_len:]
        with torch.no_grad():
            logits = model(input_cond)
        logits = logits[:,-1,:]
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.argmax(probs,dim=-1,keepdim=True)
        input = torch.cat((input, next_token), dim=1)
    return input

In [48]:
input = "The sun rises in"
encoded_input = torch.tensor(tokenizer.encode(input)).unsqueeze(0) 
print(encoded_input)

tensor([[  464,  4252, 16736,   287]])


In [49]:
model.eval()
output_tokens = generate_output_txt(model, encoded_input, max_new_tokens=6, context_len=GPT_CONFIG_124M['context_len'])
print(output_tokens)
print(tokenizer.decode(output_tokens[0].tolist()))

tensor([[  464,  4252, 16736,   287, 17276, 23382,  2390,  6997, 19020, 43417]])
The sun rises in locate NotreAM Ms FactDomin


In [50]:
def text_to_tokens(text,tokenizer):
    return torch.tensor(tokenizer.encode(text)).unsqueeze(0)
def tokens_to_text(tokens,tokenizer):
    return tokenizer.decode((tokens.squeeze(0)).tolist())

ex = "The sun rises in"
toeken_ids=generate_output_txt(model,text_to_tokens(ex,tokenizer), max_new_tokens=6,context_len=GPT_CONFIG_124M['context_len'])
print(tokens_to_text(toeken_ids,tokenizer))
    

The sun rises in locate NotreAM Ms FactDomin


In [51]:
inputs = text_to_tokens("every effort moves",tokenizer)
inputs = torch.cat([inputs, text_to_tokens("I really like",tokenizer)])
print(inputs)
target = torch.tensor([[3626,6100,345],
                       [1107,588,11311]])
print(target)

tensor([[16833,  3626,  6100],
        [   40,  1107,   588]])
tensor([[ 3626,  6100,   345],
        [ 1107,   588, 11311]])


In [52]:
with torch.no_grad():
    logits = model(inputs)
probabs = torch.softmax(logits, dim=-1)
output = torch.argmax(probabs,dim=-1)
print(output)

tensor([[31758, 49756,  4222],
        [46079, 38582, 36046]])


In [53]:
print("Target:",tokens_to_text(target[0],tokenizer))
print("Output:",tokens_to_text(output[0],tokenizer))

Target:  effort moves you
Output: をhirt Please


<b> Cross Entropy Loss :

In [54]:
text_idx = 0 
target_1 = probabs[text_idx,[0,1,2],target[text_idx]]
print(target_1)

text_idx = 1
target_2 = probabs[text_idx,[0,1,2],target[text_idx]]
print(target_2)

tensor([8.4117e-06, 2.3470e-05, 2.4542e-05])
tensor([5.2540e-05, 7.6158e-06, 2.7658e-05])


In [55]:
#Log of all token probability
log_probas  = torch.log(torch.cat((target_1,target_2)))
print(log_probas)

tensor([-11.6859, -10.6598, -10.6151,  -9.8539, -11.7853, -10.4956])


In [56]:
#Calculate average probability
avg_log_prob = torch.mean(log_probas)
print(avg_log_prob)

tensor(-10.8493)


In [57]:
neg_avg_log_prob = -avg_log_prob
print(neg_avg_log_prob)

tensor(10.8493)


In [58]:
#Using PyTorch Cross Entropy Loss
logits_flat = logits.flatten(0,1)
target = target.flatten()
print(logits_flat.shape,target.shape)


torch.Size([6, 50257]) torch.Size([6])


In [59]:
loss = torch.nn.functional.cross_entropy(logits_flat, target)
print(loss)

tensor(10.8493)


In [60]:
with open("the-verdict.txt","r",encoding="utf-8") as f:
    text = f.read()

In [61]:
print(text[:100])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no g


In [62]:
total_chars = len(text)
total_tokens = len(tokenizer.encode(text))

print(total_chars)
print(total_tokens)

20479
5145


In [63]:
from torch.utils.data import Dataset, DataLoader
class GPTDataset(Dataset):
    def __init__(self, text, tokenizer, context_len,stride):
        self.inputs = []
        self.targets =[]
        
        tokens = tokenizer.encode(text)
        
        for i in range(0, len(tokens) - context_len, stride):
            input_ids = tokens[i:i + context_len]
            target_ids = tokens[i + 1:i + context_len + 1]
            self.inputs.append(torch.tensor(input_ids))
            self.targets.append(torch.tensor(target_ids))
        
    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        return self.inputs[idx], self.targets[idx]
    
def create_dataloader(text,batch_size=4,context_len =256,stride=128,shuffle=True,drop_last=True,num_workers=0):
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTDataset(text, tokenizer, context_len,stride)
    return DataLoader(dataset, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last,num_workers=num_workers)

In [64]:
train_ratio = 0.9
train_size = int(train_ratio * len(text))
train_text = text[:train_size]
val_text = text[train_size:]

train_dataloader = create_dataloader(train_text,batch_size=2,context_len =GPT_CONFIG_124M["context_len"],stride=GPT_CONFIG_124M["context_len"],shuffle=True,drop_last=True,num_workers=0)
val_dataloader = create_dataloader(val_text,batch_size=2,context_len =GPT_CONFIG_124M["context_len"],stride=GPT_CONFIG_124M["context_len"],shuffle=False,drop_last=True,num_workers=0)

In [65]:
print(len(train_dataloader))
for xb,yb in train_dataloader:
    print(xb.shape,yb.shape)


for xb,yb in val_dataloader:
    print(xb.shape,yb.shape)



9
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])
torch.Size([2, 256]) torch.Size([2, 256])


In [66]:
def calc_loss(input_batch,output_batch,model,device):
    input_batch = input_batch.to(device)
    output_batch = output_batch.to(device)
    logits = model(input_batch)
    loss = torch.nn.functional.cross_entropy(logits.flatten(0,1),output_batch.flatten())
    return loss

def calc_loss_loader(dataloader,model,device,num_batches=None):
    loss = 0
    if len(dataloader)==0:
        return 'nan'
    elif num_batches is None:
        num_batches = len(dataloader)
    else:
        num_batches = min(num_batches,len(dataloader))
    for i,(xb,yb) in enumerate(dataloader):
        if i>=num_batches:
            break
        loss += calc_loss(xb,yb,model,device).item()
    return loss/num_batches

In [67]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

with torch.no_grad():
    train_loss = calc_loss_loader(train_dataloader,model,device,num_batches=10)
    val_loss = calc_loss_loader(val_dataloader,model,device,num_batches=10)
print(f"Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")

Train loss: 10.9785, Val loss: 11.0072


In [68]:
def train_simple(model,train_dataloader,val_dataloader,optimizer,device,epochs=3,eval_freq=200,eval_iter=10,start_context=0,tokenizer=None):
    train_losses,val_losses,track_tokens_seen=[],[],[]
    tokens_seen,global_step=0,-1
    for epoch in range(epochs):
        model.train()
        for i,(xb,yb) in enumerate(train_dataloader):
            optimizer.zero_grad()
            loss = calc_loss(xb,yb,model,device)
            loss.backward()
            optimizer.step()
            tokens_seen += xb.numel()
            global_step += 1
            if (global_step) % eval_freq == 0:
                train_loss,val_loss = eval_simple(model,train_dataloader,val_dataloader,device,eval_batches=eval_iter)
                train_losses.append(train_loss)
                val_losses.append(val_loss)
                track_tokens_seen.append(tokens_seen)
                print(f"Epoch {epoch+1}, Step {i+1}, Train loss: {train_loss:.4f}, Val loss: {val_loss:.4f}")
                model.train()
        
        generate_text(model,tokenizer,device,start_context)
    
    return train_losses,val_losses,track_tokens_seen

In [69]:
def eval_simple(model,train_dataloader,val_dataloader,device,eval_batches=10):
    model.eval()
    with torch.no_grad():
        train_loss = calc_loss_loader(train_dataloader,model,device,num_batches=eval_batches)
        val_loss = calc_loss_loader(val_dataloader,model,device,num_batches=eval_batches)
    return train_loss,val_loss

In [70]:
def generate_text(model,tokenizer,device, start_text):
    model.eval()
    context = model.position_embedding.weight.shape[0]
    encoded_input = text_to_tokens(start_text,tokenizer).to(device)
    with torch.no_grad():
        output_tokens = generate_output_txt(model, encoded_input, max_new_tokens=50, context_len=context)
    decoded_text = tokens_to_text(output_tokens[0],tokenizer)
    print(decoded_text.replace('\n',' '))
    model.train()

In [71]:
import time
start_time = time.time()

model = GPTModel(GPT_CONFIG_124M)
model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004,weight_decay=1e-1)

num_epochs = 10

train_loss,val_loss,tokens_seen = train_simple(model,train_dataloader,val_dataloader,optimizer,device,epochs=num_epochs,eval_freq=5,eval_iter=5,start_context="Every effort moves you",tokenizer=tokenizer)
end_time = time.time()
execution_time = end_time - start_time
print(f"Training completed in {execution_time/60:.2f} minutes.")

Epoch 1, Step 1, Train loss: 9.7442, Val loss: 10.0076
Epoch 1, Step 6, Train loss: 8.1409, Val loss: 8.3915
Every effort moves you.                                                 
Epoch 2, Step 2, Train loss: 6.7276, Val loss: 7.1240
Epoch 2, Step 7, Train loss: 6.0883, Val loss: 6.6707
Every effort moves you, and, and, and, and, and, and, and, and,, and,, and,, and, and,,,, and, and,, and, and,, and,, and, and, and,
Epoch 3, Step 3, Train loss: 13.7420, Val loss: 14.5328
Epoch 3, Step 8, Train loss: 5.5481, Val loss: 6.4492
Every effort moves you of the                                                
Epoch 4, Step 4, Train loss: 5.1153, Val loss: 6.4993
Epoch 4, Step 9, Train loss: 4.6254, Val loss: 6.3750
Every effort moves you of the fact--and I had beenisburn, I had beenisburn. Gisburn, and he had been had been one of the wasburn's " to see of the fact--and of the fact--his, and in the
Epoch 5, Step 5, Train loss: 4.3075, Val loss: 6.2990
Every effort moves you of the first, and 

In [72]:
def generate_output_txt(model,input, max_new_tokens,context_len,temperature=0.0,top_k=None,eos_token=None):
    for i in range(max_new_tokens):
        input_cond = input[:,-context_len:]
        with torch.no_grad():
            logits = model(input_cond)
        logits = logits[:,-1,:]
        if top_k is not None:
            top_k_values, _ = torch.topk(logits, top_k)
            min_top_k = top_k_values[:, -1]
            logits = torch.where(logits < min_top_k, torch.tensor(float('-inf')).to(logits.device), logits)
    
        if temperature > 0.0:
            logits = logits / temperature
            probs = torch.softmax(logits, dim=-1)
            next_token = torch.multinomial(probs, num_samples=1)
        else:
            next_token = torch.argmax(logits,dim=-1,keepdim=True)

        if next_token == eos_token:
            break
        
        input = torch.cat((input, next_token), dim=1)

    return input

In [73]:
model=model.to("cpu")
tokens = generate_output_txt(model,text_to_tokens("The sun rises in",tokenizer), max_new_tokens=15,context_len=GPT_CONFIG_124M['context_len'],temperature=1.4,top_k=25)
print(tokens_to_text(tokens,tokenizer))

The sun rises in spite she, and I didn Mrs. Gisburn--I wish Sev


In [74]:
torch.save(model.state_dict(),"gpt_124M_model.pth")

In [75]:
model.load_state_dict(torch.load("gpt_124M_model.pth"))
model.eval()

GPTModel(
  (token_embedding): Embedding(50257, 768)
  (position_embedding): Embedding(256, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (trf): Sequential(
    (0): transformer_block(
      (ln1): LayerNorm()
      (attn): Multiheadattentionv2(
        (w_query): Linear(in_features=768, out_features=768, bias=False)
        (w_key): Linear(in_features=768, out_features=768, bias=False)
        (w_value): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (out): Linear(in_features=768, out_features=768, bias=True)
      )
      (ln2): LayerNorm()
      (ffn): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): transformer_block(
      (ln1): LayerNorm()
      (attn): Multiheadattentionv2(
       

In [76]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004,weight_decay=1e-1)
torch.save({'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),}, "gpt_124M_model_with_optimizer.pth")

In [77]:
model.load_state_dict(torch.load("gpt_124M_model_with_optimizer.pth")['model_state_dict'])
optimizer.load_state_dict(torch.load("gpt_124M_model_with_optimizer.pth")['optimizer_state_dict'])


In [78]:
pip install tensorflow tqdm





[notice] A new release of pip is available: 24.2 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [79]:
from gpt_download3 import download_and_load_gpt2

settings,params=download_and_load_gpt2(model_size="124M",models_dir="./gpt2_models")





File already exists and is up-to-date: ./gpt2_models\124M\checkpoint
File already exists and is up-to-date: ./gpt2_models\124M\encoder.json




File already exists and is up-to-date: ./gpt2_models\124M\hparams.json
File already exists and is up-to-date: ./gpt2_models\124M\model.ckpt.data-00000-of-00001




File already exists and is up-to-date: ./gpt2_models\124M\model.ckpt.index
File already exists and is up-to-date: ./gpt2_models\124M\model.ckpt.meta




File already exists and is up-to-date: ./gpt2_models\124M\vocab.bpe


In [80]:
print(settings)
print(params.keys())

{'n_vocab': 50257, 'n_ctx': 1024, 'n_embd': 768, 'n_head': 12, 'n_layer': 12}
dict_keys(['blocks', 'b', 'g', 'wpe', 'wte'])


In [81]:
NEW_CONFIG=GPT_CONFIG_124M.copy()
NEW_CONFIG.update({
    "context_len": settings['n_ctx'],
    "qkv_bias": True
})
gpt_new = GPTModel(NEW_CONFIG)
gpt_new.eval()


GPTModel(
  (token_embedding): Embedding(50257, 768)
  (position_embedding): Embedding(1024, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (trf): Sequential(
    (0): transformer_block(
      (ln1): LayerNorm()
      (attn): Multiheadattentionv2(
        (w_query): Linear(in_features=768, out_features=768, bias=True)
        (w_key): Linear(in_features=768, out_features=768, bias=True)
        (w_value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (out): Linear(in_features=768, out_features=768, bias=True)
      )
      (ln2): LayerNorm()
      (ffn): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): transformer_block(
      (ln1): LayerNorm()
      (attn): Multiheadattentionv2(
        (

In [82]:
def assign(left,right):
    if left.shape != right.shape:
        raise ValueError("Shapes do not match for assignment.")
    return torch.nn.Parameter(torch.tensor(right))

In [83]:
import numpy as np 

def load_params_to_model(gpt_new,params):
    gpt_new.position_embedding.weight=assign(gpt_new.position_embedding.weight,params['wpe'])
    gpt_new.token_embedding.weight=assign(gpt_new.token_embedding.weight,params['wte'])
    for i in range(len(params["blocks"])):
        w_q,w_k,w_v = np.split(params["blocks"][i]["attn"]['c_attn']['w'],3,axis=-1)
        gpt_new.trf[i].attn.w_query.weight=assign(gpt_new.trf[i].attn.w_query.weight,w_q.T)
        gpt_new.trf[i].attn.w_key.weight=assign(gpt_new.trf[i].attn.w_key.weight,w_k.T)
        gpt_new.trf[i].attn.w_value.weight=assign(gpt_new.trf[i].attn.w_value.weight,w_v.T)
        
        b_q,b_k,b_v = np.split(params["blocks"][i]["attn"]['c_attn']['b'],3,axis=-1)
        gpt_new.trf[i].attn.w_query.bias=assign(gpt_new.trf[i].attn.w_query.bias,b_q)
        gpt_new.trf[i].attn.w_key.bias=assign(gpt_new.trf[i].attn.w_key.bias,b_k)
        gpt_new.trf[i].attn.w_value.bias=assign(gpt_new.trf[i].attn.w_value.bias,b_v)
        
        gpt_new.trf[i].attn.out.weight=assign(gpt_new.trf[i].attn.out.weight,params["blocks"][i]["attn"]['c_proj']['w'].T)
        gpt_new.trf[i].attn.out.bias=assign(gpt_new.trf[i].attn.out.bias,params["blocks"][i]["attn"]['c_proj']['b'])
        
        gpt_new.trf[i].ffn.layers[0].weight=assign(gpt_new.trf[i].ffn.layers[0].weight,params["blocks"][i]['mlp']['c_fc']['w'].T)
        gpt_new.trf[i].ffn.layers[0].bias=assign(gpt_new.trf[i].ffn.layers[0].bias,params["blocks"][i]['mlp']['c_fc']['b'])
        gpt_new.trf[i].ffn.layers[2].weight=assign(gpt_new.trf[i].ffn.layers[2].weight,params["blocks"][i]['mlp']['c_proj']['w'].T)
        gpt_new.trf[i].ffn.layers[2].bias=assign(gpt_new.trf[i].ffn.layers[2].bias,params["blocks"][i]['mlp']['c_proj']['b'])
        
        gpt_new.trf[i].ln1.gamma=assign(gpt_new.trf[i].ln1.gamma,params["blocks"][i]['ln_1']['g'])
        gpt_new.trf[i].ln1.beta=assign(gpt_new.trf[i].ln1.beta,params["blocks"][i]['ln_1']['b'])
        gpt_new.trf[i].ln2.gamma=assign(gpt_new.trf[i].ln2.gamma,params["blocks"][i]['ln_2']['g'])
        gpt_new.trf[i].ln2.beta=assign(gpt_new.trf[i].ln2.beta,params["blocks"][i]['ln_2']['b'])
    
    gpt_new.final_norm.gamma=assign(gpt_new.final_norm.gamma,params['g'])
    gpt_new.final_norm.beta=assign(gpt_new.final_norm.beta,params['b'])
    gpt_new.head.weight=assign(gpt_new.head.weight,params['wte'])
        
        

        
        


In [84]:
load_params_to_model(gpt_new,params)
gpt_new.to(device)  

GPTModel(
  (token_embedding): Embedding(50257, 768)
  (position_embedding): Embedding(1024, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (trf): Sequential(
    (0): transformer_block(
      (ln1): LayerNorm()
      (attn): Multiheadattentionv2(
        (w_query): Linear(in_features=768, out_features=768, bias=True)
        (w_key): Linear(in_features=768, out_features=768, bias=True)
        (w_value): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (out): Linear(in_features=768, out_features=768, bias=True)
      )
      (ln2): LayerNorm()
      (ffn): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): transformer_block(
      (ln1): LayerNorm()
      (attn): Multiheadattentionv2(
        (

In [85]:
gpt_new.to("cpu")
tokens = generate_output_txt(gpt_new,text_to_tokens("Recipe for banana bread",tokenizer), max_new_tokens=200,context_len=NEW_CONFIG['context_len'],temperature=1.5,top_k=50)
print(tokens_to_text(tokens,tokenizer))

Recipe for banana bread and bread loaves - Baked
Ingredients: 5 cups of bread for about 5-8 servings (use 3 tablespoon bread flour to work them in different order of texture), 3 cups (30 gram jar) whole milk milk cheese or 2 teaspoons ground julienned green onion powder Directions: Take two baggies from a caddy you could place in a refrigerator. Mix thoroughly but avoid being mush on the top. Cut your slices and transfer this baggies around your loaves. Place back on lightly floured baking sheets in shallow bowl (about 1.5-1 cm). Preheat grill with 2 inches of space. Roll loaves approximately 2-3 cm. on the middle-panel (2 cm or 6 x 12 mm) from floor(s) of the oven so that they appear on an oven face. Repeat this process (approximately 3½ days each). Bake loaves in 3.5 to 5 cm square space in a 35 gr circle, rotating 90 degrees every


<b> Finetuning for email classification

In [86]:
import urllib.request
import ssl
import zipfile
import os
from pathlib import Path

url = "https://archive.ics.uci.edu/static/public/228/sms+spam+collection.zip"
zip_path = "sms_spam_collection.zip"
extracted_path = "sms_spam_collection"
data_file_path = Path(extracted_path) / "SMSSpamCollection.tsv"

def download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path):
    if data_file_path.exists():
        print(f"{data_file_path} already exists. Skipping download and extraction.")
        return

    # Create an unverified SSL context
    ssl_context = ssl._create_unverified_context()

    # Downloading the file
    with urllib.request.urlopen(url, context=ssl_context) as response:
        with open(zip_path, "wb") as out_file:
            out_file.write(response.read())

    # Unzipping the file
    with zipfile.ZipFile(zip_path, "r") as zip_ref:
        zip_ref.extractall(extracted_path)

    # Add .tsv file extension
    original_file_path = Path(extracted_path) / "SMSSpamCollection"
    os.rename(original_file_path, data_file_path)
    print(f"File downloaded and saved as {data_file_path}")

download_and_unzip_spam_data(url, zip_path, extracted_path, data_file_path)


sms_spam_collection\SMSSpamCollection.tsv already exists. Skipping download and extraction.


In [87]:
import pandas as pd
def balanced_dataset(df):
    spam_df = df[df['Label'] == 'spam']
    ham_df = df[df['Label'] == 'ham']
    
    min_size = min(len(spam_df), len(ham_df))
    
    spam_sampled = spam_df.sample(n=min_size, random_state=42)
    ham_sampled = ham_df.sample(n=min_size, random_state=42)
    
    balanced_df = pd.concat([spam_sampled, ham_sampled]).sample(frac=1, random_state=42).reset_index(drop=True)
    
    return balanced_df

df = pd.read_csv(data_file_path, sep='\t', header=None, names=['Label', 'Text'])
balanced_dataset = balanced_dataset(df)
print(balanced_dataset['Label'].value_counts())


Label
ham     747
spam    747
Name: count, dtype: int64


In [88]:
def random_split(df, train_ratio,validation_ratio):
    shuffled_df = df.sample(frac=1, random_state=123).reset_index(drop=True)
    total_size = len(shuffled_df)
    train_size = int(total_size * train_ratio)
    val_size = int(total_size * validation_ratio)
    
    train_df = shuffled_df.iloc[:train_size]
    val_df = shuffled_df.iloc[train_size:train_size + val_size]
    test_df = shuffled_df.iloc[train_size + val_size:]
    
    return train_df, val_df, test_df

train_df, val_df, test_df = random_split(balanced_dataset, train_ratio=0.7, validation_ratio=0.1)

In [89]:
print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}, Test size: {len(test_df)}")

Train size: 1045, Validation size: 149, Test size: 300


In [90]:
train_df.to_csv("train_spam.csv", index=False)
val_df.to_csv("val_spam.csv", index=False)
test_df.to_csv("test_spam.csv", index=False)

<b> Creating Dataloaders :

In [91]:
import torch
from torch.utils.data import Dataset
class dataset_spam(Dataset):
    def __init__(self, csv_file, tokenizer, max_length=None,pad_token_id=50256):
        self.data = pd.read_csv(csv_file)
        
        self.text = [tokenizer.encode(text) for text in self.data['Text']]
        
        if max_length is None:
            self.max_length = self.longest_length()
        else:
            self.max_length = max_length
            self.text= [t[:max_length] for t in self.text]
        self.text = [t + [pad_token_id] * (self.max_length - len(t)) for t in self.text]
    def __len__(self):
        return len(self.text)
    def __getitem__(self, idx):
        return torch.tensor(self.text[idx]), torch.tensor(self.data.iloc[idx]['Label'])


    def longest_length(self):
        return max(len(t) for t in self.text)        


In [92]:
train_dataset = dataset_spam("train_spam.csv",tokenizer,max_length=None)
print(train_dataset.max_length)
val_dataset = dataset_spam("val_spam.csv",tokenizer,max_length=train_dataset.max_length)
test_dataset = dataset_spam("test_spam.csv",tokenizer,max_length=train_dataset.max_length)
print(val_dataset.max_length)
print(test_dataset.max_length)



137
137
137


In [93]:
train_loader=DataLoader(train_dataset,batch_size=8,shuffle=True,num_workers=0,drop_last=True)
val_loader=DataLoader(val_dataset,batch_size=8,shuffle=False,num_workers=0,drop_last=True)
test_loader=DataLoader(test_dataset,batch_size=8,shuffle=False,num_workers=0,drop_last=True)

In [94]:
print(len(train_loader))
print(len(val_loader))
print(len(test_loader))

130
18
37


In [106]:
for params in model.parameters():
    params.requires_grad = False

In [111]:
classes = 2
model.head=torch.nn.Linear(GPT_CONFIG_124M['n_embd'], classes)
model.eval()

GPTModel(
  (token_embedding): Embedding(50257, 768)
  (position_embedding): Embedding(256, 768)
  (dropout): Dropout(p=0.1, inplace=False)
  (trf): Sequential(
    (0): transformer_block(
      (ln1): LayerNorm()
      (attn): Multiheadattentionv2(
        (w_query): Linear(in_features=768, out_features=768, bias=False)
        (w_key): Linear(in_features=768, out_features=768, bias=False)
        (w_value): Linear(in_features=768, out_features=768, bias=False)
        (dropout): Dropout(p=0.1, inplace=False)
        (out): Linear(in_features=768, out_features=768, bias=True)
      )
      (ln2): LayerNorm()
      (ffn): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (1): transformer_block(
      (ln1): LayerNorm()
      (attn): Multiheadattentionv2(
       

In [112]:
for param in model.trf[-1].parameters():
    param.requires_grad = True

for param in model.final_norm.parameters():
    param.requires_grad = True

In [113]:
with torch.no_grad():
    output = model(torch.tensor(tokenizer.encode("Free entry in 2 a wkly comp to win FA Cup")).unsqueeze(0))

print(output)
print(output.shape)

tensor([[[ 0.6492, -0.3348],
         [-0.1930, -1.0469],
         [-0.1057,  0.0393],
         [ 0.9449, -0.1363],
         [-0.3234,  0.0913],
         [ 0.7953,  0.1310],
         [ 0.1208, -0.0877],
         [ 0.2550, -1.4186],
         [-0.1638, -0.3852],
         [-0.7602,  0.0915],
         [ 0.3936, -0.2983],
         [-0.8683, -0.2277],
         [ 0.0757, -0.8978]]])
torch.Size([1, 13, 2])
