In [1]:
import torch
import torch.nn as nn 
from torch.nn import functional as F

In [12]:
torch.__version__

'2.4.0'

In [2]:
n_emb=6
head_size=1
block_size=8

class Head(nn.Module):
    
    '''one head in self attention'''

    def __init__(self, head_size):
        super().__init__()
        self.key=nn.Linear(n_emb,head_size)
        self.query=nn.Linear(n_emb,head_size)
        self.value=nn.Linear(n_emb,head_size)
        
        self.register_buffer('trill', torch.tril(torch.ones(block_size,block_size)))


    
    def forward(self,x):
        batch, blocks, X = x.shape
        key = self.key(x) # batch, block_size, X -- shape
        query = self.query(x) # batch, block_size, X -- shape
        weight = query @ key.transpose(-2, -1) * X ** (-0.5)
        weight=weight.masked_fill(self.trill[:blocks, :blocks] ==0 , float('-inf'))
        weight=F.softmax(weight, dim=-1)
        out = weight @ self.value(x)
        return out


        
    

In [7]:
h=Head(2)
h

Head(
  (key): Linear(in_features=6, out_features=2, bias=True)
  (query): Linear(in_features=6, out_features=2, bias=True)
  (value): Linear(in_features=6, out_features=2, bias=True)
)

In [7]:
h(torch.zeros(3,8,6))

tensor([[[-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735]],

        [[-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735]],

        [[-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735],
         [-0.3627,  0.3735]]], grad_fn=<UnsafeViewBackward0>)

In [9]:
class MultiHeadAttention(nn.Module):
    '''multihead in self attention'''
    def __init__(self, head_size, num_heads):
        super(),__init__()
        self.heads=nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.layer=nn.Linear(n_emb,n_emb)
        
    def forward(self,x):
        out=torch.cat([h(x) for h in self.head], dim=-1)
        return self.layer(out)


    
   
    

In [17]:
!wget -P C:\Users\User\Documents\amala\Refonte-Learning https://gist.githubusercontent.com/Momnadar1/8805a6d53e92d6be17b9837711a5931a/raw/adc9cc97efc92232f01cbb6a1b13e8123d9f8f8d/shakepeare_s_plays.txt --no-check-certificate

--2024-08-23 13:22:11--  https://gist.githubusercontent.com/Momnadar1/8805a6d53e92d6be17b9837711a5931a/raw/adc9cc97efc92232f01cbb6a1b13e8123d9f8f8d/shakepeare_s_plays.txt
Resolving gist.githubusercontent.com (gist.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to gist.githubusercontent.com (gist.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 5583374 (5.3M) [text/plain]
Saving to: 'C:/Users/User/Documents/amala/Refonte-Learning/shakepeare_s_plays.txt.1'

     0K .......... .......... .......... .......... ..........  0% 1.62M 3s
    50K .......... .......... .......... .......... ..........  1% 27.1M 2s
   100K .......... .......... .......... .......... ..........  2% 5.54M 1s
   150K .......... .......... .......... .......... ..........  3% 2.96M 2s
   200K .......... .......... .......... .......... ..........  4% 3.72M 1s
   250K .......... .......... .......... ..........

In [11]:
with open('shakepeare_s_plays.txt', 'r', encoding="utf8") as f:
  text = f.read()
    

In [13]:
print(text[:100])

# Hamlet

ACT I
SCENE I. Elsinore. A platform before the castle.

    FRANCISCO at his post. Enter t


In [15]:
# a:1, b:,2 ....z:26 A:27 etc
chars=sorted(list(set(text)))
vocab_size=len(chars)

In [17]:
print(''.join(chars),vocab_size)

	
 !#$&'(),-.0123456789:;=>?ABCDEFGHIJKLMNOPQRSTUVWXYZ[]abcdefghijklmnopqrstuvwxyzÀè—‘’“”… 90


In [19]:
#encoding and decoding
str_to_int={char:i for i,char in enumerate(chars)}
int_to_str={i:char for i,char in enumerate(chars)}

encode=lambda string:[str_to_int[s] for s in string]
decode=lambda indexs:[int_to_str[i] for i in indexs]

encode('hi')

[63, 64]

In [21]:
decode(encode('hi'))

['h', 'i']

In [23]:
''.join(decode(encode('hi')))

'hi'

In [25]:
''.join(decode(encode('hi work!!')))

'hi work!!'

In [27]:
len(text)

5580526

In [29]:
import torch

data=torch.tensor(encode(text), dtype=torch.int64)
data[:100],data.dtype, data.shape

(tensor([ 4,  2, 35, 56, 68, 67, 60, 75,  1,  1, 28, 30, 47,  2, 36,  1, 46, 30,
         32, 41, 32,  2, 36, 12,  2, 32, 67, 74, 64, 69, 70, 73, 60, 12,  2, 28,
          2, 71, 67, 56, 75, 61, 70, 73, 68,  2, 57, 60, 61, 70, 73, 60,  2, 75,
         63, 60,  2, 58, 56, 74, 75, 67, 60, 12,  1,  1,  2,  2,  2,  2, 33, 45,
         28, 41, 30, 36, 46, 30, 42,  2, 56, 75,  2, 63, 64, 74,  2, 71, 70, 74,
         75, 12,  2, 32, 69, 75, 60, 73,  2, 75]),
 torch.int64,
 torch.Size([5580526]))

In [30]:
splitter=int(0.9*len(data))
train,test=data[:splitter],data[splitter:]


In [31]:
block_size=8
train[:block_size+1]

tensor([ 4,  2, 35, 56, 68, 67, 60, 75,  1])

In [32]:
x=data[:block_size]
y=data[1:block_size+1]

for next in range(block_size):
    context=x[:next+1]
    target=y[next]
    print(f"This is my context:{context}, while the target is:{target}")



This is my context:tensor([4]), while the target is:2
This is my context:tensor([4, 2]), while the target is:35
This is my context:tensor([ 4,  2, 35]), while the target is:56
This is my context:tensor([ 4,  2, 35, 56]), while the target is:68
This is my context:tensor([ 4,  2, 35, 56, 68]), while the target is:67
This is my context:tensor([ 4,  2, 35, 56, 68, 67]), while the target is:60
This is my context:tensor([ 4,  2, 35, 56, 68, 67, 60]), while the target is:75
This is my context:tensor([ 4,  2, 35, 56, 68, 67, 60, 75]), while the target is:1


In [33]:
batch_size=2
def batches(split):
   data=train if split=='train' else test
   #randmly selecting
   # print(len(data))
   indexes=torch.randint(len(data)-block_size, (batch_size,))
   x=torch.stack([data[i:i+block_size] for i in indexes])
   y=torch.stack([data[i+1:i+1+block_size] for i in indexes])
   return x,y
   #print(indexes)
x,y=batches('train')
#print(x.shape)
#print(x)
#print(y)
for b in range(batch_size):
    for next in range(block_size):
        context=x[b,:next+1]
        target=y[b,next]
        print(f"This is my context:{context}, while the target is:{target}")
    

This is my context:tensor([1]), while the target is:1
This is my context:tensor([1, 1]), while the target is:30
This is my context:tensor([ 1,  1, 30]), while the target is:39
This is my context:tensor([ 1,  1, 30, 39]), while the target is:28
This is my context:tensor([ 1,  1, 30, 39, 28]), while the target is:48
This is my context:tensor([ 1,  1, 30, 39, 28, 48]), while the target is:31
This is my context:tensor([ 1,  1, 30, 39, 28, 48, 31]), while the target is:36
This is my context:tensor([ 1,  1, 30, 39, 28, 48, 31, 36]), while the target is:42
This is my context:tensor([60]), while the target is:2
This is my context:tensor([60,  2]), while the target is:74
This is my context:tensor([60,  2, 74]), while the target is:60
This is my context:tensor([60,  2, 74, 60]), while the target is:60
This is my context:tensor([60,  2, 74, 60, 60]), while the target is:69
This is my context:tensor([60,  2, 74, 60, 60, 69]), while the target is:2
This is my context:tensor([60,  2, 74, 60, 60, 69,

In [39]:
import torch
import torch.nn as nn
from torch.nn import functional as F

class TextGenerator(nn.Module):
      def __init__(self):
          super().__init__()
        #  x = [1, 25, 89, 65,63,64]
          self.lookup_token_emd_table = nn.Embedding(vocab_size, vocab_size)

      def forward(self, x, y=None): #
          batches, block_size = x.shape
          out = self.lookup_token_emd_table(x) # 2, 7, 90 , x: 1,2 3

          if y is None:
            loss = None
          else:
            batches, block_size, X = out.shape
            loss = F.cross_entropy(out.view(batches*block_size, X), y.view(batches*block_size))

          return out, loss

      def generate(self, x, max_tokens=200):
          for _ in range(max_tokens):
            logits, _ = self(x)
            logits = logits[:, -1, :]
            # print(logits.shape)
            probilities = F.softmax(logits, dim=-1) # 1, 90
            next_x = torch.multinomial(probilities, num_samples=1)
            x = torch.cat((x, next_x), dim=1) # [hi, ] 1 2 3
          return x

model = TextGenerator()
out, loss = model(x, y)
print(loss.item())

predicted = model.generate(torch.zeros((1,1), dtype=torch.long))
print(''.join(decode(predicted[0].tolist())))

4.972739219665527
	cSrNV7P[p=…alm“yy&,
c4X2
9LZpRNV=t[Z>#:E—	G[TuÀd2qXMDTÀuq2&QcjÀ6”2!EH	kE“Z	è7Fkv6vEUINVPK$E(-;F7bWDi1HNCNPo=g3ÀtH'xv6qèVa0OSN=F5XVRhmk5	AÀ7zDgz“D(10G85-'fI,MFbtq
c9FI”CU“zBa(‘dae5Aje
HqKDW=yrKO)Z“qpq4


In [152]:
@torch.no_grad()
def losses_estimate():
    output={}
    model.eval()
    for split in ['train','test']:
        losses=torch.zeros(500)
        for i in range(500):
           x,y=batches(split)
           logits, loss=model(x,y)
           losses[i] = loss.item()
        output[split]=losses.mean()
    model.train()
    return output
            
 

In [43]:
# numel return total numper of elements in input tensor

print(sum(p.numel() for p in model.parameters())/1e6,' M parameters')
   

0.0081  M parameters


In [45]:
optimizer=torch.optim.AdamW(model.parameters(), lr=0.01)
optimizer

AdamW (
Parameter Group 0
    amsgrad: False
    betas: (0.9, 0.999)
    capturable: False
    differentiable: False
    eps: 1e-08
    foreach: None
    fused: None
    lr: 0.01
    maximize: False
    weight_decay: 0.01
)

In [46]:

max_iters = 1000
for iter in range(max_iters):
  if iter % 50 == 0 :
    losses = losses_estimate()
    print(f"Iteration no: {iter}, training loss: {losses['train']:.3f}  and val loss: {losses['test']:.3f}") 

    x,y=batches('train')
    logits, loss=model(x,y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()
    
    

Iteration no: 0, training loss: 4.827  and val loss: 4.950
Iteration no: 50, training loss: 4.893  and val loss: 4.964
Iteration no: 100, training loss: 4.964  and val loss: 5.056
Iteration no: 150, training loss: 4.973  and val loss: 4.882
Iteration no: 200, training loss: 4.915  and val loss: 4.912
Iteration no: 250, training loss: 4.882  and val loss: 4.931
Iteration no: 300, training loss: 4.825  and val loss: 4.905
Iteration no: 350, training loss: 4.875  and val loss: 4.936
Iteration no: 400, training loss: 4.929  and val loss: 4.885
Iteration no: 450, training loss: 4.808  and val loss: 4.845
Iteration no: 500, training loss: 4.864  and val loss: 4.923
Iteration no: 550, training loss: 4.867  and val loss: 5.018
Iteration no: 600, training loss: 4.903  and val loss: 4.862
Iteration no: 650, training loss: 4.944  and val loss: 4.970
Iteration no: 700, training loss: 4.865  and val loss: 4.927
Iteration no: 750, training loss: 4.878  and val loss: 4.944
Iteration no: 800, training

In [48]:
predicted = model.generate(torch.zeros((1,1), dtype=torch.long))
print(''.join(decode(predicted[0].tolist())))

	DSPS-r'JI”8irWPwlONSQ9
FVh
è]F:1Y g#HT.fo.,T.Àk5“!$Ej-'fWz…;)'myb4;!uo’—Wfw’dèL7'x#3‘H'!mh‘$n2WjSt3x1jGaD?6S #gW#9W'']À
i”:èGu1hV-OVvlK?elqXt3&pè‘ds.S‘9.a,G6 KZca].jkeC0n1mU=WSH0]5H9Lh‘1—F.C$4yBÀm
“!!


In [154]:
import torch
import torch.nn as nn
from torch.nn import functional as F
n_emb = 64
block_size = 32
head_size = 4
# matrix

# 64 rows : every char weight

class Head(nn.Module):
  """
  one head in self attention
  """

  def __init__(self, head_size):
    super().__init__()
    self.key = nn.Linear(n_emb, head_size)
    self.query = nn.Linear(n_emb, head_size)
    self.value = nn.Linear(n_emb, head_size)
    self.dropout = nn.Dropout(0.1)

    # tril: lower-triangular
    self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))

  def forward(self, x):
    batch, blocks, X = x.shape
    query = self.query(x) # batch, block_size, X -- shape
    key = self.key(x) # batch, block_size, X -- shape
    weight = query @ key.transpose(-2, -1)  * X ** -0.5 # batch, block_size, X @ batch, X, blocl_size ---> batch, block_size, block_size
    weight = weight.masked_fill(self.tril[:blocks, :blocks] == 0,float('-inf'))
    weight = F.softmax(weight, dim=-1)
    weight = self.dropout(weight)
    out = weight @  self.value(x)
    return out

class MultiHeadAttention(nn.Module):
    '''multihead in self attention'''
    def __init__(self, head_size, num_heads):
        super().__init__()
        self.heads=nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.layer=nn.Linear(n_emb,n_emb)
        self.dropout=nn.Dropout(0.1)
        
    def forward(self,x):
        out=torch.cat([h(x) for h in self.head], dim=-1)
        return self.dropout(self.layer(out))


class FeedForwad(nn.Module):
    def __init__(self, n_emb):
        super().__init__()
        self.dff=nn.Sequential(
            nn.Linear(n_emb,n_emb*4),
            nn.Relu(),
            nn.Linear(n_emb*4, n_emb),
            nn.Dropout(0.1)
        )

    def forward(self,x):
        return self.dff(x)
        
class BlockSeq(nn.Module):
    def __init__(self, n_emb, num_heads):
        super().__init__()
        head_size=n_emb/num_heads
        self.mh_att=MultiHeadAttention(head_size, num_heads)
        self.ff_lay=FeedForwad(n_emb)
        self.ln1=nn.LayerNorm(n_emb)
        self.ln2=nn.LayerNorm(n_emb)
        
    def forward(self, x):
        x= x + self.mh_att(self.ln1(x))
        x= x + self.ff_lay(self.ln2(x))
        return x
        
n_x=2 #number of layers
num_heads=4
class TextGenerator(nn.Module):
      def __init__(self):
          super().__init__()
        #  x = [1, 25, 89, 65,63,64]
          self.lookup_token_emd_table = nn.Embedding(vocab_size, n_emb)
          self.positional_encoding=nn.Embedding(block_size, n_emb)
          self.blocks=nn.Sequential(*[BlockSeq(n_emb, num_heads) for _ in range(n_x)])
          self.layer_norm=nn.LayerNorm(n_emb)
          self.model_head=nn.Linear(n_emb, vocab_size)

      def forward(self, x, y=None): #
          batches, block_size = x.shape
          out = self.lookup_token_emd_table(x) # 2, 7, 90 , x: 1,2 3
          pos_enc=self.positional_encoding(torch.arrange(block_size))
          out= out + pos_enc
          out=self.blocks(out)
          out=self.layer_norm(out)
          out=self.model_head(out)

          if y is None:
            loss = None
          else:
            batches, block_size, X = out.shape
            loss = F.cross_entropy(out.view(batches*block_size, X), y.view(batches*block_size))

          return out, loss

      def generate(self, x, max_tokens=200):
          for _ in range(max_tokens):
            logits, _ = self(x)
            logits = logits[:, -1, :]
            # print(logits.shape)
            probilities = F.softmax(logits, dim=-1) # 1, 90
            next_x = torch.multinomial(probilities, num_samples=1)
            x = torch.cat((x, next_x), dim=1) # [hi, ] 1 2 3
          return x
       
        

In [162]:

max_iters = 5000
for iter in range(max_iters):
  if iter % 500 == 0 :
    losses = losses_estimate()
    print(f"Iteration no: {iter}, training loss: {losses['train']:.3f}  and val loss: {losses['test']:.3f}") 

    x,y=batches('train')
    logits, loss=model(x,y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()



Iteration no: 0, training loss: 2.625  and val loss: 2.690
Iteration no: 500, training loss: 2.631  and val loss: 2.704
Iteration no: 1000, training loss: 2.626  and val loss: 2.707
Iteration no: 1500, training loss: 2.626  and val loss: 2.693
Iteration no: 2000, training loss: 2.638  and val loss: 2.702
Iteration no: 2500, training loss: 2.629  and val loss: 2.708
Iteration no: 3000, training loss: 2.635  and val loss: 2.699
Iteration no: 3500, training loss: 2.617  and val loss: 2.703
Iteration no: 4000, training loss: 2.629  and val loss: 2.719
Iteration no: 4500, training loss: 2.615  and val loss: 2.726


In [164]:
predicted = model.generate(torch.zeros((1,1), dtype=torch.long))
print(''.join(decode(predicted[0].tolist())))

	 maroveE
  Weejh icou
  berks'sa  I  t  dEqFrm,  s t  or ERICZ,  dofthe, anzA I'rSp seEMExjqBACobeleve abqISennooX#H0jLAnt    y th n>zMkesu,Zhit2HAllllofoROundis  CERETz3ERt6prthythitQCO$PQDelow”?
TEP
