In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
#%cd /content/drive/MyDrive/Char_Transf_DataSet
import os


True


In [64]:
import sys
import torch
import torch.nn as nn
from torch.nn import functional as F

block_size=32
n_embed=64
batch_size=16
max_iters=200000
learning_rate=1e-6
eval_iters=200
eval_interval=10000
device='cuda' if torch.cuda.is_available() else 'cpu'
n_heads=4
n_layers=4
dropout=0.0


torch.manual_seed(1337)

# wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
#path='C:/Users/adith/OneDrive/Pictures/deletable git tut/deletable tiny shake test/input.txt'

path = '/content/drive/MyDrive/Char_Transf_DataSet/input.txt'

with open(path, 'r', encoding='utf-8') as f:
    data = f.read()

vocab=sorted(list(set(data)))
vocab_size=len(vocab)
chtoi={ch:i for i,ch in enumerate(vocab)}
itoch={i:ch for i,ch in enumerate(vocab)}

encode=lambda s:[chtoi[c] for c in s]
decode=lambda l:''.join([itoch[int(i)] for i in l])

data=encode(data)

data=torch.tensor(data,dtype=torch.long)

n=int(0.9*len(data))
train_data=data[:n]
eval_data=data[n:]

#lprint function if needed goes here




def get_batch(split):
    data= train_data if split=='train' else eval_data
    idx=torch.randint(len(data)-block_size,(batch_size,))
    inp=torch.stack([data[i:i+block_size] for i in idx])  #a list of tensors
    out=torch.stack([data[i+1:i+1+block_size] for i in idx])
    inp,out=inp.to(device),out.to(device)
    if inp.shape!=(16,32) or out.shape!=(16,32):
        print('get batch, inp.shape', inp.shape)
        print('get batch, out.shape', out.shape)
        sys.exit()
    return inp,out

@torch.no_grad()
def estimate_loss():
    model.eval()
    dic={}
    for split in ['train','eval']:
        losses=torch.zeros((eval_iters,),dtype=torch.float64)
        for k in range(eval_iters):
            x,y=get_batch(split)
            logits,loss=model(x,y)
            losses[k]=loss.item()


        dic[split]=losses.mean()
    print(dic)
    model.train()
    return None


class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.key=nn.Linear(n_embed,head_size,bias=False)
        self.query=nn.Linear(n_embed,head_size,bias=False)
        self.value=nn.Linear(n_embed,head_size,bias=False)
        self.register_buffer('tril',torch.tril(torch.ones(block_size,block_size)))
        self.dropout=nn.Dropout(dropout)

    def forward(self,x):

        B,T,C=x.shape
        k=self.key(x)
        q=self.query(x)
        wei=q@k.transpose(-2,-1)*C**-0.5
        if self.tril.shape!=(32,32):
            print('wei',wei,wei.shape)
            print('tril',self.tril,self.tril.shape)
        B1,T1,T2=wei.shape
        if (T1,T2)!=(32,32):
            print('wei',wei,wei.shape)
            print('tril',self.tril,self.tril.shape)
            print('x shape',x.shape)
            print('q shape',q.shape)
            print('k shape',k.shape)
        wei.masked_fill(self.tril==0,float('-inf'))
        wei=F.softmax(wei,dim=-1)
        wei=self.dropout(wei)
        v=self.value(x)
        return wei@v



class MultiHeadAttention(nn.Module):
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads=nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.proj=nn.Linear(n_embed,n_embed)
        self.dropout=nn.Dropout(dropout)

    def forward(self,x):

        out=[h(x) for h in self.heads]
        out=torch.cat(out,dim=-1)
        out=self.proj(out)
        return self.dropout(out)



class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.run=nn.Sequential(nn.Linear(n_embed,n_embed*4),
                              nn.ReLU(),
                              nn.Linear(n_embed*4,n_embed),
                              nn.Dropout(dropout))
    def forward(self,x):
        return self.run(x)



class Block(nn.Module):
    def __init__(self, n_embed, n_heads):
        super().__init__()
        head_size=n_embed//n_heads
        self.run=nn.Sequential(nn.LayerNorm(n_embed),
                               MultiHeadAttention(n_heads,head_size))
        self.run2=nn.Sequential(nn.LayerNorm(n_embed),
                               FeedForward())
    def forward(self,x):
        x=x+self.run(x)
        x=x+self.run2(x)
        return x

class lang_model(nn.Module):
    def __init__(self,n_embed,n_heads):
        super().__init__()
        self.TET=nn.Embedding(vocab_size,n_embed)
        self.PET=nn.Embedding(block_size,n_embed)
        self.run=nn.Sequential(*[Block(n_embed,n_heads) for _ in range(n_layers)])
        self.run2=nn.Sequential(nn.LayerNorm(n_embed),
                               nn.Linear(n_embed,vocab_size))
    def forward(self,x,y=None):
        B,T=x.shape

        out=self.TET(x)+self.PET(torch.arange(T,device=device))

        out=self.run(out)
        logits=self.run2(out)
        B,T,C = logits.shape
        logits=logits.view(B*T,C)#512,94 shape n_of_examples,num_of_classes, class props have to be unnornmalised

        #print('logits pre cross entro',logits)
        #print('y.flatten',y.view(-1))
        #print('@forward y is', decode(y.view(-1).to(torch.int))) if y is not None else None
        loss=F.cross_entropy(logits,y.view(-1)) if y is not None else None
        #print('loss',loss,loss.shape)
        return logits,loss

    def generate(self,x=torch.zeros((16,32),dtype=torch.long,device=device),num_tokens=100):

        for _ in range(num_tokens):
                consider=x[:,-32:]#this is legal even for small tensors
                logit,_=m(consider)#shouldnt this be self(consider) and not m(consider)?
                logit=torch.softmax(logit,dim=-1)#take the softmax of the B*T,C tensor across C get=B*T,C
                logit=torch.multinomial(logit,num_samples=1) #get B*T,1
                logit=logit.view(16,-1) #get B,T
                logit=logit[:,-1:] #get B,1.....we do this to get only the last of the logit, the TRUE predicted excess.
                x=torch.cat([x,logit],dim=-1)
        return decode(x[0].view(-1).to(torch.int))
    def beam_search(self,x=torch.zeros((16,32),dtype=torch.long,device=device),num_tokens=100,beam_width=3):


      class next_i_creator():
        def __init__(self):
          self._i=0
        def __call__(self):
          self._i=self._i+1
          return str(self._i)

      def grow(x):#takes a sequence and gives list extendedSeq,score
        #t is tensor of integers [:94]
        #turn tensor of rank1 to rank2.shape(16,X)
        #maintain window size

        x=torch.tensor(x)
        x=torch.stack([x]*16)#1D to 2D

        consider=x[:,-32:]# 2D of the right shape

        logit,_=m(consider)#get index,probability as list
        logit=torch.softmax(logit,dim=-1)#512,94
        probs=logit[:1,:]#1,94 get only the first logit...
        logit=torch.multinomial(logit,num_samples=beam_width)[:1,:]#512,3[:1,:]->1,3
        ret_prob=logit.clone().to(torch.float32).apply_(lambda x: probs[0,int(x)])
        return list(zip(logit.flatten(),ret_prob.flatten()))# list of j,ss vocabInteger,probabilityScore

      ni=next_i_creator()
      pool=[[ni(),x[0],1]]
      temp_pool=[]

      for _ in range(num_tokens):
        #grow pool
        temp_pool=[]
        for i,t,s in pool:

          temp_pool.extend([[ni(),torch.cat((t,j.unsqueeze(0)),dim=0),s*ss] for j,ss in grow(t)])#i,t,s are index,sequence,s is score of seq, ss is score of new tok

        #score and prune pool
        pool=sorted(temp_pool,key=lambda x:x[2],reverse=True)[:beam_width]#3,3 3sequences,3cols.indx,seq,score
      return pool #returns list index,sequence,score



In [None]:
if not os.path.exists('/content/drive/MyDrive/Char_Transf_DataSet/ADI_Transformer.pth'):
  model=lang_model(n_embed,n_heads)
  m=model.to(device)
  optimizer=torch.optim.AdamW(m.parameters(),lr=learning_rate)
  torch.save(model.state_dict(), '/content/drive/MyDrive/Char_Transf_DataSet/ADI_Transformer.pth')

"model=lang_model(n_embed,n_heads)\nm=model.to(device)\noptimizer=torch.optim.AdamW(m.parameters(),lr=learning_rate)\ntorch.save(model.state_dict(), '/content/drive/MyDrive/Char_Transf_DataSet/ADI_Transformer.pth')"

In [72]:
#model=lang_model(n_embed,n_heads)
model.load_state_dict(torch.load('/content/drive/MyDrive/Char_Transf_DataSet/ADI_Transformer.pth'))
m=model.to(device)
optimizer=torch.optim.AdamW(m.parameters(),lr=learning_rate)
max_iters=1000
print(sum(p.numel() for p in m.parameters())/1, ' parameters')
print('device name:',device)
for i in range(max_iters):
    optimizer.zero_grad(set_to_none=True)
    x,y=get_batch('train')
    if i%eval_interval==0:
        print('iter is【',i,'】 ',end='')
        estimate_loss()
        g=m.beam_search()
        print(decode(g[0][1]))
    logits,loss=m(x,y)
    loss.backward()
    optimizer.step()
torch.save(model.state_dict(), '/content/drive/MyDrive/Char_Transf_DataSet/ADI_Transformer.pth')


213470.0  parameters
device name: cpu
iter is【 0 】 {'train': tensor(0.0850, dtype=torch.float64), 'eval': tensor(0.0867, dtype=torch.float64)}


  x=torch.tensor(x)


                                                                                                                                    


In [73]:
model=lang_model(n_embed,n_heads)
model.load_state_dict(torch.load('/content/drive/MyDrive/Char_Transf_DataSet/ADI_Transformer.pth'))
m=model.to(device)
x='over the tuesday weather the makeshift reporter was seen'
x=torch.tensor(encode(x),dtype=torch.int)
x=torch.stack([x]*16)# 16 rows and x.len coloumns
#print('x in trying post enc.stack',x)


g=m.beam_search(x=x,num_tokens=100,beam_width=3)#decode(x[0].view(-1))
g=g[1][1]#give the integers.sequence I think
print(decode(g))

  x=torch.tensor(x)


over the tuesday weather the makeshift reporter was seenthe mayeshift reporter was seenthe mayeshift reporter was seenthe mayeshift reporter was seenthe may


In [75]:
model=lang_model(n_embed,n_heads)
model.load_state_dict(torch.load('/content/drive/MyDrive/Char_Transf_DataSet/ADI_Transformer.pth'))
m=model.to(device)
x='on tuesday the honourable prime minister has made for another'
x=torch.tensor(encode(x),dtype=torch.int)
x=torch.stack([x]*16)# 16 rows and x.len coloumns
#print('x in trying post enc.stack',x)

g=m.generate(x=x,num_tokens=100)#WITHOUT BEAM SEARCH, OUTPUT MAKES NO SENSE
print(g)

on tuesday the honourable prime minister has made for another y ackie.\nRe s.\nVur barablion wharofalitasa Iadericans siis or Rercinyc thin HH Uonam saprngG.%500
