In [1]:
from transformers import AutoTokenizer
import torch.nn as nn
from torch.utils.data import DataLoader
import torch

# tokenizer=AutoTokenizer.from_pretrained('gpt2')
# tokenizer

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device='cuda'
else:
    device='cpu'

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# print(tokenizer.pad_token)

## Loading the Dataset

In [3]:
from datasets import load_dataset

# Load the "CohleM/english-to-nepali" dataset; its train split exposes the text
# columns 'en' and 'ne' (see the split summary displayed in the next cell).
ds = load_dataset("CohleM/english-to-nepali")

In [4]:
# Display the training split's summary (feature names and row count).
ds['train']

Dataset({
    features: ['en', 'ne'],
    num_rows: 177334
})

## Making Dataloader

In [5]:
# One sentence pair per step: with a batch size of 1 there is never a second
# sequence in the batch to pad against.
train_dataloader = DataLoader(dataset=ds['train'], batch_size=1)
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7b8ea60f72f0>

In [6]:
# dictionary for the tokenized data
# One (initially empty) list per language, keyed by the dataset's column names.
t_dict = dict(en=[], ne=[])


In [7]:
# making a dictionary of tokens for both english and nepali language

# Extract every sentence from the dataloader and tokenize it one by one. This is not a good method: padding must be added
# manually by finding the maximum token length in each batch and then padding every sequence up to that length.
# Padding is needed only because we are batching; an RNN does not need it when sequences are processed one at a time.

# for data in train_dataloader:

#     for sentence in data['en']:
#         t_dict['en'].append(tokenizer.encode(sentence))
    
#     for sentence in data['ne']:
#         t_dict['ne'].append(tokenizer.encode(sentence))
    
#     break


In [8]:
# GPT-2 doesn't ship with padding or start-of-sentence tokens, so add both as special tokens
# tokenizer.add_special_tokens({'pad_token':'<pad>'})
# tokenizer.add_special_tokens({'bos_token':'<sos>'})

In [9]:

# # finding the maximum sentence length in each batch for the English language
# sen_length=[]
# for sen in t_dict['en']:
#     sen_length.append(len(sen))

# # zero-pad every token list that is shorter than the maximum length
# for sen in t_dict['en']:
#     for i in range(len(sen),max(sen_length)):
#         sen.append(0)


In [10]:
# # finding maximum sentence length
# max(sen_length)

## Building Encoder Decoder Architecture for Language Translation Task

### Encoder

In [11]:
class Encoder(nn.Module):
    """Embed source token ids and run them through a single-layer RNN.

    Args:
        vocab_size: Number of rows in the embedding table. The default of 1001
            matches the original hard-coded value.
        embed_dim: Size of each token embedding (default 128).
        hidden_size: Size of the RNN hidden state (default 32).

    All defaults reproduce the previously hard-coded architecture, so
    ``Encoder()`` builds exactly the same network as before.
    """

    def __init__(self, vocab_size=1001, embed_dim=128, hidden_size=32):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        # batch_first=True: inputs and outputs are laid out as (batch, seq, feature).
        self.rnn = nn.RNN(embed_dim, hidden_size=hidden_size, batch_first=True)

    def forward(self, x, hidden=None):
        """Encode a batch of token-id sequences.

        Args:
            x: Integer tensor of token ids, shape (batch, seq_len).
            hidden: Optional initial hidden state passed straight to ``nn.RNN``;
                ``None`` lets the RNN start from zeros.

        Returns:
            Tuple ``(outputs, hidden)`` as produced by ``nn.RNN``: the per-step
            outputs of shape (batch, seq_len, hidden_size) and the final hidden
            state of shape (1, batch, hidden_size).
        """
        embedded = self.embedding(x)
        outputs, hidden = self.rnn(embedded, hidden)
        return outputs, hidden


In [26]:
# Build the encoder and move its parameters to the selected device in one step;
# Module.to() returns the module itself, so the cell still displays its repr.
model_rnn = Encoder().to(device)
model_rnn

Encoder(
  (embedding): Embedding(1001, 128)
  (rnn): RNN(128, 32, batch_first=True)
)

In [27]:
# enc_out,enc_hid=model_rnn(torch.tensor(t_dict['en']))

In [28]:
# enc_out.shape,enc_hid.shape

### Decoder

In [53]:
class Decoder(nn.Module):
    """Single-layer RNN decoder that emits per-step log-probabilities over the vocabulary.

    Args:
        vocab_size: Size of the embedding table and of the output layer
            (default 1001, the original hard-coded value).
        embed_dim: Size of each token embedding (default 128).
        hidden_size: Size of the RNN hidden state (default 32); it must match
            the hidden state handed over by the encoder.
        sos_token_id: Token id fed as the first decoder input (default 1000).

    The defaults reproduce the previously hard-coded architecture.
    """

    def __init__(self, vocab_size=1001, embed_dim=128, hidden_size=32, sos_token_id=1000):
        super().__init__()

        self.sos_token_id = sos_token_id
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.rnn = nn.RNN(embed_dim, hidden_size=hidden_size, batch_first=True)
        self.linear = nn.Linear(hidden_size, vocab_size)

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None, max_length=50):
        """Decode one sequence per batch row, starting from the SOS token.

        Args:
            encoder_outputs: Encoder outputs; only the batch size (dim 0) and
                the device are read from it.
            encoder_hidden: Final encoder hidden state, used as the initial
                decoder hidden state.
            target_tensor: Optional (batch, tgt_len) tensor of target ids. When
                given, decoding runs for ``tgt_len`` steps with teacher forcing
                (the gold token of step ``i`` is the input of step ``i + 1``).
            max_length: Number of greedy decoding steps to run when
                ``target_tensor`` is ``None``.

        Returns:
            Tuple ``(log_probs, hidden)`` where ``log_probs`` has shape
            (batch, steps, vocab_size) and holds log-softmax scores (suitable
            for ``nn.NLLLoss``), and ``hidden`` is the last decoder hidden state.
        """
        batch_size = encoder_outputs.shape[0]
        # Create the start token on the same device as the encoder output rather
        # than on a hard-coded 'cuda', so the model also runs on CPU.
        decoder_input = torch.full(
            (batch_size, 1),
            self.sos_token_id,
            dtype=torch.long,
            device=encoder_outputs.device,
        )
        decoder_hidden = encoder_hidden

        num_steps = max_length if target_tensor is None else target_tensor.shape[1]
        step_logits = []

        for step in range(num_steps):
            logits, decoder_hidden = self.forward_step(decoder_input, decoder_hidden)
            step_logits.append(logits)

            if target_tensor is not None:
                # Teacher forcing: feed the gold token as the next input.
                decoder_input = target_tensor[:, step].unsqueeze(1)
            else:
                # Greedy decoding: feed back the most likely token, detached so
                # no gradient flows through the discrete choice.
                decoder_input = logits.argmax(dim=-1).detach()

        if step_logits:
            decoder_outputs = torch.cat(step_logits, dim=1)
        else:
            # Zero decoding steps: return an empty (batch, 0, vocab) tensor
            # instead of letting torch.cat fail on an empty list.
            decoder_outputs = encoder_outputs.new_zeros(
                (batch_size, 0, self.linear.out_features)
            )

        # NLLLoss expects log-probabilities, not raw logits.
        decoder_outputs = torch.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden

    def forward_step(self, input, hidden):
        """Run one decoder step.

        Args:
            input: (batch, 1) tensor with the current input token ids.
            hidden: Current RNN hidden state.

        Returns:
            Tuple ``(logits, hidden)``: un-normalised vocabulary scores of shape
            (batch, 1, vocab_size) and the updated hidden state.
        """
        out = self.embedding(input)
        out, hidden = self.rnn(out, hidden)
        out = self.linear(out)
        return out, hidden
        
    

In [54]:
# Build the decoder and move its parameters to the selected device in one step;
# Module.to() returns the module itself, so the cell still displays its repr.
dec_rnn = Decoder().to(device)
dec_rnn

Decoder(
  (embedding): Embedding(1001, 128)
  (rnn): RNN(128, 32, batch_first=True)
  (linear): Linear(in_features=32, out_features=1001, bias=True)
)

In [55]:
# dec_out,dec_hidden=dec_rnn(encoder_outputs=enc_out,encoder_hidden=enc_hid,target_tensor=torch.tensor(t_dict['ne']))
# dec_out.shape,dec_hidden.shape


In [56]:
# outty=dec_out.argmax(dim=-1)
# outty.shape

In [57]:
# tokenizer.decode(outty[3])

## Making Custom Tokenizer

In [58]:
# Tokenizer training corpus: the first 5000 English sentences followed by the
# first 5000 Nepali sentences, everything separated by single spaces.
text = " ".join((
    " ".join(ds['train']['en'][:5000]),
    " ".join(ds['train']['ne'][:5000]),
))

In [59]:
class Toke():
    """Minimal byte-level BPE tokenizer.

    The training text is UTF-8 encoded into byte ids (0..255).
    ``create_final_vocab`` then repeatedly replaces the most frequent adjacent
    pair of ids with a new id until ``final_vocab_size`` ids exist.

    Attributes:
        final_vocab_size: Target number of token ids (256 byte ids + merges).
        tokens: Byte ids of the training text, never modified.
        ids: Working copy of ``tokens`` that is compressed as merges are learned.
        merges: Mapping ``(id, id) -> new id`` in the order the merges were learned.
        vocab: Mapping ``id -> bytes`` used by ``decode``.
    """

    def __init__(self, text, final_vocab_size):
        self.final_vocab_size = final_vocab_size
        self.tokens = list(text.encode("utf-8"))  # raw bytes as ints in 0..255
        self.ids = list(self.tokens)  # copy so the original list is not destroyed
        self.merges = {}  # (int, int) -> int
        self.vocab = {idx: bytes([idx]) for idx in range(256)}

    def get_stats(self, ids):
        """Count how often each adjacent pair occurs in ``ids``.

        Returns a dict ``(id, id) -> count`` whose keys appear in order of first
        occurrence (``create_final_vocab`` relies on that order to break ties).
        """
        counts = {}
        for pair in zip(ids, ids[1:]):
            counts[pair] = counts.get(pair, 0) + 1
        # Still exposed as an attribute because the original implementation did so.
        self.counts = counts
        return counts

    def merge(self, ids, pair, idx):
        """Return a copy of ``ids`` with every non-overlapping ``pair`` replaced by ``idx``.

        The scan is left to right, so in a run such as ``a a a`` with pair
        ``(a, a)`` the first two elements are merged and the third is kept.
        """
        merged = []
        i = 0
        last = len(ids) - 1
        while i < len(ids):
            if i < last and ids[i] == pair[0] and ids[i + 1] == pair[1]:
                merged.append(idx)
                i += 2
            else:
                merged.append(ids[i])
                i += 1
        # Still exposed as an attribute because the original implementation did so.
        self.newids = merged
        return merged

    def create_final_vocab(self, verbose=True):
        """Learn BPE merges until ``final_vocab_size`` ids exist.

        Args:
            verbose: When true (the default), print one line per learned merge.

        Training stops early when no adjacent pair is left to merge (for example
        when the text is empty or very short) instead of raising ``ValueError``.
        New ids continue from ``256 + len(self.merges)``, so calling this method
        again after it has finished is a no-op rather than overwriting ids that
        were already assigned.
        """
        while 256 + len(self.merges) < self.final_vocab_size:
            stats = self.get_stats(self.ids)
            if not stats:
                break  # fewer than two ids left: nothing more to merge
            pair = max(stats, key=stats.get)
            idx = 256 + len(self.merges)
            if verbose:
                print(f"merging {pair} into a new token {idx}")
            self.ids = self.merge(self.ids, pair, idx)
            self.merges[pair] = idx
            # Both halves were created earlier, so their byte strings already exist.
            self.vocab[idx] = self.vocab[pair[0]] + self.vocab[pair[1]]

    def encode(self, text):
        """Convert a string into a list of token ids using the learned merges."""
        tokens = list(text.encode("utf-8"))
        while len(tokens) >= 2:
            stats = self.get_stats(tokens)
            # Apply the earliest-learned merge first; pairs that were never
            # learned rank as infinity.
            pair = min(stats, key=lambda p: self.merges.get(p, float("inf")))
            if pair not in self.merges:
                break  # nothing else can be merged
            tokens = self.merge(tokens, pair, self.merges[pair])
        return tokens

    def decode(self, ids):
        """Convert a list of token ids back into a string.

        Byte sequences that are not valid UTF-8 are rendered with the Unicode
        replacement character instead of raising.
        """
        raw = b"".join(self.vocab[idx] for idx in ids)
        return raw.decode("utf-8", errors="replace")
    
                

In [60]:
# tokenizer object

# Build the tokenizer over the training text, replace its byte-level vocab with one
# loaded from disk, and register id 1000 as the start-of-sentence token.
# NOTE(review): only `vocab` is restored here. `merges` starts empty in Toke.__init__ and
# encode() consults `merges` only, so unless create_final_vocab() is run, encode() returns
# raw byte ids (0-255) and ids 256-999 are never produced. Confirm whether the merges
# should be persisted and reloaded as well.
# NOTE(review): the path is absolute and machine-specific, and torch.load unpickles the
# file, so only load files from a trusted source.
otoke=Toke(text=text,final_vocab_size=1000)
otoke.vocab=torch.load('/home/bikasherl/Desktop/Week 8/vocab.pt')
otoke.vocab[1000]=b'<sos>'

In [61]:
# creating final vocab after performing BPE

# otoke.create_final_vocab()

## Training the Language Translation Model

In [62]:
# Encoder and decoder each get their own Adam optimizer at the same learning rate.
LEARNING_RATE = 0.01
encoder_optimizer = torch.optim.Adam(params=model_rnn.parameters(), lr=LEARNING_RATE)
decoder_optimizer = torch.optim.Adam(params=dec_rnn.parameters(), lr=LEARNING_RATE)

# Negative log-likelihood loss; it expects log-probabilities as its input.
criterion = nn.NLLLoss()

In [None]:
NUM_EPOCHS = 10
MAX_BATCHES_PER_EPOCH = 500  # each epoch trains on at most this many sentence pairs

train_loss = []
for epoch in range(NUM_EPOCHS):
    total_loss = 0.0
    batches_done = 0
    for data in train_dataloader:
        # batch_size is 1, so each column holds exactly one sentence.
        src_ids = otoke.encode(data['en'][0])
        tgt_ids = otoke.encode(data['ne'][0])
        if not src_ids or not tgt_ids:
            # An empty sentence yields an empty sequence the RNN cannot consume; skip it.
            continue

        # Explicit long dtype: embedding lookups require integer indices.
        src = torch.tensor(src_ids, dtype=torch.long, device=device).unsqueeze(0)
        tgt = torch.tensor(tgt_ids, dtype=torch.long, device=device).unsqueeze(0)

        encoder_optimizer.zero_grad()
        decoder_optimizer.zero_grad()

        enc_out, enc_hid = model_rnn(src)
        dec_out, dec_hidden = dec_rnn(
            encoder_outputs=enc_out,
            encoder_hidden=enc_hid,
            target_tensor=tgt,
        )

        # NLLLoss expects log-probabilities. log_softmax leaves already-normalised
        # log-probabilities unchanged, so this is safe whether dec_out holds raw
        # logits or log-probabilities.
        log_probs = torch.log_softmax(dec_out, dim=-1)
        loss = criterion(
            log_probs.reshape(-1, log_probs.size(-1)),
            tgt.reshape(-1),  # reuse the device tensor instead of re-wrapping it
        )
        loss.backward()

        encoder_optimizer.step()
        decoder_optimizer.step()

        total_loss += loss.item()
        batches_done += 1

        if batches_done >= MAX_BATCHES_PER_EPOCH:
            break

    # Average over the batches actually processed (the original ran 501 batches
    # but divided by 500); max() guards against an epoch with no usable batch.
    mean_loss = total_loss / max(batches_done, 1)
    train_loss.append(mean_loss)
    print(f"Epoch: {epoch}  Loss: {mean_loss}")

In [64]:
# Shape of the decoder output left over from the last training step that ran.
dec_out.shape

torch.Size([1, 294, 1001])

In [None]:
# Take the highest-scoring token id at every step of the last decoder output,
# drop the batch dimension, and turn the ids back into text with the tokenizer.
otoke.decode(dec_out.argmax(dim=-1).squeeze(0).tolist())