<a href="https://colab.research.google.com/github/Adityahulk/NLP_with_Pytorch_complete/blob/main/Attention_based_seq2seq_machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import math
import time
import pandas as pd
import unicodedata
import re
import tensorflow as tf
import keras
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset,DataLoader

In [None]:
# Colab-only: mount Google Drive so files under /content/drive can be read.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the Hindi-English parallel corpus CSV from the mounted Drive for inspection.
data = pd.read_csv('/content/drive/MyDrive/Hindi_English_Truncated_Corpus.csv')

In [None]:
# Display the frame; its text columns are english_sentence and hindi_sentence.
data

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।
...,...,...,...
127602,indic2012,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...
127603,ted,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।
127604,tides,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द..."
127605,tides,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .


In [None]:
def unicode_to_ascii(s):
    """Remove combining accent marks from ``s``.

    The string is decomposed with Unicode NFD normalisation and every
    character whose category is ``Mn`` (non-spacing mark) is dropped; all
    other characters are kept in order.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for ch in decomposed:
        if unicodedata.category(ch) == 'Mn':
            continue
        kept.append(ch)
    return ''.join(kept)

def preprocess_sentence(w):
    """Normalise a piece of English text.

    Steps, in order: lower-case and trim, strip accents via
    ``unicode_to_ascii``, put a space on both sides of ``? . ! , ¿``,
    collapse runs of spaces/double quotes into one space, replace every run
    of characters other than ASCII letters and ``? . ! , ¿`` with a single
    space, and finally trim surrounding whitespace.
    """
    text = unicode_to_ascii(w.lower().strip())
    substitutions = (
        (r"([?.!,¿])", r" \1 "),
        (r'[" "]+', " "),
        (r"[^a-zA-Z?.!,¿]+", " "),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def hindi_preprocess_sentence(w):
    """Return ``w`` with leading and trailing whitespace removed."""
    trimmed = w.strip()
    return trimmed

In [None]:
def create_dataset(path):
  """Read the parallel corpus at ``path`` and split it into token lists.

  Args:
    path: location of a CSV file with ``english_sentence`` and
      ``hindi_sentence`` columns.

  Returns:
    ``(english, hindi)``: two parallel lists. Each element is a list of
    string tokens for one sentence, wrapped in ``'<start>'`` and ``'<end>'``.

  Rows containing any missing value are dropped before tokenising.
  """
  # Honour the caller's path instead of a hard-coded file name.
  data = pd.read_csv(path,encoding='utf-8')
  data = data.dropna()
  english = []
  hindi = []
  for eng_raw,hin_raw in zip(data['english_sentence'],data['hindi_sentence']):
    # Clean the whole sentence first and split afterwards. Cleaning word by
    # word left tokens with embedded spaces (e.g. "child ,") and empty-string
    # tokens for words that contain no letters.
    eng_tokens = preprocess_sentence(eng_raw).split()
    hin_tokens = hindi_preprocess_sentence(hin_raw).split()
    # The markers are added after cleaning so '<' and '>' are not stripped.
    english.append(['<start>'] + eng_tokens + ['<end>'])
    hindi.append(['<start>'] + hin_tokens + ['<end>'])

  return english,hindi

In [None]:
# Location of the corpus CSV on the mounted Drive.
path = '/content/drive/MyDrive/Hindi_English_Truncated_Corpus.csv'
# create_dataset(path) is not called directly here; gendataframe() invokes it.

In [None]:
def tokenizing(lang):
  """Fit a Keras tokenizer on ``lang`` and encode it as padded id sequences.

  Args:
    lang: the texts to index, passed unchanged to ``fit_on_texts`` and
      ``texts_to_sequences``.

  Returns:
    ``(tensor, lang_tokenizer)``: the id sequences padded at the end
    (``padding='post'``) and the fitted tokenizer. The tokenizer is created
    with ``filters=''`` so it does not strip any characters itself.
  """
  keras_text = tf.keras.preprocessing.text
  keras_sequence = tf.keras.preprocessing.sequence

  lang_tokenizer = keras_text.Tokenizer(filters='')
  lang_tokenizer.fit_on_texts(lang)
  id_sequences = lang_tokenizer.texts_to_sequences(lang)
  padded = keras_sequence.pad_sequences(id_sequences,padding='post')
  return padded,lang_tokenizer

In [None]:
def gendataframe(path):
  """Build padded id tensors and tokenizers for both languages.

  Calls ``create_dataset(path)`` and then ``tokenizing`` on each side.

  Returns:
    ``(input_tensor, target_tensor, input_tokenizer, target_tokenizer)``
    where "input" is the English side and "target" the Hindi side.
  """
  source_sentences,target_sentences = create_dataset(path)
  source_pair = tokenizing(source_sentences)
  target_pair = tokenizing(target_sentences)
  return source_pair[0],target_pair[0],source_pair[1],target_pair[1]

In [None]:
# Build the padded English/Hindi id tensors and their fitted tokenizers from the corpus.
input_tensor,target_tensor,input_tokenizer,target_tokenizer = gendataframe(path)

In [None]:
# Shape of a 128-row slice: (rows, padded sequence length).
input_tensor[:128].shape

(128, 400)

In [None]:
# Peek at the first 100 encoded sentences of each language; trailing zeros are padding.
print(input_tensor[:100])
print(target_tensor[:100])

[[   2 2855   73 ...    0    0    0]
 [   2 1878   68 ...    0    0    0]
 [   2   17 2888 ...    0    0    0]
 ...
 [   2   36   14 ...    0    0    0]
 [   2    1  274 ...    0    0    0]
 [   2   33   13 ...    0    0    0]]
[[    1 10130     3 ...     0     0     0]
 [    1  1132   169 ...     0     0     0]
 [    1    18   242 ...     0     0     0]
 ...
 [    1   777   161 ...     0     0     0]
 [    1   451   605 ...     0     0     0]
 [    1   166   792 ...     0     0     0]]


In [None]:
# The vocabulary has tens of thousands of entries; printing the whole dict
# floods the notebook, so show only the first 20 (word, id) pairs.
print(list(input_tokenizer.word_index.items())[:20])



In [None]:
# +1 leaves room for id 0, which the padded tensors use as the padding value
# (presumably word_index ids start at 1 — confirm against the Keras Tokenizer docs).
input_vocab_len = len(input_tokenizer.word_index) + 1
print(input_vocab_len)

89694


In [None]:
# Same convention for the Hindi side: vocabulary size plus one slot for padding id 0.
target_vocab_len = len(target_tokenizer.word_index) + 1
print(target_vocab_len)

93810


In [None]:
# Scratch check: look up the first padded sentence in a 2048-wide embedding table.
embeddings = nn.Embedding(input_vocab_len,2048)
embed_1 = embeddings(torch.tensor(input_tensor[0]))
embed_1

tensor([[-0.7427,  1.1952,  0.1166,  ...,  2.5348,  1.3451, -1.4559],
        [ 0.6862, -0.5075,  0.2603,  ..., -1.6160,  0.4806, -0.1604],
        [-1.4869,  1.1392,  0.1450,  ..., -1.4100,  1.6287, -1.3893],
        ...,
        [-0.5525, -0.4206,  0.0864,  ..., -1.4045, -0.3217, -0.4853],
        [-0.5525, -0.4206,  0.0864,  ..., -1.4045, -0.3217, -0.4853],
        [-0.5525, -0.4206,  0.0864,  ..., -1.4045, -0.3217, -0.4853]],
       grad_fn=<EmbeddingBackward>)

In [None]:
# One embedding vector per (padded) token position.
print(embed_1.shape)

torch.Size([400, 2048])


In [None]:
# Embed the first 128 sentences at once to get a batch for the RNN experiments below.
embed_batch = embeddings(torch.tensor(input_tensor[:128]))
print(embed_batch)

tensor([[[-0.7427,  1.1952,  0.1166,  ...,  2.5348,  1.3451, -1.4559],
         [ 0.6862, -0.5075,  0.2603,  ..., -1.6160,  0.4806, -0.1604],
         [-1.4869,  1.1392,  0.1450,  ..., -1.4100,  1.6287, -1.3893],
         ...,
         [-0.5525, -0.4206,  0.0864,  ..., -1.4045, -0.3217, -0.4853],
         [-0.5525, -0.4206,  0.0864,  ..., -1.4045, -0.3217, -0.4853],
         [-0.5525, -0.4206,  0.0864,  ..., -1.4045, -0.3217, -0.4853]],

        [[-0.7427,  1.1952,  0.1166,  ...,  2.5348,  1.3451, -1.4559],
         [-0.0810,  0.9095, -0.0326,  ..., -0.1102, -0.3427,  0.1463],
         [ 0.6597,  1.6514, -0.0924,  ..., -0.4302,  0.9000,  0.9494],
         ...,
         [-0.5525, -0.4206,  0.0864,  ..., -1.4045, -0.3217, -0.4853],
         [-0.5525, -0.4206,  0.0864,  ..., -1.4045, -0.3217, -0.4853],
         [-0.5525, -0.4206,  0.0864,  ..., -1.4045, -0.3217, -0.4853]],

        [[-0.7427,  1.1952,  0.1166,  ...,  2.5348,  1.3451, -1.4559],
         [ 0.9851, -0.7851, -0.0913,  ..., -0

In [None]:
# (batch, sequence length, embedding width)
print(embed_batch.shape)

torch.Size([128, 400, 2048])


In [None]:
# Scratch check: run a 4-layer bidirectional RNN (hidden width 512) over the embedded batch.
rnn = nn.RNN(2048,512,4,batch_first=True,bidirectional=True)
output,hidden = rnn(embed_batch)
print(hidden)

tensor([[[ 0.9733,  0.2852,  0.4325,  ...,  0.7035,  0.7214, -0.5460],
         [ 0.9733,  0.2852,  0.4325,  ...,  0.7035,  0.7214, -0.5460],
         [ 0.9733,  0.2852,  0.4325,  ...,  0.7035,  0.7214, -0.5460],
         ...,
         [ 0.9733,  0.2852,  0.4325,  ...,  0.7035,  0.7214, -0.5460],
         [ 0.9733,  0.2852,  0.4325,  ...,  0.7035,  0.7214, -0.5460],
         [ 0.9733,  0.2852,  0.4325,  ...,  0.7035,  0.7214, -0.5460]],

        [[-0.6578, -0.9270,  0.9763,  ..., -0.9876,  0.9252,  0.8848],
         [ 0.2174, -0.9381,  0.9901,  ..., -0.9677,  0.9805,  0.8472],
         [-0.2580, -0.9064,  0.9707,  ..., -0.9111,  0.9863,  0.8765],
         ...,
         [-0.4728, -0.8913,  0.9628,  ..., -0.9484,  0.8761,  0.9131],
         [ 0.6534, -0.4996,  0.9915,  ..., -0.9558,  0.8654,  0.2685],
         [-0.5815, -0.4581,  0.9823,  ..., -0.6813,  0.9010,  0.9557]],

        [[ 0.5742,  0.0542, -0.4636,  ...,  0.2539, -0.4773,  0.8297],
         [ 0.5742,  0.0542, -0.4636,  ...,  0

In [None]:
# Final hidden states: one slice per layer and direction (4 layers x 2 directions).
print(hidden.shape)

torch.Size([8, 128, 512])


In [None]:
# Per-position outputs; the last dimension holds both directions (2 x 512).
print(output.shape)

torch.Size([128, 400, 1024])


In [None]:
# Project the concatenated forward/backward features (1024) back down to 512.
li = nn.Linear(1024,512)
output_new = li(output)
print(output_new.shape)

torch.Size([128, 400, 512])


In [None]:
# nn.RNN stores h_n layer by layer with the two directions of a layer next to
# each other (layer0-fwd, layer0-bwd, layer1-fwd, ...). Even rows are therefore
# forward states and odd rows backward states; hidden[:4] + hidden[4:8] would
# add states from different layers. The result gets its own name so this cell
# can be re-run without changing its input.
hidden_merged = hidden[0::2] + hidden[1::2]
print(hidden_merged.shape)

torch.Size([4, 128, 512])


In [None]:
class Encoder(nn.Module):
  """Embedding + multi-layer RNN encoder for the attention seq2seq model.

  Args:
    input_dim: number of rows in the embedding table (source vocabulary size).
    emb_dim: width of each embedding vector.
    hidden_size: hidden width of the RNN per direction.
    n_layers: number of stacked RNN layers.
    bidirect: whether the RNN runs in both directions.
    dropout: dropout probability passed to ``nn.RNN``.

  ``forward(src)`` takes a batch-first tensor of token ids and returns
  ``(outputs, hidden)``: ``outputs`` is ``(batch, src_len, hidden_size)``
  after projecting the direction-concatenated RNN outputs, and ``hidden`` is
  ``(n_layers, batch, hidden_size)`` with the directions of each layer summed.
  """
  def __init__(self,input_dim,emb_dim,hidden_size,n_layers,bidirect,dropout):
    super().__init__()

    self.n_layers = n_layers
    self.num_directions = 2 if bidirect else 1

    self.Embeddings = nn.Embedding(input_dim,emb_dim)
    self.rnn = nn.RNN(emb_dim,hidden_size,n_layers,batch_first=True,bidirectional=bidirect,dropout=dropout)
    # The RNN output width depends on the number of directions.
    self.fc = nn.Linear(self.num_directions*hidden_size,hidden_size)

  def forward(self,src):
    embeddings = self.Embeddings(src)
    outputs,hidden_state = self.rnn(embeddings)
    # h_n is (n_layers * num_directions, batch, hidden) with the directions of
    # a layer adjacent to each other. Split that axis and sum over directions
    # so every layer keeps its own state, for any n_layers.
    hidden = hidden_state.reshape(
        self.n_layers,self.num_directions,hidden_state.shape[1],hidden_state.shape[2]
    ).sum(dim=1)
    outputs = self.fc(outputs)
    return outputs,hidden


In [None]:
# Instantiate a 4-layer bidirectional encoder (embedding 2048, hidden 512, dropout 0.2)
# and run it on the first 128 source sentences.
encoder = Encoder(input_vocab_len,2048,512,4,True,0.20)
outputs,hidden = encoder(torch.tensor(input_tensor[:128]))
print(outputs.shape)
print(hidden.shape)

torch.Size([128, 400, 512])
torch.Size([4, 128, 512])


In [None]:
# Collapse the layer axis by summing, leaving one vector per sentence.
hidden_cat = torch.sum(hidden,0)
print(hidden_cat)
print(hidden_cat.shape)

tensor([[-3.0830,  2.4546, -2.4386,  ..., -1.7209, -0.1161, -0.5512],
        [-2.0166,  1.7651, -2.9808,  ..., -1.6417, -0.4452, -1.1927],
        [-1.6189,  1.3545, -3.8942,  ..., -3.7019,  1.5563, -0.9969],
        ...,
        [-1.2521, -1.0939, -2.4167,  ..., -1.1293,  0.1871, -0.8759],
        [-2.4712,  2.4681, -3.2417,  ..., -1.7356, -0.5111, -0.1998],
        [-1.5783, -0.3973, -2.6224,  ..., -1.8825, -0.4643, -1.9197]],
       grad_fn=<SumBackward1>)
torch.Size([128, 512])


In [None]:
# Tile that vector once per source position: the new leading axis has length outputs.shape[1].
hidden_cat = hidden_cat.repeat(outputs.shape[1],1,1)
print(hidden_cat.shape)

torch.Size([400, 128, 512])


In [None]:
# Swap the first two axes so the layout is (batch, src_len, hidden), matching
# `outputs`. permute() moves the axes; reshape() gives the same shape but only
# reinterprets the memory, which mixes vectors of different sentences.
hidden_cat = hidden_cat.permute(1,0,2)
print(hidden_cat.shape)

torch.Size([128, 400, 512])


In [None]:
# Pair every encoder output with the tiled hidden vector along the feature axis.
hidden_vec = torch.cat((hidden_cat,outputs),dim=2)
print(hidden_vec.shape)

torch.Size([128, 400, 1024])


In [None]:
# Score each (hidden, encoder output) pair with a single linear unit.
li1 = nn.Linear(2*512,1)
energy = li1(hidden_vec)
print(energy.shape)

torch.Size([128, 400, 1])


In [None]:
# Normalise over the source positions (dim=1 of (batch, src_len, 1)) so the
# weights of each sentence sum to 1. dim=0 would normalise across the batch,
# making a sentence's attention depend on the other sentences in the batch.
softy = nn.Softmax(dim=1)
attention = softy(energy)
print(attention.shape)

torch.Size([128, 400, 1])


In [None]:
# Move the source-position axis last so bmm can contract it against `outputs`.
attention = attention.permute(0,2,1)

In [None]:
# Attention-weighted sum of the encoder outputs: one context vector per sentence.
cntxt_vec = torch.bmm(attention,outputs)
print(cntxt_vec.permute(1,0,2).shape)

torch.Size([1, 128, 512])


In [None]:
class Decoder(nn.Module):
  """Single-step attention decoder.

  Args:
    input_dim: number of rows in the target embedding table.
    emb_dim: width of each embedding vector.
    hidden_size: hidden width of the RNN and of the encoder outputs.
    output_size: number of output classes (target vocabulary size).
    n_layers: number of stacked RNN layers.
    dropout: dropout probability passed to ``nn.RNN``.

  ``forward(input, outputs, hidden)`` consumes one token id per sentence
  (``input``: ``(batch,)``), the encoder outputs
  (``outputs``: ``(batch, src_len, hidden_size)``) and the current RNN state
  (``hidden``: ``(n_layers, batch, hidden_size)``). It returns
  ``(decoder_output, hidden)`` where ``decoder_output`` is
  ``(batch, output_size)`` scores and ``hidden`` is the updated state.
  """
  def __init__(self,input_dim,emb_dim,hidden_size,output_size,n_layers,dropout):
    super().__init__()

    self.Embeddings = nn.Embedding(input_dim,emb_dim)
    self.rnn = nn.RNN(emb_dim+hidden_size,hidden_size,n_layers,dropout=dropout)
    self.fc = nn.Linear(hidden_size,output_size)
    self.energy = nn.Linear(hidden_size*2,1)
    # Energies are (batch, src_len, 1). Normalise over the source positions so
    # every sentence gets its own distribution; dim=0 would mix the batch.
    self.softmax = nn.Softmax(dim=1)

  def forward(self,input,outputs,hidden):
    # as_tensor accepts tensors, arrays and lists without the copy-construct
    # warning that torch.tensor(tensor) raises; embeddings need integer ids.
    input = torch.as_tensor(input,dtype=torch.long,device=self.Embeddings.weight.device)
    input = input.unsqueeze(0)                                  # (1, batch)
    embeddings = self.Embeddings(input)                         # (1, batch, emb_dim)

    # One query per sentence: sum the state over layers, then repeat it for
    # every source position.
    src_len = outputs.shape[1]
    query = torch.sum(hidden,0)                                 # (batch, hidden)
    query = query.unsqueeze(1).expand(-1,src_len,-1)            # (batch, src_len, hidden)
    energy = self.energy(torch.cat((query,outputs),dim=2))      # (batch, src_len, 1)
    attention = self.softmax(energy)

    attention = attention.permute(0,2,1)                        # (batch, 1, src_len)
    context_vector = torch.bmm(attention,outputs)               # (batch, 1, hidden)
    context_vector = context_vector.permute(1,0,2)              # (1, batch, hidden)
    rnn_input = torch.cat((embeddings,context_vector),dim=2)

    decoder_output,hidden = self.rnn(rnn_input,hidden)
    decoder_output = self.fc(decoder_output)
    decoder_output = decoder_output.squeeze(0)
    return decoder_output,hidden

In [None]:
# 4-layer decoder whose embedding table and output layer both span the target vocabulary.
decoder = Decoder(len(target_tokenizer.word_index)+1,2048,512,len(target_tokenizer.word_index)+1,4,0.20)

In [None]:
# target_tensor is a numpy array; convert the slice to a torch tensor because
# the following cells call Tensor.permute on it, which numpy arrays lack.
tar = torch.tensor(target_tensor[:128])

In [None]:
# Swap the two axes so indexing tar[t] selects one time step across the whole batch.
tar = tar.permute(1,0)

In [None]:
# Last row after the swap; it is all zeros (padding) in the recorded run.
tar[-1]

tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0], dtype=torch.int32)

In [None]:
# `tary` is never defined in this notebook; the transposed target batch is
# `tar`, so feed its third time step to the decoder.
words,hidden = decoder(tar[2],outputs,hidden)

  if sys.path[0] == '':


torch.Size([1, 128, 512])
torch.Size([1, 128, 2048])


In [None]:
# Scores per sentence, the arg-max token id per sentence, and the input step
# for comparison (`tar`, not the undefined `tary`).
print(words.shape)
print(words.argmax(1).shape)
print(tar[2].shape)

torch.Size([128, 93810])
torch.Size([128])
torch.Size([128])


In [None]:
# Target vocabulary size (plus the padding slot); compare with the last dimension of `words`.
print(len(target_tokenizer.word_index)+1)

93810


In [None]:
# Shape of the RNN state returned by the decoder step.
print(hidden.shape)

torch.Size([4, 128, 512])


In [None]:
class seq_2_seq(nn.Module):
  """Encoder-decoder wrapper that decodes one target position at a time.

  Args:
    encoder: module returning ``(encoder_outputs, hidden)`` for a batch-first
      source batch.
    decoder: module called as ``decoder(token_ids, encoder_outputs, hidden)``
      and returning ``(scores, hidden)``; its ``fc`` layer defines the
      vocabulary size of the result.

  ``forward(src, tar, teacher_force_ratio)`` takes batch-first id tensors and
  returns scores of shape ``(target_len, batch, vocab)``. Row 0 stays zero
  because decoding starts from the first target token and predicts from
  position 1 onwards.
  """
  def __init__(self,encoder,decoder):
    super().__init__()

    self.encoder = encoder
    self.decoder = decoder

  def forward(self,src,tar,teacher_force_ratio=0.5):
    batch_size = src.shape[0]
    target_len = tar.shape[1]
    # Take the vocabulary size from the decoder itself rather than from a
    # notebook-level tokenizer, and allocate on the same device as the input.
    target_vocab_len = self.decoder.fc.out_features
    outputs = torch.zeros(target_len,batch_size,target_vocab_len,device=src.device)

    encoder_outputs,hidden = self.encoder(src)
    tar = tar.permute(1,0)          # time-major: tar[i] is step i for the whole batch
    x = tar[0]

    for i in range(1,target_len):
      # Use the decoder owned by this module, not a global of the same name.
      decoder_output,hidden = self.decoder(x,encoder_outputs,hidden)
      outputs[i] = decoder_output

      prediction = decoder_output.argmax(1)

      # Teacher forcing: with probability teacher_force_ratio feed the ground
      # truth token for this step, otherwise the model's own prediction.
      x = tar[i] if random.random() < teacher_force_ratio else prediction

    return outputs


In [None]:
# Training hyper-parameters.
n_epochs = 10
learning_rate = 0.001
batch_size = 8

# Model configuration.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
input_vocab_len = len(input_tokenizer.word_index) + 1
output_vocab_len = len(target_tokenizer.word_index) + 1
input_decoder = output_vocab_len
embedding_dim = 256
hidden_size = 512
n_layers = 2
dropout = 0.35
# steps_per_epoch is intentionally not set here: it needs input_tensor_train,
# which does not exist until the train/validation split cell has run, so
# computing it in this cell raised NameError on a fresh kernel. Derive it
# after the split as len(input_tensor_train)//batch_size.

In [None]:
# Rebuild encoder and decoder with the training configuration (replaces the scratch instances above).
encoder = Encoder(input_vocab_len,embedding_dim,hidden_size,n_layers,True,dropout)
decoder = Decoder(input_decoder,embedding_dim,hidden_size,output_vocab_len,n_layers,dropout)

In [None]:
# Wrap both halves in the step-by-step seq2seq module.
model = seq_2_seq(encoder,decoder)

In [None]:
# Smoke test: push two sentence pairs through the full model.
src = torch.tensor(input_tensor[:2])
tar = torch.tensor(target_tensor[:2])
outputs = model(src,tar)

  if sys.path[0] == '':


In [None]:
# The first row is all zeros because the decoding loop starts writing at position 1.
print(outputs)

tensor([[[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000]],

        [[ 0.2812, -0.6114,  0.3504,  ...,  0.4416,  0.0972, -0.5868],
         [ 0.1191, -0.8078, -0.0755,  ...,  0.2842,  0.1937, -0.2748]],

        [[ 0.1259, -1.0468,  0.3596,  ...,  0.2392,  0.0182, -0.3960],
         [-0.0224, -1.1669,  0.4714,  ...,  0.1018,  0.1743, -0.3491]],

        ...,

        [[-0.2900, -0.9281,  0.4020,  ...,  0.0326,  0.1680, -0.0788],
         [ 0.1804, -1.0019,  0.7604,  ...,  0.0589,  0.6097,  0.0832]],

        [[ 0.4793, -0.9742,  0.5191,  ...,  0.3913,  0.4934, -0.5893],
         [ 0.2661, -0.9038,  0.3454,  ..., -0.0282, -0.1716, -0.0650]],

        [[ 0.1802, -1.1157,  0.8896,  ...,  0.1941,  0.1563,  0.0821],
         [ 0.5315, -0.6418,  0.3557,  ...,  0.2844,  0.3202, -0.4553]]],
       grad_fn=<CopySlices>)


In [None]:
# outputs is (target_len, batch, vocab) while tar is still (batch, target_len).
print(outputs.shape)
print(tar.shape)

torch.Size([420, 2, 93810])
torch.Size([2, 420])


In [None]:
# Swap the axes of tar so its leading dimensions line up with those of outputs.
tar = tar.permute(1,0)
print(tar.shape)

torch.Size([420, 2])


In [None]:
# Same device selection as in the hyper-parameter cell.
# NOTE(review): no visible cell moves the model or the tensors to this device — confirm.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# ignore_index=0 makes the loss skip targets equal to 0, the padding id.
criterion = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(),lr=learning_rate)

In [None]:
# Hold out 20% of the sentence pairs for validation. The seed is fixed so the
# split, and anything trained on it, is reproducible across kernel restarts.
SPLIT_SEED = 42
input_tensor_train, input_tensor_val, target_tensor_train, target_tensor_val = train_test_split(input_tensor, target_tensor, test_size=0.2, random_state=SPLIT_SEED)
print(len(input_tensor_train), len(target_tensor_train), len(input_tensor_val), len(target_tensor_val))

102084 102084 25521 25521


In [None]:
# Shuffle with a buffer as large as the training set, then group the pairs into
# batches of batch_size; drop_remainder=True discards a final partial batch.
BUFFER_SIZE = len(input_tensor_train)
dataset = tf.data.Dataset.from_tensor_slices((input_tensor_train, target_tensor_train)).shuffle(BUFFER_SIZE)
dataset = dataset.batch(batch_size, drop_remainder=True)

In [None]:
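# Draft training loop, kept commented out because a full run is very expensive.
# Corrected relative to the first draft: the loop variable is `epoch` (it was
# `epochs` but printed as `epoch`), the running loss is a float (it was a list
# added to a float), gradients are cleared before the forward pass, and logits
# and labels are flattened to the (N, C) / (N,) layout CrossEntropyLoss expects.
# Iterating `dataset` directly yields len(input_tensor_train)//batch_size
# batches because it was built with drop_remainder=True.
#
#for epoch in range(n_epochs):
#  loss_per_epoch = 0.0
#  for (batch,(src,tar)) in enumerate(dataset):
#    src = torch.tensor(np.array(src))
#    tar = torch.tensor(np.array(tar)).long()
#    optimizer.zero_grad()
#    output = model(src,tar)                            # (target_len, batch, vocab)
#    # Position 0 is never predicted, so leave it out of the loss.
#    logits = output[1:].reshape(-1, output.shape[-1])  # (N, vocab)
#    labels = tar.permute(1,0)[1:].reshape(-1)          # (N,)
#    loss = criterion(logits, labels)
#    loss.backward()
#    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
#    optimizer.step()
#    loss_per_epoch = loss_per_epoch + loss.item()
#
#  print("Epoch:-",epoch,' Loss:-',loss_per_epoch)
#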