

## **Seq2Seq Model with Attention**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import csv
import numpy as np
import sys
import re
import string
import operator
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
# Install Indic NLP Library
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git
!pip install Morfessor
# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"

sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))

from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

from indicnlp import loader
loader.load()

from indicnlp.tokenize import indic_detokenize, indic_tokenize

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1325, done.[K
remote: Counting objects: 100% (147/147), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178[K
Receiving objects: 100% (1325/1325), 9.57 MiB | 11.21 MiB/s, done.
Resolving deltas: 100% (688/688), done.
Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (133/133), 149.77 MiB | 32.96 MiB/s, done.
Resolving deltas: 100% (51/51), done.
Collecting Morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


In [4]:
# Install NLTK
import nltk
nltk.download('punkt')

from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


### **Read the Dataset**

In [5]:
def read_dataset(filepath):

  hindi_list=[]
  english_list=[]

  with open(filepath, 'r') as f:
      reader = csv.DictReader(f)
      for line in reader:
          hindi_list.append(line['hindi'])
          english_list.append(line['english'])

  return hindi_list, english_list



### **Train and Test Split**

In [6]:
def train_test_split(hindi_list, english_list, train_size=0.8):

  indices_list = np.arange(len(hindi_list))
  np.random.shuffle(indices_list)
  
  train_indices_list = indices_list[:int(len(indices_list)*train_size)]
  test_indices_list = indices_list[int(len(indices_list)*train_size):]

  hindi_sentence_list = [hindi_list[i] for i in train_indices_list]
  english_sentence_list = [english_list[i] for i in train_indices_list]

  hindi_test_sentence_list = [hindi_list[i] for i in test_indices_list]
  english_test_sentence_list = [english_list[i] for i in test_indices_list]

  return hindi_sentence_list, english_sentence_list, hindi_test_sentence_list, english_test_sentence_list


In [7]:
# Read from stored files to load the model
def load_train_test():
  with open('/content/drive/MyDrive/Colab Notebooks/FinalRound7/hindi_train.txt','r') as f:
    hindi_sentence_list = f.read().splitlines()

  with open('/content/drive/MyDrive/Colab Notebooks/FinalRound7/english_train.txt','r') as f:
    english_sentence_list = f.read().splitlines()

  with open('/content/drive/MyDrive/Colab Notebooks/FinalRound7/english_test.txt','r') as f:
    english_test_sentence_list = f.read().splitlines()

  with open('/content/drive/MyDrive/Colab Notebooks/FinalRound7/hindi_test.txt','r') as f:
    hindi_test_sentence_list = f.read().splitlines()

  return hindi_sentence_list, english_sentence_list, hindi_test_sentence_list, english_test_sentence_list

### **Tokenize Hindi sentences**

In [8]:
# Detokenize
# Ref: https://colab.research.google.com/drive/1p3oGPcNdORw5_MDcufTDYWJhJt3XVPuC?usp=sharing#scrollTo=GU6E07Yw5zvl

def detokenize(sent_list, lang='hi'):  # pass hindi_sentence_list and lang='hi'

  for i in range(len(sent_list)):
    sent_list[i] = indic_detokenize.trivial_detokenize(sent_list[i],lang)

  return sent_list

In [9]:
 string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
#Ref for unicode chart: https://www.ssec.wisc.edu/~tomw/java/unicode.html#x0900 

def hindi_tokenize(hindi_sentence_list): 

  hindi_word_to_count={}
  hindi_word_to_index={'UNK':0, 'PAD':1, 'SOS':2, 'EOS':3}
  hindi_index_to_word={0:'UNK', 1:'PAD', 2:'SOS', 3:'EOS'}
  count=4
  for sent in hindi_sentence_list:
    for t in indic_tokenize.trivial_tokenize(sent): 
      x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
      for elem in x:
        hindi_word_to_count[elem] = hindi_word_to_count.get(elem,0)+1
        if hindi_word_to_index.get(elem) is None and hindi_word_to_count.get(elem,0) >= 2:
          hindi_word_to_index[elem] = count
          hindi_index_to_word[count] = elem
          count+=1
  return hindi_word_to_index, hindi_index_to_word

### **Tokenize English sentences**

In [11]:
def english_tokenize(english_sentence_list):

  english_word_to_count={}
  english_word_to_index={'UNK':0, 'PAD':1, 'SOS':2, 'EOS':3}
  english_index_to_word={0:'UNK', 1:'PAD', 2:'SOS', 3:'EOS'}
  count=4

  for sent in english_sentence_list:
    for token in word_tokenize(sent.lower()):
      temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token)
      for elem in temp:
        english_word_to_count[elem] = english_word_to_count.get(elem,0)+1
        if english_word_to_index.get(elem) is None and english_word_to_count.get(elem,0) >= 2:
          english_word_to_index[elem] = count
          english_index_to_word[count] = elem
          count+=1
  return english_word_to_index, english_index_to_word

### **Store the train and test set**

In [12]:
def store_lists(hindi_sentence_list, english_sentence_list, hindi_test_sentence_list, english_test_sentence_list):

  with open('/content/drive/MyDrive/Colab Notebooks/FinalRound7/hindi_train.txt','w') as f:
    for sent in hindi_sentence_list:
      f.write(sent+'\n')

  with open('/content/drive/MyDrive/Colab Notebooks/FinalRound7/english_train.txt','w') as f:
    for sent in english_sentence_list:
      f.write(sent+'\n')
  
  with open('/content/drive/MyDrive/Colab Notebooks/FinalRound7/hindi_test.txt','w') as f:
    for sent in hindi_test_sentence_list:
      f.write(sent+'\n')

  with open('/content/drive/MyDrive/Colab Notebooks/FinalRound7/english_test.txt','w') as f:
    for sent in english_test_sentence_list:
      f.write(sent+'\n')

### **Find Sentences Length and fix the maximum length**


In [13]:
def get_max_len(hindi_sentence_list, english_sentence_list, min_frequency=1000):

  # check for hindi sentences
  sent_len_count={}
  for sent in hindi_sentence_list:
    sent_len=0
    for t in indic_tokenize.trivial_tokenize(sent): 
      x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
      for elem in x:
        sent_len+=1
    sent_len_count[sent_len] = sent_len_count.get(sent_len,0)+1

    # sort the dictionary based on their counts
  sorted_counts = sorted(sent_len_count.items(), key=operator.itemgetter(1), reverse=True)

  index=0
  for pair in sorted_counts:
    if pair[1]>min_frequency:
      index+=1
    else:
      break

  max_hindi_len=0
  for pair in sorted_counts[:index]:
    if pair[0]>max_hindi_len:
      max_hindi_len=pair[0]


  # Now check for english sentences
  english_sent_len_count={}
  for sent in english_sentence_list:
    sent_len=0
    for token in word_tokenize(sent.lower()): 
      temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token)
      for elem in temp:
        sent_len+=1
    english_sent_len_count[sent_len] = english_sent_len_count.get(sent_len,0)+1

  # sort the dictionary based on their counts
  english_sorted_counts = sorted(english_sent_len_count.items(), key=operator.itemgetter(1), reverse=True)

  index=0
  for pair in english_sorted_counts:
    if pair[1]>min_frequency:
      index+=1
    else:
      break

  max_english_len=0
  for pair in english_sorted_counts[:index]:
    if pair[0]>max_english_len:
      max_english_len=pair[0]

  max_len = max(max_hindi_len, max_english_len)
  
  return max_len


### **Filter out sentences with length less than or equal to maximum length**

In [14]:
def filter_sentences(hindi_sentence_list, english_sentence_list, max_len):

  english_filtered_sent_list=[]
  hindi_filtered_sent_list=[]
  filtered_sent_pair_list=[]
  for hin_sent, eng_sent in zip(hindi_sentence_list, english_sentence_list):
    hin_sent_len=0
    for t in indic_tokenize.trivial_tokenize(hin_sent): 
      x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
      for elem in x:
        hin_sent_len+=1

    eng_sent_len=0
    for token in word_tokenize(eng_sent.lower()): 
      temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token)
      for elem in temp:
        eng_sent_len+=1

    if hin_sent_len>=1 and hin_sent_len<=max_len and eng_sent_len>=1 and eng_sent_len<=max_len:
      english_filtered_sent_list.append(eng_sent)
      hindi_filtered_sent_list.append(hin_sent)
      filtered_sent_pair_list.append([hin_sent, eng_sent])

  return english_filtered_sent_list, hindi_filtered_sent_list

### **Form List of list of indexes from the filtered sentences**

In [15]:


def make_tensors( hindi_filtered_sent_list, hindi_test_sentence_list, english_filtered_sent_list,  english_test_sentence_list):
  # Create tensor array for each hindi sentence 
  hindi_list_indices = torch.tensor([[1]*(max_len+2)]*len(hindi_filtered_sent_list), dtype=torch.long)

  i=0
  for sent in hindi_filtered_sent_list:
    hindi_list_indices[i][0]=2  #SOS
    j=1
    for t in indic_tokenize.trivial_tokenize(sent): 
      x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
      for elem in x:
        if hindi_word_to_index.get(elem) is None:
          #continue
          hindi_list_indices[i][j] = 0  # UNK
        else:
          hindi_list_indices[i][j] = hindi_word_to_index.get(elem)
        j+=1
    hindi_list_indices[i][j]=3  #EOS
    j+=1
    while j<=(max_len+1):
      hindi_list_indices[i][j] = 1  #PAD
      j+=1

    i+=1

  # Create tensor array for each hindi test sentence
  hindi_test_list_indices = torch.tensor([[1]*(max_len+2)]*len(hindi_test_sentence_list), dtype=torch.long)

  i=0
  for sent in hindi_test_sentence_list:
    hindi_test_list_indices[i][0]=2  #SOS
    j=1
    for t in indic_tokenize.trivial_tokenize(sent): 
      x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
      for elem in x:
        if j>(max_len):
          break
        if hindi_word_to_index.get(elem) is None:
          #continue
          hindi_test_list_indices[i][j] = 0  # UNK
        else:
          hindi_test_list_indices[i][j] = hindi_word_to_index.get(elem)
        j+=1
    hindi_test_list_indices[i][j]=3  #EOS
    j+=1
    while j<=(max_len+1):
      hindi_test_list_indices[i][j] = 1  #PAD
      j+=1

    i+=1

  # Create tensor array for each english sentence

  english_list_indices = torch.tensor([[1]*(max_len+2)]*len(english_filtered_sent_list), dtype=torch.long)

  i=0
  for sent in english_filtered_sent_list:
    english_list_indices[i][0]=2  #SOS
    j=1
    for token in word_tokenize(sent.lower()):
      temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token)
      for elem in temp:
        if english_word_to_index.get(elem) is None:
          #continue
          english_list_indices[i][j] = 0  # UNK
        else:
          english_list_indices[i][j] = english_word_to_index.get(elem)
        j+=1
    english_list_indices[i][j]=3  #EOS
    j+=1
    while j<=(max_len+1):
      english_list_indices[i][j] = 1  #PAD
      j+=1
    
    i+=1

  # Create tensor array for each english test sentence
  english_test_list_indices = torch.tensor([[1]*(max_len+2)]*len(english_test_sentence_list), dtype=torch.long)

  i=0
  for sent in english_test_sentence_list:
    english_test_list_indices[i][0]=2  #SOS
    j=1
    for token in word_tokenize(sent.lower()):
      temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token)
      for elem in temp:
        if j>(max_len):
          break
        if english_word_to_index.get(elem) is None:
          #continue
          english_test_list_indices[i][j] = 0  # UNK
        else:
          english_test_list_indices[i][j] = english_word_to_index.get(elem)
        j+=1
    english_test_list_indices[i][j]=3  #EOS
    j+=1
    while j<=(max_len+1):
      english_test_list_indices[i][j] = 1  #PAD
      j+=1
    
    i+=1

  return hindi_list_indices, hindi_test_list_indices, english_list_indices, english_test_list_indices

#### **Use GPU**

In [16]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu" 
print(device)

cuda


### **References:** 
### http://ethen8181.github.io/machine-learning/deep_learning/seq2seq/2_torch_seq2seq_attention.html 
### https://arxiv.org/abs/1409.0473

## **Sequence to Sequence Model with Attention**

In [17]:
class Encoder(nn.Module):

  def __init__(self, input_size, embedding_size, hidden_dimension, num_layers, dropout):  
    super().__init__()
    
    self.input_size=input_size
    self.dropout=dropout
    self.hidden_dimension = hidden_dimension
    self.num_layers = num_layers

    self.embedding = nn.Embedding(input_size, embedding_size)

    self.rnn = nn.GRU(embedding_size, hidden_dimension,num_layers, dropout=dropout, bidirectional=True)

    self.final_layer = nn.Linear(hidden_dimension * 2, hidden_dimension)

  def forward(self, word_inputs):
 
    embedded = self.embedding(word_inputs)

    outputs, hidden = self.rnn(embedded)

    #outputs shape = [sequence_length, batch_size, hidden_dimension*num_of_directions]

    x = torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)
    hidden = torch.tanh(self.final_layer(x))

    return outputs, hidden

In [18]:
class Attention(nn.Module):

    def __init__(self, hidden_dimension):
        super().__init__()
        self.hidden_dimension = hidden_dimension

        self.final_layer1 = nn.Linear(hidden_dimension * 2 + hidden_dimension, hidden_dimension)
        self.final_layer2 = nn.Linear(hidden_dimension, 1, bias=False)

    def forward(self, encoder_outputs, hidden):
        sequence_length = encoder_outputs.shape[0]
        batch_size = encoder_outputs.shape[1]
        
        hidden = hidden.unsqueeze(1).repeat(1, sequence_length, 1)

        outputs = encoder_outputs.permute(1, 0, 2)

        x = torch.cat((hidden, outputs), dim=2)

        energy = torch.tanh(self.final_layer1(x))

        attention = self.final_layer2(energy).squeeze(dim=2)        
        attention_weight = torch.softmax(attention, dim=1)

        return attention_weight

In [19]:
class Decoder(nn.Module):
  def __init__(self, output_size, embedding_size, hidden_dimension, num_layers, dropout, attention):
    super().__init__()
    
    self.embedding_size=embedding_size
    self.output_size = output_size
    self.hidden_dimension = hidden_dimension
    self.num_layers = num_layers
    self.dropout = dropout
    self.attention = attention

    self.embedding = nn.Embedding(output_size, embedding_size)
    
    self.rnn = nn.GRU(hidden_dimension * 2 + embedding_size, hidden_dimension, num_layers, dropout = dropout)
    
    self.linear = nn.Linear(hidden_dimension, output_size)
    
  def forward(self, input, encoder_states, hidden):

    attention = self.attention(encoder_states, hidden).unsqueeze(1)

    outputs = encoder_states.permute(1, 0, 2)

    context = torch.bmm(attention, outputs).permute(1, 0, 2)

    embedded = self.embedding(input.unsqueeze(0))
    x = torch.cat((embedded, context), dim=2)

    outputs, hidden = self.rnn(x, hidden.unsqueeze(0))
    prediction = self.linear(outputs.squeeze(0))
    return prediction, hidden.squeeze(0)

In [20]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    
    self.encoder = encoder
    self.decoder = decoder
    self.device = device
      
  def forward(self, source, target, teacher_force = 1):

    batch_size = target.shape[1]
    sequence_length = target.shape[0]

    target_dict_size = self.decoder.output_size
    
    pred_output = torch.zeros(sequence_length, batch_size, target_dict_size).to(self.device)
    
    encoder_states, hidden = self.encoder(source)
    
    input = target[0]
    
    for i in range(1, sequence_length):

      output, hidden = self.decoder(input, encoder_states, hidden)

      pred_output[i] = output
    
      best_pred = output.argmax(1) 

      if random.random() < teacher_force:
        input = target[i]

      else:
        input = best_pred
    
    return pred_output


### **Text Preprocessing**

In [21]:
filepath = '/content/drive/MyDrive/Colab Notebooks/train.csv'

# read dataset
hindi_list, english_list = read_dataset(filepath)

# split into train and test data
hindi_sentence_list, english_sentence_list, hindi_test_sentence_list, english_test_sentence_list = train_test_split(hindi_list, english_list, train_size=1)

# Detokenize hindi sentences
hindi_sentence_list = detokenize(hindi_sentence_list, lang='hi')

# Store the train and test sentences
store_lists(hindi_sentence_list, english_sentence_list, hindi_test_sentence_list, english_test_sentence_list)

# Tokenize hindi sentences
hindi_word_to_index, hindi_index_to_word = hindi_tokenize(hindi_sentence_list)

# Tokenize english sentences
english_word_to_index, english_index_to_word = english_tokenize(english_sentence_list)

# Set maximum length
max_len = get_max_len(hindi_sentence_list, english_sentence_list, min_frequency=300)

# Filter the sentences greater than max length
english_filtered_sent_list, hindi_filtered_sent_list = filter_sentences(hindi_sentence_list, english_sentence_list, max_len)

# Form tensor of the sentences
hindi_list_indices, hindi_test_list_indices, english_list_indices, english_test_list_indices = make_tensors(hindi_filtered_sent_list, hindi_test_sentence_list, english_filtered_sent_list,  english_test_sentence_list)


### **Set Model Parameters and define Model**

In [22]:
input_size = len(hindi_word_to_index)
output_size = len(english_word_to_index)
embedding_size = 128
hidden_dimension = 512
num_layers = 1

dropout = 0.5
batch_size = 32

attention = Attention(hidden_dimension)
enc = Encoder(input_size, embedding_size, hidden_dimension, num_layers, dropout).to(device)
dec = Decoder(output_size, embedding_size, hidden_dimension, num_layers, dropout, attention).to(device)

model = Seq2Seq(enc, dec, device).to(device)

print(enc)
print(dec)
print(model)

  "num_layers={}".format(dropout, num_layers))


Encoder(
  (embedding): Embedding(21672, 128)
  (rnn): GRU(128, 512, dropout=0.5, bidirectional=True)
  (final_layer): Linear(in_features=1024, out_features=512, bias=True)
)
Decoder(
  (attention): Attention(
    (final_layer1): Linear(in_features=1536, out_features=512, bias=True)
    (final_layer2): Linear(in_features=512, out_features=1, bias=False)
  )
  (embedding): Embedding(19009, 128)
  (rnn): GRU(1152, 512, dropout=0.5)
  (linear): Linear(in_features=512, out_features=19009, bias=True)
)
Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(21672, 128)
    (rnn): GRU(128, 512, dropout=0.5, bidirectional=True)
    (final_layer): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (final_layer1): Linear(in_features=1536, out_features=512, bias=True)
      (final_layer2): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(19009, 128)
    (rnn): GRU(1152, 512, dropout=0.5)
    (lin

### **Test Encoder**

In [23]:
word_input = torch.zeros((7, 4), dtype=torch.long, device=device)  # here 7 is seq length and 4 is batch size
print(word_input.shape)
out, enc_hid = enc(word_input)  # encode this word_input

print(enc_hid)

print(enc_hid.shape) # [num_layers, seq_length, hidden_units]

torch.Size([7, 4])
tensor([[-0.2043, -0.0622,  0.0095,  ..., -0.2569, -0.1148, -0.3100],
        [-0.2043, -0.0622,  0.0095,  ..., -0.2569, -0.1148, -0.3100],
        [-0.2043, -0.0622,  0.0095,  ..., -0.2569, -0.1148, -0.3100],
        [-0.2043, -0.0622,  0.0095,  ..., -0.2569, -0.1148, -0.3100]],
       device='cuda:0', grad_fn=<TanhBackward>)
torch.Size([4, 512])


### **Test Decoder**

In [24]:
for i in range(7):
    input = word_input[i]
    pred, dec_hid= dec(input, out, enc_hid)
    print(dec_hid.shape, pred)

torch.Size([4, 512]) tensor([[-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978],
        [-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978],
        [-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978],
        [-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978]],
       device='cuda:0', grad_fn=<AddmmBackward>)
torch.Size([4, 512]) tensor([[-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978],
        [-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978],
        [-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978],
        [-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978]],
       device='cuda:0', grad_fn=<AddmmBackward>)
torch.Size([4, 512]) tensor([[-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978],
        [-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978],
        [-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978],
        [-0.0881, -0.1821, -0.1778,  ..., -0.0157, -0.0460,  0.1978

### **Initialize Weights**

In [25]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(21672, 128)
    (rnn): GRU(128, 512, dropout=0.5, bidirectional=True)
    (final_layer): Linear(in_features=1024, out_features=512, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (final_layer1): Linear(in_features=1536, out_features=512, bias=True)
      (final_layer2): Linear(in_features=512, out_features=1, bias=False)
    )
    (embedding): Embedding(19009, 128)
    (rnn): GRU(1152, 512, dropout=0.5)
    (linear): Linear(in_features=512, out_features=19009, bias=True)
  )
)

### **Optimizer**

In [26]:
from torch import optim
optimizer = optim.Adam(model.parameters(), lr=0.001)

### **Loss**

In [27]:
criterion = nn.CrossEntropyLoss(ignore_index = 1)

### **Create Batches**

In [28]:
from torch.utils import data
def load_array(data_arrays, batch_size, is_train=True):
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)

In [29]:
data_arrays = (hindi_list_indices, english_list_indices)
data_iter = load_array(data_arrays, batch_size)

### **Translate Hindi Sentence to English Sentence**

In [30]:
def translate_sentence(model, sentence, device, max_length=max_len):

    sent_list=[]
    for t in indic_tokenize.trivial_tokenize(sentence): 
      x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
      for elem in x:
        if hindi_word_to_index.get(elem) is None:
          sent_list.append(hindi_word_to_index['UNK'])
          #continue
        else:
           sent_list.append(hindi_word_to_index[elem]) 

    sent_list.insert(0, hindi_word_to_index['SOS'])
    sent_list.append(hindi_word_to_index['EOS'])

    while(len(sent_list)<max_len):
      sent_list.append(hindi_word_to_index['PAD'])


    sent_tensor = torch.tensor(sent_list, dtype=torch.long).unsqueeze(1).to(device)

    with torch.no_grad():
        outputs_encoder, hidden = model.encoder(sent_tensor)

    outputs = [2]  #SOS = 2

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden = model.decoder(previous_word, outputs_encoder, hidden)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        #EOS=3
        if output.argmax(1).item() == 3:
            break

    translated_sentence = [english_index_to_word[idx] for idx in outputs]

    return translated_sentence[1:]

### **Save Model**

In [31]:
def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss):
    state = {'model': model,'best_loss': best_loss,'epoch': epoch,'rng_state': torch.get_rng_state(), 'optimizer': optimizer.state_dict(),}
    torch.save(state, '/content/drive/MyDrive/Colab Notebooks/FinalRound7/checkpoint-NMT')
    torch.save(model.state_dict(),'/content/drive/MyDrive/Colab Notebooks/FinalRound7/checkpoint-NMT-SD')

### **Train Model**

In [None]:
import random
import sys
epoch_loss = 0.0
num_epochs = 15
best_loss = sys.maxsize
best_epoch = -1
step=0
i=0
for epoch in range(num_epochs):

  print("Epoch -",epoch+1)
  model.eval()

  model.train(True)
  for batch_idx, batch in enumerate(data_iter):
    input, target = [x.to(device) for x in batch]

    input = input.permute(1,0)
    target = target.permute(1,0)


    output = model(input, target)
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()

    loss = criterion(output, target)

    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    optimizer.step()
    step += 1
      
    epoch_loss += loss.item()

  if epoch_loss < best_loss:
    best_loss = epoch_loss
    best_epoch = epoch
    checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss) 

  print("Epoch_Loss - {}".format(loss.item()))
  print()
  
print(epoch_loss / len(data_iter))

Epoch - 1
Epoch_Loss - 3.2839982509613037

Epoch - 2
Epoch_Loss - 2.755028247833252

Epoch - 3
Epoch_Loss - 1.7104363441467285

Epoch - 4
Epoch_Loss - 1.9817330837249756

Epoch - 5
Epoch_Loss - 1.3006936311721802

Epoch - 6
Epoch_Loss - 1.4060721397399902

Epoch - 7
Epoch_Loss - 0.9448722004890442

Epoch - 8
Epoch_Loss - 1.3367235660552979

Epoch - 9
Epoch_Loss - 0.792805016040802

Epoch - 10
Epoch_Loss - 0.8152564764022827

Epoch - 11
Epoch_Loss - 0.624953031539917

Epoch - 12
Epoch_Loss - 0.5024504661560059

Epoch - 13


### **Load Model**

In [None]:
# # Load the model
# model.load_state_dict(torch.load('/content/drive/MyDrive/Colab Notebooks/FinalRound6/checkpoint-NMT-SD'))
# hindi_sentence_list, english_sentence_list, hindi_test_sentence_list, english_test_sentence_list = load_train_test()

# # Detokenize hindi sentences
# hindi_sentence_list = detokenize(hindi_sentence_list, lang='hi')

# # Store the train and test sentences
# store_lists(hindi_sentence_list, english_sentence_list, hindi_test_sentence_list, english_test_sentence_list)

# # Tokenize hindi sentences
# hindi_word_to_index, hindi_index_to_word = hindi_tokenize(hindi_sentence_list)

# # Tokenize english sentences
# english_word_to_index, english_index_to_word = english_tokenize(english_sentence_list)

# # Set maximum length
# max_len = get_max_len(hindi_sentence_list, english_sentence_list, min_frequency=300)

# # Filter the sentences greater than max length
# english_filtered_sent_list, hindi_filtered_sent_list = filter_sentences(hindi_sentence_list, english_sentence_list, max_len)

# # Form tensor of the sentences
# hindi_list_indices, hindi_test_list_indices, english_list_indices, english_test_list_indices = make_tensors(hindi_filtered_sent_list, hindi_test_sentence_list, english_filtered_sent_list,  english_test_sentence_list)

### **Translate the Hindi Sentences from the Test Set**

In [None]:
outputs = []

for src,trg in zip(hindi_test_sentence_list, english_test_sentence_list):
    prediction = translate_sentence(model, src, device)
    prediction = prediction[:-1]  # remove <eos> token
    x = ' '.join([e for e in prediction])
    outputs.append(x)

### **Compute BLEU and METEOR score on Test Set**

In [None]:
!pip install -U nltk

In [None]:
import nltk
import sys
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

references = english_test_sentence_list

hypotheses = outputs

total_num = len(references)
total_bleu_scores = 0
total_meteor_scores = 0
for i in range(total_num):
  total_bleu_scores+=sentence_bleu([references[i].split(" ")], hypotheses[i].split(" "))
  total_meteor_scores+=single_meteor_score(references[i], hypotheses[i])

bleu_result = total_bleu_scores/total_num
meteor_result = total_meteor_scores/total_num

print("bleu score: ",bleu_result)
print("meteor score: ",meteor_result)

### **Load week3 Hindi dataset**

In [None]:
testpath = '/content/drive/MyDrive/Colab Notebooks/testhindistatements.csv'

In [None]:
import csv

finaldata=[]

with open(testpath, 'r') as f:
    reader = csv.DictReader(f)
    for line in reader:
        finaldata.append(line['hindi'])

### **Store the corresponding Translated English Sentences**

In [None]:
with open('/content/drive/MyDrive/Colab Notebooks/FinalRound7/answer.txt', 'w') as f:
    for sent in finaldata:
        prediction = translate_sentence(model, sent, device)
        prediction = prediction[:-1]  # remove <eos> token
        x = ' '.join([e for e in prediction])
        f.write(x+'\n')