

## **Seq2Seq Model with Attention**

In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session


/kaggle/input/hindi2/hindistatements2.csv
/kaggle/input/hineng/train.csv
/kaggle/input/checkpoint/checkpoint-NMT-SD (1)
/kaggle/input/hinmain/hindistatements.csv
/kaggle/input/hindi3/hindistatements.csv
/kaggle/input/hindi4/hindistatements4.csv


### **Read the Dataset**

In [4]:
filepath = '../input/hineng/train.csv'

In [5]:
import csv

hindi_list=[]
english_list=[]

with open('../input/hineng/train.csv', 'r') as f:
    reader = csv.DictReader(f)
    for line in reader:
        hindi_list.append(line['hindi'])
        english_list.append(line['english'])

In [6]:
hindi_list[:5], english_list[:5]

(['एल सालवाडोर मे, जिन दोनो पक्षों ने सिविल-युद्ध से वापसी ली, उन्होंने वही काम किये जो कैदियों की कश्मकश के निदान हैं।',
  'मैं उनके साथ कोई लेना देना नहीं है.',
  '-हटाओ रिक.',
  'क्योंकि यह एक खुशियों भरी फ़िल्म है.',
  'The thought reaching the eyes...'],
 ["In El Salvador, both sides that withdrew from their civil war took moves that had been proven to mirror a prisoner's dilemma strategy.",
  'I have nothing to do with them.',
  'Fuck them, Rick.',
  "Because it's a happy film.",
  'The thought reaching the eyes...'])

### **Train and Test Split**

In [7]:
import numpy as np

indices_list = np.arange(len(hindi_list))
np.random.shuffle(indices_list)
train_size = 0.8
train_indices_list = indices_list[:int(len(indices_list)*train_size)]
test_indices_list = indices_list[int(len(indices_list)*train_size):]

hindi_sentence_list = [hindi_list[i] for i in train_indices_list]
english_sentence_list = [english_list[i] for i in train_indices_list]

hindi_test_sentence_list = [hindi_list[i] for i in test_indices_list]
english_test_sentence_list = [english_list[i] for i in test_indices_list]

In [8]:
len(hindi_sentence_list), len(english_sentence_list)

(81857, 81857)

In [9]:
len(hindi_test_sentence_list), len(english_test_sentence_list)

(20465, 20465)

### **Install Indic NLP Library**

In [10]:
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1325, done.[K
remote: Counting objects: 100% (147/147), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178[K
Receiving objects: 100% (1325/1325), 9.57 MiB | 9.57 MiB/s, done.
Resolving deltas: 100% (688/688), done.


In [11]:
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (133/133), 149.77 MiB | 25.37 MiB/s, done.
Resolving deltas: 100% (51/51), done.


In [12]:
!pip install Morfessor

Collecting Morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


In [13]:
# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"./indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES="./indic_nlp_resources"

In [14]:
import sys
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))

In [15]:
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

In [16]:
from indicnlp import loader
loader.load()

### **Tokenize Hindi sentences**

In [17]:
# Detokenize
# Ref: https://colab.research.google.com/drive/1p3oGPcNdORw5_MDcufTDYWJhJt3XVPuC?usp=sharing#scrollTo=GU6E07Yw5zvl

from indicnlp.tokenize import indic_detokenize  

for i in range(len(hindi_sentence_list)):
  hindi_sentence_list[i] = indic_detokenize.trivial_detokenize(hindi_sentence_list[i],lang='hi')


In [18]:
import string 
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [19]:
#Ref for unicode chart: https://www.ssec.wisc.edu/~tomw/java/unicode.html#x0900 

import re

from indicnlp.tokenize import indic_tokenize 

hindi_word_to_count={}
hindi_word_to_index={'UNK':0, 'PAD':1, 'SOS':2, 'EOS':3}
hindi_index_to_word={0:'UNK', 1:'PAD', 2:'SOS', 3:'EOS'}
count=4
for sent in hindi_sentence_list:
  for t in indic_tokenize.trivial_tokenize(sent): 
    x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
    for elem in x:
      hindi_word_to_count[elem] = hindi_word_to_count.get(elem,0)+1
      if hindi_word_to_index.get(elem) is None and hindi_word_to_count.get(elem,0) >= 2:
        hindi_word_to_index[elem] = count
        hindi_index_to_word[count] = elem
        count+=1
print(count)

19062


In [20]:
with open('./HindiWordToCount.txt','w') as f:
  for k,v in hindi_word_to_count.items():
    f.write(str(k)+","+str(v)+"\n")

In [21]:
print(len(hindi_word_to_count))

40102


In [22]:
print(len(hindi_word_to_index))

19062


### **Tokenize English sentences**

In [23]:
!python3 -m spacy download en

Collecting en_core_web_sm==2.3.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 13.1 MB/s eta 0:00:01
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/opt/conda/lib/python3.7/site-packages/en_core_web_sm -->
/opt/conda/lib/python3.7/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [24]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [25]:
english_word_to_count={}
english_word_to_index={'UNK':0, 'PAD':1, 'SOS':2, 'EOS':3}
english_index_to_word={0:'UNK', 1:'PAD', 2:'SOS', 3:'EOS'}
count=4

for sent in english_sentence_list:
  for token in nlp.tokenizer(sent.lower()):
    temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token.text)
    for elem in temp:
      english_word_to_count[elem] = english_word_to_count.get(elem,0)+1
      if english_word_to_index.get(elem) is None and english_word_to_count.get(elem,0) >= 2:
        english_word_to_index[elem] = count
        english_index_to_word[count] = elem
        count+=1
print(count)

16692


In [26]:
with open('./EnglishWordToCount.txt','w') as f:
  for k,v in english_word_to_count.items():
    f.write(str(k)+","+str(v)+"\n")

### **Find Sentences Length and fix the maximum length and filter the sentences**


#### **First Let's Check for Hindi**

In [27]:
sent_len_count={}
for sent in hindi_sentence_list:
  sent_len=0
  for t in indic_tokenize.trivial_tokenize(sent): 
    x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
    for elem in x:
      sent_len+=1
  sent_len_count[sent_len] = sent_len_count.get(sent_len,0)+1

print(sent_len_count)


{7: 6183, 10: 4169, 6: 6544, 29: 553, 14: 2383, 8: 5679, 26: 725, 11: 3635, 9: 5005, 5: 6357, 35: 349, 20: 1290, 13: 2784, 23: 954, 4: 5611, 18: 1517, 19: 1339, 16: 2021, 1: 319, 3: 3866, 2: 2730, 22: 1031, 12: 3220, 15: 2233, 27: 703, 34: 361, 24: 861, 28: 642, 17: 1571, 30: 525, 32: 403, 46: 119, 41: 174, 21: 1113, 31: 448, 47: 104, 42: 196, 38: 256, 39: 211, 33: 403, 40: 219, 25: 753, 65: 19, 60: 43, 37: 259, 36: 278, 44: 125, 43: 185, 77: 9, 52: 70, 63: 28, 64: 36, 53: 75, 58: 46, 50: 93, 56: 47, 62: 28, 61: 49, 49: 81, 88: 4, 45: 148, 54: 60, 48: 111, 83: 6, 72: 16, 69: 23, 96: 5, 57: 34, 51: 77, 92: 2, 68: 30, 59: 29, 89: 3, 66: 28, 105: 3, 75: 9, 70: 9, 55: 48, 71: 14, 104: 3, 73: 9, 84: 6, 101: 3, 80: 7, 86: 8, 67: 19, 82: 5, 93: 1, 79: 11, 78: 7, 74: 17, 109: 4, 81: 4, 90: 3, 99: 4, 118: 3, 87: 4, 95: 1, 127: 1, 122: 2, 107: 1, 179: 1, 141: 1, 76: 7, 85: 8, 98: 2, 103: 2, 180: 1, 97: 2, 223: 1, 111: 2, 144: 1, 116: 1, 100: 2, 94: 2, 131: 1, 106: 1, 0: 1, 91: 2, 113: 2, 172: 1,

In [28]:
# sort the dictionary based on their counts
import operator

sorted_counts = sorted(sent_len_count.items(), key=operator.itemgetter(1), reverse=True)

print(sorted_counts)
index=0
for pair in sorted_counts:
  if pair[1]>300:
    index+=1
  else:
    break

max_hindi_len=0
for pair in sorted_counts[:index]:
  if pair[0]>max_hindi_len:
    max_hindi_len=pair[0]

print(max_hindi_len)

[(6, 6544), (5, 6357), (7, 6183), (8, 5679), (4, 5611), (9, 5005), (10, 4169), (3, 3866), (11, 3635), (12, 3220), (13, 2784), (2, 2730), (14, 2383), (15, 2233), (16, 2021), (17, 1571), (18, 1517), (19, 1339), (20, 1290), (21, 1113), (22, 1031), (23, 954), (24, 861), (25, 753), (26, 725), (27, 703), (28, 642), (29, 553), (30, 525), (31, 448), (32, 403), (33, 403), (34, 361), (35, 349), (1, 319), (36, 278), (37, 259), (38, 256), (40, 219), (39, 211), (42, 196), (43, 185), (41, 174), (45, 148), (44, 125), (46, 119), (48, 111), (47, 104), (50, 93), (49, 81), (51, 77), (53, 75), (52, 70), (54, 60), (61, 49), (55, 48), (56, 47), (58, 46), (60, 43), (64, 36), (57, 34), (68, 30), (59, 29), (63, 28), (62, 28), (66, 28), (69, 23), (65, 19), (67, 19), (74, 17), (72, 16), (71, 14), (79, 11), (77, 9), (75, 9), (70, 9), (73, 9), (86, 8), (85, 8), (80, 7), (78, 7), (76, 7), (83, 6), (84, 6), (96, 5), (82, 5), (88, 4), (109, 4), (81, 4), (99, 4), (87, 4), (89, 3), (105, 3), (104, 3), (101, 3), (90, 3)

#### **Now Let's check for English**

In [29]:
english_sent_len_count={}
for sent in english_sentence_list:
  sent_len=0
  for token in nlp.tokenizer(sent.lower()): 
    temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token.text)
    for elem in temp:
      sent_len+=1
  english_sent_len_count[sent_len] = english_sent_len_count.get(sent_len,0)+1

print(english_sent_len_count)

{10: 4263, 4: 6245, 8: 5737, 6: 7312, 25: 689, 5: 7107, 13: 2722, 9: 4820, 21: 1033, 11: 3484, 12: 3181, 7: 6622, 3: 3935, 26: 681, 30: 464, 20: 1168, 23: 860, 18: 1397, 16: 1797, 17: 1560, 15: 2004, 19: 1318, 35: 261, 2: 2925, 22: 944, 31: 440, 42: 149, 14: 2345, 38: 226, 34: 314, 48: 100, 27: 599, 39: 189, 41: 169, 32: 401, 36: 256, 24: 766, 33: 332, 29: 493, 28: 554, 52: 59, 56: 45, 46: 100, 70: 11, 44: 144, 1: 94, 50: 74, 58: 31, 55: 45, 47: 99, 37: 215, 43: 141, 53: 43, 49: 75, 54: 47, 40: 189, 61: 23, 67: 14, 69: 13, 78: 5, 45: 109, 63: 13, 60: 29, 86: 4, 91: 2, 57: 43, 51: 66, 68: 12, 59: 34, 83: 4, 62: 24, 104: 1, 71: 16, 94: 2, 64: 23, 72: 8, 81: 3, 65: 13, 66: 13, 114: 1, 76: 6, 77: 7, 82: 3, 80: 8, 85: 4, 74: 6, 89: 5, 100: 1, 73: 9, 93: 3, 99: 2, 108: 1, 112: 1, 97: 1, 79: 3, 92: 3, 171: 1, 143: 1, 106: 3, 84: 5, 282: 1, 164: 1, 88: 5, 95: 2, 261: 1, 87: 2, 116: 1, 75: 7, 110: 1, 107: 1, 90: 5, 121: 1, 169: 1, 123: 1, 446: 1, 103: 2, 96: 1, 157: 1}


In [30]:
# sort the dictionary based on their counts
import operator

english_sorted_counts = sorted(english_sent_len_count.items(), key=operator.itemgetter(1), reverse=True)

index=0
for pair in english_sorted_counts:
  if pair[1]>300:
    index+=1
  else:
    break

print(index)
max_english_len=0
for pair in english_sorted_counts[:index]:
  if pair[0]>max_english_len:
    max_english_len=pair[0]

print(max_english_len)

33
34


#### **Get the maximum Length**

In [31]:
max_len = max(max_hindi_len, max_english_len)
max_len

35

### **Filter out sentences with length less than or equal to maximum length**

In [32]:
english_filtered_sent_list=[]
hindi_filtered_sent_list=[]
filtered_sent_pair_list=[]
for hin_sent, eng_sent in zip(hindi_sentence_list, english_sentence_list):
  hin_sent_len=0
  for t in indic_tokenize.trivial_tokenize(hin_sent): 
    x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
    for elem in x:
      hin_sent_len+=1

  eng_sent_len=0
  for token in nlp.tokenizer(eng_sent.lower()): 
    temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token.text)
    for elem in temp:
      eng_sent_len+=1

  if hin_sent_len>=1 and hin_sent_len<=max_len and eng_sent_len>=1 and eng_sent_len<=max_len:
    english_filtered_sent_list.append(eng_sent)
    hindi_filtered_sent_list.append(hin_sent)
    filtered_sent_pair_list.append([hin_sent, eng_sent])

print(len(english_filtered_sent_list))
print(len(hindi_filtered_sent_list))
print(len(filtered_sent_pair_list))
print(filtered_sent_pair_list[:5])

77854
77854
77854
[['बोलो, Rom, थूको मत.', "Say it, don't spray it, Rom."], ['मैं भी उस समय टूट गया।', 'This was mine.'], ['मुझे आपकी ज़रूरत है टोटेम नष्ट करने के लिए।', 'I need you to destroy the totem.'], ['मैं उससे मिलना चाहती हूँ.', 'I want to meet her.'], ['मुझे लगता है की मैं अपने ग्राहकों की जरूरतों को पूरा कर सकती हूँ भविष्य की पीढ़ियों की एक हरी कल में रहने की क्षमता से समझौता किए बगैर', 'I feel that I can meet the needs of my customers without compromising the ability of future generations to live in a greener tomorrow.']]


In [33]:
# Store the filtered hindi sentences in a file

with open('./FilteredSentencesPair.txt','w') as f:
  for line in filtered_sent_pair_list:
    f.write(line[0]+','+line[1]+'\n')

### **Form List of list of indexes from the filtered sentences**

In [34]:
# Create tensor array for each hindi sentence 
import torch
hindi_list_indices = torch.tensor([[1]*(max_len+2)]*len(filtered_sent_pair_list), dtype=torch.long)

i=0
for sent in hindi_filtered_sent_list:
  hindi_list_indices[i][0]=2  #SOS
  j=1
  for t in indic_tokenize.trivial_tokenize(sent): 
    x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
    for elem in x:
      if hindi_word_to_index.get(elem) is None:
        continue
      else:
        hindi_list_indices[i][j] = hindi_word_to_index.get(elem)
      j+=1
  hindi_list_indices[i][j]=3  #EOS
  j+=1
  while j<=(max_len+1):
    hindi_list_indices[i][j] = 1  #PAD
    j+=1

  i+=1

# Create tensor array for each hindi test sentence
hindi_test_list_indices = torch.tensor([[1]*(max_len+2)]*len(hindi_test_sentence_list), dtype=torch.long)

i=0
for sent in hindi_test_sentence_list:
  hindi_test_list_indices[i][0]=2  #SOS
  j=1
  for t in indic_tokenize.trivial_tokenize(sent): 
    x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
    for elem in x:
      if j>(max_len):
        break
      if hindi_word_to_index.get(elem) is None:
        continue
        #hindi_test_list_indices[i][j] = 0  # UNK
      else:
        hindi_test_list_indices[i][j] = hindi_word_to_index.get(elem)
      j+=1
  hindi_test_list_indices[i][j]=3  #EOS
  j+=1
  while j<=(max_len+1):
    hindi_test_list_indices[i][j] = 1  #PAD
    j+=1

  i+=1


# Create tensor array for each english sentence

english_list_indices = torch.tensor([[1]*(max_len+2)]*len(filtered_sent_pair_list), dtype=torch.long)

i=0
for sent in english_filtered_sent_list:
  english_list_indices[i][0]=2  #SOS
  j=1
  for token in nlp.tokenizer(sent.lower()):
    temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token.text)
    for elem in temp:
      if english_word_to_index.get(elem) is None:
        continue
        #english_list_indices[i][j] = 0  # UNK
      else:
        english_list_indices[i][j] = english_word_to_index.get(elem)
      j+=1
  english_list_indices[i][j]=3  #EOS
  j+=1
  while j<=(max_len+1):
    english_list_indices[i][j] = 1  #PAD
    j+=1
  
  i+=1


# Create tensor array for each english test sentence
english_test_list_indices = torch.tensor([[1]*(max_len+2)]*len(english_test_sentence_list), dtype=torch.long)

i=0
for sent in english_test_sentence_list:
  english_test_list_indices[i][0]=2  #SOS
  j=1
  for token in nlp.tokenizer(sent.lower()):
    temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token.text)
    for elem in temp:
      if j>(max_len):
        break
      if english_word_to_index.get(elem) is None:
        continue
        #english_test_list_indices[i][j] = 0  # UNK
      else:
        english_test_list_indices[i][j] = english_word_to_index.get(elem)
      j+=1
  english_test_list_indices[i][j]=3  #EOS
  j+=1
  while j<=(max_len+1):
    english_test_list_indices[i][j] = 1  #PAD
    j+=1
  
  i+=1

print(hindi_list_indices[:5])
print(english_list_indices[:5])

tensor([[   2, 1378,    4, 3621,    4,   14,    7,    3,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1],
        [   2,    6,   48,   32,   77, 1104,   12,    5,    3,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1],
        [   2,    8,   44,  313,    9, 5735,  202,   37,   19,   38,    5,    3,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1],
        [   2,    6,   27,  231,  466,   11,    7,    3,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,    1,
            1,    1,    1,    1,    1,    1,    1,    1,    1

In [35]:
print(hindi_list_indices.shape)
print(english_list_indices.shape)

torch.Size([77854, 37])
torch.Size([77854, 37])


#### **Use GPU**

In [36]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu" 
print(device)

cuda


### **References:** 
### http://ethen8181.github.io/machine-learning/deep_learning/seq2seq/2_torch_seq2seq_attention.html 
### https://arxiv.org/abs/1409.0473

## **Sequence to Sequence Model with Attention**

In [37]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [38]:
# Bidiretional GRU Encoder
class Encoder(nn.Module):

  def __init__(self, input_size, embedding_size, hidden_dimension, num_layers, dropout):  
    super().__init__()
    
    self.input_size=input_size
    self.dropout=dropout
    self.hidden_dimension = hidden_dimension
    self.num_layers = num_layers

    self.embedding = nn.Embedding(input_size, embedding_size)

    self.rnn = nn.GRU(embedding_size, hidden_dimension,num_layers, dropout=dropout, bidirectional=True)

    self.final_layer = nn.Linear(hidden_dimension * 2, hidden_dimension)

  def forward(self, word_inputs):
 
    # First embed the input words and then pass it to the encoder
    embedded = self.embedding(word_inputs)

    outputs, hidden = self.rnn(embedded)
    
    # Since the deoder is unidirectional, concatenate the hidden states
    x = torch.cat((hidden[::2], hidden[1::2]), dim=2)

    # The concatenated hidden vectors are then passed on to the layer with hidden dimension as the output dimension and tanh function
    # as the activation function
    hidden = torch.tanh(self.final_layer(x))

    return outputs, hidden

In [39]:
# Attention uses the previous state of the decoder and final layer hidden states
class Attention(nn.Module):

    def __init__(self, hidden_dimension):
        super().__init__()
        self.hidden_dimension = hidden_dimension

        self.final_layer1 = nn.Linear(hidden_dimension * 2 + hidden_dimension, hidden_dimension)
        self.final_layer2 = nn.Linear(hidden_dimension, 1, bias=False)

    def forward(self, encoder_outputs, hidden):
        # encoder_ouptuts shape: [sequence_length, batch_size, hidden_dimension*2]
        sequence_length = encoder_outputs.shape[0]
        batch_size = encoder_outputs.shape[1]
        
        # hidden shape: [batch_size, hidden_dimension]
        hidden = hidden.unsqueeze(1).repeat(1, sequence_length, 1)
        #hidden shape: [batch_size, sequence_length, hidden_dimension]

        outputs = encoder_outputs.permute(1, 0, 2)
        # ouputs shape: [batch_size, sequence_length, hidden_dimension*2]

        x = torch.cat((hidden, outputs), dim=2)

        energy = torch.tanh(self.final_layer1(x))

        attention = self.final_layer2(energy).squeeze(dim=2)        
        attention_weight = torch.softmax(attention, dim=1)

        return attention_weight

In [40]:
class Decoder(nn.Module):
  def __init__(self, output_size, embedding_size, hidden_dimension, num_layers, dropout, attention):
    super().__init__()
    
    self.embedding_size=embedding_size
    self.output_size = output_size
    self.hidden_dimension = hidden_dimension
    self.num_layers = num_layers
    self.dropout = dropout
    self.attention = attention

    self.embedding = nn.Embedding(output_size, embedding_size)
    
    self.rnn = nn.GRU(hidden_dimension * 2 + embedding_size, hidden_dimension, num_layers, dropout = dropout)
    
    self.linear = nn.Linear(hidden_dimension, output_size)
    
  def forward(self, input, encoder_states, hidden):

    attention = self.attention(encoder_states, hidden[-1]).unsqueeze(1)

    outputs = encoder_states.permute(1, 0, 2)

    context = torch.bmm(attention, outputs).permute(1, 0, 2)

    embedded = self.embedding(input.unsqueeze(0))
    x = torch.cat((embedded, context), dim=2)

    outputs, hidden = self.rnn(x, hidden)
    prediction = self.linear(outputs.squeeze(0))
    return prediction, hidden.squeeze(0)

In [41]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    
    self.encoder = encoder
    self.decoder = decoder
    self.device = device
      
  def forward(self, source, target, teacher_force =0.75):

    batch_size = target.shape[1]
    sequence_length = target.shape[0]

    target_dict_size = self.decoder.output_size
    
    pred_output = torch.zeros(sequence_length, batch_size, target_dict_size).to(self.device)
    
    encoder_states, hidden = self.encoder(source)
    
    input = target[0]
    
    for i in range(1, sequence_length):

      output, hidden = self.decoder(input, encoder_states, hidden)

      pred_output[i] = output
    
      best_pred = output.argmax(1) 

      if random.random() < teacher_force:
        input = target[i]

      else:
        input = best_pred
    
    return pred_output

### **Set Model Parameters and define Model**

In [42]:
input_size = len(hindi_word_to_index)
output_size = len(english_word_to_index)
embedding_size = 128
hidden_dimension = 256
num_layers = 2
dropout = 0.5
batch_size=32

attention = Attention(hidden_dimension)
enc = Encoder(input_size, embedding_size, hidden_dimension, num_layers, dropout).to(device)
dec = Decoder(output_size, embedding_size, hidden_dimension, num_layers, dropout, attention).to(device)

model = Seq2Seq(enc, dec, device).to(device)

print(enc)
print(dec)
print(model)

Encoder(
  (embedding): Embedding(19062, 128)
  (rnn): GRU(128, 256, num_layers=2, dropout=0.5, bidirectional=True)
  (final_layer): Linear(in_features=512, out_features=256, bias=True)
)
Decoder(
  (attention): Attention(
    (final_layer1): Linear(in_features=768, out_features=256, bias=True)
    (final_layer2): Linear(in_features=256, out_features=1, bias=False)
  )
  (embedding): Embedding(16692, 128)
  (rnn): GRU(640, 256, num_layers=2, dropout=0.5)
  (linear): Linear(in_features=256, out_features=16692, bias=True)
)
Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(19062, 128)
    (rnn): GRU(128, 256, num_layers=2, dropout=0.5, bidirectional=True)
    (final_layer): Linear(in_features=512, out_features=256, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (final_layer1): Linear(in_features=768, out_features=256, bias=True)
      (final_layer2): Linear(in_features=256, out_features=1, bias=False)
    )
    (embedding): Embedding(16692, 128)
    (rnn):

### **Test Encoder**

In [43]:
word_input = torch.zeros((7, 4), dtype=torch.long, device=device)  # here 7 is seq length and 4 is batch size
print(word_input.shape)
out, enc_hid = enc(word_input)  # encode this word_input

print(enc_hid)

print(enc_hid.shape) # [num_layers, seq_length, hidden_units]

torch.Size([7, 4])
tensor([[[-0.0185,  0.0578,  0.3347,  ...,  0.1850, -0.1256, -0.4478],
         [-0.0185,  0.0578,  0.3347,  ...,  0.1850, -0.1256, -0.4478],
         [-0.0185,  0.0578,  0.3347,  ...,  0.1850, -0.1256, -0.4478],
         [-0.0185,  0.0578,  0.3347,  ...,  0.1850, -0.1256, -0.4478]],

        [[ 0.2067, -0.2617, -0.1299,  ..., -0.1121, -0.1026,  0.1087],
         [ 0.1723, -0.1295,  0.0345,  ..., -0.0705, -0.0513,  0.2374],
         [ 0.1768, -0.2075,  0.0048,  ..., -0.1104, -0.1819,  0.0840],
         [ 0.2841, -0.1245,  0.0143,  ..., -0.1391, -0.1000,  0.0962]]],
       device='cuda:0', grad_fn=<TanhBackward>)
torch.Size([2, 4, 256])


### **Test Decoder**

In [44]:
for i in range(7):
    input = word_input[i]
    pred, dec_hid= dec(input, out, enc_hid)
    print(dec_hid.shape, pred)

torch.Size([2, 4, 256]) tensor([[-0.1202,  0.0229, -0.0214,  ..., -0.0571, -0.0622,  0.0572],
        [-0.0417, -0.0058,  0.0288,  ...,  0.0336, -0.0163,  0.0434],
        [-0.1417,  0.0561, -0.0353,  ...,  0.0146, -0.1423,  0.0089],
        [-0.1656,  0.0200,  0.0359,  ...,  0.0011, -0.0821,  0.0564]],
       device='cuda:0', grad_fn=<AddmmBackward>)
torch.Size([2, 4, 256]) tensor([[-0.0804,  0.0034,  0.0134,  ...,  0.0020, -0.0403,  0.0719],
        [-0.1363,  0.0450,  0.0258,  ...,  0.0703, -0.0414,  0.0829],
        [-0.1405, -0.0046,  0.0147,  ..., -0.0541, -0.0128,  0.0623],
        [-0.1217, -0.0485, -0.0050,  ...,  0.0094, -0.0972,  0.0960]],
       device='cuda:0', grad_fn=<AddmmBackward>)
torch.Size([2, 4, 256]) tensor([[-0.1563, -0.0049, -0.0652,  ..., -0.0679, -0.0857,  0.0564],
        [-0.1242,  0.0310,  0.0437,  ...,  0.0419, -0.0176, -0.0050],
        [-0.1432,  0.0190, -0.0138,  ...,  0.0309, -0.0728,  0.0726],
        [-0.1093, -0.0363, -0.0146,  ..., -0.0076, -0.0647

### **Initialize Weights**

In [45]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(19062, 128)
    (rnn): GRU(128, 256, num_layers=2, dropout=0.5, bidirectional=True)
    (final_layer): Linear(in_features=512, out_features=256, bias=True)
  )
  (decoder): Decoder(
    (attention): Attention(
      (final_layer1): Linear(in_features=768, out_features=256, bias=True)
      (final_layer2): Linear(in_features=256, out_features=1, bias=False)
    )
    (embedding): Embedding(16692, 128)
    (rnn): GRU(640, 256, num_layers=2, dropout=0.5)
    (linear): Linear(in_features=256, out_features=16692, bias=True)
  )
)

### **Optimizer**

In [46]:
from torch import optim
optimizer = optim.Adam(model.parameters(), lr=0.001)

### **Loss**

In [47]:
criterion = nn.CrossEntropyLoss(ignore_index = 1)

### **Create Batches**

In [48]:
from torch.utils import data
def load_array(data_arrays, batch_size, is_train=True):
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)

In [49]:
data_arrays = (hindi_list_indices, english_list_indices)
data_iter = load_array(data_arrays, batch_size)

### **Translate Hindi Sentence to English Sentence**

In [50]:
def translate_sentence(model, sentence, device, max_length=max_len):

    sent_list=[]
    for t in indic_tokenize.trivial_tokenize(sentence): 
      x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
      for elem in x:
        if hindi_word_to_index.get(elem) is None:
          continue
        else:
           sent_list.append(hindi_word_to_index[elem]) 

    sent_list.insert(0, hindi_word_to_index['SOS'])
    sent_list.append(hindi_word_to_index['EOS'])

    while(len(sent_list)<max_len):
      sent_list.append(hindi_word_to_index['PAD'])


    sent_tensor = torch.tensor(sent_list, dtype=torch.long).unsqueeze(1).to(device)

    with torch.no_grad():
        outputs_encoder, hidden = model.encoder(sent_tensor)

    outputs = [2]  #SOS = 2

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden = model.decoder(previous_word, outputs_encoder, hidden)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        #EOS=3
        if output.argmax(1).item() == 3:
            break

    translated_sentence = [english_index_to_word[idx] for idx in outputs]

    return translated_sentence[1:]

### **Save Model**

In [52]:
def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss):
    state = {'model': model,'best_loss': best_loss,'epoch': epoch,'rng_state': torch.get_rng_state(), 'optimizer': optimizer.state_dict(),}
    torch.save(state, './checkpoint-NMT')
    torch.save(model.state_dict(),'./checkpoint-NMT-SD')

### **Train Model**

In [53]:
import random
import sys
epoch_loss = 0.0
num_epochs = 10
best_loss = sys.maxsize
best_epoch = -1
sentence1 = "मैं कहाँ रहते हैं आप जानते हो?"
step=0
i=0
for epoch in range(num_epochs):

  print("Epoch -",epoch+1)
  model.eval()
  print(max_len)
  translated_sentence1 = translate_sentence(model, sentence1, device, max_length=max_len)

  model.train(True)
  for batch_idx, batch in enumerate(data_iter):
    input, target = [x.to(device) for x in batch]

    input = input.permute(1,0)
    target = target.permute(1,0)


    output = model(input, target)
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()

    loss = criterion(output, target)

    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    optimizer.step()
    step += 1
      
    epoch_loss += loss.item()

  if epoch_loss < best_loss:
    best_loss = epoch_loss
    best_epoch = epoch
    checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss) 

  print("Epoch_Loss - {}".format(loss.item()))
  print()
  
print(epoch_loss / len(data_iter))

Epoch - 1
35
Epoch_Loss - 5.104295253753662

Epoch - 2
35
Epoch_Loss - 4.499795913696289

Epoch - 3
35
Epoch_Loss - 4.5629730224609375

Epoch - 4
35
Epoch_Loss - 3.321319818496704

Epoch - 5
35
Epoch_Loss - 4.023654460906982

Epoch - 6
35
Epoch_Loss - 3.695626974105835

Epoch - 7
35
Epoch_Loss - 3.640519380569458

Epoch - 8
35
Epoch_Loss - 3.675459384918213

Epoch - 9
35
Epoch_Loss - 3.4773542881011963

Epoch - 10
35
Epoch_Loss - 3.014796257019043

39.57087118097455


### **Translate the Hindi Sentences from the Test Set**

In [54]:
outputs = []

for src,trg in zip(hindi_test_sentence_list, english_test_sentence_list):
    prediction = translate_sentence(model, src, device)
    prediction = prediction[:-1]  # remove <eos> token
    x = ' '.join([e for e in prediction])
    outputs.append(x)

### **Compute BLEU and METEOR score on Test Set**

In [55]:
!pip install -U nltk

Collecting nltk
  Downloading nltk-3.6.2-py3-none-any.whl (1.5 MB)
[K     |████████████████████████████████| 1.5 MB 606 kB/s eta 0:00:01
Installing collected packages: nltk
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
preprocessing 0.1.13 requires nltk==3.2.4, but you have nltk 3.6.2 which is incompatible.[0m
Successfully installed nltk-3.6.2


In [56]:
import nltk
import sys
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

references = english_test_sentence_list

hypotheses = outputs

total_num = len(references)
total_bleu_scores = 0
total_meteor_scores = 0
for i in range(total_num):
  total_bleu_scores+=sentence_bleu([references[i].split(" ")], hypotheses[i].split(" "))
  total_meteor_scores+=single_meteor_score(references[i], hypotheses[i])

bleu_result = total_bleu_scores/total_num
meteor_result = total_meteor_scores/total_num

print("bleu score: ",bleu_result)
print("meteor score: ",meteor_result)

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


bleu score:  0.0011523031828677911
meteor score:  0.16078945059348346


### **Load Hindi dataset**

In [57]:
testpath = '../input/hindi4/hindistatements4.csv'

In [58]:
import csv

finaldata=[]

with open(testpath, 'r') as f:
    reader = csv.DictReader(f)
    for line in reader:
        finaldata.append(line['hindi'])

### **Store the corresponding Translated English Sentences**

In [59]:
with open('./answer.txt', 'w') as f:
    for sent in finaldata:
        prediction = translate_sentence(model, sent, device)
        prediction = prediction[:-1]  # remove <eos> token
        x = ' '.join([e for e in prediction])
        f.write(x+'\n')