## **Seq2Seq Model**

In [3]:
filepath = '/content/drive/MyDrive/Colab Notebooks/train.csv'

In [4]:
import pandas as pd

In [5]:
data = pd.read_csv(filepath)

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,hindi,english
0,0,"एल सालवाडोर मे, जिन दोनो पक्षों ने सिविल-युद्ध...","In El Salvador, both sides that withdrew from ..."
1,1,मैं उनके साथ कोई लेना देना नहीं है.,I have nothing to do with them.
2,2,-हटाओ रिक.,"Fuck them, Rick."
3,3,क्योंकि यह एक खुशियों भरी फ़िल्म है.,Because it's a happy film.
4,4,The thought reaching the eyes...,The thought reaching the eyes...


In [7]:
data.shape

(102322, 3)

In [8]:
data.columns

Index(['Unnamed: 0', 'hindi', 'english'], dtype='object')

In [9]:
data.drop(columns='Unnamed: 0', inplace=True)

In [10]:
data.head()

Unnamed: 0,hindi,english
0,"एल सालवाडोर मे, जिन दोनो पक्षों ने सिविल-युद्ध...","In El Salvador, both sides that withdrew from ..."
1,मैं उनके साथ कोई लेना देना नहीं है.,I have nothing to do with them.
2,-हटाओ रिक.,"Fuck them, Rick."
3,क्योंकि यह एक खुशियों भरी फ़िल्म है.,Because it's a happy film.
4,The thought reaching the eyes...,The thought reaching the eyes...


In [11]:
#Ref: https://towardsdatascience.com/how-to-split-a-dataframe-into-train-and-test-set-with-python-eaa1630ca7b3
        
from sklearn.model_selection import train_test_split
train, test = train_test_split(data, test_size=0.2, random_state=42, shuffle=True)

In [12]:
train = train.reset_index(drop=True)
test = test.reset_index(drop=True)

In [13]:
train.head()

Unnamed: 0,hindi,english
0,"मैं पिता की तरह तुम राजा हूं, कब?","When I'm king like you, Father?"
1,"उत्तरी डकोटा, Standing Rock Nation इस कछुओं के...","North Dakota. Standing Rock Nation, in this Tu..."
2,यह आवाज के साथ इश्क कर रही है.,It's flirting with sound.
3,"सुधार, वे मेरे लिए जवाब देना होगा .","Correction, they will have to answer to me."
4,"काफ़ी लोगों नें कहा, ""ऐसा है, शहरी लोग कौन होते...","You know, a lot of people said, ""Well, you kno..."


In [14]:
test.head()

Unnamed: 0,hindi,english
0,"चिन अप, बर्ट.","Chin up, Burt."
1,कैदियों की तरह देखते थे। उनको बेचने से राजा अप...,"By selling them, kings enriched their own real..."
2,केवट ऑलसेन!,It's boatsman Olsen!
3,"निस्संदेह रूढ़िवादी हमेशा गलत होते हैं, ¶ लेकि...","Stereotypes are always a mistake, of course, b..."
4,"अगर वो कर सकते हैं, तो सरकारें और गैर-लाभ संस्...","If they can do that, why can't governments and..."


In [15]:
train.shape[0]

81857

In [16]:
train.loc[0,'hindi']

'मैं पिता की तरह तुम राजा हूं, कब?'

In [17]:
#Create train hindi file and train english file

with open('/content/drive/MyDrive/Colab Notebooks/train_hindi.txt', 'w') as f:
  for i in range(train.shape[0]):
    f.write(train.iloc[i]['hindi']+'\n')

In [18]:
with open('/content/drive/MyDrive/Colab Notebooks/train_english.txt', 'w') as f:
  for i in range(train.shape[0]):
    f.write(train.iloc[i]['english']+'\n')

In [19]:
# make sentence list for training data

hindi_sentence_list = train['hindi'].to_list()
english_sentence_list = train['english'].to_list()

In [20]:
# make test sentence list


hindi_test_sentence_list = test['hindi'].to_list()
english_test_sentence_list = test['english'].to_list()

In [21]:
len(hindi_sentence_list), len(english_sentence_list)

(81857, 81857)

In [22]:
len(hindi_test_sentence_list), len(english_test_sentence_list)

(20465, 20465)

### **Install Indic NLP Library**

In [23]:
!git clone "https://github.com/anoopkunchukuttan/indic_nlp_library"

Cloning into 'indic_nlp_library'...
remote: Enumerating objects: 1325, done.[K
remote: Counting objects: 100% (147/147), done.[K
remote: Compressing objects: 100% (103/103), done.[K
remote: Total 1325 (delta 84), reused 89 (delta 41), pack-reused 1178[K
Receiving objects: 100% (1325/1325), 9.57 MiB | 10.89 MiB/s, done.
Resolving deltas: 100% (688/688), done.


In [24]:
!git clone https://github.com/anoopkunchukuttan/indic_nlp_resources.git

Cloning into 'indic_nlp_resources'...
remote: Enumerating objects: 133, done.[K
remote: Counting objects: 100% (7/7), done.[K
remote: Compressing objects: 100% (7/7), done.[K
remote: Total 133 (delta 0), reused 2 (delta 0), pack-reused 126[K
Receiving objects: 100% (133/133), 149.77 MiB | 33.16 MiB/s, done.
Resolving deltas: 100% (51/51), done.


In [25]:
!pip install Morfessor

Collecting Morfessor
  Downloading Morfessor-2.0.6-py3-none-any.whl (35 kB)
Installing collected packages: Morfessor
Successfully installed Morfessor-2.0.6


In [26]:
# The path to the local git repo for Indic NLP library
INDIC_NLP_LIB_HOME=r"/content/indic_nlp_library"

# The path to the local git repo for Indic NLP Resources
INDIC_NLP_RESOURCES="/content/indic_nlp_resources"

In [27]:
import sys
sys.path.append(r'{}'.format(INDIC_NLP_LIB_HOME))

In [28]:
from indicnlp import common
common.set_resources_path(INDIC_NLP_RESOURCES)

In [29]:
from indicnlp import loader
loader.load()

### **Tokenize Hindi sentences**

In [30]:
# Detokenize
# Ref: https://colab.research.google.com/drive/1p3oGPcNdORw5_MDcufTDYWJhJt3XVPuC?usp=sharing#scrollTo=GU6E07Yw5zvl

from indicnlp.tokenize import indic_detokenize  

for i in range(len(hindi_sentence_list)):
  hindi_sentence_list[i] = indic_detokenize.trivial_detokenize(hindi_sentence_list[i],lang='hi')


In [31]:
import string 
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [32]:
#Ref for unicode chart: https://www.ssec.wisc.edu/~tomw/java/unicode.html#x0900 

import re

from indicnlp.tokenize import indic_tokenize 

hindi_word_to_count={}
hindi_word_to_index={'UNK':0, 'PAD':1, 'SOS':2, 'EOS':3}
hindi_index_to_word={0:'UNK', 1:'PAD', 2:'SOS', 3:'EOS'}
count=4
for sent in hindi_sentence_list:
  for t in indic_tokenize.trivial_tokenize(sent): 
    x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
    for elem in x:
      hindi_word_to_count[elem] = hindi_word_to_count.get(elem,0)+1
      if hindi_word_to_index.get(elem) is None and hindi_word_to_count.get(elem,0) >= 2:
        hindi_word_to_index[elem] = count
        hindi_index_to_word[count] = elem
        count+=1
print(count)

19168


In [33]:
with open('/content/drive/MyDrive/Colab Notebooks/HindiWordToCount.txt','w') as f:
  for k,v in hindi_word_to_count.items():
    f.write(str(k)+","+str(v)+"\n")

In [34]:
print(len(hindi_word_to_count))

39920


In [35]:
print(len(hindi_word_to_index))

19168


### **Tokenize English sentences**

In [36]:
!python3 -m spacy download en

Collecting en_core_web_sm==2.2.5
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.2.5/en_core_web_sm-2.2.5.tar.gz (12.0 MB)
[K     |████████████████████████████████| 12.0 MB 8.0 MB/s 
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/usr/local/lib/python3.7/dist-packages/en_core_web_sm -->
/usr/local/lib/python3.7/dist-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [37]:
import spacy
nlp = spacy.load("en_core_web_sm")

In [38]:
# english_word_to_count={}
# english_word_to_index={'UNK':0, 'PAD':1, 'SOS':2, 'EOS':3}
# english_index_to_word={0:'UNK', 1:'PAD', 2:'SOS', 3:'EOS'}
# count=4

# for sent in english_sentence_list:
#   for token in nlp.tokenizer(sent.lower()):
#     temp = re.findall('[A-Za-z\']+', token.text)
#     for elem in temp:
#       english_word_to_count[elem] = english_word_to_count.get(elem,0)+1
#       if english_word_to_index.get(elem) is None and english_word_to_count.get(elem,0) >= 2:
#         english_word_to_index[elem] = count
#         english_index_to_word[count] = elem
#         count+=1
# print(count)

In [39]:
english_word_to_count={}
english_word_to_index={'UNK':0, 'PAD':1, 'SOS':2, 'EOS':3}
english_index_to_word={0:'UNK', 1:'PAD', 2:'SOS', 3:'EOS'}
count=4

for sent in english_sentence_list:
  for token in nlp.tokenizer(sent.lower()):
    temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token.text)
    for elem in temp:
      english_word_to_count[elem] = english_word_to_count.get(elem,0)+1
      if english_word_to_index.get(elem) is None and english_word_to_count.get(elem,0) >= 2:
        english_word_to_index[elem] = count
        english_index_to_word[count] = elem
        count+=1
print(count)

16715


In [40]:
with open('/content/drive/MyDrive/Colab Notebooks/EnglishWordToCount.txt','w') as f:
  for k,v in english_word_to_count.items():
    f.write(str(k)+","+str(v)+"\n")

### **Find Sentences Length and fix the maximum length and filter the sentences**


First Let's Check for Hindi

In [41]:
sent_len_count={}
for sent in hindi_sentence_list:
  sent_len=0
  for t in indic_tokenize.trivial_tokenize(sent): 
    x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
    for elem in x:
      sent_len+=1
  sent_len_count[sent_len] = sent_len_count.get(sent_len,0)+1

print(sent_len_count)


{10: 4142, 36: 266, 9: 5068, 28: 645, 3: 3866, 8: 5652, 7: 6170, 26: 731, 2: 2751, 13: 2763, 24: 847, 31: 453, 37: 267, 5: 6383, 6: 6487, 22: 1051, 25: 774, 14: 2383, 29: 558, 11: 3643, 4: 5620, 12: 3206, 17: 1628, 21: 1123, 30: 527, 19: 1342, 16: 1959, 23: 932, 20: 1282, 18: 1527, 32: 424, 15: 2163, 46: 113, 34: 367, 27: 685, 44: 141, 87: 3, 42: 187, 33: 415, 85: 8, 41: 177, 35: 354, 49: 90, 53: 67, 57: 29, 51: 89, 80: 8, 52: 73, 60: 38, 55: 55, 62: 26, 58: 44, 596: 1, 59: 32, 38: 257, 47: 111, 50: 95, 1: 324, 66: 26, 43: 194, 74: 22, 39: 216, 40: 222, 45: 146, 67: 21, 54: 52, 56: 49, 76: 6, 68: 27, 64: 32, 72: 19, 70: 10, 79: 8, 48: 117, 61: 47, 69: 19, 78: 5, 0: 3, 139: 1, 63: 22, 94: 2, 65: 21, 82: 4, 116: 1, 109: 4, 92: 2, 84: 8, 86: 9, 83: 10, 112: 1, 90: 3, 95: 3, 77: 10, 73: 10, 99: 4, 81: 5, 144: 1, 71: 15, 75: 11, 107: 2, 125: 1, 88: 3, 118: 3, 93: 1, 100: 1, 104: 4, 91: 2, 97: 2, 106: 1, 127: 1, 111: 1, 102: 1, 96: 4, 133: 1, 131: 1, 165: 1, 105: 2, 223: 1, 141: 1, 101: 3, 1

In [42]:
# sort the dictionary based on their counts
import operator

sorted_counts = sorted(sent_len_count.items(), key=operator.itemgetter(1), reverse=True)

print(sorted_counts)
index=0
for pair in sorted_counts:
  if pair[1]>300:
    index+=1
  else:
    break

max_hindi_len=0
for pair in sorted_counts[:index]:
  if pair[0]>max_hindi_len:
    max_hindi_len=pair[0]

print(max_hindi_len)

[(6, 6487), (5, 6383), (7, 6170), (8, 5652), (4, 5620), (9, 5068), (10, 4142), (3, 3866), (11, 3643), (12, 3206), (13, 2763), (2, 2751), (14, 2383), (15, 2163), (16, 1959), (17, 1628), (18, 1527), (19, 1342), (20, 1282), (21, 1123), (22, 1051), (23, 932), (24, 847), (25, 774), (26, 731), (27, 685), (28, 645), (29, 558), (30, 527), (31, 453), (32, 424), (33, 415), (34, 367), (35, 354), (1, 324), (37, 267), (36, 266), (38, 257), (40, 222), (39, 216), (43, 194), (42, 187), (41, 177), (45, 146), (44, 141), (48, 117), (46, 113), (47, 111), (50, 95), (49, 90), (51, 89), (52, 73), (53, 67), (55, 55), (54, 52), (56, 49), (61, 47), (58, 44), (60, 38), (59, 32), (64, 32), (57, 29), (68, 27), (62, 26), (66, 26), (74, 22), (63, 22), (67, 21), (65, 21), (72, 19), (69, 19), (71, 15), (75, 11), (70, 10), (83, 10), (77, 10), (73, 10), (86, 9), (85, 8), (80, 8), (79, 8), (84, 8), (76, 6), (78, 5), (81, 5), (82, 4), (109, 4), (99, 4), (104, 4), (96, 4), (87, 3), (0, 3), (90, 3), (95, 3), (88, 3), (118, 

Now Let's check for English

In [43]:
english_sent_len_count={}
for sent in english_sentence_list:
  sent_len=0
  for token in nlp.tokenizer(sent.lower()): 
    temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token.text)
    for elem in temp:
      sent_len+=1
  english_sent_len_count[sent_len] = english_sent_len_count.get(sent_len,0)+1

print(english_sent_len_count)

{9: 4922, 32: 418, 6: 7294, 10: 4244, 31: 445, 15: 1973, 3: 3970, 7: 6587, 28: 559, 24: 767, 2: 2972, 11: 3428, 19: 1263, 16: 1838, 35: 272, 4: 6229, 8: 5701, 20: 1119, 5: 7095, 25: 714, 13: 2679, 12: 3145, 14: 2318, 17: 1590, 22: 968, 30: 465, 27: 602, 33: 341, 43: 143, 29: 529, 18: 1424, 44: 135, 92: 3, 37: 210, 55: 40, 21: 1026, 23: 895, 26: 659, 42: 157, 49: 78, 60: 27, 38: 238, 80: 8, 47: 90, 50: 74, 64: 20, 51: 65, 446: 1, 34: 318, 39: 185, 1: 100, 36: 267, 79: 7, 46: 99, 48: 90, 40: 192, 45: 111, 88: 6, 52: 60, 90: 4, 41: 164, 65: 20, 58: 29, 56: 40, 54: 51, 53: 42, 74: 9, 69: 15, 63: 14, 128: 1, 82: 3, 61: 20, 59: 32, 57: 39, 68: 10, 76: 8, 66: 17, 73: 7, 77: 6, 70: 13, 94: 2, 67: 15, 71: 15, 78: 6, 62: 23, 72: 11, 89: 6, 86: 6, 83: 4, 99: 3, 84: 6, 100: 1, 87: 4, 75: 7, 81: 2, 85: 2, 0: 1, 107: 1, 95: 2, 108: 1, 106: 3, 93: 1, 116: 1, 110: 1, 157: 1, 104: 1, 261: 1, 143: 1, 114: 1, 123: 1, 121: 1, 103: 1, 96: 1, 91: 2, 112: 1, 164: 1, 282: 1}


In [44]:
# sort the dictionary based on their counts
import operator

english_sorted_counts = sorted(english_sent_len_count.items(), key=operator.itemgetter(1), reverse=True)

index=0
for pair in english_sorted_counts:
  if pair[1]>300:
    index+=1
  else:
    break

print(index)
max_english_len=0
for pair in english_sorted_counts[:index]:
  if pair[0]>max_english_len:
    max_english_len=pair[0]

print(max_english_len)

33
34


In [45]:
max_len = max(max_hindi_len, max_english_len)
max_len

35

### **Filter out sentences with length less than or equal to maximum length**

In [46]:
english_filtered_sent_list=[]
hindi_filtered_sent_list=[]
filtered_sent_pair_list=[]
for hin_sent, eng_sent in zip(hindi_sentence_list, english_sentence_list):
  hin_sent_len=0
  for t in indic_tokenize.trivial_tokenize(hin_sent): 
    x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
    for elem in x:
      hin_sent_len+=1

  eng_sent_len=0
  for token in nlp.tokenizer(eng_sent.lower()): 
    temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token.text)
    for elem in temp:
      eng_sent_len+=1

  if hin_sent_len>=1 and hin_sent_len<=max_len and eng_sent_len>=1 and eng_sent_len<=max_len:
    english_filtered_sent_list.append(eng_sent)
    hindi_filtered_sent_list.append(hin_sent)
    filtered_sent_pair_list.append([hin_sent, eng_sent])

print(len(english_filtered_sent_list))
print(len(hindi_filtered_sent_list))
print(len(filtered_sent_pair_list))
print(filtered_sent_pair_list[:5])

77825
77825
77825
[['मैं पिता की तरह तुम राजा हूं, कब?', "When I'm king like you, Father?"], ['यह आवाज के साथ इश्क कर रही है.', "It's flirting with sound."], ['सुधार, वे मेरे लिए जवाब देना होगा.', 'Correction, they will have to answer to me.'], ['काफ़ी लोगों नें कहा, "ऐसा है, शहरी लोग कौन होते हैं हम ग्रामीण लोगों को बताने बाले कि अपने समय के साथ क्या करें.', 'You know, a lot of people said, "Well, you know, city boys have no business telling us rural types what to do with our time.'], ['और तुम चूसोगे जो मैं तुम्हें चूसने दूंगा.', ".? and you're gonna swallow what I give you to swallow."]]


In [47]:
# Store the filtered hindi sentences in a file

with open('/content/drive/MyDrive/Colab Notebooks/FilteredSentencesPair.txt','w') as f:
  for line in filtered_sent_pair_list:
    f.write(line[0]+','+line[1]+'\n')

### Form List of list of indexes from the filtered sentences

In [48]:
valid_eng_len=[]
valid_hin_len=[]

In [49]:
# Create tensor array for each hindi sentence 
import torch
hindi_list_indices = torch.tensor([[1]*(max_len+2)]*len(filtered_sent_pair_list), dtype=torch.long)

i=0
for sent in hindi_filtered_sent_list:
  hindi_list_indices[i][0]=2  #SOS
  j=1
  for t in indic_tokenize.trivial_tokenize(sent): 
    x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
    for elem in x:
      if hindi_word_to_index.get(elem) is None:
        continue
        #hindi_list_indices[i][j] = 0  # UNK
      else:
        hindi_list_indices[i][j] = hindi_word_to_index.get(elem)
      j+=1
  hindi_list_indices[i][j]=3  #EOS
  valid_hin_len.append(j)
  j+=1
  while j<=(max_len+1):
    hindi_list_indices[i][j] = 1  #PAD
    j+=1

  i+=1




# Create tensor array for each hindi test sentence
hindi_test_list_indices = torch.tensor([[1]*(max_len+2)]*len(hindi_test_sentence_list), dtype=torch.long)

i=0
for sent in hindi_test_sentence_list:
  hindi_test_list_indices[i][0]=2  #SOS
  j=1
  for t in indic_tokenize.trivial_tokenize(sent): 
    x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
    for elem in x:
      if j>(max_len):
        break
      if hindi_word_to_index.get(elem) is None:
        continue
        #hindi_test_list_indices[i][j] = 0  # UNK
      else:
        hindi_test_list_indices[i][j] = hindi_word_to_index.get(elem)
      j+=1
  hindi_test_list_indices[i][j]=3  #EOS
  j+=1
  while j<=(max_len+1):
    hindi_test_list_indices[i][j] = 1  #PAD
    j+=1

  i+=1


# Create tensor array for each english sentence

english_list_indices = torch.tensor([[1]*(max_len+2)]*len(filtered_sent_pair_list), dtype=torch.long)

i=0
for sent in english_filtered_sent_list:
  english_list_indices[i][0]=2  #SOS
  j=1
  for token in nlp.tokenizer(sent.lower()):
    temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token.text)
    for elem in temp:
      if english_word_to_index.get(elem) is None:
        continue
        #english_list_indices[i][j] = 0  # UNK
      else:
        english_list_indices[i][j] = english_word_to_index.get(elem)
      j+=1
  english_list_indices[i][j]=3  #EOS
  valid_eng_len.append(j)
  j+=1
  while j<=(max_len+1):
    english_list_indices[i][j] = 1  #PAD
    j+=1
  
  i+=1


# Create tensor array for each english test sentence
english_test_list_indices = torch.tensor([[1]*(max_len+2)]*len(english_test_sentence_list), dtype=torch.long)

i=0
for sent in english_test_sentence_list:
  english_test_list_indices[i][0]=2  #SOS
  j=1
  for token in nlp.tokenizer(sent.lower()):
    temp = re.findall('[A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+', token.text)
    for elem in temp:
      if j>(max_len):
        break
      if english_word_to_index.get(elem) is None:
        continue
        #english_test_list_indices[i][j] = 0  # UNK
      else:
        english_test_list_indices[i][j] = english_word_to_index.get(elem)
      j+=1
  english_test_list_indices[i][j]=3  #EOS
  j+=1
  while j<=(max_len+1):
    english_test_list_indices[i][j] = 1  #PAD
    j+=1
  
  i+=1

print(hindi_list_indices[:5])
print(english_list_indices[:5])

tensor([[    2,    17,    44,    22,    48,    16,   785,    41,     4,  1009,
            21,     3,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1],
        [    2,    29,   227,     8,    15,   402,    40,    67,    10,     9,
             3,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1],
        [    2,  1156,     4,    49,    58,    20,   707,   217,   306,     9,
             3,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1],
        [    2,   584,    13,  2716,   202,     4,    31,   148,    10,     4,
           426,    11,   144,    

In [50]:
print(hindi_list_indices.shape)
print(english_list_indices.shape)

torch.Size([77825, 37])
torch.Size([77825, 37])


### Zip The English and hindi text and process it simultaneously

In [51]:
import torch

device = "cuda" if torch.cuda.is_available() else "cpu" 
print(device)

cuda


In [52]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [53]:
class Encoder(nn.Module):

  def __init__(self, input_size, embedding_size, hidden_dimension, num_layers, dropout):  
    super().__init__()
    
    self.hidden_dimension = hidden_dimension
    self.num_layers = num_layers
    self.dropout = dropout
    self.embedding = nn.Embedding(input_size, embedding_size)

    self.rnn = nn.LSTM(embedding_size, hidden_dimension, num_layers, dropout = dropout)

    self.dropout = nn.Dropout(dropout)
      
  def forward(self, word_inputs):
  
    embedded = self.dropout(self.embedding(word_inputs))
    
    #embedded = [batch_size, sequence_length, embedding_size]
    
    outputs, (hidden, cell) = self.rnn(embedded)
    
    return hidden, cell

In [54]:

class Decoder(nn.Module):
  def __init__(self, output_size, embedding_size, hidden_dimension, num_layers, dropout):
    super().__init__()
    
    self.output_size = output_size
    self.hidden_dimension = hidden_dimension
    self.num_layers = num_layers
    
    self.embedding = nn.Embedding(output_size, embedding_size)
    
    self.rnn = nn.LSTM(embedding_size, hidden_dimension, num_layers, dropout = dropout)
    
    self.last_layer = nn.Linear(hidden_dimension, output_size)
    
    self.dropout = nn.Dropout(dropout)
        
  def forward(self, input, hidden, cell):
    
    input = input.unsqueeze(0)

    embedded = self.dropout(self.embedding(input))  
    
    output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
    
    prediction = self.last_layer(output.squeeze(0))

    return prediction, hidden, cell

In [55]:
class Seq2Seq(nn.Module):
  def __init__(self, encoder, decoder, device):
    super().__init__()
    
    self.encoder = encoder
    self.decoder = decoder
    self.device = device
      
  def forward(self, source, target, teacher_force = 1):

    batch_size = target.shape[1]
    sequence_length = target.shape[0]

    target_dict_size = self.decoder.output_size
    
    pred_output = torch.zeros(sequence_length, batch_size, target_dict_size).to(self.device)
    
    hidden, cell = self.encoder(source)
    
    input = target[0]
    
    for i in range(1, sequence_length):

      output, hidden, cell = self.decoder(input, hidden, cell)
      
      pred_output[i] = output
    
      best_pred = output.argmax(1) 

      if random.random() < teacher_force:
        input = target[i]

      else:
        input = best_pred
    
    return pred_output


### **Training**

In [56]:
input_size = len(hindi_word_to_index)
output_size = len(english_word_to_index)
embedding_size = 256
hidden_dimension = 512
num_layers = 2
dropout = 0.5

enc = Encoder(input_size, embedding_size, hidden_dimension, num_layers, dropout).to(device)
dec = Decoder(output_size, embedding_size, hidden_dimension, num_layers, dropout).to(device)

model = Seq2Seq(enc, dec, device).to(device)

print(enc)
print(dec)
print(model)

Encoder(
  (embedding): Embedding(19168, 256)
  (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  (dropout): Dropout(p=0.5, inplace=False)
)
Decoder(
  (embedding): Embedding(16715, 256)
  (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
  (last_layer): Linear(in_features=512, out_features=16715, bias=True)
  (dropout): Dropout(p=0.5, inplace=False)
)
Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(19168, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(16715, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (last_layer): Linear(in_features=512, out_features=16715, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)


### **Test Encoder**

In [57]:
word_input = torch.zeros((7, 4), dtype=torch.long, device=device)  # here 7 is seq length and 4 is batch size
print(word_input.shape)
enc_hid, enc_cell = enc.forward(word_input)  # encode this word_input

print(enc_hid)
print(enc_cell)

print(enc_hid.shape) # [num_layers, seq_length, hidden_units]
print(enc_cell.shape) # [num_layers, seq_length, hidden_units]  Here hidden_units mean hidden dimension

torch.Size([7, 4])
tensor([[[ 0.1904,  0.1793, -0.2109,  ...,  0.0713, -0.0887, -0.0097],
         [ 0.1411,  0.1646, -0.0862,  ...,  0.1745, -0.0210, -0.0557],
         [ 0.2216,  0.0308,  0.0233,  ...,  0.1827, -0.0080, -0.1731],
         [ 0.3838, -0.0009,  0.1093,  ...,  0.2471,  0.0230, -0.1818]],

        [[ 0.1378, -0.0015, -0.0041,  ...,  0.0698, -0.0034, -0.0540],
         [ 0.1190, -0.0130,  0.0690,  ...,  0.0238,  0.1155, -0.0479],
         [ 0.1149, -0.0104, -0.0211,  ...,  0.0702,  0.0239, -0.0659],
         [ 0.0835, -0.0350,  0.0783,  ...,  0.0827,  0.0294, -0.1209]]],
       device='cuda:0', grad_fn=<CudnnRnnBackward>)
tensor([[[ 0.4157,  0.4014, -0.3680,  ...,  0.1384, -0.2407, -0.0169],
         [ 0.2526,  0.2766, -0.1569,  ...,  0.4593, -0.0976, -0.0791],
         [ 0.4453,  0.1030,  0.0349,  ...,  0.2475, -0.0348, -0.2767],
         [ 0.6123, -0.0021,  0.2198,  ...,  0.5197,  0.0948, -0.3113]],

        [[ 0.2732, -0.0030, -0.0079,  ...,  0.1375, -0.0067, -0.1185],


### **Test Decoder**

In [58]:
for i in range(7):
    input = word_input[i]
    pred, dec_hid, dec_cell = dec(input, enc_hid, enc_cell)
    print(dec_hid.shape, pred, dec_cell.shape)

torch.Size([2, 4, 512]) tensor([[ 0.0549,  0.0527,  0.0217,  ...,  0.0085, -0.0515, -0.0003],
        [ 0.0386,  0.0704, -0.0025,  ..., -0.0382,  0.0241, -0.0326],
        [ 0.0208,  0.0344,  0.0136,  ..., -0.0285,  0.0048, -0.0025],
        [ 0.0381,  0.0222,  0.0002,  ..., -0.0158,  0.0242,  0.0256]],
       device='cuda:0', grad_fn=<AddmmBackward>) torch.Size([2, 4, 512])
torch.Size([2, 4, 512]) tensor([[ 0.0267,  0.0124, -0.0089,  ..., -0.0288, -0.0105, -0.0036],
        [ 0.0088,  0.0568,  0.0277,  ..., -0.0582,  0.0103, -0.0254],
        [ 0.0339,  0.0221,  0.0033,  ..., -0.0252,  0.0120,  0.0027],
        [ 0.0176,  0.0020,  0.0139,  ..., -0.0425,  0.0098,  0.0191]],
       device='cuda:0', grad_fn=<AddmmBackward>) torch.Size([2, 4, 512])
torch.Size([2, 4, 512]) tensor([[ 0.0322, -0.0046, -0.0122,  ...,  0.0031, -0.0055, -0.0296],
        [-0.0076,  0.0417,  0.0306,  ..., -0.0296, -0.0069, -0.0509],
        [ 0.0298,  0.0544,  0.0223,  ..., -0.0057, -0.0158,  0.0041],
        [ 

### **Initialize Weights**

In [59]:
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

Seq2Seq(
  (encoder): Encoder(
    (embedding): Embedding(19168, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Decoder(
    (embedding): Embedding(16715, 256)
    (rnn): LSTM(256, 512, num_layers=2, dropout=0.5)
    (last_layer): Linear(in_features=512, out_features=16715, bias=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
)

### **Optimizer**

In [60]:
from torch import optim
optimizer = optim.Adam(model.parameters(), lr=0.001)

### **Loss**

In [61]:
criterion = nn.CrossEntropyLoss(ignore_index = 1)

In [62]:
from torch.utils import data
def load_array(data_arrays, batch_size, is_train=True):
    dataset = data.TensorDataset(*data_arrays)
    return data.DataLoader(dataset, batch_size, shuffle=is_train)

In [63]:
len(hindi_list_indices), len(english_list_indices), type(english_list_indices)

(77825, 77825, torch.Tensor)

In [64]:
print(hindi_list_indices.shape)

torch.Size([77825, 37])


In [65]:
batch_size=32
data_arrays = (hindi_list_indices, english_list_indices)
data_iter = load_array(data_arrays, batch_size)

In [66]:
data_arrays2 = (hindi_test_list_indices, english_test_list_indices)
valid_iter = load_array(data_arrays2, batch_size)

In [67]:
def translate_sentence(model, sentence, device, max_length=max_len):

    sent_list=[]
    for t in indic_tokenize.trivial_tokenize(sentence): 
      x = re.findall("[\u0901-\u0964A-Za-z.?!\"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]+", t)
      for elem in x:
        if hindi_word_to_index.get(elem) is None:
          #sent_list.append(hindi_word_to_index['UNK'])
          continue
        else:
           sent_list.append(hindi_word_to_index[elem]) 

    sent_list.insert(0, hindi_word_to_index['SOS'])
    sent_list.append(hindi_word_to_index['EOS'])

    sent_tensor = torch.tensor(sent_list, dtype=torch.long).unsqueeze(1).to(device)

    with torch.no_grad():
        hidden, cell = model.encoder(sent_tensor)

    outputs = [2]  #SOS = 2

    for _ in range(max_length):
        previous_word = torch.LongTensor([outputs[-1]]).to(device)

        with torch.no_grad():
            output, hidden, cell = model.decoder(previous_word, hidden, cell)
            best_guess = output.argmax(1).item()

        outputs.append(best_guess)

        #EOS=3
        if output.argmax(1).item() == 3:
            break

    translated_sentence = [english_index_to_word[idx] for idx in outputs]

    return translated_sentence[1:]

def checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss):
    state = {'model': model,'best_loss': best_loss,'epoch': epoch,'rng_state': torch.get_rng_state(), 'optimizer': optimizer.state_dict(),}
    torch.save(state, '/content/drive/MyDrive/Colab Notebooks/checkpoint-NMT')
    torch.save(model.state_dict(),'/content/drive/MyDrive/Colab Notebooks/checkpoint-NMT-SD')

In [68]:
import random
import sys
epoch_loss = 0.0
num_epochs = 30
best_loss = sys.maxsize
best_epoch = -1
sentence1 = "मैं कहाँ रहते हैं आप जानते हो?"
step=0

for epoch in range(num_epochs):
  print("Epoch -",epoch+1)
  model.eval()
  translated_sentence1 = translate_sentence(model, sentence1, device, max_length=max_len)

  model.train(True)
  for batch_idx, batch in enumerate(data_iter):
    input, target = [x.to(device) for x in batch]

    input = input.permute(1,0)
    target = target.permute(1,0)

    output = model(input, target)
    output = output[1:].reshape(-1, output.shape[2])
    target = target[1:].reshape(-1)

    optimizer.zero_grad()

    loss = criterion(output, target)

    loss.backward()

    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)

    optimizer.step()
    step += 1
      
    epoch_loss += loss.item()

  if epoch_loss < best_loss:
    best_loss = epoch_loss
    best_epoch = epoch
    checkpoint_and_save(model, best_loss, epoch, optimizer, epoch_loss) 

  print("Epoch_Loss - {}".format(loss.item()))
  print()
  
print(epoch_loss / len(data_iter))

Epoch - 1
Epoch_Loss - 5.103395938873291

Epoch - 2
Epoch_Loss - 4.9170355796813965

Epoch - 3
Epoch_Loss - 3.745488405227661

Epoch - 4
Epoch_Loss - 3.059257745742798

Epoch - 5
Epoch_Loss - 2.108837604522705

Epoch - 6
Epoch_Loss - 2.0863287448883057

Epoch - 7
Epoch_Loss - 1.623865008354187

Epoch - 8
Epoch_Loss - 3.311858892440796

Epoch - 9
Epoch_Loss - 1.4698431491851807

Epoch - 10
Epoch_Loss - 3.5562806129455566

Epoch - 11
Epoch_Loss - 1.1076253652572632

Epoch - 12
Epoch_Loss - 2.8540573120117188

Epoch - 13
Epoch_Loss - 0.5403867959976196

Epoch - 14
Epoch_Loss - 2.381434440612793

Epoch - 15
Epoch_Loss - 0.34060031175613403

Epoch - 16
Epoch_Loss - 1.7280659675598145

Epoch - 17
Epoch_Loss - 2.3184120655059814

Epoch - 18
Epoch_Loss - 2.0976369380950928

Epoch - 19
Epoch_Loss - 0.5977654457092285

Epoch - 20
Epoch_Loss - 1.1289169788360596

Epoch - 21
Epoch_Loss - 0.3504440188407898

Epoch - 22
Epoch_Loss - 1.139922857284546

Epoch - 23
Epoch_Loss - 1.3642802238464355

Epoc

In [69]:
outputs = []

for src,trg in zip(hindi_test_sentence_list, english_test_sentence_list):
    prediction = translate_sentence(model, src, device)
    prediction = prediction[:-1]  # remove <eos> token
    x = ' '.join([e for e in prediction])
    outputs.append(x)

In [70]:
!pip install -U nltk



In [71]:
import nltk
import sys
nltk.download('wordnet')
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.meteor_score import single_meteor_score

references = english_test_sentence_list

hypotheses = outputs

total_num = len(references)
total_bleu_scores = 0
total_meteor_scores = 0
for i in range(total_num):
  total_bleu_scores+=sentence_bleu([references[i].split(" ")], hypotheses[i].split(" "))
  total_meteor_scores+=single_meteor_score(references[i], hypotheses[i])

bleu_result = total_bleu_scores/total_num
meteor_result = total_meteor_scores/total_num

print("bleu score: ",bleu_result)
print("meteor score: ",meteor_result)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


bleu score:  0.00047116058731513916
meteor score:  0.10489189948948359


In [72]:
print(epoch_loss / len(data_iter))

66.97723525815267


In [73]:
testpath = '/content/drive/MyDrive/Colab Notebooks/hindistatements.csv'

In [74]:
finaldata = pd.read_csv(testpath,index_col=None)

In [75]:
finaldata = finaldata['hindi'].to_list()

In [76]:
with open('/content/drive/MyDrive/Colab Notebooks/answer.txt', 'w') as f:
    for sent in finaldata:
        prediction = translate_sentence(model, sent, device)
        prediction = prediction[:-1]  # remove <eos> token
        x = ' '.join([e for e in prediction])
        f.write(x+'\n')