In [118]:
from __future__ import unicode_literals, print_function, division
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import unicodedata

import numpy as np
import pandas as pd

import os
import re
import random

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# Reserved vocabulary ids for the sentence markers. They must match the
# entries that Lang pre-populates in index2word ({0: "SOS", 1: "EOS"});
# EOS must differ from SOS, otherwise end-of-sentence is indistinguishable
# from start-of-sentence both in the training targets and when decoding.
SOS_token = 0
EOS_token = 1
# Sentence pairs are kept only when both sides have fewer than MAX_LENGTH words.
MAX_LENGTH = 20

class Lang:
    """Vocabulary for one language.

    Maps whitespace-separated words to integer ids and back, and counts how
    often each word has been added. Ids 0 and 1 are pre-assigned to the
    "SOS" and "EOS" markers in ``index2word``, so real words start at id 2.
    """

    def __init__(self):
        # The two marker entries are present from the start, hence n_words == 2.
        self.index2word = {0: "SOS", 1: "EOS"}
        self.n_words = 2
        self.word2index = {}
        self.word2count = {}

    def addSentence(self, sentence):
        """Register every whitespace-separated token of ``sentence``."""
        for token in sentence.split():
            self.addWord(token)

    def addWord(self, word):
        """Add ``word`` to the vocabulary, or bump its count if already known."""
        if word in self.word2index:
            self.word2count[word] += 1
            return

        # First sighting: the next free id is the current vocabulary size.
        new_id = self.n_words
        self.word2index[word] = new_id
        self.word2count[word] = 1
        self.index2word[new_id] = word
        self.n_words = new_id + 1


def normalize_sentence(df, lang):
    """Return the ``lang`` column of ``df`` lower-cased and cleaned.

    Args:
        df: DataFrame holding one text column per language.
        lang: Column name; 'eng' and 'hin' get language-specific cleaning,
            any other column is only lower-cased.

    Returns:
        A pandas Series of cleaned sentences, aligned with ``df``.
    """
    sentence = df[lang].str.lower()

    if lang == 'eng':
        # Fold accents BEFORE stripping non-letters: NFD splits "é" into
        # "e" + combining mark, and the ASCII round-trip drops the mark.
        # Stripping first would delete the accented letter entirely
        # ("café" -> "caf") and make the two steps below no-ops.
        sentence = sentence.str.normalize('NFD')
        sentence = sentence.apply(lambda x: x.encode('ascii', errors='ignore').decode('ascii'))
        # Keep only ASCII letters and whitespace.
        sentence = sentence.str.replace(r'[^A-Za-z\s]+', '', regex=True)
    elif lang == 'hin':
        # Keep word characters, whitespace and the whole Devanagari block
        # (U+0900-U+097F, which includes the danda); everything else, such
        # as ASCII punctuation, is removed.
        sentence = sentence.str.replace(r'[^\w\s\u0900-\u097F]', '', regex=True)

    return sentence

In [None]:
def read_sentence(df, lang1, lang2):
  """Normalize both language columns of ``df``.

  Returns a 2-tuple ``(lang1 sentences, lang2 sentences)`` as produced by
  ``normalize_sentence``, in that order.
  """
  return tuple(normalize_sentence(df, column) for column in (lang1, lang2))

def read_file(loc, lang1, lang2):
  """Load a header-less, tab-separated file into a DataFrame.

  The columns are named ``lang1``, ``lang2`` and ``'meta'``, in that order.
  """
  column_names = [lang1, lang2, 'meta']
  return pd.read_csv(loc, sep='\t', header=None, names=column_names)

def process_data(lang1, lang2):
  """Load the ``text/<lang1>-<lang2>.txt`` corpus and build both vocabularies.

  Prints the number of rows read, normalizes both columns, and keeps only
  the pairs in which each side has fewer than MAX_LENGTH words.

  Returns:
      (source, target, pairs): the ``Lang`` vocabulary for ``lang1``, the
      ``Lang`` vocabulary for ``lang2``, and a list of
      ``[lang1 sentence, lang2 sentence]`` lists for the kept pairs.
  """
  df = read_file('text/%s-%s.txt' % (lang1, lang2), lang1, lang2)
  print(f"Read {len(df)} sentence pairs")
  sentence1, sentence2 = read_sentence(df, lang1, lang2)

  source, target = Lang(), Lang()
  pairs = []
  for row in range(len(df)):
    src_text = sentence1[row]
    if len(src_text.split()) >= MAX_LENGTH:
      continue
    tgt_text = sentence2[row]
    if len(tgt_text.split()) >= MAX_LENGTH:
      continue

    # Only pairs that pass the length filter contribute to the vocabularies.
    source.addSentence(src_text)
    target.addSentence(tgt_text)
    pairs.append([src_text, tgt_text])

  return source, target, pairs


In [121]:
def indexesFromSentence(lang, sentence):
  """Translate each whitespace-separated word of ``sentence`` into its id.

  Ids are looked up in ``lang.word2index``; a word that is not in the
  vocabulary raises ``KeyError``.
  """
  ids = []
  for token in sentence.split():
    ids.append(lang.word2index[token])
  return ids

def tensorFromSentence(lang, sentence):
  """Encode ``sentence`` as a column tensor of word ids ending in EOS_token.

  The result is a ``torch.long`` tensor on ``device``, reshaped with
  ``view(-1, 1)`` so it has one id per row.
  """
  token_ids = indexesFromSentence(lang, sentence)
  token_ids.append(EOS_token)
  flat = torch.tensor(token_ids, dtype=torch.long, device=device)
  return flat.view(-1, 1)

def tensorsFromPair(input_lang, output_lang, pair):
  """Encode a ``[source sentence, target sentence]`` pair as two tensors.

  ``pair[0]`` is encoded with ``input_lang`` and ``pair[1]`` with
  ``output_lang``; returns ``(input_tensor, target_tensor)``.
  """
  return (
    tensorFromSentence(input_lang, pair[0]),
    tensorFromSentence(output_lang, pair[1]),
  )

In [124]:
def evaluate(model, input_lang, output_lang, sentences, max_length=MAX_LENGTH):
    """Decode the model's prediction for one sentence pair into words.

    Args:
        model: Callable invoked as ``model(input_tensor, output_tensor)``.
        input_lang: Vocabulary used to encode ``sentences[0]``.
        output_lang: Vocabulary used to encode ``sentences[1]`` and to map
            predicted ids back to words.
        sentences: Indexable pair ``(source sentence, target sentence)``.
        max_length: Accepted but not used by this function.

    Returns:
        List of predicted words; decoding stops after appending ``'<EOS>'``
        when the top-scoring id equals EOS_token.

    NOTE(review): the reference translation is passed to the model here; whether
    the model actually consumes it while decoding depends on its forward(),
    which is not visible in this file - confirm.
    """
    decoded_words = []

    with torch.no_grad():
        input_tensor = tensorFromSentence(input_lang, sentences[0])
        output_tensor = tensorFromSentence(output_lang, sentences[1])
        output = model(input_tensor, output_tensor)

        # Walk the first dimension of the output, taking the top-1 id at each step.
        for step in range(output.size(0)):
            _, top_index = output[step].topk(1)
            predicted_id = top_index[0].item()

            if predicted_id == EOS_token:
                decoded_words.append('<EOS>')
                break
            decoded_words.append(output_lang.index2word[predicted_id])

    return decoded_words

def evaluateRandomly(model, source, target, pairs, n=10):
    """Print source, target and predicted sentence for ``n`` random pairs.

    Each pair is drawn with ``random.choice(pairs)``, so the same pair may
    be shown more than once.
    """
    for _ in range(n):
        sample = random.choice(pairs)
        print('source {}'.format(sample[0]))
        print('target {}'.format(sample[1]))
        predicted = ' '.join(evaluate(model, source, target, sample))
        print('predicted {}'.format(predicted))
       

In [None]:
# Language pair to train on; process_data loads 'text/<lang1>-<lang2>.txt'.
lang1 = 'eng'
lang2 = 'hin'
source, target, pairs = process_data(lang1, lang2)

# Show one random pair as a sanity check of the normalization.
# NOTE(review): no random seed is set in the visible cells, so this sample
# (and the ones drawn in evaluateRandomly) differs from run to run.
randomize = random.choice(pairs)
print('random sentence {}'.format(randomize))

# Vocabulary sizes; n_words includes the two reserved SOS/EOS entries.
input_size = source.n_words
output_size = target.n_words
print('Input : {} Output : {}'.format(input_size, output_size))

# Model and training hyperparameters.
embed_size = 256
hidden_size = 512
num_layers = 1
num_iteration = 100000

# Build the encoder-decoder model.
# NOTE(review): Encoder, Decoder, Seq2Seq and trainModel are not defined in the
# cells visible here; they must already exist in the kernel - confirm the
# notebook survives Restart & Run All.
encoder = Encoder(input_size, hidden_size, embed_size, num_layers)
decoder = Decoder(output_size, hidden_size, embed_size, num_layers)

model = Seq2Seq(encoder, decoder, device).to(device)

# Print the module structure of both halves.
print(encoder)
print(decoder)

# Train, then print predictions for a few randomly chosen pairs.
model = trainModel(model, source, target, pairs, num_iteration)
evaluateRandomly(model, source, target, pairs)

Read 3061 sentence pairs
random sentence ['she gave me a large room while i stayed at her house', 'जब मैं उसके घर पर रही उसने मुझे रहने के लिए एक बड़ा सा कमरा दिया।']
Input : 2432 Output : 3129
Encoder(
  (embedding): Embedding(2432, 256)
  (gru): GRU(256, 512)
)
Decoder(
  (embedding): Embedding(3129, 256)
  (gru): GRU(256, 512)
  (out): Linear(in_features=512, out_features=3129, bias=True)
  (softmax): LogSoftmax(dim=1)
)
5000 4.3709
10000 4.2771
15000 4.2573
20000 4.2763
