
**Import of Libraries** 


In [1]:
import numpy as np
import re
import random
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import string
import random

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Display which device the expression used for `device` resolves to on this runtime.
torch.device("cuda" if torch.cuda.is_available() else "cpu")

device(type='cuda')

**For reading train.csv from google drive**

In [3]:
# Install (or upgrade) PyDrive quietly; it is used below to fetch files from Google Drive.
!pip install -U -q PyDrive
  
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
  
# Authenticate and create the PyDrive client.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client object the download cells call CreateFile(...) on.
drive = GoogleDrive(gauth)

**Link to my drive**

In [None]:
# Share link of the training CSV; the Drive file id is extracted from it by splitting on "/".
link_train = 'https://drive.google.com/file/d/1djSVsZbRv7QhnXGHTTh-ljj8WArssyR0/view?usp=sharing'

In [None]:
import pandas as pd

# The Drive file id is the second-to-last "/"-separated segment of the share link.
# It is stored as `id_train` (matching id_encode / id_decode / id_hindi used by the
# other download cells) so the built-in `id` is not shadowed for the rest of the notebook.
id_train = link_train.split("/")[-2]

# Fetch the Drive file into the working directory as train.csv.
downloaded = drive.CreateFile({'id': id_train})
downloaded.GetContentFile('train.csv')

df_train = pd.read_csv('train.csv')

**PREPROCESSING**

**Removal of noisy sentences** (sentences in which English words are mixed with the Hindi text are removed)

In [None]:
def _has_latin_token(sentence):
    """Return True if any space-separated token of `sentence` starts with an ASCII letter."""
    return any(re.match(r"[A-Za-z]", token) for token in sentence.split(" "))


# Build one boolean mask and filter once. The previous version called
# DataFrame.drop(inplace=True) for every offending row while iterating over the
# same frame, which mutates the object being iterated and repeats a frame-sized
# operation per dropped row. The set of surviving rows (and their index labels)
# is the same: a row is removed when any token begins with A-Z or a-z.
noisy = df_train['hindi'].map(_has_latin_token).astype(bool)
df_train = df_train[~noisy].copy()

In [None]:
# Number of rows left after removing the noisy sentences.
len(df_train)

97239

**Count the number of sentences of each length**

In [None]:
# Histogram of Hindi sentence lengths: token count -> number of sentences.
# Tokens are obtained by splitting on a single space.
leng = {}
for sentence in df_train['hindi']:
    n_tokens = len(sentence.split(" "))
    leng[n_tokens] = leng.get(n_tokens, 0) + 1

**Number of sentences whose length is smaller than 10**

In [None]:
# Total number of sentences that have between 1 and 9 tokens.
count = sum(leng[n_tokens] for n_tokens in range(1, 10))
print(count)

57405


**Drop rows whose length is greater than 9**


In [None]:
# Keep only rows whose Hindi sentence has at most 9 space-separated tokens.
# A single boolean filter replaces the per-row DataFrame.drop(inplace=True)
# calls made while iterating over the same frame; the surviving rows and their
# index labels are the same.
short_enough = df_train['hindi'].map(
    lambda sentence: len(sentence.split(" ")) <= 9
).astype(bool)
df_train = df_train[short_enough].copy()

In [None]:
# Number of rows left after removing sentences longer than 9 tokens.
len(df_train)

57405

**word-index dictionary and total words in dictionary for Hindi and English Language**

In [None]:
# English vocabulary tables (filled by the vocabulary-building cell below).
english_index = {}  # word -> integer id
english_count = {}  # word -> number of occurrences
english_word = {0: "start", 1: "end"}  # integer id -> word; ids 0 and 1 are reserved
english_total_words = 2  # next id to hand out (0 and 1 are already taken)

# Hindi vocabulary tables, same layout as the English ones.
hindi_index = {}  # word -> integer id
hindi_count = {}  # word -> number of occurrences
hindi_word = {0: "start", 1: "end"}  # integer id -> word; ids 0 and 1 are reserved
hindi_total_words = 2  # next id to hand out (0 and 1 are already taken)

In [None]:
# Build [hindi, english] training pairs from rows where both sides have fewer
# than 10 space-separated tokens. The Hindi text is used as-is; the English
# text is lower-cased, stripped, and has '.', '!' and '?' replaced by spaces.
# NOTE(review): the two substitutions can leave consecutive spaces, which later
# become empty-string tokens when splitting on ' '. Kept exactly as-is because
# the word ids (and presumably the saved checkpoints) depend on this tokenisation.
pairs = []
for hindi_text, english_text in zip(df_train['hindi'], df_train['english']):
    if len(english_text.split(" ")) >= 10 or len(hindi_text.split(" ")) >= 10:
        continue

    cleaned = english_text.lower().strip()
    cleaned = re.sub(r"([.!?])", r" \1", cleaned)
    cleaned = re.sub(r"[.!?]+", r" ", cleaned)
    pairs.append([hindi_text, cleaned])

In [None]:
def _register_tokens(sentence, index, count, word, next_id):
    """Add every space-separated token of `sentence` to the vocabulary tables.

    Unseen tokens receive consecutive ids starting at `next_id` and a count of 1;
    tokens already in `index` only get their count incremented.
    Returns the next unused id.
    """
    for token in sentence.split(' '):
        if token in index:
            count[token] += 1
        else:
            index[token] = next_id
            count[token] = 1
            word[next_id] = token
            next_id = next_id + 1
    return next_id


# Walk the pairs in order so ids are assigned by first appearance.
for hindi_sentence, english_sentence in pairs:
    hindi_total_words = _register_tokens(
        hindi_sentence, hindi_index, hindi_count, hindi_word, hindi_total_words)
    english_total_words = _register_tokens(
        english_sentence, english_index, english_count, english_word, english_total_words)

**Dictionary size for Hindi and English**

In [None]:
# Vocabulary sizes: Hindi first, then English.
for vocabulary in (hindi_index, english_index):
    print(len(vocabulary))

29621
19350


**Dictionary size after removing words that occur only once**

In [None]:
# Remove words that occurred exactly once from the word -> id lookup tables.
# Only *_index is pruned here; *_count, *_word and *_total_words are untouched.
# pop(word, None) instead of `del` makes the cell safe to re-run: on a second
# run the word is already gone and `del` would raise KeyError.
for word, occurrences in hindi_count.items():
    if occurrences == 1:
        hindi_index.pop(word, None)

for word, occurrences in english_count.items():
    if occurrences == 1:
        english_index.pop(word, None)

In [None]:
# Vocabulary sizes after pruning: Hindi first, then English.
for vocabulary in (hindi_index, english_index):
    print(len(vocabulary))

11416
9086


**Encoder** (With GRU)

In [None]:
class EncoderRNN(nn.Module):
    """GRU encoder that consumes one token id per call.

    Args:
        input_size: number of rows in the embedding table (source vocabulary size).
        hidden_size: width of both the embedding vectors and the GRU hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size)

    def forward(self, input, hidden):
        """Run a single GRU step.

        The embedded token is reshaped to (1, 1, hidden_size), i.e. one time step
        of a batch of one. Returns the GRU ``(output, hidden)`` pair.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        output, hidden = self.gru(embedded, hidden)
        return output, hidden

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) initial hidden state.

        The tensor is created on the device that holds this module's weights
        rather than on a notebook-level global, so the state always matches the
        module even if it is moved to another device.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

**Decoder**

In [None]:
class AttnDecoderRNN(nn.Module):
    """GRU decoder with attention over a fixed number of encoder positions.

    Args:
        hidden_size: width of the embeddings and of the GRU hidden state.
        output_size: number of target-vocabulary ids (size of the output layer).
        dp: dropout probability applied to the embedded input token.
        l: number of encoder positions the attention layer scores; the
            ``encoder_outputs`` passed to ``forward`` must have this many rows.
    """

    def __init__(self, hidden_size, output_size, dp=0.1, l=11):
        super(AttnDecoderRNN, self).__init__()
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.dp = dp
        self.l = l

        self.embedding = nn.Embedding(self.output_size, self.hidden_size)
        self.attn = nn.Linear(self.hidden_size * 2, self.l)
        self.attn_combine = nn.Linear(self.hidden_size * 2, self.hidden_size)
        self.dropout = nn.Dropout(self.dp)
        self.gru = nn.GRU(self.hidden_size, self.hidden_size)
        self.out = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, input, hidden, encoder_outputs):
        """Decode one step.

        Returns ``(log_probs, hidden, attn_weights)`` where ``log_probs`` is the
        log-softmax over the output vocabulary and ``attn_weights`` is the
        softmax over the ``l`` encoder positions.
        """
        embedded = self.embedding(input).view(1, 1, -1)
        embedded = self.dropout(embedded)

        # Attention scores come from the current input embedding and the
        # previous hidden state, normalised over the encoder positions.
        attn_weights = F.softmax(
            self.attn(torch.cat((embedded[0], hidden[0]), 1)), dim=1)
        # Weighted sum of the encoder outputs (the context vector).
        attn_applied = torch.bmm(attn_weights.unsqueeze(0),
                                 encoder_outputs.unsqueeze(0))

        # Merge embedding and context back down to hidden_size for the GRU.
        output = torch.cat((embedded[0], attn_applied[0]), 1)
        output = self.attn_combine(output).unsqueeze(0)

        output = F.relu(output)
        output, hidden = self.gru(output, hidden)

        output = F.log_softmax(self.out(output[0]), dim=1)
        return output, hidden, attn_weights

    def initHidden(self):
        """Return an all-zero (1, 1, hidden_size) hidden state.

        Created on the device that holds this module's weights instead of a
        notebook-level global, so it always matches the module.
        """
        return torch.zeros(1, 1, self.hidden_size,
                           device=self.embedding.weight.device)

**Teacher Forcing Ratio**

In [None]:
teacher_forcing_ratio = 0.65

def training_phase(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=11):
    """Run one optimisation step on a single (source, target) pair.

    The source is fed to the encoder token by token; the decoder then predicts
    the target step by step, starting from token id 0 and the encoder's final
    hidden state. With probability ``teacher_forcing_ratio`` the ground-truth
    token is fed as the next decoder input; otherwise the decoder's own top
    prediction is fed back and decoding stops early once it predicts id 1.

    Returns the summed loss divided by the target length.
    """
    hidden_state = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    # One row per source position; rows past the input length stay zero.
    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)
    for position in range(input_length):
        step_output, hidden_state = encoder(input_tensor[position], hidden_state)
        encoder_outputs[position] = step_output[0, 0]

    decoder_input = torch.tensor([[0]], device=device)
    decoder_hidden = hidden_state

    # Decide once per pair whether the whole target is teacher-forced.
    use_teacher_forcing = random.random() < teacher_forcing_ratio

    loss = 0
    for step in range(target_length):
        decoder_output, decoder_hidden, _ = decoder(decoder_input, decoder_hidden, encoder_outputs)
        loss += criterion(decoder_output, target_tensor[step])

        if use_teacher_forcing:
            decoder_input = target_tensor[step]
            continue

        # Free-running: feed back the most likely token, detached from the graph.
        decoder_input = decoder_output.topk(1)[1].squeeze().detach()
        if decoder_input.item() == 1:
            break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

**This is the training phase where we get input tensors and output tensors using word-index dictionary**

**The number of epochs is also declared here**

In [None]:
def _sentence_to_tensor(sentence, index):
    """Encode a space-separated sentence as a (num_tokens + 1, 1) LongTensor.

    Words missing from ``index`` are encoded as id 1, and id 1 is appended as
    the end-of-sentence marker.
    """
    ids = [index.get(word, 1) for word in sentence.split(' ')]
    ids.append(1)
    return torch.tensor(ids, dtype=torch.long, device=device).view(-1, 1)


def Iteration(encoder, decoder, every, learning_rate=0.001, epochs=10):
    """Train the encoder/decoder with SGD over all of ``pairs``.

    Args:
        encoder, decoder: the models to train.
        every: print the average loss once every this many training steps.
        learning_rate: SGD learning rate used for both optimizers.
        epochs: number of passes over ``pairs`` (previously hard-coded to 10).

    Changes relative to the earlier version:
      * every pair is used and every epoch is completed (the old ranges stopped
        at ``len(pairs) - 1`` and one step short of the last iteration);
      * the progress percentage is ``step / total_steps * 100``; the old
        expression ``iter / 10*(len(pairs)-1)`` multiplied instead of dividing
        by the pair count because of operator precedence;
      * the pairs are converted to tensors once and reused each epoch instead
        of being materialised once per epoch.
    """
    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    criterion = nn.NLLLoss()

    training_pairs = [
        (_sentence_to_tensor(hindi_sentence, hindi_index),
         _sentence_to_tensor(english_sentence, english_index))
        for hindi_sentence, english_sentence in pairs
    ]

    total_steps = epochs * len(training_pairs)
    print_loss_total = 0
    step = 0

    for _epoch in range(epochs):
        for input_tensor, target_tensor in training_pairs:
            step += 1
            loss = training_phase(input_tensor, target_tensor, encoder, decoder,
                                  encoder_optimizer, decoder_optimizer, criterion)
            print_loss_total += loss

            if step % every == 0:
                print_loss_avg = print_loss_total / every
                print_loss_total = 0
                print('(%d %d%%) %.4f' % (step, step / total_steps * 100, print_loss_avg))

In [None]:
# Width used for the embeddings and the GRU hidden state of both networks.
hidden_size = 1024
# The embedding tables are sized by the *_total_words counters (the running id
# counters from vocabulary building), not by len(*_index).
encode = EncoderRNN(hindi_total_words, hidden_size).to(device)
decode = AttnDecoderRNN(hidden_size, english_total_words).to(device)

**Loading save states**

**Loading encode.pt and decode.pt**

In [None]:
# Share link of the saved encoder weights (downloaded below as encode.pt).
file_en = 'https://drive.google.com/file/d/1IvtzSR-hhFGIHx1nmuScWyI0ntx26YOy/view?usp=sharing'

In [None]:
# The Drive file id is the second-to-last "/"-separated segment of the share link.
id_encode = file_en.split("/")[-2]
  
# Fetch the file into the working directory as encode.pt.
downloaded_en = drive.CreateFile({'id':id_encode}) 
downloaded_en.GetContentFile('encode.pt')  

In [None]:
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved from a GPU also loads when this runtime resolved `device` to the CPU.
encode.load_state_dict(torch.load('encode.pt', map_location=device))

<All keys matched successfully>

In [None]:
# Share link of the saved decoder weights (downloaded below as decode.pt).
file_de = 'https://drive.google.com/file/d/19-xe2nN1E0VP9Xj5W54OcSsoKA7OxgYp/view?usp=sharing'

In [None]:
# The Drive file id is the second-to-last "/"-separated segment of the share link.
id_decode = file_de.split("/")[-2]
  
# Fetch the file into the working directory as decode.pt.
downloaded_de = drive.CreateFile({'id':id_decode}) 
downloaded_de.GetContentFile('decode.pt')  

In [None]:
# map_location remaps the stored tensors onto `device`, so a checkpoint that was
# saved from a GPU also loads when this runtime resolved `device` to the CPU.
decode.load_state_dict(torch.load('decode.pt', map_location=device))

<All keys matched successfully>

**Skip this step, because the trained weights were already loaded from encode.pt and decode.pt**

In [None]:
#Iteration(encode, decode, every=200)

**Evaluation step**

In [None]:
def evaluation_of_sent(encoder, decoder, sentence, max_length=11):
    """Greedily translate one space-separated Hindi sentence.

    Args:
        encoder, decoder: the trained models.
        sentence: source text; words are looked up in ``hindi_index`` and words
            that are not found are encoded as id 1.
        max_length: number of encoder positions and maximum number of decoding
            steps. Input beyond ``max_length - 1`` words is ignored so that the
            end marker always fits (longer input used to raise IndexError).

    Returns:
        ``(decoded_words, attentions)``: the predicted words (an empty string
        is appended when the end id 1 is predicted) and the attention weights
        of the decoding steps that were run.

    Both models are switched to eval mode for the duration of the call, so the
    decoder's dropout layer is inactive and the output is deterministic; the
    previous train/eval mode of each model is restored afterwards.
    """
    encoder_was_training = encoder.training
    decoder_was_training = decoder.training
    encoder.eval()
    decoder.eval()
    try:
        with torch.no_grad():
            ids = [hindi_index.get(word, 1) for word in sentence.split(' ')]
            ids = ids[:max_length - 1]   # keep one slot free for the end marker
            ids.append(1)                # end-of-sentence marker
            input_tensor = torch.tensor(ids, dtype=torch.long, device=device).view(-1, 1)

            input_length = input_tensor.size()[0]
            encoder_hidden = encoder.initHidden()

            # One row per source position; unused rows stay zero.
            encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

            for ei in range(input_length):
                encoder_output, encoder_hidden = encoder(input_tensor[ei],
                                                         encoder_hidden)
                encoder_outputs[ei] += encoder_output[0, 0]

            # Decoding starts from the start id 0 and the encoder's final state.
            decoder_input = torch.tensor([[0]], device=device)
            decoder_hidden = encoder_hidden

            decoded_words = []
            decoder_attentions = torch.zeros(max_length, max_length)

            for di in range(max_length):
                decoder_output, decoder_hidden, decoder_attention = decoder(
                    decoder_input, decoder_hidden, encoder_outputs)
                decoder_attentions[di] = decoder_attention.data
                topv, topi = decoder_output.data.topk(1)
                if topi.item() == 1:
                    decoded_words.append('')
                    break
                else:
                    decoded_words.append(english_word[topi.item()])

                decoder_input = topi.squeeze().detach()

            return decoded_words, decoder_attentions[:di + 1]
    finally:
        encoder.train(encoder_was_training)
        decoder.train(decoder_was_training)

**Evaluation step for single sentence**

In [None]:
# Translate one short Hindi sample and print the predicted words joined by spaces.
hindi = 'उन्होंने वो'
output_words, _ = evaluation_of_sent(encode, decode , hindi)
ans = ' '.join(output_words)
print(ans)

they she   


In [None]:
# Share link of the test-set CSV (downloaded below as hindistatements.csv).
link_hindi = 'https://drive.google.com/file/d/1Lclpwk1xJydzwN0F168BNdWlAztwdQk_/view?usp=sharing'

In [None]:
# The Drive file id is the second-to-last "/"-separated segment of the share link.
id_hindi = link_hindi.split("/")[-2]
  
# Fetch the file into the working directory as hindistatements.csv.
downloaded1 = drive.CreateFile({'id':id_hindi}) 
downloaded1.GetContentFile('hindistatements.csv')  

In [None]:
# Test sentences to translate; the cells below read its 'id' and 'hindi' columns.
df_hi = pd.read_csv('hindistatements.csv')

In [None]:
# Peek at the first row to check the column layout.
df_hi.head(1)

Unnamed: 0.1,Unnamed: 0,id,hindi
0,0,0,"(तालियां) अब, इसने मेरे मन को उड़ा दिया।"


In [None]:
# Number of test sentences.
len(df_hi)

24102

**Saving test set conversions**

In [None]:
# Translate every test sentence, collecting the translations and their ids in
# matching order. A running counter is printed after each sentence.
list_ans = []
list_id = []
o = 0
for _, row in df_hi.iterrows():
    row_id = row['id']
    # Keep at most the first 10 space-separated tokens. For sentences that have
    # fewer than 10 tokens, splitting and re-joining yields the same string.
    clipped = ' '.join(row['hindi'].split(' ')[0:10])
    words, _ = evaluation_of_sent(encode, decode , clipped)
    list_ans.append(' '.join(words))
    list_id.append(row_id)
    print(o)
    o = o + 1

In [None]:
# Import the Colab Drive helper under an alias. The name `drive` is already
# bound to the PyDrive client created in the authentication cell, and rebinding
# it here would make every drive.CreateFile(...) download cell fail if re-run.
from google.colab import drive as colab_drive
colab_drive.mount('/content/drive')

Mounted at /content/drive


**Output File**

In [None]:
# Write one translation per line to the mounted Drive folder.
with open('/content/drive/My Drive/NLP/answer.txt', 'w') as f:
    c = 0  # ends up as the number of lines written
    for c, translation in enumerate(list_ans, start=1):
        f.write(translation + '\n')