In [1]:
import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch.nn.utils.rnn import pad_sequence
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import random

import spacy
from transformers import AutoTokenizer
import string
import re

import itertools

from tqdm import tqdm

In [2]:
df = pd.read_csv('/kaggle/input/english-to-hindi/hin.txt',sep = '\t',names = ['english_sentence','hindi_sentence'])
df.head()

Unnamed: 0,english_sentence,hindi_sentence
0,Wow!,वाह!
1,Help!,बचाओ!
2,Jump.,उछलो.
3,Jump.,कूदो.
4,Jump.,छलांग.


In [3]:
#torch.backends.cudnn.benchmark = True
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
eng_tokenizer = spacy.load("en_core_web_sm")
hin_tokenizer = AutoTokenizer.from_pretrained('raunaqjabbal/hindi-tokenizer')

tokenizer_config.json:   0%|          | 0.00/11.0M [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/607k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/12.8M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/1.97M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

In [5]:
class Vocabulary:
    
    def __init__(self,freq_threshold):
        self.freq_threshold = freq_threshold
        
        self.en_stoi = {'<pad>':0,'<sos>':1,'<eos>':2,'<unk>':3}
        self.en_itos = {0:'<pad>',1:'<sos>',2:'<eos>',3:'<unk>'}
        
        self.hi_stoi = {'<pad>':0,'<sos>':1,'<eos>':2,'<unk>':3}
        self.hi_itos = {0:'<pad>',1:'<sos>',2:'<eos>',3:'<unk>'}
        
    def __len__(self):
        return len(self.en_stoi)
    
    def tokenize_eng(self, en_text):
        return [token.text.lower() for token in eng_tokenizer.tokenizer(en_text)]
    
    def tokenize_hin(self,hi_text):
        return hin_tokenizer.tokenize(hi_text)
    
    def build_vocabulary(self, sentences, lang):
        frequencies = {}
        idx = 4
        
        if lang == 'english':
            for sentence in sentences:
                for word in self.tokenize_eng(sentence):
                    if word not in frequencies:
                        frequencies[word] = 1
                    else:
                        frequencies[word] += 1

                    if frequencies[word] == self.freq_threshold and len(word)>1 and word.isalpha():
                        self.en_stoi[word] = idx
                        self.en_itos[idx] = word
                        idx += 1
        else:
            for sentence in sentences:
                for word in self.tokenize_hin(sentence):
                    if word not in frequencies:
                        frequencies[word] = 1
                    else:
                        frequencies[word] += 1

                    if frequencies[word] == self.freq_threshold and len(word)>1:
                        self.hi_stoi[word] = idx
                        self.hi_itos[idx] = word
                        idx += 1
    
    def english_vector(self, text):
        tokenized_text = self.tokenize_eng(text)
        
        numerical_sentence = [self.en_stoi['<sos>']]
        numerical_sentence += [self.en_stoi[word] if word in self.en_stoi else self.en_stoi['<unk>'] for word in tokenized_text]
        numerical_sentence.append(self.en_stoi['<eos>'])
        
        return numerical_sentence
    
    def hindi_vector(self, text):
        tokenized_text = self.tokenize_hin(text)
        
        numerical_sentence = [self.hi_stoi['<sos>']]
        numerical_sentence += [self.hi_stoi[word] if word in self.hi_stoi else self.hi_stoi['<unk>'] for word in tokenized_text]
        numerical_sentence.append(self.hi_stoi['<eos>'])
        
        return numerical_sentence
            

In [6]:
class TextData(Dataset):
    
    def __init__(self, freq_threshold, max_length):
                
        self.df = pd.read_csv('/kaggle/input/english-to-hindi/hin.txt',sep = '\t',names = ['english_sentence','hindi_sentence'])
        self.df.dropna(inplace = True)
        
        self.vocab = Vocabulary(freq_threshold)
        self.vocab.build_vocabulary(self.df['english_sentence'],'english')
        self.vocab.build_vocabulary(self.df['hindi_sentence'],'hindi')
        
        print('English Vocabulary Size - ',len(self.vocab.en_stoi))
        print('Hindi Vocabulary Size - ',len(self.vocab.hi_stoi))
        
        self.vocab.en_stoi = dict(itertools.islice(self.vocab.en_stoi.items(), max_length))
        self.vocab.en_itos = dict(itertools.islice(self.vocab.en_itos.items(), max_length))
        
        self.vocab.hi_stoi = dict(itertools.islice(self.vocab.hi_stoi.items(), max_length))
        self.vocab.hi_itos = dict(itertools.islice(self.vocab.hi_itos.items(), max_length))
        
        self.pad_idx = self.vocab.en_stoi['<pad>']
    
        self.english_sentences = self.df['english_sentence'].to_list()
        self.hindi_sentences = self.df['hindi_sentence'].to_list()
    
    def __len__(self):
        return self.df.shape[0]

    def __getitem__(self, index):
        eng_tokenized = torch.tensor(self.vocab.english_vector(self.english_sentences[index]))
        hin_tokenized = torch.tensor(self.vocab.hindi_vector(self.hindi_sentences[index]))
        return eng_tokenized, hin_tokenized

class MyCollate:
    
    def __init__(self,pad_idx):
        self.pad_idx = pad_idx
    
    def __call__(self, batch):
        eng_sentences = [item[0] for item in batch]
        hin_sentences = [item[1] for item in batch]
        
        eng_sentences = pad_sequence(eng_sentences, batch_first = False, padding_value = self.pad_idx)
        hin_sentences = pad_sequence(hin_sentences, batch_first = False, padding_value = self.pad_idx)
        
        return eng_sentences, hin_sentences

In [7]:
class Encoder(nn.Module):
    
    def __init__(self, input_size, embed_size, hidden_size, num_layers):
        super(Encoder, self).__init__()
        
        self.embed = nn.Embedding(input_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x):
        # shape of x: (seq_len, N) 
        # where N is batch size
        # seq_len is the number of words being sent
        embeddings = self.embed(x) 
        
        # embeddings shape: (seq_len, N, embed_size)
        outputs, (hidden,cell) = self.lstm(embeddings)
        
        # outputs shape: (seq_len ,N, hidden_size)
        return hidden, cell

class Decoder(nn.Module):
    
    def __init__(self, input_size, embed_size, hidden_size, output_size,num_layers):
        super(Decoder, self).__init__()
        
        self.embed = nn.Embedding(input_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers)
        self.fc = nn.Linear(hidden_size, output_size)
        self.dropout = nn.Dropout(0.5)
    
    def forward(self, x, hidden, cell):
        # Here x is of the shape (N), and we want to convert it to (1,N)
        # 1 here is seq_len, since we are passing only one word
        x = x.unsqueeze(0)
        
        # x shape: (1,N)
        embeddings = self.embed(x)
        
        # embeddings shape: (1,N,embed_size)
        outputs, (hidden, cell) = self.lstm(embeddings, (hidden, cell))
        
        # outputs shape: (1,N, hidden_size)
        predictions = self.fc(outputs)
        
        # predictions,shape: (1,N,output_size)
        # loss function needs shape (N, output_size) so we're
        # just gonna remove the first dim
        predictions = predictions.squeeze(0)
        
        return predictions, hidden, cell

class Translator(nn.Module):
    
    def __init__(self, encoder, decoder):
        super(Translator, self).__init__()
        
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, source, target, teacher_force_ratio = 0.5):
        batch_size = source.shape[1] # shape: (seq_len, N)
        target_len = target.shape[0]
        target_vocab_size = len(dataset.vocab.en_stoi)
        
        outputs = torch.zeros(target_len, batch_size, target_vocab_size).to(device)
        
        hidden,cell = self.encoder(source)
        
        x = target[0] # Get <sos>
        
        for t in range(1,target_len):
            output, hidden, cell = self.decoder(x, hidden, cell)
            
            outputs[t] = output
            
            best_guess = torch.argmax(output)
            #print(best_guess)
            
            x = target[t]
        
        return outputs

In [33]:
dataset = TextData(10,260)

loader = DataLoader(
    dataset = dataset,
    batch_size = 32,
    shuffle = True,
    collate_fn = MyCollate(pad_idx = 0),
)

English Vocabulary Size -  265
Hindi Vocabulary Size -  368


In [37]:
num_epochs = 100
learning_rate = 0.001
batch_size = 32

input_size_encoder = len(dataset.vocab.en_stoi)
input_size_decoder = len(dataset.vocab.hi_stoi)

embed_size = 256

output_size = len(dataset.vocab.en_stoi)

hidden_size = 256
num_layers = 2

In [38]:
encoder = Encoder(input_size_encoder, embed_size, hidden_size, num_layers).to(device)
decoder = Decoder(input_size_decoder, embed_size, hidden_size, output_size, num_layers).to(device)
translator = Translator(encoder,decoder).to(device)

optimizer = optim.Adam(translator.parameters(), lr = learning_rate)
criterion = nn.CrossEntropyLoss(ignore_index = 0)

In [39]:
for epoch in range(num_epochs):
    
    #translator.train()
    
    for batch_idx, (eng_sentences, hin_sentences) in tqdm(enumerate(loader),total = len(loader), leave = False):
        eng_sent_list = eng_sentences.to(device)
        hin_sent_list = hin_sentences.to(device)
        
        output = translator(eng_sent_list, hin_sent_list)
        
        output = output[1:].reshape(-1, output.shape[2])
        hin_sent_list = hin_sent_list[1:].reshape(-1)
        
        loss = criterion(output,hin_sent_list)
        
        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(translator.parameters(), max_norm=1)
        optimizer.step()
    
    print(f'Epoch - {epoch+1}/{num_epochs}, Loss - {loss}')

                                               

Epoch - 1/100, Loss - 3.4572300910949707


                                               

Epoch - 2/100, Loss - 2.9711129665374756


                                               

Epoch - 3/100, Loss - 2.851594924926758


                                               

Epoch - 4/100, Loss - 2.620299816131592


                                               

Epoch - 5/100, Loss - 2.4703550338745117


                                               

Epoch - 6/100, Loss - 2.3842270374298096


                                               

Epoch - 7/100, Loss - 1.9746005535125732


                                               

Epoch - 8/100, Loss - 2.0658698081970215


                                               

Epoch - 9/100, Loss - 2.2504942417144775


                                               

Epoch - 10/100, Loss - 2.1644887924194336


                                               

Epoch - 11/100, Loss - 1.7776215076446533


                                               

Epoch - 12/100, Loss - 1.7767635583877563


                                               

Epoch - 13/100, Loss - 1.6396008729934692


                                               

Epoch - 14/100, Loss - 1.4502800703048706


                                               

Epoch - 15/100, Loss - 1.4784351587295532


                                               

Epoch - 16/100, Loss - 1.298967957496643


                                               

Epoch - 17/100, Loss - 1.088638424873352


                                               

Epoch - 18/100, Loss - 0.8755085468292236


                                               

Epoch - 19/100, Loss - 0.8410502672195435


                                               

Epoch - 20/100, Loss - 0.7431150078773499


                                               

Epoch - 21/100, Loss - 0.8212277293205261


                                               

Epoch - 22/100, Loss - 0.5538183450698853


                                               

Epoch - 23/100, Loss - 0.6262295246124268


                                               

Epoch - 24/100, Loss - 0.5811991095542908


                                               

Epoch - 25/100, Loss - 0.476529061794281


                                               

Epoch - 26/100, Loss - 0.38100725412368774


                                               

Epoch - 27/100, Loss - 0.3897639214992523


                                               

Epoch - 28/100, Loss - 0.22639711201190948


                                               

Epoch - 29/100, Loss - 0.3453122079372406


                                               

Epoch - 30/100, Loss - 0.1916462928056717


                                               

Epoch - 31/100, Loss - 0.27130287885665894


                                               

Epoch - 32/100, Loss - 0.16365934908390045


                                               

Epoch - 33/100, Loss - 0.16445380449295044


                                               

Epoch - 34/100, Loss - 0.09851576387882233


                                               

Epoch - 35/100, Loss - 0.22310347855091095


                                               

Epoch - 36/100, Loss - 0.07519522309303284


                                               

Epoch - 37/100, Loss - 0.2056819051504135


                                               

Epoch - 38/100, Loss - 0.07744888216257095


                                               

Epoch - 39/100, Loss - 0.085706427693367


                                               

Epoch - 40/100, Loss - 0.1260933130979538


                                               

Epoch - 41/100, Loss - 0.06894758343696594


                                               

Epoch - 42/100, Loss - 0.08086400479078293


                                               

Epoch - 43/100, Loss - 0.11735855787992477


                                               

Epoch - 44/100, Loss - 0.09868873655796051


                                               

Epoch - 45/100, Loss - 0.097244493663311


                                               

Epoch - 46/100, Loss - 0.061692483723163605


                                               

Epoch - 47/100, Loss - 0.10545367002487183


                                               

Epoch - 48/100, Loss - 0.039516422897577286


                                               

Epoch - 49/100, Loss - 0.043350785970687866


                                               

Epoch - 50/100, Loss - 0.0967550277709961


                                               

Epoch - 51/100, Loss - 0.10996715724468231


                                               

Epoch - 52/100, Loss - 0.10208574682474136


                                               

Epoch - 53/100, Loss - 0.36227813363075256


                                               

Epoch - 54/100, Loss - 0.117117740213871


                                               

Epoch - 55/100, Loss - 0.03944908455014229


                                               

Epoch - 56/100, Loss - 0.06149817258119583


                                               

Epoch - 57/100, Loss - 0.06731389462947845


                                               

Epoch - 58/100, Loss - 0.030277714133262634


                                               

Epoch - 59/100, Loss - 0.045321930199861526


                                               

Epoch - 60/100, Loss - 0.07336197793483734


                                               

Epoch - 61/100, Loss - 0.01572844572365284


                                               

Epoch - 62/100, Loss - 0.02019030787050724


                                               

Epoch - 63/100, Loss - 0.07800281047821045


                                               

Epoch - 64/100, Loss - 0.05966208875179291


                                               

Epoch - 65/100, Loss - 0.04261879622936249


                                               

Epoch - 66/100, Loss - 0.019346412271261215


                                               

Epoch - 67/100, Loss - 0.08556432276964188


                                               

Epoch - 68/100, Loss - 0.027949891984462738


                                               

Epoch - 69/100, Loss - 0.051201727241277695


                                               

Epoch - 70/100, Loss - 0.03815704956650734


                                               

Epoch - 71/100, Loss - 0.12321819365024567


                                               

Epoch - 72/100, Loss - 0.05466196686029434


                                               

Epoch - 73/100, Loss - 0.12369703501462936


                                               

Epoch - 74/100, Loss - 0.09495975077152252


                                               

Epoch - 75/100, Loss - 0.04688311740756035


                                               

Epoch - 76/100, Loss - 0.21363894641399384


                                               

Epoch - 77/100, Loss - 0.20163948833942413


                                               

Epoch - 78/100, Loss - 0.14350302517414093


                                               

Epoch - 79/100, Loss - 0.10396073013544083


                                               

Epoch - 80/100, Loss - 0.09787888824939728


                                               

Epoch - 81/100, Loss - 0.03448847681283951


                                               

Epoch - 82/100, Loss - 0.0573328360915184


                                               

Epoch - 83/100, Loss - 0.04282107949256897


                                               

Epoch - 84/100, Loss - 0.04037195444107056


                                               

Epoch - 85/100, Loss - 0.005147258285433054


                                               

Epoch - 86/100, Loss - 0.03913351893424988


                                               

Epoch - 87/100, Loss - 0.04192572087049484


                                               

Epoch - 88/100, Loss - 0.0031819725409150124


                                               

Epoch - 89/100, Loss - 0.004113370552659035


                                               

Epoch - 90/100, Loss - 0.08737750351428986


                                               

Epoch - 91/100, Loss - 0.05949858948588371


                                               

Epoch - 92/100, Loss - 0.06084074079990387


                                               

Epoch - 93/100, Loss - 0.039124105125665665


                                               

Epoch - 94/100, Loss - 0.039958395063877106


                                               

Epoch - 95/100, Loss - 0.050274789333343506


                                               

Epoch - 96/100, Loss - 0.031248953193426132


                                               

Epoch - 97/100, Loss - 0.04022153466939926


                                               

Epoch - 98/100, Loss - 0.04340507835149765


                                               

Epoch - 99/100, Loss - 0.02403227798640728


                                               

Epoch - 100/100, Loss - 0.05798852816224098




In [40]:
eng_sent = "What are you doing?"

sentence_tensor = torch.tensor(dataset.vocab.english_vector(eng_sent)).unsqueeze(1).to(device)

with torch.no_grad():
    hidden, cell = translator.encoder(sentence_tensor)


outputs = [dataset.vocab.hi_stoi['<sos>']]
for _ in range(100):
    previous_word = torch.tensor([outputs[-1]]).to(device)
    
    with torch.no_grad():
        output, hidden, cell = translator.decoder(previous_word, hidden, cell)
        best_guess = torch.argmax(output).item()
    
    outputs.append(best_guess)
    
    if output.argmax(1).item() == dataset.vocab.hi_stoi["<eos>"]:
            break

translated_sentence = [dataset.vocab.hi_itos[idx] for idx in outputs]

print(translated_sentence)

['<sos>', 'तुम', 'क्या', 'कर', 'रहे', '<unk>', '<eos>']
