# Loading Data and Libraries


In [16]:
from google.colab import drive
drive._mount('/content/drive')

Mounted at /content/drive


In [None]:
! pip install transformers



In [18]:
!unzip -q /content/drive/MyDrive/parallel.zip

In [2]:
# Importing libraries
import os
import pandas as pd
import numpy as np
import string
import re
import torch
import torch.nn as nn



import warnings
warnings.filterwarnings("ignore")

# Preprocessing Text

In [76]:
# Read data 
df_english = pd.read_csv("/content/parallel/English.txt", sep = 'you_delimiter')
df_english.columns = ["english_sent"]
df_english["english_sent"] = df_english["english_sent"].astype(str)

df_hindi = pd.read_csv("/content/parallel/Hindi.txt", sep = 'you_delimiter')
df_hindi.columns = ["hindi_sent"]
df_hindi["hindi_sent"] = df_hindi["hindi_sent"].astype(str)

In [77]:
# Create main DataFrame
df = pd.concat([df_english, df_hindi], axis=1)
df.dropna(axis = 0,inplace = True)

In [78]:
def preprocess_text(df):
    # Lowercase the characters
    df["english_sent"] = df["english_sent"].apply(lambda x : x.lower())
    df["hindi_sent"] = df["hindi_sent"].apply(lambda x : x.lower())

    # Remove extra spaces
    df['english_sent'] = df['english_sent'].apply(lambda x : x.strip())
    df['hindi_sent'] = df['hindi_sent'].apply(lambda x : x.strip())

    # Rmoving URLs
    df["english_sent"] = df["english_sent"].apply(lambda x : re.sub(r'http\S+', '', x))
    df["hindi_sent"] = df["hindi_sent"].apply(lambda x : re.sub(r'http\S+', '', x))

    # Removing digits
    remove_digits = str.maketrans("", "",string.digits)
    df["english_sent"] = df["english_sent"].apply(lambda x : x.translate(remove_digits))
    df["hindi_sent"] = df["hindi_sent"].apply(lambda x : x.translate(remove_digits))
    df["hindi_sent"] = df["hindi_sent"].apply(lambda x : re.sub("[a-zA-z२३०८१५७९४६]", "", x))

    # Remove quotes
    df['english_sent'] = df['english_sent'].apply(lambda x: re.sub("'", '', x))
    df['hindi_sent'] = df['hindi_sent'].apply(lambda x: re.sub("'", '', x))

    # Remove special characters
    special = set(string.punctuation)
    df['english_sent'] = df['english_sent'].apply(lambda x : ''.join(ch for ch in x if ch not in special))
    df['hindi_sent'] = df['hindi_sent'].apply(lambda x : ''.join(ch for ch in x if ch not in special))

    # Add <start> and <end> tags
    df["hindi_sent"] = df["hindi_sent"].apply(lambda x : '<start>' + x + '<end>')

In [79]:
# Preprocess text
preprocess_text(df)

# Drop rows with Null values
df.drop(df[df["english_sent"] == " "].index, inplace = True)
df.drop(df[df["hindi_sent"] == "<start><end>"].index, inplace = True)

In [80]:
# Vocabulary for Hindi and English words
hindi_words = set()
for sent in df["hindi_sent"]:
    for word in sent.split():
        if word not in hindi_words:
            hindi_words.add(word)

english_words = set()
for sent in df["english_sent"]:
    for word in sent.split():
        if word not in english_words:
            english_words.add(word)

In [81]:
# Find Sentence Length
df["eng_sent_length"] = df["english_sent"].apply(lambda x : len(x.split(' ')))
df["hindi_sent_length"] = df_hindi["hindi_sent"].apply(lambda x : len(x.split(' ')))

In [82]:
# Get sentences with specific length
# 30 for English & 32 for Hindi
df = df[df["eng_sent_length"] <= 30]
df = df[df["hindi_sent_length"] <= 32]

# Take 1 Million records for training
df = df.sample(n = 1000000, random_state = 1)
df = df.reset_index(drop = True)

In [87]:
df.sample(5)

Unnamed: 0,english_sent,hindi_sent,eng_sent_length,hindi_sent_length
710316,the chinese ambassador and its indian counterp...,<start>उसने गाड़ी अधिकतम प्राप्य कीमत पर बेची।...,9,7
609385,occupation money,<start>प्रिंट सर्वर<end>,2,2
202646,search of post office through pin code,<start>सड़क निर्माण में जहाँ कहीं भी आवश्यक हो...,7,14
249146,a person in any institution entrusted with the...,<start>मज़े के लिए किसी जानवर को सताना।<end>,15,7
726979,speaking on the occasion the president appreci...,<start>इसकी कार्यप्रणाली के निष्पादन हेतु स्था...,23,7


In [107]:
# Create padding matrix for english sentences
eng_pad_idx = np.zeros(shape=(1000000, 30))
for (i, sent) in enumerate(df["english_sent"]):
    x = df["eng_sent_length"].iloc[[i]]
    if(int(x) != 30):
        eng_pad_idx[i][int(x):] = 1

# Defining Model

In [95]:
class Transformer(nn.Module):
    def __init__(self, 
                 embedding_size, 
                 src_vocab_size, 
                 trg_vocab_size,
                 src_pad_idx,
                 num_heads,
                 num_encoder_layers,
                 num_decoder_layers,
                 forward_expansion,
                 dropout,
                 max_len,
                 device 
    ):
        super(Transformer, self).__init__()
        self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
        self.src_position_embedding = nn.Embedding(max_len, embedding_size)
        self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
        self.trg_position_embedding = nn.Embedding(max_len, embedding_size)
        self.device = device
        self.transformer = nn.Transformer(
            embedding_size,
            num_heads,
            num_encoder_layers,
            num_decoder_layers,
            forward_expansion,
            dropout
        )
        self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.src_pad_idx = self.src_pad_idx
    
    def make_src_mask(self, src_pad_idx):
        src_pad_idx = np.where(src_pad_idx == 0, False, True)
        return src_pad_idx

    def forward(self, src, trg):
        src_seq_length, N = src.shape
        trg_seq_length, N = trg.shape

        src_positions = (
            torch.arange(0, src_seq_length).unsequeeze(1).expand(src_seq_length, N).to(self.device)
        )
        trg_positions =(
            torch.arange(0, trg_seq_length).unsequeeze(1).expand(trg_seq_length, N).to(self.device)
        )

        embed_src = self.dropout(
            (self.src_word_embedding(src) + self.src_position_embedding(src_postitions))
        )
        embed_trg = self.dropout(
            (self.trg_word_embdedding(trg) + self.trg_postition_embedding(trg_positions))
        )
         

        src_padding_mask = self.make_src_mask(src_pad_idx)
        trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(self.device)

        out = self.transformer(
            embed_src,
            embed_trg,
            src_key_padding_mask = src_padding_mask,
            trg_mask = trg_mask 
        )    
        return out 

# Training Model

In [90]:
# Setup training phase
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
load_model = False
save_model = True

# Tranining Hyperparameters
num_epochs = 5
learning_rate = 3e-4
batch_size = 32

# Model Hyperparameters
src_vocab_size = len(english_words)
trg_vocab_size = len(hindi_words)
embedding_size = 512
num_heads = 8
num_encoder_layers = 6
num_decoder_layers = 6
dropout = 0.10
max_length = 30
forward_expansion = 4
src_pad_idx = eng_pad_idx

In [96]:
model = Transformer(
    embedding_size,
    src_vocab_size,
    trg_vocab_size,
    src_pad_idx,
    num_heads,
    num_encoder_layers,
    num_decoder_layers,
    forward_expansion,
    dropout,
    max_length,
    device
).to(device)

In [None]:
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

loss_func = nn.CrossEntropyLoss(ignore_index = src_pad_idx)