In [1]:
import pandas as pd
from transformers import BertTokenizer, BertModel
import torch
import re
import string
from string import digits

In [2]:
df=pd.read_csv("../data/Hindi_English_Truncated_Corpus.csv",encoding='utf-8')

In [3]:
df = df.sample(n=25000, random_state=42)
df = df.reset_index(drop=True)

In [4]:
df.drop_duplicates(inplace=True)
df=df[~pd.isnull(df['english_sentence'])]


In [5]:
# Lowercase all characters
df['english_sentence']=df['english_sentence'].apply(lambda x: x.lower())
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: x.lower())

# Remove quotes
df['english_sentence']=df['english_sentence'].apply(lambda x: re.sub("'", '', x))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: re.sub("'", '', x))

exclude = set(string.punctuation) # Set of all special characters
# Remove all the special characters
df['english_sentence']=df['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in exclude))


# Remove all numbers from text
remove_digits = str.maketrans('', '', digits)
df['english_sentence']=df['english_sentence'].apply(lambda x: x.translate(remove_digits))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: x.translate(remove_digits))

df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: re.sub("[२३०८१५७९४६]", "", x))

# Remove extra spaces
df['english_sentence']=df['english_sentence'].apply(lambda x: x.strip())
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: x.strip())
df['english_sentence']=df['english_sentence'].apply(lambda x: re.sub(" +", " ", x))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: re.sub(" +", " ", x))



In [6]:
def clean_hindi_sentence(sentence):
    # Regular expression for Hindi numbers
    hindi_numbers_pattern = '[०१२३४५६७८९]'
    
    # Regular expression for English words
    english_words_pattern = r'\b[a-zA-Z]+\b'
    
    # Remove Hindi numbers
    sentence = re.sub(hindi_numbers_pattern, '', sentence)
    
    # Remove English words
    sentence = re.sub(english_words_pattern, '', sentence)
    
    return sentence

# Apply the function to the 'hindi_sentence' column
df['hindi_sentence'] = df['hindi_sentence'].apply(clean_hindi_sentence)

In [7]:
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,tides,he declares the result and reports it to the e...,वही परिणाम की घोषणा करता है और निर्वाचन आयोग क...
1,ted,was a little uncomfortable for them,थोडा कठिन था।
2,indic2012,but mulla assamudin was proved to be not eligible,मगर मुल्ला असमुद्दीन अक्षम सिद्ध हुए।
3,ted,i would never have to make a book and then pre...,मुझे कभी भी किताब बना कर किसी प्रदर्शनस्थल को ...
4,indic2012,headind kaun banega crorepati,शीर्षक कौन बनेगा करोड़पति


In [8]:
df.shape

(24698, 3)

# Keeping the data with only sentences less than equal to  max length

In [9]:
df['length_eng_sentence']=df['english_sentence'].apply(lambda x:len(x.split(" ")))
df['length_hin_sentence']=df['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [10]:
df=df[df['length_eng_sentence']<=20]
df=df[df['length_hin_sentence']<=20]

In [11]:
print("maximum length of Hindi Sentence ",max(df['length_hin_sentence']))
print("maximum length of English Sentence ",max(df['length_eng_sentence']))

maximum length of Hindi Sentence  20
maximum length of English Sentence  20


In [12]:
df.shape

(17234, 5)

In [13]:
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
0,tides,he declares the result and reports it to the e...,वही परिणाम की घोषणा करता है और निर्वाचन आयोग क...,19,20
1,ted,was a little uncomfortable for them,थोडा कठिन था।,6,3
2,indic2012,but mulla assamudin was proved to be not eligible,मगर मुल्ला असमुद्दीन अक्षम सिद्ध हुए।,9,6
3,ted,i would never have to make a book and then pre...,मुझे कभी भी किताब बना कर किसी प्रदर्शनस्थल को ...,15,14
4,indic2012,headind kaun banega crorepati,शीर्षक कौन बनेगा करोड़पति,4,7


In [14]:
# Drop multiple columns
df = df.drop(['length_hin_sentence', 'length_eng_sentence'], axis=1)




In [15]:
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,tides,he declares the result and reports it to the e...,वही परिणाम की घोषणा करता है और निर्वाचन आयोग क...
1,ted,was a little uncomfortable for them,थोडा कठिन था।
2,indic2012,but mulla assamudin was proved to be not eligible,मगर मुल्ला असमुद्दीन अक्षम सिद्ध हुए।
3,ted,i would never have to make a book and then pre...,मुझे कभी भी किताब बना कर किसी प्रदर्शनस्थल को ...
4,indic2012,headind kaun banega crorepati,शीर्षक कौन बनेगा करोड़पति


# Using BERT

In [16]:
# Initialize tokenizers for English and Hindi
english_tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
hindi_tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')

# Tokenize the sentences
df['english_tokens'] = df['english_sentence'].apply(lambda x: english_tokenizer.encode(x, add_special_tokens=True))
df['hindi_tokens'] = df['hindi_sentence'].apply(lambda x: hindi_tokenizer.encode(x, add_special_tokens=True))

# Tokenize the sentences
def tokenize_and_pad(sentences, tokenizer, max_len=50):
    tokens = [tokenizer.encode(sentence, add_special_tokens=True) for sentence in sentences]
    padded_tokens = pad_sequences(tokens, maxlen=max_len, padding='post')
    attention_masks = [[int(token_id > 0) for token_id in token_seq] for token_seq in padded_tokens]
    return torch.tensor(padded_tokens), torch.tensor(attention_masks)

MAX_LEN = 50

english_tokens, english_attention_masks = tokenize_and_pad(df['english_sentence'].tolist(), english_tokenizer, MAX_LEN)
hindi_tokens, hindi_attention_masks = tokenize_and_pad(df['hindi_sentence'].tolist(), hindi_tokenizer, MAX_LEN)

NameError: name 'pad_sequences' is not defined

In [None]:
df.head()

In [None]:
def get_embeddings(tokens, attention_masks, model):
    with torch.no_grad():
        outputs = model(tokens, attention_mask=attention_masks)
    return outputs.last_hidden_state

# Load pre-trained BERT models
english_model = BertModel.from_pretrained('bert-base-uncased')
hindi_model = BertModel.from_pretrained('bert-base-multilingual-cased')

# Generate embeddings for each sentence
english_embeddings = get_embeddings(english_tokens, english_attention_masks, english_model)
hindi_embeddings = get_embeddings(hindi_tokens, hindi_attention_masks, hindi_model)


In [None]:
import torch.nn as nn

class Seq2SeqModel(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim):
        super(Seq2SeqModel, self).__init__()
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(d_model=input_dim, nhead=8), num_layers=3)
        self.decoder = nn.TransformerDecoder(
            nn.TransformerDecoderLayer(d_model=output_dim, nhead=8), num_layers=3)
        self.fc = nn.Linear(output_dim, hidden_dim)
        self.output_fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, tgt):
        memory = self.encoder(src)
        output = self.decoder(tgt, memory)
        output = self.fc(output)
        output = self.output_fc(output)
        return output

# Initialize model
input_dim = 768  # BERT embeddings dimension
output_dim = 768
hidden_dim = 512
model = Seq2SeqModel(input_dim, output_dim, hidden_dim)
