# "🌍✨ Let's break language barriers with AI-powered translation! 🔥🗣️"

In [1]:
import numpy as np
import pandas as pd
import os
import string
from string import digits
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
import re

In [70]:
path_data = '../Data/Hindi_English_Corpus.csv'

In [98]:
data = pd.read_csv(path_data)
data.head(5)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [99]:
ted_source = data.source == 'ted'
data = data[ted_source]

In [100]:
data.loc[93409]

source                                                            ted
english_sentence    But it turns out that tryptophan also happens ...
hindi_sentence      लेकिन यह पता चला है कि tryptophan भी पाया जाता है
Name: 93409, dtype: object

In [101]:
print(f'Shape of data before deleting null values: {data.shape}')
data.isnull().sum()

Shape of data before deleting null values: (39881, 3)


source              0
english_sentence    0
hindi_sentence      0
dtype: int64

In [102]:
data.drop_duplicates(inplace = True)
print(f'Shape of data after deleting duplicate values: {data.shape}')

Shape of data after deleting duplicate values: (38803, 3)


In [103]:
data_lines=data.sample(n=28000,random_state=42)
data_lines.shape

(28000, 3)

##### Convert text to lowercase, remove single quotes, and remove all special characters.

In [104]:
def preprocess_text(text):
    """
    Convert text to lowercase, remove single quotes, and remove all special characters.
    
    Args:
    text (str or any): The input value.
    
    Returns:
    str: The processed string, or an empty string if the input is not valid.
    """
    if isinstance(text, str):  
        text = text.lower()                # Convert to lowercase
        text = re.sub("'", '', text)       # Remove single quotes
        text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove special characters
        text = re.sub(r'\d+', '', text)  # Remove standard digits
        text = re.sub(r'[०-९]', '', text)  # Remove Hindi digits (Unicode range)
        text = re.sub(' +'," ",text).strip()
    return text if isinstance(text, str) else ""

In [105]:
data_lines['english_sentence'] = data_lines['english_sentence'].astype(str).apply(preprocess_text)
data_lines['hindi_sentence'] = data_lines['hindi_sentence'].astype(str).apply(preprocess_text)
data_lines.sample(5)

Unnamed: 0,source,english_sentence,hindi_sentence
120813,ted,theres so much we can absorb,यहां हमारे सीखने के लिये कितना कुछ है
36318,ted,aww thank you,ओह धन्यवाद
19798,ted,and in the information era we all live in today,और इस सूचना के युग में जिसमे हम रहते है
35140,ted,and even fantasies,और कल्पनाओं में भी बदलते हैं
105803,ted,that there was something happening here,यहाँ कुछ अलग हो रहा है


In [106]:
data_lines['hindi_sentence'] = data_lines['hindi_sentence'].apply(lambda x : "START_" + x + "_END")
data_lines.sample(5)

Unnamed: 0,source,english_sentence,hindi_sentence
38001,ted,but ill stay until you show up ponytail or not,START_और मई उसी कोने में रुकुंगा जब तक तुम आ न...
60296,ted,my dreams dont look exactly like i thought the...,START_मेरे सपने वैसे नहीं हैं जैसे तब थे जब मै...
62270,ted,so on august th,START_अतः अगस्त को_END
21905,ted,for the first time,START_पहली बार_END
66167,ted,now a lot of people have given up on government,START_अब बहुत लोग ऐसे है जिन्होनें सरकार को त्...


In [107]:
def get_all_words(language):
    all_words = set()
    for sentence in data_lines[language]:
        for word in sentence.split():
            all_words.add(word)
    return all_words

In [108]:
all_eng_words   = get_all_words('english_sentence')
all_hindi_words = get_all_words('hindi_sentence')

print(all_eng_words)



In [109]:
data_lines['length_eng_sentence']=data_lines['english_sentence'].apply(lambda x:len(x.split(" ")))
data_lines['length_hin_sentence']=data_lines['hindi_sentence'].apply(lambda x:len(x.split(" ")))

In [110]:
data_lines.sample(5)

Unnamed: 0,source,english_sentence,hindi_sentence,length_eng_sentence,length_hin_sentence
115563,ted,thats it,START_बस।_END,2,1
82647,ted,a design issue on a voting form,START_एक डिजा़इन तथ्य जिस पर चुनाव हुआ था_END,7,8
42122,ted,think about all the services we have online,START_सभी ऑनलाइन सेवाओंके बारे में सोचो_END,8,6
31727,ted,and that was from one human to another,START_और वो थी एक इन्सान से दूसरे की_END,8,8
72022,ted,i think we all always have the choice in the end,START_मुझे लगता है कि हमारे पास आखिर तक ये चुन...,11,12


#### Input And target values

In [112]:
data_lines=data_lines[data_lines['length_eng_sentence']<=20]
data_lines=data_lines[data_lines['length_hin_sentence']<=20]

In [113]:
max_length_src=max(data_lines['length_hin_sentence'])
max_length_tar=max(data_lines['length_eng_sentence'])

In [114]:
input_words = sorted(list(all_eng_words))
target_words = sorted(list(all_hindi_words))
num_encoder_tokens = len(all_eng_words)
num_decoder_tokens = len(all_hindi_words)
num_encoder_tokens, num_decoder_tokens

(14800, 23587)

In [None]:
num_decoder_tokens += 1 #for zero padding
input_token_index = dict([(word, i+1) for i, word in enumerate(input_words)])
target_token_index = dict([(word, i+1) for i, word in enumerate(target_words)])
reverse_input_char_index = dict((i, word) for word, i in input_token_index.items())
reverse_target_char_index = dict((i, word) for word, i in target_token_index.items())
data_lines = shuffle(data_lines)