# "🌍✨ Let's break language barriers with AI-powered translation! 🔥🗣️"

In [1]:
import numpy as np
import pandas as pd
import os
import string
from string import digits
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from keras.layers import Input, LSTM, Embedding, Dense
from keras.models import Model
import re

In [70]:
path_data = '../Data/Hindi_English_Corpus.csv'

In [71]:
data_lines = pd.read_csv(path_data)
data_lines.head(5)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [72]:
ted_source = data_lines.source == 'ted'
data_lines = data_lines[ted_source]

In [73]:
data_lines.loc[93409]

source                                                            ted
english_sentence    But it turns out that tryptophan also happens ...
hindi_sentence      लेकिन यह पता चला है कि tryptophan भी पाया जाता है
Name: 93409, dtype: object

In [74]:
print(f'Shape of data before deleting null values: {data_lines.shape}')
data_lines.isnull().sum()

Shape of data before deleting null values: (39881, 3)


source              0
english_sentence    0
hindi_sentence      0
dtype: int64

In [75]:
data_lines.drop_duplicates(inplace = True)
print(f'Shape of data after deleting duplicate values: {data_lines.shape}')

Shape of data after deleting duplicate values: (38803, 3)


##### Convert text to lowercase, remove single quotes, and remove all special characters.

In [76]:
def preprocess_text(text):
    """
    Convert text to lowercase, remove single quotes, and remove all special characters.
    
    Args:
    text (str or any): The input value.
    
    Returns:
    str: The processed string, or an empty string if the input is not valid.
    """
    if isinstance(text, str):  
        text = text.lower()                # Convert to lowercase
        text = re.sub("'", '', text)       # Remove single quotes
        text = re.sub(f"[{re.escape(string.punctuation)}]", "", text)  # Remove special characters
        text = re.sub(r'\d+', '', text)  # Remove standard digits
        text = re.sub(r'[०-९]', '', text)  # Remove Hindi digits (Unicode range)
        text = re.sub(' +'," ",text).strip()
    return text if isinstance(text, str) else ""

In [77]:
data_lines['english_sentence'] = data_lines['english_sentence'].astype(str).apply(preprocess_text)
data_lines['hindi_sentence'] = data_lines['hindi_sentence'].astype(str).apply(preprocess_text)
data_lines.sample(5)

Unnamed: 0,source,english_sentence,hindi_sentence
84728,ted,the first time in any sort of public setting,किसी भी सार्वजनिक मंच पर पहली बार
86103,ted,of not using sanitary pad,समाज में सैनेटरी पैड ना इस्तेमाल कर पाने की सम...
126623,ted,now lets turn to two countries,अब फ़िर से एक नज़र दो देशों पर
100326,ted,what this dark energy that the universe is mad...,श्याम पदार्थ क्या है ब्रह्माण्ड जिससे बना उस स...
93616,ted,youre an percent bicyclist”,आप सायकिल चलाना सीख गये हैं।”


In [79]:
data_lines['hindi_sentence'] = data_lines['hindi_sentence'].apply(lambda x : "START_" + x + "_END")
data_lines.sample(5)

Unnamed: 0,source,english_sentence,hindi_sentence
8874,ted,youve heard that saying,START_आपने कई बार ये सुना होगा_END
35758,ted,that is actually the device,START_यह वास्तव में मोबाईल हैं_END
110127,ted,this is the wrong way of looking at it,START_यह इसे देखने का गलत तरीका है_END
113189,ted,the fact is no government in the world,START_तथ्य यह है कि दुनिया की कोई भी सरकार_END
103909,ted,up the stairs down the hall past the real fire...,START_ऊपर की तरफ बरामदे में असली अग्निशमन कर्म...


In [78]:
data_lines.dtypes

source              object
english_sentence    object
hindi_sentence      object
dtype: object