In [53]:
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense, LSTM, Embedding, Input
from keras.models import load_model, save_model, Model
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.corpus import stopwords
import string
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from wordcloud import STOPWORDS, WordCloud
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to C:\Users\aashutosh
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to C:\Users\aashutosh
[nltk_data]     kumar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [32]:
dataset = pd.read_csv(r"C:\Users\aashutosh kumar\Music\Hindi_English_Truncated_Corpus.csv")
dataset.head(5)

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [33]:
dataset.drop(columns = ["source"], axis = 1, inplace = True)

In [34]:
print(dataset.isnull().sum())
dataset.dropna(inplace = True)

english_sentence    2
hindi_sentence      0
dtype: int64


In [35]:
print(dataset.duplicated().sum())
dataset.drop_duplicates(inplace = True)
dataset.shape

2780


(124825, 2)

In [36]:
def preprocess(text):
    text = text.lower()
    text = text.translate(str.maketrans("", "", string.punctuation))
    stopwords_set = set(stopwords.words("english"))
    lematizer = WordNetLemmatizer()

    text = " ".join(lematizer.lemmatize(word) for word in text.split() if word not in stopwords_set)
    return text

In [37]:
dataset["english_sentence"] = dataset["english_sentence"].apply(preprocess)
dataset.head(4)

Unnamed: 0,english_sentence,hindi_sentence
0,politician permission need done,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,id like tell one child,मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,percentage even greater percentage india,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,really mean theyre bad paying attention,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते


In [38]:
def add_tokens(text):
    START_TOKEN = "<start>"
    END_TOKEN = "<end>"

    return START_TOKEN + text + END_TOKEN


In [39]:
dataset["hindi_sentence"] = dataset["hindi_sentence"].apply(add_tokens)
dataset.head(4)

Unnamed: 0,english_sentence,hindi_sentence
0,politician permission need done,<start>राजनीतिज्ञों के पास जो कार्य करना चाहिए...
1,id like tell one child,<start>मई आपको ऐसे ही एक बच्चे के बारे में बता...
2,percentage even greater percentage india,<start>यह प्रतिशत भारत में हिन्दुओं प्रतिशत से...
3,really mean theyre bad paying attention,<start>हम ये नहीं कहना चाहते कि वो ध्यान नहीं ...


In [40]:
english_tokenizer = Tokenizer()
english_tokenizer.fit_on_texts(dataset["english_sentence"])
english_sequences = english_tokenizer.texts_to_sequences(dataset["english_sentence"])

In [42]:
hindi_tokenizer = Tokenizer()
hindi_tokenizer.fit_on_texts(dataset["hindi_sentence"])
hindi_sequences = hindi_tokenizer.texts_to_sequences(dataset["hindi_sentence"])

In [47]:
max_len_eng = max(len(seq) for seq in english_sequences)
max_len_hindi = max(len(seq) for seq in hindi_sequences)
max_len_eng, max_len_hindi

(219, 419)

In [49]:
english_padded = pad_sequences(english_sequences, maxlen= max_len_eng, padding = "post")
hindi_padded = pad_sequences(hindi_sequences, maxlen= max_len_hindi, padding = "post")