In [None]:
import pandas as pd

# Load both halves of the parallel corpus; entry k of each list is line k of its file.
with open('/kaggle/input/parallel-corpus-for-english-urdu-language/Dataset/english-corpus.txt', 'r', encoding="utf8") as english_file:
    english_lines = list(english_file)

with open('/kaggle/input/parallel-corpus-for-english-urdu-language/Dataset/urdu-corpus.txt', 'r', encoding="utf8") as urdu_file:
    urdu_lines = list(urdu_file)

# Pair the lines by position into one table, trimming surrounding whitespace
# (including the trailing newline that each raw line still carries).
df = pd.DataFrame({
    'English': [line.strip() for line in english_lines],
    'Urdu': [line.strip() for line in urdu_lines],
})

# Show the resulting table
print(df)

In [None]:
import numpy as np
import pandas as pd
# import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from keras.preprocessing.text import Tokenizer
import tensorflow as tf
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Embedding ,GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional,LSTM, Dropout
from collections import Counter
from wordcloud import WordCloud
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from keras.losses import sparse_categorical_crossentropy

In [None]:
def summarize_words(sentences, language):
    """Print word statistics for ``sentences`` and return the word frequencies.

    Args:
        sentences: Iterable of strings; each one is split on whitespace.
        language: Name inserted into the printed labels (e.g. ``"English"``).

    Returns:
        A ``Counter`` mapping each distinct word to its number of occurrences.
    """
    word_counter = Counter(word for sentence in sentences for word in sentence.split())
    # The total is the sum of the per-word counts, so the corpus is only walked once.
    print(f"Total count of {language} words:", sum(word_counter.values()))
    print(f"Count of distinct {language} words:", len(word_counter))
    # Building the tuple directly also works when the corpus is empty, where
    # indexing into list(zip(*...)) would raise IndexError.
    print(f"10 most common {language} words:",
          tuple(word for word, _ in word_counter.most_common(10)))
    return word_counter


# Separating the English and Urdu datasets
eng = df['English']
urdu = df['Urdu']

eng_word_counter = summarize_words(eng, "English")

urdu_word_counter = summarize_words(urdu, "Urdu")


In [None]:
# Render a word cloud built from every English sentence joined into one string.
english_text = ' '.join(eng)
wc = WordCloud(width=600, height=300).generate(english_text)

plt.figure(figsize=(12, 8))
plt.imshow(wc)
plt.show()

In [None]:
def word_count(line):
    """Return the number of whitespace-separated tokens in ``line``."""
    tokens = line.split()
    return len(tokens)

In [None]:
# Attach the per-sentence word count of each language as a new column.
df['English_word_count'] = df['English'].map(word_count)
df['Urdu_word_count'] = df['Urdu'].map(word_count)

# Display the English counts as this cell's output.
df['English_word_count']

In [None]:
# Display the per-sentence Urdu word counts computed from the 'Urdu' column.
df['Urdu_word_count']

In [None]:
import seaborn as sns
# Compare the sentence-length distributions of the two languages side by side.
# `sns.distplot` is deprecated; a density-normalised `histplot` with a KDE
# overlay is its documented replacement.
fig, axes = plt.subplots(nrows=1, ncols=2)
for ax, column in zip(axes, ('English_word_count', 'Urdu_word_count')):
    sns.histplot(df[column], kde=True, stat='density', kde_kws=dict(cut=3), ax=ax)
sns.despine()
plt.show()

In [None]:
def tokenize(x):
    """Fit a fresh ``Tokenizer`` on ``x`` and encode ``x`` with it.

    Args:
        x: Collection of texts used both to fit the tokenizer and to be encoded.

    Returns:
        A ``(sequences, tokenizer)`` pair: the output of
        ``texts_to_sequences(x)`` and the fitted tokenizer that produced it.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(x)
    sequences = fitted.texts_to_sequences(x)
    return sequences, fitted

# Fit the Urdu tokenizer and encode every Urdu sentence with it.
tokenized_urdu_sentences, urdu_tokenizer = tokenize(df['Urdu'])

# Preview only the first few encoded sentences; rendering the whole tokenized
# corpus would flood the notebook output.
tokenized_urdu_sentences[:5]

In [None]:
# Fit the English tokenizer and encode every English sentence with it.
tokenized_english_sentences, english_tokenizer = tokenize(df['English'])

# NOTE: this is not the cell's final statement, so under Jupyter's default
# settings the bare expression below is not rendered as output.
tokenized_english_sentences

import pickle

# Persist the fitted English tokenizer to a file in the working directory.
with open('english_tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(english_tokenizer, tokenizer_file)

In [None]:
def pad(x, maxlen=10):
    """Pad or truncate every sequence in ``x`` to ``maxlen`` tokens.

    Args:
        x: Collection of integer token-id sequences.
        maxlen: Target length of every output row. Defaults to 10, the value
            that was previously hard-coded, so existing ``pad(x)`` calls are
            unaffected.

    Returns:
        The result of ``pad_sequences``. Padding is added at the end of short
        sequences (``padding='post'``); sequences longer than ``maxlen`` are
        cut according to ``pad_sequences``' default ``truncating`` setting.
    """
    return pad_sequences(x, maxlen=maxlen, padding='post')

In [None]:
# Bring every tokenized Urdu sentence to the fixed length applied by pad().
preproc_urdu_sentences = pad(tokenized_urdu_sentences)

# Show the first padded Urdu sequence as a sanity check.
preproc_urdu_sentences[0]

In [None]:
# Bring every tokenized English sentence to the fixed length applied by pad().
preproc_english_sentences = pad(tokenized_english_sentences)

# Show the first padded English sequence as a sanity check.
preproc_english_sentences[0]

In [None]:
# Sequence lengths are read from the second axis of the padded arrays;
# vocabulary sizes are the number of entries in each tokenizer's word index.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_urdu_sequence_length = preproc_urdu_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
urdu_vocab_size = len(urdu_tokenizer.word_index)

print("Max English sentence length:", max_english_sequence_length)
# The target language of this corpus is Urdu, so the label says Urdu (it previously said French).
print("Max Urdu sentence length:", max_urdu_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("urdu vocabulary size:", urdu_vocab_size)

In [None]:
def model(input_shape, output_sequence_length, urdu_vocab_size, english_vocab_size):
    """Build and compile the English -> Urdu sequence model.

    Args:
        input_shape: Shape of the padded English input array; only
            ``input_shape[1]`` (the sequence length) is read.
        output_sequence_length: Accepted as part of the signature but not used
            by the body.
        urdu_vocab_size: Number of units in the final softmax layer.
        english_vocab_size: First argument (input dimension) of the embedding layer.

    Returns:
        The compiled ``Sequential`` model.
    """
    step_size = 0.001

    # Embedding -> bidirectional GRU -> per-timestep dense head over the Urdu vocabulary.
    network = Sequential([
        Embedding(english_vocab_size, 256, input_length=input_shape[1]),
        Bidirectional(GRU(256, return_sequences=True)),
        TimeDistributed(Dense(1024, activation='leaky_relu')),
        Dropout(0.5),
        TimeDistributed(Dense(urdu_vocab_size, activation='softmax')),
    ])

    network.compile(
        loss=sparse_categorical_crossentropy,
        optimizer=Adam(step_size),
        metrics=['accuracy'],
    )
    return network

In [None]:
# Build the network from the dimensions of the padded data. Each vocabulary
# size is the tokenizer's word count plus one (presumably to cover index 0,
# which process_text renders as '<PAD>').
# NOTE: this assignment rebinds the name `model` from the builder function to
# the built network, so this cell cannot be re-run without first re-running
# the cell that defines `model`.
model = model(
    input_shape=preproc_english_sentences.shape,
    output_sequence_length=preproc_urdu_sentences.shape[1],
    urdu_vocab_size=len(urdu_tokenizer.word_index) + 1,
    english_vocab_size=len(english_tokenizer.word_index) + 1,
)

model.summary()

# Train on English -> Urdu pairs, holding out 30% of the samples for validation.
history = model.fit(
    preproc_english_sentences,
    preproc_urdu_sentences,
    batch_size=64,
    epochs=10,
    validation_split=0.3,
)

In [None]:
# Training vs. validation loss per epoch. The second curve comes from the
# `validation_split` hold-out passed to `model.fit`, so it is labelled as
# validation loss rather than test loss.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(history.history['loss'], 'r', label='train loss')
ax.plot(history.history['val_loss'], 'b', label='validation loss')
ax.set_xlabel('No. of Epochs')
ax.set_ylabel('Loss')
ax.set_title('Loss Graph')
ax.legend();


In [None]:
# Training vs. validation accuracy per epoch. The second curve comes from the
# `validation_split` hold-out passed to `model.fit`, so it is labelled as
# validation accuracy rather than test accuracy.
fig, ax = plt.subplots(figsize=(12, 8))
ax.plot(history.history['accuracy'], 'r', label='train accuracy')
ax.plot(history.history['val_accuracy'], 'b', label='validation accuracy')
ax.set_xlabel('No. of Epochs')
ax.set_ylabel('Accuracy')
ax.set_title('Accuracy Graph')
ax.legend();

In [None]:
# Write the trained network to an .h5 file in the working directory; the
# inference cell reloads it from this path.
model.save('english_to_urdu_translator_final.h5')

In [None]:
def process_text(logits, tokenizer):
    """Decode per-position scores into a space-separated string of words.

    Args:
        logits: 2-D array of scores; the arg-max along axis 1 selects one
            vocabulary index per row.
        tokenizer: Object exposing a ``word_index`` mapping of word -> index.

    Returns:
        The words for the selected indices joined by single spaces. Index 0
        is rendered as ``'<PAD>'``.
    """
    # Invert the word -> index table, then map index 0 to the padding marker.
    vocabulary = dict(zip(tokenizer.word_index.values(), tokenizer.word_index.keys()))
    vocabulary[0] = '<PAD>'

    best_indices = np.argmax(logits, axis=1)
    words = []
    for token_index in best_indices:
        words.append(vocabulary[token_index])
    return ' '.join(words)

In [None]:
from tensorflow.keras.models import load_model
# Reload the network from the file written by model.save.
translator_model = load_model('english_to_urdu_translator_final.h5')
# NOTE: bare expression in the middle of the cell; under Jupyter's default
# settings only a cell's final expression is rendered.
translator_model

# Row of the corpus used for this single-sentence check.
i = 1
encoded_sentence = preproc_english_sentences[i]

print(encoded_sentence)
print("Prediction:")
# The single sequence is reshaped to a batch of one row before predicting.
prediction = translator_model.predict(encoded_sentence.reshape(1, -1))
predicted_text = process_text(prediction[0], urdu_tokenizer)
print(predicted_text)

print("\nCorrect Translation:")
print(urdu[i])

print("\nOriginal text:")
print(eng[i])