In [None]:
Create an RNN-based machine translation system in Python.

In [None]:
import os

import numpy as np
import tensorflow as tf

# Step 1: Load and preprocess data
# The dataset is a tab-separated text file: the first field of each line is
# used as the English sentence and the second as the Hindi sentence; any
# further fields are ignored.
# The location can be overridden through the HIN_DATASET_PATH environment
# variable so the notebook is not tied to a single machine; the original
# path remains the default.
file_path = os.environ.get(
    "HIN_DATASET_PATH",
    r"C:\Users\kulla\Downloads\DatasetNLP\hin.txt",
)

source_texts = []
target_texts = []

with open(file_path, 'r', encoding='utf-8') as file:
    for line in file:
        parts = line.strip().split('\t')
        # Lines with fewer than two tab-separated fields are skipped.
        if len(parts) >= 2:
            source_texts.append(parts[0].lower())  # English text
            target_texts.append(parts[1].lower())  # Hindi text

# Fail early with a clear message rather than with an opaque
# "max() of empty sequence" error later when no usable pairs were read.
if not source_texts:
    raise ValueError(f"No tab-separated sentence pairs found in {file_path!r}")

# Step 2: Word-level tokenization with the Keras Tokenizer
# Each language gets its own tokenizer, fitted only on that language's corpus.
# The vocabulary size is the number of known words plus one extra slot,
# because the Tokenizer never assigns index 0 to a word.

# --- source (English) ---
source_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=False)
source_tokenizer.fit_on_texts(source_texts)
source_vocab_size = len(source_tokenizer.word_index) + 1
source_sequences = source_tokenizer.texts_to_sequences(source_texts)

# --- target (Hindi) ---
target_tokenizer = tf.keras.preprocessing.text.Tokenizer(char_level=False)
target_tokenizer.fit_on_texts(target_texts)
target_vocab_size = len(target_tokenizer.word_index) + 1
target_sequences = target_tokenizer.texts_to_sequences(target_texts)

# Step 3: Pad both languages to one shared length
# The shared length is the longest sequence found in either language.
longest_source = max(map(len, source_sequences))
longest_target = max(map(len, target_sequences))
max_sequence_length = max(longest_source, longest_target)

# padding='post' appends the fill value after the real tokens.
pad_sequences = tf.keras.preprocessing.sequence.pad_sequences
source_sequences = pad_sequences(
    source_sequences, maxlen=max_sequence_length, padding='post'
)
target_sequences = pad_sequences(
    target_sequences, maxlen=max_sequence_length, padding='post'
)

# Step 4: Build the model with updated input/output dimensions
# Embedding -> SimpleRNN emitting one state per timestep -> per-timestep
# softmax over the target vocabulary.
model = tf.keras.models.Sequential([
    tf.keras.layers.Embedding(source_vocab_size, 64, input_length=max_sequence_length),
    tf.keras.layers.SimpleRNN(128, return_sequences=True),
    tf.keras.layers.Dense(target_vocab_size, activation='softmax')
])

# Step 5: Compile with a sparse loss so integer targets can be used directly
# 'sparse_categorical_crossentropy' computes the same cross-entropy as
# 'categorical_crossentropy' on one-hot targets, but takes the integer token
# ids as labels. This avoids materialising a dense
# (num_sentences, max_sequence_length, target_vocab_size) one-hot tensor,
# which is a large and unnecessary memory cost.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the post-padded positions of target_sequences are ordinary
# labels here, so they contribute to both the loss and the reported accuracy;
# consider per-timestep sample weights if padding dominates the data.

# Step 6: Train the model
model.fit(source_sequences, target_sequences, epochs=100)