In [1]:
'''Simple Neural Machine Translation (NMT)
• Build a basic sequence-to-sequence model using Python libraries like TensorFlow or
Keras.
• Use small parallel datasets (e.g., English-French sentence pairs) for training.'''

'Simple Neural Machine Translation (NMT)\n• Build a basic sequence-to-sequence model using Python libraries like TensorFlow or\nKeras.\n• Use small parallel datasets (e.g., English-French sentence pairs) for training.'

In [2]:
import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [3]:
en_data = pd.read_csv("cleaned_small_vocab_en.csv", header=None, on_bad_lines='skip', quoting=3)[0]
fr_data = pd.read_csv("cleaned_small_vocab_fr.csv", header=None, on_bad_lines='skip', quoting=3)[0]

In [4]:
en_data

0             "new jersey is sometimes quiet during autumn 
1         "the united states is usually chilly during july 
2                "california is usually quiet during march 
3         "the united states is sometimes mild during june 
4                     "your least liked fruit is the grape 
                                ...                        
118055                  "france is never busy during march 
118056         "india is sometimes beautiful during spring 
118057                   "india is never wet during summer 
118058              "france is never chilly during january 
118059                   "the orange is her favorite fruit 
Name: 0, Length: 118060, dtype: object

In [5]:
fr_data

0         "new jersey est parfois calme pendant l' automne 
1         "les états-unis est généralement froid en juil...
2               "california est généralement calme en mars 
3               "les états-unis est parfois légère en juin 
4                    "votre moins aimé fruit est le raisin 
                                ...                        
135821               "la france est jamais occupée en mars 
135822             "l' inde est parfois belle au printemps 
135823          "l' inde est jamais mouillé pendant l' été 
135824              "la france est jamais froid en janvier 
135825                     "l'orange est son fruit préféré 
Name: 0, Length: 135826, dtype: object

In [6]:
en_data.isnull().sum()

0

In [7]:
fr_data.isnull().sum()

0

In [8]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [9]:
en_tokenizer = Tokenizer()
en_tokenizer.fit_on_texts(en_data)
en_sequences = en_tokenizer.texts_to_sequences(en_data)

In [10]:
fr_tokenizer = Tokenizer()
fr_tokenizer.fit_on_texts(fr_data)
fr_sequences = fr_tokenizer.texts_to_sequences(fr_data)

In [11]:
en_max_len = max(len(seq) for seq in en_sequences)
fr_max_len = max(len(seq) for seq in fr_sequences)

In [12]:
en_sequences = pad_sequences(en_sequences, maxlen=en_max_len, padding='post')
fr_sequences = pad_sequences(fr_sequences, maxlen=fr_max_len, padding='post')

In [13]:
en_vocab_size = len(en_tokenizer.word_index) + 1
fr_vocab_size = len(fr_tokenizer.word_index) + 1

In [14]:
print("English Vocabulary Size:", en_vocab_size)
print("French Vocabulary Size:", fr_vocab_size)
print("Max Length (English):", en_max_len)
print("Max Length (French):", fr_max_len)

English Vocabulary Size: 199
French Vocabulary Size: 333
Max Length (English): 11
Max Length (French): 15


In [15]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

In [16]:
# Step 1: Define the Encoder
embedding_dim = 256
units = 512

In [17]:
encoder_inputs = Input(shape=(en_max_len,))
encoder_embedding = Embedding(en_vocab_size, embedding_dim)(encoder_inputs)
encoder_lstm, state_h, state_c = LSTM(units, return_state=True)(encoder_embedding)

In [18]:
encoder_states = [state_h, state_c]

In [19]:
decoder_inputs = Input(shape=(fr_max_len - 1,))  # Adjust the input shape here
decoder_embedding = Embedding(fr_vocab_size, embedding_dim)(decoder_inputs)
decoder_lstm, _, _ = LSTM(units, return_sequences=True, return_state=True)(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(fr_vocab_size, activation='softmax')
decoder_outputs = decoder_dense(decoder_lstm)

In [20]:
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [21]:
model.summary()

In [22]:
from sklearn.model_selection import train_test_split

# Prepare decoder input data (French sentences shifted by one timestep)
decoder_input_data = fr_sequences[:, :-1]  # Input: all but the last token
decoder_output_data = fr_sequences[:, 1:]  # Output: all but the first token

# Ensure that en_sequences and decoder_input_data have the same number of samples
min_samples = min(len(en_sequences), len(decoder_input_data))
en_sequences = en_sequences[:min_samples]
decoder_input_data = decoder_input_data[:min_samples]
decoder_output_data = decoder_output_data[:min_samples]

# Split into train, validation, and test sets
X_train_val, X_test, decoder_input_train_val, decoder_input_test = train_test_split(
    en_sequences, decoder_input_data, test_size=0.2, random_state=42
)
decoder_output_train_val, decoder_output_test = train_test_split(
    decoder_output_data, test_size=0.2, random_state=42
)

# Further split train_val into train and validation sets
X_train, X_val, decoder_input_train, decoder_input_val = train_test_split(
    X_train_val, decoder_input_train_val, test_size=0.2, random_state=42
)
decoder_output_train, decoder_output_val = train_test_split(
    decoder_output_train_val, test_size=0.2, random_state=42
)

# Reshape the target data
decoder_output_train = np.expand_dims(decoder_output_train, -1)
decoder_output_val = np.expand_dims(decoder_output_val, -1)
decoder_output_test = np.expand_dims(decoder_output_test, -1)  # Reshape test data

In [23]:
batch_size = 64 

epochs = 50

history = model.fit(
    [X_train, decoder_input_train], decoder_output_train,
    validation_data=([X_val, decoder_input_val], decoder_output_val),
    batch_size=batch_size,
    epochs=epochs
)

Epoch 1/50
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1523s[0m 1s/step - accuracy: 0.7565 - loss: 1.0415 - val_accuracy: 0.8237 - val_loss: 0.4868
Epoch 2/50
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1247s[0m 1s/step - accuracy: 0.8256 - loss: 0.4803 - val_accuracy: 0.8266 - val_loss: 0.4738
Epoch 3/50
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1087s[0m 899ms/step - accuracy: 0.8274 - loss: 0.4689 - val_accuracy: 0.8278 - val_loss: 0.4675
Epoch 4/50
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m532s[0m 451ms/step - accuracy: 0.8281 - loss: 0.4633 - val_accuracy: 0.8264 - val_loss: 0.4629
Epoch 5/50
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1598s[0m 1s/step - accuracy: 0.8290 - loss: 0.4596 - val_accuracy: 0.8276 - val_loss: 0.4631
Epoch 6/50
[1m1181/1181[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m356s[0m 301ms/step - accuracy: 0.8292 - loss: 0.4570 - val_accuracy: 0.8287 - val_loss: 0.46

In [29]:
# Save the trained model to a file
model.save('seq2seq_translation_model.h5')
print("Model saved successfully!")



Model saved successfully!


In [30]:
# Load the saved model
from tensorflow.keras.models import load_model

model = load_model('seq2seq_translation_model.h5')
print("Model loaded successfully!")



Model loaded successfully!


In [31]:
# Evaluate the model on validation data
val_loss, val_accuracy = model.evaluate([X_val, decoder_input_val], decoder_output_val, verbose=1)
print(f"Validation Loss: {val_loss}")
print(f"Validation Accuracy: {val_accuracy}")

[1m591/591[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 119ms/step - accuracy: 0.8129 - loss: 0.7395
Validation Loss: 0.7371774315834045
Validation Accuracy: 0.8125613331794739
