In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/frenchtransdata/fra.txt


In [2]:
import re
import string

def clean_text(text):
      text = text.replace('\xa0', ' ')
      text = re.sub(r'[^\x00-\x7F]+', '', text)
      text = text.lower()
      text = text.translate(str.maketrans('', '', string.punctuation))
      text = re.sub(r'\s+', ' ', text).strip()
      return text

In [3]:
text = open('/kaggle/input/frenchtransdata/fra.txt','r').read()
lines = text.split('\n')
lines[0]

'Go.\tVa !\tCC-BY 2.0 (France) Attribution: tatoeba.org #2877272 (CM) & #1158250 (Wittydev)'

In [4]:
import random 
x = []
y = []
for line in lines:
  sub_lines = line.split('\t')
  if len(sub_lines) == 3:
     eng_part = sub_lines[0].lower()
     fra_part = sub_lines[1].lower().replace('.','')
     x.append(clean_text(eng_part))
     y.append(f"<start> {fra_part} <end>")

In [5]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

2025-06-16 07:53:55.705981: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750060435.890382      18 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750060435.943774      18 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [6]:
english_token = Tokenizer(oov_token='UNK', num_words = 5000)
english_token.fit_on_texts(x)
english_vocab_size = len(english_token.word_index)

french_token = Tokenizer(oov_token='UNK', num_words = 5000)
french_token.fit_on_texts(y)
french_vocab_size = len(french_token.word_index)

In [7]:
english_sequences = english_token.texts_to_sequences(x)
french_sequences = french_token.texts_to_sequences(y)

max_eng_seq = 25
max_fra_seq = 25

padded_eng_seq = pad_sequences(english_sequences, max_eng_seq)
padded_fra_seq = pad_sequences(french_sequences, max_fra_seq)

In [8]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    padded_eng_seq, padded_fra_seq, test_size=0.2, random_state=42
)

In [9]:
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, Concatenate
from tensorflow.keras.layers import Attention
from tensorflow.keras.models import Model

In [10]:
embedding_dim = 64
latent_dim = 128
num_encoder_tokens = len(english_token.word_index) + 1
num_decoder_tokens = len(french_token.word_index) + 1

encoder_inputs = Input(shape=(None,))
encoder_embedding = Embedding(num_encoder_tokens, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

decoder_inputs = Input(shape=(None,))
decoder_embedding = Embedding(num_decoder_tokens, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)

attention = Attention()
attention_output = attention([decoder_outputs, encoder_outputs])

decoder_concat_input = Concatenate(axis=-1)([decoder_outputs, attention_output])

decoder_dense = Dense(num_decoder_tokens, activation='softmax')
decoder_outputs = decoder_dense(decoder_concat_input)

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

I0000 00:00:1750060459.673073      18 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13942 MB memory:  -> device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5
I0000 00:00:1750060459.673734      18 gpu_device.cc:2022] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 13942 MB memory:  -> device: 1, name: Tesla T4, pci bus id: 0000:00:05.0, compute capability: 7.5


In [11]:
model.summary()

In [12]:
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

callbacks = [
    EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2)
]

In [13]:
model.fit(
    [x_train, y_train[:, :-1]], 
    y_train[:, 1:],                     
    batch_size=64,
    epochs=15,
    validation_split=0.2,
    callbacks=callbacks
)

Epoch 1/15


I0000 00:00:1750060465.715086      60 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 158ms/step - accuracy: 0.7038 - loss: 2.1211 - val_accuracy: 0.7635 - val_loss: 1.2599 - learning_rate: 0.0010
Epoch 2/15
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 159ms/step - accuracy: 0.7770 - loss: 1.1586 - val_accuracy: 0.8086 - val_loss: 0.9414 - learning_rate: 0.0010
Epoch 3/15
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 159ms/step - accuracy: 0.8197 - loss: 0.8640 - val_accuracy: 0.8384 - val_loss: 0.7419 - learning_rate: 0.0010
Epoch 4/15
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 159ms/step - accuracy: 0.8484 - loss: 0.6774 - val_accuracy: 0.8590 - val_loss: 0.6182 - learning_rate: 0.0010
Epoch 5/15
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 159ms/step - accuracy: 0.8670 - loss: 0.5597 - val_accuracy: 0.8693 - val_loss: 0.5527 - learning_rate: 0.0010
Epoch 6/15
[1m2379/2379[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7ae5874804d0>

In [14]:
def predict_in_batches(model, x_test, y_test, batch_size=2):
    y_pred_ids = []
    for i in range(0, len(x_test), 1000):
        x_batch = x_test[i:i+1000]
        y_batch = y_test[i:i+1000, :-1]
        probs = model.predict([x_batch, y_batch], batch_size=batch_size)
        ids = probs.argmax(axis=-1)
        y_pred_ids.extend(ids)
        print(len(y_pred_ids))
    return np.array(y_pred_ids)

In [15]:
import numpy as np
y_pred_ids = predict_in_batches(model, x_test, y_test[:,:-1])  

rev_target_index = {v: k for k, v in french_token.word_index.items()}

def ids_to_text(seq):
    return [rev_target_index.get(idx, '') for idx in seq if idx > 0]

y_pred_texts = [" ".join(ids_to_text(seq)) for seq in y_pred_ids]
y_true_texts = [" ".join(ids_to_text(seq)) for seq in y_test[:,1:]]

[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 10ms/step
1000
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
2000
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
3000
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
4000
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
5000
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
6000
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
7000
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
8000
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
9000
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step
10000
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
11000
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
12000
[1m500/500[0m [32m━━━━━━━━━━━━━━━

In [16]:
from nltk.translate.bleu_score import corpus_bleu

bleu_score = corpus_bleu([[ref.split()] for ref in y_true_texts], [pred.split() for pred in y_pred_texts])
print("BLEU Score:", bleu_score)

BLEU Score: 0.3792811465925518
