In [1]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
devicharith_language_translation_englishfrench_path = kagglehub.dataset_download('devicharith/language-translation-englishfrench')

print('Data source import complete.')


Data source import complete.


In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/language-translation-englishfrench/eng_-french.csv


# Data preprocesing

In [3]:
import tensorflow as tf
from tensorflow.keras.layers import LSTM,Input,Embedding,TimeDistributed,Dense,AdditiveAttention,Concatenate

In [4]:
data = pd.read_csv("/kaggle/input/language-translation-englishfrench/eng_-french.csv")

In [5]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 175621 entries, 0 to 175620
Data columns (total 2 columns):
 #   Column                   Non-Null Count   Dtype 
---  ------                   --------------   ----- 
 0   English words/sentences  175621 non-null  object
 1   French words/sentences   175621 non-null  object
dtypes: object(2)
memory usage: 2.7+ MB


In [6]:
data.head()

Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [7]:
data.isna().sum()

Unnamed: 0,0
English words/sentences,0
French words/sentences,0


In [8]:
data = data.rename(columns={"English words/sentences":"english","French words/sentences":"french"})
data = data[:100000]
data.head()

Unnamed: 0,english,french
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !


In [9]:
def data_preprocessing(data_point):
    eng_text = tf.strings.strip(data_point["english"])
    french_text = tf.strings.strip(data_point["french"])
    sos = tf.constant("<SOS>")
    eos = tf.constant("<EOS>")
    french_text = tf.strings.join([sos,french_text,eos],separator=" ")
    return {"english":eng_text,"french":french_text}

In [10]:
tf_dataset = tf.data.Dataset.from_tensor_slices(dict(data))
tf_dataset = tf_dataset.map(data_preprocessing,num_parallel_calls=tf.data.AUTOTUNE)
len(tf_dataset)

100000

In [11]:
test_data = tf_dataset.take(5)
for data_point in test_data:
    print({k:v.numpy() for k,v in data_point.items()})

{'english': b'Hi.', 'french': b'<SOS> Salut! <EOS>'}
{'english': b'Run!', 'french': b'<SOS> Cours\xe2\x80\xaf! <EOS>'}
{'english': b'Run!', 'french': b'<SOS> Courez\xe2\x80\xaf! <EOS>'}
{'english': b'Who?', 'french': b'<SOS> Qui ? <EOS>'}
{'english': b'Wow!', 'french': b'<SOS> \xc3\x87a alors\xe2\x80\xaf! <EOS>'}


In [12]:
raw_data = [
    {'english': b'Hi.', 'french': b'<SOS> Salut! <EOS>'},
    {'english': b'Run!', 'french': b'<SOS> Cours\xe2\x80\xaf! <EOS>'},
    {'english': b'Run!', 'french': b'<SOS> Courez\xe2\x80\xaf! <EOS>'},
    {'english': b'Who?', 'french': b'<SOS> Qui ? <EOS>'},
    {'english': b'Wow!', 'french': b'<SOS> \xc3\x87a alors\xe2\x80\xaf! <EOS>'}
]

# Create a tf.data.Dataset from your data
# We'll extract just the French sentences for this demo
french_sentences_bytes = [item['french'] for item in raw_data]
dataset = tf.data.Dataset.from_tensor_slices(french_sentences_bytes)

In [13]:
from tensorflow.keras.layers import TextVectorization

vectorizer = TextVectorization(standardize='lower_and_strip_punctuation')
vectorizer.adapt(dataset)
vectorizer.get_vocabulary()

['',
 '[UNK]',
 np.str_('sos'),
 np.str_('eos'),
 np.str_('Ça'),
 np.str_('salut'),
 np.str_('qui'),
 np.str_('cours\u202f'),
 np.str_('courez\u202f'),
 np.str_('alors\u202f')]

In [14]:
english_vectorizer = tf.keras.layers.TextVectorization()
french_vectorizer = tf.keras.layers.TextVectorization()

for batch in tf_dataset.batch(len(tf_dataset)):
    english_vectorizer.adapt(batch["english"])
    french_vectorizer.adapt(batch["french"])

english_vocab,french_vocab = len(english_vectorizer.get_vocabulary()),len(french_vectorizer.get_vocabulary())
english_vocab,french_vocab

(8930, 21327)

In [15]:
def vectorization_func(x):
    english_vec = english_vectorizer(x["english"])
    french_vec = french_vectorizer(x["french"])
    french_input = french_vec[:-1]
    french_target = tf.reshape(french_vec[1:],(-1,1))
    return ({"english":english_vec,"french_input":
            french_input},french_target)

In [32]:
tf_dataset_vectorized = tf_dataset.map(vectorization_func, num_parallel_calls=tf.data.AUTOTUNE)
tf_dataset_vectorized = tf_dataset_vectorized.shuffle(1000000)
dataset_size = len(tf_dataset_vectorized)
train_size = int(dataset_size*0.8)
train = tf_dataset_vectorized.take(train_size)
test = tf_dataset_vectorized.skip(train_size)
print(f"Train size {train_size} actual length {len(train)}")
test_size = dataset_size-train_size
val_size = int((dataset_size-train_size)*0.5)
test_size = (dataset_size-train_size)-val_size
val = test.skip(test_size)
test = test.take(test_size)
print(f"Test size {test_size} actual length {len(test)}")
print(f"val size {val_size} actual length {len(val)}")

Train size 80000 actual length 80000
Test size 10000 actual length 10000
val size 10000 actual length 10000


In [33]:
padded_shapes = (
    {
        "english":[None],
        "french_input":[None]
    },
    [None,1]
)

In [34]:
train = train.shuffle(10000).padded_batch(batch_size=32, padded_shapes=padded_shapes).prefetch(tf.data.AUTOTUNE)
val = val.padded_batch(batch_size=32, padded_shapes=padded_shapes).prefetch(tf.data.AUTOTUNE)
test = test.padded_batch(batch_size=1, padded_shapes=padded_shapes).prefetch(tf.data.AUTOTUNE)

# building custom attention layers

In [19]:
from tensorflow.keras.layers import Layer

class BahdanauAttention(Layer):
    def __init__(self,units):
        super().__init__()
        self.w1 = Dense(units)
        self.w2 = Dense(units)
        self.v = Dense(1)
    def call(self,query,values):
        # query shape (batch,dec_seq,units)
        # values shape (batch,enc_seq,units)
        # after expanding shape (batch,dec_seq,1,units) , (batch,1,enc_seq,units)
        query = tf.expand_dims(query,axis=2)
        values = tf.expand_dims(values,axis=1)

        # result shape (batch,dec_seq,enc_seq,units)
        result = tf.math.tanh(self.w1(query)+self.w2(values))

        #score in shape (batch,dec_seq,enc_seq,1)
        score = self.v(result)

        # applying softmax in enc_seq
        attention_weights = tf.nn.softmax(score,axis=2)
        # applying attention weights of shape (batch,dec_seq,enc_seq,1) in values of shape (batch,1,enc_seq,units)
        # gives (batch,dec_seq,enc_seq,units)
        context_vector = attention_weights*values

        # gives shape (batch,dec_seq,units)
        context_vector = tf.reduce_sum(context_vector,axis=2)

        return context_vector,attention_weights




In [20]:
class LuongAttention(Layer):
    def __init__(self):
        super().__init__()

    def call(self,query,values):
        # query shape (batch,dec_seq,units)
        # values shape (batch,enc_seq,units)
        # for dot similarity of 2 matrix we do matrix multiplication of transpose
        # values_transposed (batch,units,enc_seq)
        values_transposed = tf.transpose(values,perm=[0,2,1])

        # score shape (batch,dec_seq,enc_seq)
        score = tf.matmul(query,values_transposed)

        attention_weights = tf.nn.softmax(score,axis=-1)

        # shape (batch,dec_seq,units)
        contex_vector = tf.matmul(attention_weights,values)

        return contex_vector, attention_weights

# building Encoder - Decoder

## Encoder

In [21]:
latent_dim = 256


In [22]:
encoder_input = Input((None,))
encoder_embedding = Embedding(english_vocab,latent_dim,trainable=True)(encoder_input)
encoder_lstm1 = LSTM(latent_dim,return_state=True,return_sequences=True)
encoder_lstm2 = LSTM(latent_dim,return_state=True,return_sequences=True)
encoder_output1,_ ,_ = encoder_lstm1(encoder_embedding)
encoder_output, encoder_state_h, encoder_state_c = encoder_lstm2(encoder_output1)

## decoder

In [23]:
decoder_input = Input((None,))
decoder_embedding_layer = Embedding(french_vocab,latent_dim,trainable=True)
decoder_embedding = decoder_embedding_layer(decoder_input)
decoder_lstm = LSTM(latent_dim,return_sequences=True,return_state=True)
decoder_lstm_output, _, _ = decoder_lstm(decoder_embedding,initial_state=[encoder_state_h, encoder_state_c])
# decoder_attention = AdditiveAttention()([decoder_lstm_output,encoder_output])
# decoder_attention,_ = BahdanauAttention(latent_dim)(decoder_lstm_output,encoder_output)
decoder_attention,_ = LuongAttention()(decoder_lstm_output,encoder_output)
decoder_concat_input = Concatenate(axis=-1)([decoder_lstm_output,decoder_attention])
decoder_dropout_input = tf.keras.layers.Dropout(0.5)(decoder_concat_input)
decoder_dense = TimeDistributed(Dense(french_vocab,"softmax"))
decoder_dense_output = decoder_dense(decoder_dropout_input)

# encoder-decoder model

In [24]:
model = tf.keras.Model(inputs={"english":encoder_input,"french_input":decoder_input},outputs=[decoder_dense_output])
model.summary()



In [25]:
model.compile(optimizer="adam",loss="sparse_categorical_crossentropy",metrics=["accuracy"])
model.fit(train,epochs=10,validation_data=val,verbose=1)

Epoch 1/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 47ms/step - accuracy: 0.5276 - loss: 3.4567 - val_accuracy: 0.6353 - val_loss: 2.2209
Epoch 2/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 45ms/step - accuracy: 0.6457 - loss: 2.1830 - val_accuracy: 0.7034 - val_loss: 1.6534
Epoch 3/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 45ms/step - accuracy: 0.6951 - loss: 1.7362 - val_accuracy: 0.7547 - val_loss: 1.2959
Epoch 4/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m133s[0m 48ms/step - accuracy: 0.7401 - loss: 1.4074 - val_accuracy: 0.7976 - val_loss: 1.0295
Epoch 5/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 48ms/step - accuracy: 0.7703 - loss: 1.1900 - val_accuracy: 0.8196 - val_loss: 0.8750
Epoch 6/10
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 47ms/step - accuracy: 0.7937 - loss: 1.0251 - val_accuracy: 0.8393 - val_loss: 0.749

<keras.src.callbacks.history.History at 0x7f0998e7bfd0>

# Building seperate enoder and decoder model for inference

In [26]:
encoder = tf.keras.Model(encoder_input,[encoder_output, encoder_state_h, encoder_state_c])
encoder.summary()

In [27]:
decoder_state_h_input = Input((latent_dim,))
decoder_state_c_input = Input((latent_dim,))
encoder_output_inference = Input((None,latent_dim))
decoder_inference_embedding = decoder_embedding_layer(decoder_input)
decoder_lstm_inference_output, decoder_inference_state_h, decoder_inference_state_c = decoder_lstm(decoder_inference_embedding,initial_state=[decoder_state_h_input, decoder_state_c_input])
# decoder_attention_inference = AdditiveAttention()([decoder_lstm_inference_output,encoder_output_inference])
# decoder_attention_inference,_ = BahdanauAttention(latent_dim)(decoder_lstm_inference_output,encoder_output_inference)
decoder_attention_inference,_ = LuongAttention()(decoder_lstm_inference_output,encoder_output_inference)
decoder_concat_inference = Concatenate(axis=-1)([decoder_lstm_inference_output,decoder_attention_inference])
decoder_dense_inference_output = decoder_dense(decoder_concat_inference)

In [28]:
decoder = tf.keras.Model([decoder_input,decoder_state_h_input, decoder_state_c_input,encoder_output_inference],[decoder_dense_inference_output,decoder_inference_state_h, decoder_inference_state_c])
decoder.summary()

# Inference

In [31]:
english_token_to_text = english_vectorizer.get_vocabulary()
french_token_to_text = french_vectorizer.get_vocabulary()


In [29]:
def decoder_inference(input_seq):

    encoder_output,state_value_h,state_value_c = encoder.predict(input_seq,verbose=0)

    target_seq = np.zeros((1,1))
    target_seq[0,0] = french_vectorizer("<SOS>").numpy()

    stop_condition = True

    decoded_sentence = ""

    while(stop_condition):

        output_token,state_value_h,state_value_c = decoder.predict([target_seq,state_value_h,state_value_c,encoder_output],verbose=0)
        output_word = french_token_to_text[output_token[0,0,:].argmax()]
        decoded_sentence = (decoded_sentence + " " + output_word) if (not output_word == "eos") else decoded_sentence
        target_seq[0,0] =  output_token[0,0,:].argmax()

        if output_word == "eos" or len(decoded_sentence.split(" ")) > 20:
            stop_condition = False

    return decoded_sentence

In [67]:
def translate(input_seq):
  batch_size = input_seq.shape[0]
  encoder_output, state_value_h, state_value_c = encoder.predict(input_seq,verbose=0)
  target_seq = np.ones((batch_size,1))*french_vectorizer("<SOS>").numpy()
  active = np.ones((batch_size),dtype=bool)
  decoded_sentences = [[] for _ in range(batch_size)]
  for _ in range(30):
    decoder_preds, state_value_h, state_value_c = decoder.predict([target_seq,state_value_h,state_value_c,encoder_output],verbose=0)
    target_seq = decoder_preds.argmax(axis=-1)

    for i in range(batch_size):
      output_token = french_token_to_text[target_seq[i,0]]
      if (output_token != "eos") and (active[i]==True):
        decoded_sentences[i].append(output_token)

      if (output_token=="eos") and (active[i]==True):
        active[i] = False

  return [" ".join(text) for text in decoded_sentences]

In [37]:
for features,target in test.take(20):
    english_words = [english_token_to_text[token] for token in features["english"][0].numpy()]
    english_text = " ".join(english_words)
    french_words = [french_token_to_text[token] for token in features["french_input"][0,1:].numpy()]
    french_text = " ".join(french_words)
    decoded_sentence = decoder_inference(features["english"])
    print(f"Input: {english_text}")
    print(f"Decoded: {decoded_sentence}")
    print(f"Expected: {french_text}")
    print("-" * 20)

Input: i didnt ask you to come here
Decoded:  je ne vous ai pas demandé de venir ici
Expected: je ne vous ai pas demandé de venir ici
--------------------
Input: did you make it by yourself
Decoded:  lavezvous fait vousmême 
Expected: lastu fait toimême
--------------------
Input: there was something else
Decoded:  il y avait quelque chose dautre
Expected: il y avait quelque chose dautre
--------------------
Input: youre very rude
Decoded:  vous êtes très grossière
Expected: vous êtes très grossiers
--------------------
Input: she was wearing long boots
Decoded:  elle portait des longs longs
Expected: elle portait des cuissardes
--------------------
Input: youre disgusting
Decoded:  vous êtes complètement file
Expected: tu es dégoûtant
--------------------
Input: he consented on the spot
Decoded:  il a eu du mal à la tête
Expected: il y a immédiatement consenti
--------------------
Input: are you watching carefully
Decoded:  avezvous des problèmes
Expected: regardestu attentivement
---

In [73]:
!pip install -q evaluate
import evaluate
metric = evaluate.load("bleu")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
[?25h

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules: 0.00B [00:00, ?B/s]

In [74]:
test = test.unbatch().padded_batch(64,padded_shapes=padded_shapes)
for features, target in test:
  actual_translation = []
  for label in features["french_input"]:
    french_words = [french_token_to_text[token] for token in label[1:].numpy()]
    french_text  = " ".join(french_words)
    actual_translation.append(french_text)
  predicted_translation=translate(features["english"])
  metric.add_batch(predictions=predicted_translation,references=actual_translation)

In [75]:
metric.compute()

{'bleu': 0.47242170931348254,
 'precisions': [0.6829060506399335,
  0.5245095616939269,
  0.4133560410048145,
  0.3364191639625883],
 'brevity_penalty': 1.0,
 'length_ratio': 1.018832885892493,
 'translation_length': 50474,
 'reference_length': 49541}

In [83]:
from pathlib import Path
MODEL_DIR = Path("models")
MODEL_DIR.mkdir(parents=True,exist_ok=True)

encoder_path = MODEL_DIR / "encoder.keras"
decoder_path = MODEL_DIR / "decoder.keras"

encoder.save(encoder_path)
decoder.save(decoder_path)