In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import plotly.express as px
import seaborn as sns
from tqdm.auto import tqdm 
import torch
import torch.nn as nn
import torch.optim as optim

import re
from nltk.corpus import stopwords 
from collections import Counter 
from string import punctuation 

from sklearn.model_selection import train_test_split 

import tensorflow as tf 
from tensorflow import keras 
from tensorflow.keras.preprocessing.text import Tokenizer 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model 
from tensorflow.keras.layers import Conv1D , Dense , Embedding , Dropout , LayerNormalization , MultiHeadAttention
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from tensorflow.keras.optimizers import Adam   
import torch
from torchmetrics.text import BLEUScore
bleu1 = BLEUScore()
bleu2 = BLEUScore()
from torchmetrics.text import WordErrorRate
WER1 = WordErrorRate()
WER2 = WordErrorRate()

2024-05-10 06:40:06.913164: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-10 06:40:06.913307: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-10 06:40:07.036572: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
gpus = tf.config.experimental.list_physical_devices('GPU')
for gpu in gpus:
    print("Name:", gpu.name, "  Type:", gpu.device_type)

Name: /physical_device:GPU:0   Type: GPU


In [3]:
df = pd.read_csv('/kaggle/input/en-fr-translation-dataset/en-fr.csv' , nrows=500000)


In [4]:
def english_preprocessing(data , col) : 
    data[col] = data[col].astype(str) 
    data[col] = data[col].apply(lambda x: x.lower())
    data[col] = data[col].apply(lambda x: re.sub("[^A-Za-z\s]","",x)) 
    data[col] = data[col].apply(lambda x: x.replace("\s+"," "))
    data[col] = data[col].apply(lambda x: " ".join([word for word in x.split()]))
    return data 

def french_preprocessing(data , col) : 
    data[col] = data[col].astype(str) 
    data[col] = data[col].apply(lambda x : x.lower()) 
    data[col] = data[col].apply(lambda x: re.sub(r'\d','',x))
    data[col] = data[col].apply(lambda x: re.sub(r'\s+',' ',x))
    data[col] = data[col].apply(lambda x: re.sub(r"[-()\"#/@;:<>{}`+=~|.!?,।]", "", x))
    data[col] = data[col].apply(lambda x: x.strip()) 
    data[col] = "<sos> " + data[col] + " <eos>" 
    return data

In [5]:
df = french_preprocessing(df , 'fr')
df = english_preprocessing(df , 'en')

In [6]:
df["en_len"] = [len(text.split()) for text in df.en]
df['fr_len'] = [len(text.split()) for text in df.fr]

In [7]:
df = df[~(df['en_len'] < 5) & ~(df['en_len'] > 20)]
df = df[~(df['fr_len'] < 5) & ~(df['fr_len'] > 20)]

In [8]:
df.head()

Unnamed: 0,en,fr,en_len,fr_len
7,the sky of the first inhabitants a contemporar...,<sos> le ciel des premiers habitants la vision...,15,15
12,astronomers introduction introduction video wh...,<sos> astronomes introduction vidéo d'introduc...,7,9
14,the name is derived from the greek root astron...,<sos> son nom vient du grec astron qui veut di...,17,18
18,it prompts us to ask the deepest existential q...,<sos> l'astronomie évoque donc aussi les grand...,9,12
22,the lure of these universal enigmas was the sp...,<sos> l'attrait exercé par ces énigmes univers...,19,18


In [9]:
MINLEN_en = np.min(df['en_len'])
MAXLEN_en = np.max(df['en_len'])
MINLEN_fr = np.min(df['en_len'])
MAXLEN_fr= np.max(df['fr_len'])

In [10]:
MINLEN_en , MAXLEN_en,MINLEN_fr , MAXLEN_fr

(5, 20, 5, 20)

In [11]:
def Vectorization(col , MAXLEN = 20) : 
    sents = df[col].tolist() 
    
    # Build vocabulary 
    corpus = [word for text in df[col] for word in text.split()] 
    vocab_size = len(Counter(corpus)) 
    
    tokenizer = Tokenizer(num_words=vocab_size , oov_token = "<OOV>" , 
                          filters='!#$%&()*+,-/:;<=>@«»""[\\]^_`{|}~\t\n'
                         )
    tokenizer.fit_on_texts(sents) 
    
    tokenizer.word_index['<pad>'] = 0 
    tokenizer.index_word[0] = '<pad>' 
    
    vocab_to_idx = tokenizer.word_index 
    idx_to_vocab = tokenizer.index_word 
    
    # Text Vectorization 
    seqs = tokenizer.texts_to_sequences(sents) 
    
    pad_seqs = pad_sequences(seqs , maxlen = MAXLEN , padding='post')
    
    return vocab_to_idx , idx_to_vocab , pad_seqs , tokenizer

In [12]:
en_vocab , en_inv_vocab , en_seqs , en_tokenizer = Vectorization('en')
fr_vocab , fr_inv_vocab , fr_seqs , fr_tokenizer = Vectorization('fr')

In [13]:
x_train , x_val , y_train , y_val = train_test_split(en_seqs , fr_seqs , train_size = 0.80, random_state = 42)

In [14]:
x_train.shape , x_val.shape

((135318, 20), (33830, 20))

In [15]:
BATCH_SIZE = 32
BUFFER_SIZE = 1000

In [16]:
train_set = tf.data.Dataset.from_tensor_slices((x_train , y_train))
train_set = train_set.shuffle(BUFFER_SIZE).batch(BATCH_SIZE , drop_remainder = True)

val_set = tf.data.Dataset.from_tensor_slices((x_val , y_val))
val_set = val_set.batch(BATCH_SIZE , drop_remainder = True)

In [17]:
print(f"the size of the training set {len(train_set)} batches of {BATCH_SIZE}")


the size of the training set 4228 batches of 32


In [18]:
print(f"the size of the validation set {len(val_set)} batches of {BATCH_SIZE}")


the size of the validation set 1057 batches of 32


In [19]:
# # define parameters
# EMBEDDING_DIM = 256
# SRC_VOCAB_SIZE = len(en_vocab) + 1 # 55126
# TRG_VOCAB_SIZE = len(fr_vocab) + 1 # 73164
# HIDDEN_DIM = 512
# MAXLEN = 20
# EPOCHS = 50
# LR = 0.001

In [20]:
with tf.device("/CPU:0"):
    src_sample , trg_sample = next(iter(val_set))

In [21]:
src_sample.shape , trg_sample.shape

(TensorShape([32, 20]), TensorShape([32, 20]))

In [22]:
class EncoderBlock(tf.keras.layers.Layer) : 
    def __init__(self , embedding_dim , num_heads , fc_dim , dropout_rate = 0.1) : 
        super(EncoderBlock , self).__init__() 
        
        self.MHA = MultiHeadAttention(num_heads=num_heads , key_dim=embedding_dim) 
        
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6) 
        
        self.dropout1 = Dropout(dropout_rate) 
        self.dropout2 = Dropout(dropout_rate) 
        
        self.fc = tf.keras.Sequential([
            Dense(fc_dim , activation = 'relu') , 
            Dense(embedding_dim)
        ]) 
        
    def call(self , x) : 
        
        attn_out = self.MHA(x , x) 
        attn_out = self.dropout1(attn_out) 
        out1 = self.norm1(x + attn_out) 
        
        fc_out = self.dropout2(self.fc(out1)) 
        
        enc_out = self.norm2(out1 + fc_out) 
        
        return enc_out

In [23]:
class Encoder(tf.keras.layers.Layer) : 
    def __init__(
        self , 
        src_vocab_size , 
        max_length ,
        num_layers , 
        embedding_dim , 
        num_heads , 
        fc_dim ,  
        dropout_rate = 0.1
    ) : 
        super(Encoder , self).__init__() 
        
        self.num_layers = num_layers 
        
        self.embedding = Embedding(src_vocab_size , embedding_dim) 
        self.pos_encoding = Embedding(max_length , embedding_dim) 

        self.enc_layers = [EncoderBlock(embedding_dim , num_heads , fc_dim , dropout_rate)
                          for _ in range(num_layers)] 
        
        self.dropout = Dropout(dropout_rate)
        
    def call(self , x ) : 
        batch_size = tf.shape(x)[0] 
        seqlen = tf.shape(x)[1]
        
        positions = tf.range(start=0, limit=seqlen, delta=1) 
        positions = tf.expand_dims(positions , axis = 0) 
        positions = tf.tile(positions , [batch_size , 1])
        
        x = self.dropout((self.embedding(x) + self.pos_encoding(positions)))  
        
        for i in range(self.num_layers) : 
            x = self.enc_layers[i](x)
        
        return x # (batch_size , seqlen , embedding_dim)
        
    

In [24]:
class DecoderBlock(tf.keras.layers.Layer) : 
    def __init__(self , embedding_dim , num_heads , fc_dim , dropout_rate = 0.1) : 
        super(DecoderBlock , self).__init__() 
        
        self.MHA1 = MultiHeadAttention(num_heads=num_heads , key_dim=embedding_dim)
        self.MHA2 = MultiHeadAttention(num_heads=num_heads , key_dim=embedding_dim)
        
        self.norm1 = LayerNormalization(epsilon=1e-6)
        self.norm2 = LayerNormalization(epsilon=1e-6) 
        self.norm3 = LayerNormalization(epsilon=1e-6)
        
        self.dropout1 = Dropout(dropout_rate) 
        self.dropout2 = Dropout(dropout_rate) 
        self.dropout3 = Dropout(dropout_rate)
        
        self.fc = tf.keras.Sequential([
            Dense(fc_dim , activation = 'relu') , 
            Dense(embedding_dim)
        ])
        
    def look_ahead_mask(self , trg) : 
        batch_size = tf.shape(trg)[0] 
        seqlen = tf.shape(trg)[1] 
        
        i = tf.range(seqlen)[:, None]
        j = tf.range(seqlen)
        m = i >= j - seqlen + seqlen
        mask = tf.cast(m, tf.bool)
        mask = tf.reshape(mask, [1, seqlen, seqlen])
        mult = tf.concat(
            [batch_size[..., tf.newaxis], tf.constant([1, 1], dtype=tf.int32)], 0
        )
        
        return tf.tile(mask, mult)
        
    def call(self , x , enc_output) : 
        mask = self.look_ahead_mask(x) 
        
        attn1 = self.MHA1(x , x , attention_mask = mask) 
        attn1 = self.dropout1(attn1) 
        out1 = self.norm1(attn1 + x) 
        
        attn2 = self.MHA2(out1 , enc_output) 
        attn2 = self.dropout2(attn2) 
        out2 = self.norm2(attn2 + out1) 
        
        fc_out = self.dropout3(self.fc(out2)) 
        
        dec_out = self.norm3(fc_out + out2) 
        
        return dec_out

In [25]:
class Decoder(tf.keras.layers.Layer) : 
    def __init__(
        self , 
        num_layers ,
        embedding_dim , 
        num_heads , 
        fc_dim , 
        trg_vocab_size , 
        max_length , 
        dropout_rate = 0.1
    ) : 
        super(Decoder , self).__init__() 
        
        self.num_layers = num_layers 
        
        self.embedding = Embedding(trg_vocab_size , embedding_dim) 
        self.pos_encoding = Embedding(max_length , embedding_dim) 
        
        self.dec_layers = [DecoderBlock(embedding_dim , num_heads , fc_dim , dropout_rate)
                          for _ in range(num_layers)] 
        
        self.dropout = Dropout(dropout_rate)
        
    def call(self , trg , enc_output) : 
        batch_size = tf.shape(trg)[0] 
        seqlen = tf.shape(trg)[1]
        
        positions = tf.range(start=0, limit=seqlen, delta=1) 
        positions = tf.expand_dims(positions , axis = 0) 
        positions = tf.tile(positions , [batch_size , 1])
        
        x = self.dropout((self.embedding(trg) + self.pos_encoding(positions)))  
        
        for i in range(self.num_layers) : 
            x = self.dec_layers[i](x , enc_output)
        
        return x # (batch_size , seqlen , embedding_dim)

In [26]:
class Transformer(Model) : 
    def __init__(
        self , 
        enc_num_layers ,
        dec_num_layers,
        embedding_dim , 
        num_heads , 
        fc_dim , 
        src_vocab_size , 
        src_max_length ,
        trg_vocab_size , 
        trg_max_length ,
        dropout_rate = 0.1 
    ) : 
        super(Transformer , self).__init__() 
        
        self.encoder = Encoder(
            src_vocab_size , 
            src_max_length ,
            enc_num_layers , 
            embedding_dim , 
            num_heads , 
            fc_dim ,  
            dropout_rate 
        )
        
        self.decoder = Decoder(
            dec_num_layers , 
            embedding_dim , 
            num_heads , 
            fc_dim , 
            trg_vocab_size , 
            trg_max_length , 
            dropout_rate
        ) 
        
        self.fc_out = Dense(trg_vocab_size) 
        
    def call(self , src , trg) : 
        
        enc_output = self.encoder(src) 
        
        dec_output = self.decoder(trg , enc_output) 
        
        out = self.fc_out(dec_output) 
        
        return out

In [27]:
# set hyperparameters
EPOCHS = 500
EMBEDDING_DIM = 512 
FC_DIM = 4
enc_num_layers =3
dec_num_layers = 3
NUM_HEADS = 8 

SRC_VOCAB_SIZE = len(en_tokenizer.word_index)
SRC_MAXLEN = MAXLEN_en

TRG_VOCAB_SIZE = len(fr_tokenizer.word_index)
TRG_MAXLEN = MAXLEN_fr
LR = 0.0001 
DROPOUT_RATE = 0.1

model = Transformer(
    enc_num_layers ,
    dec_num_layers,
    EMBEDDING_DIM , 
    NUM_HEADS , 
    FC_DIM , 
    SRC_VOCAB_SIZE ,  
    SRC_MAXLEN ,
    TRG_VOCAB_SIZE ,  
    TRG_MAXLEN , 
    DROPOUT_RATE)

In [28]:
temp_trg_out = model(src_sample , trg_sample)
model.summary()

In [29]:
loss_object = SparseCategoricalCrossentropy(from_logits=True) 
optimizer = Adam(LR)

def criterion(real , pred) : 
    mask = tf.math.logical_not(tf.math.equal(real , 0)) 
    
    loss = loss_object(real , pred) 
    
    mask = tf.cast(mask , dtype = loss.dtype)
    
    loss *= mask 
    
    return tf.reduce_sum(loss) / tf.reduce_sum(mask)

In [30]:
train_loss = tf.keras.metrics.Mean(name='train_loss')
val_loss = tf.keras.metrics.Mean(name='val_loss')
# test_loss = tf.keras.metrics.Mean(name='test_loss')

In [39]:
def evalute(sample) :  
    sample = tf.expand_dims(sample,0)
    decoder_input = fr_tokenizer.texts_to_sequences(['sos'])
    decoder_input = tf.convert_to_tensor(np.array(decoder_input) , dtype = tf.int64)
    
    for i in range(MAXLEN_fr) : # maxlen = 20        
        preds = model(
            sample , 
            decoder_input 
        )
        
        preds = preds[: , -1: , :] # (batch_size, 1, vocab_size) 
        predicted_id = tf.cast(tf.argmax(preds, axis=-1), tf.int64) 
        
        if predicted_id == fr_tokenizer.word_index['eos'] : 
            return tf.squeeze(decoder_input , axis = 0)
        
        decoder_input = tf.concat([decoder_input, predicted_id], axis=1)
        
    return tf.squeeze(decoder_input, axis=0)

In [32]:
def prediction():
    preds = []
    targets = []
    for sample,target in zip(src_sample,trg_sample) :
        result = evalute(sample)
        pred_sent = ' '.join([fr_tokenizer.index_word[idx] for idx in result.numpy() if idx != 0 and idx != 2 and idx !=3])
        trg_sent = ' '.join([fr_tokenizer.index_word[idx] for idx in target.numpy() if idx != 0 and idx != 2 and idx !=3])
        preds.append(pred_sent)
        targets.append([trg_sent])
        print(f"Actual correction    : {trg_sent}")
        print(f"Predicted correction : {pred_sent}\n")

In [33]:
@tf.function 
def train_step(src , trg) : 
    decoder_input = trg[: , :-1] 
    trg_reals = trg[: , 1:] 
    
    with tf.GradientTape() as tape : 
        preds = model(src , decoder_input)
        
        loss = criterion(trg_reals , preds) 
        
    gradients = tape.gradient(loss , model.trainable_variables) 
    optimizer.apply_gradients(zip(gradients , model.trainable_variables)) 
    
    train_loss(loss)
    
@tf.function 
def val_step(src , trg) : 
    decoder_input = trg[: , :-1] 
    trg_reals = trg[: , 1:] 
    
    preds = model(src , decoder_input) 
    
    loss = criterion(trg_reals , preds) 
    
    val_loss(loss)

In [34]:
train_step(src_sample,trg_sample)

In [35]:
best_val_loss = float('inf')
best_epoch = 0
early_stop = 0

In [36]:
with tf.device('/GPU:0') :
    train_losses = [] 
    val_losses = []
    for epoch in tqdm(range(EPOCHS)) :   
        train_loss.reset_state()  
        val_loss.reset_state()    
        
        for src , trg in tqdm(train_set) : 
            train_step(src , trg)
         
        for src , trg in tqdm(val_set) : 
            val_step(src , trg) 
             
        train_losses.append(train_loss.result())
        val_losses.append(val_loss.result()) 
        
        # if (epoch + 1) % 10 == 0 : 
        print(f"\n[Epoch :  {epoch+1}/{EPOCHS}] [Training Loss : {train_losses[-1]:0.5f}] [Validation Loss : {val_losses[-1]:0.5f}] \n")

        # if (epoch+1) % 10 == 0 :
        #     prediction()
        if val_losses[-1] < best_val_loss :
            best_val_loss = val_losses[-1]
            best_epoch = epoch
            early_stop = 0
            print(f'Best Epoch {best_epoch} , Best Val Loss {best_val_loss}')
            
        else :
            early_stop += 1
        if early_stop == 5 :
            break
        if val_losses[-1] <= 0.01:
            break

  0%|          | 0/500 [00:00<?, ?it/s]

  0%|          | 0/4228 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]


[Epoch :  1/500] [Training Loss : 3.90198] [Validation Loss : 3.05442] 

Best Epoch 0 , Best Val Loss 3.054415225982666


  0%|          | 0/4228 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]


[Epoch :  2/500] [Training Loss : 2.53748] [Validation Loss : 2.50634] 

Best Epoch 1 , Best Val Loss 2.506340742111206


  0%|          | 0/4228 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]


[Epoch :  3/500] [Training Loss : 1.88593] [Validation Loss : 2.32191] 

Best Epoch 2 , Best Val Loss 2.3219079971313477


  0%|          | 0/4228 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]


[Epoch :  4/500] [Training Loss : 1.42041] [Validation Loss : 2.30905] 

Best Epoch 3 , Best Val Loss 2.309049129486084


  0%|          | 0/4228 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]


[Epoch :  5/500] [Training Loss : 1.04465] [Validation Loss : 2.38043] 



  0%|          | 0/4228 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]


[Epoch :  6/500] [Training Loss : 0.74636] [Validation Loss : 2.52411] 



  0%|          | 0/4228 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]


[Epoch :  7/500] [Training Loss : 0.52561] [Validation Loss : 2.67895] 



  0%|          | 0/4228 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]


[Epoch :  8/500] [Training Loss : 0.37514] [Validation Loss : 2.85663] 



  0%|          | 0/4228 [00:00<?, ?it/s]

  0%|          | 0/1057 [00:00<?, ?it/s]


[Epoch :  9/500] [Training Loss : 0.28057] [Validation Loss : 2.97448] 



In [40]:
preds = []
targets = []
for sample,target in zip(src_sample,trg_sample) :
    result = evalute(sample)
    pred_sent = ' '.join([fr_tokenizer.index_word[idx] for idx in result.numpy() if idx != 0 and idx != 2 and idx !=3])
    trg_sent = ' '.join([fr_tokenizer.index_word[idx] for idx in target.numpy() if idx != 0 and idx != 2 and idx !=3])
    preds.append(pred_sent)
    targets.append([trg_sent])
    print(f"Actual correction    : {trg_sent}")
    print(f"Predicted correction : {pred_sent}\n")



Actual correction    : cependant l'exception est assujettie à certaines conditions
Predicted correction : cependant la exception faite à certaines conditions du sujet est indiquée à certaines conditions

Actual correction    : silva paula moreno aquaculture management in chile
Predicted correction : mesures sanitaires sur la gestion de l'aquaculture et de l’aquaculture du chili

Actual correction    : les fleurs présentaient la plus forte teneur en parthenolide et les tiges présentaient la plus faible teneur
Predicted correction : les fleurs affichaient les niveaux les plus élevés tout particulièrement les parties concernées

Actual correction    : série d'orientation annonces dans le processus de nomination commission de la fonction publique du canada
Predicted correction : série d'orientation les annonces publicitaires en anglais seulement commission des services

Actual correction    : en février la chambre des communes a voté contre la prorogation de l’application de ces mesures
Pr