In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow.keras as ks
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

In [2]:
data=pd.read_csv("data-english/data-english.csv")
print(data.shape)
data.head(10)

(175621, 2)


Unnamed: 0,English words/sentences,French words/sentences
0,Hi.,Salut!
1,Run!,Cours !
2,Run!,Courez !
3,Who?,Qui ?
4,Wow!,Ça alors !
5,Fire!,Au feu !
6,Help!,À l'aide !
7,Jump.,Saute.
8,Stop!,Ça suffit !
9,Stop!,Stop !


In [3]:
eng_data=data["English words/sentences"]
fren_data=data["French words/sentences"]

print(eng_data.shape)
print(fren_data.shape)

(175621,)
(175621,)


In [4]:
#Fonction pour la Tokenisation du texte
def tokenData(x):
    inst_token= Tokenizer()
    inst_token.fit_on_texts(x)
    return inst_token.texts_to_sequences(x), inst_token

#Fonction pour la pad des phrases plus petite
def padSentences(x):
    return pad_sequences(x, maxlen=55, padding='post')

In [5]:
textEn, textEnTok= tokenData(eng_data)
textFren, textFrenTok= tokenData(fren_data)

In [6]:
print(max([len(sentence) for sentence in textEn]))
print(max([len(sentence) for sentence in textFren]))

44
55


In [7]:
#test des donnees tokeniser
print(len(textEnTok.word_index))
print(len(textFrenTok.word_index))

14531
30660


In [8]:
dataTrain, dataTest = train_test_split(data, test_size=.2)
print(dataTrain.shape)
print(dataTest.shape)

(140496, 2)
(35125, 2)


In [9]:
#Pour les donnees d'entrainement FR en En
dataXTrain, dataXTrainTok = tokenData(dataTrain["English words/sentences"])
size_labelXTrain = len(dataXTrainTok.word_index)
dataYTrain, dataYTrainTok = tokenData(dataTrain["French words/sentences"])
size_labelYTrain = len(dataYTrainTok.word_index)
dataXTrain = padSentences(dataXTrain)
dataYTrain = padSentences(dataYTrain)
print("Train: ")
print(size_labelXTrain)
print(size_labelYTrain, "\n")

# Pour les donnees de test
dataXTest, dataXTestTok = tokenData(dataTest["English words/sentences"])
size_labelXTest = len(dataXTestTok.word_index)
dataXTest = padSentences(dataXTest)
dataYTest, dataYTestTok = tokenData(dataTest["French words/sentences"])
size_labelYTest = len(dataYTestTok.word_index)
dataYTest = padSentences(dataYTest)
print("Test: ")
print(size_labelXTest)
print(size_labelYTest)

Train: 
13528
28013 

Test: 
8167
14962


In [10]:
EnTrain= max([len(sentence) for sentence in dataYTrain])
FrTrain= max([len(sentence) for sentence in dataYTest])

In [11]:
# Construction du modele LSTM
from tensorflow.keras.layers import Embedding, GRU, TimeDistributed, Dense, RepeatVector, Bidirectional, Dropout 
from tensorflow.keras.optimizers import Adam
import tensorflow as tf
from tensorflow.keras.losses import sparse_categorical_crossentropy

def model_define(voca_trainEn, voca_trainFr, sen_trainEn):
    model = tf.keras.models.Sequential()
    model.add(Embedding(voca_trainEn, 256, input_length=sen_trainEn))
    model.add(Bidirectional(GRU(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(1024, activation="relu")))
    model.add(Dropout(.5))
    model.add(TimeDistributed(Dense(voca_trainFr, activation="softmax")))
    
    model.compile(loss=sparse_categorical_crossentropy, optimizer=Adam(0.003),
                 metrics=["accuracy"])
    return model

In [12]:
model = model_define(size_labelXTrain, size_labelYTrain, EnTrain)
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 55, 256)           3463168   
                                                                 
 bidirectional (Bidirectiona  (None, 55, 512)          789504    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 55, 1024)         525312    
 ibuted)                                                         
                                                                 
 dropout (Dropout)           (None, 55, 1024)          0         
                                                                 
 time_distributed_1 (TimeDis  (None, 55, 28013)        28713325  
 tributed)                                                       
                                                        

In [13]:
model_fit=model.fit(dataXTrain, dataYTrain, batch_size=64, epochs=5,
         validation_split=.2)

Epoch 1/5
   9/1757 [..............................] - ETA: 1:34:41 - loss: 4.8197 - accuracy: 0.7805

KeyboardInterrupt: 

In [None]:
loss= model_fit.history["loss"]
loss_val= model_fit.history["val_loss"]

plt.plot(loss, label="Loss")
plt.plot(loss_val, label="Loss_val")
plt.title("Loss rate")
plt.legend()