In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model, Sequential
from tensorflow.keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional,LSTM, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import sparse_categorical_crossentropy
from tensorflow.keras.callbacks import ModelCheckpoint
import nltk



In [2]:
## Loading and processing data
# Read the training and test splits of the English-French corpus from the Kaggle input directory.
train_csv = "/kaggle/input/wec-translationen-fr/nlp_intel_train.csv"
test_csv = "/kaggle/input/wec-translationen-fr/nlp_intel_test.csv"
eng_fr, eng_fr_test = (pd.read_csv(csv_path) for csv_path in (train_csv, test_csv))

In [3]:
# Lower-case the English column of the training frame.
english_train_lower = eng_fr['en'].str.lower()
eng_fr['en'] = english_train_lower

In [4]:
# Lower-case the French column of the training frame.
french_train_lower = eng_fr['fr'].str.lower()
eng_fr['fr'] = french_train_lower

In [5]:
# Lower-case the English column of the test frame.
english_test_lower = eng_fr_test['en'].str.lower()
eng_fr_test['en'] = english_test_lower

In [6]:
# Lower-case the French column of the test frame.
french_test_lower = eng_fr_test['fr'].str.lower()
eng_fr_test['fr'] = french_test_lower


In [7]:
# Discard every row that has a missing value in any column.
# (axis=0, how="any", subset=None, inplace=False are the pandas defaults.)
eng_fr = eng_fr.dropna()
eng_fr_test = eng_fr_test.dropna()

In [8]:
# Preview the lower-cased, NaN-free training frame (the rendering elides the middle rows).
eng_fr

Unnamed: 0.1,Unnamed: 0,en,fr
0,1000,"in 1981, he founded the astronomy club of rimo...","en 1981, il fonde le club d'astronomie de rimo..."
1,1001,the club was very active and they twice organi...,le club est très actif et organise à deux occa...
2,1002,"in 1983, lemay initiated the first joint meeti...","en 1983, il est l'instigateur à québec du cong..."
3,1003,"the conference took place in quebec city, and ...",le congrès est un franc succès et regroupe pas...
4,1004,"from 1990 to 1992, he was the national preside...","de 1990 à 1992, il est président national de l..."
...,...,...,...
17995,18995,imports of shrimp and prawn recorded also a sh...,"en 2001, une forte baisse des importations jap..."
17996,18996,the volume of import decreased by 16.3% from 9...,"en effet, entre 2000 et 2001, le volume des im..."
17997,18997,the market for northern shrimp (pandalus borea...,"de plus, le marché mondial des crevettes nordi..."
17998,18998,imports of molluscs (almost 100% of this being...,"entre 2000 et 2001, les importations de mollus..."


In [9]:
##Tokenizer and padding

def tokenize(data):
    """Fit a new Keras ``Tokenizer`` on ``data`` and return it.

    ``data`` is handed unchanged to ``Tokenizer.fit_on_texts``; the tokenizer
    is constructed with its default settings.
    """
    fitted_tokenizer = Tokenizer()
    fitted_tokenizer.fit_on_texts(data)
    return fitted_tokenizer
def training_sequences(tokenizer, m_length, data):
    """Encode ``data`` as integer sequences and pad/truncate them at the end.

    Args:
        tokenizer: fitted tokenizer exposing ``texts_to_sequences``.
        m_length: value forwarded as ``maxlen`` to ``pad_sequences``.
        data: texts to encode.

    Returns:
        The array produced by ``pad_sequences`` with post-padding and
        post-truncation.
    """
    return pad_sequences(
        tokenizer.texts_to_sequences(data),
        maxlen=m_length,
        padding='post',
        truncating='post',
    )


In [10]:
def preprocess(x, y):
    """Tokenize and pad the source texts ``x`` and the target texts ``y``.

    Returns:
        A tuple ``(padded_x, padded_y, x_tokenizer, y_tokenizer)`` where
        ``padded_y`` carries an extra trailing axis of size 1.
    """
    # Each side of the corpus gets its own tokenizer.
    x_tk, y_tk = tokenize(x), tokenize(y)

    # maxlen=None: no explicit length is imposed by this function.
    preprocess_x = training_sequences(x_tk, None, x)
    padded_y = training_sequences(y_tk, None, y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = np.expand_dims(padded_y, axis=-1)

    return preprocess_x, preprocess_y, x_tk, y_tk

In [11]:
# Run the tokenize-and-pad pipeline on the English/French training columns.
english_corpus = eng_fr["en"].tolist()
french_corpus = eng_fr["fr"].tolist()
(
    preproc_english_sentences,
    preproc_french_sentences,
    english_tokenizer,
    french_tokenizer,
) = preprocess(english_corpus, french_corpus)

In [12]:
# First encoded English sentence; the trailing zeros are the post-padding.
preproc_english_sentences[0]

array([    4,  3430,   241,  5895,     1,   904,  2716,     2, 11594,
           4,  1306,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [13]:
# Padded sequence lengths (axis 1) and vocabulary sizes of the two corpora.
# The vocabulary sizes count word_index entries only, i.e. they exclude index 0.
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

for stat_label, stat_value in (
    ("Max English sentence length:", max_english_sequence_length),
    ("Max French sentence length:", max_french_sequence_length),
    ("English vocabulary size:", english_vocab_size),
    ("French vocabulary size:", french_vocab_size),
):
    print(stat_label, stat_value)

Max English sentence length: 407
Max French sentence length: 453
English vocabulary size: 21789
French vocabulary size: 27712


In [14]:
# Final output function
def logits_to_text(logits, tokenizer):
    """Decode a 2-D score matrix into a space-joined string of words.

    Args:
        logits: array whose axis 1 holds per-token scores; the arg-max along
            that axis selects one vocabulary index per row.
        tokenizer: object exposing ``word_index`` (word -> integer index).

    Returns:
        The selected words joined by single spaces. Index 0 is rendered as a
        single space character.
    """
    word_index = tokenizer.word_index
    # Invert word -> index into index -> word.
    vocabulary = dict(zip(word_index.values(), word_index.keys()))
    vocabulary[0] = ' '

    best_indices = np.argmax(logits, 1)
    decoded_words = [vocabulary[token_index] for token_index in best_indices]
    return ' '.join(decoded_words)

In [15]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, french_vocab_size,
             learning_rate=0.001, embedding_dim=256, gru_units=256,
             dense_units=1024, dropout_rate=0.5):
    """Build and compile the Embedding -> GRU -> Dense translation model.

    Args:
        input_shape: shape of the padded input array; ``input_shape[1]`` is
            used as the sequence length.
        output_sequence_length: accepted for interface compatibility but not
            used by the model; the output has the same number of timesteps
            as the input.
        english_vocab_size: input dimension of the embedding layer.
        french_vocab_size: number of units of the final softmax layer.
        learning_rate: learning rate passed to ``Adam``.
        embedding_dim: size of the embedding vectors.
        gru_units: number of units in the GRU layer.
        dense_units: width of the intermediate ReLU layer.
        dropout_rate: dropout rate applied before the softmax layer.

    Returns:
        A compiled ``Sequential`` model using sparse categorical
        cross-entropy loss and the accuracy metric.

    Notes:
        Despite the function's name, the GRU is not wrapped in
        ``Bidirectional``. The embedding is created without ``mask_zero``,
        so padded timesteps presumably count towards the loss and the
        reported accuracy -- NOTE(review): confirm whether that is intended.
    """
    # Build the layers
    model = Sequential()
    model.add(Embedding(english_vocab_size, embedding_dim, input_length=input_shape[1], input_shape=input_shape[1:]))
    # return_sequences=True keeps one output per input timestep.
    model.add(GRU(gru_units, return_sequences=True))
    model.add(TimeDistributed(Dense(dense_units, activation='relu')))
    model.add(Dropout(dropout_rate))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    # Compile model
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    return model

In [16]:
# Padded French sequence length; the English input is padded to this length before training.
preproc_french_sentences.shape[1]

453

In [17]:
# Pad the encoded English sentences to the French sequence length so that the
# model input and the labels share the same number of timesteps.
french_sequence_length = preproc_french_sentences.shape[1]
tmp_x = pad_sequences(
    preproc_english_sentences,
    maxlen=french_sequence_length,
    padding='post',
    truncating='post',
)
tmp_x[0]

array([    4,  3430,   241,  5895,     1,   904,  2716,     2, 11594,
           4,  1306,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,

In [18]:
# Ensure the input is 2-D with one column per French timestep.
# (tmp_x was already padded to that width, so this leaves its shape unchanged.)
tmp_x = tmp_x.reshape((-1, preproc_french_sentences.shape[-2]))

# Train
# The output sequence length is the French (target) length, not the English one.
# The vocabulary sizes get +1 because index 0 is not part of word_index but is
# used as the padding value.
model = bd_model(
    tmp_x.shape,
    preproc_french_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(french_tokenizer.word_index)+1)

model.summary()

# validation_split=0.2 holds out 20% of the training rows for validation.
history = model.fit(tmp_x, preproc_french_sentences, batch_size=32, epochs=60, validation_split=0.2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 453, 256)          5578240   
                                                                 
 gru (GRU)                   (None, 453, 256)          394752    
                                                                 
 time_distributed (TimeDistr  (None, 453, 1024)        263168    
 ibuted)                                                         
                                                                 
 dropout (Dropout)           (None, 453, 1024)         0         
                                                                 
 time_distributed_1 (TimeDis  (None, 453, 27713)       28405825  
 tributed)                                                       
                                                                 
Total params: 34,641,985
Trainable params: 34,641,985
No

In [19]:
# Persist the trained network to an .h5 file in the working directory.
saved_model_path = 'translation_model.h5'
model.save(saved_model_path)

In [20]:
import pickle

# Persist both fitted tokenizers so the same vocabularies can be reloaded
# alongside the saved model.
for pickle_path, fitted_tokenizer in (
    ('eng_tokenizer.pickle', english_tokenizer),
    ('fr_tokenizer.pickle', french_tokenizer),
):
    with open(pickle_path, 'wb') as handle:
        pickle.dump(fitted_tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
    

In [21]:
# Index of the sentence to inspect.
# NOTE: this row comes from the training data, so the score below is not a held-out estimate.
i = 1

# tmp_x[[i]] keeps the batch axis; [0] takes the single prediction of that batch.
prediction = [logits_to_text(model.predict(tmp_x[[i]])[0], french_tokenizer)]
correct_translation = eng_fr["fr"].tolist()[i]
# ''.join() over a str returns the same str, so the reference text is used as is.
joined_text = correct_translation
# Tokenise the reference with the tokenizer's own filter/lower/split settings.
# A plain whitespace split would keep punctuation glued to words ("1981,"),
# which the model can never emit because the Tokenizer filters it out, and
# that would deflate the BLEU score.
correct_list = keras.preprocessing.text.text_to_word_sequence(
    joined_text,
    filters=french_tokenizer.filters,
    lower=french_tokenizer.lower,
    split=french_tokenizer.split,
)

print("Prediction:")
print(prediction[0])
print("\nCorrect Translation:")
print(correct_translation)
print("\nOriginal text:")
print(eng_fr["en"].tolist()[i])
print("\n\n")


# .split() drops the blanks that stand in for padding in the decoded prediction.
BLEUscore = nltk.translate.bleu_score.sentence_bleu([correct_list], prediction[0].split())
print("BLEU SCORE:\n")
print(BLEUscore)

Prediction:
les club est très actif et organise à deux occasions de 1990 et le le congrès annuel de la fédération de astronomes amateurs québec québec                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  