In [15]:
import collections

import helper
import numpy as np
import pandas as pd
import os
import csv
import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional, SimpleRNN, LSTM
from keras.layers.embeddings import Embedding
from keras.losses import sparse_categorical_crossentropy

from tensorflow.keras.optimizers import Adam

In [16]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [17]:
from tensorflow.python.client import device_lib
#print(device_lib.list_local_devices())
device_lib.list_local_devices()

[name: "/device:CPU:0"
 device_type: "CPU"
 memory_limit: 268435456
 locality {
 }
 incarnation: 10373894186266952967
 xla_global_id: -1, name: "/device:GPU:0"
 device_type: "GPU"
 memory_limit: 14465892352
 locality {
   bus_id: 1
   links {
   }
 }
 incarnation: 16651275156050919117
 physical_device_desc: "device: 0, name: Tesla T4, pci bus id: 0000:00:04.0, compute capability: 7.5"
 xla_global_id: 416903419]

#### Data source 
https://www.kaggle.com/dhruvildave/en-fr-translation-dataset <br>
The data was separated into two.<br>
Only a portion of the data was used.

In [18]:
english_path = '/content/drive/MyDrive/data/training/small_vocab_en'
french_path = '/content/drive/MyDrive/data/training/small_vocab_fr'

In [19]:
input_file = os.path.join(english_path)
with open(input_file, "r") as f:
    data = f.read()
english_sentences = data.split('\n')

input_file = os.path.join(french_path)
with open(input_file, "r") as f:
    data = f.read()
french_sentences = data.split('\n')

In [20]:
 for i in range(5):
    print(f'English: -> {english_sentences[i]}')
    print(f'French : -> {french_sentences[i]}' )
    print('-----------------------------------')

English: -> new jersey is sometimes quiet during autumn , and it is snowy in april .
French : -> new jersey est parfois calme pendant l' automne , et il est neigeux en avril .
-----------------------------------
English: -> the united states is usually chilly during july , and it is usually freezing in november .
French : -> les états-unis est généralement froid en juillet , et il gèle habituellement en novembre .
-----------------------------------
English: -> california is usually quiet during march , and it is usually hot in june .
French : -> california est généralement calme en mars , et il est généralement chaud en juin .
-----------------------------------
English: -> the united states is sometimes mild during june , and it is cold in september .
French : -> les états-unis est parfois légère en juin , et il fait froid en septembre .
-----------------------------------
English: -> your least liked fruit is the grape , but my least liked is the apple .
French : -> votre moins aimé

In [21]:
print(f'Number of English words {len([i for sentence in english_sentences for i in sentence.split()])}')
print(f'Number of French words {len([i for sentence in french_sentences for i in sentence.split()])}')

Number of English words 1823250
Number of French words 1961295


In [22]:
english_words_counter = collections.Counter([i for sentence in english_sentences for i in sentence.split()])
french_words_counter = collections.Counter([i for sentence in french_sentences for i in sentence.split()])
print(f'Number of UNIQUE English words {english_words_counter}')
print(f'Number of UNIQUE English words {french_words_counter}')

Number of UNIQUE English words Counter({'is': 205858, ',': 140897, '.': 129039, 'in': 75525, 'it': 75137, 'during': 74933, 'the': 67628, 'but': 63987, 'and': 59850, 'sometimes': 37746, 'usually': 37507, 'never': 37500, 'least': 27564, 'favorite': 27371, 'fruit': 27105, 'most': 14934, 'loved': 13666, 'liked': 13546, 'new': 12197, 'paris': 11334, 'india': 11277, 'united': 11270, 'states': 11270, 'california': 11250, 'jersey': 11225, 'france': 11170, 'china': 10953, 'he': 10786, 'she': 10786, 'grapefruit': 10118, 'your': 9734, 'my': 9700, 'his': 9700, 'her': 9700, 'fall': 9134, 'june': 9133, 'spring': 9102, 'january': 9090, 'winter': 9038, 'march': 9023, 'autumn': 9004, 'may': 8995, 'nice': 8984, 'september': 8958, 'july': 8956, 'april': 8954, 'november': 8951, 'summer': 8948, 'december': 8945, 'february': 8942, 'our': 8932, 'their': 8932, 'freezing': 8928, 'pleasant': 8916, 'beautiful': 8915, 'october': 8910, 'snowy': 8898, 'warm': 8890, 'cold': 8878, 'wonderful': 8808, 'dry': 8794, 'bus

In [23]:
print(f'15 Most common words in English:{list(zip(*english_words_counter.most_common(15)))[0]}')
print(f'15 Most common words in French:{list(zip(*french_words_counter.most_common(15)))[0]}')

15 Most common words in English:('is', ',', '.', 'in', 'it', 'during', 'the', 'but', 'and', 'sometimes', 'usually', 'never', 'least', 'favorite', 'fruit')
15 Most common words in French:('est', '.', ',', 'en', 'il', 'les', 'mais', 'et', 'la', 'parfois', 'jamais', 'le', "l'", 'généralement', 'moins')


In [24]:
del english_words_counter
del french_words_counter

In [25]:
def tokenize(text):
    x_tk = Tokenizer()
    x_tk.fit_on_texts(text)
    return x_tk.texts_to_sequences(text), x_tk

text_sentences = [
    'There was a monkey on the tree',
    'The banana was on the ground.',
    'sky is blue, SKY.']

text_tokenized, text_tokenizer = tokenize(text_sentences)
print(f'Tonkenizer ids  :{text_tokenized}\n(token_id, word):{text_tokenizer.word_index}')

Tonkenizer ids  :[[5, 2, 6, 7, 3, 1, 8], [1, 9, 2, 3, 1, 10], [4, 11, 12, 4]]
(token_id, word):{'the': 1, 'was': 2, 'on': 3, 'sky': 4, 'there': 5, 'a': 6, 'monkey': 7, 'tree': 8, 'banana': 9, 'ground': 10, 'is': 11, 'blue': 12}


In [26]:
def padding(texts, max_length = None):
    if max_length is None:
        max_length = max([len(i) for i in texts])
    return pad_sequences(texts,  padding="post", maxlen=max_length)

padding_test = padding(text_tokenized)
print('Map:',text_tokenizer.word_index)
for i, (token, pad) in enumerate(zip(text_tokenized, padding_test)):
    print('-------')
    print(f'Text:   {text_sentences[i]} - LENGTH:{len(text_sentences[i].split())}')
    print(f'Input:  {np.array(token)} - LENGTH:{len(np.array(token))}')
    print(f'Output: {pad} - LENGTH:{len(pad)}')
    print(f'padding length: {len(pad)-len(np.array(token))}')

del text_sentences
del padding_test
del text_tokenized
del text_tokenizer

Map: {'the': 1, 'was': 2, 'on': 3, 'sky': 4, 'there': 5, 'a': 6, 'monkey': 7, 'tree': 8, 'banana': 9, 'ground': 10, 'is': 11, 'blue': 12}
-------
Text:   There was a monkey on the tree - LENGTH:7
Input:  [5 2 6 7 3 1 8] - LENGTH:7
Output: [5 2 6 7 3 1 8] - LENGTH:7
padding length: 0
-------
Text:   The banana was on the ground. - LENGTH:6
Input:  [ 1  9  2  3  1 10] - LENGTH:6
Output: [ 1  9  2  3  1 10  0] - LENGTH:7
padding length: 1
-------
Text:   sky is blue, SKY. - LENGTH:4
Input:  [ 4 11 12  4] - LENGTH:4
Output: [ 4 11 12  4  0  0  0] - LENGTH:7
padding length: 3


In [27]:
def preprocess(Feature, Label):
    preprocess_Feature, Feature_tk = tokenize(Feature)
    preprocess_Label, Label_tk = tokenize(Label)

    preprocess_Feature = padding(preprocess_Feature)
    preprocess_Label = padding(preprocess_Label)

    preprocess_Label = preprocess_Label.reshape(*preprocess_Label.shape, 1)

    return preprocess_Feature, preprocess_Label, Feature_tk, Label_tk

preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer = preprocess(english_sentences, french_sentences)

In [28]:
def ids_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [29]:
max_english_sequence_length = preproc_english_sentences.shape[1]
max_french_sequence_length = preproc_french_sentences.shape[1]

english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

In [30]:
print('Preprocessed Data summary')
print(f"Max English sentence:{max_english_sequence_length}")
print(f"Max French sentence:{max_french_sequence_length}")
print(f"English vocab size:{english_vocab_size}")
print(f"French vocab size:{french_vocab_size}")


Preprocessed Data summary
Max English sentence:15
Max French sentence:21
English vocab size:199
French vocab size:344


In [33]:
def MODEL(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    
    learning_rate = 1e-3
    
    model = Sequential()
    model.add(Embedding(input_dim=english_vocab_size + 1, output_dim=64, input_length=output_sequence_length, input_shape=input_shape[1:]))
    model.add(Bidirectional(LSTM(256, return_sequences=True)))
    model.add(Dense(french_vocab_size, activation='softmax'))

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(learning_rate),
                  metrics=['accuracy'])
    
    return model

In [36]:
ENGLISH = padding(preproc_english_sentences, max_french_sequence_length)
trained_model_ = MODEL( ENGLISH.shape, max_french_sequence_length, english_vocab_size, french_vocab_size)
trained_model_.fit(ENGLISH, preproc_french_sentences, batch_size=32, epochs=10, validation_split=0.2)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7f22f2fc7a50>

In [37]:
index = 54321
print("ORIGINAL English: --> ", english_sentences[:index][-1])
print('--- --- ---')
print("MODEL to French:  --> ", ids_to_text(trained_model_.predict(ENGLISH[index-1:index])[0], french_tokenizer).replace('<PAD>',''))
print("GOOGLE to French: -->  la france est pluvieuse en automne , et il n'y a jamais de monde en décembre")
print('--- --- ---')
print('MODEL FRENCH to English using google  --> France is rainy in the fall and it is never busy in December')
print('GOOGLE FRENCH to English using google --> France is rainy in autumn, and there are never many people in December')

ORIGINAL English: -->  france is rainy during fall , and it is never busy in december .
--- --- ---
MODEL to French:  -->  france est pluvieux pendant l' automne et il est jamais occupé en décembre        
GOOGLE to French: -->  la france est pluvieuse en automne , et il n'y a jamais de monde en décembre
--- --- ---
MODEL FRENCH to English using google  --> France is rainy in the fall and it is never busy in December
GOOGLE FRENCH to English using google --> France is rainy in autumn, and there are never many people in December
