In [11]:
import pandas as pd
import numpy as np
import collections
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.models import Model, Sequential
from keras.layers import GRU, Input, Dense, TimeDistributed, Activation, RepeatVector, Bidirectional,LSTM, Dropout
from keras.layers import Embedding
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy
from keras.callbacks import ModelCheckpoint
from IPython.display import Markdown, display


In [12]:
total_sentences = 10000
dataset = pd.read_csv("/content/Eng_-portuguese.txt", sep='\t', nrows = total_sentences, names=["EN","PT", "Attribution"], header=None)
test_proportion = 0.1
train_test_threshold = int( (1-test_proportion) * total_sentences)

In [13]:
test_proportion = 0.1
train_test_threshold = int( (1-test_proportion) * total_sentences)

In [14]:
dataset.drop(labels = "Attribution", axis = 1,inplace = True)

In [30]:
dataset = dataset.sample(frac=1, random_state=1)
dataset.iloc[1:100]

Unnamed: 0,EN,PT
9880,Tom will speak.,Tom falará.
4104,I'm so sorry.,Eu sinto muito.
1257,Come along.,Venha conosco.
5109,Am I mistaken?,Estou errado?
4531,They'll call.,Eles chamarão.
...,...,...
1538,I'm 30 now.,Eu tenho 30 anos agora.
9244,Take your meds.,Tome os seus remédios.
2368,I guess not.,Eu acho que não.
361,Can I go?,Posso ir?


In [31]:
dataset.head()

Unnamed: 0,EN,PT
6749,This is basic.,Isso é básico.
9880,Tom will speak.,Tom falará.
4104,I'm so sorry.,Eu sinto muito.
1257,Come along.,Venha conosco.
5109,Am I mistaken?,Estou errado?


In [33]:
dataset.sort_index()

Unnamed: 0,EN,PT
0,Go.,Vai.
1,Go.,Vá.
2,Hi.,Oi.
3,Run!,Corre!
4,Run!,Corra!
...,...,...
9995,We didn't wait.,Não esperamos.
9996,We drank a lot.,Nós bebemos muito.
9997,We drank a lot.,A gente bebeu muito.
9998,We fell asleep.,Nós adormecemos.


In [34]:
english = dataset['EN']  #Separating the English and portuguese data
portuguese = dataset['PT']

In [35]:
english_words_counter = collections.Counter([word for sentence in english for word in sentence.split()])
portuguese_words_counter = collections.Counter([word for sentence in portuguese for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in english for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('20 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(20)))[0]) + '"')
print()
print('{} Portuguese words.'.format(len([word for sentence in portuguese for word in sentence.split()])))
print('{} unique French words.'.format(len(portuguese_words_counter)))
print('20 Most common words in the Portuguese dataset:')
print('"' + '" "'.join(list(zip(*portuguese_words_counter.most_common(20)))[0]) + '"')
#stats about the datasets

27930 English words.
3066 unique English words.
20 Most common words in the English dataset:
"I" "Tom" "I'm" "a" "is" "It's" "We" "was" "you" "it." "Tom." "me." "it" "They" "He" "Is" "It" "not" "you." "like"

29214 Portuguese words.
5443 unique French words.
20 Most common words in the Portuguese dataset:
"Eu" "Tom" "é" "está" "de" "o" "Não" "um" "Você" "não" "Estou" "Nós" "O" "É" "Tom." "Ele" "sou" "a" "Isso" "me"


In [110]:
def tokenize(x):  #breaking the raw text into small chunks
    
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer #These tokens help in understanding the context or developing the model for the NLP Natural Lenguage Processing

In [111]:
def pad(x, length=None): #We also have to pad each text as all the neural networks needs to have the inputs that should be in similar shape and size.
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen = 55, padding = 'post')
    

In [112]:
def preprocess(x, y): #The preprocess function will perform both of these tokenizing and padding tasks by incorporating the 2 previously defined functions.
   
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    # Keras's sparse_categorical_crossentropy function requires the labels to be in 3 dimensions
    preprocess_y = preprocess_y.reshape(*preprocess_y.shape, 1)

    return preprocess_x, preprocess_y, x_tk, y_tk

In [113]:
preproc_english_sentences, preproc_portuguese_sentences, english_tokenizer, portuguese_tokenizer = preprocess(english, portuguese)

In [114]:
max_english_sequence_length = preproc_english_sentences.shape[1]
max_portuguese_sequence_length = preproc_portuguese_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
portuguese_vocab_size = len(portuguese_tokenizer.word_index)

print("Max English sentence length:", max_english_sequence_length)
print("Max Portuguese sentence length:", max_portuguese_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("Portuguese vocabulary size:", portuguese_vocab_size)

Max English sentence length: 55
Max Portuguese sentence length: 55
English vocabulary size: 2114
Portuguese vocabulary size: 3571


In [115]:
def logits_to_text(logits, tokenizer): #We also need a function to convert the final prediction by our model into text form.
    
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ' '.join([index_to_words[prediction] for prediction in np.argmax(logits, 1)])

In [116]:
def bd_model(input_shape, output_sequence_length, english_vocab_size, portuguese_vocab_size):
    
    learning_rate = 0.003
    
    # Build the layers
    model = Sequential()
    model.add(Embedding(portuguese_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]))
    model.add(Bidirectional(GRU(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(english_vocab_size, activation='softmax'))) 

    # Compile model
    model.compile(loss=sparse_categorical_crossentropy, optimizer=Adam(learning_rate), metrics=['accuracy'])
    return model

In [117]:
tmp_x.shape

(10000, 55)

In [118]:
preproc_english_sentences.shape

(10000, 55)

In [119]:
tmp_x = pad(preproc_portuguese_sentences, preproc_portuguese_sentences.shape[1])
tmp_x = tmp_x.reshape((-1, preproc_portuguese_sentences.shape[-2]))

# Train 
model = bd_model(
    tmp_x.shape,
    preproc_english_sentences.shape[1],
    len(english_tokenizer.word_index)+1,
    len(portuguese_tokenizer.word_index)+1)

model.summary()

model.fit(tmp_x, preproc_english_sentences, batch_size=64, epochs=2, validation_split=0.2)

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_3 (Embedding)     (None, 55, 256)           914432    
                                                                 
 bidirectional_3 (Bidirectio  (None, 55, 512)          789504    
 nal)                                                            
                                                                 
 time_distributed_6 (TimeDis  (None, 55, 1024)         525312    
 tributed)                                                       
                                                                 
 dropout_3 (Dropout)         (None, 55, 1024)          0         
                                                                 
 time_distributed_7 (TimeDis  (None, 55, 2115)         2167875   
 tributed)                                                       
                                                      

<keras.callbacks.History at 0x7f6bbf98a610>

In [120]:
  i=1004


  print("Prediction:")
  print(logits_to_text(model.predict(tmp_x[[i]])[0], english_tokenizer))

  print("\nCorrect Translation:")
  print(english[i])

  print("\nOriginal text:")
  print(portuguese[i])


Prediction:
<PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD> <PAD>

Correct Translation:
Nice shot!

Original text:
Belo tiro!
