In [0]:
import os

import numpy as np
#np.random.seed(int(np.pi*10**5)) #

from keras.models import Sequential, Model
from keras.layers import Dense, GRU, LSTM
from keras.layers import Input, TimeDistributed, Embedding, RepeatVector, Lambda, Bidirectional
from keras.layers import Flatten, Reshape, Permute, Activation
from keras.layers import Dot, Concatenate, Multiply
from keras.layers import merge
from keras.callbacks import EarlyStopping
from keras import backend as K

from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer

#nltk.download()

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

import sys  

reload(sys)  
sys.setdefaultencoding('latin-1')

In [0]:
# Install the PyDrive wrapper & import libraries.
# This only needs to be done once per notebook.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from google.colab import files
from oauth2client.client import GoogleCredentials


# Authenticate and create the PyDrive client.
# This only needs to be done once per notebook.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download a file based on its file ID.
#
# Search query reference:
# https://developers.google.com/drive/v2/web/search-parameters
listed = drive.ListFile({'q': "title contains '.txt' and 'root' in parents"}).GetList()
for file in listed:
  #py_file = list(file['title'])
  print('title {}, id {}'.format(file['title'], file['id']))


  

In [0]:
src_word_chunker = lambda sent: list(sent)
trg_word_chunker = lambda sent: list(sent)

source, target = [], []
src_vocab, trg_vocab = set({}), set({})
src_max_len = 1
trg_max_len = 1

file_id = '18BPogdqyco7sV8cdMtqg227ywKdYS7gG'
downloaded = drive.CreateFile({'id': file_id})
downloaded.GetContentFile('fra_orig.txt') 


with open('fra_orig.txt', 'r') as sythtree:
    for line in sythtree:
        src_sent_raw, trg_sent_raw = line.split('\t')
        
        trg_sent_raw = trg_sent_raw.replace('\n','')
        trg_sent_raw = 'BOS ' + str(trg_sent_raw) + ' EOS'
        
        src_words = word_tokenize(src_sent_raw)
        trg_words = word_tokenize(trg_sent_raw)
        
        # update vocabulary with new words
        src_vocab.update(src_words)
        trg_vocab.update(trg_words)
        
        #src_sent = src_word_chunker(src_sent_raw)
        #trg_sent = trg_word_chunker(trg_sent_raw)

        # add to the list (chunked)
        source.append(src_sent_raw)
        target.append(trg_sent_raw)
        
        src_max_len   = src_max_len if src_max_len > len(src_words) else len(src_words)
        trg_max_len   = trg_max_len if trg_max_len > len(trg_words) else len(trg_words)

print('source: ', source[0:5])
print('target:: ', target[0:5])

# the first words is the padding sign:
src_vocab = list(src_vocab)
trg_vocab = ['BOS'] + list(trg_vocab) + ['EOS']

src_vocab_size = len(src_vocab)
trg_vocab_size = len(trg_vocab)

##print(src_vocab_size)
#print(trg_vocab_size)


## convert words to word_id 
#source_sents = [ 
#    [src_vocab.index(w) for w in sent]
#    for sent in source
#]
#
#target_sents = [ 
#    [trg_vocab.index(w) for w in sent]
#    for sent in target
#]

tok_src = Tokenizer(num_words=5000)
tok_trg = Tokenizer(num_words=5000)

tok_src.fit_on_texts(source)
source_sents = tok_src.texts_to_sequences(source)

tok_trg.fit_on_texts(target)
target_sents = tok_trg.texts_to_sequences(target)


print("source sents:: ", source_sents[0:7])
print("target sents:: ", target_sents[0:7])

src_max_len = tok_src.num_words
trg_max_len = tok_trg.num_words

print("source max size:", src_max_len)
print("target max size:", trg_max_len)


In [0]:

# fixed length of source and target inputs after padding
T_x = src_max_len
T_y = trg_max_len + 1

# padded sentences
# https://keras.io/preprocessing/sequence/
source_pp = sequence.pad_sequences(source_sents, maxlen=T_x)
target_pp = sequence.pad_sequences(sequence.pad_sequences(target_sents, maxlen=T_y-1), maxlen=T_y)
trg_end_padding = np.pad(target_pp[:,1:], [(0,0), (0,1)], 'constant', constant_values=0)

print("source pp:", source_pp)
print("target pp:", target_pp)

print("source vocab size:", src_vocab_size)
print("target vocab size:", trg_vocab_size)

print('source shape:', source_pp.shape)
print('target shape:', target_pp.shape)

print(trg_end_padding)

def one_hot_initializer(shape, dtype=None):
    """Keras friendly initialization for one-hot encodedings as embedings"""
    output = K.eye(shape[0], dtype=dtype)
    output = K.concatenate([K.zeros_like(output[:1,:]), output[1:,:]], 0)
    output = K.concatenate([K.zeros_like(output[:,:1]), output[:,1:]], 1)
    return output

def sequential_layer_composition(input_tensor, layers):
    """This function takes a list of layers similar to Sequential but it is designed to work for Model in Keras"""
    if len(layers) > 1:
        return sequential_layer_composition(layers[0](input_tensor), layers[1:])
    else:
        return layers[0](input_tensor)
    


In [0]:

encoder_unit_size = 32
encoder_embedding_size = 8
#encoder_embedding_size = src_vocab_size

print('T_x::', T_x)
print('T_y::', T_y)

# the length of source and target inputs can be different but I chose the same max_len=10
src_input = Input(shape=(T_x,))
trg_input = Input(shape=(T_y,)) 

source_embeddings = Embedding(
    src_vocab_size,
    encoder_embedding_size,
    input_shape=(T_x,),
    #embeddings_initializer=one_hot_initializer,
    #trainable=False,
)

print(source_embeddings)

encoder_model = Sequential([
    source_embeddings, 
    Bidirectional(LSTM(int(encoder_unit_size/2), return_sequences=True)),
    Bidirectional(LSTM(int(encoder_unit_size/2), return_sequences=True)),
])

encoder_output = encoder_model(src_input)

print(encoder_model.layers[0].input)
print(encoder_model.layers[0].output)
print(encoder_model.layers[1].output)



T_x:: 5000
T_y:: 5001
<keras.layers.embeddings.Embedding object at 0x0000026707723B38>
Tensor("embedding_27_input:0", shape=(?, 5000), dtype=float32)
Tensor("embedding_27/Gather:0", shape=(?, 5000, 8), dtype=float32)
Tensor("bidirectional_29/concat:0", shape=(?, ?, 32), dtype=float32)


In [0]:
decoder_unit_size = 32
decoder_embedding_size = 8
#decoder_embedding_size = trg_vocab_size

# Understanding the following code require both understanding the desgin, some math behind it, and Keras tricks.
m = T_x # source lenth
n = T_y # target lenth
d1 = encoder_unit_size
d2 = decoder_unit_size

print('n, d:: ', n,d2)
# concatenate all H = (h_i) to all S = (s_j) ===> H_S = [s_(j-1) h_i] (S lag one phase behind)
S          = Input((n, d2,))
S_shift    = Lambda(lambda x: K.concatenate([K.zeros_like(K.expand_dims(x[:,0], 1)), x[:,:-1]], 1))(S)
S_flat     = Flatten()(S_shift)
S_flat_rep = RepeatVector(m)(S_flat)
S_rep_n    = Reshape((m, n, d2))(S_flat_rep)
# (m, n, d2,)

print('m, d1:: ', m,d1)

H            = Input((m, d1,))
H_flat       = Flatten()(H)
H_flat_rep   = RepeatVector(n)(H_flat)
H_flat_rep_  = Reshape((n, m, d1))(H_flat_rep)
H_rep_m      = Permute((2,1,3))(H_flat_rep_) 
# (m, n, d1,)

# concatenate everything with everything:
S_H_     = Concatenate(-1)([S_rep_n, H_rep_m]) 
# (m, n, d1+d2)
S_H_flat = Flatten()(S_H_)
S_H      = Reshape((m*n, (d1+d2)))(S_H_flat) 
# (m*n, (d1+d2),)

# make the e_ji
E_T_1 = TimeDistributed(Dense(d1+d2, activation='tanh'))(S_H)
E_T_  = TimeDistributed(Dense(1, activation='linear'))(E_T_1)
E_T   = Reshape((m, n))(E_T_) 
E     = Permute((2,1))(E_T) # E = {E_j} = {{ e_{ji} }} 
# (n, m,)

# the alignemtns
alpha = TimeDistributed(Activation('softmax'))(E) # alpha_j = softmax(E_j}
# (n, m,)

C     = Dot((2,1))([alpha, H])
# (n, d1,)

attention_model = Model([S, H], C)


target_embeddings = Embedding(
    trg_vocab_size,
    decoder_embedding_size,
    input_shape=(T_y,),
    #embeddings_initializer=one_hot_initializer,
    #trainable=False,
)
decoder_rnn = LSTM(decoder_unit_size, return_sequences=True, input_shape=(T_y, decoder_embedding_size+encoder_unit_size))

decoder_model = Model(
    [src_input, trg_input],
    sequential_layer_composition(trg_input, [
        target_embeddings, 
        decoder_rnn, 
        lambda S: Concatenate(2)([S, attention_model([S, encoder_output])]), 
        TimeDistributed(Dense(trg_vocab_size, activation='softmax')),
    ])
)


alignments_model = Model(
    [src_input, trg_input],
    sequential_layer_composition(trg_input, [
        target_embeddings,
        decoder_rnn, 
        lambda x: Model([S, H], alpha)([x, encoder_output]),
    ])
)


print(decoder_model.summary())
print(alignments_model.summary())


# input of the encoder-decoder model is a list of two inputs: source, target
encoder_decoder = Model([src_input, trg_input], decoder_model([src_input, trg_input]))

encoder_decoder.compile('adam', 'categorical_crossentropy')

print(encoder_decoder.summary())



n, d::  5001 32
m, d1::  5000 32
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_54 (InputLayer)           (None, 5001)         0                                            
__________________________________________________________________________________________________
embedding_28 (Embedding)        (None, 5001, 8)      238784      input_54[0][0]                   
__________________________________________________________________________________________________
input_53 (InputLayer)           (None, 5000)         0                                            
__________________________________________________________________________________________________
lstm_39 (LSTM)                  (None, 5001, 32)     5248        embedding_28[0][0]               
____________________________________________________________________________

In [0]:
print(trg_vocab_size)
print(trg_end_padding)
print(trg_end_padding.shape)

y=np.eye(trg_max_len)[trg_end_padding]
print(y)


29848
[[   0    0    0 ...    1    2    0]
 [   0    0    0 ...  119    2    0]
 [   0    0    0 ...  591    2    0]
 ...
 [   0    0    0 ...  289    2    0]
 [   0    0    0 ... 2223    2    0]
 [   0    0    0 ... 1026    2    0]]
(145438, 5001)


MemoryError: 

In [0]:
encoder_decoder.fit(
    x          = [source_pp, target_pp],
    y          = np.eye(trg_max_len)[trg_end_padding],
    batch_size = 32,
    epochs     = 3,
    validation_split=0.2,
    callbacks=[EarlyStopping(patience=10)],
)

MemoryError: 