In [39]:
import keras
from keras.preprocessing.text import Tokenizer
from keras import Input, Model
from keras.layers import Dense, LSTM, Embedding
from keras.preprocessing.sequence import pad_sequences

from AttentionLayer import Attention

import pandas as pd
import json

In [None]:
with open('Prime_Pantry_5.json') as file:
    reviews = json.load(file)

In [None]:
import gzip

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Prime_Pantry_5.json.gz')   

In [26]:
df = df[['reviewText', 'overall']].dropna(inplace=False)

In [27]:
X = df['reviewText']
y = df['overall']

In [30]:
t = Tokenizer()
t.fit_on_texts(X)
text_matrix = t.texts_to_sequences(X)

In [53]:
sent_length = max([len(sent) for sent in text_matrix])
vocab_length = len(set([word for sent in text_matrix for word in sent]))

In [40]:
text_pad = pad_sequences(text_matrix, maxlen=sent_length, padding='post')

In [55]:
inputs=Input((sent_length,))
x=Embedding(input_dim=vocab_length*2,output_dim=32,input_length=sent_length)(inputs)
att_in=LSTM(16,return_sequences=True,dropout=0.3,recurrent_dropout=0.2)(x)
att_out=Attention()(att_in)
outputs=Dense(1,activation='softmax',trainable=True)(att_out)
model=Model(inputs,outputs)
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_9 (InputLayer)         (None, 2221)              0         
_________________________________________________________________
embedding_8 (Embedding)      (None, 2221, 32)          1958272   
_________________________________________________________________
lstm_6 (LSTM)                (None, 2221, 16)          3136      
_________________________________________________________________
attention_6 (Attention)      (None, 16)                2237      
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 17        
Total params: 1,963,662
Trainable params: 1,963,662
Non-trainable params: 0
_________________________________________________________________


In [56]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
model.fit(text_pad, y, batch_size=100, epochs=10,verbose=1, shuffle=True, validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 110088 samples, validate on 27523 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10

KeyboardInterrupt: 

In [None]:
text_matrix

In [None]:
vocab_length

In [None]:
# MODEL FUNCTION

def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a model that incorporates embedding, encoder-decoder, and bidirectional RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    # TODO: Implement
    model = Sequential()
    
    model.add(
              Embedding(input_dim=english_vocab_size*2,
              output_dim=256,
              input_length=input_shape[1]))
    
    model.add(Bidirectional(LSTM(256)))
    # Add repeatvector to fix problem with vectors shape
    model.add(RepeatVector(output_sequence_length))
    model.add(Dropout(0.5))

    model.add(Bidirectional(LSTM(256,return_sequences=True)))
    model.add(TimeDistributed(Dense(512, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(0.002),
                  metrics=['accuracy'])
    return model

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(0.002),
                  metrics=['accuracy'])
    
    return model


def final_predictions(x, y, x_tk, y_tk):
    """
    Gets predictions using the final model
    x: Preprocessed English data
    y: Preprocessed French data
    x_tk: English tokenizer
    y_tk: French tokenizer
    """
    model = model_final(x.shape,y.shape[1], len(x_tk.word_index)+1, len(y_tk.word_index)+1)
    
    model.fit(x, y, batch_size=1024, epochs=20, validation_split=0.2)
    
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'

    sentence = 'he saw an old yellow truck'
    sentence = [x_tk.word_index[word] for word in sentence.split()]
    sentence = pad_sequences([sentence], maxlen=x.shape[-1], padding='post')
    sentences = np.array([sentence[0], x[0]])
    predictions = model.predict(sentences, len(sentences))

    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[0]]))
    print('Il a vu un vieux camion jaune')
    print('Sample 2:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[1]]))
    print(' '.join([y_id_to_word[np.max(x)] for x in y[0]]))

In [5]:
# Load input data
english_sentences = X
# Load output data
french_sentences = y

In [6]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
#french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])

In [23]:
preproc_english_sentences, english_tokenizer = preprocess(english_sentences)
    
input_length = preproc_english_sentences.shape[1]
output_length = 1

TypeError: pad() missing 1 required positional argument: 'length'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

x, y, x_tk, y_tk = preprocess(X_train, y_train)

In [None]:
english_vocab_size = len(english_tokenizer.word_index)

In [None]:
model = model_final(x.shape, y.shape[1], len(x_tk.word_index)+1, len(y_tk.word_index)+1)
    
model.fit(x, y, batch_size=1024, epochs=20)

In [None]:
final_predictions(preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer)

In [13]:
preprocess(english_sentences)

TypeError: pad() missing 1 required positional argument: 'length'

In [24]:

preprocess_x, x_tk = data_process.tokenize(X)
preprocess_x = data_process.pad(preprocess_x, 32)


137611

In [54]:
vocab_length

30598