In [47]:
import keras
from keras.preprocessing.text import Tokenizer
from keras import Input, Model
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from keras.layers import Dense, LSTM, Embedding, Bidirectional, RepeatVector
from keras.preprocessing.sequence import pad_sequences

from AttentionLayer import Attention

from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score


import pandas as pd
import json

In [3]:
with open('Prime_Pantry_5.json') as file:
    reviews = json.load(file)

JSONDecodeError: Extra data: line 2 column 1 (char 859)

In [34]:
import gzip

def parse(path):
    g = gzip.open(path, 'rb')
    for l in g:
        yield json.loads(l)

def getDF(path):
    i = 0
    df = {}
    for d in parse(path):
        df[i] = d
        i += 1
    return pd.DataFrame.from_dict(df, orient='index')

df = getDF('Prime_Pantry_5.json.gz')   

In [35]:
df = df[['reviewText', 'overall']].dropna(inplace=False)

In [36]:
X = df['reviewText']
y = df['overall']

In [37]:
t = Tokenizer()
t.fit_on_texts(X)
text_matrix = t.texts_to_sequences(X)

In [38]:
sent_length = max([len(sent) for sent in text_matrix])
vocab_length = len(set([word for sent in text_matrix for word in sent]))

In [39]:
# prepare input
text_pad = pad_sequences(text_matrix, maxlen=sent_length, padding='post')

In [41]:
# prepare output for multinomial classification
# encode class values as integers
encoder = LabelEncoder()
encoder.fit(y)
encoded_y = encoder.transform(y)
# convert integers to dummy variables (i.e. one hot encoded)
dummy_y = np_utils.to_categorical(encoded_y)

In [44]:
inputs=Input((sent_length,))
x=Embedding(input_dim=vocab_length*2,output_dim=32,input_length=sent_length)(inputs)
# Bidirectional layer to take previous and following states into consideration
bidir = Bidirectional(LSTM(32))(x)
# Add repeatvector to fix bidirectional shape, with output length as parameter
rvec = RepeatVector(32)(bidir)
# Add attention layer
att_in=LSTM(16,return_sequences=True,dropout=0.3,recurrent_dropout=0.2)(rvec)
att_out=Attention()(att_in)
outputs = Dense(5, activation='softmax')(att_out)
model=Model(inputs,outputs)
model.summary()

Model: "model_6"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 2221)              0         
_________________________________________________________________
embedding_10 (Embedding)     (None, 2221, 32)          1958272   
_________________________________________________________________
bidirectional_5 (Bidirection (None, 64)                16640     
_________________________________________________________________
repeat_vector_3 (RepeatVecto (None, 32, 64)            0         
_________________________________________________________________
lstm_9 (LSTM)                (None, 32, 16)            5184      
_________________________________________________________________
attention_4 (Attention)      (None, 16)                48        
_________________________________________________________________
dense_6 (Dense)              (None, 5)                 85  

In [45]:
model.compile(loss='categorical_crossentropy', optimizer=keras.optimizers.Adam(0.005), metrics=['acc'])
#model.fit(text_pad, y, batch_size=254, epochs=10,verbose=1, shuffle=True, validation_split=0.2)

In [46]:
#estimator = KerasClassifier(model, epochs=200, batch_size=5, verbose=0)
#kfold = KFold(n_splits=10, shuffle=True)
#results = cross_val_score(estimator, text_pad, dummy_y, cv=kfold)
#print("Model: %.2f%% (%.2f%%)" % (results.mean()*100, results.std()*100))

ValueError: Unknown layer: Attention

In [43]:
dummy_y

array([[0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       ...,
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1.]], dtype=float32)

In [None]:
# MODEL FUNCTION

def model_final(input_shape, output_sequence_length, english_vocab_size, french_vocab_size):
    """
    Build and train a model that incorporates embedding, encoder-decoder, and bidirectional RNN on x and y
    :param input_shape: Tuple of input shape
    :param output_sequence_length: Length of output sequence
    :param english_vocab_size: Number of unique English words in the dataset
    :param french_vocab_size: Number of unique French words in the dataset
    :return: Keras model built, but not trained
    """
    # TODO: Implement
    model = Sequential()
    
    model.add(
              Embedding(input_dim=english_vocab_size*2,
              output_dim=256,
              input_length=input_shape[1]))
    
    model.add(Bidirectional(LSTM(256)))
    # Add repeatvector to fix problem with vectors shape
    model.add(RepeatVector(output_sequence_length))
    model.add(Dropout(0.5))

    model.add(Bidirectional(LSTM(256,return_sequences=True)))
    model.add(TimeDistributed(Dense(512, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(french_vocab_size, activation='softmax')))
    
    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(0.002),
                  metrics=['accuracy'])
    return model

    model.compile(loss=sparse_categorical_crossentropy,
                  optimizer=Adam(0.002),
                  metrics=['accuracy'])
    
    return model


def final_predictions(x, y, x_tk, y_tk):
    """
    Gets predictions using the final model
    x: Preprocessed English data
    y: Preprocessed French data
    x_tk: English tokenizer
    y_tk: French tokenizer
    """
    model = model_final(x.shape,y.shape[1], len(x_tk.word_index)+1, len(y_tk.word_index)+1)
    
    model.fit(x, y, batch_size=1024, epochs=20, validation_split=0.2)
    
    y_id_to_word = {value: key for key, value in y_tk.word_index.items()}
    y_id_to_word[0] = '<PAD>'

    sentence = 'he saw an old yellow truck'
    sentence = [x_tk.word_index[word] for word in sentence.split()]
    sentence = pad_sequences([sentence], maxlen=x.shape[-1], padding='post')
    sentences = np.array([sentence[0], x[0]])
    predictions = model.predict(sentences, len(sentences))

    print('Sample 1:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[0]]))
    print('Il a vu un vieux camion jaune')
    print('Sample 2:')
    print(' '.join([y_id_to_word[np.argmax(x)] for x in predictions[1]]))
    print(' '.join([y_id_to_word[np.max(x)] for x in y[0]]))

In [5]:
# Load input data
english_sentences = X
# Load output data
french_sentences = y

In [6]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
#french_words_counter = collections.Counter([word for sentence in french_sentences for word in sentence.split()])

In [23]:
preproc_english_sentences, english_tokenizer = preprocess(english_sentences)
    
input_length = preproc_english_sentences.shape[1]
output_length = 1

TypeError: pad() missing 1 required positional argument: 'length'

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.8, random_state=42)

x, y, x_tk, y_tk = preprocess(X_train, y_train)

In [None]:
english_vocab_size = len(english_tokenizer.word_index)

In [None]:
model = model_final(x.shape, y.shape[1], len(x_tk.word_index)+1, len(y_tk.word_index)+1)
    
model.fit(x, y, batch_size=1024, epochs=20)

In [None]:
final_predictions(preproc_english_sentences, preproc_french_sentences, english_tokenizer, french_tokenizer)

In [13]:
preprocess(english_sentences)

TypeError: pad() missing 1 required positional argument: 'length'

In [24]:

preprocess_x, x_tk = data_process.tokenize(X)
preprocess_x = data_process.pad(preprocess_x, 32)


137611

In [54]:
vocab_length

30598

In [1]:
print(x[1])

NameError: name 'x' is not defined