In [1]:
# !pip uninstall keras -y
# !pip install keras==2.1.2
# !pip install pydot

In [2]:
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
from tensorflow.python.keras.wrappers.scikit_learn import KerasRegressor


import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings

from keras.layers import Input, Embedding, Flatten, Dot, Dense, LSTM
from keras.models import Model

from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences
from keras import Sequential
import keras

warnings.filterwarnings('ignore')
%matplotlib inline

Using TensorFlow backend.


In [3]:

def read_data(f_name, nrows=10000):
    """Load the tab-separated review file and build the text column to embed.

    Parameters
    ----------
    f_name : str or path-like
        Path to a tab-separated file containing at least the
        `product_title`, `review_headline` and `review_body` columns.
    nrows : int or None, optional
        Maximum number of data rows to read. Defaults to 10000; pass
        None to read the whole file.

    Returns
    -------
    pandas.DataFrame
        The rows that have no missing value in any column. The three text
        columns are replaced by a single `docs_to_embed` column holding
        "title, headline, body". The original row index is kept (it is not
        reset after rows are dropped).
    """
    raw = pd.read_csv(f_name, sep='\t', nrows=nrows)

    # A missing title, headline or body makes the concatenation NaN, so such
    # rows are removed by the dropna() below together with any other row
    # that has a missing value.
    docs = raw['product_title'] + ', ' + raw['review_headline'] + ', ' + raw['review_body']

    # Build the result as a new frame instead of mutating in place.
    return (
        raw.assign(docs_to_embed=docs)
        .drop(['product_title', 'review_headline', 'review_body'], axis=1)
        .dropna()
    )

# Load the review sample into the notebook-wide `df` that the later cells
# (labels, model construction, model inputs) all read from.
df = read_data('micro_data.tsv')


## Build and train a model

### Features to embed:
1. Product title
2. Review headline
3. Review body
4. Customer ID
5. Product ID 

In [4]:
def get_label(df):
    """Turn star ratings into binary labels.

    Each value of `df['star_rating']` is cast to int; ratings of 4 or more
    become 1 and everything else becomes 0.

    Returns a plain Python list with one label per rating, in input order.
    """
    binary_labels = []
    for rating in df['star_rating']:
        if int(rating) >= 4:
            binary_labels.append(1)
        else:
            binary_labels.append(0)
    return binary_labels

# Binary training targets for the loaded frame; passed to model.fit later on.
labels = get_label(df)


def get_encoded_ids(df, id_name):
    """Hash-encode every entry of one text column into integer indices.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to encode.
    id_name : str
        Column to encode; only 'product_id' and 'docs_to_embed' are accepted
        (anything else trips the assertion below).

    Returns
    -------
    numpy.ndarray
        `np.array` built from the per-entry index lists returned by Keras
        `one_hot`. The hashing space is the column's whitespace-token
        vocabulary size as computed by `get_vocab_size`.

    NOTE(review): `one_hot` is a hashing trick, so distinct tokens can share
    an index, and it presumably relies on Python's built-in `hash`, which is
    not stable across interpreter runs -- confirm before reusing a saved
    model in a new process.
    """
    assert id_name in ['product_id', 'docs_to_embed']

    n_buckets = get_vocab_size(df, col_name=id_name)
    texts = df[id_name].values.tolist()

    hashed = []
    for text in texts:
        hashed.append(one_hot(text, n_buckets))

    return np.array(hashed)

def get_vocab_size(df, col_name):
    """Count the distinct whitespace-separated tokens in a text column.

    All entries of `df[col_name]` are joined with single spaces, the result
    is split on whitespace, and the number of unique tokens is returned.
    Tokens are compared exactly as written (case and punctuation included).
    """
    corpus = ' '.join(df[col_name])
    unique_tokens = set(corpus.split())
    return len(unique_tokens)

def get_max_length(df, col_name):
    """Return the largest whitespace-token count among a column's entries.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column.
    col_name : str
        Name of the column whose entries are split on whitespace.

    Returns
    -------
    int
        Token count of the longest entry, or 0 when the column is empty.
    """
    # Iterate the column directly: DataFrame.iterrows() would build a Series
    # for every row just to read a single cell.
    return max((len(doc.split()) for doc in df[col_name]), default=0)

def get_padded_documents(df):
    """Encode the `docs_to_embed` column and pad every sequence to one length.

    The documents are hash-encoded with `get_encoded_ids`, then padded at the
    end (`padding='post'`) to the length returned by `get_max_length`.

    NOTE(review): the target length is counted on whitespace tokens, while
    the encoder may tokenize differently (e.g. around punctuation) -- confirm
    that no encoded sequence is longer than this and gets truncated.
    """
    return pad_sequences(
        get_encoded_ids(df, 'docs_to_embed'),
        maxlen=get_max_length(df, 'docs_to_embed'),
        padding='post',
    )

### Add additional inputs
Beyond the text, the dataset has a few additional columns to consider. We'll treat each of these as a running variable:
1. Product category
2. Votes

In the tiny sample dataset, all the records share the same date and the same product category, so there's no reason to include either date or product category in the first model. We'll define each of the remaining features as a separate input, then pass them to the model as a list.

### Train the model!

In [5]:
def get_model_input_specs(df):
    """Collect the sizes needed to dimension the model's input layers.

    Returns
    -------
    tuple
        `(vocab_size, max_length, n_users, n_products)` where the first two
        describe the `docs_to_embed` column (distinct whitespace tokens and
        longest document in tokens) and the last two are the numbers of
        distinct `customer_id` and `product_id` values.
    """
    def _distinct(col_name):
        # Number of unique values in one column.
        return len(set(df[col_name].values.tolist()))

    doc_col = 'docs_to_embed'
    vocab_size = get_vocab_size(df, doc_col)
    max_length = get_max_length(df, col_name=doc_col)

    return vocab_size, max_length, _distinct('customer_id'), _distinct('product_id')

def get_scaled(df, col_name):
    """Min-max scale one column and return it as a single-feature 2-D array.

    Every entry of `df[col_name]` is cast to int, the values are reshaped to
    one column (shape `(-1, 1)`), and a `MinMaxScaler` with default settings
    is fitted on and applied to that same column.

    The scaler is local to this call and is not returned, so the same scaling
    cannot be re-applied to new data from here.
    """
    as_ints = [int(value) for value in df[col_name]]
    column = np.reshape(as_ints, (-1, 1))
    return MinMaxScaler().fit_transform(column)

def get_model_input_data(df):
    """Assemble the model inputs for `df`.

    Returns
    -------
    list
        `[padded_docs, votes, encoded_product_ids, customer_ids]`:
        padded hash-encoded documents, min-max scaled `total_votes`,
        hash-encoded `product_id` values, and the `customer_id` column
        itself (a pandas Series, not encoded or scaled).
    """
    doc_sequences = get_padded_documents(df)
    product_codes = get_encoded_ids(df, 'product_id')
    vote_feature = get_scaled(df, 'total_votes')

    # Customer ids are passed through as-is: an earlier attempt to run them
    # through get_encoded_ids(df, 'customer_id') failed (that helper only
    # accepts 'product_id' and 'docs_to_embed').
    # NOTE(review): raw ids are unscaled numbers -- confirm this is intended.
    customer_ids = df['customer_id']

    return [doc_sequences, vote_feature, product_codes, customer_ids]

In [6]:
def get_model(df):
    """Build and compile the four-input binary classifier.

    Layer sizes are taken from `get_model_input_specs(df)`. The model's
    inputs, in order, are:

    1. "Document-Input": int32 token ids of length `max_length`, fed through
       a 512-dimensional embedding and a 32-unit LSTM.
    2. "Votes-Input": a single numeric value.
    3. "Product-Input": a single product index, fed through a
       24-dimensional embedding and flattened.
    4. "User-Input": a single numeric value, used as-is (no embedding).

    The four branches are concatenated and passed through
    Dense(64, relu) -> Dropout(0.2) -> Dense(32, relu) -> Dense(1, sigmoid).

    Returns
    -------
    keras.models.Model
        The model compiled with the Adam optimizer and binary cross-entropy
        loss.
    """
    # n_users is only needed if the user id gets its own embedding (see the
    # note at the user input below).
    vocab_size, max_length, n_users, n_products = get_model_input_specs(df)

    ##########
    # INPUTS #
    ##########

    # Review text branch: token ids -> embedding -> LSTM summary vector.
    doc_input = Input(shape=[max_length,], dtype='int32', name="Document-Input")
    doc_embedding = Embedding(vocab_size, output_dim=512, name="Document-Embedding",
                              input_length=max_length)(doc_input)
    lstm_out = LSTM(32)(doc_embedding)

    votes_input = Input(shape=[1,], name="Votes-Input")

    product_input = Input(shape=[1, ], name="Product-Input")
    product_embedding = Embedding(n_products+1, 24, name="Product-Embedding")(product_input)
    # Flatten is needed: the embedding of a length-1 input is 3-D
    # (batch, 1, 24), while the other branches being concatenated are 2-D.
    product_vec = Flatten(name="Flatten-Products")(product_embedding)

    # The user id currently enters the network as a raw number.
    # TODO: give it an embedding like the product id, e.g.
    #   Embedding(n_users+1, 24, name="User-Embedding") followed by Flatten,
    # once customer ids are encoded as indices upstream.
    user_input = Input(shape=[1, ], name="User-Input")

    ##########
    # CONCAT #
    ##########
    concat = keras.layers.concatenate([lstm_out, votes_input, product_vec, user_input],
                                      name='main_concat')

    x1 = Dense(64, activation='relu', name='1st_post_dense')(concat)

    x2 = keras.layers.Dropout(.2, name='Dropout')(x1)

    x3 = Dense(32, activation='relu', name='3st_post_dense')(x2)

    ###############
    # PREDICTIONS #
    ###############

    predictions = keras.layers.Dense(1, activation='sigmoid')(x3)

    #########
    # MODEL #
    #########

    # `outputs` (plural) is the functional-API keyword; the singular
    # `output` spelling is a legacy alias.
    model = Model(inputs=[doc_input, votes_input, product_input, user_input],
                  outputs=predictions)

    model.compile('adam', 'binary_crossentropy')

    return model


In [7]:
# Build and compile the network, dimensioned from the loaded frame.
model = get_model(df)


W0918 19:51:40.300965 139845503002432 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:517: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0918 19:51:40.318198 139845503002432 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:4138: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0918 19:51:40.327623 139845503002432 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:74: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0918 19:51:40.651489 139845503002432 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:13

In [9]:
# List of model inputs, in the same order as the model's Input layers.
data_input = get_model_input_data(df)

In [11]:
# Train for a single epoch, holding out 20% of the samples for validation.
history = model.fit(data_input, labels, epochs=1, verbose=1, validation_split=0.2)   
# NOTE(review): absolute, machine-specific output path -- consider a
# configurable directory so the notebook runs elsewhere.
model.save('/home/ec2-user/SageMaker/model-versions/full-model.h5')

Train on 8000 samples, validate on 2000 samples
Epoch 1/1


### Get a visual representation of the model!

In [17]:
# NOTE(review): this import lives here rather than in the imports cell at the
# top; plot_model presumably needs pydot (see the commented-out install in
# the first cell) -- confirm it is available in the environment.
from keras.utils import plot_model
# Write a diagram of the layer graph to model.png in the working directory.
plot_model(model, to_file='model.png')

### Save the model architecture and weights

In [16]:
# Serialize the architecture (no weights) to JSON in the working directory.
model_json = model.to_json()
with open("full-model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
# This file holds the weights only and is written relative to the working
# directory, unlike the full-model save performed right after training.
model.save_weights("full-model.h5")
print("Saved model to disk")

Saved model to disk
