# Deep Learning Recommender Model
In this notebook we'll:
- Read in the data from disk
- Define the preprocessing functions
- Define the model
- Apply the features to the model, i.e., fit it

In [1]:
# Replace the preinstalled Keras with the pinned 2.1.2 release used by this notebook.
!pip uninstall keras -y
!pip install keras==2.1.2

Uninstalling Keras-2.2.4:
  Successfully uninstalled Keras-2.2.4
[33mYou are using pip version 10.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Collecting keras==2.1.2
[?25l  Downloading https://files.pythonhosted.org/packages/68/89/58ee5f56a9c26957d97217db41780ebedca3154392cb903c3f8a08a52208/Keras-2.1.2-py2.py3-none-any.whl (304kB)
[K    100% |████████████████████████████████| 307kB 31.4MB/s ta 0:00:01
Installing collected packages: keras
Successfully installed keras-2.1.2
[33mYou are using pip version 10.0.1, however version 19.2.3 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings

from sklearn.preprocessing import MinMaxScaler

import keras
import sagemaker
from keras.models import load_model

from keras.layers import Input, Embedding, Flatten, Dot, Dense, LSTM, Activation
from keras.models import Model
from keras.preprocessing.text import one_hot
from keras.preprocessing.sequence import pad_sequences

from tensorflow.python.saved_model import tag_constants

# Suppress warnings raised through Python's `warnings` module for the rest of the notebook.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the cell output.
%matplotlib inline

# SageMaker session and execution role (not referenced by the cells shown below).
sess = sagemaker.Session()
role = sagemaker.get_execution_role()

Using TensorFlow backend.


In [3]:
# Extract the sample data once. gunzip removes the .gz archive on success, so an
# unguarded re-run of this cell would fail; skip extraction when the TSV exists.
if not os.path.exists('micro_data.tsv'):
    exit_status = os.system('''gunzip micro_data.tsv.gz''')
    # Surface a failed extraction here instead of at the later read_csv call.
    if exit_status != 0:
        raise RuntimeError(
            'gunzip micro_data.tsv.gz failed with exit status {}'.format(exit_status)
        )

0

In [4]:
def read_data(f_name, nrows=2000):
    """Load the tab-separated review file and build the text column to embed.

    Args:
        f_name: Path to the tab-separated input file.
        nrows: Maximum number of rows to read. Defaults to 2000 (the original
            hard-coded sample size); pass ``None`` to read the whole file.

    Returns:
        A new DataFrame in which ``product_title``, ``review_headline`` and
        ``review_body`` are replaced by a single ``docs_to_embed`` column
        (the three texts joined with ``', '``), with every row that contains
        a missing value in any remaining column removed.
    """
    reviews = pd.read_csv(f_name, sep='\t', nrows=nrows)

    text_columns = ['product_title', 'review_headline', 'review_body']

    # A missing value in any of the three text columns makes the concatenation
    # NaN, so such rows are discarded by dropna() below.
    reviews['docs_to_embed'] = (
        reviews['product_title'] + ', '
        + reviews['review_headline'] + ', '
        + reviews['review_body']
    )

    # Chain instead of mutating in place: the result is always a fresh frame.
    return reviews.drop(text_columns, axis=1).dropna()

# Load the extracted sample file; `df` is the frame every later cell works on.
df = read_data('micro_data.tsv')

## Build and train a model

### Features to embed:
1. Product title
2. Review headline
3. Review body
4. Customer ID
5. Product ID 

In [5]:
def get_label(df):
    """Turn star ratings into binary targets.

    Each value of ``df['star_rating']`` is cast to ``int``; ratings of 4 or
    more map to 1, everything else maps to 0. Returns a plain list with one
    label per row, in row order.
    """
    binary_labels = []
    for rating in df['star_rating']:
        binary_labels.append(int(int(rating) >= 4))
    return binary_labels

# Binary training targets: one label per row of `df`, in row order.
labels = get_label(df)

def get_encoded_ids(df, id_name):
    """Hash-encode every entry of a text/id column into integer indices.

    ``id_name`` must be ``'product_id'`` or ``'docs_to_embed'``. The hash
    space size is the column's vocabulary size as reported by
    ``get_vocab_size``; it is printed together with the column name.
    Returns a numpy array built from the per-row lists that ``one_hot``
    produces.
    """
    assert id_name in ['product_id', 'docs_to_embed']

    hash_space = get_vocab_size(df, col_name = id_name)
    print (id_name, hash_space)

    return np.array([one_hot(text, hash_space) for text in df[id_name].values.tolist()])

def get_vocab_size(df, col_name):
    """Count the distinct whitespace-separated tokens across a whole column.

    All entries of ``df[col_name]`` are joined with a single space, split on
    whitespace, and de-duplicated (case-sensitively); the number of unique
    tokens is returned.
    """
    corpus = ' '.join(df[col_name])
    distinct_tokens = set(corpus.split())
    return len(distinct_tokens)

def get_max_length(df, col_name):
    """Return the largest whitespace-token count among the documents in a column.

    Args:
        df: DataFrame holding the documents.
        col_name: Name of the column whose entries are strings.

    Returns:
        The maximum of ``len(doc.split())`` over the column, or 0 when the
        column has no rows.
    """
    # Iterate the column directly: iterrows() would build a full Series for
    # every row only to read this single field.
    return max((len(doc.split()) for doc in df[col_name]), default=0)

def get_padded_documents(df):
    """Encode the ``docs_to_embed`` column and pad every row to equal length.

    Rows are hash-encoded with ``get_encoded_ids`` and then post-padded by
    ``pad_sequences`` to the length reported by ``get_max_length`` for the
    same column.
    """
    text_column = 'docs_to_embed'
    token_ids = get_encoded_ids(df, text_column)
    target_length = get_max_length(df, text_column)

    return pad_sequences(token_ids, maxlen=target_length, padding='post')

### Add additional inputs
On top of the textual data, there are a few additional columns we need to consider. We'll treat each of these as a running variable:
1. Product category
2. Votes

In the tiny sample dataset, all the records share the same date and the same product category, so there's no reason to include either the date or the product category in the first model. We'll specify each of the remaining features as a separate input, then pass them to our model as a list.

### Train the model!

In [6]:
def get_model_input_specs(df):
    """Collect the size parameters the model definition needs.

    Returns a 4-tuple ``(vocab_size, max_length, n_users, n_products)``:
    the vocabulary size and the longest document length of the
    ``docs_to_embed`` column, followed by the number of distinct
    ``customer_id`` and ``product_id`` values.
    """
    def _distinct_count(column):
        # Number of unique values in one column of `df`.
        return len(set(df[column].values.tolist()))

    return (
        get_vocab_size(df, 'docs_to_embed'),
        get_max_length(df, col_name = 'docs_to_embed'),
        _distinct_count('customer_id'),
        _distinct_count('product_id'),
    )

def get_scaled(df, col_name):
    """Min-max scale one column.

    Every value of ``df[col_name]`` is cast to ``int``, the values are
    arranged as a single-column 2-D array, and a fresh ``MinMaxScaler`` is
    fitted on and applied to that array. Returns the transformed array.
    """
    column = np.reshape([int(value) for value in df[col_name]], (-1, 1))

    # fit() returns the scaler itself, so fitting and transforming can be chained.
    return MinMaxScaler().fit(column).transform(column)

def get_model_input_data(df):
    """Assemble the list of arrays fed to the model.

    The returned list holds, in this order: the padded document encodings,
    the scaled ``total_votes`` column, the encoded product ids, and the raw
    ``customer_id`` column of ``df``.
    """
    doc_matrix = get_padded_documents(df)
    product_codes = get_encoded_ids(df, 'product_id')
    vote_feature = get_scaled(df, 'total_votes')
    customer_ids = df['customer_id']

    return [doc_matrix, vote_feature, product_codes, customer_ids]

In [7]:
def get_model(df):
    """Build and compile the multi-input Keras recommender model.

    The document hash-space size, the padded document length and the product
    count are derived from ``df`` via ``get_model_input_specs``.

    Model inputs, in the order they must be supplied to ``fit``/``predict``:
        1. ``Document-Input`` - int32 token ids, shape ``(max_length,)``;
           embedded (512 dims) and summarised by a 32-unit LSTM.
        2. ``Votes-Input``    - one value per row, used as-is.
        3. ``Product-Input``  - one product index per row; embedded (24 dims)
           and flattened.
        4. ``User-Input``     - one value per row, used as-is.

    Returns:
        A ``keras.models.Model`` with a single sigmoid output, compiled with
        the Adam optimizer and binary cross-entropy loss.
    """
    # NOTE(review): the user count is computed but never used - the user input
    # below is concatenated as a raw number rather than embedded. Confirm
    # whether a user embedding was intended.
    vocab_size, max_length, _n_users, n_products = get_model_input_specs(df)

    ##########
    # INPUTS #
    ##########

    doc_input = Input(shape=[max_length,], dtype='int32', name="Document-Input")
    doc_embedding = Embedding(vocab_size, output_dim = 512, name="Document-Embedding", input_length=max_length)(doc_input)
    lstm_out = LSTM(32)(doc_embedding)

    votes_input = Input(shape=[1,], name="Votes-Input")

    product_input = Input(shape=[1, ], name="Product-Input")
    # +1 leaves room for the largest index the product encoding can produce.
    product_embedding = Embedding(n_products+1, 24, name="Product-Embedding")(product_input)

    product_vec = Flatten(name="Flatten-Products")(product_embedding)

    user_input = Input(shape=[1, ], name="User-Input")

    ##########
    # CONCAT #
    ##########
    concat = keras.layers.concatenate([lstm_out, votes_input, product_vec, user_input],
                                      name = 'main_concat')

    x1 = Dense(64, activation='relu', name='1st_post_dense')(concat)

    x2 = keras.layers.Dropout(.2, name='Dropout')(x1)

    x3 = Dense(32, activation='relu', name='3st_post_dense')(x2)

    ###############
    # PREDICTIONS #
    ###############

    predictions = keras.layers.Dense(1, activation='sigmoid')(x3)

    #########
    # MODEL #
    #########

    # `outputs` is the documented keyword; the singular `output` is only
    # accepted through Keras' legacy-argument shim.
    model = Model(inputs = [doc_input, votes_input, product_input, user_input], outputs = predictions)

    model.compile('adam', 'binary_crossentropy')

    return model


Once you have defined the model, run the next cell for a few minutes. The full dataset is too large to train on within a reasonable time for a workshop, so you're actually going to load a pre-trained model later. Run this step anyway so you can see that training works.

In [8]:
# Build the model and its inputs from the same DataFrame so the layer sizes
# match the encoded data.
model = get_model(df)

data_input = get_model_input_data(df)

# `labels` is a plain Python list; Keras documents `y` as a Numpy array, so
# convert explicitly instead of relying on implicit coercion.
history = model.fit(data_input, np.asarray(labels), epochs=1, verbose=1, validation_split=0.2)

# Version tag embedded in the artifact file names below.
version = 8

# Save the complete model to a single HDF5 file.
model.save('full-model-v{}.h5'.format(version))

# write to json
json_string = model.to_json()
with open("full-model-v{}.json".format(version), "w") as json_file:
    json_file.write(json_string)

# save weights
model.save_weights('full-model_weights-v{}.h5'.format(version))

W0924 15:07:39.863368 139928220383040 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:497: The name tf.placeholder is deprecated. Please use tf.compat.v1.placeholder instead.

W0924 15:07:39.880140 139928220383040 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:3636: The name tf.random_uniform is deprecated. Please use tf.random.uniform instead.

W0924 15:07:39.889464 139928220383040 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:64: The name tf.get_default_graph is deprecated. Please use tf.compat.v1.get_default_graph instead.

W0924 15:07:40.756482 139928220383040 deprecation.py:506] From /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:1247: call

docs_to_embed 12629
product_id 1358


W0924 15:07:42.037797 139928220383040 deprecation_wrapper.py:119] From /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:958: The name tf.assign_add is deprecated. Please use tf.compat.v1.assign_add instead.

W0924 15:07:42.047095 139928220383040 deprecation.py:506] From /home/ec2-user/anaconda3/envs/tensorflow_p36/lib/python3.6/site-packages/keras/backend/tensorflow_backend.py:680: calling Constant.__init__ (from tensorflow.python.ops.init_ops) with dtype is deprecated and will be removed in a future version.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor


Train on 1600 samples, validate on 400 samples
Epoch 1/1
