# LSTM With doc2Vec

Transform each basket into a vector, then train a LSTM on the previous baskets to predict the next basket

In [1]:

import os
import collections

import random
import sqlite3
import numpy as np

# Load Data

In [2]:
def getOrders(lowerlimit, upperlimit):
    
    conn = sqlite3.connect("instacart.db")
    cur = conn.cursor()
    
    # Get final order
    cur.execute("SELECT B.user_id as user_id, A.order_id as order_id, "
                " B.order_number as order_number, A.product_id as product_id "
                "FROM products_train A INNER JOIN orders B "
                " ON A.order_id = B.order_id "
                "WHERE A.order_id % 100 >= " + str(lowerlimit) + \
                " AND A.order_id % 100 <= " + str(upperlimit) + ";")
    train_order = np.array(cur.fetchall())
    
    # Get all prior orders
    cur.execute( \
        "SELECT D.user_id as user_id, "
        "  D.order_id as order_id, "
        "  D.order_number as order_number, "
        "  C.product_id as product_id "
        "FROM products_prior C INNER JOIN ( "
        "  SELECT DISTINCT A.user_id as user_id,"
        "    A.order_id as order_id, A.order_number as order_number "
        "  FROM orders A INNER JOIN ( "
        "    SELECT DISTINCT user_id FROM orders "
        "    WHERE eval_set = 'train' "
        "      AND order_id % 100 >= " + str(lowerlimit) + 
        "      AND order_id % 100 <= " + str(upperlimit) +
        "    ) B ON A.user_id = B.user_id WHERE A.eval_set = 'prior' "
        ") D ON C.order_id = D.order_id;")
    prior_orders = np.array(cur.fetchall())
    
    conn.close()
    
    return train_order, prior_orders

In [3]:
y, x = getOrders(0, 10)

print(x.shape)
print(y.shape)

(2293292, 4)
(151348, 4)


# Convert purchases to OHE

In [None]:

order_dict = dict()

for i in range(x.shape[0]):
    try:
        order_dict[x[i][1]][x[i][2]-1] = 1
    except:
        order_dict[x[i][1]] = np.zeros((49688))
        order_dict[x[i][1]][x[i][2]-1] = 1      

# Train LSTM Model with Basket Vectors

In [6]:
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# create and fit the LSTM network
LSTM_model = Sequential()
LSTM_model.add(LSTM(100, stateful=True, batch_input_shape=(500, 5, 49688))) # input_shape=(batchsize, timesteps, data_dim)
LSTM_model.add(Dense(49688, activation='sigmoid'))
LSTM_model.compile(loss='binary_crossentropy', optimizer='adam')

## Create training data in batches of 500 orders

In [7]:
for n in range(1):
    print("Starting batch " + str(n) + " of ten.")
    print("--------------------------------")
    # Delete x_train / y_train from last iteration, may fail if this is first time through
    try:
        del y_train
    except:
        pass

    try:
        del x_train
    except:
        pass

    curPosition = 0
    
    # i iterates over 500 orders 
    for i in range(500):
        
        # Print status
        if i % 100 == 0:
            print ("Working on order " + str(i) + " of this batch...")
        
        # New order id coming in:
        current_order_id = y[curPosition][1]
        current_user_id = y[curPosition][0]
        current_user_final_order_number = y[curPosition][2]
        
        # Initialize One Hot Encoding, one zero for each product
        current_user_y = np.zeros((49688))
        
        # This loop increments curPosition at the end   
        while current_order_id == y[curPosition][1]:
            
            # make the position in current_order_y = 1 for each product_id
            # Zero index is product_id == 1, so subtract 1 from the product id
            current_user_y[y[curPosition][3] - 1] = 1
        
            curPosition += 1
        

        # current_order_y is done, let's make the current_order_x: 
        # For reference: x and y have format: [user_id, order_id, order_number, product_id]

        
        # grab this user's prior orders
        # Only grab orders where the order number is in the last five orders
        prior_orders_products = x[np.where((x[:,0] == current_user_id) & (x[:,2] >= current_user_final_order_number - 5))]

        # initialize current_user_x
        current_user_x = np.zeros((5, 49688))

        # for each product in prior_orders_products, add to current_user_x
        for p in prior_orders_products:
            
            # current_user_x[a][b], where... 
            # a = timestep = The timestep is order_number - (current_user_final_order_number + 5)
            # b = product_id in OHE, minus 1 because of zero indexing
            current_user_x[p[2] - (current_user_final_order_number - 5)][p[3] - 1] = 1
    
        # Assert - current_user_x is done, add to x_train and y_train

        # Expand full order to batch dimension
        cur_user_x_train = np.expand_dims(current_user_x, axis=0)

        # Create x_train or add to it 
        try: 
            x_train = np.append(x_train, cur_user_x_train, axis=0)
        except:
            x_train = cur_user_x_train
            
        # do the same for y: 
        # Expand full order to batch dimension
        cur_user_y_train = np.expand_dims(current_user_y, axis=0)

        # Create x_train or add to it 
        try: 
            y_train = np.append(y_train, cur_user_y_train, axis=0)
        except:
            y_train = cur_user_y_train

    # Done creating batch, time to train
    #LSTM_model.fit(x_train, y_train, epochs=15, batch_size=x_train.shape[0], verbose=2)
    

Starting batch 0 of ten.
--------------------------------
Working on order 0 of this batch...
Working on order 100 of this batch...
Working on order 200 of this batch...
Working on order 300 of this batch...
Working on order 400 of this batch...


In [9]:
LSTM_model.fit(x_train, y_train, epochs=15, batch_size=x_train.shape[0], verbose=2)


kwargs passed to function are ignored with Tensorflow backend


Epoch 1/15
7s - loss: 0.6932
Epoch 2/15
5s - loss: 0.6927
Epoch 3/15
5s - loss: 0.6921
Epoch 4/15
5s - loss: 0.6916
Epoch 5/15
5s - loss: 0.6909
Epoch 6/15
5s - loss: 0.6899
Epoch 7/15
5s - loss: 0.6886
Epoch 8/15
5s - loss: 0.6867
Epoch 9/15
5s - loss: 0.6838
Epoch 10/15
5s - loss: 0.6795
Epoch 11/15
5s - loss: 0.6733
Epoch 12/15
5s - loss: 0.6641
Epoch 13/15
5s - loss: 0.6508
Epoch 14/15
5s - loss: 0.6317
Epoch 15/15
5s - loss: 0.6050


<keras.callbacks.History at 0x7eff97624080>

# Score the validation orders

## Grab validation orders:

In [100]:
# Get validation orders:
y_validation, x_validation = getOrders(70, 75)

## Change LSTM Model to accept one order at a time

In [124]:
# re-define model
LSTM_model_predict = Sequential()
LSTM_model_predict.add(LSTM(300, stateful=True, batch_input_shape=(1, 5, 100))) # input_shape=(timesteps, data_dim)
LSTM_model_predict.add(Dense(100))
LSTM_model_predict.add(Dense(100))

# copy weights
old_weights = LSTM_model.get_weights()
LSTM_model_predict.set_weights(old_weights)

# compile model
LSTM_model_predict.compile(loss='mean_squared_error', optimizer='adam')


## Make Function to predict for a given order

In [136]:
# Create function to predict for an offer:
# # x_valid is a numpy array of [user_id, order_id, product_id, order_number]
# # ...but only for a single user / final validation order
# user_id = customer we are predicting a final order for
# final_order_number = order_number of the order we are predicting
def makePrediction(x_valid, final_order_number):
    

    # go grab the products for each prior order
    # dropping zero index, since we want to subtract at least one from final order num
    for j in range(1, 6):

        # Reset order string, which will error if this is the first loop 
        try:
            del cur_order_string
        except: 
            pass        

        # Make string of products for current order
        if final_order_number - j > 0:
            for product in [prod for prod in x_valid if prod[2] == final_order_number - j]:
                try:
                    cur_order_string += " "
                    cur_order_string += str(product[3])
                except:
                    cur_order_string = str(product[3])
        else:
            cur_order_string = ""

        # Turn string into vector
        cur_order_x_temp = doc2vec_model.infer_vector(cur_order_string.split())

        # Expand vectors to include empty timestep dimension
        cur_order_x_temp = np.expand_dims(cur_order_x_temp, axis=0)

        # cur_order_x_temp is the vector of this time step's basket. 
        # add it to the tensor for cur_order_x_train
        # Notice, cur_order_x_temp is added on left, we are going backwards in timesteps
        try:
            cur_order_x_train = np.append(cur_order_x_temp, cur_order_x_train, axis=0)
        except:
            cur_order_x_train = cur_order_x_temp
            
    # Expand to 3 dimensions
    cur_order_x_train = np.expand_dims(cur_order_x_train, axis=0)
    
    # Make Prediction
    prediction = LSTM_model_predict.predict(cur_order_x_train)
    
    # Find closest basket 
    #closest_basket = doc2vec_model.docvecs.most_similar([prediction], topn=1)
    
    return prediction

In [133]:
print(y_validation.shape)
print(x_validation.shape)

(81609, 4)
(1224500, 4)


In [137]:
i = 0
cur_user = y_validation[i][0]
cur_user_order_num = y_validation[i][3]
cur_user_prior_orders = x_validation[np.where(x_validation[:,0] == cur_user)]
cur_user_input = makePrediction(cur_user_prior_orders, cur_user_order_num)


In [147]:
#print(cur_user_input.flatten())
doc2vec_model.docvecs.most_similar([cur_user_input.flatten()], topn=1)

[(1720460, 0.7618535757064819)]

In [148]:
print(order_dict[1720460])

38557 20995 13176 47766 37646 46969 21137 21174 260 24184 16759 46049 1468 40706 22035


In [157]:
print(y_validation[:30])

[[182389    170  18394      7]
 [182389    170  37766      7]
 [182389    170  13176      7]
 [182389    170   6236      7]
 [182389    170   5077      7]
 [182389    170   8153      7]
 [182389    170  43772      7]
 [182389    170  25591      7]
 [182389    170  34582      7]
 [182389    170  49593      7]
 [182389    170  15093      7]
 [182389    170  43841      7]
 [182389    170  21137      7]
 [182389    170  40354      7]
 [182389    170  17794      7]
 [182389    170  11182      7]
 [182389    170  39190      7]
 [ 77529    473  20082      7]
 [ 77529    473  24852      7]
 [ 77529    473  47144      7]
 [ 77529    473  36441      7]
 [ 77529    473  12206      7]
 [ 77529    473   4034      7]
 [ 77529    473  30573      7]
 [ 77529    473  42404      7]
 [ 27650    774  47482     25]
 [ 27650    774  43335     25]
 [ 27650    774  16108     25]
 [ 10125   1275   6046      5]
 [ 10125   1275  48679      5]]
