# LSTM With doc2Vec

Transform each basket into a vector, then train a LSTM on the previous baskets to predict the next basket

In [2]:
# Install gensim
! pip install -U gensim

Collecting gensim
  Downloading gensim-2.3.0.tar.gz (17.2MB)
[K    100% |################################| 17.2MB 86kB/s eta 0:00:011  29% |#########                       | 5.0MB 5.9MB/s eta 0:00:03    30% |#########                       | 5.3MB 5.4MB/s eta 0:00:03    38% |############                    | 6.7MB 4.6MB/s eta 0:00:03    56% |##################              | 9.7MB 5.3MB/s eta 0:00:02    58% |##################              | 10.0MB 4.7MB/s eta 0:00:02    64% |####################            | 11.0MB 5.2MB/s eta 0:00:02    94% |##############################  | 16.2MB 4.9MB/s eta 0:00:01
[?25hCollecting numpy>=1.11.3 (from gensim)
  Downloading numpy-1.13.1-cp36-cp36m-manylinux1_x86_64.whl (17.0MB)
[K    100% |################################| 17.0MB 88kB/s eta 0:00:01   12% |###                             | 2.1MB 4.8MB/s eta 0:00:04    37% |############                    | 6.4MB 4.4MB/s eta 0:00:03    58% |##################              | 10.0MB 5.1MB/s eta 0:00:

In [3]:
import gensim
import os
import collections
import smart_open
import random
import sqlite3
import numpy as np

Using TensorFlow backend.


# Load Data

In [7]:
def getOrders(lowerlimit, upperlimit):
    
    conn = sqlite3.connect("instacart.db")
    cur = conn.cursor()
    
    # Get final order
    cur.execute("SELECT B.user_id as user_id, A.order_id as order_id, "
                " A.product_id as product_id, B.order_number "
                "FROM products_train A INNER JOIN orders B "
                " ON A.order_id = B.order_id "
                "WHERE A.order_id % 100 >= " + str(lowerlimit) + \
                " AND A.order_id % 100 <= " + str(upperlimit) + ";")
    train_order = np.array(cur.fetchall())
    
    # Get all prior orders
    cur.execute( \
        "SELECT D.user_id as user_id, "
        "  D.order_id as order_id, "
        "  D.order_number as order_number, "
        "  C.product_id as product_id "
        "FROM products_prior C INNER JOIN ( "
        "  SELECT DISTINCT A.user_id as user_id,"
        "    A.order_id as order_id, A.order_number as order_number "
        "  FROM orders A INNER JOIN ( "
        "    SELECT DISTINCT user_id FROM orders "
        "    WHERE eval_set = 'train' "
        "      AND order_id % 100 >= " + str(lowerlimit) + 
        "      AND order_id % 100 <= " + str(upperlimit) +
        "    ) B ON A.user_id = B.user_id WHERE A.eval_set = 'prior' "
        ") D ON C.order_id = D.order_id;")
    prior_orders = np.array(cur.fetchall())
    
    conn.close()
    
    return train_order, prior_orders

In [53]:
y, x = getOrders(0, 69)

print(x.shape)
print(y.shape)

(14458765, 4)
(968968, 4)


# Convert purchases to documents

In [11]:
from gensim.models import doc2vec
from collections import namedtuple

# Transform data (you can add more data preprocessing steps) 

order_docs = []
analyzed_order = namedtuple('AnalyzedDocument', 'words tags')

# both x and y are of the format: [user_id, order_id, product_id, order_number]
# So for each distinct order_id, make a space delimited list of words

order_dict = dict()

for i in range(x.shape[0]):
    try:
        order_dict[x[i][1]] += " "
        order_dict[x[i][1]] += str(x[i][2])
    except:
        order_dict[x[i][1]] = str(x[i][2])
        
for order in order_dict:
    words = order_dict[order].split()
    tags = [order]
    order_docs.append(analyzed_order(words, tags))

# Get the vectors

#model.docvecs[0]
#model.docvecs[1]

In [30]:
# Train model (set min_count = 1, if you want the model to work with the provided example data set)
doc2vec_model = doc2vec.Doc2Vec(order_docs, size = 100, window = 300, min_count = 2, workers = 4)

# Test Embedding

model is an embedding of all baskets. Let's test how it works

In [139]:
# Grab order number 1, look at products
order_dict[1]

'49302 11109 10246 49683 43633 13176 47209 22035'

In [144]:
# Have the model predict the vector for order_id = 1
order1_vector = doc2vec_model.infer_vector(order_dict[1].split())
print(order1_vector)

[ 0.06293377 -0.02151527 -0.00064242 -0.00924427 -0.02898256 -0.01243152
  0.02328276 -0.00991685 -0.03488562  0.00587283 -0.06857688 -0.04181214
 -0.01552221  0.04018221 -0.000702    0.02179286 -0.01139843 -0.02207963
 -0.0051099   0.05599847 -0.02450138  0.02682662  0.00958188  0.00421782
 -0.02438697  0.00845574  0.00358347  0.01389023  0.04696041 -0.02785262
 -0.00032243  0.00817654  0.01316404  0.03014762  0.00947701  0.00023515
  0.05599121  0.00496916 -0.04433753 -0.03634056  0.01820436  0.01616696
 -0.01494499 -0.03214792 -0.03879462  0.00453568  0.00326993 -0.0028774
  0.02175377  0.04297222  0.00377262  0.04014904 -0.00166964 -0.02475531
  0.02314722 -0.06379671 -0.04274622 -0.02753814  0.03868269  0.00157445
 -0.01569884 -0.05873756  0.02092193 -0.02124652  0.00322498 -0.01677983
  0.03195389 -0.0124905  -0.01520837  0.02523862  0.0098991  -0.00233243
  0.0370995   0.0049514  -0.03105518 -0.02719858  0.02964512 -0.02753293
  0.02012725 -0.004572    0.00586819 -0.02517196  0.

In [143]:
# Find the five most similar items to order_id = 1
print(doc2vec_model.docvecs.most_similar([order1_vector], topn=1))

[(364052, 0.8343102931976318)]


In [24]:
# see what products are in 3122701
print(order_dict[3122701])

44632 21709 49191 7389 16440 13176


In [23]:
# What produts are in order_id = 1186825
print(order_dict[1186825])

46692 47766 13176 21616 33279


# Train LSTM Model with Basket Vectors

In [98]:
import math
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

# create and fit the LSTM network
LSTM_model = Sequential()
LSTM_model.add(LSTM(300, stateful=True, batch_input_shape=(500, 5, 100))) # input_shape=(timesteps, data_dim)
LSTM_model.add(Dense(100))
LSTM_model.add(Dense(100))
LSTM_model.compile(loss='mean_squared_error', optimizer='adam')

## Create training data in batches of 64 orders

In [99]:
for n in range(10):
    print("Starting batch " + str(n) + " of ten.")
    print("--------------------------------")
    # Delete x_train / y_train from last iteration, may fail if this is first time through
    try:
        del y_train
    except:
        pass

    try:
        del x_train
    except:
        pass

    # i iterates over 500 orders 
    for i in range(500):

        if i % 100 == 0:
            print ("Working on order " + str(i) + " of this batch...")
        # x and y have format: [user_id, order_id, product_id, order_number]

        # for each order in y, make the y_train array using order_dict
        y_temp = doc2vec_model.infer_vector(order_dict[y[(500*n) + i][1]].split())

        y_temp2 = np.expand_dims(y_temp, axis=0)
        try:
            y_train = np.append(y_train, y_temp2, axis=0)
        except NameError:
            y_train = y_temp2


        final_order_num = y[(500*n) + i][3]
        final_order_user_id = y[(500*n) + i][0]

        # grab this user's prior orders
        prior_orders = x[np.where(x[:,0] == final_order_user_id)]

        cur_order_string = ''

        try:
            del cur_order_x_train
        except:
            pass

        # go grab the products for each prior order
        # dropping zero index, since we want to subtract at least one from final order num
        for j in range(1, 6):

            # Reset 
            del cur_order_string

            # Make string of products for current order
            if final_order_num - j > 0:
                for product in [prod for prod in prior_orders if prod[2] == final_order_num - j]:
                    try:
                        cur_order_string += " "
                        cur_order_string += str(product[3])
                    except:
                        cur_order_string = str(product[3])
            else:
                cur_order_string = ""

            # Turn string into vector
            cur_order_x_temp = doc2vec_model.infer_vector(cur_order_string.split())

            # Expand vectors to include empty timestep dimension
            cur_order_x_temp = np.expand_dims(cur_order_x_temp, axis=0)

            # cur_order_x_temp is the vector of this time step's basket. 
            # add it to the tensor for cur_order_x_train
            # Notice, cur_order_x_temp is added on left, we are going backwards in timesteps
            try:
                cur_order_x_train = np.append(cur_order_x_temp, cur_order_x_train, axis=0)
            except:
                cur_order_x_train = cur_order_x_temp

        # ASSERT: cur_order_x_train has five timesteps now times 100 vectors, shape = (5, 100)

        # Expand full order to batch dimension
        cur_order_x_train = np.expand_dims(cur_order_x_train, axis=0)

        # Create x_train or add to it 
        try: 
            x_train = np.append(x_train, cur_order_x_train, axis=0)
        except:
            x_train = cur_order_x_train

    # Done creating batch, time to train
    LSTM_model.fit(x_train, y_train, epochs=15, batch_size=x_train.shape[0], verbose=2)
    

Starting batch 0 of ten.
--------------------------------
Working on order 0 of this batch...
Working on order 100 of this batch...
Working on order 200 of this batch...
Working on order 300 of this batch...
Working on order 400 of this batch...


kwargs passed to function are ignored with Tensorflow backend


Epoch 1/15
0s - loss: 0.0017
Epoch 2/15
0s - loss: 0.0015
Epoch 3/15
0s - loss: 0.0014
Epoch 4/15
0s - loss: 0.0013
Epoch 5/15
0s - loss: 0.0012
Epoch 6/15
0s - loss: 0.0012
Epoch 7/15
0s - loss: 0.0011
Epoch 8/15
0s - loss: 0.0011
Epoch 9/15
0s - loss: 0.0011
Epoch 10/15
0s - loss: 0.0010
Epoch 11/15
0s - loss: 0.0010
Epoch 12/15
0s - loss: 9.9590e-04
Epoch 13/15
0s - loss: 9.7828e-04
Epoch 14/15
0s - loss: 9.5921e-04
Epoch 15/15
0s - loss: 9.5183e-04
Starting batch 1 of ten.
--------------------------------
Working on order 0 of this batch...
Working on order 100 of this batch...
Working on order 200 of this batch...
Working on order 300 of this batch...
Working on order 400 of this batch...
Epoch 1/15
0s - loss: 0.0016
Epoch 2/15
0s - loss: 0.0015
Epoch 3/15
0s - loss: 0.0014
Epoch 4/15
0s - loss: 0.0014
Epoch 5/15
0s - loss: 0.0013
Epoch 6/15
0s - loss: 0.0013
Epoch 7/15
0s - loss: 0.0013
Epoch 8/15
0s - loss: 0.0012
Epoch 9/15
0s - loss: 0.0012
Epoch 10/15
0s - loss: 0.0012
Epoch 

# Score the validation orders

## Grab validation orders:

In [100]:
# Get validation orders:
y_validation, x_validation = getOrders(70, 75)

## Change LSTM Model to accept one order at a time

In [124]:
# re-define model
LSTM_model_predict = Sequential()
LSTM_model_predict.add(LSTM(300, stateful=True, batch_input_shape=(1, 5, 100))) # input_shape=(timesteps, data_dim)
LSTM_model_predict.add(Dense(100))
LSTM_model_predict.add(Dense(100))

# copy weights
old_weights = LSTM_model.get_weights()
LSTM_model_predict.set_weights(old_weights)

# compile model
LSTM_model_predict.compile(loss='mean_squared_error', optimizer='adam')


## Make Function to predict for a given order

In [136]:
# Create function to predict for an offer:
# # x_valid is a numpy array of [user_id, order_id, product_id, order_number]
# # ...but only for a single user / final validation order
# user_id = customer we are predicting a final order for
# final_order_number = order_number of the order we are predicting
def makePrediction(x_valid, final_order_number):
    

    # go grab the products for each prior order
    # dropping zero index, since we want to subtract at least one from final order num
    for j in range(1, 6):

        # Reset order string, which will error if this is the first loop 
        try:
            del cur_order_string
        except: 
            pass        

        # Make string of products for current order
        if final_order_number - j > 0:
            for product in [prod for prod in x_valid if prod[2] == final_order_number - j]:
                try:
                    cur_order_string += " "
                    cur_order_string += str(product[3])
                except:
                    cur_order_string = str(product[3])
        else:
            cur_order_string = ""

        # Turn string into vector
        cur_order_x_temp = doc2vec_model.infer_vector(cur_order_string.split())

        # Expand vectors to include empty timestep dimension
        cur_order_x_temp = np.expand_dims(cur_order_x_temp, axis=0)

        # cur_order_x_temp is the vector of this time step's basket. 
        # add it to the tensor for cur_order_x_train
        # Notice, cur_order_x_temp is added on left, we are going backwards in timesteps
        try:
            cur_order_x_train = np.append(cur_order_x_temp, cur_order_x_train, axis=0)
        except:
            cur_order_x_train = cur_order_x_temp
            
    # Expand to 3 dimensions
    cur_order_x_train = np.expand_dims(cur_order_x_train, axis=0)
    
    # Make Prediction
    prediction = LSTM_model_predict.predict(cur_order_x_train)
    
    # Find closest basket 
    #closest_basket = doc2vec_model.docvecs.most_similar([prediction], topn=1)
    
    return prediction

In [133]:
print(y_validation.shape)
print(x_validation.shape)

(81609, 4)
(1224500, 4)


In [137]:
i = 0
cur_user = y_validation[i][0]
cur_user_order_num = y_validation[i][3]
cur_user_prior_orders = x_validation[np.where(x_validation[:,0] == cur_user)]
cur_user_input = makePrediction(cur_user_prior_orders, cur_user_order_num)


In [147]:
#print(cur_user_input.flatten())
doc2vec_model.docvecs.most_similar([cur_user_input.flatten()], topn=1)

[(1720460, 0.7618535757064819)]

In [148]:
print(order_dict[1720460])

38557 20995 13176 47766 37646 46969 21137 21174 260 24184 16759 46049 1468 40706 22035


In [157]:
print(y_validation[:30])

[[182389    170  18394      7]
 [182389    170  37766      7]
 [182389    170  13176      7]
 [182389    170   6236      7]
 [182389    170   5077      7]
 [182389    170   8153      7]
 [182389    170  43772      7]
 [182389    170  25591      7]
 [182389    170  34582      7]
 [182389    170  49593      7]
 [182389    170  15093      7]
 [182389    170  43841      7]
 [182389    170  21137      7]
 [182389    170  40354      7]
 [182389    170  17794      7]
 [182389    170  11182      7]
 [182389    170  39190      7]
 [ 77529    473  20082      7]
 [ 77529    473  24852      7]
 [ 77529    473  47144      7]
 [ 77529    473  36441      7]
 [ 77529    473  12206      7]
 [ 77529    473   4034      7]
 [ 77529    473  30573      7]
 [ 77529    473  42404      7]
 [ 27650    774  47482     25]
 [ 27650    774  43335     25]
 [ 27650    774  16108     25]
 [ 10125   1275   6046      5]
 [ 10125   1275  48679      5]]
