In [1]:
import os

import numpy as np
import pandas as pd
import seaborn.apionly as sns
import matplotlib.pyplot as plt
sns.set()
color = sns.color_palette()

In [2]:
#setup para el notebook

%matplotlib inline
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:,.2f}'.format
plt.rcParams['figure.figsize'] = (16, 12)

In [3]:
for data_dir in ["C:\\Users\Rafael\\Documents\\data\\instacart\\raw",
                 "C:/Users/rcrescenzi/Documents/Personal/data/instacart/raw"]:
    if os.path.exists(data_dir):
        print(data_dir)
        break

target_dir = os.path.abspath(os.path.join(data_dir, "../mock_df"))

if not os.path.exists(target_dir):
    os.makedirs(target_dir)

orders = pd.read_csv(data_dir + "/orders.csv", index_col="order_id",
                     dtype={
                        'order_id': np.uint32,
                        'user_id': np.uint32,
                        'eval_set': 'category',
                        'order_number': np.uint8,
                        'order_dow': np.uint8,
                        'order_hour_of_day': np.uint8,
                        'days_since_prior_order': np.float32})

valid_users = orders[orders.eval_set == "train"].user_id.sample(frac=0.1)
orders = orders[orders.user_id.isin(valid_users)]

def make_df(path):
    res = []
    for i, cand in enumerate(pd.read_csv(path, chunksize=100000)):
        temp =  cand.join(orders[["user_id", "order_number"]], on="order_id")
        temp = temp[temp.user_id.isin(valid_users)]
        res.append(temp)
        if (i % 100) == 0:
            print("haciendo", i+1)
            print(temp)
    return pd.concat(res)
data = make_df(data_dir + "/order_products__train.csv")
data = pd.concat([data, make_df(data_dir + "/order_products__prior.csv")])

pids = int(data.product_id.max() + 1)
max_order = int(data.order_number.max())
max_order_size = int(data.groupby("order_id").count().max().product_id)

C:\Users\Rafael\Documents\data\instacart\raw
haciendo 1
       order_id  product_id  add_to_cart_order  reordered    user_id  \
0             1       49302                  1          1 112,108.00   
1             1       11109                  2          1 112,108.00   
2             1       10246                  3          0 112,108.00   
3             1       49683                  4          0 112,108.00   
4             1       43633                  5          1 112,108.00   
...         ...         ...                ...        ...        ...   
99953    246768        5456                  7          0 141,967.00   
99954    246768       31215                  8          1 141,967.00   
99955    246768       34993                  9          0 141,967.00   
99956    246768       44142                 10          0 141,967.00   
99957    246768       42504                 11          0 141,967.00   

       order_number  
0              4.00  
1              4.00  
2            

In [4]:
EMBEDDING_SIZE = 10
BATCH_SIZE = 1

In [5]:
def pad(a):
    return np.pad(a, (max_order_size - a.shape[0], 0), "constant", constant_values=(0, 0))

data_orders = data.groupby(["user_id", "order_number"]).product_id.apply(lambda x: pad(x.values))
data_labels = data[data.reordered == 1].groupby(["user_id", "order_number"]).product_id.apply(lambda x: x.values)
data = pd.concat([data_orders.rename("compras"), data_labels.rename("recompras")], axis=1)
del data_orders, data_labels

data = data.sort_index()
data["recompras"] = data.recompras.shift(-1)

In [65]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,compras,recompras
user_id,order_number,Unnamed: 2_level_1,Unnamed: 3_level_1
7.00,1.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[29871, 22963, 6361, 31683]"
7.00,2.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[45066, 37999, 45628, 37602, 29871, 22963, 190..."
7.00,3.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[37999, 14332, 37602, 6361, 31683, 17638, 4085..."
7.00,4.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[47272, 21137, 37999, 37602, 19019, 6361, 1763..."
7.00,5.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[21137, 42803]"
7.00,6.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[8518]
7.00,7.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[10895, 39275, 37602, 31683, 17638, 13198, 439..."
7.00,8.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[29993, 39121, 45628, 39275, 4920, 37602, 3168..."
7.00,9.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[10895, 45628, 4920, 17638, 40852, 43967, 30391]"
7.00,10.00,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[21137, 45628, 4920, 37602, 31683, 17638, 4085..."


In [66]:
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.layers.wrappers import TimeDistributed

model = Sequential()
model.add(TimeDistributed(Embedding(input_dim=pids, output_dim=EMBEDDING_SIZE, mask_zero=True), input_shape=(max_order, max_order_size)))
model.add(TimeDistributed(LSTM(15)))
model.add(LSTM(8))
model.add(Dense(5, activation="relu"))
model.add(Dense(pids, activation="sigmoid"))
model.compile(optimizer='Adam', loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_5 (TimeDist (None, 100, 102, 10)      496890    
_________________________________________________________________
time_distributed_6 (TimeDist (None, 100, 15)           1560      
_________________________________________________________________
lstm_6 (LSTM)                (None, 8)                 768       
_________________________________________________________________
dense_5 (Dense)              (None, 5)                 45        
_________________________________________________________________
dense_6 (Dense)              (None, 49689)             298134    
Total params: 797,397.0
Trainable params: 797,397
Non-trainable params: 0.0
_________________________________________________________________


In [7]:
def onehot(vec, pids=pids):
    if vec is np.nan:
        return np.zeros((pids,))
    else:
        return to_categorical(vec, pids).sum(axis=0)

from keras.utils.np_utils import to_categorical

def train_gen(data, last=False):
    users = data.index.levels[0].values
    zeros = np.zeros((max_order_size,))
    while True:
        for user in np.random.choice(users, len(users), replace=False):
            temp = data.xs(user)
            if not last:
                length = np.random.choice(temp.index[:-2], 1)[0]
                temp = temp.loc[:length]
            else:
                temp = temp.iloc[:-1]
            compras = temp.compras.values
            pad = int(max_order - temp.shape[0])
            X = []
            for i in range(max_order):
                if i < pad:
                    X.append(zeros)
                else:
                    X.append(compras[pad - i])
            yield np.asarray([np.vstack(X)]), np.asarray([onehot(temp.recompras.iloc[-1])])

In [67]:
tgen = train_gen(data)
test_gen = train_gen(data, last=True)


In [68]:
steps = data.index.levels[0].shape[0]
steps = steps / 100
model.fit_generator(tgen, steps_per_epoch=steps, epochs=10,
                    validation_data=test_gen, validation_steps=steps,
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e6c4e746a0>

In [69]:
t = next(test_gen)
tr = t[1]
tr = pd.Series(tr[0])
tp = model.predict(t[0])
tp = pd.Series(tp[0])
res = pd.concat([tr.rename("real"), tp.rename("preds")], axis=1)
res

Unnamed: 0,real,preds
0,0.00,0.00
1,0.00,0.00
2,0.00,0.00
3,0.00,0.00
4,0.00,0.00
5,0.00,0.00
6,0.00,0.00
7,0.00,0.00
8,0.00,0.00
9,0.00,0.00


In [70]:
res[res.real==1]

Unnamed: 0,real,preds
16521,1.0,0.0
22935,1.0,0.03
27241,1.0,0.0
48679,1.0,0.01


In [21]:
res[res.preds > 0.01]

Unnamed: 0,real,preds
5077,0.0,0.01
5876,0.0,0.02
8277,0.0,0.01
8518,0.0,0.01
9076,0.0,0.02
13176,1.0,0.07
15290,0.0,0.01
16797,0.0,0.03
19660,0.0,0.01
21137,0.0,0.05


In [22]:
pd.options.display.max_rows = 50
res[res.preds > 0.01].sort_values("preds", ascending=False)

Unnamed: 0,real,preds
24852,0.0,0.08
13176,1.0,0.07
47766,0.0,0.05
21137,0.0,0.05
47209,0.0,0.04
27845,0.0,0.04
47626,0.0,0.03
16797,0.0,0.03
21903,0.0,0.03
45007,0.0,0.03


In [41]:
model = Sequential()
model.add(TimeDistributed(Embedding(input_dim=pids, output_dim=EMBEDDING_SIZE, mask_zero=True), input_shape=(max_order, max_order_size)))
model.add(TimeDistributed(LSTM(10)))
model.add(LSTM(10))
model.add(Dense(5, activation="relu"))
model.add(Dense(pids, activation="softmax"))
model.compile(optimizer='Adam', loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
time_distributed_3 (TimeDist (None, 100, 102, 10)      496890    
_________________________________________________________________
time_distributed_4 (TimeDist (None, 100, 10)           840       
_________________________________________________________________
lstm_4 (LSTM)                (None, 10)                840       
_________________________________________________________________
dense_3 (Dense)              (None, 5)                 55        
_________________________________________________________________
dense_4 (Dense)              (None, 49689)             298134    
Total params: 796,759.0
Trainable params: 796,759
Non-trainable params: 0.0
_________________________________________________________________


In [42]:
model.fit_generator(tgen, steps_per_epoch=steps, epochs=10,
                    validation_data=test_gen, validation_steps=steps,
                    verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x1e6ba662470>

In [78]:
t = next(test_gen)
tr = t[1]
tr = pd.Series(tr[0])
tp = model.predict(t[0])
tp = pd.Series(tp[0])
res = pd.concat([tr.rename("real"), tp.rename("preds")], axis=1)
print(res[res.real == 1])
print(res[res.preds > 0.01].sort_values("preds", ascending=False))

       real  preds
13176  1.00   0.06
13535  1.00   0.00
35561  1.00   0.00
       real  preds
24852  0.00   0.10
13176  1.00   0.06
21137  0.00   0.04
47209  0.00   0.04
21903  0.00   0.03
22935  0.00   0.03
27845  0.00   0.02
27966  0.00   0.02
24964  0.00   0.02
47626  0.00   0.02
16797  0.00   0.02
28204  0.00   0.02
47766  0.00   0.02
4605   0.00   0.02
26209  0.00   0.02
49683  0.00   0.02
45007  0.00   0.01
19057  0.00   0.01
44632  0.00   0.01
45066  0.00   0.01
28985  0.00   0.01
37646  0.00   0.01
46979  0.00   0.01
5876   0.00   0.01
39877  0.00   0.01
30391  0.00   0.01
39275  0.00   0.01
40706  0.00   0.01
39928  0.00   0.01
