In [1]:
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.sparse
pd.set_option("display.max_columns",101)
RANDOM_STATE = 42
DATA_PATH = "../data/instacart/"

In [2]:
print('loading prior')
priors = pd.read_csv(DATA_PATH + 'order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})


loading prior


In [3]:
print('loading orders')
orders = pd.read_csv(DATA_PATH + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

loading orders


In [4]:
priors = priors.join(orders, on='order_id', rsuffix='_')
priors.drop('order_id_', inplace=True, axis=1)

In [5]:
print('loading train')
train = pd.read_csv(DATA_PATH + 'order_products__train.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

loading train


In [6]:
print ('split orders: train, test')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

split orders: train, test


In [23]:
users = pd.read_csv(DATA_PATH + "users_match.csv", dtype={
    'total_items': np.int32,
    'total_distinct_items': np.int32,
    'average_days_between_orders': np.float32,
    'nb_orders': np.int32,
    'average_basket': np.float32
})
products = pd.read_csv(DATA_PATH + "products_match.csv", dtype={
    'product_id': np.int32,
    'aisle_id': np.int32,
    'department_id': np.int32,
    'orders': np.int32,
    'reorders': np.int32,
    'reorder_rate': np.float64,
})


ValueError: cannot safely convert passed user dtype of int32 for float64 dtyped data in column 4

In [8]:
userXproduct = pd.read_csv(DATA_PATH + "user_product_match.csv", index_col=0, dtype={
    'np_orders': np.int32,
    'last_order_id': np.int32,
    'sum_pos_in_cart': np.int32,
})

In [9]:
def xx(a):
    tmp = a.strip()[6:-2].split(",")
    result = [int(x) for x in tmp if x]
    return result
users['all_products'] = users['all_products'].apply(xx)

In [10]:
def features(selected_orders, labels_given=False):
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    i=0
    for row in selected_orders.itertuples():
        i+=1
        if i%10000 == 0: print('order row',i)
        order_id = row.order_id
        user_id = row.user_id
        if user_id not in users.all_products:
            continue
            
        user_products = users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in train.index for product in user_products]
        
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    df.head()
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    print('user related features')
    df['user_id'] = df.order_id.map(orders.user_id)
    df['user_total_orders'] = df.user_id.map(users.nb_orders)
    df['user_total_items'] = df.user_id.map(users.total_items)
    df['total_distinct_items'] = df.user_id.map(users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(users.average_basket)
    
    print('order related features')
    df['dow'] = df.order_id.map(orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    
    print('product related features')
    df['aisle_id'] = df.product_id.map(products.aisle_id)
    df['department_id'] = df.product_id.map(products.department_id)
    df['product_orders'] = df.product_id.map(products.orders)
    df['product_reorders'] = df.product_id.map(products.reordered)
    df['product_reorder_rate'] = df.product_id.map(products.reorder_rate)

    print('user_X_product related features')
    df['z'] = df.user_id * 100000 + df.product_id
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.z.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders)
    df['UP_last_order_id'] = df.z.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.z.map(userXproduct.sum_pos_in_cart) / df.UP_orders)
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(orders.order_number)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(orders.order_hour_of_day)).map(lambda x: min(x, 24-x))
    #df['UP_same_dow_as_last_order'] = df.UP_last_order_id.map(orders.order_dow) == \
    #                                              df.order_id.map(orders.order_dow)

    df.drop(['UP_last_order_id', 'z'], axis=1, inplace=True)
    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)

In [19]:
df_train, labels = features(train_orders, labels_given=True)

f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
       'UP_delta_hour_vs_last'] # 'dow', 'UP_same_dow_as_last_order'

build candidate list
('order row', 10000)
('order row', 20000)
('order row', 30000)


KeyboardInterrupt: 

In [16]:
df_train.head(n=120)

Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,dow,order_hour_of_day,days_since_prior_order,days_since_ratio,aisle_id,department_id,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last
0,1187899,5066,23.0,-5.0,80.0,8.681818,10.913043,0,21,4.0,0.460733,29.0,13.0,3949.0,1306.0,0.330717,,,,,,
1,1187899,2573,23.0,-5.0,80.0,8.681818,10.913043,0,21,4.0,0.460733,40.0,8.0,478.0,297.0,0.621339,,,,,,
2,1187899,18961,23.0,-5.0,80.0,8.681818,10.913043,0,21,4.0,0.460733,63.0,9.0,7.0,1.0,0.142857,,,,,,
3,1187899,23,23.0,-5.0,80.0,8.681818,10.913043,0,21,4.0,0.460733,47.0,11.0,13.0,6.0,0.461538,,,,,,
4,1187899,32792,23.0,-5.0,80.0,8.681818,10.913043,0,21,4.0,0.460733,69.0,15.0,95.0,30.0,0.315789,,,,,,
5,1187899,22559,23.0,-5.0,80.0,8.681818,10.913043,0,21,4.0,0.460733,90.0,7.0,50.0,16.0,0.320000,,,,,,
6,1187899,13351,23.0,-5.0,80.0,8.681818,10.913043,0,21,4.0,0.460733,66.0,6.0,27.0,5.0,0.185185,,,,,,
7,1187899,47144,23.0,-5.0,80.0,8.681818,10.913043,0,21,4.0,0.460733,123.0,4.0,3094.0,1446.0,0.467356,,,,,,
8,1187899,45613,23.0,-5.0,80.0,8.681818,10.913043,0,21,4.0,0.460733,98.0,7.0,12.0,1.0,0.083333,,,,,,
9,1187899,10305,23.0,-5.0,80.0,8.681818,10.913043,0,21,4.0,0.460733,29.0,13.0,402.0,173.0,0.430348,,,,,,


In [17]:
users.total_items.head()

0    59
1   -61
2    88
3    18
4    37
Name: total_items, dtype: int8

In [147]:
m = int(len(labels) * 0.8)
df_train_data = df_train[:m]
label_train = labels[:m]

In [149]:
df_test_dat = df_train[m:]
label_test = labels[m:]

In [150]:
import lightgbm as lgb
d_train = lgb.Dataset(df_train_data[f_to_use],
                      label=label_train,
                      categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'

In [151]:
d_test = lgb.Dataset(df_test_dat[f_to_use],
                      label=label_test,
                      categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'

In [201]:
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 90,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5,
    'learning_rate': 0.5
}
ROUNDS = 200

print('light GBM train :-)')
bst = lgb.train(params, d_train, ROUNDS, valid_sets=[d_test,], early_stopping_rounds=10)
# lgb.plot_importance(bst, figsize=(9,20))

light GBM train :-)
[1]	valid_0's binary_logloss: 0.323658
Train until valid scores didn't improve in 10 rounds.
[2]	valid_0's binary_logloss: 0.18583
[3]	valid_0's binary_logloss: 0.117068
[4]	valid_0's binary_logloss: 0.0799512
[5]	valid_0's binary_logloss: 0.05934
[6]	valid_0's binary_logloss: 0.0478385
[7]	valid_0's binary_logloss: 0.0414852
[8]	valid_0's binary_logloss: 0.0380516
[9]	valid_0's binary_logloss: 0.0360823
[10]	valid_0's binary_logloss: 0.0350338
[11]	valid_0's binary_logloss: 0.0344978
[12]	valid_0's binary_logloss: 0.0343477
[13]	valid_0's binary_logloss: 0.0341736
[14]	valid_0's binary_logloss: 0.0341722
[15]	valid_0's binary_logloss: 0.034123
[16]	valid_0's binary_logloss: 0.0350164
[17]	valid_0's binary_logloss: 0.0346803
[18]	valid_0's binary_logloss: 0.0349269
[19]	valid_0's binary_logloss: 0.0351137
[20]	valid_0's binary_logloss: 0.0352634
[21]	valid_0's binary_logloss: 0.0349673
[22]	valid_0's binary_logloss: 0.0355816
[23]	valid_0's binary_logloss: 0.0354441

In [65]:
from collections import Counter
Counter(labels)

Counter({0: 8407170, 1: 59878})

In [193]:
np.mean(labels)

0.007071886211109232

In [160]:
bst.best_iteration

20

In [210]:
preduct_result =bst.predict(df_test_dat[f_to_use], num_iteration=bst.best_iteration)

In [211]:
np.mean(preduct_result)

0.007319956148933192

In [249]:
result = preduct_result > 0.1126
sum(result)

14486

In [78]:
df_test, _ = features(test_orders)

build candidate list
('order row', 10000)
('order row', 20000)
('order row', 30000)
('order row', 40000)
('order row', 50000)
('order row', 60000)
('order row', 70000)
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int64
user_total_items                       int8
total_distinct_items                  int64
user_average_days_between_orders    float64
user_average_basket                 float64
dow                                    int8
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float64
aisle_id                            float64
department_id                       float64
product_orders                      float64
product_reorders                    float64
product_reorder_rate                float64
UP_orders                 

In [207]:
print('light GBM predict')
preds = bst.predict(df_test[f_to_use], num_iteration=bst.best_iteration)

light GBM predict


In [251]:
df_test.shape

(4840331, 23)

In [252]:
df_train.shape

(8467048, 22)

In [250]:
print np.mean(preds)
print np.max(preds)
result_pred = preds > 0.1126
sum(result_pred)

0.00725967585354
0.99999999869


40931

In [254]:
df_test['pred'] = preds

TRESHOLD = 0.1126  # guess, should be tuned with crossval on a subset of train data

d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv(DATA_PATH + 'sub.csv', index=False)