In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb
# Directory holding the raw CSV inputs. The trailing '/' matters because file
# paths are built by plain string concatenation (IDIR + '<name>.csv').
IDIR = '../data/original/'

In [2]:
# Read the prior-order line items with compact, explicitly pinned dtypes.
print('loading prior')
priors = pd.read_csv(
    IDIR + 'order_products__prior.csv',
    dtype=dict(
        order_id=np.int32,
        product_id=np.uint16,
        add_to_cart_order=np.int16,
        reordered=np.int8,
    ),
)

loading prior


In [3]:
# Read the train-order line items; same column layout and dtypes as the priors file.
print('loading train')
train = pd.read_csv(
    IDIR + 'order_products__train.csv',
    dtype=dict(
        order_id=np.int32,
        product_id=np.uint16,
        add_to_cart_order=np.int16,
        reordered=np.int8,
    ),
)

loading train


In [4]:
# Read the order headers (one row per order, all eval sets) with compact dtypes.
print('loading orders')
orders = pd.read_csv(
    IDIR + 'orders.csv',
    dtype=dict(
        order_id=np.int32,
        user_id=np.int32,
        eval_set='category',
        order_number=np.int16,
        order_dow=np.int8,
        order_hour_of_day=np.int8,
        # float32 rather than an integer type: this is the only column loaded as float
        days_since_prior_order=np.float32,
    ),
)

loading orders


In [5]:
print('loading products')
# Only the three id columns named in usecols are read, so the dtype map lists
# exactly those columns.
products = pd.read_csv(
    IDIR + 'products.csv',
    usecols=['product_id', 'aisle_id', 'department_id'],
    dtype={
        'product_id': np.uint16,
        'aisle_id': np.uint8,
        'department_id': np.uint8,
    },
)

loading products


In [6]:
# Print "<name> <shape>: <comma-separated column names>" for each loaded table.
for _name, _frame in (('priors', priors), ('orders', orders), ('train', train)):
    print('{} {}: {}'.format(_name, _frame.shape, ', '.join(_frame.columns)))

priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


In [7]:
print('computing product f')
# Per-product aggregates over every prior line item:
#   orders       - number of line items containing the product
#   reorders     - sum of the 'reordered' flag
#   reorder_rate - reorders / orders
reordered_by_product = priors['reordered'].groupby(priors.product_id)
prods = pd.DataFrame({
    'orders': reordered_by_product.size().astype(np.int32),
    'reorders': reordered_by_product.sum().astype(np.float32),
})
prods['reorder_rate'] = (prods['reorders'] / prods['orders']).astype(np.float32)

# Attach the aggregates to the catalogue and index it by product_id (column kept)
# so later cells can look products up by id.
products = products.join(prods, on='product_id').set_index('product_id', drop=False)
del prods, reordered_by_product

computing product f


In [8]:
print('add order info to priors')
# Index orders by order_id while keeping the column, so orders can be joined
# here and used as an order_id -> value lookup in later cells.
orders = orders.set_index('order_id', drop=False)
# The join brings in a second order_id column (suffixed '_'); drop that duplicate.
priors = priors.join(orders, on='order_id', rsuffix='_').drop('order_id_', axis=1)

add order info to priors


### user features

In [10]:
print('computing user f')
# Per-user aggregates taken from the orders table (all eval sets, not only priors).
usr = pd.DataFrame({
    'average_days_between_orders':
        orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32),
    'nb_orders': orders.groupby('user_id').size().astype(np.int16),
})


computing user f


In [12]:
# Per-user aggregates over the prior line items.
users = priors.groupby('user_id').size().astype(np.int16).to_frame('total_items')
# Set of every product id the user bought in prior orders; features() uses it
# as that user's candidate list.
users['all_products'] = priors.groupby('user_id')['product_id'].apply(set)
users['total_distinct_items'] = users['all_products'].map(len).astype(np.int16)

In [13]:
# Merge the order-level aggregates into the per-user table (index-aligned on user_id).
users = users.join(usr, how='left')
del usr
# NOTE(review): total_items counts prior line items only, while nb_orders was
# computed from the full orders table, so the denominator may include the
# user's train/test order - confirm this is the intended definition.
users['average_basket'] = users['total_items'].div(users['nb_orders']).astype(np.float32)
print('user f', users.shape)

user f (206209, 6)


In [14]:
# Preview the per-user feature table; head() keeps the rendered output bounded
# instead of asking the notebook to display the whole frame.
users.head(10)

Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,nb_orders,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.000000,11,5.363636
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285715,15,13.000000
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.000000,13,6.769231
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.000000,6,3.000000
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.500000,5,7.400000
6,14,"{40992, 27521, 20323, 48679, 8424, 45007, 2190...",12,13.333333,4,3.500000
7,206,"{11520, 35333, 519, 10504, 47623, 45066, 13198...",68,10.450000,21,9.809524
8,49,"{11136, 8193, 17794, 26882, 39812, 24838, 651,...",36,23.333334,4,12.250000
9,76,"{8834, 38277, 33787, 5002, 11790, 38159, 7952,...",58,22.000000,4,19.000000
10,143,"{36865, 20995, 13829, 43014, 11782, 18441, 476...",94,21.799999,6,23.833334


### userXproduct features

In [15]:
print('compute userXproduct f - this is long...')
# Pack (user_id, product_id) into one integer key: user_id * 100000 + product_id.
# NOTE(review): user_id is loaded as int32 and product_id as uint16, so this
# expression is presumably evaluated in int32; user_id * 100000 exceeds the
# int32 range once user_id > 21474, which would wrap and let distinct
# (user, product) pairs share a key - TODO confirm.
# features() recomputes the same expression (df.user_id * 100000 + df.product_id)
# for its lookups, so any dtype change here has to be mirrored there.
priors['user_product'] = priors.product_id + priors.user_id * 100000

compute userXproduct f - this is long...


In [16]:
# Aggregate, per packed user/product key, a tuple of
#   (times bought, latest (order_number, order_id), summed add_to_cart_order).
d = {}
for up_key, o_number, o_id, cart_pos in zip(priors.user_product,
                                            priors.order_number,
                                            priors.order_id,
                                            priors.add_to_cart_order):
    previous = d.get(up_key)
    if previous is None:
        d[up_key] = (1, (o_number, o_id), cart_pos)
    else:
        bought, latest, pos_sum = previous
        # Tuples compare element-wise, so max() keeps the pair with the highest
        # order_number (order_id only breaks ties).
        d[up_key] = (bought + 1,
                     max(latest, (o_number, o_id)),
                     pos_sum + cart_pos)

In [17]:
print('to dataframe (less memory)')
# One row per packed user/product key; the dict is released as soon as it is converted.
userXproduct = pd.DataFrame.from_dict(d, orient='index')
del d
userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart']

# The middle column holds (order_number, order_id) tuples; keep only the order_id.
userXproduct['last_order_id'] = (
    userXproduct['last_order_id'].map(lambda latest: latest[1]).astype(np.int32)
)
for _col in ('nb_orders', 'sum_pos_in_cart'):
    userXproduct[_col] = userXproduct[_col].astype(np.int16)
print('user X product f', len(userXproduct))

# The enriched priors frame is not referenced again in the visible notebook.
del priors


to dataframe (less memory)
user X product f 13293564


### train / test orders ###

In [19]:
print('split orders : train, test')
test_orders = orders.loc[orders['eval_set'] == 'test']
train_orders = orders.loc[orders['eval_set'] == 'train']

# Index train by (order_id, product_id), keeping both columns, so features()
# can test "(order_id, product_id) in train.index" when building labels.
train = train.set_index(['order_id', 'product_id'], drop=False)

split orders : train, test


### build list of candidate products to reorder, with features ###

In [21]:
def features(selected_orders, labels_given=False):
    """Build one candidate row per (order, previously bought product) plus features.

    For every order in ``selected_orders`` the candidates are all products in
    ``users.all_products`` for that order's user. The function reads the
    module-level frames ``users``, ``orders``, ``products`` and ``userXproduct``,
    and ``train`` when labels are requested.

    Parameters
    ----------
    selected_orders : DataFrame
        Orders to expand; rows are iterated with ``itertuples`` and must expose
        ``order_id`` and ``user_id``.
    labels_given : bool, default False
        When true, each candidate is labelled by whether
        ``(order_id, product_id)`` is present in ``train.index``.

    Returns
    -------
    tuple
        ``(df, labels)``: the feature frame and an int8 array of labels
        (empty when ``labels_given`` is false).
    """
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    for seen, order in enumerate(selected_orders.itertuples(), start=1):
        if seen % 10000 == 0:
            print('order row', seen)
        candidates = users.all_products[order.user_id]
        product_list.extend(candidates)
        order_list.extend([order.order_id] * len(candidates))
        if labels_given:
            # Iterating the same unmodified set again yields the same order as
            # above, so labels stay aligned with product_list.
            labels.extend((order.order_id, product) in train.index
                          for product in candidates)

    df = pd.DataFrame({'order_id': order_list, 'product_id': product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list, product_list

    print('user related features')
    # Kept as a local Series: user_id is only a lookup key, not an output column.
    user_id = df['order_id'].map(orders.user_id)
    user_lookups = (
        ('user_total_orders', users.nb_orders),
        ('user_total_items', users.total_items),
        ('total_distinct_items', users.total_distinct_items),
        ('user_average_days_between_orders', users.average_days_between_orders),
        ('user_average_basket', users.average_basket),
    )
    for column, lookup in user_lookups:
        df[column] = user_id.map(lookup)

    print('order related features')
    # Day-of-week features ('dow', 'UP_same_dow_as_last_order') are not built.
    df['order_hour_of_day'] = df['order_id'].map(orders.order_hour_of_day)
    df['days_since_prior_order'] = df['order_id'].map(orders.days_since_prior_order)
    df['days_since_ratio'] = df['days_since_prior_order'] / df['user_average_days_between_orders']

    print('product related features')
    df['aisle_id'] = df['product_id'].map(products['aisle_id'])
    df['department_id'] = df['product_id'].map(products['department_id'])
    df['product_orders'] = df['product_id'].map(products['orders']).astype(np.int32)
    df['product_reorders'] = df['product_id'].map(products['reorders'])
    df['product_reorder_rate'] = df['product_id'].map(products['reorder_rate'])

    print('user_X_product related features')
    # Must be the same packed key that indexes userXproduct
    # (priors.product_id + priors.user_id * 100000).
    # NOTE(review): both operands are int32 here, so user_id * 100000 presumably
    # wraps for user_id > 21474 - TODO confirm; keep both sites in sync if changed.
    up_key = user_id * 100000 + df['product_id']
    df['UP_orders'] = up_key.map(userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df['UP_orders'] / df['user_total_orders']).astype(np.float32)
    last_order_id = up_key.map(userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (
        up_key.map(userXproduct.sum_pos_in_cart) / df['UP_orders']
    ).astype(np.float32)
    # NOTE(review): same formula as UP_orders_ratio, so the two columns are
    # identical - confirm whether a different definition was intended.
    df['UP_reorder_rate'] = (df['UP_orders'] / df['user_total_orders']).astype(np.float32)
    df['UP_orders_since_last'] = df['user_total_orders'] - last_order_id.map(orders.order_number)
    # Circular distance on a 24-hour clock: the smaller of gap and 24 - gap.
    hour_gap = (df['order_hour_of_day'] - last_order_id.map(orders.order_hour_of_day)).abs()
    df['UP_delta_hour_vs_last'] = np.minimum(hour_gap, 24 - hour_gap).astype(np.int8)

    print(df.dtypes)
    print(df.memory_usage())
    return df, labels

In [22]:
# Expand the train orders into candidate rows; labels_given=True makes
# features() also return the int8 label array aligned with df_train's rows.
df_train, labels = features(train_orders, labels_given=True)

build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders                        int32
product_reorders                    float32
product_reorder_rate                float32
UP_or

In [24]:
# Columns of the feature frame that are fed to the model (order_id and
# product_id are identifiers and are left out).
# Not included: 'dow', 'UP_same_dow_as_last_order'.
f_to_use = [
    # user level
    'user_total_orders',
    'user_total_items',
    'total_distinct_items',
    'user_average_days_between_orders',
    'user_average_basket',
    # order level
    'order_hour_of_day',
    'days_since_prior_order',
    'days_since_ratio',
    # product level
    'aisle_id',
    'department_id',
    'product_orders',
    'product_reorders',
    'product_reorder_rate',
    # user x product level
    'UP_orders',
    'UP_orders_ratio',
    'UP_average_pos_in_cart',
    'UP_reorder_rate',
    'UP_orders_since_last',
    'UP_delta_hour_vs_last',
]

In [27]:
# List the columns of the training feature frame, for comparison with f_to_use.
df_train.columns

Index(['order_id', 'product_id', 'user_total_orders', 'user_total_items',
       'total_distinct_items', 'user_average_days_between_orders',
       'user_average_basket', 'order_hour_of_day', 'days_since_prior_order',
       'days_since_ratio', 'aisle_id', 'department_id', 'product_orders',
       'product_reorders', 'product_reorder_rate', 'UP_orders',
       'UP_orders_ratio', 'UP_average_pos_in_cart', 'UP_reorder_rate',
       'UP_orders_since_last', 'UP_delta_hour_vs_last'],
      dtype='object')

In [26]:
# int8 label array returned by features(): one entry per candidate row, set
# from whether (order_id, product_id) is present in train.index.
labels

array([0, 1, 1, ..., 0, 0, 0], dtype=int8)

In [28]:
# Preview the training feature frame; head() keeps the rendered output bounded
# instead of asking the notebook to display the whole frame.
df_train.head(10)

Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_prior_order,days_since_ratio,...,department_id,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last
0,1187899,17122,11,59,18,19.000000,5.363636,8,14.0,0.736842,...,4,13880,9377.0,0.675576,1,0.090909,6.000000,0.090909,6,7
1,1187899,196,11,59,18,19.000000,5.363636,8,14.0,0.736842,...,7,35791,27791.0,0.776480,10,0.909091,1.400000,0.909091,1,0
2,1187899,26405,11,59,18,19.000000,5.363636,8,14.0,0.736842,...,17,1214,536.0,0.441516,2,0.181818,5.000000,0.181818,7,1
3,1187899,46149,11,59,18,19.000000,5.363636,8,14.0,0.736842,...,7,8558,6953.0,0.812456,3,0.272727,3.000000,0.272727,1,0
4,1187899,14084,11,59,18,19.000000,5.363636,8,14.0,0.736842,...,16,15935,12923.0,0.810982,1,0.090909,2.000000,0.090909,10,0
5,1187899,13032,11,59,18,19.000000,5.363636,8,14.0,0.736842,...,14,3751,2465.0,0.657158,3,0.272727,6.333333,0.272727,1,0
6,1187899,26088,11,59,18,19.000000,5.363636,8,14.0,0.736842,...,19,2523,1360.0,0.539041,2,0.181818,4.500000,0.181818,9,1
7,1187899,39657,11,59,18,19.000000,5.363636,8,14.0,0.736842,...,19,5019,3846.0,0.766288,1,0.090909,3.000000,0.090909,1,0
8,1187899,12427,11,59,18,19.000000,5.363636,8,14.0,0.736842,...,19,6476,4797.0,0.740735,10,0.909091,3.300000,0.909091,1,0
9,1187899,25133,11,59,18,19.000000,5.363636,8,14.0,0.736842,...,16,6196,4586.0,0.740155,8,0.727273,4.000000,0.727273,1,0


In [29]:
print('formating for lgb')
# Wrap the selected feature columns and labels for LightGBM. Only aisle_id and
# department_id are declared categorical ('order_hour_of_day' is not).
d_train = lgb.Dataset(
    df_train[f_to_use],
    label=labels,
    categorical_feature=['aisle_id', 'department_id'],
)

formating for lgb


In [30]:
# LightGBM settings: binary objective evaluated with binary log-loss.
params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 100

print('light GBM train :-)')
bst = lgb.train(params, d_train, ROUNDS)
# lgb.plot_importance(bst, figsize=(9,20))
del d_train

### build candidates list for test ###

df_test, _ = features(test_orders)

print('light GBM predict')
preds = bst.predict(df_test[f_to_use])

df_test['pred'] = preds

TRESHOLD = 0.22  # guess, should be tuned with crossval on a subset of train data

# Collect, per order, the candidate products whose predicted probability is
# above the threshold. Explicit grouping replaces a try/except-as-control-flow
# that would also have swallowed unrelated errors.
picked = df_test.loc[df_test.pred > TRESHOLD, ['order_id', 'product_id']]
products_by_order = dict()
for picked_order, picked_product in zip(picked.order_id, picked.product_id):
    products_by_order.setdefault(picked_order, []).append(str(picked_product))

# Submission format: one space-separated product string per order.
d = {order_id: ' '.join(items) for order_id, items in products_by_order.items()}

# Every test order must appear in the submission; orders with no product above
# the threshold get the literal string 'None'.
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub.csv', index=False)

light GBM train :-)
build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
aisle_id                              uint8
department_id                         uint8
product_orders                        int32
product_reorders                    float32
product_reorder_rate                float32
UP_orders                             int16
UP_orders_ratio                     float

In [31]:
# Preview the test orders; head() keeps the rendered output bounded instead of
# asking the notebook to display the whole frame.
test_orders.head(10)

Unnamed: 0_level_0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
order_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2774568,2774568,3,test,13,5,15,11.0
329954,329954,4,test,6,3,12,30.0
1528013,1528013,6,test,4,3,16,22.0
1376945,1376945,11,test,8,6,11,8.0
1356845,1356845,12,test,6,1,20,30.0
2161313,2161313,15,test,23,1,9,7.0
1416320,1416320,16,test,7,0,13,7.0
1735923,1735923,19,test,10,6,17,8.0
1980631,1980631,20,test,5,1,11,30.0
139655,139655,22,test,16,5,6,1.0


In [32]:
# Preview the test candidates with their predicted probability ('pred');
# head() keeps the rendered output bounded.
df_test.head(10)

Unnamed: 0,order_id,product_id,user_total_orders,user_total_items,total_distinct_items,user_average_days_between_orders,user_average_basket,order_hour_of_day,days_since_prior_order,days_since_ratio,...,product_orders,product_reorders,product_reorder_rate,UP_orders,UP_orders_ratio,UP_average_pos_in_cart,UP_reorder_rate,UP_orders_since_last,UP_delta_hour_vs_last,pred
0,2774568,17668,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,2110,1220.0,0.578199,5,0.384615,3.600000,0.384615,2,3,0.347532
1,2774568,44683,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,22275,11981.0,0.537868,2,0.153846,9.500000,0.153846,7,1,0.071725
2,2774568,48523,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,5129,2376.0,0.463248,2,0.153846,6.500000,0.153846,4,1,0.093899
3,2774568,21903,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,241921,186884.0,0.772500,8,0.615385,4.250000,0.615385,1,0,0.619394
4,2774568,14992,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,29069,16942.0,0.582820,2,0.153846,7.000000,0.153846,6,0,0.087794
5,2774568,21137,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,264683,205845.0,0.777704,1,0.076923,7.000000,0.076923,11,4,0.073733
6,2774568,32402,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,2056,1328.0,0.645914,3,0.230769,8.333333,0.230769,3,1,0.200166
7,2774568,22035,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,59676,45639.0,0.764780,3,0.230769,3.666667,0.230769,5,2,0.160822
8,2774568,49683,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,97315,67313.0,0.691702,1,0.076923,4.000000,0.076923,10,1,0.042686
9,2774568,39190,13,88,33,12.000000,6.769231,15,11.0,0.916667,...,10972,6294.0,0.573642,10,0.769231,1.800000,0.769231,1,0,0.763212
