In [1]:
import numpy as np
import pandas as pd
import lightgbm as lgb


In [2]:
IDIR = '../data/'


print('loading prior')
df_priors = pd.read_csv(IDIR + 'order_products__prior.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading train')
df_train = pd.read_csv(IDIR + 'order_products__train.csv', dtype={
            'order_id': np.int32,
            'product_id': np.uint16,
            'add_to_cart_order': np.int16,
            'reordered': np.int8})

print('loading orders')
df_orders = pd.read_csv(IDIR + 'orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'object',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

print('loading products')
df_products = pd.read_csv(IDIR + 'products.csv', dtype={
        'product_id': np.uint16,
        'order_id': np.int32,
        'aisle_id': np.uint8,
        'department_id': np.uint8},
        usecols=['product_id', 'aisle_id', 'department_id'])

print('priors {}: {}'.format(df_priors.shape, ', '.join(df_priors.columns)))
print('orders {}: {}'.format(df_orders.shape, ', '.join(df_orders.columns)))
print('train {}: {}'.format(df_train.shape, ', '.join(df_train.columns)))

loading prior
loading train
loading orders
loading products
priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered


# Compute product features

In [3]:
print('computing product features (num orders, num reorders, reorder rate)')
num_prior_orders = len(df_priors.order_id.unique())
df_prods_temp = pd.DataFrame()
df_prods_temp['orders'] = df_priors.groupby(df_priors.product_id).size().astype(np.int32)
df_prods_temp['reorders'] = df_priors['reordered'].groupby(df_priors.product_id).sum().astype(np.float32)
df_prods_temp['reorder_rate'] = (df_prods_temp.reorders / df_prods_temp.orders).astype(np.float32)
df_prods_temp['prob_purchase'] = (df_prods_temp['orders'] / num_prior_orders).astype(np.float32)
# aisle
#df_prods_temp['aisle_reorders'] = df_priors['reordered'].groupby(df_priors.product_id).sum().astype(np.float32)
df_products = df_products.join(df_prods_temp, on='product_id')
df_products.set_index('product_id', drop=False, inplace=True)
del df_prods_temp

computing product features (num orders, num reorders, reorder rate)


In [4]:
df_products.to_csv('df_products.csv')

# Aisle and department features

In [5]:
# Review to be completey sure
# aisle 
df_aisles = pd.DataFrame()
df_aisles['aisle_id'] = df_products.aisle_id.unique()
df_aisles.set_index('aisle_id', drop=False, inplace=True)
df_aisle_temp = pd.DataFrame()
df_prods_priors = df_priors.join(df_products, on='product_id', rsuffix='_')
df_aisle_temp['aisle_orders'] = df_prods_priors.groupby(df_prods_priors.aisle_id).size().astype(np.int32)
df_aisle_temp['aisle_reorders'] = df_prods_priors['reordered'].groupby(df_prods_priors.aisle_id).sum().astype(np.float32)
df_aisle_temp['aisle_reorder_rate'] = (df_aisle_temp.aisle_reorders / df_aisle_temp.aisle_orders).astype(np.float32)
df_aisle_temp['aisle_prob_purchase'] = (df_aisle_temp['aisle_orders'] / num_prior_orders).astype(np.float32)
# department
df_department = pd.DataFrame()
df_department['department_id'] = df_products.department_id.unique()
df_department.set_index('department_id', drop=False, inplace=True)
df_deparment_temp = pd.DataFrame()
df_deparment_temp['department_orders'] = df_prods_priors.groupby(df_prods_priors.department_id).size().astype(np.int32)
df_deparment_temp['department_reorders'] = df_prods_priors['reordered'].groupby(df_prods_priors.department_id).sum().astype(np.float32)
df_deparment_temp['department_reorder_rate'] = (df_deparment_temp.department_reorders / df_deparment_temp.department_orders).astype(np.float32)
df_deparment_temp['department_prob_purchase'] = (df_deparment_temp['department_orders'] / num_prior_orders).astype(np.float32)
# join
df_aisles = df_aisles.join(df_aisle_temp, on='aisle_id', rsuffix='_aisle_')
df_department = df_department.join(df_deparment_temp, on='department_id', rsuffix='_department_')
del df_aisle_temp
del df_deparment_temp
del df_prods_priors

In [6]:
df_department.head()

Unnamed: 0_level_0,department_id,department_orders,department_reorders,department_reorder_rate,department_prob_purchase
department_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
19,19,2887550,1657973.0,0.57418,0.898184
13,13,1875577,650301.0,0.346721,0.583406
7,7,2690129,1757892.0,0.65346,0.836776
1,1,2236432,1211890.0,0.541885,0.695652
11,11,447123,143584.0,0.321129,0.139079


In [7]:
print(len(df_prods_priors[df_prods_priors.aisle_id == 61]))
print(len(df_prods_priors[(df_prods_priors.aisle_id == 61) & (df_prods_priors.reordered == 1) ]))

NameError: name 'df_prods_priors' is not defined

In [None]:
print(len(df_prods_priors[df_prods_priors.department_id == 19]))
print(len(df_prods_priors[(df_prods_priors.department_id == 19) & (df_prods_priors.reordered == 1) ]))

In [19]:
df_aisles.to_csv('df_aisles.csv')
df_department.to_csv('df_department.csv')

# ADD ORDER INFO TO PRIOR DATA FRAME

In [8]:
print('add order info to priors')
df_orders.set_index('order_id', inplace=True, drop=False)
df_priors = df_priors.join(df_orders, on='order_id', rsuffix='_')
df_priors.drop('order_id_', inplace=True, axis=1)

add order info to priors


In [9]:
df_priors.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order
0,2,33120,1,1,202279,prior,3,5,9,8.0
1,2,28985,2,1,202279,prior,3,5,9,8.0
2,2,9327,3,0,202279,prior,3,5,9,8.0
3,2,45918,4,1,202279,prior,3,5,9,8.0
4,2,30035,5,0,202279,prior,3,5,9,8.0


# COMPUTE USER FEATURES

In [7]:
print('computing user features (average days between orders, num orders, total purchased items, all different products ids bought, num diff items, average basket)')
df_usr_temp = pd.DataFrame()
df_usr_temp['average_days_between_orders'] = df_orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
df_usr_temp['mean_hour_purchase'] = df_orders.groupby('user_id')['order_hour_of_day'].mean().astype(np.float32)
df_usr_temp['median_hour_purchase'] = df_orders.groupby('user_id')['order_hour_of_day'].median().astype(np.float32)
df_usr_temp['nb_orders'] = df_orders.groupby('user_id').size().astype(np.int16)
# About Day of the week
most_frequent_day = []
n_orders_most_frequent_day = []
for user_dow in df_orders.groupby('user_id')['order_dow']:
    most_frequent_day.append(user_dow[1].value_counts().idxmax())
    n_orders_most_frequent_day.append(user_dow[1].value_counts().max())
df_usr_temp['most_frequent_day'] = np.array(most_frequent_day).astype(np.int8)
df_usr_temp['n_orders_most_frequent_day'] = np.array(n_orders_most_frequent_day).astype(np.int16)
df_usr_temp['prop_orders_most_frequent_day'] = (df_usr_temp['n_orders_most_frequent_day'] / df_usr_temp['nb_orders']).astype(np.float32)
dow_last_prior_purchase = []
for user in df_orders[df_orders.eval_set == 'prior'].groupby('user_id'):
    dow_last_prior_purchase.append(user[1][user[1].order_number == user[1].order_number.max()].order_dow.values[0])
df_usr_temp['dow_last_prior_purchase'] = np.array(dow_last_prior_purchase).astype(np.int8)

df_users = pd.DataFrame()
df_users['total_items'] = df_priors.groupby('user_id').size().astype(np.int16)
df_users['all_products'] = df_priors.groupby('user_id')['product_id'].apply(set)
df_users['total_distinct_items'] = (df_users.all_products.map(len)).astype(np.int16)

df_users = df_users.join(df_usr_temp)
del df_usr_temp
df_users['average_basket'] = (df_users.total_items / df_users.nb_orders).astype(np.float32)
print('user f', df_users.shape)

computing user features (average days between orders, num orders, total purchased items, all different products ids bought, num diff items, average basket)
user f (206209, 12)


In [63]:
df_users.to_csv('df_users.csv')

In [8]:
df_users.head()

Unnamed: 0_level_0,total_items,all_products,total_distinct_items,average_days_between_orders,mean_hour_purchase,median_hour_purchase,nb_orders,most_frequent_day,n_orders_most_frequent_day,prop_orders_most_frequent_day,dow_last_prior_purchase,average_basket
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,59,"{17122, 196, 26405, 46149, 14084, 13032, 26088...",18,19.0,10.090909,8.0,11,4,4,0.363636,4,5.363636
2,195,"{45066, 2573, 18961, 23, 32792, 1559, 22559, 1...",102,16.285715,10.6,10.0,15,1,6,0.4,3,13.0
3,88,"{17668, 44683, 48523, 21903, 14992, 21137, 324...",33,12.0,16.307692,16.0,13,0,6,0.461538,1,6.769231
4,18,"{21573, 42329, 17769, 35469, 37646, 1200, 1905...",17,17.0,12.5,12.5,6,5,2,0.333333,5,3.0
5,37,"{11777, 40706, 28289, 48775, 20754, 6808, 1398...",23,11.5,15.0,16.0,5,3,2,0.4,1,7.4


# COMPUTE USER-PRODUCT FEATURES

In [49]:
print('compute userXproduct candidate features...')
df_priors['user_product'] = df_priors.product_id.astype(np.int64) + df_priors.user_id.astype(np.int64) * 100000 # Check there is no collision

d_user_product = dict() #tuples user-product(times_candidate, times_reordered, times_candidate_after_purchase_last_order, times_reordered_next_order)

n_users = len(df_priors.user_id.unique())
i = 0
for user,group in df_priors.groupby('user_id'):
    total_orders = group.order_number.max()
    for product_id in group.product_id.unique():
        product_orders = group[group.product_id == product_id]
        first_order = product_orders.order_number.min()
        times_candidate = total_orders - first_order
        times_reorder = len(product_orders) - 1
        next_order_candidate = len(group[(group.product_id == product_id) & (group.order_number < total_orders)]) 
        times_reorder_next_order = np.sum(np.ediff1d(product_orders.sort_values('order_number').order_number.values) == 1)
        user_product = product_id.astype(np.int64) + user.astype(np.int64) * 100000
        d_user_product[user_product] = (times_candidate, times_reorder, next_order_candidate, times_reorder_next_order)
    i += 1
    if i % 10000 == 0:
        print(str(i) + "-" + str(n_users))
        
    
    

compute userXproduct features - this is long...
100-206209


In [75]:
d_products = {} #tuples product (times_candidate, times_reordered, times_candidate_after_purchase_last_order, times_reordered_next_order, diff_users)
for user_product in d_user_product:
    product_id = user_product % 100000
    times_candidate = d_user_product[user_product][0]
    times_reordered = d_user_product[user_product][1]
    times_candidate_next_order = d_user_product[user_product][2]
    times_reordered_next_order = d_user_product[user_product][3]
    if product_id in d_products:
        d_products[product_id] = (d_products[product_id][0] + times_candidate,
                       d_products[product_id][1] + times_reordered,
                        d_products[product_id][2] + times_candidate_next_order,
                        d_products[product_id][3] + times_reordered_next_order,
                        d_products[product_id][4] + 1)
    else:
        d_products[product_id] = (times_candidate, times_reordered, times_candidate_next_order, times_reordered_next_order, 1)
        
        

3112962
12962
3822935
22935
2834448
34448
4849683
49683
5734429
34429
5832734
32734
1736736
36736
540706
40706
6520869
20869
2700093
93
8618027
18027
3147366
47366
2342959
42959
7143475
43475
1409076
9076
9437238
37238
2736183
36183
2808025
8025
7127104
27104
9928769
28769
1130563
30563
5439558
39558
9508193
8193
2736205
36205
7848015
48015
1917008
17008
3506258
6258
6340691
40691
2146388
46388
5423191
23191
5800029
29
6324318
24318
917600
17600
9912419
12419
2343014
43014
3702888
2888
3342441
42441
8618090
18090
1032299
32299
5226604
26604
2834551
34551
6242425
42425
1015937
15937
7929987
29987
4309124
9124
3948679
48679
5849227
49227
6144142
44142
9027730
27730
7913620
13620
2719894
19894
3539097
39097
6504605
4605
6324382
24382
1116869
16869
9011361
11361
3129513
29513
5505194
5194
9207987
7987
6504478
4478
213176
13176
1228985
28985
245948
45948
7815363
15363
1048775
48775
2900169
169
7127243
27243
3506384
6384
1409239
9239
8634584
34584
4128985
28985
6619356
19356
9924156
24156
48

In [87]:
df_userXproduct_candidate = pd.DataFrame.from_dict(d_user_product, orient='index')
df_userXproduct_candidate.columns = ['UP_times_candidate', 
                                     'UP_times_reordered', 
                                     'UP_times_candidate_next_order', 
                                     'UP_times_reordered_next_order']

del d_user_product

df_userXproduct_candidate.UP_times_candidate = df_userXproduct_candidate.UP_times_candidate.astype(np.int16)
df_userXproduct_candidate.UP_times_reordered = df_userXproduct_candidate.UP_times_reordered.astype(np.int16)
df_userXproduct_candidate.UP_times_candidate_next_order = df_userXproduct_candidate.UP_times_candidate_next_order.astype(np.int16)
df_userXproduct_candidate.UP_times_reordered_next_order = df_userXproduct_candidate.UP_times_reordered_next_order.astype(np.int16)

df_userXproduct_candidate.to_csv('df_userXproduct_candidate.csv')

In [88]:
df_userXproduct_candidate.head()

Unnamed: 0,UP_times_candidate,UP_times_reordered,UP_times_candidate_next_order,UP_times_reordered_next_order
3112962,1,0,1,0
3822935,10,1,2,0
2834448,0,0,0,0
4849683,0,0,0,0
5734429,0,0,0,0


In [92]:
df_product_candidate = pd.DataFrame.from_dict(d_products, orient='index')
df_product_candidate.columns = ['times_candidate', 
                                     'times_reordered', 
                                     'times_candidate_next_order', 
                                     'times_reordered_next_order',
                                     'diff_users']

#del d_products

df_product_candidate.times_candidate = df_product_candidate.times_candidate.astype(np.int32)
df_product_candidate.times_reordered = df_product_candidate.times_reordered.astype(np.int32)
df_product_candidate.times_candidate_next_order = df_product_candidate.times_candidate_next_order.astype(np.int32)
df_product_candidate.times_reordered_next_order = df_product_candidate.times_reordered_next_order.astype(np.int32)
df_product_candidate.diff_users = df_product_candidate.diff_users.astype(np.int32)

df_product_candidate.to_csv('df_product_candidate.csv')

In [93]:
df_product_candidate.head()

Unnamed: 0,times_candidate,times_reordered,times_candidate_next_order,times_reordered_next_order,diff_users
8193,114,6,13,2,11
27307,18,2,2,0,1
49157,2,0,1,0,1
4097,0,0,0,0,1
16393,13,0,1,0,1


In [9]:
from collections import Counter

print('compute userXproduct features - this is long...')
df_priors['user_product'] = df_priors.product_id.astype(np.int64) + df_priors.user_id.astype(np.int64) * 100000 # Check there is no collision

# dictionary of tuples user-product 
d = dict()  # (num orders, (last order number, last_order_id, sum_pos), sum_pos_car)
for row in df_priors.itertuples():
    user_product = row.user_product
    if user_product not in d:
        d[user_product] = (1,
                (row.order_number, row.order_id),
                row.add_to_cart_order)
    else:
        d[user_product] = (d[user_product][0] + 1,
                max(d[user_product][1], (row.order_number, row.order_id)),
                d[user_product][2] + row.add_to_cart_order)

compute userXproduct features - this is long...


In [10]:
from collections import Counter
# Second set of features
#mean_num_days_previous_orders, mean_hour, std_hour, most_common_day, ocurrences_most_common_day, proportion_most_common_day
i = 0
d_user_actual_day = {}
d2 = {}
for user,user_group in df_priors.groupby('user_id'):
    actual_day = 0
    actual_order = 1
    user_products = []
    for row in user_group.sort_values('order_number', ascending=True).itertuples():
        user_product = row.user_product
        user_products.append(user_product)
        if row.order_number != actual_order:
            actual_order += 1
            actual_day += row.days_since_prior_order
        if user_product not in d2:
            d2[user_product] = {}
            d2[user_product]['order_days'] = [actual_day]
            d2[user_product]['hours'] = [row.order_hour_of_day]
            d2[user_product]['days_of_week'] = [row.order_dow]
        else:
            d2[user_product]['order_days'].append(actual_day)
            d2[user_product]['hours'].append(row.order_hour_of_day)
            d2[user_product]['days_of_week'].append(row.order_dow)
    for user_product in list(set(user_products)):
        if len(d2[user_product]['order_days']) > 1:
            mean_freq_days_order = np.mean(np.ediff1d(d2[user_product]['order_days'])).astype(np.float32)
            median_freq_days_order = np.median(np.ediff1d(d2[user_product]['order_days'])).astype(np.float32)
            only_one_order = False
        else:
            mean_freq_days_order = np.float32(0.0)
            median_freq_days_order = np.float32(0.0)
            only_one_order = True
        day_last_order = np.int16(d2[user_product]['order_days'][-1])
        mean_hours = np.mean(d2[user_product]['hours']).astype(np.float32)
        std_hour = np.std(d2[user_product]['hours']).astype(np.float32)
        hour_last_order = np.int8(d2[user_product]['hours'][-1])
        counter=Counter(d2[user_product]['days_of_week'])
        most_common_day_of_week = np.int8(counter.most_common(1)[0][0])
        occurences_most_common_day_of_week = np.int16(counter.most_common(1)[0][1]) 
        day_of_week_last_order = np.int8(d2[user_product]['days_of_week'][-1])   
        d2[user_product] = (mean_freq_days_order, 
                           median_freq_days_order,
                           only_one_order,
                           day_last_order,
                           mean_hours,
                           std_hour,
                           hour_last_order,
                           most_common_day_of_week,
                           occurences_most_common_day_of_week,
                           day_of_week_last_order)
    d_user_actual_day[user] = actual_day
    if i % 10000 == 0:
        print(i)
    i += 1

0
10000
20000
30000
40000
50000
60000
70000
80000
90000
100000
110000
120000
130000
140000
150000
160000
170000
180000
190000
200000


In [11]:
# Join dictionaries 
for user_product in d:
    d[user_product] = (d[user_product][0], d[user_product][1], d[user_product][2], 
                       d2[user_product][0],
                       d2[user_product][1],
                       d2[user_product][2],
                       d2[user_product][3],
                       d2[user_product][4],
                       d2[user_product][5],
                       d2[user_product][6],
                       d2[user_product][7],
                       d2[user_product][8],
                       d2[user_product][9])
del d2

In [19]:
# Create dataframe 
print('to dataframe (less memory)')
df_userXproduct = pd.DataFrame.from_dict(d, orient='index')
del d
df_userXproduct.columns = ['nb_orders', 'last_order_id', 'sum_pos_in_cart', 
                           'mean_freq_days_order', 'median_freq_days_order',
                           'only_one_order', 'day_last_order',
                           'mean_hours', 'std_hour',
                           'hour_last_order', 'most_common_day_of_week',
                           'occurences_most_common_day_of_week', 'day_of_week_last_order']
df_userXproduct.nb_orders = df_userXproduct.nb_orders.astype(np.int16)
df_userXproduct.last_order_id = df_userXproduct.last_order_id.map(lambda x: x[1]).astype(np.int32)
df_userXproduct.sum_pos_in_cart = df_userXproduct.sum_pos_in_cart.astype(np.int16)
df_userXproduct.mean_freq_days_order = df_userXproduct.mean_freq_days_order.astype(np.float32)
df_userXproduct.median_freq_days_order = df_userXproduct.median_freq_days_order.astype(np.float32)
df_userXproduct.only_one_order = df_userXproduct.only_one_order.astype(np.bool_)
df_userXproduct.day_last_order = df_userXproduct.day_last_order.astype(np.int16)
df_userXproduct.mean_hours = df_userXproduct.mean_hours.astype(np.float32)
df_userXproduct.std_hour = df_userXproduct.std_hour.astype(np.float32)
df_userXproduct.hour_last_order = df_userXproduct.hour_last_order.astype(np.int8)
df_userXproduct.most_common_day_of_week = df_userXproduct.most_common_day_of_week.astype(np.int8)
df_userXproduct.occurences_most_common_day_of_week = df_userXproduct.occurences_most_common_day_of_week.astype(np.int8)
df_userXproduct.day_of_week_last_order = df_userXproduct.day_of_week_last_order.astype(np.int8)
df_userXproduct['proportion_most_common_day'] = (df_userXproduct.occurences_most_common_day_of_week / df_userXproduct.nb_orders).astype(np.float32)
print('user X product f', len(df_userXproduct))

del df_priors

to dataframe (less memory)


AttributeError: 'DataFrame' object has no attribute 'ocurrences_most_common_day'

In [24]:
df_userXproduct.head()

Unnamed: 0,nb_orders,last_order_id,sum_pos_in_cart,mean_freq_days_order,median_freq_days_order,only_one_order,day_last_order,mean_hours,std_hour,hour_last_order,most_common_day_of_week,occurences_most_common_day_of_week,day_of_week_last_order,proportion_most_common_day
14126415872,5,843810,63,34.5,35.5,False,323,12.6,2.059126,11,0,1,5,0.2
20535312385,1,2699553,9,0.0,0.0,True,22,12.0,0.0,12,3,1,3,1.0
1375731717,7,2844957,51,27.666666,21.0,False,238,14.142858,2.948538,17,6,4,6,0.571429
8959033352,4,1924807,57,12.333333,12.0,False,209,15.25,1.920286,16,4,2,6,0.5
7342828204,3,723315,13,26.5,26.5,False,90,15.666667,5.249339,11,2,1,5,0.333333


In [23]:
df_userXproduct.to_csv('df_userXproduct.csv')

In [25]:
d_user_actual_day

{1: 176.0,
 2: 198.0,
 3: 133.0,
 4: 55.0,
 5: 40.0,
 6: 18.0,
 7: 203.0,
 8: 60.0,
 9: 36.0,
 10: 79.0,
 11: 123.0,
 12: 100.0,
 13: 84.0,
 14: 265.0,
 15: 227.0,
 16: 109.0,
 17: 290.0,
 18: 28.0,
 19: 76.0,
 20: 15.0,
 21: 317.0,
 22: 190.0,
 23: 44.0,
 24: 264.0,
 25: 60.0,
 26: 138.0,
 27: 355.0,
 28: 269.0,
 29: 196.0,
 30: 151.0,
 31: 93.0,
 32: 81.0,
 33: 57.0,
 34: 80.0,
 35: 171.0,
 36: 350.0,
 37: 295.0,
 38: 231.0,
 39: 72.0,
 40: 105.0,
 41: 103.0,
 42: 208.0,
 43: 104.0,
 44: 60.0,
 45: 60.0,
 46: 319.0,
 47: 32.0,
 48: 195.0,
 49: 197.0,
 50: 357.0,
 51: 30.0,
 52: 236.0,
 53: 33.0,
 54: 363.0,
 55: 106.0,
 56: 78.0,
 57: 105.0,
 58: 130.0,
 59: 151.0,
 60: 174.0,
 61: 74.0,
 62: 139.0,
 63: 347.0,
 64: 90.0,
 65: 166.0,
 66: 150.0,
 67: 174.0,
 68: 141.0,
 69: 49.0,
 70: 229.0,
 71: 314.0,
 72: 168.0,
 73: 133.0,
 74: 70.0,
 75: 195.0,
 76: 176.0,
 77: 148.0,
 78: 82.0,
 79: 109.0,
 80: 104.0,
 81: 139.0,
 82: 201.0,
 83: 90.0,
 84: 159.0,
 85: 112.0,
 86: 324.0,
 87: 2

# TRAIN/TEST ORDERS

In [26]:
### train / test orders ###
print('split orders : train, test')
df_test_orders = df_orders[df_orders.eval_set == 'test']
df_train_orders = df_orders[df_orders.eval_set == 'train']

df_train.set_index(['order_id', 'product_id'], inplace=True, drop=False)

split orders : train, test


# BUILD LIST OF CANDIDATE PRODUCTS TO REORDER WITH FEATURES

In [27]:
### build list of candidate products to reorder, with features ###

def features(df_selected_orders, labels_given=False):
    print('build candidate list')
    order_list = []
    product_list = []
    labels = []
    i=0
    for row in df_selected_orders.itertuples():
        i+=1
        if i%10000 == 0: print('order row',i)
        order_id = row.order_id
        user_id = row.user_id
        user_products = df_users.all_products[user_id]
        product_list += user_products
        order_list += [order_id] * len(user_products)
        if labels_given:
            labels += [(order_id, product) in df_train.index for product in user_products]
        
    df = pd.DataFrame({'order_id':order_list, 'product_id':product_list}, dtype=np.int32)
    labels = np.array(labels, dtype=np.int8)
    del order_list
    del product_list
    
    print('user related features')
    df['user_id'] = df.order_id.map(df_orders.user_id)
    df['user_total_orders'] = df.user_id.map(df_users.nb_orders)
    df['user_total_items'] = df.user_id.map(df_users.total_items)
    df['total_distinct_items'] = df.user_id.map(df_users.total_distinct_items)
    df['user_average_days_between_orders'] = df.user_id.map(df_users.average_days_between_orders)
    df['user_average_basket'] =  df.user_id.map(df_users.average_basket)
    df['mean_hour_purchase'] = df.user_id.map(df_users.mean_hour_purchase) # NEW
    df['median_hour_purchase'] = df.user_id.map(df_users.median_hour_purchase) # NEW
    df['most_frequent_day'] = df.user_id.map(df_users.most_frequent_day) # NEW
    df['n_orders_most_frequent_day'] = df.user_id.map(df_users.n_orders_most_frequent_day) # NEW
    df['prop_orders_most_frequent_day'] = df.user_id.map(df_users.prop_orders_most_frequent_day) # NEW
    df['dow_last_prior_purchase'] = df.user_id.map(df_users.dow_last_prior_purchase) # NEW
    df['day_last_order'] = df['user_id'].map(d_user_actual_day).astype(np.int16) # NEW (day count, not day of the week)
    
    print('order related features')
    df['order_dow'] = df.order_id.map(df_orders.order_dow)
    df['order_hour_of_day'] = df.order_id.map(df_orders.order_hour_of_day)
    df['days_since_prior_order'] = df.order_id.map(df_orders.days_since_prior_order)
    df['days_since_ratio'] = df.days_since_prior_order / df.user_average_days_between_orders
    df['delta_hour_vs_average'] = abs(df.order_hour_of_day - df.mean_hour_purchase).map(lambda x: min(x, 24-x)).astype(np.float32)
    df['same_day_most_common_day'] = (df['most_frequent_day'] == df.order_dow) # NEW
    df['same_day_last_order_day'] = (df['dow_last_prior_purchase'] == df.order_dow) # NEW
    df['current_day'] = (df.day_last_order + df.days_since_prior_order).astype(np.int16) # NEW (day count, not day of the week)
    
    print('product related features')
    df['aisle_id'] = df.product_id.map(df_products.aisle_id)
    df['department_id'] = df.product_id.map(df_products.department_id)
    df['product_orders'] = df.product_id.map(df_products.orders).astype(np.int32)
    df['product_reorders'] = df.product_id.map(df_products.reorders)
    df['product_reorder_rate'] = df.product_id.map(df_products.reorder_rate)
    df['prob_purchase'] = df.product_id.map(df_products.prob_purchase) # NEW
    # Features about product is candidate
    df['times_candidate'] = df.product_id.map(df_product_candidate.times_candidate) # NEW
    df['times_reordered'] = df.product_id.map(df_product_candidate.times_reordered) # NEW
    df['times_candidate_next_order'] = df.product_id.map(df_product_candidate.times_candidate_next_order) # NEW
    df['times_reordered_next_order'] = df.product_id.map(df_product_candidate.times_reordered_next_order) # NEW
    
    print('aisle related features')
    df['aisle_orders'] = df.aisle_id.map(df_aisles.aisle_orders).astype(np.int32)
    df['aisle_reorders'] = df.aisle_id.map(df_aisles.aisle_reorders).astype(np.int32)
    df['aisle_reorder_rate'] = df.aisle_id.map(df_aisles.aisle_reorder_rate).astype(np.float32)
    df['aisle_prob_purchase'] = df.aisle_id.map(df_aisles.aisle_prob_purchase).astype(np.float32)
    
    print('department related features')
    df['department_orders'] = df.department_id.map(df_department.department_orders).astype(np.int32)
    df['department_reorders'] = df.department_id.map(df_department.department_reorders).astype(np.int32)
    df['department_reorder_rate'] = df.department_id.map(df_department.aisle_reorder_rate).astype(np.float32)
    df['department_prob_purchase'] = df.department_id.map(df_department.aisle_prob_purchase).astype(np.float32)

    print('user_X_product related features')
    df['user_product'] = df.user_id.astype(np.int64) * 100000 + df.product_id.astype(np.int64)
    df.drop(['user_id'], axis=1, inplace=True)
    df['UP_orders'] = df.user_product.map(df_userXproduct.nb_orders)
    df['UP_orders_ratio'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_last_order_id'] = df.user_product.map(df_userXproduct.last_order_id)
    df['UP_average_pos_in_cart'] = (df.user_product.map(df_userXproduct.sum_pos_in_cart) / df.UP_orders).astype(np.float32)
    df['UP_reorder_rate'] = (df.UP_orders / df.user_total_orders).astype(np.float32)
    df['UP_orders_since_last'] = df.user_total_orders - df.UP_last_order_id.map(df_orders.order_number)
    df['UP_delta_hour_vs_last'] = abs(df.order_hour_of_day - df.UP_last_order_id.map(df_orders.order_hour_of_day)).map(lambda x: min(x, 24-x)).astype(np.int8)
    df['UP_mean_freq_days_order'] = df.user_product.map(df_userXproduct.mean_freq_days_order) # NEW
    df['UP_median_freq_days_order'] = df.user_product.map(df_userXproduct.median_freq_days_order) # NEW
    df['UP_only_one_order'] = df.user_product.map(df_userXproduct.only_one_order) # NEW
    df['UP_day_last_order'] = df.user_product.map(df_userXproduct.day_last_order) # NEW
    # Here add the difference with the current day (), and this with the mean (use dictionary)
    df['UP_days_from_last_purchase'] = (df.current_day - df.UP_day_last_order).astype(np.int16) # NEW
    df['UP_difference_last_purchase_and_mean'] = np.abs(df.UP_days_from_last_purchase - df.UP_median_freq_days_order).astype(np.float32) # NEW
    df['UP_days_from_last_purchase_ratio'] = df.current_day / df.UP_day_last_order # NEW
    df['UP_mean_hours'] = df.user_product.map(df_userXproduct.mean_hours) # NEW
    df['UP_delta_hour_vs_average_hour'] = abs(df.order_hour_of_day - df.UP_mean_hours).map(lambda x: min(x, 24-x)).astype(np.float32) # NEW
    df['UP_std_hour'] = df.user_product.map(df_userXproduct.std_hour) # NEW
    df['UP_most_common_day_of_week'] = df.user_product.map(df_userXproduct.most_common_day_of_week) # NEW
    df['UP_occurences_most_common_day_of_week'] = df.user_product.map(df_userXproduct.occurences_most_common_day_of_week) # NEW
    df['UP_proportion_occurences_most_common_day_of_week'] = df.user_product.map(df_userXproduct.proportion_most_common_day) # NEW
    #df['UP_day_of_week_last_order'] = df.user_product.map(df_userXproduct.day_of_week_last_order) # NEW
    df['UP_same_day_most_common_day'] = (df['UP_most_common_day_of_week'] == df.order_dow) # NEW
    df['UP_day_of_week_last_order'] = df.UP_last_order_id.map(df_orders.order_dow).astype(np.int8) # NEW
    df['UP_same_day_last_order'] = (df['UP_day_of_week_last_order'] == df.order_dow) # NEW
    # Features about the number of times is candidate
    df['UP_times_candidate'] = df.user_product.map(df_userXproduct_candidate.UP_times_candidate) # NEW
    df['UP_times_reordered'] = df.user_product.map(df_userXproduct_candidate.UP_times_reordered) # NEW
    df['UP_times_candidate_next_order'] = df.user_product.map(df_userXproduct_candidate.UP_times_candidate_next_order) # NEW
    df['UP_times_reordered_next_order'] = df.user_product.map(df_userXproduct_candidate.UP_times_reordered_next_order) # NEW
        
    
    df.drop(['UP_last_order_id', 'user_product'], axis=1, inplace=True)
    print(df.dtypes)
    print(df.memory_usage())
    return (df, labels)
    

df_train, labels = features(df_train_orders, labels_given=True)

f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate', 'UP_orders_since_last',
       'UP_delta_hour_vs_last',
       'n_orders_most_frequent_day', 'prop_orders_most_frequent_day',  # NEW user features
       'same_day_most_common_day', 'same_day_last_order_day', # NEW order feautres (order-user)
       'prob_purchase']


build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
order row 80000
order row 90000
order row 100000
order row 110000
order row 120000
order row 130000
user related features
order related features
product related features
user_X_product related features
order_id                                              int32
product_id                                            int32
user_total_orders                                     int16
user_total_items                                      int16
total_distinct_items                                  int16
user_average_days_between_orders                    float32
user_average_basket                                 float32
mean_hour_purchase                                  float32
median_hour_purchase                                float32
most_frequent_day                                      int8
n_orders_most_frequent_day                            int16
prop_

In [35]:
# Some fixes - only execute if needed
df_train['UP_difference_last_purchase_and_mean'] = np.abs(df_train.UP_days_from_last_purchase - df_train.UP_median_freq_days_order).astype(np.float32) # NEW
df_train['day_last_order'] = df_train['day_last_order'].astype(np.int16)
df_train['current_day'] = df_train['current_day'].astype(np.int16) 
df_train['UP_days_from_last_purchase'] = df_train['UP_days_from_last_purchase'].astype(np.int16)
df_train['UP_days_from_last_purchase_ratio'] = df_train['UP_days_from_last_purchase_ratio'] .astype(np.float32)


In [61]:
# Remove unnecesary columns
df_train.drop(['day_last_order', 'UP_day_last_order', ], axis=1, inplace=True)

MemoryError: 

In [28]:
labels

array([0, 1, 1, ..., 0, 0, 0], dtype=int8)

In [58]:
df_train[['order_id', 'product_id', 'most_frequent_day', 'n_orders_most_frequent_day', 'prop_orders_most_frequent_day',
         'dow_last_prior_purchase']].head(5)

Unnamed: 0,order_id,product_id,most_frequent_day,n_orders_most_frequent_day,prop_orders_most_frequent_day,dow_last_prior_purchase
0,1187899,17122,4,4,0.363636,4
1,1187899,196,4,4,0.363636,4
2,1187899,26405,4,4,0.363636,4
3,1187899,46149,4,4,0.363636,4
4,1187899,14084,4,4,0.363636,4


# Save/Load training dataframe in csv

In [62]:
df_train.to_csv('df_train.csv')

In [34]:
# Save pickle of labels
import pickle
with open("pickles/labels.pickle", 'wb') as handle:
    pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL) 

In [72]:
# Load pickle
with open("pickles/labels.pickle", 'rb') as handle:
    labels = pickle.load(handle)

# Create a validation set to tune the threshold later

In [35]:
from sklearn.cross_validation  import train_test_split
df_train['labels'] = labels
df_train, df_val = train_test_split(df_train, test_size = 0.075)

# Training

In [36]:
print('formating for lgb')
d_train = lgb.Dataset(df_train[f_to_use],
                      label=df_train['labels'].values,
                      categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'
del df_train
d_val = lgb.Dataset(df_val[f_to_use],
                      label=df_val['labels'].values,
                      categorical_feature=['aisle_id', 'department_id'], free_raw_data=True)  # , 'order_hour_of_day', 'dow'

formating for lgb


In [77]:
# Simple training - Use it if we are not doing cross validation

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 30

print('light GBM train :-)')
bst = lgb.train(params=params, train_set=d_train, num_boost_round=ROUNDS, valid_sets=d_val, verbose_eval=10)
#lgb.plot_importance(bst, figsize=(9,20))
#del d_train

light GBM train :-)
[10]	valid_0's binary_logloss: 0.348581
[20]	valid_0's binary_logloss: 0.272318
[30]	valid_0's binary_logloss: 0.25294


In [76]:
print(bst.eval(d_train, name='train'))
print(bst.eval(d_val, name='valodat'))

[('train', 'binary_logloss', 0.25306100029480866, False)]
[('valodat', 'binary_logloss', 0.25294019447074295, False)]


In [18]:
# Train with Cross validation

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'},
    'num_leaves': 96,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
ROUNDS = 700

l_learning_rate= [0.15, 0.1, 0.05]
l_num_leaves = [106]
l_min_data_in_leaf = [110]

results = {}

for learning_rate in l_learning_rate:
    for num_leaves in l_num_leaves:
        for min_data_in_leaf in l_min_data_in_leaf:
            params['learning_rate'] = learning_rate
            params['num_leaves'] = num_leaves
            params['min_data_in_leaf'] = min_data_in_leaf

            print('light GBM train with learning rate ' + str(learning_rate) + ', num leaves ' + str(num_leaves) + ' and ' + str(min_data_in_leaf) + ': ')
            results[str(learning_rate) + str(num_leaves) + str(min_data_in_leaf)] = lgb.cv(params=params, train_set=d_train, num_boost_round=ROUNDS, nfold=5,
                         metrics=['binary_logloss', 'auc'], early_stopping_rounds=5, verbose_eval=50, seed=17)
#lgb.plot_importance(bst, figsize=(9,20))
#del d_train

light GBM train with learning rate 0.15, num leaves 106 and 110: 
[50]	cv_agg's binary_logloss: 0.245685 + 0.000651653	cv_agg's auc: 0.83308 + 0.000634807
[100]	cv_agg's binary_logloss: 0.244748 + 0.000659616	cv_agg's auc: 0.834641 + 0.00065596
[150]	cv_agg's binary_logloss: 0.24442 + 0.00065742	cv_agg's auc: 0.835173 + 0.000634414
[200]	cv_agg's binary_logloss: 0.244181 + 0.000704296	cv_agg's auc: 0.835581 + 0.000717619
[250]	cv_agg's binary_logloss: 0.243989 + 0.000711245	cv_agg's auc: 0.835915 + 0.000700959
[300]	cv_agg's binary_logloss: 0.243839 + 0.000682721	cv_agg's auc: 0.836168 + 0.000675322
[350]	cv_agg's binary_logloss: 0.243719 + 0.000706511	cv_agg's auc: 0.836361 + 0.00070007
[400]	cv_agg's binary_logloss: 0.243608 + 0.000690959	cv_agg's auc: 0.836533 + 0.000693244
light GBM train with learning rate 0.1, num leaves 106 and 110: 
[50]	cv_agg's binary_logloss: 0.246733 + 0.000636588	cv_agg's auc: 0.831646 + 0.000650419
[100]	cv_agg's binary_logloss: 0.245139 + 0.000676675	cv_

KeyboardInterrupt: 

In [38]:
# Train the single model with the selected parameters, and test with evaluation set

final_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': {'binary_logloss'}, #'auc'
    'num_leaves': 106,
    'min_data_in_leaf': 110,
    'max_depth': 10,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.95,
    'bagging_freq': 5
}
final_ROUNDS = 1000

print('light GBM train :-)')
bst = lgb.train(params=final_params, train_set=d_train, num_boost_round=final_ROUNDS, valid_sets=d_val, early_stopping_rounds=5, verbose_eval=20)
print(bst.eval(d_train, name='train'))
print(bst.eval(d_val, name='valodat'))

light GBM train :-)
Train until valid scores didn't improve in 5 rounds.
[20]	valid_0's binary_logloss: 0.273329
[40]	valid_0's binary_logloss: 0.24905
[60]	valid_0's binary_logloss: 0.246849
[80]	valid_0's binary_logloss: 0.246192
[100]	valid_0's binary_logloss: 0.24583
[120]	valid_0's binary_logloss: 0.24557
[140]	valid_0's binary_logloss: 0.245294
[160]	valid_0's binary_logloss: 0.245126
[180]	valid_0's binary_logloss: 0.245001
[200]	valid_0's binary_logloss: 0.244858
[220]	valid_0's binary_logloss: 0.244741
[240]	valid_0's binary_logloss: 0.244628
[260]	valid_0's binary_logloss: 0.244545
[280]	valid_0's binary_logloss: 0.244459
[300]	valid_0's binary_logloss: 0.244391
[320]	valid_0's binary_logloss: 0.244308
[340]	valid_0's binary_logloss: 0.24425
[360]	valid_0's binary_logloss: 0.244178
Early stopping, best iteration is:
[365]	valid_0's binary_logloss: 0.244168
[('train', 'binary_logloss', 0.23988801858937264, False)]
[('valodat', 'binary_logloss', 0.24416976618269667, False)]


In [39]:
# print feature importance
f_to_use
idx_features= np.arange(d_train.num_feature())
sorted_importance, sorted_idx = zip(*sorted(zip(bst.feature_importance(), idx_features), reverse=True))
for i in range(len(sorted_idx)):
    idx_feature = sorted_idx[i]
    print(str(f_to_use[idx_feature]) + ': ' + str(sorted_importance[i]))

days_since_ratio: 3564
total_distinct_items: 3061
user_average_basket: 2647
user_average_days_between_orders: 2606
user_total_items: 2552
UP_orders_since_last: 2510
product_reorder_rate: 2449
prop_orders_most_frequent_day: 1869
order_hour_of_day: 1758
days_since_prior_order: 1747
UP_orders_ratio: 1735
product_orders: 1651
aisle_id: 1518
UP_delta_hour_vs_last: 1415
UP_average_pos_in_cart: 1375
product_reorders: 1250
user_total_orders: 1196
department_id: 1012
n_orders_most_frequent_day: 904
UP_orders: 804
UP_reorder_rate: 223
same_day_most_common_day: 195
prob_purchase: 149
same_day_last_order_day: 125


In [40]:
# TODO: We could improve this with the real F1-score

# Tune threshold using validation set - Accuracy? f1_score? Do it individually per user and then compute mean?
from sklearn.metrics import accuracy_score, f1_score

thresholds = [0.15, 0.17, 0.19, 0.21, 0.23, 0.25, 0.27, 0.29]
preds = bst.predict(df_val[f_to_use])
acc_list = []
f1_list = []

for threshold in thresholds:
    y_pred= preds > threshold
    acc_list.append(accuracy_score(df_val['labels'].values, y_pred))
    f1_list.append(f1_score(df_val['labels'].values, y_pred))

pos_max = f1_list.index(max(f1_list))
print('Best threshold: ' + str(thresholds[pos_max]))
print('f1 score: ' + str(f1_list[pos_max]))
print('accuracy: ' + str(acc_list[pos_max]))
best_threshold = thresholds[pos_max]

Best threshold: 0.23
f1 score: 0.442139507064
accuracy: 0.879671176841


In [41]:
def multilabel_fscore(y_true, y_pred):
    """
    ex1:
    y_true = [1, 2, 3]
    y_pred = [2, 3]
    return: 0.8
    
    ex2:
    y_true = ["None"]
    y_pred = [2, "None"]
    return: 0.666
    
    ex3:
    y_true = [4, 5, 6, 7]
    y_pred = [2, 4, 8, 9]
    return: 0.25
    
    """
    y_true, y_pred = set(y_true), set(y_pred)
    precision = sum([1 for i in y_pred if i in y_true]) / len(y_pred)
    recall = sum([1 for i in y_true if i in y_pred]) / len(y_true)
    if precision + recall == 0:
        return 0
    return (2 * precision * recall) / (precision + recall)


thresholds = [0.15, 0.17, 0.19, 0.21, 0.23, 0.25, 0.27, 0.29]
preds = bst.predict(df_val[f_to_use])
df_val['pred'] = preds 
f1_list = []

for threshold in thresholds:
    f1_threshold = []
    for order,group_order in df_val.groupby('order_id'):
        y_true = group_order[group_order['labels'] == 1].product_id.values
        if len(y_true) == 0:
            y_true = ["None"]
        y_pred = group_order[group_order['pred'] > threshold].product_id.values
        if len(y_pred) == 0:
            y_pred = ["None"]
        f1_threshold.append(multilabel_fscore(y_true, y_pred))
    f1_list.append(np.mean(f1_threshold))

pos_max = f1_list.index(max(f1_list))
print('Best threshold: ' + str(thresholds[pos_max]))
print('f1 score: ' + str(f1_list[pos_max]))
best_threshold = thresholds[pos_max]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


KeyboardInterrupt: 

In [42]:
f1_list

[0.54082011382688788,
 0.56788957161287301,
 0.58918396791249938,
 0.60698961753748149,
 0.62147749131384566]

In [46]:
import pickle
# Train the final model with all the data

# Load pickle
with open("pickles/df_train.pickle", 'rb') as handle:
    df_train = pickle.load(handle)   
with open("pickles/labels.pickle", 'rb') as handle:
    labels = pickle.load(handle)
    
print('formating for lgb')
d_train = lgb.Dataset(df_train[f_to_use],
                      label=labels,
                      categorical_feature=['aisle_id', 'department_id'])  # , 'order_hour_of_day', 'dow'
final_ROUNDS = 460
print('light GBM train :-)')
bst = lgb.train(params=final_params, train_set=d_train, num_boost_round=final_ROUNDS, verbose_eval=10)

formating for lgb
light GBM train :-)


# Generate test predictions

In [47]:
### build candidates list for test ###

df_test, _ = features(df_test_orders)

print('light GBM predict')
preds = bst.predict(df_test[f_to_use])

df_test['pred'] = preds

build candidate list
order row 10000
order row 20000
order row 30000
order row 40000
order row 50000
order row 60000
order row 70000
user related features
order related features
product related features
user_X_product related features
order_id                              int32
product_id                            int32
user_total_orders                     int16
user_total_items                      int16
total_distinct_items                  int16
user_average_days_between_orders    float32
user_average_basket                 float32
mean_hour_purchase                  float32
median_hour_purchase                float32
most_frequent_day                      int8
n_orders_most_frequent_day            int16
prop_orders_most_frequent_day       float32
dow_last_prior_purchase                int8
order_dow                              int8
order_hour_of_day                      int8
days_since_prior_order              float32
days_since_ratio                    float32
delta_hour_vs_ave

In [48]:
TRESHOLD = best_threshold  

d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in df_test_orders.order_id:
    if order not in d:
        d[order] = 'None'

sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub.csv', index=False)

In [137]:
lgb.__version__

'2.0.1'