In [17]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline
# Load the pre-built feature tables, the training labels and the order
# metadata from CSV files in the working directory.

# Column dtypes shared by train_x.csv and test_x.csv. Keeping a single map
# guarantees both frames are parsed identically.
# NOTE(review): 'asile_id' and 'product_reorders_rate' do not match the
# 'aisle_id' / 'product_reorder_rate' names used in the training feature
# list, so these two entries presumably never apply -- confirm against the
# CSV headers. They are left unchanged because correcting them would change
# the loaded dtypes (and int8 may be too narrow for every aisle id).
# NOTE(review): several count columns are narrowed to int8/int16; verify the
# data actually fits those ranges.
FEATURE_DTYPES = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'user_total_orders': np.int8,
    'user_total_items': np.int8,
    'total_distinct_items': np.int8,
    'user_average_days_between_orders': np.float32,
    'user_average_basket': np.float32,
    'dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.int16,
    'days_since_ratio': np.float32,
    'asile_id': np.int8,
    'department_id': np.int8,
    'product_orders': np.int16,
    'product_reorders': np.int64,
    'product_reorders_rate': np.float32,
    'z': np.int32,
    'UP_orders': np.int8,
    'UP_orders_ratio': np.float32,
    'UP_last_order_id': np.int32,
    'UP_average_pos_in_cart': np.float32,
    'UP_reorder_rate': np.float32,
    'UP_orders_since_last': np.int8,
    'UP_delta_hour_vs_last': np.int8,
}

print('loading train data ')
train_x = pd.read_csv('train_x.csv', dtype=FEATURE_DTYPES)

# DataFrame.as_matrix() was removed in pandas 1.0; .values yields the same
# NumPy array and works on old and new pandas alike.
train_y = pd.read_csv('train_y.csv').values

print('loading test data')
test_x = pd.read_csv('test_x.csv', dtype=FEATURE_DTYPES)

print('loading orders')
orders = pd.read_csv('orders.csv', dtype={
        'order_id': np.int32,
        'user_id': np.int32,
        'eval_set': 'category',
        'order_number': np.int16,
        'order_dow': np.int8,
        'order_hour_of_day': np.int8,
        'days_since_prior_order': np.float32})

# Only the rows flagged as 'test' are needed later (to list every order that
# must appear in the submission), so the full table is released right away.
test_orders = orders[orders.eval_set == 'test']
del orders

print('finished')

loading train data 
loading test data
loading orders
finished


In [18]:
import lightgbm as lgb
# Store the labels as a compact int8 array (always a fresh copy).
train_y = np.asarray(train_y).astype(np.int8)

In [None]:
# Train a LightGBM binary classifier on the selected features, estimate its
# log loss with 10-fold cross-validation, then score every test row.

# Features used for training the model
f_to_use = ['user_total_orders', 'user_total_items', 'total_distinct_items',
       'user_average_days_between_orders', 'user_average_basket',
       'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
       'aisle_id', 'department_id', 'product_orders', 'product_reorders',
       'product_reorder_rate', 'UP_orders', 'UP_orders_ratio',
       'UP_average_pos_in_cart', 'UP_reorder_rate','dow','UP_orders_since_last','UP_delta_hour_vs_last']

train_xl = lgb.Dataset(train_x[f_to_use],
                       label=train_y,
                       categorical_feature=['aisle_id', 'department_id', 'dow', 'order_hour_of_day'],
                       free_raw_data=False)

# The Dataset keeps what it needs; drop the notebook-level references to the
# training frame and labels to reduce memory pressure.
del train_x
del train_y

# Parameters used for light_gbm
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq is set to a non-zero value; as written no bagging
# is performed. No seed is set either, so runs are not strictly reproducible.
lgb_params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'learning_rate': 0.012,
    'max_depth': 10,
    'num_leaves': 98,
    'objective': 'binary',
    'metric': 'binary_logloss',
    'feature_fraction': 0.92,
    'bagging_fraction': 0.98,
    'max_bin': 10}


# cross-validation
cv_result_lgb = lgb.cv(lgb_params,
                       train_xl,
                       num_boost_round=1000,
                       nfold=10,
                       stratified=True,
                       early_stopping_rounds=60,
                       verbose_eval=150,
                       show_stdv=True)

# 'binary_logloss-mean' holds one fold-averaged logloss per boosting round.
# The cross-validated score is the LAST entry of that curve; averaging the
# whole curve (as was done before) mixes in the high loss of the early rounds
# and is not an error rate, so it must not be reported as one.
cv_logloss_curve = cv_result_lgb['binary_logloss-mean']
num_boost_rounds_lgb = len(cv_logloss_curve)
print('num_boost_rounds_lgb=' + str(num_boost_rounds_lgb))
error_lgm = cv_logloss_curve[-1]
print('cross-validated binary logloss at final round', error_lgm)

# Training model with 100 rounds
# NOTE(review): the round count found by CV (num_boost_rounds_lgb) is not
# used here; confirm that a fixed 100 rounds is intentional.
rounds = 100
model_lgm = lgb.train(lgb_params, train_xl, rounds)
del train_xl

# Predicting: one score per test row, stored next to the features so the
# submission step can threshold it.
preds = model_lgm.predict(test_x[f_to_use])
test_x['preds'] = preds

In [None]:
# Build the submission file: for every test order, the space-separated ids of
# the products whose predicted score exceeds the cutoff, or 'None' when no
# product qualifies.

# Fixed probability cutoff applied to the model scores (hard-coded; it is not
# derived from the cross-validation output).
thres = 0.20

# Only the two needed columns of the qualifying rows are iterated, instead of
# materialising a tuple of every feature for every test row.
selected = test_x.loc[test_x['preds'] > thres, ['order_id', 'product_id']]

# order_id -> list of product-id strings, in row order. setdefault replaces
# the former bare `except:`, which would also have hidden unrelated errors.
products_by_order = {}
for order_id, product_id in zip(selected['order_id'], selected['product_id']):
    products_by_order.setdefault(order_id, []).append(str(product_id))

d = {order_id: ' '.join(products)
     for order_id, products in products_by_order.items()}

# Every test order must appear in the submission, even with no prediction.
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

# Constructing the frame with explicit column names also works when `d` is
# empty (the from_dict/reset_index route raised on the column assignment).
sub = pd.DataFrame(list(d.items()), columns=['order_id', 'products'])
sub.to_csv('subissionlgb.csv', index=False)