In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
color = sns.color_palette()

%matplotlib inline
# Reading datasets from external csv files.
#
# train_x.csv and test_x.csv are read with the same compact dtype map, so it is
# declared once and shared instead of being copy-pasted per file.
#
# Changes relative to the previous per-file maps:
#   * 'aisle_id' and 'product_reorder_rate' were spelled 'asile_id' and
#     'product_reorders_rate', which do not match the column names selected
#     later for training, so their dtype hints were never applied.
#   * count columns that were capped at int8 / int16 are widened so large
#     counts cannot wrap around while being narrowed.
#     NOTE(review): the value ranges are not visible from the notebook --
#     confirm against the CSVs before narrowing these again.
FEATURE_DTYPES = {
    'order_id': np.int32,
    'product_id': np.uint16,
    'user_total_orders': np.int8,
    'user_total_items': np.int32,        # was int8 (max 127)
    'total_distinct_items': np.int32,    # was int8 (max 127)
    'user_average_days_between_orders': np.float32,
    'user_average_basket': np.float32,
    'dow': np.int8,
    'order_hour_of_day': np.int8,
    'days_since_prior_order': np.int16,
    'days_since_ratio': np.float32,
    'aisle_id': np.int16,                # key fixed; int16 in case ids exceed 127 -- TODO confirm
    'department_id': np.int8,
    'product_orders': np.int32,          # was int16 although product_reorders is int64
    'product_reorders': np.int64,
    'product_reorder_rate': np.float32,  # key fixed
    'z': np.int32,
    'UP_orders': np.int8,
    'UP_orders_ratio': np.float32,
    'UP_last_order_id': np.int32,
    'UP_average_pos_in_cart': np.float32,
    'UP_reorder_rate': np.float32,
    'UP_orders_since_last': np.int8,
    'UP_delta_hour_vs_last': np.int8,
}

print('loading train data ')
train_x = pd.read_csv('train_x.csv', dtype=FEATURE_DTYPES)

print('loading test data')
test_x = pd.read_csv('test_x.csv', dtype=FEATURE_DTYPES)

print('finished')



loading train data 
loading test data
finished


In [2]:
# Training labels, read with pandas' default dtype inference.
train_y = pd.read_csv(filepath_or_buffer='train_y.csv')

In [3]:
# Sanity check: (rows, columns) of the training feature table.
train_x.shape

(8474661, 24)

In [6]:
# Convert the label frame to a plain NumPy array for xgboost.
# np.asarray replaces DataFrame.as_matrix(), which newer pandas no longer
# provides, and it is a no-op on an ndarray, so re-running this cell is safe.
train_y = np.asarray(train_y)

In [8]:
# Sanity check: total number of label values; compare with the row count of train_x.
train_y.size

8474661

In [9]:
# Columns handed to the booster, in the order the DMatrix will see them.
f_to_use = [
    'user_total_orders',
    'user_total_items',
    'total_distinct_items',
    'user_average_days_between_orders',
    'user_average_basket',
    'order_hour_of_day',
    'days_since_prior_order',
    'days_since_ratio',
    'aisle_id',
    'department_id',
    'product_orders',
    'product_reorders',
    'product_reorder_rate',
    'UP_orders',
    'UP_orders_ratio',
    'UP_average_pos_in_cart',
    'UP_reorder_rate',
    'dow',
    'UP_orders_since_last',
    'UP_delta_hour_vs_last',
]

# Pack features and labels into xgboost's own container, then drop the pandas /
# numpy originals so they do not sit in memory during training.
train_dmat = xgb.DMatrix(data=train_x[f_to_use], label=train_y)
del train_x, train_y

# Booster settings (the original note calls these "Grid Search CV optimized").
our_params = {
    'eta': 0.1,
    'subsample': 0.76,
    'colsample_bytree': 0.95,
    'eval_metric': 'auc',
    'gamma': 0.70,
    'objective': 'binary:logistic',
    'max_depth': 6,
    'min_child_weight': 10,
    'alpha': 2e-05,
    'lambda': 10,
}

# Cross-validation run kept for reference (disabled):
#
# cv_xgb = xgb.cv(params=our_params, dtrain=train_dmat, num_boost_round=1000, nfold=10,
#                 metrics=['error'],          # metrics must be passed as a list
#                 early_stopping_rounds=60)   # stop once the error stops improving
# cv_xgb.tail()

# Fit the final model with a fixed number of boosting rounds.
final_gb = xgb.train(params=our_params, dtrain=train_dmat, num_boost_round=100)
del train_dmat

# Score the test rows on the same feature columns.
test_dmat = xgb.DMatrix(test_x[f_to_use])
y_pred = final_gb.predict(test_dmat)
del test_dmat

# Keep each prediction next to its (order_id, product_id) row.
test_x['preds'] = y_pred

In [11]:
# Order metadata, read with explicit compact dtypes.
print('loading orders')
orders_dtypes = dict(
    order_id=np.int32,
    user_id=np.int32,
    eval_set='category',
    order_number=np.int16,
    order_dow=np.int8,
    order_hour_of_day=np.int8,
    days_since_prior_order=np.float32,
)
orders = pd.read_csv('orders.csv', dtype=orders_dtypes)

loading orders


In [12]:
# Missing gaps are replaced by `value`, after which the column is cast to integers.
value = 0
orders['days_since_prior_order'] = (
    orders['days_since_prior_order']
    .fillna(value)
    .astype(int)
)

# Per-column count of remaining nulls.
orders.isnull().sum()

order_id                  0
user_id                   0
eval_set                  0
order_number              0
order_dow                 0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

In [13]:
# Rows of `orders` whose eval_set is 'test'.
test_orders = orders.loc[orders['eval_set'] == 'test']

In [18]:
# Decision threshold applied to the predicted probability. Per the original
# author's note, it was set to the error rate encountered in cross validation.
thres = 0.19

# Keep only the (order, product) pairs whose prediction clears the threshold.
# Product ids are rendered as text up front so they can be joined per order.
picked = test_x.loc[test_x['preds'] > thres, ['order_id', 'product_id']]
picked = picked.assign(product_id=picked['product_id'].astype(str))

# order_id -> "pid pid ...". sort=False keeps orders in first-appearance order,
# matching what the previous row-by-row loop produced, without a Python-level
# loop and without a bare `except` that could hide unrelated errors.
d = (
    picked.groupby('order_id', sort=False)['product_id']
    .agg(' '.join)
    .to_dict()
)

# Every test order gets a row; orders with no selected product get the literal 'None'.
for order in test_orders.order_id:
    d.setdefault(order, 'None')

# Build the two-column frame explicitly so an empty result still has both columns.
sub = pd.DataFrame({'order_id': list(d.keys()), 'products': list(d.values())})
sub.to_csv('submission_xgboost.csv', index=False)