https://www.kaggle.com/sociopath00/random-forest-using-gridsearchcv

https://www.kaggle.com/paulantoine/light-gbm-benchmark-0-3692

https://www.kaggle.com/datatheque/association-rules-mining-market-basket-analysis


order_products__*.csv
- These files specify which products were purchased in each order. order_products__prior.csv contains previous order contents for all customers. 'reordered' indicates that the customer has a previous order that contains the product. Note that some orders will have no reordered items. You may predict an explicit 'None' value for orders with no reordered items. See the evaluation page for full details.

In [1]:
# conda install -c conda-forge lightgbm

import pandas as pd
import numpy as np
import os
import lightgbm as lgb
import xgboost as xgb
import itertools as it
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import random

from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import matplotlib
matplotlib.use('Agg')
from matplotlib import pyplot
from datetime import datetime

from scipy import stats
from scipy.stats import randint
from sklearn.metrics import precision_score,recall_score,accuracy_score,f1_score,roc_auc_score


import warnings
warnings.filterwarnings('ignore')
import gc

In [2]:
os.chdir("/axp/buanalytics/cscust360/dev/Anu/Kaggle")

aisles = pd.read_csv("aisles.csv",parse_dates=True, skiprows=0, na_values = ['NaN'], encoding='utf-8')

departments = pd.read_csv("departments.csv",parse_dates=True, skiprows=0, na_values = ['NaN'])

priors = pd.read_csv("order_products__prior.csv.filepart",parse_dates=True, skiprows=0, na_values = ['NaN'],
                    dtype={'order_id': np.int32,'product_id': np.uint16,'add_to_cart_order': np.int16,'reordered': np.int8})

train = pd.read_csv("order_products__train.csv.filepart",parse_dates=True, skiprows=0, na_values = ['NaN'],
                   dtype={'order_id': np.int32, 'product_id': np.uint16, 'add_to_cart_order': np.int16,'reordered': np.int8})

orders = pd.read_csv("orders.csv.filepart",parse_dates=True, skiprows=0, na_values = ['NaN'],
                    dtype={'order_id': np.int32,'user_id': np.int32,'eval_set': 'category','order_number': np.int16,
                           'order_dow': np.int8,'order_hour_of_day': np.int8,'days_since_prior_order': np.float32})

products = pd.read_csv("products.csv.filepart",parse_dates=True, skiprows=0, na_values = ['NaN'], 
                       dtype={'product_id': np.uint16, 'order_id': np.int32, 'aisle_id': np.uint8,'department_id': np.uint8},
                       usecols=['product_id', 'aisle_id', 'department_id'])

# sample_submission = pd.read_csv("sample_submission/sample_submission.csv",parse_dates=True, skiprows=0, na_values = ['NaN'])

print('priors {}: {}'.format(priors.shape, ', '.join(priors.columns)))
print('orders {}: {}'.format(orders.shape, ', '.join(orders.columns)))
print('train {}: {}'.format(train.shape, ', '.join(train.columns)))


# Add aisle and department name to the products dataset
products = products.merge(aisles, how = 'left', left_on = 'aisle_id', right_on = 'aisle_id')
products = products.merge(departments, how = 'left', left_on = 'department_id', right_on = 'department_id')

# Add order info to priors
print('add order info to priors')
priors = priors.merge(orders, how = 'left', left_on='order_id',right_on='order_id')

priors['user_buy_product_times'] = priors.groupby(['user_id', 'product_id']).cumcount()+1
priors['prod_buy_first_time'] = np.where(priors['user_buy_product_times'] == 1, 1, 0)
priors['prod_buy_second_time'] = np.where(priors['user_buy_product_times'] == 2, 1, 0)

# Create features at product, user and user x product level
# Product features
print('computing product features')
prods = pd.DataFrame()
df = priors.groupby('product_id').agg({'order_id': 'count', 'reordered': np.sum, 'add_to_cart_order': np.mean, 
                                       'order_hour_of_day': np.mean, 'order_dow': np.mean, 'days_since_prior_order': np.mean,
                                       'prod_buy_first_time': np.sum, 'prod_buy_second_time': np.sum})
prods['prod_ordered'] = df['order_id'].astype(np.int32)
prods['prod_reordered'] = df['reordered'].astype(np.float32)
prods['prod_reorder_rate'] = (prods.prod_reordered / prods.prod_ordered).astype(np.float32)
prods['prod_avg_pos_in_cart'] = df['add_to_cart_order'].astype(np.float32)
prods['prod_avg_hour_of_day'] = df['order_hour_of_day'].astype(np.float32)
prods['prod_avg_day_of_week'] = df['order_dow'].astype(np.float32)
prods['prod_avg_days_since_prior_order'] = df['days_since_prior_order'].astype(np.float32)

prods['prod_buy_first_time'] = df['prod_buy_first_time'].astype(np.int32)
prods['prod_buy_second_time'] = df['prod_buy_second_time'].astype(np.int32)
prods['prod_reorder_prob'] = prods.prod_buy_second_time / prods.prod_buy_first_time
prods['prod_reorder_times'] = 1 + prods.prod_reordered / prods.prod_buy_first_time #Look into this in the next verison

prods.reset_index(inplace=True)
products = products.merge(prods, how = 'left', left_on='product_id', right_on='product_id')
products.set_index('product_id', drop=False, inplace=True)
del prods
del df
             
### user features
print('computing user features')
users = pd.DataFrame()
                     
usr = pd.DataFrame()
usr['user_avg_days_bet_orders'] = orders.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
usr['user_tot_orders'] = orders.groupby('user_id').size().astype(np.int32)
usr.reset_index(inplace=True)
                   
df = priors.groupby('user_id').agg({'order_id': 'count', 'product_id': 'nunique', 'reordered': np.sum, 
                                    'add_to_cart_order': np.mean})

users['user_tot_prods'] = df['order_id'].astype(np.int32)
users['user_tot_dist_prods'] = df['product_id'].astype(np.int32)

users['user_prod_list'] = priors.groupby('user_id')['product_id'].apply(set)
users['user_reorderd_prods'] = df['reordered'].astype(np.int32)
users['user_reorderd_rate'] = (users['user_reorderd_prods']/users['user_tot_prods']).astype(np.float32)
users.reset_index(inplace=True)
users = users.merge(usr, how = 'left', left_on = 'user_id', right_on = 'user_id')
users['user_avg_order_size'] = (users.user_tot_prods / users.user_tot_orders).astype(np.float32)
users.set_index('user_id', drop=False, inplace=True)
print('users shape', users.shape)
del usr
del df

### user X product features
priors['user_product'] = priors.user_id.astype(str) + "-" + priors.product_id.astype(str)
user_product = priors.groupby(['user_product']).agg({'order_id': 'nunique','order_number': 'max', 
                                                             'add_to_cart_order': np.sum, 'reordered': np.sum, 
                                                             'days_since_prior_order': np.mean, 'order_hour_of_day': np.mean, 'order_dow': np.mean})
user_product['uxp_tot_orders'] = user_product['order_id']
user_product['uxp_sum_pos_in_cart']=user_product['add_to_cart_order']

user_product['uxp_avg_days_since_prior_order'] = user_product['days_since_prior_order']
user_product['uxp_avg_hour_of_day'] = user_product['order_hour_of_day'].astype(np.float32)
user_product['uxp_avg_day_of_week'] = user_product['order_dow'].astype(np.float32) # added 4/27

user_product['uxp_reordered']=user_product['reordered']
user_product['order_number_1'] = user_product['order_number']
user_product.drop(['add_to_cart_order','order_id','order_number','order_hour_of_day','order_dow'], axis=1, inplace=True)
user_product['order_number'] = user_product['order_number_1']
user_product.reset_index(inplace=True)

# Get relevant columns for last order for a user X product
user_product = user_product.merge(priors[['user_product','user_id','product_id','order_number','order_id','order_hour_of_day','order_dow']], how='left', left_on=['user_product','order_number'], right_on=['user_product','order_number'])
user_product['uxp_last_order_id'] = user_product['order_id']
user_product['uxp_last_order_number'] = user_product['order_number']
user_product['uxp_last_order_hour_of_day'] = user_product['order_hour_of_day']
user_product['uxp_last_order_dow'] = user_product['order_dow']
user_product.drop(['order_id','order_number','order_number_1','reordered','order_hour_of_day', 'order_dow'], inplace=True, axis=1)
user_product.set_index('user_product', drop=False, inplace=True)
# user_product.head()


### train / test orders ###
print('split orders : train, test')
test_orders = orders[orders.eval_set == 'test']
train_orders = orders[orders.eval_set == 'train']

# Assigning index to train data set
train.set_index(['order_id', 'product_id'], inplace=True, drop=False)


# Create labels and add features to train dataset
df_train = train_orders.merge(user_product[['user_id','product_id']], how = 'left', left_on = 'user_id', right_on = 'user_id')
print("Shape of df_train is :", df_train.shape)
labels = []
train_index = set(train.index)  
for row in df_train.itertuples():
    order_id = row.order_id
    product_id = row.product_id
    labels += [(order_id, product_id) in train_index]
labels = np.array(labels, dtype=np.int8)
print("Shape of labels array is:", len(labels))

df_train['labels'] = labels

print('user related features')
df_train['user_tot_orders'] = df_train.user_id.map(users.user_tot_orders)
df_train['user_tot_prods'] = df_train.user_id.map(users.user_tot_prods)
df_train['user_tot_dist_prods'] = df_train.user_id.map(users.user_tot_dist_prods)
df_train['user_avg_days_bet_orders'] = df_train.user_id.map(users.user_avg_days_bet_orders)
df_train['user_avg_order_size'] =  df_train.user_id.map(users.user_avg_order_size)
df_train['user_reorderd_prods'] =  df_train.user_id.map(users.user_reorderd_prods)
df_train['user_reorderd_rate'] =  df_train.user_id.map(users.user_reorderd_rate)
                    
                     
print('order related features')
df_train['days_since_ratio'] = df_train.days_since_prior_order / df_train.user_avg_days_bet_orders

print('product related features')
df_train['aisle_id'] = df_train.product_id.map(products.aisle_id)
df_train['department_id'] = df_train.product_id.map(products.department_id)
df_train['aisle'] = df_train.product_id.map(products.aisle)
df_train['department'] = df_train.product_id.map(products.department)
df_train['prod_ordered'] = df_train.product_id.map(products.prod_ordered)
df_train['prod_reordered'] = df_train.product_id.map(products.prod_reordered)
df_train['prod_reorder_rate'] = df_train.product_id.map(products.prod_reorder_rate)
df_train['prod_avg_pos_in_cart'] = df_train.product_id.map(products.prod_avg_pos_in_cart)
df_train['prod_avg_hour_of_day'] = df_train.product_id.map(products.prod_avg_hour_of_day)
df_train['prod_avg_day_of_week'] = df_train.product_id.map(products.prod_avg_day_of_week)
df_train['prod_avg_days_since_prior_order'] = df_train.product_id.map(products.prod_avg_days_since_prior_order)
df_train['prod_reorder_prob'] = df_train.product_id.map(products.prod_reorder_prob)
df_train['prod_reorder_times'] = df_train.product_id.map(products.prod_reorder_times)

# df_train.drop(['user_id'], axis=1, inplace=True)

print('user_X_product related features')
df_train['user_product'] = df_train.user_id.astype(str) + "-" + df_train.product_id.astype(str)
df_train['uxp_tot_orders'] = df_train.user_product.map(user_product.uxp_tot_orders)

df_train['uxp_last_order_id'] = df_train.user_product.map(user_product.uxp_last_order_id)
df_train['uxp_avg_pos_in_cart'] = df_train.user_product.map(user_product.uxp_sum_pos_in_cart)/df_train.uxp_tot_orders
df_train['uxp_reordered'] = df_train.user_product.map(user_product.uxp_reordered)

df_train['uxp_order_rate'] = df_train.uxp_tot_orders/df_train.user_tot_orders
df_train['uxp_reorder_rate'] = df_train.uxp_reordered/df_train.user_reorderd_prods

df_train['uxp_avg_days_since_prior_order'] = df_train.user_product.map(user_product.uxp_avg_days_since_prior_order).astype(np.float32)
df_train['uxp_avg_hour_of_day'] = df_train.user_product.map(user_product.uxp_avg_hour_of_day).astype(np.float32)
df_train['uxp_avg_day_of_week'] = df_train.user_product.map(user_product.uxp_avg_day_of_week).astype(np.float32)

df_train['uxp_orders_since_last_order'] = df_train.user_tot_orders - df_train.user_product.map(user_product.uxp_last_order_number)
df_train['uxp_delta_hour_vs_last'] = abs(df_train.order_hour_of_day - df_train.user_product.map(user_product.uxp_last_order_hour_of_day)).map(lambda x: min(x, 24-x))
# df_train['uxp_delta_day_vs_last'] = abs(df_train.order_dow - df_train.user_product_key.map(user_product.uxp_last_order_dow)).map(lambda x: min(x, 7-x))
df_train['uxp_same_dow_as_last_order'] = df_train.user_product.map(user_product.uxp_last_order_dow) == df_train.order_dow


features_to_use = ['order_number', 'order_dow','order_hour_of_day', 'days_since_prior_order', 'product_id','user_tot_orders', 
                'user_tot_prods', 'user_tot_dist_prods', 'user_avg_days_bet_orders', 'user_avg_order_size','user_reorderd_prods',
                'user_reorderd_rate', 'aisle','department', 'prod_ordered', 'prod_reordered', 'prod_reorder_rate',
                'prod_avg_pos_in_cart', 'prod_avg_hour_of_day', 'prod_avg_day_of_week', 'prod_avg_days_since_prior_order', 
                'user_product_key', 'uxp_tot_orders', 'uxp_order_rate', 'uxp_last_order_id', 'uxp_avg_pos_in_cart', 'uxp_reordered', 'uxp_reorder_rate',
                  'uxp_orders_since_last_order', 'uxp_delta_hour_vs_last', 'uxp_avg_days_since_prior_order', 
                   'uxp_avg_hour_of_day', 'uxp_avg_day_of_week', 'prod_reorder_prob', 'prod_reorder_times', 
                   'uxp_order_rate', 'uxp_same_dow_as_last_order']

# df_train.head()


# Preparation of df_test features
df_test = test_orders.merge(user_product[['user_id','product_id']], how = 'left', left_on = 'user_id', right_on = 'user_id')


print('user related features')
df_test['user_tot_orders'] = df_test.user_id.map(users.user_tot_orders)
df_test['user_tot_prods'] = df_test.user_id.map(users.user_tot_prods)
df_test['user_tot_dist_prods'] = df_test.user_id.map(users.user_tot_dist_prods)
df_test['user_avg_days_bet_orders'] = df_test.user_id.map(users.user_avg_days_bet_orders)
df_test['user_avg_order_size'] =  df_test.user_id.map(users.user_avg_order_size)
df_test['user_reorderd_prods'] =  df_test.user_id.map(users.user_reorderd_prods)
df_test['user_reorderd_rate'] =  df_test.user_id.map(users.user_reorderd_rate)
                    
                     
print('order related features')
df_test['days_since_ratio'] = df_test.days_since_prior_order / df_test.user_avg_days_bet_orders

print('product related features')
df_test['aisle_id'] = df_test.product_id.map(products.aisle_id)
df_test['department_id'] = df_test.product_id.map(products.department_id)
df_test['aisle'] = df_test.product_id.map(products.aisle)
df_test['department'] = df_test.product_id.map(products.department)
df_test['prod_ordered'] = df_test.product_id.map(products.prod_ordered)
df_test['prod_reordered'] = df_test.product_id.map(products.prod_reordered)
df_test['prod_reorder_rate'] = df_test.product_id.map(products.prod_reorder_rate)
df_test['prod_avg_pos_in_cart'] = df_test.product_id.map(products.prod_avg_pos_in_cart)
df_test['prod_avg_hour_of_day'] = df_test.product_id.map(products.prod_avg_hour_of_day)
df_test['prod_avg_day_of_week'] = df_test.product_id.map(products.prod_avg_day_of_week)
df_test['prod_avg_days_since_prior_order'] = df_test.product_id.map(products.prod_avg_days_since_prior_order)
df_test['prod_reorder_prob'] = df_test.product_id.map(products.prod_reorder_prob)
df_test['prod_reorder_times'] = df_test.product_id.map(products.prod_reorder_times)

# df_test.drop(['user_id'], axis=1, inplace=True)

print('user_X_product related features')
df_test['user_product'] = df_test.user_id.astype(str) + "-" + df_test.product_id.astype(str)
df_test['uxp_tot_orders'] = df_test.user_product.map(user_product.uxp_tot_orders)

df_test['uxp_last_order_id'] = df_test.user_product.map(user_product.uxp_last_order_id)
df_test['uxp_avg_pos_in_cart'] = df_test.user_product.map(user_product.uxp_sum_pos_in_cart)/df_test.uxp_tot_orders
df_test['uxp_reordered'] = df_test.user_product.map(user_product.uxp_reordered)

df_test['uxp_order_rate'] = df_test.uxp_tot_orders/df_test.user_tot_orders
df_test['uxp_reorder_rate'] = df_test.uxp_reordered/df_test.user_reorderd_prods

df_test['uxp_avg_days_since_prior_order'] = df_test.user_product.map(user_product.uxp_avg_days_since_prior_order).astype(np.float32)
df_test['uxp_avg_hour_of_day'] = df_test.user_product.map(user_product.uxp_avg_hour_of_day).astype(np.float32)
df_test['uxp_avg_day_of_week'] = df_test.user_product.map(user_product.uxp_avg_day_of_week).astype(np.float32)

df_test['uxp_orders_since_last_order'] = df_test.user_tot_orders - df_test.user_product.map(user_product.uxp_last_order_number)
df_test['uxp_delta_hour_vs_last'] = abs(df_test.order_hour_of_day - df_test.user_product.map(user_product.uxp_last_order_hour_of_day)).map(lambda x: min(x, 24-x))
# df_test['uxp_delta_day_vs_last'] = abs(df_test.order_dow - df_test.user_product_key.map(user_product.uxp_last_order_dow)).map(lambda x: min(x, 7-x))
df_test['uxp_same_dow_as_last_order'] = df_test.user_product.map(user_product.uxp_last_order_dow) == df_test.order_dow

gc.collect()
print(df_train.shape)
print(df_test.shape)

priors (32434489, 4): order_id, product_id, add_to_cart_order, reordered
orders (3421083, 7): order_id, user_id, eval_set, order_number, order_dow, order_hour_of_day, days_since_prior_order
train (1384617, 4): order_id, product_id, add_to_cart_order, reordered
add order info to priors
computing product features
computing user features
users shape (206209, 9)
split orders : train, test
Shape of df_train is : (8474661, 8)
Shape of labels array is: 8474661
user related features
order related features
product related features
user_X_product related features
user related features
order related features
product related features
user_X_product related features
(8474661, 43)
(4833292, 42)


# 1. Light GBM Model

In [6]:
import psutil
print(psutil.virtual_memory())

svmem(total=270008086528, available=127754268672, percent=52.7, used=137012580352, free=29907632128, active=148541300736, inactive=85502771200, buffers=896446464, cached=102191427584, shared=734601216, slab=2482016256)


In [7]:
df_train.to_csv('/axp/buanalytics/cscust360/dev/Anu/Kaggle/df_train.csv')
df_test.to_csv('/axp/buanalytics/cscust360/dev/Anu/Kaggle/df_test.csv')

In [8]:
df_train.shape

(8474661, 43)

In [4]:
df_train = pd.read_csv('/axp/buanalytics/cscust360/dev/Anu/Kaggle/df_train.csv')
df_test = pd.read_csv('/axp/buanalytics/cscust360/dev/Anu/Kaggle/df_test.csv')
print(df_train.shape)
print(df_test.shape)

(8474661, 44)
(4833292, 43)


In [7]:
df_train.drop('Unnamed: 0', axis=1, inplace=True)
df_test.drop('Unnamed: 0', axis=1, inplace=True)
print(df_train.shape)
print(df_test.shape)

(8474661, 43)
(4833292, 42)


In [3]:
features_to_use = ['order_number', 'order_dow','order_hour_of_day', 'days_since_prior_order',
                   'user_tot_orders', 'user_tot_prods', 'user_tot_dist_prods','user_avg_days_bet_orders', 
                   'user_avg_order_size','user_reorderd_prods', 'user_reorderd_rate', 'days_since_ratio',
                   'aisle_id', 'department_id','prod_ordered','prod_reordered', 'prod_reorder_rate', 
                   'prod_avg_pos_in_cart','prod_avg_hour_of_day', 'prod_avg_day_of_week','prod_avg_days_since_prior_order', 
                   'prod_reorder_prob','prod_reorder_times', 'uxp_tot_orders','uxp_last_order_id', 
                   'uxp_avg_pos_in_cart', 'uxp_reordered','uxp_order_rate', 'uxp_reorder_rate', 'uxp_avg_days_since_prior_order',
                   'uxp_avg_hour_of_day', 'uxp_avg_day_of_week', 'uxp_orders_since_last_order', 'uxp_delta_hour_vs_last',
                   'uxp_same_dow_as_last_order']

features_to_use_1 = ['user_tot_orders', 'user_tot_prods', 'user_tot_dist_prods', 'user_avg_days_bet_orders', 
                     'user_avg_order_size', 'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio','aisle_id', 
                     'department_id','prod_ordered', 'prod_reordered', 'prod_reorder_rate','uxp_tot_orders', 'uxp_order_rate', 
                     'uxp_avg_pos_in_cart','uxp_reorder_rate','uxp_orders_since_last_order', 'uxp_delta_hour_vs_last']

print('formating for lgb')
d_train = lgb.Dataset(df_train[features_to_use], label=df_train['labels'],categorical_feature=['aisle_id', 'department_id'])
# del df_train

params = {'task': 'train', 'boosting_type': 'gbdt', 'objective': 'binary', 'metric': {'binary_logloss'}, 'num_leaves': 96,
          'max_depth': 10, 'feature_fraction': 0.9, 'bagging_fraction': 0.95, 'bagging_freq': 5
}
ROUNDS = 100

print('light GBM train :-)')
bst = lgb.train(params, d_train, ROUNDS)
# lgb.plot_importance(bst, figsize=(9,20))
del d_train

print('light GBM predict')
preds = bst.predict(df_test[features_to_use])

df_test['pred'] = preds

TRESHOLD = 0.22  # guess, should be tuned with crossval on a subset of train data

d = dict()
for row in df_test.itertuples():
    if row.pred > TRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub_lgbm_15Jun_allvars.csv', index=False)

formating for lgb
light GBM train :-)
light GBM predict


# 2. Random Forest Classification

In [6]:
numeric = ['user_tot_orders', 'user_tot_prods', 'user_tot_dist_prods', 'user_avg_days_bet_orders', 'user_avg_order_size', 
           'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio', 'prod_ordered', 'prod_reordered', 'prod_reorder_rate',
           'uxp_tot_orders', 'uxp_order_rate', 'uxp_avg_pos_in_cart','uxp_reorder_rate','uxp_orders_since_last_order', 'uxp_delta_hour_vs_last']

impute_values = {}
for col in numeric:
    try:
        impute_values[col] = df_train[col].median()
    except:
        pass
# features_to_use_1 = numeric + categorical
df_train[numeric] = df_train[numeric].fillna(impute_values)
del impute_values

impute_values = {}
for col in numeric:
    try:
        impute_values[col] = df_test[col].median()
    except:
        pass
# features_to_use_1 = numeric + categorical
df_test[numeric] = df_test[numeric].fillna(impute_values)
print("imputation done")


features_to_use_1 = ['user_tot_orders', 'user_tot_prods', 'user_tot_dist_prods', 'user_avg_days_bet_orders', 
                     'user_avg_order_size', 'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
                     'department','prod_ordered', 'prod_reordered', 'prod_reorder_rate','uxp_tot_orders', 'uxp_order_rate', 
                     'uxp_avg_pos_in_cart','uxp_reorder_rate','uxp_orders_since_last_order', 'uxp_delta_hour_vs_last']
X_train= pd.get_dummies(df_train.loc[:,features_to_use_1])
variables = X_train.columns.tolist()
print('TRAIN DATA SIZE: ', X_train.shape)
# X_train.to_csv("/axp/buanalytics/cscust360/dev/Anu/Kaggle/X_train_rem_aisle.csv")
y_train = df_train['labels']
# y_train.to_csv("/axp/buanalytics/cscust360/dev/Anu/Kaggle/y_train_rem_aisle.csv")
# del df_train

X_test = pd.get_dummies(df_test.loc[:,features_to_use_1])
test_cols = set(X_test.columns)
columns_not_present = set(variables) - test_cols
for col in columns_not_present:
    X_test[col] = 0
X_test = X_test[variables]
print('TEST DATA SIZE: ', X_test.shape)
# X_test.to_csv("/axp/buanalytics/cscust360/dev/Anu/Kaggle/X_test_rem_aisle.csv")
# del df_test
gc.collect()


for column in X_train.columns:
    if X_train[column].dtype == 'float64':
        X_train[column] = X_train[column].astype(np.float32)
y_train = y_train.astype(np.int8)
for column in X_test.columns:
    if X_test[column].dtype == 'float64':
        X_test[column] = X_test[column].astype(np.float32)

imputation done
TRAIN DATA SIZE:  (8474661, 38)
TEST DATA SIZE:  (4833292, 38)


In [7]:
from sklearn.metrics import roc_auc_score
from sklearn.ensemble import RandomForestClassifier

# Number of trees in random forest
n_estimators = [50,70]

# Number of features to consider at every split
max_features = ['auto'] #'sqrt'

# Maximum number of levels in tree
max_depth = [5,7]
# max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5]

# Minimum number of samples required at each leaf node
min_samples_leaf = [2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

# Create the random grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth}
#                'min_samples_split': min_samples_split,
#                'min_samples_leaf': min_samples_leaf,
#                'bootstrap': bootstrap}

print(param_grid)

rfc = RandomForestClassifier(random_state = 42, n_jobs = 3)
cv_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, scoring = 'roc_auc')
cv_rfc.fit(X_train.values, y_train.values)

# print results
print(cv_rfc.best_params_)

{'max_depth': [5, 7], 'n_estimators': [50, 70], 'max_features': ['auto']}
{'max_depth': 7, 'n_estimators': 70, 'max_features': 'auto'}


In [10]:
rfc = RandomForestClassifier(random_state = 42, n_jobs = 3, n_estimators = 70, 
                             max_features = 'auto', max_depth = 7)
rfc.fit(X_train.values, y_train.values)
y_test_pred = rfc.predict(X_test.values)
print(accuracy_score(rfc.predict(X_train.values), y_train.values))
df_test['pred'] = y_test_pred
prob = pd.DataFrame(rfc.predict_proba(X_test.values), columns=rfc.classes_)
df_test['pred_prob'] = prob[1]
df_test.head()

0.9070941008731794


Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,user_tot_orders,user_tot_prods,...,uxp_order_rate,uxp_reorder_rate,uxp_avg_days_since_prior_order,uxp_avg_hour_of_day,uxp_avg_day_of_week,uxp_orders_since_last_order,uxp_delta_hour_vs_last,uxp_same_dow_as_last_order,pred,pred_prob
0,2774568,3,test,13,5,15,11.0,1005,13,88,...,0.076923,0.0,17.0,16.0,3.0,3,1,False,0,0.044775
1,2774568,3,test,13,5,15,11.0,12845,13,88,...,0.076923,0.0,20.0,18.0,2.0,9,3,False,0,0.040813
2,2774568,3,test,13,5,15,11.0,14992,13,88,...,0.153846,0.018182,7.0,15.5,0.0,6,0,False,0,0.095523
3,2774568,3,test,13,5,15,11.0,15143,13,88,...,0.076923,0.0,,14.0,1.0,12,1,False,0,0.041981
4,2774568,3,test,13,5,15,11.0,16797,13,88,...,0.230769,0.036364,7.0,15.0,0.333333,4,1,False,0,0.162997


In [11]:
THRESHOLD = 0.22  # guess, should be tuned with crossval on a subset of train data

d = dict()
for row in df_test.itertuples():
    if row.pred > THRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)

for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'

sub = pd.DataFrame.from_dict(d, orient='index')

sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub_rfc_17June.csv', index=False)

# 3. XGBoost Classification

In [3]:
numeric = ['user_tot_orders', 'user_tot_prods', 'user_tot_dist_prods', 'user_avg_days_bet_orders', 'user_avg_order_size', 
           'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio', 'prod_ordered', 'prod_reordered', 'prod_reorder_rate',
           'uxp_tot_orders', 'uxp_order_rate', 'uxp_avg_pos_in_cart','uxp_reorder_rate','uxp_orders_since_last_order', 'uxp_delta_hour_vs_last']

impute_values = {}
for col in numeric:
    try:
        impute_values[col] = df_train[col].median()
    except:
        pass
# features_to_use_1 = numeric + categorical
df_train[numeric] = df_train[numeric].fillna(impute_values)
del impute_values

impute_values = {}
for col in numeric:
    try:
        impute_values[col] = df_test[col].median()
    except:
        pass
# features_to_use_1 = numeric + categorical
df_test[numeric] = df_test[numeric].fillna(impute_values)
print("imputation done")

features_to_use_1 = ['user_tot_orders', 'user_tot_prods', 'user_tot_dist_prods', 'user_avg_days_bet_orders', 
                     'user_avg_order_size', 'order_hour_of_day', 'days_since_prior_order', 'days_since_ratio',
                     'department','prod_ordered', 'prod_reordered', 'prod_reorder_rate','uxp_tot_orders', 'uxp_order_rate', 
                     'uxp_avg_pos_in_cart','uxp_reorder_rate','uxp_orders_since_last_order', 'uxp_delta_hour_vs_last']
X_train= pd.get_dummies(df_train.loc[:,features_to_use_1])
variables = X_train.columns.tolist()
print('TRAIN DATA SIZE: ', X_train.shape)
# X_train.to_csv("/axp/buanalytics/cscust360/dev/Anu/Kaggle/X_train_rem_aisle.csv")
y_train = df_train['labels']
# y_train.to_csv("/axp/buanalytics/cscust360/dev/Anu/Kaggle/y_train_rem_aisle.csv")
# del df_train

X_test = pd.get_dummies(df_test.loc[:,features_to_use_1])
test_cols = set(X_test.columns)
columns_not_present = set(variables) - test_cols
for col in columns_not_present:
    X_test[col] = 0
X_test = X_test[variables]
print('TEST DATA SIZE: ', X_test.shape)
# X_test.to_csv("/axp/buanalytics/cscust360/dev/Anu/Kaggle/X_test_rem_aisle.csv")
# del df_test
gc.collect()

for column in X_train.columns:
    if X_train[column].dtype == 'float64':
        X_train[column] = X_train[column].astype(np.float32)
y_train = y_train.astype(np.int8)
for column in X_test.columns:
    if X_test[column].dtype == 'float64':
        X_test[column] = X_test[column].astype(np.float32)

imputation done
TRAIN DATA SIZE:  (8474661, 38)
TEST DATA SIZE:  (4833292, 38)


In [7]:
import psutil
print(psutil.virtual_memory())

svmem(total=540687081472, available=233800704000, percent=56.8, used=300302757888, free=46918955008, active=351852785664, inactive=129511563264, buffers=1312206848, cached=192153161728, shared=1223315456, slab=3933052928)


In [27]:
# Hyperparameter tuning for XGBoost model
xgb = XGBClassifier(objective = 'binary:logistic')

param_dist = {'n_estimators': [50, 70],
              'learning_rate': [0.01, 0.2],
#               'subsample': [stats.uniform(]0.3, 0.9),
              'max_depth': [3, 5]
#               'colsample_bytree': [0.5, 0.7],
#               'min_child_weight': [1, 2]
             }

# numFolds = 5
# kfold_5 = cross_validation.KFold(n = len(X), shuffle = True, n_folds = numFolds)

clf = RandomizedSearchCV(xgb, param_distributions = param_dist, n_iter = 2, cv = 3,
                         scoring = 'roc_auc', error_score = 0, verbose = 3, n_jobs = -1)

clf_result = clf.fit(X_train.values, y_train.values)

# summarize results
print("Best: %f using %s" % (clf_result.best_score_, clf_result.best_params_))
means = clf_result.cv_results_['mean_test_score']
stds = clf_result.cv_results_['std_test_score']
params = clf_result.cv_results_['params']

Best: 0.830810 using {'max_depth': 5, 'learning_rate': 0.2, 'n_estimators': 50}


In [4]:
from sklearn.metrics import accuracy_score
xgb = XGBClassifier(objective='binary:logistic', n_estimators=50, max_depth = 5, learning_rate=0.2)
xgb.fit(X_train.values, y_train.values)
# del X_train
# del y_train
gc.collect()
y_test_pred = xgb.predict(X_test.values)
# xgb.save_model('xgb_model_16Jun.model')
print(accuracy_score(xgb.predict(X_train.values), y_train.values))

0.9093839859789082


In [26]:
df_test['pred'] = y_test_pred
prob = pd.DataFrame(xgb.predict_proba(X_test.values), columns=xgb.classes_)
df_test['pred_prob'] = prob[1]
df_test.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,user_tot_orders,user_tot_prods,...,uxp_order_rate,uxp_reorder_rate,uxp_avg_days_since_prior_order,uxp_avg_hour_of_day,uxp_avg_day_of_week,uxp_orders_since_last_order,uxp_delta_hour_vs_last,uxp_same_dow_as_last_order,pred,pred_prob
0,2774568,3,test,13,5,15,11.0,1005,13,88,...,0.076923,0.0,17.0,16.0,3.0,3,1,False,0,0.054716
1,2774568,3,test,13,5,15,11.0,12845,13,88,...,0.076923,0.0,20.0,18.0,2.0,9,3,False,0,0.026354
2,2774568,3,test,13,5,15,11.0,14992,13,88,...,0.153846,0.018182,7.0,15.5,0.0,6,0,False,0,0.092646
3,2774568,3,test,13,5,15,11.0,15143,13,88,...,0.076923,0.0,,14.0,1.0,12,1,False,0,0.024793
4,2774568,3,test,13,5,15,11.0,16797,13,88,...,0.230769,0.036364,7.0,15.0,0.333333,4,1,False,0,0.16864


In [27]:
d = dict()
THRESHOLD = 0.3  # guess, should be tuned with crossval on a subset of train data
for row in df_test.itertuples():
    if row.pred_prob > THRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)
            
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'
        
sub = pd.DataFrame.from_dict(d, orient='index')
sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub_xgboost_17Jun_30.csv', index=False)
sub.shape

(75000, 2)

In [28]:
d = dict()
THRESHOLD = 0.22  # guess, should be tuned with crossval on a subset of train data
for row in df_test.itertuples():
    if row.pred_prob > THRESHOLD:
        try:
            d[row.order_id] += ' ' + str(row.product_id)
        except:
            d[row.order_id] = str(row.product_id)
            
for order in test_orders.order_id:
    if order not in d:
        d[order] = 'None'
        
sub = pd.DataFrame.from_dict(d, orient='index')
sub.reset_index(inplace=True)
sub.columns = ['order_id', 'products']
sub.to_csv('sub_xgboost_17Jun_22.csv', index=False)
sub.shape

(75000, 2)

In [5]:
categorical_all = ['department']

def de_one_hot_encode(s):
    for col in categorical_all:
        if(col in s):
            return col
    return s

# Get variable importance
variables = X_train.columns.tolist()
features = pd.DataFrame({'Importance':xgb.feature_importances_ , 'Variable':variables})
features['Variable'] = features['Variable'].apply(de_one_hot_encode)
features = features.groupby('Variable').sum().reset_index()
features = features.sort_values('Importance', ascending=False )
features = features[['Importance', 'Variable']]
display(features)
features.to_csv('feature_importances_xgboost_21Jun.csv')

Unnamed: 0,Importance,Variable
14,0.446633,uxp_order_rate
15,0.179027,uxp_orders_since_last_order
16,0.099416,uxp_reorder_rate
17,0.072619,uxp_tot_orders
5,0.04947,prod_reorder_rate
2,0.037876,department
0,0.0261,days_since_prior_order
1,0.01438,days_since_ratio
6,0.013321,prod_reordered
13,0.012223,uxp_delta_hour_vs_last
