# Benchmark

In [19]:
# imports

import pandas as pd
pd.set_option('display.max_rows', 20)
pd.set_option('display.max_columns', 20)

import numpy as np
import gc

import catboost ## thanks to yandex ! :) 
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline 

# info about itertools.product: 
# https://pythonworld.ru/moduli/modul-itertools.html
# https://docs.python.org/2/library/itertools.html
from itertools import product

from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import RandomForestRegressor , AdaBoostRegressor, BaggingRegressor , ExtraTreesRegressor
from sklearn.metrics import make_scorer

import xgboost
from xgboost import DMatrix

In [20]:
# downcast types to save memory

def downcast_dtypes(df):
    '''
        Halve the width of 64-bit numeric columns to save memory.

        Every `float64` column becomes `float32` and every `int64` column
        becomes `int32`; all other columns are left untouched.

        The frame is modified in place and also returned, so both
        `downcast_dtypes(df)` and `df = downcast_dtypes(df)` work.
    '''
    # Wide dtype name -> narrow replacement, processed floats first, then ints.
    narrowing = (("float64", np.float32), ("int64", np.int32))

    for wide_name, narrow_type in narrowing:
        wide_cols = [name for name in df if df[name].dtype == wide_name]
        df[wide_cols] = df[wide_cols].astype(narrow_type)

    return df

In [21]:
def rmse(y_true, y_pred, lower=0, upper=20):
    '''
        Root mean squared error after clipping both inputs to [lower, upper].

        Parameters
        ----------
        y_true : array-like
            Ground-truth target values.
        y_pred : array-like
            Predicted values, same shape as `y_true`.
        lower, upper : number, optional
            Clipping bounds applied to both inputs before the error is
            computed. The defaults (0 and 20) are the same bounds this
            notebook applies to the submission values.

        Returns
        -------
        float
            sqrt(mean((clip(y_pred) - clip(y_true)) ** 2))

        Neither input is modified: clipping works on copies, so calling the
        metric does not alter the caller's targets or predictions.
    '''
    clipped_true = np.clip(np.asarray(y_true), lower, upper)
    clipped_pred = np.clip(np.asarray(y_pred), lower, upper)

    # Compare against the `y_true` argument, not the notebook-level `y_train`.
    return np.sqrt(np.mean((clipped_pred - clipped_true) ** 2))
# scikit-learn scorer built from `rmse`; greater_is_better=False marks it as a loss.
rmse_scorer = make_scorer(score_func=rmse, greater_is_better=False)

In [22]:
# Read the five input tables, in this order, from the local `input` folder.
sales, shops, items, item_cats, test = [
    pd.read_csv(csv_path)
    for csv_path in (
        'input/train.csv',
        'input/shops.csv',
        'input//items.csv',
        'input/item_categories.csv',
        'input/test.csv',
    )
]

In [23]:
# Tag the test rows with the block that follows the last one in `sales`,
# then append them so the following cells process that block as well.
test["date_block_num"] = sales.date_block_num.max() + 1

# `sales` and `test` do not share the same columns. Passing `sort` explicitly
# silences the pandas warning about the changing default; column order is
# irrelevant here because later cells select columns by name.
sales = pd.concat([sales, test], sort=False)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  


In [24]:
# Key columns that identify one row of the grid
index_cols = ['shop_id', 'item_id', 'date_block_num']


def _month_grid(block_num):
    '''All (shop, item, block) combinations of the shops and items seen in one block.'''
    in_block = sales['date_block_num'] == block_num
    shop_ids = sales.loc[in_block, 'shop_id'].unique()
    item_ids = sales.loc[in_block, 'item_id'].unique()
    return np.array(list(product(shop_ids, item_ids, [block_num])), dtype='int32')


# One sub-grid per block, stacked vertically into a single dataframe
grid = pd.DataFrame(
    np.vstack([_month_grid(block_num) for block_num in sales['date_block_num'].unique()]),
    columns=index_cols,
    dtype=np.int32,
)

In [25]:
# Shop-item-month totals of `item_cnt_day`, stored as `target`.
# The nested-dict renaming form `.agg({'col': {'new_name': 'sum'}})` is
# deprecated and rejected by pandas >= 1.0, so sum the column and rename it
# explicitly; this also leaves flat column names with nothing to fix up.
gb = (
    sales.groupby(index_cols, as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'target'})
)
# Join it to the grid; grid rows without any sales get 0
all_data = pd.merge(grid, gb, how='left', on=index_cols).fillna(0)

# Same as above but with shop-month totals, stored as `target_shop`
gb = (
    sales.groupby(['shop_id', 'date_block_num'], as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'target_shop'})
)
all_data = pd.merge(all_data, gb, how='left', on=['shop_id', 'date_block_num']).fillna(0)

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [26]:
# Item-month totals of `item_cnt_day`, stored as `target_item`.
# The nested-dict renaming form `.agg({'col': {'new_name': 'sum'}})` is
# deprecated and rejected by pandas >= 1.0, so sum the column and rename it
# explicitly.
gb = (
    sales.groupby(['item_id', 'date_block_num'], as_index=False)['item_cnt_day']
    .sum()
    .rename(columns={'item_cnt_day': 'target_item'})
)
all_data = pd.merge(all_data, gb, how='left', on=['item_id', 'date_block_num']).fillna(0)

# Downcast dtypes from 64 to 32 bit to save memory
all_data = downcast_dtypes(all_data)
# The grid and the last aggregate are no longer needed; release them
del grid, gb
gc.collect();

  return super(DataFrameGroupBy, self).aggregate(arg, *args, **kwargs)


In [27]:
# Every non-key column gets lagged copies
cols_to_rename = list(all_data.columns.difference(index_cols))

# Lags, in blocks, to attach to each row
shift_range = [1,2,3,4]

for month_shift in shift_range:
    # '<col>' -> '<col>_lag_<shift>' for the value columns; key columns keep their names
    lagged_names = {col: '{}_lag_{}'.format(col, month_shift) for col in cols_to_rename}

    # Moving the block number forward by `month_shift` makes the merge below
    # attach the values from `month_shift` blocks earlier to each row.
    train_shift = (
        all_data[index_cols + cols_to_rename]
        .assign(date_block_num=lambda frame: frame['date_block_num'] + month_shift)
        .rename(columns=lagged_names)
    )

    # Rows with no counterpart that far back get 0
    all_data = pd.merge(all_data, train_shift, on=index_cols, how='left').fillna(0)

del train_shift

In [28]:
# Discard the oldest data: only rows with `date_block_num` > 20 are kept
all_data = all_data[all_data["date_block_num"] > 20]

# List of all lagged features, i.e. the columns named '<col>_lag_<shift>'.
# Match the whole suffix rather than only the last character, so shifts with
# more than one digit are recognised and unrelated columns that merely end in
# a digit are not picked up.
lag_suffixes = tuple('_lag_{}'.format(shift) for shift in shift_range)
fit_cols = [col for col in all_data.columns if col.endswith(lag_suffixes)]
# We will drop these at fitting stage: every column that is neither a lag nor a key, plus the block number
to_drop_cols = list(set(all_data.columns) - (set(fit_cols) | set(index_cols))) + ['date_block_num']

# Category for each item
item_category_mapping = items[['item_id','item_category_id']].drop_duplicates()

# Merged after `to_drop_cols` is computed, so `item_category_id` stays in the feature set
all_data = pd.merge(all_data, item_category_mapping, how='left', on='item_id')
all_data = downcast_dtypes(all_data)
gc.collect();

In [29]:
# Split the assembled frame: block 34 supplies the features for `test`,
# everything before it is the training history.
test = test.merge(
    all_data.loc[all_data["date_block_num"] == 34],
    on=["shop_id", "item_id", "date_block_num"],
)
train = all_data.loc[all_data["date_block_num"] < 34]

In [30]:
# `date_block_num` is not used as a feature, but it is kept aside because the
# train/validation split is made on it.
dates = train.loc[:, 'date_block_num']

# The most recent block present in `train`
last_block = dates.max()
print('Test `date_block_num` is %d' % (last_block,))

Test `date_block_num` is 33


# Validation

In [31]:
# Hold out the last month of train (October, 2015) for validation and fit on
# everything before it.
in_fit = dates < last_block
in_holdout = dates == last_block

dates_train, dates_test = dates[in_fit], dates[in_holdout]

X_train = train.loc[in_fit].drop(columns=to_drop_cols)
X_test = train.loc[in_holdout].drop(columns=to_drop_cols)

y_train = train.loc[in_fit, 'target'].values
y_test = train.loc[in_holdout, 'target'].values

In [None]:
%%time
# Baseline gradient-boosted model evaluated on the held-out month
xgb = XGBRegressor(max_depth=5 , colsample_bytree=0.8 , colsample_bylevel=0.8)
xgb.fit(X_train.values, y_train)
preds_xgb = xgb.predict(X_test.values)

# `rmse` is declared as rmse(y_true, y_pred): pass the targets first
print("xgboost val score = {}".format(rmse(y_test, preds_xgb)))

In [None]:
best_%%time
scale = StandardScaler()
X_train_scale = scale.fit_transform(X_train)
X_test_scale = scale.transform(X_test)


mlp = MLPRegressor(random_state=42)
mlp.fit(X_train_scale , y_train)
preds_mlp = mlp.predict(X_test_scale)
print ('MLPRegressor val score = {}'.format(rmse(preds_mlp , y_test)))

In [None]:
%%time
# Shallow random forest baseline evaluated on the held-out month
rf = RandomForestRegressor(n_estimators=100 , max_depth=5 , n_jobs=-1 , random_state=0)
rf.fit(X_train.values, y_train)
preds_rf = rf.predict(X_test.values)

# `rmse` is declared as rmse(y_true, y_pred): pass the targets first
print("rf val score = {}".format(rmse(y_test, preds_rf)))

In [None]:
%%time
# NOTE(review): `Sequential` and `Dense` are not imported anywhere in the
# visible notebook — presumably keras' `Sequential` model and `Dense` layer;
# add the matching import to the imports cell so this runs on a fresh kernel.
# Small fully connected network (64 -> 32 -> 16 -> 1) on the standardised features.
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train_scale.shape[1]))
model.add(Dense(32, kernel_initializer='uniform', activation='relu'))
model.add(Dense(16, kernel_initializer='uniform', activation='relu'))
model.add(Dense(1))
model.compile(optimizer='rmsprop', loss='mse', metrics=['mse'])
model.fit(X_train_scale, y_train, batch_size=640, epochs=10)
preds_keras = model.predict(X_test_scale)
# Take the first row of the transposed predictions as a 1-D vector for the metric.
# `rmse` is declared as rmse(y_true, y_pred): pass the targets first.
print ('keras val score = {}'.format(rmse(y_test, preds_keras.T[0])))

In [None]:
%%time
# Two hand-picked xgboost configurations to compare on the held-out month
xgb_params0 = {'colsample_bytree': 1, 'silent': 1, 'nthread': 8, 'min_child_weight': 10,
               'n_estimators': 300, 'subsample': 1, 'learning_rate': 0.09, 'objective': 'reg:linear',
               'seed': 10, 'max_depth': 7, 'gamma': 0.}
xgb_params1 = {'colsample_bytree': 0.77, 'silent': 1, 'nthread': 8, 'min_child_weight': 15,
               'n_estimators': 500, 'subsample': 0.77, 'learning_rate': 0.035, 'objective': 'reg:linear',
               'seed': 11, 'max_depth': 6, 'gamma': 0.2}

# Fit and score each configuration in turn. After the loop `xgb` and
# `preds_xgb` hold the model and predictions of the last configuration.
for config_idx, xgb_params in enumerate([xgb_params0, xgb_params1]):
    xgb = XGBRegressor(**xgb_params)
    xgb.fit(X_train.values, y_train)
    preds_xgb = xgb.predict(X_test.values)

    # `rmse` is declared as rmse(y_true, y_pred): pass the targets first
    print("xgboost {} val score = {}".format(config_idx, rmse(y_test, preds_xgb)))

In [14]:
def my_rmse(y_pred, y_true):
    '''
        Clipped RMSE in the (name, value) form used for the `feval` argument
        of `xgboost.train`.

        Parameters
        ----------
        y_pred : numpy array
            Predictions for the rows of `y_true`.
        y_true : object with a `get_label()` method (e.g. an xgboost DMatrix)
            Holder of the true labels.

        Returns
        -------
        tuple
            ('my_rmse', value) where value is the RMSE computed after both
            predictions and labels have been clipped to [0, 20].

        Clipping is done on copies, so neither the predictions nor the label
        array returned by `get_label()` is modified.
    '''
    labels = np.clip(y_true.get_label(), 0, 20)
    preds = np.clip(y_pred, 0, 20)

    return 'my_rmse' , np.sqrt(np.mean((preds - labels)**2))

In [15]:
# Wrap the train / validation parts for the native xgboost API
dtrain = DMatrix(X_train, label=y_train)
dtest = DMatrix(X_test, label=y_test)

# Both sets are evaluated and reported during training
watchlist = [(dtrain, 'train'), (dtest, 'test')]

param = dict(
    max_depth=6,
    learning_rate=0.03,
    colsample_bylevel=0.8,
    colsample_bytree=0.8,
    eval_metric='rmse',
    nthread=-1,
    subsample=0.8,
)
num_round = 1000

# Train for up to `num_round` rounds with early stopping (100 rounds),
# reporting every 10 rounds and evaluating with the custom `my_rmse` metric.
bst = xgboost.train(
    params=param,
    dtrain=dtrain,
    num_boost_round=num_round,
    evals=watchlist,
    feval=my_rmse,
    early_stopping_rounds=100,
    verbose_eval=10,
)

[0]	train-rmse:3.87756	test-rmse:5.30655	train-my_rmse:1.20389	test-my_rmse:1.14505
Multiple eval metrics have been passed: 'test-my_rmse' will be used for early stopping.

Will train until test-my_rmse hasn't improved in 100 rounds.
[10]	train-rmse:3.43361	test-rmse:5.07073	train-my_rmse:1.0882	test-my_rmse:1.05794
[20]	train-rmse:3.09287	test-rmse:4.8814	train-my_rmse:1.02569	test-my_rmse:1.01439
[30]	train-rmse:2.85003	test-rmse:4.76708	train-my_rmse:0.992111	test-my_rmse:0.992397
[40]	train-rmse:2.67943	test-rmse:4.71297	train-my_rmse:0.972734	test-my_rmse:0.980007
[50]	train-rmse:2.5512	test-rmse:4.66842	train-my_rmse:0.96138	test-my_rmse:0.97332
[60]	train-rmse:2.45106	test-rmse:4.63267	train-my_rmse:0.953173	test-my_rmse:0.967928
[70]	train-rmse:2.37207	test-rmse:4.61047	train-my_rmse:0.947288	test-my_rmse:0.964989
[80]	train-rmse:2.31706	test-rmse:4.57516	train-my_rmse:0.943956	test-my_rmse:0.963438
[90]	train-rmse:2.26748	test-rmse:4.57098	train-my_rmse:0.940967	test-my_rmse:0

In [17]:
# Predict the test block using exactly the feature columns the model was trained on
D_test = DMatrix(test[X_train.columns])
preds = bst.predict(D_test)

# Fill the sample submission with the predictions, limited to the [0, 20] range
ss = pd.read_csv("input/sample_submission.csv")
ss['item_cnt_month'] = preds
ss['item_cnt_month'] = ss['item_cnt_month'].clip(lower=0, upper=20)
ss.to_csv("output/xgb_first_submit.csv", index=False)