In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import mean_absolute_error

import xgboost as xgb
import eli5

from tqdm import tqdm

In [2]:
ls ../input/

features_test.h5   sales_test.h5   stores_data.csv
features_train.h5  sales_train.h5  train_data.h5


In [3]:
def check_log_model(df, feats, model, cv=5, scoring=mean_absolute_error, show_eli5=True):
    """Cross-validate `model` on the natural log of `weekly_sales`.

    Only rows whose `weekly_sales` is present and strictly positive are used
    (the logarithm is undefined otherwise). In every fold the model is fitted
    on the log target, its predictions are mapped back with `exp`, and
    `scoring(y_true, y_pred)` is evaluated on the original sales scale.

    Parameters
    ----------
    df : DataFrame holding the `feats` columns and a `weekly_sales` column.
    feats : list of column names used as model inputs.
    model : estimator exposing `fit(X, y)` and `predict(X)`; it is fitted in
        place, so the object passed in is modified.
    cv : number of folds for a shuffled `KFold` (fixed `random_state=0`).
    scoring : callable taking `(y_true, y_pred)` and returning a number.
    show_eli5 : when true, refit on all usable rows, print the score tuple
        and return the eli5 weights display instead of the scores.

    Returns
    -------
    `(mean, std)` of the fold scores when `show_eli5` is false; otherwise the
    object returned by `eli5.show_weights`.
    """
    labelled = df.loc[df["weekly_sales"].notnull()].copy()

    # One mask for both features and target keeps them row-aligned.
    positive = labelled["weekly_sales"] > 0
    X = labelled.loc[positive, feats]
    y = labelled.loc[positive, "weekly_sales"]
    y_log = np.log(y)

    splitter = KFold(n_splits=cv, shuffle=True, random_state=0)
    scores = []
    for fit_rows, eval_rows in tqdm(splitter.split(X)):
        model.fit(X.iloc[fit_rows], y_log.iloc[fit_rows])
        # Predictions are on the log scale; undo the transform before scoring.
        y_pred = np.exp(model.predict(X.iloc[eval_rows]))
        scores.append(scoring(y.iloc[eval_rows], y_pred))

    result = np.mean(scores), np.std(scores)

    if not show_eli5:
        return result

    # Refit on every usable row so the reported weights reflect all the data.
    model.fit(X, y_log)
    print(result)
    return eli5.show_weights(model, feature_names=feats)


def _iso_week(dates):
    """Return the ISO week number of every timestamp in the Series `dates`."""
    accessor = dates.dt
    if not hasattr(accessor, 'isocalendar'):
        # Very old pandas has no isocalendar(); `.week` is the only spelling there.
        return accessor.week
    week = accessor.isocalendar().week
    # isocalendar() yields a nullable UInt32 column; convert to a plain NumPy
    # dtype (float only when NaT rows force missing values) so the column
    # behaves like the other date parts downstream.
    return week.astype('float64' if week.isna().any() else 'int64')


def parse_date(df, separate=True):
    """Convert `df['date']` to datetime and optionally add date-part columns.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : DataFrame with a `date` column that `pd.to_datetime` can parse.
    separate : when truthy, add `date_day`, `date_month`, `date_year`,
        `date_week` (ISO week number) and `date_quarter` columns.

    Returns
    -------
    The same DataFrame object that was passed in.
    """
    df['date'] = pd.to_datetime(df['date'])

    if separate:
        for part in ('day', 'month', 'year', 'week', 'quarter'):
            if part == 'week':
                # `Series.dt.week` was removed in pandas 2.0, so the week
                # number is taken from isocalendar() where available.
                values = _iso_week(df['date'])
            else:
                values = getattr(df['date'].dt, part)
            df['date_{}'.format(part)] = values

    return df

In [4]:
# Weekly sales records (train/test) stored as HDF5.
df_sales_train, df_sales_test = (
    pd.read_hdf(path)
    for path in ('../input/sales_train.h5', '../input/sales_test.h5')
)

# Additional feature tables (train/test) stored as HDF5.
df_features_train, df_features_test = (
    pd.read_hdf(path)
    for path in ('../input/features_train.h5', '../input/features_test.h5')
)

# Store-level table stored as CSV.
df_stores_data = pd.read_csv('../input/stores_data.csv')

# Quick sanity check: (rows, columns) of every loaded frame.
tuple(
    frame.shape
    for frame in (
        df_sales_train,
        df_sales_test,
        df_features_train,
        df_features_test,
        df_stores_data,
    )
)

((285089, 6), (136481, 5), (4365, 12), (3825, 3), (45, 3))

In [5]:
# Sales frames: convert `date` to datetime and add the date-part columns.
df_sales_train = parse_date(df_sales_train, separate=True)
df_sales_test = parse_date(df_sales_test, separate=True)

# Feature frames: the train table only gets its `date` converted, while the
# test table also receives the date-part columns.
# NOTE(review): the train/test asymmetry is kept as found — confirm it is intended.
df_features_train = parse_date(df_features_train, separate=False)
df_features_test = parse_date(df_features_test, separate=True)

In [6]:
# Stack labelled (train) and unlabelled (test) sales rows into one frame so
# that every feature is built for both at once.
sales_frames = [df_sales_train, df_sales_test]
df = pd.concat(sales_frames)
df.shape

(421570, 11)

In [7]:
# Peek at five random rows of the combined frame.
df.sample(n=5)

Unnamed: 0,id,weekly_sales,store,dept,date,is_holiday,date_day,date_month,date_year,date_week,date_quarter
236934,236934,,8,35,2012-12-10,False,10,12,2012,50,4
51210,51210,168329.46875,27,92,2010-12-11,False,11,12,2010,49,4
33706,33706,30753.970703,35,74,2010-09-07,False,7,9,2010,36,3
352002,352002,50156.808594,26,94,2010-07-23,False,23,7,2010,29,3
403067,403067,12463.379883,40,11,2010-11-06,False,6,11,2010,44,4


In [9]:
# Rename the three store columns positionally to the names used in this notebook.
df_stores_data.columns = ['store', 'type', 'store_size']

# Integer-encode the store type; codes follow order of first appearance.
type_codes, _ = pd.factorize(df_stores_data['type'])
df_stores_data['type_cat'] = type_codes

df_stores_data.sample(n=10)

Unnamed: 0,store,type,store_size,type_cat
16,17,B,93188,1
19,20,A,203742,0
44,45,B,118221,1
41,42,C,39690,2
24,25,B,128107,1
22,23,B,114533,1
14,15,B,123737,1
5,6,A,202505,0
15,16,B,57197,1
2,3,B,37392,1


In [10]:
# Attach the store columns to every sales row. The guard makes the cell safe
# to re-run: the merge is skipped once `store_size` is already present.
if 'store_size' not in df.columns:
    df = df.merge(df_stores_data, on='store', how='left')
df.sample(n=1)

Unnamed: 0,id,weekly_sales,store,dept,date,is_holiday,date_day,date_month,date_year,date_week,date_quarter,type,store_size,type_cat
133316,197444,46523.808594,13,46,2010-02-07,False,7,2,2010,5,1,A,219622,0


In [11]:
# Per-(store, dept) summary statistics of the training target.
# `weekly_sales` is selected *before* aggregating so the four statistics are
# computed for that column only, rather than for every column of the frame
# and then discarded.
# NOTE(review): the statistics use every labelled row, so when
# `check_log_model` cross-validates on those same rows, each validation fold's
# targets have already contributed to its own features. The CV score is
# therefore likely optimistic — consider computing these per fold.
df_sales_group = (
    df_sales_train
    .groupby(['store', 'dept'])['weekly_sales']
    .agg(['mean', 'std', 'median', 'size'])
    .reset_index()
)
df_sales_group.columns = [
    'store',
    'dept',
    'store_dept_sales_mean',
    'store_dept_sales_std',
    'store_dept_sales_median',
    'store_dept_sales_size'
    ]
df_sales_group

Unnamed: 0,store,dept,store_dept_sales_mean,store_dept_sales_std,store_dept_sales_median,store_dept_sales_size
0,1,1,22494.710938,9784.865234,18820.289062,97
1,1,2,45666.632812,3038.616211,45222.300781,97
2,1,3,12785.541992,8479.493164,10136.780273,97
3,1,4,36428.882812,2818.775635,36174.429688,97
4,1,5,24401.593750,12340.451172,21183.419922,97
...,...,...,...,...,...,...
3288,45,93,2707.413818,771.995117,2609.360107,97
3289,45,94,3698.789551,2120.300537,4384.649902,88
3290,45,95,53106.800781,5650.617676,52619.531250,97
3291,45,97,6431.923828,718.022156,6359.370117,97


In [12]:
# Attach the per-(store, dept) statistics to every row. The left join keeps
# rows whose (store, dept) pair has no statistics, leaving those columns as
# NaN. The guard makes the cell safe to re-run.
if 'store_dept_sales_mean' not in df.columns:
    df = df.merge(df_sales_group, on=['store', 'dept'], how='left')

In [13]:
# Peek at five random rows after both merges.
df.sample(n=5)

Unnamed: 0,id,weekly_sales,store,dept,date,is_holiday,date_day,date_month,date_year,date_week,date_quarter,type,store_size,type_cat,store_dept_sales_mean,store_dept_sales_std,store_dept_sales_median,store_dept_sales_size
308635,72486,,29,1,2012-04-20,False,20,4,2012,16,2,B,93638,1,15502.146484,7909.945801,12317.169922,97.0
257083,380007,39793.988281,14,5,2010-05-11,False,11,5,2010,19,2,A,200898,0,36572.925781,22803.199219,29896.970703,97.0
260182,384646,44756.558594,4,55,2010-11-26,True,26,11,2010,47,4,A,205863,0,19353.371094,8945.47168,16752.529297,97.0
358270,225633,,21,23,2012-03-08,False,8,3,2012,10,1,B,140167,1,37195.5,8587.107422,36153.148438,97.0
249989,369494,26676.220703,30,94,2010-12-02,True,2,12,2010,48,4,C,42988,2,24316.03125,2289.399902,24748.970703,97.0


In [14]:
# Candidate inputs: every numeric or boolean column of the combined frame.
obj_feats = df.select_dtypes(include=['number', 'bool']).columns

# The row identifier and the prediction target must not be fed to the model.
black_list = ['id', 'weekly_sales']

# Boolean masking on the Index keeps the original column order.
feats = obj_feats[~obj_feats.isin(black_list)].tolist()
feats

['store',
 'dept',
 'is_holiday',
 'date_day',
 'date_month',
 'date_year',
 'date_week',
 'date_quarter',
 'store_size',
 'type_cat',
 'store_dept_sales_mean',
 'store_dept_sales_std',
 'store_dept_sales_median',
 'store_dept_sales_size']

In [15]:
# Hyper-parameters of the gradient-boosted model under evaluation.
xgb_params = dict(
    max_depth=10,
    n_estimators=100,
    learning_rate=0.3,
    random_state=0,
)
model = xgb.XGBRegressor(**xgb_params)

# 5-fold CV on the log target; prints (mean, std) of the fold scores and
# displays the eli5 feature weights.
check_log_model(df, feats, model)

5it [10:19, 123.82s/it]


(1452.3743, 12.815364)


Weight,Feature
0.9254,store_dept_sales_median
0.022,store_dept_sales_mean
0.0095,date_week
0.0073,store_dept_sales_size
0.0068,date_day
0.0057,date_year
0.0056,dept
0.0042,store_dept_sales_std
0.0039,date_month
0.0033,type_cat


In [85]:
# Rows with a known target form the training set; the rest need predictions.
has_target = df['weekly_sales'].notnull()
df_train = df[has_target].copy()
df_test = df[~has_target].copy()

# Only strictly positive sales can be log-transformed.
positive = df_train['weekly_sales'] > 0
X_train = df_train.loc[positive, feats]
y_train = df_train.loc[positive, 'weekly_sales']
X_test = df_test[feats]

# Fit on the log target (reusing `model` from an earlier cell) and predict.
y_train_log = np.log(y_train)
model.fit(X_train, y_train_log)
y_pred = model.predict(X_test)

# Map predictions back to the sales scale and write the submission file.
df_test['weekly_sales'] = np.exp(y_pred)
submission = df_test[['id', 'weekly_sales']]
submission.to_csv('../output/xgboost_store_mean.csv', index=False)

In [84]:
#model = xgb.XGBRegressor(max_depth=10, n_estimators=100, learning_rate=0.3, random_state=0)
#Score: 1898.23477
#Public score: 1890.97557


#model = xgb.XGBRegressor(max_depth=10, n_estimators=500, learning_rate=0.05, random_state=0)
#Score: 1864.58479
#Public score: 1866.41925