In [1]:
import pandas as pd
import numpy as np

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

import eli5

In [2]:
df_train = pd.read_hdf("../input/sales_train.h5")
df_test = pd.read_hdf("../input/sales_test.h5")

# df = pd.concat([df_train, df_test])

In [3]:
stores = pd.read_csv('../input/stores_data.csv')
df_train = pd.merge(df_train, stores, how='left', left_on='store', right_on='Store').drop('Store', axis=1)
df_test = pd.merge(df_test, stores, how='left', left_on='store', right_on='Store').drop('Store', axis=1)

In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 285089 entries, 0 to 285088
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            285089 non-null  int32  
 1   weekly_sales  285089 non-null  float32
 2   store         285089 non-null  int8   
 3   dept          285089 non-null  int8   
 4   date          285089 non-null  object 
 5   is_holiday    285089 non-null  bool   
 6   Type          285089 non-null  object 
 7   Size          285089 non-null  int64  
dtypes: bool(1), float32(1), int32(1), int64(1), int8(2), object(2)
memory usage: 11.7+ MB


In [5]:
def type_cat(x):
    if x == 'A':
        return 1
    elif x == 'B':
        return 2
    return 3
df_train['type_cat'] = df_train['Type'].map(type_cat)
df_train['date_date'] = pd.to_datetime(df_train['date'], format='%d/%m/%Y')
df_train['month'] = df_train['date_date'].dt.month
df_train['week'] = df_train['date_date'].dt.week
df_train['holidays'] = df_train['week'].isin(['47', '50', '51']).astype('int')
df_train = df_train.sort_values(by='date_date')

df_test['type_cat'] = df_test['Type'].map(type_cat)
df_test['date_date'] = pd.to_datetime(df_test['date'], format='%d/%m/%Y')
df_test['month'] = df_test['date_date'].dt.month
df_test['week'] = df_test['date_date'].dt.week
df_test['holidays'] = df_test['week'].isin(['47', '50', '51']).astype('int')

df_stde = df_train.groupby(['store', 'dept']).agg(['mean', 'std', 'median'])['weekly_sales'].reset_index()
df_train = pd.merge(df_train, df_stde, on=['store', 'dept'], how='left').fillna(-1)
df_test = pd.merge(df_test, df_stde, on=['store', 'dept'], how='left').fillna(-1)

In [6]:
#set feats to use in algorithm
feats_num = df_train.select_dtypes(['number', 'boolean']).columns
feats_drop = ['id', 'weekly_sales']

def get_feat(feats_num, feats_drop, feats_add=[]):
#     print(feats_drop)
    feats = [feat for feat in feats_num if feat not in feats_drop]
    return feats + feats_add

In [7]:
#xgboost parameters
params = {'max_depth': 5,
            'colsample_bytree': 0.9,
            'learning_rate': 0.2,
            'subsample': 0.9,
            'random_state': 21,
            'n_estimators': 100,
            }

def validate_model(df_train, feats, params, eli=False):
    X = df_train[feats].values
    y = df_train['weekly_sales'].values
    train_size = int(len(X) * 0.7)
    X_train, X_test = X[0:train_size], X[train_size:len(X)]
    y_train, y_test = y[0:train_size], y[train_size:len(X)] 

    model = XGBRegressor(**params)
    y_train_log = np.log(y_train - (np.min(y_train) -1))
    model.fit(X_train, y_train_log)
    y_pred_log = model.predict(X_test)
    y_pred = np.exp(y_pred_log) + (np.min(y_train) -1)
    score = mean_absolute_error(y_test, y_pred)
    print('MAE: {}'.format(score))

    if eli:
        model.fit(X, y)
        display(eli5.show_weights(model, feature_names=feats))
        
def submit(df_train, df_test, feats, params):
    
    X = df_train[feats].values
    y = df_train['weekly_sales'].values
    X_test = df_test[feats].values
    
    model = XGBRegressor(**params)
    y_log = np.log(y - (np.min(y) -1))
    model.fit(X, y_log)
    y_pred_log = model.predict(X_test)
    y_pred = np.exp(y_pred_log) + (np.min(y) -1)
    
    df_test['weekly_sales'] = y_pred
    df_test[ ['id', 'weekly_sales'] ].to_csv("../output/xgb_1.csv", index=False)

In [21]:
params = {'max_depth': 9,
            'colsample_bytree': 0.9,
            'learning_rate': 0.15,
            'subsample': 0.9,
            'random_state': 21,
            'n_estimators':300,
            }
feats = get_feat(feats_num, feats_drop)
validate_model(df_train, feats, params)

MAE: 1731.110107421875


In [26]:
feats = get_feat(feats_num, feats_drop)
submit(df_train, df_test, feats, params)