In [None]:
from pathlib import Path

import eli5
import numpy as np
import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [None]:
# Per-store metadata, joined onto the sales frames in the next step.
stores = pd.read_csv("../input/stores_data.csv")

# Weekly sales records for the training and test periods.
df_train, df_test = (
    pd.read_hdf(path)
    for path in ("../input/sales_train.h5", "../input/sales_test.h5")
)

In [None]:
# Attach the store metadata columns to every sales row via a left join on the
# store identifier, then drop the duplicated join key coming from `stores`.
# `validate='many_to_one'` makes pandas raise a MergeError if `stores` ever
# contains duplicate `Store` keys, instead of silently duplicating sales rows.
df_train = df_train.merge(
    stores, how='left', left_on='store', right_on='Store', validate='many_to_one'
).drop(columns='Store')
df_test = df_test.merge(
    stores, how='left', left_on='store', right_on='Store', validate='many_to_one'
).drop(columns='Store')

In [None]:
# Column dtypes, non-null counts and memory footprint of the merged training frame.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 285089 entries, 0 to 285088
Data columns (total 8 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   id            285089 non-null  int32  
 1   weekly_sales  285089 non-null  float32
 2   store         285089 non-null  int8   
 3   dept          285089 non-null  int8   
 4   date          285089 non-null  object 
 5   is_holiday    285089 non-null  bool   
 6   Type          285089 non-null  object 
 7   Size          285089 non-null  int64  
dtypes: bool(1), float32(1), int32(1), int64(1), int8(2), object(2)
memory usage: 11.7+ MB


In [None]:
# Eyeball five randomly drawn rows (no random_state is passed, so the rows differ between runs).
df_train.sample(5)

Unnamed: 0,id,weekly_sales,store,dept,date,is_holiday,Type,Size
113852,168756,9933.69043,5,10,28/10/2011,False,B,34875
252666,373445,7610.620117,25,3,25/06/2010,False,B,128107
42832,63486,21544.939453,4,80,12/08/2011,False,A,205863
218244,322798,22.940001,1,45,01/10/2010,False,A,151315
275517,407309,4862.890137,39,83,03/12/2010,False,A,184109


In [None]:
def type_cat(x):
    """Encode a store type label as an integer code.

    Returns 1 for "A", 2 for "B", and 3 for any other value.
    """
    return 1 if x == "A" else (2 if x == "B" else 3)
    

In [None]:
# Integer-encode the store type on the training frame.
# NOTE(review): feature_engineering() derives this same column again for both
# frames, so this standalone assignment looks redundant — confirm before removing.
df_train['type_cat'] = df_train['Type'].map(type_cat)

In [None]:
def _iso_week(dates):
    """Return the ISO-8601 week number of each timestamp in a datetime Series."""
    accessor = dates.dt
    if hasattr(accessor, 'isocalendar'):
        # `Series.dt.week` is deprecated and was removed in pandas 2.0;
        # `isocalendar().week` is the replacement but yields a nullable UInt32
        # column, so cast back to the plain dtype `.dt.week` used to produce
        # (float with NaN only when some dates are missing).
        week = accessor.isocalendar().week
        return week.astype('float64' if week.isna().any() else 'int64')
    return accessor.week  # older pandas that predates `isocalendar`


def feature_engineering(df, holiday_weeks=(47, 50, 51)):
    """Add the store-type code and calendar features to `df` in place.

    Columns written: `type_cat`, `date_date` (parsed from the `date` strings,
    which must be in day/month/year form), `month`, `week` (ISO week number),
    `year`, `dayofweek`, `dayofyear` and `holidays`.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with at least a `Type` column and a `date` column of
        'dd/mm/YYYY' strings. It is modified in place.
    holiday_weeks : iterable of int, optional
        ISO week numbers flagged with 1 in the `holidays` column
        (presumably the Thanksgiving / pre-Christmas weeks — confirm).

    Returns
    -------
    pandas.DataFrame
        The same `df` object, for convenient reassignment.
    """
    df['type_cat'] = df['Type'].map(type_cat)
    df['date_date'] = pd.to_datetime(df['date'], format='%d/%m/%Y')

    df["month"] = df["date_date"].dt.month
    df["week"] = _iso_week(df["date_date"])
    df["year"] = df["date_date"].dt.year
    df["dayofweek"] = df["date_date"].dt.dayofweek
    df["dayofyear"] = df["date_date"].dt.dayofyear

    # Week numbers are integers: comparing them against the strings
    # '47', '50', '51' never matched, which left this flag at 0 on every row.
    df['holidays'] = df['week'].isin(list(holiday_weeks)).astype('int')

    return df

# Derive the same engineered columns on both the training and the test frame.
df_train, df_test = map(feature_engineering, (df_train, df_test))

In [None]:
# Order the training rows chronologically. validate_model() splits its
# train / hold-out portions purely by row position, so the row order set here
# determines which rows end up in the hold-out slice.
df_train = df_train.sort_values(by='date_date')

In [None]:
# Per-(store, dept) mean / std / median of weekly sales, later merged in as features.
# `weekly_sales` is selected *before* aggregating: aggregating every column and
# picking `weekly_sales` afterwards also tries to average the text columns,
# which raises on pandas >= 2.0 and wastes work on older versions.
# NOTE(review): these statistics are computed over all training rows, including
# the rows validate_model() later holds out — possible target leakage into the
# validation score; confirm whether that is acceptable.
df_stde = (
    df_train
    .groupby(['store', 'dept'])['weekly_sales']
    .agg(['mean', 'std', 'median'])
    .reset_index()
)


In [None]:
# Join the per-(store, dept) sales statistics onto both frames; any missing
# value left in the result is replaced by the sentinel -1.
df_train, df_test = (
    frame.merge(df_stde, how='left', on=['store', 'dept']).fillna(-1)
    for frame in (df_train, df_test)
)

In [None]:
# Candidate model inputs: every numeric / boolean column of the training frame.
feats_num = df_train.select_dtypes(include=['number', 'boolean']).columns
# Columns that must never be used as features (row identifier and the target).
black_list = ['id', 'weekly_sales']

def get_feats(feats_num, black_list, feats_add=None):
    """Build the list of feature names to feed to the model.

    Parameters
    ----------
    feats_num : iterable
        Candidate feature names, kept in their original order.
    black_list : container
        Names to exclude from `feats_num`.
    feats_add : iterable, optional
        Extra names appended after the filtered candidates. Defaults to
        ``None`` (nothing appended) rather than a shared mutable ``[]``, and
        may be any iterable, not only a list.

    Returns
    -------
    list
        A new list: the non-excluded candidates followed by `feats_add`.
    """
    feats = [feat for feat in feats_num if feat not in black_list]
    if feats_add is not None:
        feats.extend(feats_add)
    return feats

In [None]:
# Baseline XGBoost hyper-parameters (a later cell rebinds `params` before use).
params = dict(
    max_depth=5,
    colsample_bytree=0.9,
    learning_rate=0.2,
    subsample=0.9,
    random_state=21,
    n_estimators=100,
)

def validate_model(df_train, feats, params, eli=False):
    """Fit on the first 70% of rows, score on the rest, and print the MAE.

    The split is positional: rows are taken in the order they appear in
    `df_train`. The target is shifted so its training minimum becomes 1 and
    then log-transformed for fitting; predictions are mapped back with the
    inverse transform before scoring against the raw hold-out target.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain the `feats` columns and `weekly_sales`.
    feats : list
        Feature column names.
    params : dict
        Keyword arguments for `XGBRegressor`.
    eli : bool, optional
        When true, the model is refit on all rows with the untransformed
        target and its eli5 weights are shown through the notebook-provided
        `display`.

    Returns
    -------
    None
    """
    features = df_train[feats].values
    target = df_train['weekly_sales'].values

    cut = int(len(features) * 0.7)
    X_fit, X_holdout = features[:cut], features[cut:]
    y_fit, y_holdout = target[:cut], target[cut:]

    # Offset that moves the smallest training target to exactly 1 so the log is defined.
    shift = np.min(y_fit) - 1

    regressor = XGBRegressor(**params)
    regressor.fit(X_fit, np.log(y_fit - shift))
    holdout_pred = np.exp(regressor.predict(X_holdout)) + shift

    mae = mean_absolute_error(y_holdout, holdout_pred)
    print('MAE: {}'.format(mae))

    if not eli:
        return

    regressor.fit(features, target)
    display(eli5.show_weights(regressor, feature_names=feats))
        
def submit(df_train, df_test, feats, params, output_path="../output/xgb_sol1.csv"):
    """Train on all of `df_train`, predict `df_test`, and write a submission CSV.

    The target is shifted so its minimum becomes 1 and log-transformed for
    fitting; predictions are mapped back with the inverse transform.

    Parameters
    ----------
    df_train : pandas.DataFrame
        Must contain the `feats` columns and `weekly_sales`.
    df_test : pandas.DataFrame
        Must contain the `feats` columns and `id`. A `weekly_sales` column
        holding the predictions is added to (or overwritten on) this frame
        in place.
    feats : list
        Feature column names.
    params : dict
        Keyword arguments for `XGBRegressor`.
    output_path : str or path-like, optional
        Destination CSV with the `id` and `weekly_sales` columns. Missing
        parent directories are created instead of letting the write fail.

    Returns
    -------
    None
    """
    X = df_train[feats].values
    y = df_train['weekly_sales'].values
    X_test = df_test[feats].values

    # Offset that moves the smallest target to exactly 1 so the log is defined.
    shift = np.min(y) - 1

    model = XGBRegressor(**params)
    model.fit(X, np.log(y - shift))
    y_pred = np.exp(model.predict(X_test)) + shift

    df_test['weekly_sales'] = y_pred

    output_path = Path(output_path)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    df_test[['id', 'weekly_sales']].to_csv(output_path, index=False)


In [None]:
# Configuration actually used below: replaces the earlier `params` with deeper
# trees, a lower learning rate and more boosting rounds.
params = dict(
    max_depth=9,
    colsample_bytree=0.9,
    learning_rate=0.15,
    subsample=0.9,
    random_state=21,
    n_estimators=300,
)
feats = get_feats(feats_num, black_list)
validate_model(df_train, feats=feats, params=params)

MAE: 1554.4569091796875


In [None]:
# Fit on the full training frame and write the submission CSV for the test set.
submit(df_train, df_test, feats, params)