In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns

In [2]:
def load(test=False):
    
    if test:
        X = pd.read_csv('test.csv')
        X.drop(['Id'], axis=1, inplace=True)
        y = None
    else:
        X = pd.read_csv('train.csv')
        y = X[['Sales']].T.values[0]
    
    return X, y   

In [3]:
def loadStore():
    return pd.read_csv('store.csv')

In [4]:
def rmspe(y_true, y_pred):
    return np.sqrt(np.mean((y_true-y_pred)/y_true) ** 2)

from sklearn.metrics import make_scorer

rmspe_scorer = make_scorer(rmspe, greater_is_better=False)

In [5]:
def rmspe_xgb(y, predicted):
    predicted = predicted.get_label()
    return "rmspe", rmspe(y, predicted)

# Load Data

In [29]:
train,y = load()

In [30]:
test, _ = load(True)

In [31]:
store = loadStore()

In [32]:
train_df = pd.merge(train, store, left_on='Store', right_on='Store', how='inner')
test_df = pd.merge(test, store, left_on='Store', right_on='Store', how='inner')

# Read saved data

In [52]:
train_df = pd.read_csv('train_df.csv').drop(['Unnamed: 0'], axis=1)
test_df = pd.read_csv('test_df.csv').drop(['Unnamed: 0'], axis=1)

# Getting features

In [33]:
#Fill NaN with Open = 1, because sales in this days > 0
test_df.Open.fillna(1, inplace=True)

In [34]:
#Getting some features from 'Date'
train_df['Year'] = pd.to_datetime(train_df.Date).map(lambda x: x.year)
train_df['DayOfYear'] = pd.to_datetime(train_df.Date).map(lambda x: x.dayofyear)
train_df['Month'] = pd.to_datetime(train_df.Date).map(lambda x: x.month)

test_df['Year'] = pd.to_datetime(test_df.Date).map(lambda x: x.year)
test_df['DayOfYear'] = pd.to_datetime(test_df.Date).map(lambda x: x.dayofyear)
test_df['Month'] = pd.to_datetime(test_df.Date).map(lambda x: x.month)

In [35]:
#Promo2 to 0 - if no Promo2, 1 - if in this day Promo2 is available
train_df.Promo2SinceYear.fillna(2016, inplace=True)
train_df.Promo2SinceWeek.fillna(1, inplace=True)
train_df.PromoInterval.fillna(0, inplace=True)

test_df.Promo2SinceYear.fillna(2016, inplace=True)
test_df.Promo2SinceWeek.fillna(1, inplace=True)
test_df.PromoInterval.fillna(0, inplace=True)


dictionary1 = {0:0, 'Jan,Apr,Jul,Oct': 1, 'Feb,May,Aug,Nov': 2, 'Mar,Jun,Sept,Dec': 3}
dictionary2 = {1:1, 4:1, 7:1, 10:1, 2:2, 5:2, 8:2, 11:2, 3:3, 6:3, 9:3, 12:3}

train_df['PromoInterval'] = train_df['PromoInterval'].map(dictionary1)
train_df['MonthInterval'] = train_df['Month'].map(dictionary2)

test_df['PromoInterval'] = test_df['PromoInterval'].map(dictionary1)
test_df['MonthInterval'] = test_df['Month'].map(dictionary2)



train_df['Promo2'] = np.sign((train_df.Year - train_df.Promo2SinceYear) * 365 + 
                              (train_df.DayOfYear - train_df.Promo2SinceWeek * 7))

test_df['Promo2'] = np.sign((test_df.Year - test_df.Promo2SinceYear) * 365 + 
                              (test_df.DayOfYear - test_df.Promo2SinceWeek * 7))

def binarizePromo2(df):

    promo2 = []
    
    for row in xrange(len(df)):

        if df.Promo2[row] == 1 and df.PromoInterval[row] == df.MonthInterval[row]:
            promo2.append(1)
        else:
            promo2.append(0)
    
    return promo2
        
train_df['Promo2'] = pd.DataFrame(binarizePromo2(train_df))
test_df['Promo2'] = pd.DataFrame(binarizePromo2(test_df))

In [36]:
#CompetitionOpen in days
train_df['CompetitionOpen'] = ((train_df.Year - train_df.CompetitionOpenSinceYear) * 365 + 
                                (train_df.DayOfYear - train_df.CompetitionOpenSinceMonth * 30))

test_df['CompetitionOpen'] = ((test_df.Year - test_df.CompetitionOpenSinceYear) * 365 + 
                                (test_df.DayOfYear - test_df.CompetitionOpenSinceMonth * 30))

In [37]:
#fill NaN with median
med = train_df.CompetitionDistance.median()
train_df['CompetitionDistance'] = train_df.CompetitionDistance.fillna(med)

med = train_df.CompetitionOpen.median()
train_df['CompetitionOpen'] = train_df.CompetitionOpen.fillna(med)


med = test_df.CompetitionDistance.median()
test_df['CompetitionDistance'] = test_df.CompetitionDistance.fillna(med)

med = test_df.CompetitionOpen.median()
test_df['CompetitionOpen'] = test_df.CompetitionOpen.fillna(med)

In [41]:
train_df.CompetitionDistance = train_df.CompetitionDistance.astype(int)
train_df.CompetitionOpen = train_df.CompetitionOpen.astype(int)

test_df.CompetitionDistance = test_df.CompetitionDistance.astype(int)
test_df.CompetitionOpen = test_df.CompetitionOpen.astype(int)

In [38]:
train_df = train_df[train_df.Sales <> 0]

# Category features

In [39]:
#Category featores to digits
from sklearn.preprocessing import LabelEncoder

label_enc = LabelEncoder()

enc_list = ['StoreType', 'Assortment', 'StateHoliday']

for name in enc_list:
    
    train_df[name] = label_enc.fit_transform(train_df[name])
    test_df[name] = label_enc.fit_transform(test_df[name])

In [40]:
train_df.drop(['Promo2SinceYear', 'Promo2SinceWeek', 'CompetitionOpenSinceYear', 
'CompetitionOpenSinceMonth', 'PromoInterval', 'MonthInterval', 'Year'], axis=1, inplace=True)

test_df.drop(['Promo2SinceYear', 'Promo2SinceWeek', 'CompetitionOpenSinceYear', 
'CompetitionOpenSinceMonth', 'PromoInterval', 'MonthInterval', 'Year'], axis=1, inplace=True)

# Metafeatures

In [46]:
#Split dataset into the 2 parts 50/50
from sklearn.cross_validation import train_test_split

drop_cols = ['Open', 'Sales', 'Customers', 'Date']

X = train_df.drop(drop_cols, axis=1).values
y_c = train_df['Customers'].T.values

X_tr, X_te, y_tr, y_te = train_test_split(X, y_c, test_size=0.50)

test = test_df.drop(['Open', 'Date'], axis=1).values

In [47]:
#train first half to predict 'Customers'
#predict 'Customers' to the second half
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(criterion='mse', n_jobs=-1, n_estimators=25, random_state=0)

result_tr = [0, 0]
result_te = [0, 0]

clf.fit(X_tr, y_tr)
result_tr[0] = clf.predict(X_te) 
result_te[0] = clf.predict(test) 

clf.fit(X_te, y_te)
result_tr[1] = clf.predict(X_tr) 
result_te[1] = clf.predict(test) 

In [48]:
#for test avarage of 2 predictions
train_df['Meta'] = np.concatenate((result_tr[0], result_tr[1]), axis=0).astype(int)
test_df['Meta'] = ((result_te[0] + result_te[1]) / 2).astype(int)

In [49]:
train_df.to_csv('train_df.csv')
test_df.to_csv('test_df.csv')

# Scoring

In [64]:
import xgboost

def fit_xgb(X_tr, X_te, y_tr, y_te):
    xgb_tr, xgb_te, yxgb_tr, yxgb_te = train_test_split(X_tr, y_tr, test_size=0.2, random_state=10)

    dtrain = xgboost.DMatrix(xgb_tr, label = yxgb_tr)
    dval = xgboost.DMatrix(xgb_te, label = yxgb_te)


    params = {'objective': 'reg:linear',
              'booster': 'gbtree',
              'eta': 0.3,
              'max_depth': 13,
              'subsample': 0.9,
              'colsample_bytree': 0.7,
              'silent': 1,
              'seed': 1001,
              'nthread': 4
              }
    num_round = 300

    plst = list(params.items())

    evallist  = [(dval,'eval'), (dtrain,'train')]

    bst = xgboost.train( plst, dtrain, num_round, evallist, early_stopping_rounds=150, feval=rmspe_xgb, verbose_eval=True)

    y_val = bst.predict(xgboost.DMatrix(X_te))

    print 'Error', np.fabs(rmspe(y_te, y_val))
    
    return bst

In [65]:
#split on time
drop_cols = ['Open', 'Sales', 'Customers', 'Date']

mask = [(pd.to_datetime(train_df.Date) < np.datetime64('2015-06-01T00:00:00.000000000+0000'),
        pd.to_datetime(train_df.Date) >= np.datetime64('2015-06-01T00:00:00.000000000+0000'))]

y_tr = train_df.loc[mask[0][0]].Sales.T.values
y_te = train_df.loc[mask[0][1]].Sales.T.values

X_tr = train_df.loc[mask[0][0], :].drop(drop_cols, axis=1).values
X_te = train_df.loc[mask[0][1], :].drop(drop_cols, axis=1).values

test = test_df.drop(['Open', 'Date'], axis=1).values

cols = train_df.drop(drop_cols, axis=1).columns

In [66]:
bst = fit_xgb(X_tr, X_te, y_tr, y_te)

Will train until train error hasn't decreased in 150 rounds.
[0]	eval-rmspe:2.347864	train-rmspe:2.341150
[1]	eval-rmspe:0.939040	train-rmspe:0.936635
[2]	eval-rmspe:0.498209	train-rmspe:0.496364
[3]	eval-rmspe:0.291911	train-rmspe:0.290270
[4]	eval-rmspe:0.178625	train-rmspe:0.177037
[5]	eval-rmspe:0.109437	train-rmspe:0.107920
[6]	eval-rmspe:0.065287	train-rmspe:0.063789
[7]	eval-rmspe:0.037720	train-rmspe:0.036255
[8]	eval-rmspe:0.019719	train-rmspe:0.018366
[9]	eval-rmspe:0.007956	train-rmspe:0.006562
[10]	eval-rmspe:0.000528	train-rmspe:0.001907
[11]	eval-rmspe:0.006076	train-rmspe:0.007427
[12]	eval-rmspe:0.009890	train-rmspe:0.011057
[13]	eval-rmspe:0.012503	train-rmspe:0.013550
[14]	eval-rmspe:0.013777	train-rmspe:0.014878
[15]	eval-rmspe:0.014240	train-rmspe:0.015262
[16]	eval-rmspe:0.014542	train-rmspe:0.015387
[17]	eval-rmspe:0.014596	train-rmspe:0.015508
[18]	eval-rmspe:0.014500	train-rmspe:0.015328
[19]	eval-rmspe:0.014460	train-rmspe:0.015340
[20]	eval-rmspe:0.014362	trai

Error 0.00376351296038


[349]	eval-rmspe:0.001374	train-rmspe:0.000934
Stopping. Best iteration:
[199]	eval-rmspe:0.000875	train-rmspe:0.000805



In [67]:
#split on random
from sklearn.cross_validation import train_test_split

drop_cols = ['Open', 'Sales', 'Customers', 'Date']

y = train_df['Sales'].T.values
X = train_df.drop(drop_cols, axis=1).values

test = test_df.drop(['Open', 'Date'], axis=1).values

cols = train_df.drop(drop_cols, axis=1).columns

X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.25, random_state=50)

In [68]:
bst = fit_xgb(X_tr, X_te, y_tr, y_te)

Will train until train error hasn't decreased in 150 rounds.
[0]	eval-rmspe:2.341537	train-rmspe:2.339708
[1]	eval-rmspe:0.948128	train-rmspe:0.946370
[2]	eval-rmspe:0.505538	train-rmspe:0.503917
[3]	eval-rmspe:0.293963	train-rmspe:0.292700
[4]	eval-rmspe:0.181760	train-rmspe:0.180721
[5]	eval-rmspe:0.111527	train-rmspe:0.110680
[6]	eval-rmspe:0.068449	train-rmspe:0.067730
[7]	eval-rmspe:0.040364	train-rmspe:0.039611
[8]	eval-rmspe:0.021799	train-rmspe:0.021068
[9]	eval-rmspe:0.008961	train-rmspe:0.008155
[10]	eval-rmspe:0.000558	train-rmspe:0.000249
[11]	eval-rmspe:0.005416	train-rmspe:0.006163
[12]	eval-rmspe:0.009425	train-rmspe:0.010235
[13]	eval-rmspe:0.012019	train-rmspe:0.012943
[14]	eval-rmspe:0.013823	train-rmspe:0.014600
[15]	eval-rmspe:0.014998	train-rmspe:0.015809
[16]	eval-rmspe:0.015451	train-rmspe:0.016338
[17]	eval-rmspe:0.015637	train-rmspe:0.016521
[18]	eval-rmspe:0.015415	train-rmspe:0.016109
[19]	eval-rmspe:0.015573	train-rmspe:0.016288
[20]	eval-rmspe:0.015259	trai

Error 0.0143505495191


Stopping. Best iteration:
[10]	eval-rmspe:0.000558	train-rmspe:0.000249



In [69]:
#importance of features
from sklearn.ensemble import RandomForestRegressor

clf = RandomForestRegressor(n_jobs=-1, random_state=10, n_estimators = 100)
  
clf.fit(X_tr, y_tr)

predicted = clf.predict(X_te)
    
imp = clf.feature_importances_
    
for i in xrange(len(cols)):
    
    print cols[i], imp[i]
    
print 'Error', np.fabs(rmspe(y_te, predicted))

Store 0.255037463271
DayOfWeek 0.0619847931429
Promo 0.135162833191
StateHoliday 0.00331530166946
SchoolHoliday 0.00552852774546
StoreType 0.0496856790686
Assortment 0.0209541175677
CompetitionDistance 0.251464786535
Promo2 0.00521432965735
DayOfYear 0.0656543783601
Month 0.00682628585218
CompetitionOpen 0.104620480348
Meta 0.0345510235913
Error 0.0210864899866


In [71]:
#cross_validation on RandomForest
from sklearn.ensemble import RandomForestRegressor
from sklearn.cross_validation import cross_val_score

clf = RandomForestRegressor(n_jobs=-1, random_state=10, n_estimators = 100)

scores = cross_val_score(clf, X_tr, y_tr, cv=5, scoring=rmspe_scorer)

print np.mean(scores)

-0.0218817547286


# Predict

In [101]:
predicted = bst.predict(xgboost.DMatrix(test))
test_df['predicted'] = (predicted * test_df['Open']).astype(int)

In [96]:
test_df['predicted'].to_csv('output.csv', index=False, index_label=False)