In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

## Test set manipulation

In [None]:
# Load the raw test set.
# NOTE(review): per the pandas docs, parse_dates=True only asks read_csv to try
# parsing the index, so the 'Date' column still needs an explicit conversion.
test = pd.read_csv(
    'data/test.csv',
    parse_dates=True,
)

In [None]:
# Convert 'Date' to a real datetime column. Plain column assignment is used
# instead of `test.loc[:, 'Date'] = ...` because on pandas >= 2.0 the
# `.loc[:, col]` form writes into the existing column in place, which leaves an
# object-dtype column and makes the `.dt` accessor below fail.
test['Date'] = pd.to_datetime(test['Date'], format='%Y-%m-%d')

# Recompute the weekday from the date; `.dt.weekday` is 0 for Monday, so the
# stored value is 1 = Monday ... 7 = Sunday.
test['DayOfWeek'] = test['Date'].dt.weekday + 1

# Normalise numeric zeros in 'StateHoliday' to the string '0'.
# One comparison is enough: `== 0` also matches 0.0.
test.loc[test['StateHoliday'] == 0, 'StateHoliday'] = '0'

In [None]:
# Discard the 'Customers' and 'SchoolHoliday' columns from the test frame.
test = test.drop(columns=['Customers', 'SchoolHoliday'])

In [None]:
# Calendar month taken from the 'Date' column.
test['Month'] = test['Date'].dt.month

In [None]:
# Collapse 'Month' into a December indicator: 1 for month 12, 0 for anything else.
# NOTE: this overwrites 'Month', so running the cell a second time (without
# re-running the month extraction first) yields all zeros.
test['Month'] = (test['Month'] == 12).astype(test['Month'].dtype)

In [None]:
# Read the store-level table and left-join it onto the test rows by 'Store',
# so every test row is kept even when its store has no match.
stores_light = pd.read_csv('stores_light.csv')
test = pd.merge(test, stores_light, on='Store', how='left')

In [None]:
# Display the test frame after the store table has been merged in.
test

In [None]:
# Load the raw training set (parse_dates=True only targets the index, so 'Date'
# is converted explicitly further down).
train = pd.read_csv('data/train.csv', parse_dates=True)

# Impute missing Sales as 0 when the store was closed or had no customers.
closed_or_no_customers = (train['Open'] == 0) | (train['Customers'] == 0)
train.loc[train['Sales'].isnull() & closed_or_no_customers, 'Sales'] = 0

# Impute missing Customers as 0 when there were no sales.
train.loc[train['Customers'].isnull() & (train['Sales'] == 0), 'Customers'] = 0

# Keep rows with known Sales, a known Store, and that are not explicitly closed
# (rows with a missing 'Open' pass this filter and are resolved below).
# All masks are built from `train` and applied to `train`, so their indexes
# always match; the original applied a `train` mask to an already-filtered frame.
# `.copy()` gives an independent frame so the assignments below do not write
# into a slice of `train`.
no_null_sales = train.loc[
    train['Sales'].notnull()
    & (train['Open'] != 0)
    & train['Store'].notnull()
].copy()

# Plain column assignment (not `.loc[:, 'Date'] = ...`) so the column really
# becomes datetime64; on pandas >= 2.0 the `.loc[:, col]` form sets values in
# place and keeps the object dtype, which breaks `.dt` on the next line.
no_null_sales['Date'] = pd.to_datetime(no_null_sales['Date'], format='%Y-%m-%d')

# `.dt.weekday` is 0 for Monday, so this stores 1 = Monday ... 7 = Sunday.
no_null_sales['DayOfWeek'] = no_null_sales['Date'].dt.weekday + 1

# Normalise numeric zeros in 'StateHoliday' to the string '0' (`== 0` also matches 0.0).
no_null_sales.loc[no_null_sales['StateHoliday'] == 0, 'StateHoliday'] = '0'

# Infer a missing 'Open' flag from Sales: no sales -> closed, some sales -> open.
open_missing = no_null_sales['Open'].isnull()
no_null_sales.loc[open_missing & (no_null_sales['Sales'] == 0), 'Open'] = 0
no_null_sales.loc[open_missing & (no_null_sales['Sales'] > 0), 'Open'] = 1

# Only open stores, enriched with the store-level table.
no_closed_stores = no_null_sales.loc[no_null_sales['Open'] == 1]
no_closed_stores = no_closed_stores.merge(stores_light, how='left', on='Store')

# Drop rows with Sales of 800 or less.
# NOTE(review): the rationale for the 800 threshold is not shown here; confirm.
no_closed_stores = no_closed_stores.loc[no_closed_stores['Sales'] > 800]

In [None]:
# Mean training Sales per level of each categorical column, as {level: mean}.
# 'Sales' is selected *before* aggregating so only that column is averaged:
# calling .mean() on the whole grouped frame also tries to average the
# non-numeric columns, which raises TypeError on pandas >= 2.0 and is wasted
# work on older versions.
assortment_dict = no_closed_stores.groupby('Assortment')['Sales'].mean().to_dict()
storetype_dict = no_closed_stores.groupby('StoreType')['Sales'].mean().to_dict()
store_dict = no_closed_stores.groupby('Store')['Sales'].mean().to_dict()
holiday_dict = no_closed_stores.groupby('StateHoliday')['Sales'].mean().to_dict()

In [None]:
# Replace each categorical column by the mean training Sales of its level.
# Levels missing from a dict map to NaN.
for column, sales_by_level in (
    ('Store', store_dict),
    ('StateHoliday', holiday_dict),
    ('StoreType', storetype_dict),
    ('Assortment', assortment_dict),
):
    test[column] = test[column].map(sales_by_level)

# 1 when a promo is active on a Monday (DayOfWeek == 1), otherwise 0.
promo_on_monday = (test['Promo'] == 1) & (test['DayOfWeek'] == 1)
test['PromoMonday'] = promo_on_monday.astype('int64')

## Download, import model and make predictions

In [None]:
import urllib.request

# Location of the serialized model.
url = "https://www.dropbox.com/s/q2e41mzg759fzzu/model.sav?dl=1"

# Fetch the whole payload; the context manager closes the HTTP response even
# if the read fails part-way.
with urllib.request.urlopen(url) as u:
    data = u.read()

# Persist the bytes next to the notebook as 'model.sav'.
with open('model.sav', "wb") as f:
    f.write(data)

In [None]:
import pickle

# NOTE(review): unpickling can execute arbitrary code embedded in the file.
# 'model.sav' is fetched over the network in this notebook, so only run this
# when the download source is trusted.
# The `with` block closes the file handle once the model has been loaded.
with open('model.sav', 'rb') as model_file:
    model = pickle.load(model_file)

In [None]:
# Replace every remaining missing value with 0 before predicting.
test = test.fillna(0)

# The model is fed every column except the date and the target.
test_features = test.drop(columns=['Date', 'Sales'])
preds = model.predict(test_features)

# Ground-truth target, kept for scoring.
actuals = test['Sales']

## Get score

In [None]:
def metric(preds, actuals):
    """Return the root mean squared percentage error, expressed in percent.

    Computes ``100 * sqrt(mean(((actuals - preds) / actuals) ** 2))``.

    Parameters
    ----------
    preds : array-like
        Predicted values; flattened to 1-D.
    actuals : array-like
        True values; flattened to 1-D. Must have as many elements as
        ``preds`` and should not contain zeros (they are used as the divisor).

    Returns
    -------
    float
        The percentage error.

    Raises
    ------
    AssertionError
        If the flattened inputs differ in length.
    """
    # np.asarray lets callers pass lists or pandas Series, not only ndarrays.
    preds = np.asarray(preds, dtype=float).reshape(-1)
    actuals = np.asarray(actuals, dtype=float).reshape(-1)
    assert preds.shape == actuals.shape
    return 100 * np.linalg.norm((actuals - preds) / actuals) / np.sqrt(preds.shape[0])

def get_score(actuals, preds):
    """Score predictions with ``metric``, ignoring rows whose actual value is 0.

    Parameters
    ----------
    actuals : array-like
        True values.
    preds : array-like
        Predicted values, one per entry of ``actuals``.

    Returns
    -------
    float
        ``metric`` evaluated on the rows where the actual value is non-zero.
    """
    scored = pd.DataFrame({'Actuals': actuals, 'Preds': preds})
    # Zero actuals would cause a division by zero inside metric().
    scored = scored.loc[scored['Actuals'] != 0, :]
    # Pass by keyword: metric's signature is (preds, actuals), the reverse of
    # this function's, and passing them positionally in the wrong order
    # normalises the error by the predictions instead of the actual values.
    return metric(
        preds=scored['Preds'].to_numpy(),
        actuals=scored['Actuals'].to_numpy(),
    )

In [None]:
# Score the model's predictions against the test-set 'Sales' values.
get_score(actuals, preds)