In [1]:
import pandas as pd 
import numpy as np 
import plotly.express as px
from fbprophet import Prophet
from sklearn.metrics import mean_squared_error
from math import sqrt
from statsmodels.distributions.empirical_distribution import ECDF
import datetime
import seaborn as sns

import matplotlib.dates as md
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import host_subplot
import mpl_toolkits.axisartist as AA
plt.style.use(['fast', 'seaborn-whitegrid'])

In [2]:
train = pd.read_csv('DA1920_train.csv')
store = pd.read_csv('clean_store.csv')
train.shape
train.head()


Columns (7) have mixed types. Specify dtype option on import or set low_memory=False.



Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,31/07/2015,5263,555,1,1,0,1
1,2,5,31/07/2015,6064,625,1,1,0,1
2,3,5,31/07/2015,8314,821,1,1,0,1
3,4,5,31/07/2015,13995,1498,1,1,0,1
4,5,5,31/07/2015,4822,559,1,1,0,1


In [3]:
train.dtypes
train['StateHoliday'].value_counts()

0    855087
0    131072
a     20260
b      6690
c      4100
Name: StateHoliday, dtype: int64

In [4]:
train['StateHoliday'] = train['StateHoliday'].replace(['0'], 0)

In [5]:
store.isna().any()
store = store.drop(['Promofactor', 'Assortfactor', 'PromoYear'], axis = 1)

In [6]:
merge = pd.merge(train, store, on = 'Store', how = 'inner')

In [7]:
merge.head()

Unnamed: 0.1,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Unnamed: 0,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,31/07/2015,5263,555,1,1,0,1,1,c,a,1270,9,2008,0,x,x,x
1,1,4,30/07/2015,5020,546,1,1,0,1,1,c,a,1270,9,2008,0,x,x,x
2,1,3,29/07/2015,4782,523,1,1,0,1,1,c,a,1270,9,2008,0,x,x,x
3,1,2,28/07/2015,5011,560,1,1,0,1,1,c,a,1270,9,2008,0,x,x,x
4,1,1,27/07/2015,6102,612,1,1,0,1,1,c,a,1270,9,2008,0,x,x,x


In [8]:
merge.to_csv('allstore.csv', index = False)

In [None]:
df = pd.read_csv('allstore.csv')
df.head()

Unnamed: 0.1,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday,Unnamed: 0,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,5,31/07/2015,5263,555,1,1,0,1,1,c,a,1270,9,2008,0,x,x,x
1,1,4,30/07/2015,5020,546,1,1,0,1,1,c,a,1270,9,2008,0,x,x,x
2,1,3,29/07/2015,4782,523,1,1,0,1,1,c,a,1270,9,2008,0,x,x,x
3,1,2,28/07/2015,5011,560,1,1,0,1,1,c,a,1270,9,2008,0,x,x,x
4,1,1,27/07/2015,6102,612,1,1,0,1,1,c,a,1270,9,2008,0,x,x,x


In [None]:
df.isnull().any()
df = df.drop(['Unnamed: 0'], axis = 1)

# Feature engineering 

In [None]:
# Dates 
df['Date'] = pd.to_datetime(df['Date'])
df['quarter'] = df['Date'].apply(lambda x: x.quarter)

df['weekday'] = df.apply(lambda row: row['Date'].weekday(), axis = 1)
df['weekday'] = (df['weekday'] < 5).astype(int)

df.set_index('Date', inplace = True)

df['year'] = df.index.year
df['month'] = df.index.month
df['day'] = df.index.day
df['week_of_year'] = df.index.weekofyear

df['day_of_year'] = df.index.dayofyear
df['day_of_week'] = df.index.dayofweek
df.head()

# Kurtosis & Skewness
- kurtosis close to 0 = Normal
- Skewness between -0.5 & 0.5 = symmetry 

In [None]:
sns.distplot(df.Sales);

print('Kurtosis:{}'.format(stats.kurtosis(df.Sales)))
print('Skewness:{}'.format(stats.skew(df.Sales)))

In [None]:
plt.figure(figsize=(14,6))
plt.subplot(1,2,1)
df['Sales'].hist(bins=50)
plt.title('Sales Distribution')
plt.subplot(1,2,2)
stats.probplot(df['Sales'], plot=plt);

# Does not look normally distributed 

# Resampling over time 

In [None]:
df1 = df.loc[:, ['Date', 'Sales']]

fig = plt.figure(figsize=(18,16))
fig.subplots_adjust(hspace=.4)
ax1 = fig.add_subplot(5,1,1)
ax1.plot(df1['Sales'].resample('D').mean(),linewidth = 1)
ax1.set_title('Mean Sales resampled over day')
ax1.tick_params(axis='both', which='major')

ax2 = fig.add_subplot(5,1,2, sharex=ax1)
ax2.plot(df1['Sales'].resample('W').mean(),linewidth=1)
ax2.set_title('Mean Sales resampled over week')
ax2.tick_params(axis='both', which='major')

ax3 = fig.add_subplot(5,1,3, sharex=ax1)
ax3.plot(df1['Sales'].resample('M').mean(),linewidth=1)
ax3.set_title('Mean Sales resampled over month')
ax3.tick_params(axis='both', which='major')

ax4  = fig.add_subplot(5,1,4, sharex=ax1)
ax4.plot(df1['Sales'].resample('Q').mean(),linewidth=1)
ax4.set_title('Mean Sales resampled over quarter')
ax4.tick_params(axis='both', which='major')

ax5  = fig.add_subplot(5,1,5, sharex=ax1)
ax5.plot(df1['Sales'].resample('A').mean(),linewidth=1)
ax5.set_title('Mean Sales resampled over year')
ax5.tick_params(axis='both', which='major');

In [None]:
plt.figure(figsize=(14,8))
plt.subplot(2,2,1)
df.groupby('year').Sales.agg('mean').plot()
plt.xlabel('')
plt.title('Mean Sales by Year')

plt.subplot(2,2,2)
df.groupby('quarter').Sales.agg('mean').plot()
plt.xlabel('')
plt.title('Mean Sales by Quarter')

plt.subplot(2,2,3)
df.groupby('month').Sales.agg('mean').plot()
plt.xlabel('')
plt.title('Mean Sales by Month')

plt.subplot(2,2,4)
df.groupby('day_of_week').Sales.agg('mean').plot()
plt.xlabel('')
plt.title('Mean Sales by Day of week');

In [None]:
# df = df.reset_index(drop = True)

pd.pivot_table(df, values = "Sales", 
               columns = "year", index = "month").plot(subplots = True,
                                                       figsize=(12, 12), 
                                                       layout=(3, 5), 
                                                       sharey=True);

In [None]:
plt.figure(figsize=(14,5))
plt.subplot(2,2,1)
plt.subplots_adjust(wspace=0.2)

sns.boxplot(x="year", y="Sales", data=df)
plt.xlabel('year')
plt.title('Box plot of Yearly Sales')
sns.despine(left=True)
plt.tight_layout()

plt.subplot(2,2,2)
sns.boxplot(x="quarter", y="Sales", data=df)
plt.xlabel('quarter')
plt.title('Box plot of Quarterly Sales')
sns.despine(left=True)
plt.tight_layout()

plt.subplot(2,2,3)
sns.boxplot(x="month", y="Sales", data=df)
plt.xlabel('month')
plt.title('Box plot of monthly Sales')
sns.despine(left=True)
plt.tight_layout();

plt.subplot(2,2,4)
sns.boxplot(x="day_of_week", y="Sales", data=df)
plt.xlabel('day_of_week')
plt.title('Box plot of day of week Sales')
sns.despine(left=True)
plt.tight_layout();

# StateHoliday 

In [None]:
def SH(c):
    if c['StateHoliday'] == 'a':
        return 1
    elif c['StateHoliday'] == 'b':
        return 1
    elif c['StateHoliday'] == 'c':
        return 1
    else:
        return 0
    
df['IsStateHoliday'] = df.apply(SH, axis = 1)

In [None]:
g = sns.FacetGrid(df, 
                 col = 'IsStateHoliday', 
                 height = 4, aspect = .8)

g.map(sns.barplot, 'IsStateHoliday', 'Sales');

In [None]:
data = pd.concat([df['StoreType'], df['Sales'], df['IsStateHoliday']], axis=1)
plt.figure(figsize=(20,6))
plt.title('Box Plot of Sales by StoreType and StateHoliday')
fig = sns.boxplot(x='StoreType', y='Sales', 
                  data=data, showfliers=False, hue="IsStateHoliday")

In [None]:
df.head()

In [None]:
dic={0:'Weekend',1:'Weekday'}
df['Day'] = df.weekday.map(dic)

In [None]:
# Weekend / Weekday

dic={0:'Weekend',1:'Weekday'}
df['Day'] = df.weekday.map(dic)
a=plt.figure(figsize=(9,4)) 

plt1=sns.boxplot('year','Sales',
                 hue='Day',
                 width=0.6,
                 fliersize=3,
                 data=df)     

a.legend(loc='upper center', bbox_to_anchor=(0.5, 1.00), shadow=True, ncol=2)
sns.despine(left=True, bottom=True) 
plt.xlabel('')
plt.tight_layout()                                                                                                                  
plt.legend().set_visible(False);

In [None]:
plt1=sns.factorplot('year','Sales',hue='Day',
                    data=df, size=4, aspect=1.5, legend=False)                                                                                                                                                                                                                                                                                                                                             
plt.title('Factor Plot of Sales by Weekend/Weekday')                                                             
plt.tight_layout()                                                                                                                  
sns.despine(left=True, bottom=True) 
plt.legend(loc='right');

In [None]:
sales_df = pd.DataFrame(data={'Sales':df['Sales'], 
                              'IsStateHoliday':df['IsStateHoliday']})

# GroupBy to see sales difference between 0 and 1 (StateHoliday)
sales_df['Sales'] = sales_df['Sales'].groupby(sales_df['IsStateHoliday']).transform('sum')

In [None]:
sns.barplot(x='IsStateHoliday', y='Sales', data=sales_df).set_yscale('log')

#sns.barplot(x='IsStateHoliday', y='Sales', data=sales_df)

# Stat test 

- D'Agostino K-sq test
    - quantify if data is drawn from Gaussian / Normal distribution
   
- Dickey-Fuller
    - Stationarity test

In [None]:
import math
from scipy import stats 
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.stattools import pacf

stat, p = stats.normaltest(df.Sales)

# H0 = not normal 
# H1 = normal 
# P <= alpha: reject H0 in favor of H1 
# P > alpha: fail to reject H0
print('Statistics = %.3f, p = %.3f' % (stat, p))

alpha = .05
2
if p > alpha:
    print('Not normally distributed (do not reject H0)')
else:
    print('Is normally distributed (reject H0)')

In [None]:
df1 = pd.read_csv('allstore.csv')
df1.head()

In [None]:
# Dickey-Fuller 
df1=df.resample('D', how=np.mean)

def test_stationarity(timeseries):
    rolmean = timeseries.rolling(window=30).mean() # 30 days 
    rolstd = timeseries.rolling(window=30).std()
    
    plt.figure(figsize=(27,9))
    sns.despine(left=True)
    orig = plt.plot(timeseries, color='gold',label='Original')
    mean = plt.plot(rolmean, color='green', label='Rolling Mean')
    std = plt.plot(rolstd, color='red', label = 'Rolling Std')

    plt.legend(loc='best'); plt.title('Rolling Mean & Standard Deviation')
    plt.show()
    
    print ('<Results of Dickey-Fuller Test>')
    dftest = adfuller(timeseries, autolag='AIC')
    dfoutput = pd.Series(dftest[0:4],
                         index=['Test Statistic','p-value','#Lags Used','Number of Observations Used'])
    for key,value in dftest[4].items():
        dfoutput['Critical Value (%s)'%key] = value
    print(dfoutput)
    
test_stationarity(df1.Sales.dropna())

#### p-val > 0.05 = Do not reject H0
- H0: data has unit root and is non-stationary 

#### p-val < 0.05 = Reject H0 
- data does not have unit root and is stationary

#### critical values 
- must be close to test statistic 

In [None]:
df.head()

In [None]:
df.reset_index(inplace=True)
df.head()

In [None]:
df.to_csv('model_data.csv', index = False)

# Encoding cats 

In [None]:
df = pd.read_csv('model_data.csv')
df.sort_values(by = ['Date'], inplace = True)
df.head()

In [None]:
x = pd.get_dummies(df, columns = ['StateHoliday','StoreType'])
x.head()

# Test data

In [None]:
test = pd.read_csv('DA1920_test.csv')

In [9]:
from datetime import datetime

dates = pd.date_range(start = '08/01/2015',
                     end = '17/09/2015',
                     freq = 'D')

In [10]:
te = pd.DataFrame(dates, columns = ['Date'])

# te['Store'] = train['Store']

In [12]:
te.head()

Unnamed: 0,Date
0,2015-08-01
1,2015-08-02
2,2015-08-03
3,2015-08-04
4,2015-08-05
