In [1]:
import numpy as np
import pandas as pd

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline
from matplotlib import rcParams
# Default size (width, height in inches) for every figure drawn in this notebook.
rcParams["figure.figsize"] = (10, 6)
from sklearn import preprocessing

In [3]:
# Read the training, test and sample-submission CSVs from the local data/ folder.
train, test, sample = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/train_advanced.csv",
        "data/test_advanced.csv",
        "data/sample_submission.csv",
    )
)

# Exploratory Data Analysis

In [5]:
# Preview the first rows of the raw training data.
train.head()

Unnamed: 0,Date,Store,Item,Sales,Average Rating,Location,Category,Branding
0,1/1/2017,KMart,1,13,2.8,Wheelers Hill VIC 3170,Electronics,Sony
1,1/2/2017,KMart,1,11,1.4,Wheelers Hill VIC 3170,Electronics,Sony
2,1/3/2017,KMart,1,14,1.8,Wheelers Hill VIC 3170,Electronics,Sony
3,1/4/2017,KMart,1,13,1.3,Wheelers Hill VIC 3170,Electronics,Sony
4,1/5/2017,KMart,1,10,2.9,Wheelers Hill VIC 3170,Electronics,Sony


In [6]:
# Summarise column dtypes and non-null counts to spot columns with missing values.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 821000 entries, 0 to 820999
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   Date            821000 non-null  object 
 1   Store           821000 non-null  object 
 2   Item            821000 non-null  int64  
 3   Sales           821000 non-null  int64  
 4   Average Rating  821000 non-null  float64
 5   Location        820260 non-null  object 
 6   Category        821000 non-null  object 
 7   Branding        821000 non-null  object 
dtypes: float64(1), int64(2), object(5)
memory usage: 50.1+ MB


In [7]:
# List the distinct raw Location strings to see how consistently they are formatted.
train['Location'].unique()

array(['Wheelers Hill VIC 3170', 'Preston,\xa0VIC,\xa03072',
       'South Melbourne,\xa0VIC\xa03205',
       'Keysborough, VIC, 3173, Australia', 'Mulgrave\xa0VIC\xa03170',
       'Dandenong, VIC, 3175, Australia',
       'Springvale, VIC, 3171, Australia', 'Ringwood VIC 3134',
       ' Epping VIC 3076', 'Mulgrave VIC 3170', nan], dtype=object)

In [8]:
def changeLoc(x, y, CHANGE, df=None):
    """Rewrite one raw ``Location`` spelling to the canonical address format.

    Every value in the ``Location`` column equal to ``CHANGE`` is replaced by
    ``"<x>, VIC, <y>, Australia"``.

    Parameters
    ----------
    x : str
        Suburb name to put in the canonical address.
    y : int or str
        Postcode to put in the canonical address.
    CHANGE : object
        Raw value to look for. Passing ``np.nan`` fills missing locations.
    df : pandas.DataFrame, optional
        Frame whose ``Location`` column is updated. Defaults to the
        notebook-level ``train`` frame, which keeps existing three-argument
        calls working unchanged.

    Returns
    -------
    None
        The frame is updated in place.
    """
    target = train if df is None else df
    canonical = "{}, VIC, {}, Australia".format(x, y)
    # Assign the result back to the frame instead of calling
    # ``replace(..., inplace=True)`` on the selected column: the in-place call
    # works on an intermediate Series, which pandas warns about and which no
    # longer updates the frame once Copy-on-Write is enabled.
    target["Location"] = target["Location"].replace(CHANGE, canonical)
# Each entry is (suburb, postcode, raw spelling found in the data); the fixes
# are applied in the order listed.
location_fixes = [
    ("Preston", 3072, "Preston,\xa0VIC,\xa03072"),
    ("South Melbourne", 3205, "South Melbourne,\xa0VIC\xa03205"),
    ("Mulgrave", 3170, "Mulgrave\xa0VIC\xa03170"),
    ("Mulgrave", 3170, "Mulgrave VIC 3170"),
    ("Epping", 3076, " Epping VIC 3076"),
    ("Ringwood", 3134, "Ringwood VIC 3134"),
    ("Wheelers Hill", 3170, "Wheelers Hill VIC 3170"),
    # NOTE(review): rows with a missing Location are assigned to Mulgrave --
    # confirm this imputation is intended.
    ("Mulgrave", 3170, np.nan),
]
for suburb, postcode, raw_value in location_fixes:
    changeLoc(suburb, postcode, raw_value)

In [9]:
# Re-list the distinct Location values to verify the clean-up left a single format and no NaN.
train['Location'].unique()

array(['Wheelers Hill, VIC, 3170, Australia',
       'Preston, VIC, 3072, Australia',
       'South Melbourne, VIC, 3205, Australia',
       'Keysborough, VIC, 3173, Australia',
       'Mulgrave, VIC, 3170, Australia',
       'Dandenong, VIC, 3175, Australia',
       'Springvale, VIC, 3171, Australia',
       'Ringwood, VIC, 3134, Australia', 'Epping, VIC, 3076, Australia'],
      dtype=object)

Date

In [10]:
# Parse the month/day/year strings (e.g. "1/1/2017") into datetime values.
# Bracket notation is used for the assignment because attribute-style
# assignment only updates a column when the name already exists; a mistyped
# name would silently create a plain attribute instead.
train["Date"] = pd.to_datetime(train["Date"], format="%m/%d/%Y")

# Display the distinct parsed dates; the array's dtype confirms the conversion.
# (The former bare `train.Date.dtypes` line was removed: it was not the last
# expression of the cell, so its value was never shown.)
train["Date"].unique()

array(['2017-01-01T00:00:00.000000000', '2017-01-02T00:00:00.000000000',
       '2017-01-03T00:00:00.000000000', ...,
       '2021-06-28T00:00:00.000000000', '2021-06-29T00:00:00.000000000',
       '2021-06-30T00:00:00.000000000'], dtype='datetime64[ns]')

In [11]:
# Split the parsed Date column into separate calendar-component features.
train["year"] = train["Date"].dt.year
train["month"] = train["Date"].dt.month
train["day"] = train["Date"].dt.day

In [12]:
# Count the training rows that fall in calendar year 2017.
int((train["year"] == 2017).sum())

182500

In [13]:
# Preview the frame with the parsed Date and the new year/month/day columns.
train.head()

Unnamed: 0,Date,Store,Item,Sales,Average Rating,Location,Category,Branding,year,month,day
0,2017-01-01,KMart,1,13,2.8,"Wheelers Hill, VIC, 3170, Australia",Electronics,Sony,2017,1,1
1,2017-01-02,KMart,1,11,1.4,"Wheelers Hill, VIC, 3170, Australia",Electronics,Sony,2017,1,2
2,2017-01-03,KMart,1,14,1.8,"Wheelers Hill, VIC, 3170, Australia",Electronics,Sony,2017,1,3
3,2017-01-04,KMart,1,13,1.3,"Wheelers Hill, VIC, 3170, Australia",Electronics,Sony,2017,1,4
4,2017-01-05,KMart,1,10,2.9,"Wheelers Hill, VIC, 3170, Australia",Electronics,Sony,2017,1,5


In [14]:
# One-hot encode Category: add one "Category_<value>" indicator column per
# distinct value, then drop the original text column.
train_dummy = pd.get_dummies(train["Category"], prefix="Category")
train = pd.concat([train, train_dummy], axis="columns").drop(columns=["Category"])

In [15]:
# One-hot encode Branding: add one "Branding_<value>" indicator column per
# distinct value, then drop the original text column.
train_bdummy = pd.get_dummies(train["Branding"], prefix="Branding")
train = pd.concat([train, train_bdummy], axis="columns").drop(columns=["Branding"])

In [16]:
# One-hot encode Store: add one "Store_<value>" indicator column per
# distinct value, then drop the original text column.
train_storedummy = pd.get_dummies(train["Store"], prefix="Store")
train = pd.concat([train, train_storedummy], axis="columns").drop(columns=["Store"])

In [17]:
# One-hot encode Location, then drop the original text column.
# NOTE(review): the indicator columns reuse the "Store" prefix, so they sit
# next to the real Store_* columns under the same prefix. This looks like a
# copy-paste slip; if it is renamed to "Location", apply the same rename
# wherever the test set is encoded so the feature columns still match.
train_locationdummy = pd.get_dummies(train["Location"], prefix="Store")
train = pd.concat([train, train_locationdummy], axis="columns").drop(columns=["Location"])

In [18]:
# Preview the frame after the categorical columns were replaced by indicator columns.
train.head()

Unnamed: 0,Date,Item,Sales,Average Rating,year,month,day,Category_Clothing & Accessories,Category_Cosmetics,Category_Electronics,...,Store_Woolies,"Store_Dandenong, VIC, 3175, Australia","Store_Epping, VIC, 3076, Australia","Store_Keysborough, VIC, 3173, Australia","Store_Mulgrave, VIC, 3170, Australia","Store_Preston, VIC, 3072, Australia","Store_Ringwood, VIC, 3134, Australia","Store_South Melbourne, VIC, 3205, Australia","Store_Springvale, VIC, 3171, Australia","Store_Wheelers Hill, VIC, 3170, Australia"
0,2017-01-01,1,13,2.8,2017,1,1,0,0,1,...,0,0,0,0,0,0,0,0,0,1
1,2017-01-02,1,11,1.4,2017,1,2,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,2017-01-03,1,14,1.8,2017,1,3,0,0,1,...,0,0,0,0,0,0,0,0,0,1
3,2017-01-04,1,13,1.3,2017,1,4,0,0,1,...,0,0,0,0,0,0,0,0,0,1
4,2017-01-05,1,10,2.9,2017,1,5,0,0,1,...,0,0,0,0,0,0,0,0,0,1


# Stationarity check (Augmented Dickey-Fuller test)

In [None]:
from statsmodels.tsa.stattools import adfuller
from numpy import log

# Run the Augmented Dickey-Fuller test on the non-missing Sales values.
# NOTE(review): `train` appears to stack many Store/Item series (182,500 rows
# for 2017 alone), so this treats the stacked column as one continuous
# series -- confirm that is intended, or test each series / a daily total.
sales_values = train["Sales"].dropna()
result = adfuller(sales_values)

# The first two entries of the result are the test statistic and its p-value.
adf_statistic, p_value = result[0], result[1]
print('ADF Statistic: %f' % adf_statistic)
print('p-value: %f' % p_value)