In [1]:
import pandas as pd
import numpy as np
import datetime
import os

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline 

import seaborn as sns

In [3]:
train_data_original = pd.read_csv("./data/train.csv.zip")
test_data = pd.read_csv("./data/test.csv.zip")

train_data = train_data_original.copy(deep=True)
data_cleaner = [train_data, test_data]

In [4]:
train_data.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [5]:
train_data.describe()

Unnamed: 0,store,item,sales
count,913000.0,913000.0,913000.0
mean,5.5,25.5,52.250287
std,2.872283,14.430878,28.801144
min,1.0,1.0,0.0
25%,3.0,13.0,30.0
50%,5.5,25.5,47.0
75%,8.0,38.0,70.0
max,10.0,50.0,231.0


In [6]:
test_data.tail()

Unnamed: 0,id,date,store,item
44995,44995,2018-03-27,10,50
44996,44996,2018-03-28,10,50
44997,44997,2018-03-29,10,50
44998,44998,2018-03-30,10,50
44999,44999,2018-03-31,10,50


In [7]:
for data in data_cleaner:
    data['date'] = data['date'].map(lambda x: datetime.datetime.strptime(x, '%Y-%m-%d'))

In [8]:
for data in data_cleaner:
    data['year'] = data['date'].map(lambda x: x.year)
    data['month'] = data['date'].map(lambda x: x.month)
    data['day'] = data['date'].map(lambda x: x.day)

In [10]:
print(train_data['store'].unique().tolist())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [11]:
print(train_data['item'].unique().tolist())

[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50]


In [13]:
# get a list of us federal holidays as a datetime object
from pandas.tseries.holiday import USFederalHolidayCalendar
days_off = USFederalHolidayCalendar().holidays(start='2013-01-01', end='2017-12-27').to_pydatetime()

In [14]:
for data in data_cleaner:
    #  column is a weekday as an integer: 0 is monday and 6 is sunday
    data['weekday'] = data['date'].map(lambda x: x.weekday())
    # gets the day of the year 1 - 365
    data['day_of_year'] = data['date'].map(lambda x: x.timetuple().tm_yday)
    # gets the week of the year 1 - 52
    data['week_of_year'] = data['date'].map(lambda x: x.isocalendar()[1])
    # checks if day of week is saturday or sunday 
    data['isWeekend'] = data['date'].map(lambda x: 1 if (x==6 or x==5) else 0)
    # checks if holiday 
    data['isHoliday'] = data['date'].map(lambda x: 1 if (x in days_off) else 0)

In [17]:
# analyze the dates
print(train_data.columns)
train_data.head()

Index(['date', 'store', 'item', 'sales', 'year', 'month', 'day', 'weekday',
       'day_of_year', 'week_of_year', 'isWeekend', 'isHoliday'],
      dtype='object')


Unnamed: 0,date,store,item,sales,year,month,day,weekday,day_of_year,week_of_year,isWeekend,isHoliday
0,2013-01-01,1,1,13,2013,1,1,1,1,1,0,1
1,2013-01-02,1,1,11,2013,1,2,2,2,1,0,0
2,2013-01-03,1,1,14,2013,1,3,3,3,1,0,0
3,2013-01-04,1,1,13,2013,1,4,4,4,1,0,0
4,2013-01-05,1,1,10,2013,1,5,5,5,1,0,0


In [30]:
# check that code ran perfectly
print('unique years:', train_data['year'].unique())
print('unique months:', len(train_data['month'].unique()))
print('unique day:', len(train_data['day'].unique().tolist()))
print('unique weekday:', len(train_data['weekday'].unique()))
print('unique day_of_year:', len(train_data['day_of_year'].unique()))
print('unique week_of_year:', len(train_data['week_of_year'].unique()))
print('unique isWeekend:', train_data['isWeekend'].unique())
print('unique isHoliday:', train_data['isHoliday'].unique())

unique years: [2013 2014 2015 2016 2017]
unique months: 12
unique day: 31
unique weekday: 7
unique day_of_year: 366
unique week_of_year: 53
unique isWeekend: [0]
unique isHoliday: [1 0]


In [51]:
# look at the stores column and try to sum up sales based on dates
# todo: do it for train_data dataframe also

# store sales sum/median
train_data['store_sales_sum'] = train_data.groupby(by=['store'])['sales'].transform('sum')
train_data['store_sales_median'] = train_data.groupby(by=['store'])['sales'].transform('median')

# store sales year sum/median
train_data['store_year_sales_sum'] = train_data.groupby(by=['year', 'store'])['sales'].transform('sum')
train_data['store_year_sales_median'] = train_data.groupby(by=['year', 'store'])['sales'].transform('median')

# store sales month sum/median
train_data['store_month_sales_sum'] = train_data.groupby(by=['month', 'store'])['sales'].transform('sum')
train_data['store_month_sales_median'] = train_data.groupby(by=['month', 'store'])['sales'].transform('median')

# store sales day sum/median
train_data['store_day_sales_sum'] = train_data.groupby(by=['day', 'store'])['sales'].transform('sum')
train_data['store_day_sales_median'] = train_data.groupby(by=['day', 'store'])['sales'].transform('median')

# store sales weekday sum/median
train_data['store_weekday_sales_sum'] = train_data.groupby(by=['weekday', 'store'])['sales'].transform('sum')
train_data['store_weekday_sales_median'] = train_data.groupby(by=['weekday', 'store'])['sales'].transform('median')

# store sales dayofyear sum/median
train_data['store_dayofyear_sales_sum'] = train_data.groupby(by=['day_of_year', 'store'])['sales'].transform('sum')
train_data['store_dayofyear_sales_median'] = train_data.groupby(by=['day_of_year', 'store'])['sales'].transform('median')

# store sales weekofyear sum/median
train_data['store_weekofyear_sales_sum'] = train_data.groupby(by=['week_of_year', 'store'])['sales'].transform('sum')
train_data['store_weekofyear_sales_median'] = train_data.groupby(by=['week_of_year', 'store'])['sales'].transform('median')

In [52]:
train_data.head()

Unnamed: 0,date,store,item,sales,year,month,day,weekday,day_of_year,week_of_year,...,item_month_sales_sum,item_month_sales_median,item_day_sales_sum,item_day_sales_median,item_weekday_sales_sum,item_weekday_sales_median,item_dayofyear_sales_sum,item_dayofyear_sales_median,item_weekofyear_sales_sum,item_weekofyear_sales_median
0,2013-01-01,1,1,13,2013,1,1,1,1,1,...,22987,14.0,13178,21.0,52930,19.0,772,15.0,5113,15.0
1,2013-01-02,1,1,11,2013,1,2,2,2,1,...,22987,14.0,12913,20.0,53281,20.0,711,14.0,5113,15.0
2,2013-01-03,1,1,14,2013,1,3,3,3,1,...,22987,14.0,13350,22.0,56604,21.0,793,16.0,5113,15.0
3,2013-01-04,1,1,13,2013,1,4,4,4,1,...,22987,14.0,13249,21.0,60671,22.0,786,16.0,5113,15.0
4,2013-01-05,1,1,10,2013,1,5,5,5,1,...,22987,14.0,13535,22.0,64546,24.0,770,14.0,5113,15.0


In [50]:
# look at the item column and try to sum up sales based on dates
# todo: do it for train_data dataframe also

# store item sum/median
train_data['item_sales_sum'] = train_data.groupby(by=['item'])['sales'].transform('sum')
train_data['item_sales_median'] = train_data.groupby(by=['item'])['sales'].transform('median')

# store item year sum/medianitem
train_data['item_year_sales_sum'] = train_data.groupby(by=['year', 'item'])['sales'].transform('sum')
train_data['item_year_sales_median'] = train_data.groupby(by=['year', 'item'])['sales'].transform('median')

# store item month sum/median
train_data['item_month_sales_sum'] = train_data.groupby(by=['month', 'item'])['sales'].transform('sum')
train_data['item_month_sales_median'] = train_data.groupby(by=['month', 'item'])['sales'].transform('median')

# store item day sum/median
train_data['item_day_sales_sum'] = train_data.groupby(by=['day', 'item'])['sales'].transform('sum')
train_data['item_day_sales_median'] = train_data.groupby(by=['day', 'item'])['sales'].transform('median')

# store item weekday sum/median
train_data['item_weekday_sales_sum'] = train_data.groupby(by=['weekday', 'item'])['sales'].transform('sum')
train_data['item_weekday_sales_median'] = train_data.groupby(by=['weekday', 'item'])['sales'].transform('median')

# store item dayofyear sum/median
train_data['item_dayofyear_sales_sum'] = train_data.groupby(by=['day_of_year', 'item'])['sales'].transform('sum')
train_data['item_dayofyear_sales_median'] = train_data.groupby(by=['day_of_year', 'item'])['sales'].transform('median')

# store item weekofyear sum/medianitem
train_data['item_weekofyear_sales_sum'] = train_data.groupby(by=['week_of_year', 'item'])['sales'].transform('sum')
train_data['item_weekofyear_sales_median'] = train_data.groupby(by=['week_of_year', 'item'])['sales'].transform('median')

In [53]:
train_data.head()

Unnamed: 0,date,store,item,sales,year,month,day,weekday,day_of_year,week_of_year,...,item_month_sales_sum,item_month_sales_median,item_day_sales_sum,item_day_sales_median,item_weekday_sales_sum,item_weekday_sales_median,item_dayofyear_sales_sum,item_dayofyear_sales_median,item_weekofyear_sales_sum,item_weekofyear_sales_median
0,2013-01-01,1,1,13,2013,1,1,1,1,1,...,22987,14.0,13178,21.0,52930,19.0,772,15.0,5113,15.0
1,2013-01-02,1,1,11,2013,1,2,2,2,1,...,22987,14.0,12913,20.0,53281,20.0,711,14.0,5113,15.0
2,2013-01-03,1,1,14,2013,1,3,3,3,1,...,22987,14.0,13350,22.0,56604,21.0,793,16.0,5113,15.0
3,2013-01-04,1,1,13,2013,1,4,4,4,1,...,22987,14.0,13249,21.0,60671,22.0,786,16.0,5113,15.0
4,2013-01-05,1,1,10,2013,1,5,5,5,1,...,22987,14.0,13535,22.0,64546,24.0,770,14.0,5113,15.0


In [54]:
# look at the store and item column and try to sum up sales based on dates
# todo: do it for train_data dataframe also

# store item sum/median
train_data['sotre_item_sales_sum'] = train_data.groupby(by=['store','item'])['sales'].transform('sum')
train_data['store_item_sales_median'] = train_data.groupby(by=['store','item'])['sales'].transform('median')

# store item year sum/medianitem
train_data['store_item_year_sales_sum'] = train_data.groupby(by=['year', 'store','item'])['sales'].transform('sum')
train_data['store_item_year_sales_median'] = train_data.groupby(by=['year', 'store','item'])['sales'].transform('median')

# store item month sum/median
train_data['store_item_month_sales_sum'] = train_data.groupby(by=['month', 'store','item'])['sales'].transform('sum')
train_data['store_item_month_sales_median'] = train_data.groupby(by=['month', 'store','item'])['sales'].transform('median')

# store item day sum/median
train_data['store_item_day_sales_sum'] = train_data.groupby(by=['day', 'store','item'])['sales'].transform('sum')
train_data['store_item_day_sales_median'] = train_data.groupby(by=['day', 'store','item'])['sales'].transform('median')

# store item weekday sum/median
train_data['store_item_weekday_sales_sum'] = train_data.groupby(by=['weekday', 'store','item'])['sales'].transform('sum')
train_data['store_item_weekday_sales_median'] = train_data.groupby(by=['weekday', 'store','item'])['sales'].transform('median')

# store item dayofyear sum/median
train_data['store_item_dayofyear_sales_sum'] = train_data.groupby(by=['day_of_year', 'store','item'])['sales'].transform('sum')
train_data['store_item_dayofyear_sales_median'] = train_data.groupby(by=['day_of_year', 'store','item'])['sales'].transform('median')

# store item weekofyear sum/medianitem
train_data['store_weekofyear_sales_sum'] = train_data.groupby(by=['week_of_year', 'store','item'])['sales'].transform('sum')
train_data['store_weekofyear_sales_median'] = train_data.groupby(by=['week_of_year', 'store','item'])['sales'].transform('median')

In [55]:
train_data.head()

Unnamed: 0,date,store,item,sales,year,month,day,weekday,day_of_year,week_of_year,...,store_item_year_sales_sum,store_item_year_sales_median,store_item_month_sales_sum,store_item_month_sales_median,store_item_day_sales_sum,store_item_day_sales_median,store_item_weekday_sales_sum,store_item_weekday_sales_median,store_item_dayofyear_sales_sum,store_item_dayofyear_sales_median
0,2013-01-01,1,1,13,2013,1,1,1,1,1,...,6025,16.0,2125,13.0,1195,19.0,4742,18.0,73,13
1,2013-01-02,1,1,11,2013,1,2,2,2,1,...,6025,16.0,2125,13.0,1123,19.0,4905,19.0,72,14
2,2013-01-03,1,1,14,2013,1,3,3,3,1,...,6025,16.0,2125,13.0,1252,20.0,5077,19.0,63,12
3,2013-01-04,1,1,13,2013,1,4,4,4,1,...,6025,16.0,2125,13.0,1176,19.5,5485,21.0,75,14
4,2013-01-05,1,1,10,2013,1,5,5,5,1,...,6025,16.0,2125,13.0,1219,20.0,5996,22.0,67,14


# Export processed data

In [56]:
train_data.to_csv('./data/preprocessed_train_data.csv')
test_data.to_csv('./data/preprocessed_test_data.csv')