In [1]:
import pandas as pd
import numpy as np
import datetime
import os

In [2]:
import matplotlib.pyplot as plt
%matplotlib inline 

import seaborn as sns

In [3]:
train_data_original = pd.read_csv("./data/train.csv.zip")
test_data = pd.read_csv("./data/test.csv.zip")

train_data = train_data_original.copy(deep=True)
data_cleaner = [train_data, test_data]

In [4]:
train_data.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [5]:
test_data.head()

Unnamed: 0,id,date,store,item
0,0,2018-01-01,1,1
1,1,2018-01-02,1,1
2,2,2018-01-03,1,1
3,3,2018-01-04,1,1
4,4,2018-01-05,1,1


In [6]:
train_data.describe()

Unnamed: 0,store,item,sales
count,913000.0,913000.0,913000.0
mean,5.5,25.5,52.250287
std,2.872283,14.430878,28.801144
min,1.0,1.0,0.0
25%,3.0,13.0,30.0
50%,5.5,25.5,47.0
75%,8.0,38.0,70.0
max,10.0,50.0,231.0


In [7]:
test_data.describe()

Unnamed: 0,id,store,item
count,45000.0,45000.0,45000.0
mean,22499.5,5.5,25.5
std,12990.525394,2.872313,14.43103
min,0.0,1.0,1.0
25%,11249.75,3.0,13.0
50%,22499.5,5.5,25.5
75%,33749.25,8.0,38.0
max,44999.0,10.0,50.0


In [8]:
df = pd.concat([train_data, test_data], sort=False)

In [9]:
# get a list of us federal holidays as a datetime object
from pandas.tseries.holiday import USFederalHolidayCalendar
days_off = USFederalHolidayCalendar().holidays(start='2013-01-01', end='2017-12-27').to_pydatetime()

In [10]:
df['date'] = pd.to_datetime(df['date'],infer_datetime_format=True)
df['month'] = df['date'].dt.month
df['weekday'] = df['date'].dt.dayofweek
df['year'] = df['date'].dt.year
df['day'] = df['date'].dt.day
df['week_of_year']  = df.date.dt.weekofyear

In [11]:
# more date features
df['day_of_year'] = df['date'].map(lambda x: x.timetuple().tm_yday)
# checks if day of week is saturday or sunday 
df['isWeekend'] = df['date'].map(lambda x: 1 if (x==6 or x==5) else 0)
# # checks if holiday 
df['isHoliday'] = df['date'].map(lambda x: 1 if (x in days_off) else 0)

# check for season 
print('checking for season')
# summer: check if month is between june and august (6-8)
df['isSummer'] = df['month'].map(lambda x : 1 if x == 6 or x == 7 or x == 8 else 0)
# winter: check if month is between dec and feb (12-2)
df['isWinter'] = df['month'].map(lambda x : 1 if x == 12 or x == 1 or x == 2 else 0)
# autumn: check if month is between sept and nov (9-11)
df['isAutumn'] = df['month'].map(lambda x : 1 if x == 9 or x == 10 or x == 11 else 0)
# spring: check if month is between march and may (3-5)
df['isSpring'] = df['month'].map(lambda x : 1 if x == 3 or x == 4 or x == 5 else 0)

In [14]:
df[['month', 'isSummer', 'isAutumn', 'isWinter', 'isSpring']].tail()

Unnamed: 0,month,isSummer,isAutumn,isWinter,isSpring
44995,3,0,0,0,1
44996,3,0,0,0,1
44997,3,0,0,0,1
44998,3,0,0,0,1
44999,3,0,0,0,1


In [16]:
print("year:", df['year'].unique())
print("month:", len(df['month'].unique()))
print("day:", len(df['day'].unique()))
print("week_of_year:", len(df['week_of_year'].unique()))
print("day_of_year:", len(df['day_of_year'].unique()))
print("weekday:", len(df['weekday'].unique()))
print("isWeekend:", df['isWeekend'].unique())
print("isHoliday:", df['isHoliday'].unique())
print("isSummer:", df['isSummer'].unique())
print("isAutumn:", df['isAutumn'].unique())
print("isSpring:", df['isSpring'].unique())
print("isWinter:", df['isSpring'].unique())

year: [2013 2014 2015 2016 2017 2018]
month: 12
day: 31
week_of_year: 53
day_of_year: 366
weekday: 7
isWeekend: [0]
isHoliday: [1 0]
isSummer: [0 1]
isAutumn: [0 1]
isSpring: [0 1]


# Getting transformation for Sales/Items Column

In [20]:
columns_to_sum_median_mean = ['store', 'item']

In [21]:
# store/item sales sum/median
for col in columns_to_sum_median_mean:
    df[f'{col}_sales_sum'] = df.groupby([f'{col}'])['sales'].transform('sum')
    df[f'{col}_sales_median'] = df.groupby([f'{col}'])['sales'].transform('median')
    df[f'{col}_sales_mean'] = df.groupby([f'{col}'])['sales'].transform('mean')

    # ---------------------------------------------------
    # store sales year sum/median 
    df[f'{col}_year_sales_sum'] = df.groupby(['year',f'{col}'])['sales'].transform('sum')
    df[f'{col}_year_sales_median'] = df.groupby(['year',f'{col}'])['sales'].transform('median')
    # store sales month sum/median 
    df[f'{col}_month_sales_sum'] = df.groupby(['month',f'{col}'])['sales'].transform('sum')
    df[f'{col}_month_sales_median'] = df.groupby(['month',f'{col}'])['sales'].transform('median')
    # store sales day sum/median 
    df[f'{col}_day_sales_sum'] = df.groupby(['day',f'{col}'])['sales'].transform('sum')
    df[f'{col}_day_sales_median'] = df.groupby(['day',f'{col}'])['sales'].transform('median')
    # store sales weekday sum/median
    df[f'{col}_weekday_sales_sum'] = df.groupby(['weekday',f'{col}'])['sales'].transform('sum')
    df[f'{col}_weekday_sales_median'] = df.groupby(['weekday',f'{col}'])['sales'].transform('median')
    # store dayofyear sum/median
    df[f'{col}_dayofyear_sales_sum'] = df.groupby(['day_of_year',f'{col}'])['sales'].transform('sum')
    df[f'{col}_dayofyear_sales_median'] = df.groupby(['day_of_year',f'{col}'])['sales'].transform('median')
    # store weekofyear sum/median 
    df[f'{col}_weekofyear_sales_sum'] = df.groupby(['week_of_year',f'{col}'])['sales'].transform('sum')
    df[f'{col}_weekofyear_sales_median'] = df.groupby(['week_of_year',f'{col}'])['sales'].transform('median')

    # ---------------------------------------------------
    # store sales year/day sum/median 
    df[f'{col}_year_day_sales_sum'] = df.groupby(['year','day',f'{col}'])['sales'].transform('sum')
    df[f'{col}_year_day_sales_median'] = df.groupby(['year','day','store'])['sales'].transform('median')

    # store sales year/month sum/median 
    df[f'{col}_year_month_sales_sum'] = df.groupby(['year', 'month',f'{col}'])['sales'].transform('sum')
    df[f'{col}_year_month_sales_median'] = df.groupby(['year', 'month',f'{col}'])['sales'].transform('median')

    # store sales year/weekday sum/median 
    df[f'{col}_year_weekday_sales_sum'] = df.groupby(['year', 'weekday','store'])['sales'].transform('sum')
    df[f'{col}_year_weekday_sales_median'] = df.groupby(['year', 'weekday','store'])['sales'].transform('median')

    # store sales year/dayofyear sum/median 
    df[f'{col}_year_dayofyear_sales_sum'] = df.groupby(['year', 'day_of_year', f'{col}'])['sales'].transform('sum')
    df[f'{col}_year_dayofyear_sales_median'] = df.groupby(['year', 'day_of_year',f'{col}'])['sales'].transform('median')

    # store sales year/weekofyear sum/median 
    df[f'{col}_year_weekofyear_sales_sum'] = df.groupby(['year', 'week_of_year', f'{col}'])['sales'].transform('sum')
    df[f'{col}_year_weekofyear_sales_median'] = df.groupby(['year', 'week_of_year', f'{col}'])['sales'].transform('median')

    # ---------------------------------------------------
    # store sales month/day sum/median 
    df[f'{col}_month_day_sales_sum'] = df.groupby(['month', 'day', f'{col}'])['sales'].transform('sum')
    df[f'{col}_month_day_sales_median'] = df.groupby(['month', 'day', f'{col}'])['sales'].transform('median')
    # store sales month/weekday sum/median 
    df[f'{col}_month_weekday_sales_sum'] = df.groupby(['month', 'weekday', f'{col}'])['sales'].transform('sum')
    df[f'{col}_month_weekday_sales_median'] = df.groupby(['month', 'weekday', f'{col}'])['sales'].transform('median')
    # store sales month/dayofyear sum/median 
    df[f'{col}_month_dayofyear_sales_sum'] = df.groupby(['month','day_of_year', f'{col}'])['sales'].transform('sum')
    df[f'{col}_month_dayofyear_sales_median'] = df.groupby(['month','day_of_year', f'{col}'])['sales'].transform('median')
    # store sales month/weekofyear sum/median 
    df[f'{col}_month_weekofyear_sales_sum'] = df.groupby(['month', 'week_of_year', f'{col}'])['sales'].transform('sum')
    df[f'{col}_month_weekofyear_sales_median'] = df.groupby(['month', 'week_of_year', f'{col}'])['sales'].transform('median')
    # --------------------------------------------------- 

    # store sales day/weekday sum/median 
    df[f'{col}_day_weekday_sales_sum'] = df.groupby(['day', 'weekday',f'{col}'])['sales'].transform('sum')
    df[f'{col}_day_weekday_sales_sales_median'] = df.groupby(['day', 'weekday',f'{col}'])['sales'].transform('median')
    df[f'{col}_day_weekday_sales_sales_mean'] = df.groupby(['day', 'weekday',f'{col}'])['sales'].transform('mean')
    # store sales day/dayofyear sum/median 
    df[f'{col}_day_dayofyear_sales_sum'] = df.groupby(['day', 'day_of_year', f'{col}'])['sales'].transform('sum')
    df[f'{col}_day_dayofyear_sales_median'] = df.groupby(['day', 'day_of_year', f'{col}'])['sales'].transform('median')
    df[f'{col}_day_dayofyear_sales_mean'] = df.groupby(['day', 'day_of_year', f'{col}'])['sales'].transform('mean')
    # store sales day/weekofyear sum/median 
    df[f'{col}_day_weekofyear_sales_sum'] = df.groupby(['day', 'week_of_year', f'{col}'])['sales'].transform('sum')
    df[f'{col}_day_weekofyear_sales_median'] = df.groupby(['day', 'week_of_year', f'{col}'])['sales'].transform('median')
    df[f'{col}_day_weekofyear_sales_mean'] = df.groupby(['day', 'week_of_year', f'{col}'])['sales'].transform('mean')

    # ---------------------------------------------------

    # ..... [ could have so many more ] .....

    # final one 
    # store sales year/month/day/weekday/dayofyear/weekofyear/ sum/median 
    df[f'{col}_year_month_day_weekofyear_dayofyear_weekday_sales_sum'] = df.groupby(['year', 'month', 'day', 'week_of_year', 'day_of_year', 'weekday', 'weekday', f'{col}'])['sales'].transform('sum')
    df[f'{col}_year_month_day_weekofyear_dayofyear_weekday_sales_median'] = df.groupby(['year', 'month', 'day', 'week_of_year', 'day_of_year', 'weekday', 'weekday', f'{col}'])['sales'].transform('median')
    df[f'{col}_year_month_day_weekofyear_dayofyear_weekday_sales_mean'] = df.groupby(['year', 'month', 'day', 'week_of_year', 'day_of_year', 'weekday', 'weekday', f'{col}'])['sales'].transform('mean')
    
    print(f'finished {col}')

finished store
finished item


In [22]:
df.head()

Unnamed: 0,date,store,item,sales,id,month,weekday,year,day,week_of_year,...,item_day_weekday_sales_sales_mean,item_day_dayofyear_sales_sum,item_day_dayofyear_sales_median,item_day_dayofyear_sales_mean,item_day_weekofyear_sales_sum,item_day_weekofyear_sales_median,item_day_weekofyear_sales_mean,item_year_month_day_weekofyear_dayofyear_weekday_sales_sum,item_year_month_day_weekofyear_dayofyear_weekday_sales_median,item_year_month_day_weekofyear_dayofyear_weekday_sales_mean
0,2013-01-01,1,1,13.0,,1,1,2013,1,1,...,19.9,772.0,15.0,15.44,385.0,12.5,12.833333,133.0,12.5,13.3
1,2013-01-02,1,1,11.0,,1,2,2013,2,1,...,19.633333,711.0,14.0,14.22,522.0,14.0,13.05,99.0,9.5,9.9
2,2013-01-03,1,1,14.0,,1,3,2013,3,1,...,21.055556,793.0,16.0,15.86,579.0,14.5,14.475,127.0,12.0,12.7
3,2013-01-04,1,1,13.0,,1,4,2013,4,1,...,22.6,786.0,16.0,15.72,786.0,16.0,15.72,145.0,15.0,14.5
4,2013-01-05,1,1,10.0,,1,5,2013,5,1,...,24.955556,770.0,14.0,15.4,658.0,16.0,16.45,149.0,14.0,14.9


# Getting Transformations For Store_Item Columns

In [23]:
# store sales sum/median
df['store_item_sales_sum'] = df.groupby(['store','item'])['sales'].transform('sum')
df['store_item_sales_median'] = df.groupby(['store','item'])['sales'].transform('median')
df['store_item_sales_mean'] = df.groupby(['store','item'])['sales'].transform('mean')

# ---------------------------------------------------
# store sales year sum/median 
df['store_item_year_sales_sum'] = df.groupby(['year','store','item'])['sales'].transform('sum')
df['store_item_year_sales_median'] = df.groupby(['year','store','item'])['sales'].transform('median')
# store sales month sum/median 
df['store_item_month_sales_sum'] = df.groupby(['month','store','item'])['sales'].transform('sum')
df['item_month_sales_median'] = df.groupby(['month','store','item'])['sales'].transform('median')
# store sales day sum/median 
df['store_item_day_sales_sum'] = df.groupby(['day','store','item'])['sales'].transform('sum')
df['store_item_day_sales_median'] = df.groupby(['day','store','item'])['sales'].transform('median')
# store sales weekday sum/median
df['store_item_weekday_sales_sum'] = df.groupby(['weekday','store','item'])['sales'].transform('sum')
df['store_item_weekday_sales_median'] = df.groupby(['weekday','store','item'])['sales'].transform('median')
# store dayofyear sum/median
df['store_item_dayofyear_sales_sum'] = df.groupby(['day_of_year','store','item'])['sales'].transform('sum')
df['store_item_dayofyear_sales_median'] = df.groupby(['day_of_year','store','item'])['sales'].transform('median')
# store weekofyear sum/median 
df['store_item_weekofyear_sales_sum'] = df.groupby(['week_of_year','store','item'])['sales'].transform('sum')
df['store_item_weekofyear_sales_median'] = df.groupby(['week_of_year','store','item'])['sales'].transform('median')

# ---------------------------------------------------
# store sales year/day sum/median item
df['store_item_year_day_sales_sum'] = df.groupby(['year','day','store','item'])['sales'].transform('sum')
df['store_item_year_day_sales_median'] = df.groupby(['year','day','store','item'])['sales'].transform('median')

# store sales year/month sum/median 
df['store_item_year_month_sales_sum'] = df.groupby(['year', 'month','store','item'])['sales'].transform('sum')
df['store_item_year_month_sales_median'] = df.groupby(['year', 'month','store','item'])['sales'].transform('median')

# store sales year/weekday sum/median 
df['store_item_year_weekday_sales_sum'] = df.groupby(['year', 'weekday','store','item'])['sales'].transform('sum')
df['store_item_year_weekday_sales_median'] = df.groupby(['year', 'weekday','store','item'])['sales'].transform('median')

# store sales year/dayofyear sum/median 
df['store_item_year_dayofyear_sales_sum'] = df.groupby(['year', 'day_of_year', 'store','item'])['sales'].transform('sum')
df['store_item_year_dayofyear_sales_median'] = df.groupby(['year', 'day_of_year','store','item'])['sales'].transform('median')

# store sales year/weekofyear sum/median 
df['store_item_year_weekofyear_sales_sum'] = df.groupby(['year', 'week_of_year', 'store','item'])['sales'].transform('sum')
df['store_item_year_weekofyear_sales_median'] = df.groupby(['year', 'week_of_year', 'store','item'])['sales'].transform('median')

# ---------------------------------------------------
# store sales month/day sum/median 
df['store_item_month_day_sales_sum'] = df.groupby(['month', 'day', 'store', 'item'])['sales'].transform('sum')
df['store_item_month_day_sales_median'] = df.groupby(['month', 'day', 'store', 'item'])['sales'].transform('median')
# store sales month/weekday sum/median 
df['store_item_month_weekday_sales_sum'] = df.groupby(['month', 'weekday', 'store', 'item'])['sales'].transform('sum')
df['store_item_month_weekday_sales_median'] = df.groupby(['month', 'weekday', 'store', 'item'])['sales'].transform('median')
# store sales month/dayofyear sum/median 
df['store_item_month_dayofyear_sales_sum'] = df.groupby(['month','day_of_year', 'store', 'item'])['sales'].transform('sum')
df['store_item_month_dayofyear_sales_median'] = df.groupby(['month','day_of_year', 'store', 'item'])['sales'].transform('median')
# store sales month/weekofyear sum/median 
df['store_item_month_weekofyear_sales_sum'] = df.groupby(['month', 'week_of_year', 'store', 'item'])['sales'].transform('sum')
df['store_item_month_weekofyear_sales_median'] = df.groupby(['month', 'week_of_year', 'store', 'item'])['sales'].transform('median')
# --------------------------------------------------- 

# store sales day/weekday sum/median 
df['store_item_day_weekday_sales_sum'] = df.groupby(['day', 'weekday','store', 'item'])['sales'].transform('sum')
df['store_item_day_weekday_sales_sales_median'] = df.groupby(['day', 'weekday','store', 'item'])['sales'].transform('median')
df['store_item_day_weekday_sales_sales_mean'] = df.groupby(['day', 'weekday','store', 'item'])['sales'].transform('mean')
# store sales day/dayofyear sum/median 
df['store_item_day_dayofyear_sales_sum'] = df.groupby(['day', 'day_of_year', 'store', 'item'])['sales'].transform('sum')
df['store_item_day_dayofyear_sales_median'] = df.groupby(['day', 'day_of_year', 'store', 'item'])['sales'].transform('median')
df['store_item_day_dayofyear_sales_mean'] = df.groupby(['day', 'day_of_year', 'store', 'item'])['sales'].transform('mean')
# store sales day/weekofyear sum/median 
df['store_item_day_weekofyear_sales_sum'] = df.groupby(['day', 'week_of_year', 'store', 'item'])['sales'].transform('sum')
df['store_item_day_weekofyear_sales_median'] = df.groupby(['day', 'week_of_year', 'store', 'item'])['sales'].transform('median')
df['store_item_day_weekofyear_sales_mean'] = df.groupby(['day', 'week_of_year', 'store', 'item'])['sales'].transform('mean')

# ---------------------------------------------------

# ..... [ could have so many more ] .....

# final one 
# store sales year/month/day/weekday/dayofyear/weekofyear/ sum/median 
df['store_item_year_month_day_weekofyear_dayofyear_weekday_sales_sum'] = df.groupby(['year', 'month', 'day', 'week_of_year', 'day_of_year', 'weekday', 'weekday', 'store','item'])['sales'].transform('sum')
df['store_item_year_month_day_weekofyear_dayofyear_weekday_sales_median'] = df.groupby(['year', 'month', 'day', 'week_of_year', 'day_of_year', 'weekday', 'weekday', 'store','item'])['sales'].transform('median')
df['store_item_year_month_day_weekofyear_dayofyear_weekday_sales_mean'] = df.groupby(['year', 'month', 'day', 'week_of_year', 'day_of_year', 'weekday', 'weekday', 'store','item'])['sales'].transform('mean')

In [24]:
df.head()

Unnamed: 0,date,store,item,sales,id,month,weekday,year,day,week_of_year,...,store_item_day_weekday_sales_sales_mean,store_item_day_dayofyear_sales_sum,store_item_day_dayofyear_sales_median,store_item_day_dayofyear_sales_mean,store_item_day_weekofyear_sales_sum,store_item_day_weekofyear_sales_median,store_item_day_weekofyear_sales_mean,store_item_year_month_day_weekofyear_dayofyear_weekday_sales_sum,store_item_year_month_day_weekofyear_dayofyear_weekday_sales_median,store_item_year_month_day_weekofyear_dayofyear_weekday_sales_mean
0,2013-01-01,1,1,13.0,,1,1,2013,1,1,...,17.333333,73.0,13.0,14.6,33.0,11.0,11.0,13.0,13.0,13.0
1,2013-01-02,1,1,11.0,,1,2,2013,2,1,...,16.666667,72.0,14.0,14.4,59.0,14.5,14.75,11.0,11.0,11.0
2,2013-01-03,1,1,14.0,,1,3,2013,3,1,...,18.555556,63.0,12.0,12.6,51.0,12.5,12.75,14.0,14.0,14.0
3,2013-01-04,1,1,13.0,,1,4,2013,4,1,...,20.222222,75.0,14.0,15.0,75.0,14.0,15.0,13.0,13.0,13.0
4,2013-01-05,1,1,10.0,,1,5,2013,5,1,...,21.888889,67.0,14.0,13.4,53.0,13.0,13.25,10.0,10.0,10.0


In [25]:
df.describe()

Unnamed: 0,store,item,sales,id,month,weekday,year,day,week_of_year,day_of_year,...,store_item_day_weekday_sales_sales_mean,store_item_day_dayofyear_sales_sum,store_item_day_dayofyear_sales_median,store_item_day_dayofyear_sales_mean,store_item_day_weekofyear_sales_sum,store_item_day_weekofyear_sales_median,store_item_day_weekofyear_sales_mean,store_item_year_month_day_weekofyear_dayofyear_weekday_sales_sum,store_item_year_month_day_weekofyear_dayofyear_weekday_sales_median,store_item_year_month_day_weekofyear_dayofyear_weekday_sales_mean
count,958000.0,958000.0,913000.0,45000.0,958000.0,958000.0,958000.0,958000.0,958000.0,958000.0,...,958000.0,958000.0,958000.0,958000.0,958000.0,958000.0,958000.0,958000.0,913000.0,913000.0
mean,5.5,25.5,52.250287,22499.5,6.311065,3.0,2015.141441,15.718685,25.691023,176.636743,...,52.265017,185.018924,51.270193,51.697097,178.992562,51.733573,51.738868,49.795942,52.250287,52.250287
std,2.872283,14.430877,28.801144,12990.525394,3.504592,1.998696,1.519304,8.795328,15.302475,107.105069,...,25.299565,109.586552,26.830601,26.823352,116.844417,27.660431,27.493552,30.211887,28.801144,28.801144
min,1.0,1.0,0.0,0.0,1.0,0.0,2013.0,1.0,1.0,1.0,...,6.777778,3.0,3.0,3.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,3.0,13.0,30.0,11249.75,3.0,1.0,2014.0,8.0,12.0,80.0,...,31.142857,95.0,30.0,30.2,90.0,30.0,30.0,27.0,30.0,30.0
50%,5.5,25.5,47.0,22499.5,6.0,3.0,2015.0,16.0,25.0,174.0,...,49.0,167.0,47.0,47.25,151.0,47.0,47.0,45.0,47.0,47.0
75%,8.0,38.0,70.0,33749.25,9.0,5.0,2016.0,23.0,39.0,270.0,...,69.555556,260.0,68.0,68.5,241.0,69.0,68.5,68.0,70.0,70.0
max,10.0,50.0,231.0,44999.0,12.0,6.0,2018.0,31.0,53.0,366.0,...,152.142857,656.0,214.0,214.0,799.0,214.0,214.0,231.0,231.0,231.0


# Export processed data

In [26]:
# train_data.to_csv('./data/preprocessed_train_data.csv')
# test_data.to_csv('./data/preprocessed_test_data.csv')
df.to_csv('./data/preprocessed_train_test_data.csv')