In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load only the `date` column of the training set (parsed as datetimes) and
# reduce it to the distinct dates, oldest first, with a fresh 0..n-1 index.
dates_df = (
    pd.read_csv(
        '../data/store-sales-time-series/train.csv',
        usecols=['date'],
        parse_dates=['date'],
    )
    .drop_duplicates()
    .sort_values(by=['date'])
    .reset_index(drop=True)
)
dates_df

Unnamed: 0,date
0,2013-01-01
1,2013-01-02
2,2013-01-03
3,2013-01-04
4,2013-01-05
...,...
1679,2017-08-11
1680,2017-08-12
1681,2017-08-13
1682,2017-08-14


In [3]:
# Swap the observed dates for an unbroken run of dates between the first and
# last observed date, so dates absent from train.csv also get a row
# (recorded outputs: last index 1682 before this cell, 1686 after).
min_date, max_date = dates_df['date'].min(), dates_df['date'].max()
date_range = pd.date_range(start=min_date, end=max_date)
dates_df = pd.DataFrame({'date': date_range})
dates_df

Unnamed: 0,date
0,2013-01-01
1,2013-01-02
2,2013-01-03
3,2013-01-04
4,2013-01-05
...,...
1683,2017-08-11
1684,2017-08-12
1685,2017-08-13
1686,2017-08-14


In [4]:
# Add a `day` column that mirrors the frame's index; with the 0..n-1 index
# shown in the output this is simply each row's ordinal position.
dates_df = dates_df.assign(day=dates_df.index)
dates_df

Unnamed: 0,date,day
0,2013-01-01,0
1,2013-01-02,1
2,2013-01-03,2
3,2013-01-04,3
4,2013-01-05,4
...,...,...
1683,2017-08-11,1683
1684,2017-08-12,1684
1685,2017-08-13,1685
1686,2017-08-14,1686


In [14]:
# Weekday number of every date as a one-column frame
# (pandas convention: Monday = 0 ... Sunday = 6).
day_of_week_df = dates_df['date'].dt.dayofweek.to_frame(name='day_of_week')
day_of_week_df

Unnamed: 0,day_of_week
0,1
1,2
2,3
3,4
4,5
...,...
1683,4
1684,5
1685,6
1686,0


In [15]:
# One indicator column per weekday. get_dummies emits the columns in sorted
# order (0..6), so the labels below are applied positionally, Monday first.
# NOTE(review): relies on all seven weekdays occurring in the data; with fewer
# distinct values the relabel raises a length-mismatch error.
weekday_labels = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
day_of_week_one_hot_df = pd.get_dummies(day_of_week_df['day_of_week']).set_axis(weekday_labels, axis=1)
day_of_week_one_hot_df

Unnamed: 0,Monday,Tuesday,Wednesday,Thursday,Friday,Saturday,Sunday
0,False,True,False,False,False,False,False
1,False,False,True,False,False,False,False
2,False,False,False,True,False,False,False
3,False,False,False,False,True,False,False
4,False,False,False,False,False,True,False
...,...,...,...,...,...,...,...
1683,False,False,False,False,True,False,False
1684,False,False,False,False,False,True,False
1685,False,False,False,False,False,False,True
1686,True,False,False,False,False,False,False


In [7]:
# Cyclic encoding of the weekday: place each weekday number on a circle with a
# period of 7 and keep its sine and cosine as two separate features.
weekday_angle = 2 * np.pi * day_of_week_df['day_of_week'] / 7
day_of_week_cyclic_df = pd.DataFrame({
    'day_of_week_sin': np.sin(weekday_angle),
    'day_of_week_cos': np.cos(weekday_angle),
})
day_of_week_cyclic_df

Unnamed: 0,day_of_week_sin,day_of_week_cos
0,0.781831,0.623490
1,0.974928,-0.222521
2,0.433884,-0.900969
3,-0.433884,-0.900969
4,-0.974928,-0.222521
...,...,...
1683,-0.433884,-0.900969
1684,-0.974928,-0.222521
1685,-0.781831,0.623490
1686,0.000000,1.000000


In [16]:
# Day-of-month number of every date as a one-column frame.
day_of_month_df = dates_df['date'].dt.day.to_frame(name='day_of_month')
day_of_month_df

Unnamed: 0,day_of_month
0,1
1,2
2,3
3,4
4,5
...,...
1683,11
1684,12
1685,13
1686,14


In [8]:
# One indicator column per calendar month.
# The month is taken straight from the date column. Encoding
# day_of_month_df['day_of_month'] instead yields one column per day of the
# month, and assigning the twelve month names to that frame fails with a
# length-mismatch error on a fresh kernel.
# get_dummies emits the columns in sorted order (1..12), so the names below are
# applied positionally, January first.
# NOTE(review): relies on all twelve months occurring in the data.
month_of_year_one_hot_df = pd.get_dummies(dates_df['date'].dt.month)
month_of_year_one_hot_df.columns = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
month_of_year_one_hot_df

Unnamed: 0,January,February,March,April,May,June,July,August,September,October,November,December
0,True,False,False,False,False,False,False,False,False,False,False,False
1,True,False,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,False,False,False,False,False,False,False,False
3,True,False,False,False,False,False,False,False,False,False,False,False
4,True,False,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
1683,False,False,False,False,False,False,False,True,False,False,False,False
1684,False,False,False,False,False,False,False,True,False,False,False,False
1685,False,False,False,False,False,False,False,True,False,False,False,False
1686,False,False,False,False,False,False,False,True,False,False,False,False


In [9]:
# Cyclic encoding of the month: place each month on a circle with a period of
# 12 and keep its sine and cosine as two separate features.
# The month is derived from the date column; day_of_month_df only carries a
# 'day_of_month' column, so looking up 'month_of_year' on it raises KeyError.
# Months are shifted to 0..11, which reproduces the recorded output
# (January -> sin 0.0 / cos 1.0, August -> sin -0.5 / cos -0.866025).
month_angle = 2 * np.pi * (dates_df['date'].dt.month - 1) / 12
month_of_year_cyclic_df = pd.DataFrame()
month_of_year_cyclic_df['month_of_year_sin'] = np.sin(month_angle)
month_of_year_cyclic_df['month_of_year_cos'] = np.cos(month_angle)
month_of_year_cyclic_df

Unnamed: 0,month_of_year_sin,month_of_year_cos
0,0.0,1.000000
1,0.0,1.000000
2,0.0,1.000000
3,0.0,1.000000
4,0.0,1.000000
...,...,...
1683,-0.5,-0.866025
1684,-0.5,-0.866025
1685,-0.5,-0.866025
1686,-0.5,-0.866025


In [10]:
# One indicator column per calendar quarter.
# The quarter (1..4) is derived from the date column: no cell in this notebook
# adds a 'quarter_of_year' column to dates_df, so reading it raises KeyError on
# a fresh kernel.
# Columns are labelled from the actual quarter value rather than by position,
# so a quarter missing from the data cannot shift the labels.
quarter_of_year_one_hot_df = pd.get_dummies(dates_df['date'].dt.quarter)
quarter_of_year_one_hot_df.columns = [f'Q{quarter}' for quarter in quarter_of_year_one_hot_df.columns]
quarter_of_year_one_hot_df

Unnamed: 0,Q1,Q2,Q3,Q4
0,True,False,False,False
1,True,False,False,False
2,True,False,False,False
3,True,False,False,False
4,True,False,False,False
...,...,...,...,...
1683,False,False,True,False
1684,False,False,True,False
1685,False,False,True,False
1686,False,False,True,False


In [11]:
# Cyclic encoding of the quarter: place each quarter on a circle with a period
# of 4 and keep its sine and cosine as two separate features.
# The quarter is derived from the date column: no cell in this notebook adds a
# 'quarter_of_year' column to dates_df, so reading it raises KeyError on a
# fresh kernel.
# Quarters are shifted to 0..3, which reproduces the recorded output
# (Q1 -> sin 0 / cos 1, Q3 -> sin ~1.2e-16 / cos -1).
quarter_angle = 2 * np.pi * (dates_df['date'].dt.quarter - 1) / 4
quarter_of_year_cyclic_df = pd.DataFrame()
quarter_of_year_cyclic_df['quarter_of_year_sin'] = np.sin(quarter_angle)
quarter_of_year_cyclic_df['quarter_of_year_cos'] = np.cos(quarter_angle)
quarter_of_year_cyclic_df

Unnamed: 0,quarter_of_year_sin,quarter_of_year_cos
0,0.000000e+00,1.0
1,0.000000e+00,1.0
2,0.000000e+00,1.0
3,0.000000e+00,1.0
4,0.000000e+00,1.0
...,...,...
1683,1.224647e-16,-1.0
1684,1.224647e-16,-1.0
1685,1.224647e-16,-1.0
1686,1.224647e-16,-1.0


In [12]:
# One indicator column per calendar year.
# The year is derived from the date column: no cell in this notebook adds a
# 'year' column to dates_df, so reading it raises KeyError on a fresh kernel.
# get_dummies labels each column with the value it encodes, so the columns are
# already the real years (2013..2017 in the recorded output). No positional
# relabelling is needed, and a gap in the years cannot mislabel a column.
year_one_hot_df = pd.get_dummies(dates_df['date'].dt.year)
year_one_hot_df

Unnamed: 0,2013,2014,2015,2016,2017
0,True,False,False,False,False
1,True,False,False,False,False
2,True,False,False,False,False
3,True,False,False,False,False
4,True,False,False,False,False
...,...,...,...,...,...
1683,False,False,False,False,True
1684,False,False,False,False,True
1685,False,False,False,False,True
1686,False,False,False,False,True
