In [1]:
import os
import gc
import time
import math
import datetime
from math import log, floor
from sklearn.neighbors import KDTree

import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.utils import shuffle
from tqdm.notebook import tqdm as tqdm

import seaborn as sns
from matplotlib import colors
import matplotlib.pyplot as plt
from matplotlib.colors import Normalize

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
from plotly.subplots import make_subplots

import pywt
from statsmodels.robust import mad

import scipy
import statsmodels
from scipy import signal
import statsmodels.api as sm
from fbprophet import Prophet
from scipy.signal import butter, deconvolve
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.api import ExponentialSmoothing, SimpleExpSmoothing, Holt

import warnings
warnings.filterwarnings("ignore")

  import pandas.util.testing as tm


In [0]:
sales_data =  pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/Kaggle_data/m5-forecasting-accuracy/sales_data.csv')
calendar = pd.read_csv('/content/drive/My Drive/Colab Notebooks/data/Kaggle_data/m5-forecasting-accuracy/calendar.csv')

In [3]:
sales_data.drop(['Unnamed: 0'],axis=1,inplace=True)
sales_data.head()

Unnamed: 0,id,d,value,wm_yr_wk,sell_price,National,Sporting,Cultural,Religious
0,HOBBIES_1_001_CA_1,1,0,11101,,0,0,0,0
1,HOBBIES_1_002_CA_1,1,0,11101,,0,0,0,0
2,HOBBIES_1_003_CA_1,1,0,11101,,0,0,0,0
3,HOBBIES_1_004_CA_1,1,0,11101,,0,0,0,0
4,HOBBIES_1_005_CA_1,1,0,11101,,0,0,0,0


In [0]:
calendar['d'].replace(regex ='d_',value='',inplace=True)
calendar['d'] = calendar['d'].astype('int16')

In [5]:
calendar.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,5,,,,,1,0,1


메모리를 낮추기 위해 type 변경을 하자.
- int 16: [-2 ** 15,2 ** 15]
- int 8 : [-2 ** 7,2 ** 7]
 > int 16이 여기서 다루고 있는 데이터를 모두 포함하기 때문에 이걸 쓰도록 하자.

In [0]:
def change_type(data,col):
  if data[col].dtypes =='int64':
    data[col] = data[col].astype('int16')
  elif data[col].dtypes == 'float64':
    data[col] = data[col].astype('float32')
  else:
    pass

In [0]:
for col in sales_data.columns:
  change_type(sales_data,col)

In [9]:
sales_data.dtypes

id             object
d               int16
value           int16
wm_yr_wk        int16
sell_price    float32
National        int16
Sporting        int16
Cultural        int16
Religious       int16
dtype: object

In [10]:
sales_data

Unnamed: 0,id,d,value,wm_yr_wk,sell_price,National,Sporting,Cultural,Religious
0,HOBBIES_1_001_CA_1,1,0,11101,,0,0,0,0
1,HOBBIES_1_002_CA_1,1,0,11101,,0,0,0,0
2,HOBBIES_1_003_CA_1,1,0,11101,,0,0,0,0
3,HOBBIES_1_004_CA_1,1,0,11101,,0,0,0,0
4,HOBBIES_1_005_CA_1,1,0,11101,,0,0,0,0
...,...,...,...,...,...,...,...,...,...
58327365,FOODS_3_823_WI_3,1913,1,11613,2.98,0,0,0,0
58327366,FOODS_3_824_WI_3,1913,0,11613,2.48,0,0,0,0
58327367,FOODS_3_825_WI_3,1913,0,11613,3.98,0,0,0,0
58327368,FOODS_3_826_WI_3,1913,3,11613,1.28,0,0,0,0


int16 까지는 type 변경을 해도 될 것으로 보임.

In [11]:
event_name_list= list(set(list(calendar['event_name_1'].unique()) + list(calendar['event_name_2'].unique())))
print(event_name_list)
event_name_list = event_name_list[1:]
print(event_name_list)
print(len(event_name_list))

[nan, 'Thanksgiving', 'Eid al-Fitr', 'SuperBowl', 'Chanukah End', 'Cinco De Mayo', "Mother's day", 'Halloween', 'ColumbusDay', 'MartinLutherKingDay', 'IndependenceDay', 'NBAFinalsStart', 'OrthodoxChristmas', 'Purim End', 'LentStart', 'LaborDay', 'EidAlAdha', 'MemorialDay', 'Christmas', 'OrthodoxEaster', 'PresidentsDay', 'StPatricksDay', 'VeteransDay', 'NBAFinalsEnd', 'Pesach End', 'ValentinesDay', 'LentWeek2', 'NewYear', 'Easter', 'Ramadan starts', "Father's day"]
['Thanksgiving', 'Eid al-Fitr', 'SuperBowl', 'Chanukah End', 'Cinco De Mayo', "Mother's day", 'Halloween', 'ColumbusDay', 'MartinLutherKingDay', 'IndependenceDay', 'NBAFinalsStart', 'OrthodoxChristmas', 'Purim End', 'LentStart', 'LaborDay', 'EidAlAdha', 'MemorialDay', 'Christmas', 'OrthodoxEaster', 'PresidentsDay', 'StPatricksDay', 'VeteransDay', 'NBAFinalsEnd', 'Pesach End', 'ValentinesDay', 'LentWeek2', 'NewYear', 'Easter', 'Ramadan starts', "Father's day"]
30


In [0]:
def make_onehot_name_col(data,col):
  data[col] = 0
  data[col] = data[col].astype('int16')
  for day in calendar.loc[(calendar['event_name_1'] == col)|(calendar['event_name_2'] == col),'d'].values:
    data.loc[data['d']==day, col] +=1
  # if data[col].dtypes == 'int64':
  #   data[col] = data[col].astype('int32')

for col in event_name_list:
  make_onehot_name_col(sales_data,col)

In [22]:
sales_data.head()

Unnamed: 0,id,d,value,wm_yr_wk,sell_price,National,Sporting,Cultural,Religious,Thanksgiving,Eid al-Fitr,SuperBowl,Chanukah End,Cinco De Mayo,Mother's day,Halloween,ColumbusDay,MartinLutherKingDay,IndependenceDay,NBAFinalsStart,OrthodoxChristmas,Purim End,LentStart,LaborDay,EidAlAdha,MemorialDay,Christmas,OrthodoxEaster,PresidentsDay,StPatricksDay,VeteransDay,NBAFinalsEnd,Pesach End,ValentinesDay,LentWeek2,NewYear,Easter,Ramadan starts,Father's day
0,HOBBIES_1_001_CA_1,1,0,11101,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,HOBBIES_1_002_CA_1,1,0,11101,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,HOBBIES_1_003_CA_1,1,0,11101,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,HOBBIES_1_004_CA_1,1,0,11101,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,HOBBIES_1_005_CA_1,1,0,11101,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


아직 더 넣을 만한 변수들이 있는지 확인해 보자.
- snap, year, wday, month 모두 넣어 주자.

In [24]:
calendar.head()

Unnamed: 0,date,wm_yr_wk,weekday,wday,month,year,d,event_name_1,event_type_1,event_name_2,event_type_2,snap_CA,snap_TX,snap_WI
0,2011-01-29,11101,Saturday,1,1,2011,1,,,,,0,0,0
1,2011-01-30,11101,Sunday,2,1,2011,2,,,,,0,0,0
2,2011-01-31,11101,Monday,3,1,2011,3,,,,,0,0,0
3,2011-02-01,11101,Tuesday,4,2,2011,4,,,,,1,1,0
4,2011-02-02,11101,Wednesday,5,2,2011,5,,,,,1,0,1


In [0]:
for col in ['snap_CA','snap_TX','snap_WI','year','month','wday']:
  sales_data = sales_data.merge(calendar[['wm_yr_wk','d',col]],on=['wm_yr_wk','d'],how='left')
  sales_data[col] = sales_data[col].astype('int16')

In [27]:
sales_data.head()

Unnamed: 0,id,d,value,wm_yr_wk,sell_price,National,Sporting,Cultural,Religious,Thanksgiving,Eid al-Fitr,SuperBowl,Chanukah End,Cinco De Mayo,Mother's day,Halloween,ColumbusDay,MartinLutherKingDay,IndependenceDay,NBAFinalsStart,OrthodoxChristmas,Purim End,LentStart,LaborDay,EidAlAdha,MemorialDay,Christmas,OrthodoxEaster,PresidentsDay,StPatricksDay,VeteransDay,NBAFinalsEnd,Pesach End,ValentinesDay,LentWeek2,NewYear,Easter,Ramadan starts,Father's day,snap_CA,snap_TX,snap_WI,year,month,wday
0,HOBBIES_1_001_CA_1,1,0,11101,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011,1,1
1,HOBBIES_1_002_CA_1,1,0,11101,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011,1,1
2,HOBBIES_1_003_CA_1,1,0,11101,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011,1,1
3,HOBBIES_1_004_CA_1,1,0,11101,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011,1,1
4,HOBBIES_1_005_CA_1,1,0,11101,,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2011,1,1


In [0]:
sales_data.to_csv('/content/drive/My Drive/Colab Notebooks/data/Kaggle_data/m5-forecasting-accuracy/total.csv',index=False)