In [None]:
import pandas as pd
import numpy as np
from itertools import product

In [2]:
# RAW
# The `id` column of the training table is discarded right after loading.
Stores = pd.read_csv("./data_raw/stores.csv")
Train = pd.read_csv("./data_raw/train.csv")
Train = Train.drop(columns=["id"])

# AUXILIARY
# Side tables that later cells join onto Train.
Transactions = pd.read_csv("./data_aux/transactions.csv")
Holidays = pd.read_csv("./data_aux/holidays.csv")
Oil = pd.read_csv("./data_aux/oil.csv")

In [4]:
# MERGE OIL
# Attach `oil_price` to every row by matching on `date`.
# This is an inner join (the pandas default): any Train row whose date is
# absent from Oil is dropped from the result.
Train = pd.merge(
  Train,
  Oil.loc[:, ['date', 'oil_price']],
  how='inner',
  on='date',
)

In [5]:
# TRANSACTIONS
# Attach the `transactions` value for each (date, store_nbr) pair.
# Inner join (the pandas default): Train rows without a matching
# (date, store_nbr) entry in Transactions are dropped.
transaction_columns = ['date', 'store_nbr', 'transactions']
Train = pd.merge(
  Train,
  Transactions.loc[:, transaction_columns],
  how='inner',
  on=['date', 'store_nbr'],
)

In [6]:
# STORE INFORMATION
# Attach the per-store attributes (city, state, type, cluster) by `store_nbr`.
# Inner join (the pandas default): Train rows whose store_nbr is missing
# from Stores are dropped.
store_columns = ['store_nbr', 'city', 'state', 'type', 'cluster']
Train = pd.merge(
  Train,
  Stores.loc[:, store_columns],
  how='inner',
  on='store_nbr',
)

In [7]:
# HOLIDAYS
def _holiday_mask(frame, holidays, locale, key=None):
  """Mark the rows of `frame` that fall on a holiday of the given locale.

  Parameters
  ----------
  frame : pandas.DataFrame
    Must contain a `date` column and, when `key` is given, a `key` column.
  holidays : pandas.DataFrame
    Must contain `date`, `locale` and `locale_name` columns.
  locale : str
    Value of `holidays['locale']` to select (e.g. 'National').
  key : str or None
    Column of `frame` that is compared with `holidays['locale_name']`.
    When None, rows are matched on `date` alone.

  Returns
  -------
  numpy.ndarray of bool
    One entry per row of `frame`, in row order; True where a matching
    holiday exists.
  """
  scoped = holidays.loc[holidays['locale'] == locale]
  if key is None:
    return frame['date'].isin(scoped['date']).to_numpy()

  # Unique (date, key) pairs, so the left join below cannot duplicate rows.
  pairs = (
    scoped[['date', 'locale_name']]
    .rename(columns={'locale_name': key})
    .drop_duplicates()
  )
  # Only the two key columns are joined; `_merge` is 'both' for rows that
  # found a holiday and 'left_only' otherwise, which gives a real boolean
  # instead of an object column holding True/NaN.
  matched = frame[['date', key]].merge(pairs, on=['date', key], how='left', indicator=True)
  return (matched['_merge'] == 'both').to_numpy()


# A row is a holiday if it matches a national date, a regional holiday of its
# state, or a local holiday of its city.
Train['is_holiday'] = (
  _holiday_mask(Train, Holidays, 'National')
  | _holiday_mask(Train, Holidays, 'Regional', key='state')
  | _holiday_mask(Train, Holidays, 'Local', key='city')
)

In [8]:
# DAYS SINCE PAYDAY
Train['date'] = pd.to_datetime(Train['date'])

# freq='MS' yields the first day of every month, so the earliest payday
# generated here is 2013-01-01. date_range already returns sorted, unique
# values, so no further de-duplication or sorting is needed.
# NOTE(review): the 2012-12-31 start hints that month-end paydays may have
# been intended -- confirm with the author before changing the frequency.
paydays = pd.Series(pd.date_range(start='2012-12-31', end=Train['date'].max(), freq='MS'))

def days_since_last_payday(date):
  """Return the number of days between `date` and the latest payday on or before it.

  Scalar helper kept for single-date lookups; the column below is computed
  with an equivalent vectorized lookup instead of calling this per row.

  Parameters
  ----------
  date : pandas.Timestamp

  Returns
  -------
  int, or numpy.nan when no payday is on or before `date`.
  """
  prev_paydays = paydays[paydays <= date]
  if prev_paydays.empty:
    return np.nan
  return (date - prev_paydays.iloc[-1]).days

# Vectorized equivalent of applying days_since_last_payday to every row:
# for each date, find the position of the latest payday <= date
# (-1 when there is none) with a single binary search over the sorted paydays.
payday_pos = np.searchsorted(paydays.to_numpy(), Train['date'].to_numpy(), side='right') - 1
# `paydays` has a 0..n-1 index, so reindexing by position returns NaT for -1,
# which propagates to NaN in the day count just like the scalar helper.
last_payday = paydays.reindex(payday_pos).to_numpy()
Train['days_since_payday'] = (Train['date'] - last_payday).dt.days

In [9]:
# DAY INFO
# Parse the dates once and derive every calendar feature from that Series.
timestamps = pd.to_datetime(Train['date'])

# Monday is 0 and Sunday is 6, so 5 and 6 are Saturday and Sunday.
Train['day_of_week'] = timestamps.dt.dayofweek
Train['is_weekend'] = Train['day_of_week'].isin([5, 6])

# Store plain `datetime.date` objects (object dtype) in the `date` column.
Train['date'] = timestamps.dt.date

In [10]:
# Persist the engineered table. `index=True` is the pandas default and is
# spelled out here on purpose: the row index is written as an unnamed first
# column, which readers of this file must account for (e.g. `index_col=0`).
Train.to_csv(path_or_buf="./data_predict/train.csv", index=True)

In [11]:
# Bare last expression of the cell: the notebook renders Train as the cell output.
Train

Unnamed: 0,date,store_nbr,family,sales,onpromotion,oil_price,transactions,city,state,type,cluster,is_holiday,days_since_payday,day_of_week,is_weekend
0,2013-01-01,1,AUTOMOTIVE,0.000,0,93.14,0.0,Quito,Pichincha,D,13,True,0,1,False
1,2013-01-01,1,BABY CARE,0.000,0,93.14,0.0,Quito,Pichincha,D,13,True,0,1,False
2,2013-01-01,1,BEAUTY,0.000,0,93.14,0.0,Quito,Pichincha,D,13,True,0,1,False
3,2013-01-01,1,BEVERAGES,0.000,0,93.14,0.0,Quito,Pichincha,D,13,True,0,1,False
4,2013-01-01,1,BOOKS,0.000,0,93.14,0.0,Quito,Pichincha,D,13,True,0,1,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3000883,2017-08-15,9,POULTRY,438.133,0,47.57,2155.0,Quito,Pichincha,B,6,False,14,1,False
3000884,2017-08-15,9,PREPARED FOODS,154.553,1,47.57,2155.0,Quito,Pichincha,B,6,False,14,1,False
3000885,2017-08-15,9,PRODUCE,2419.729,148,47.57,2155.0,Quito,Pichincha,B,6,False,14,1,False
3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8,47.57,2155.0,Quito,Pichincha,B,6,False,14,1,False
