In [24]:
import pandas as pd
import numpy as np

In [25]:
# RAW
# Primary inputs: the rows to be scored and the per-store attributes.
Test, Stores = (
  pd.read_csv(csv_path)
  for csv_path in ("./data_raw/test.csv", "./data_raw/stores.csv")
)

# AUXILIARY
# Side tables that the feature-engineering cells join onto Test.
Oil, Holidays, Transactions = (
  pd.read_csv(csv_path)
  for csv_path in (
    "./data_aux/oil.csv",
    "./data_aux/holidays.csv",
    "./data_aux/transactions.csv",
  )
)

In [26]:
# MERGE OIL
# Attach the oil price for each row's date.
# A left join keeps every test row: with the default inner join, any row whose
# date has no entry in Oil would be silently dropped from the prediction set.
# Rows without a match end up with a missing oil_price instead.
# NOTE(review): if Oil can skip dates (e.g. non-trading days), decide how the
# resulting missing prices should be imputed before modelling.
Test = Test.merge(
  Oil[['date', 'oil_price']],
  on='date',
  how='left',
)

In [27]:
# STORE INFORMATION
# Attach the descriptive store attributes to each row via store_nbr.
# A left join guarantees no test row is lost if a store_nbr is absent from
# Stores (the default inner join would drop such rows without any warning).
Test = Test.merge(
  Stores[['store_nbr', 'city', 'state', 'type', 'cluster']],
  on='store_nbr',
  how='left',
)

In [28]:
# HOLIDAYS
def _locale_holiday_mask(frame, holidays, locale, key):
  """Flag rows of ``frame`` that fall on a holiday of the given locale.

  Parameters
  ----------
  frame : DataFrame with a 'date' column and a ``key`` column.
  holidays : DataFrame with 'date', 'locale' and 'locale_name' columns.
  locale : value of ``holidays['locale']`` to select (e.g. 'Regional').
  key : column of ``frame`` that ``locale_name`` is matched against
    (e.g. 'state' or 'city').

  Returns
  -------
  numpy bool array, one entry per row of ``frame`` in row order; True where
  the row's (date, key) pair appears among the selected holidays.
  """
  keyed = (
    holidays.loc[holidays['locale'] == locale, ['date', 'locale_name']]
    .rename(columns={'locale_name': key})
    .drop_duplicates()   # unique keys => the left merge cannot duplicate rows
    .assign(_hit=True)
  )
  matched = frame[['date', key]].merge(keyed, on=['date', key], how='left')
  # Unmatched rows carry NaN in _hit; notna() turns that into an explicit False.
  return matched['_hit'].notna().to_numpy()

# A row is a holiday if its date is a national holiday, or a regional holiday
# for its state, or a local holiday for its city.
# NOTE(review): every row of Holidays counts as a holiday here; if the file
# also carries non-holiday event rows, they should be filtered out first.
national_dates = Holidays.loc[Holidays['locale'] == 'National', 'date']
Test['is_holiday'] = (
  Test['date'].isin(national_dates)
  | _locale_holiday_mask(Test, Holidays, 'Regional', 'state')
  | _locale_holiday_mask(Test, Holidays, 'Local', 'city')
)

In [29]:
# DAYS SINCE PAYDAY
Test['date'] = pd.to_datetime(Test['date'])

# freq='MS' produces month-start timestamps, so the calendar runs from
# 2013-01-01 up to the last month start not later than the newest test date.
# NOTE(review): this treats the first day of each month as payday -- confirm
# that this is the intended pay schedule.
month_starts = pd.date_range(start='2012-12-31', end=Test['date'].max(), freq='MS')

# Normalise to a unique, ascending Series with a clean 0..n-1 index.
paydays = (
  pd.Series(month_starts)
  .drop_duplicates()
  .sort_values()
  .reset_index(drop=True)
)

def days_since_last_payday(date):
  """Return the whole days elapsed since the latest payday on or before ``date``.

  Looks the date up in the notebook-level ``paydays`` Series (ascending
  timestamps). Returns ``np.nan`` when no payday is on or before ``date``.
  """
  earlier = paydays.loc[paydays.le(date)]
  if earlier.empty:
    return np.nan
  # paydays is ascending, so the last surviving entry is the most recent one.
  most_recent = earlier.iloc[-1]
  return (date - most_recent).days

# Vectorised equivalent of applying days_since_last_payday to every row:
# instead of building a boolean mask over paydays once per row, locate each
# date in the sorted payday calendar with a single binary search.
# side='right' makes a date that is itself a payday map to that payday (0 days).
payday_slot = np.searchsorted(
  paydays.to_numpy(), Test['date'].to_numpy(), side='right'
) - 1
# Slot -1 means "no payday on or before this date"; reindex turns it into NaT,
# which propagates to NaN in the final column (same as the row-wise helper).
last_payday = paydays.reindex(payday_slot).to_numpy()
Test['days_since_payday'] = (Test['date'] - last_payday).dt.days

In [None]:
# DAY INFO
# Parse once and derive the calendar features from the parsed values.
parsed_dates = pd.to_datetime(Test['date'])

# Monday=0 ... Sunday=6, so 5 and 6 are Saturday and Sunday.
Test['day_of_week'] = parsed_dates.dt.weekday
Test['is_weekend'] = Test['day_of_week'].isin([5, 6])

# Store plain datetime.date objects (no time component) in the date column.
Test['date'] = parsed_dates.dt.date

In [32]:
# Persist the engineered test features. ``index`` is left at its default, so
# the DataFrame index is written as the first (unnamed) CSV column.
Test.to_csv(path_or_buf="./data_predict/test.csv")