# IMPORTS

## Libraries

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import datetime

## Loading Data

In [2]:
# Read the description-stage CSV; 'Date' is parsed to datetime so the `.dt`
# accessors used in the feature-engineering cell are available.
dfRaw = pd.read_csv(
    '../../01-Data/Results/01-FirstRoundCRISP/dfDescriptionData.csv',
    parse_dates=['Date'],
    low_memory=False,
)

# FEATURE ENGINEERING

In [3]:
# Work on an independent (deep) copy so the loaded frame `dfRaw` stays untouched.
dfRaw1 = dfRaw.copy(deep=True)

In [4]:
# NOTE: this cell mutates dfRaw1 in place and is not idempotent: running it a
# second time re-maps the already-mapped 'Assortment' / 'StateHoliday' values
# (Assortment becomes all-NaN, StateHoliday all 'Regular Day'). Re-run the
# `dfRaw1 = dfRaw.copy()` cell first.

# --- Calendar features derived from the sale date ---

# Year
dfRaw1['Year'] = dfRaw1['Date'].dt.year

# Month
dfRaw1['Month'] = dfRaw1['Date'].dt.month

# Day
dfRaw1['Day'] = dfRaw1['Date'].dt.day

# Week of year
# `Series.dt.weekofyear` was deprecated in pandas 1.1 and removed in 2.0.
# `isocalendar().week` yields the same ISO-8601 week number; it is returned as
# UInt32, so cast back to int64 to keep the previous column dtype.
dfRaw1['WeekOfYear'] = dfRaw1['Date'].dt.isocalendar().week.astype('int64')

# Year-week label ('%W' = week of year with Monday as the first day of the week)
dfRaw1['YearWeek'] = dfRaw1['Date'].dt.strftime('%Y-%W')

# --- Competition since ---
# First day of the month in which the competitor opened. Assembled in one
# vectorised call instead of building a datetime per row with `apply`.
dfRaw1['CompetionSinse'] = pd.to_datetime({
    'year': dfRaw1['CompetitionOpenSinceYear'],
    'month': dfRaw1['CompetitionOpenSinceMonth'],
    'day': 1,
})
# Elapsed time since the competitor opened, in 30-day "months"
# (whole days of timedelta / 30; negative when the competitor opens after Date).
dfRaw1['CompetionTimeMonth'] = ((dfRaw1['Date'] - dfRaw1['CompetionSinse']) / 30).dt.days.astype(int)

# --- Promo since ---
# Build 'YYYY-WW-1' (the trailing '-1' selects Monday for '%w'), parse it, then
# shift back one week, exactly as the original per-row strptime did.
promoYearWeek = dfRaw1['Promo2SinceYear'].astype(str) + '-' + dfRaw1['Promo2SinceWeek'].astype(str)
dfRaw1['PromoSince'] = pd.to_datetime(promoYearWeek + '-1', format='%Y-%W-%w') - pd.Timedelta(days=7)
# Elapsed time since the promo start, in whole weeks.
dfRaw1['PromoTimeWeek'] = ((dfRaw1['Date'] - dfRaw1['PromoSince']) / 7).dt.days.astype(int)

# --- Categorical decoding ---

# Assortment (level: a = basic, b = extra, c = extended)
level = {
    'a': 'basic',
    'b': 'extra',
    'c': 'extended',
}
dfRaw1['Assortment'] = dfRaw1['Assortment'].map(level)

# State Holiday (a = public holiday, b = Easter holiday, c = Christmas, 0 = None)
holiday = {
    'a': 'public holiday',
    'b': 'Easter holiday',
    'c': 'Christmas',
}
# Any value not in `holiday` maps to NaN and is then labelled 'Regular Day'.
# Assign the result instead of calling `fillna(..., inplace=True)` on the
# selected column: that chained in-place form does not update the frame under
# pandas Copy-on-Write, and the warning about it is hidden by the notebook's
# global warnings filter.
dfRaw1['StateHoliday'] = dfRaw1['StateHoliday'].map(holiday).fillna('Regular Day')

In [5]:
# Show the first five rows transposed, so each column reads as a line.
dfRaw1.head(5).transpose()

Unnamed: 0,0,1,2,3,4
Store,1,2,3,4,5
DayOfWeek,5,5,5,5,5
Date,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00,2015-07-31 00:00:00
Sales,5263,6064,8314,13995,4822
Customers,555,625,821,1498,559
Open,1,1,1,1,1
Promo,1,1,1,1,1
StateHoliday,Regular Day,Regular Day,Regular Day,Regular Day,Regular Day
SchoolHoliday,1,1,1,1,1
StoreType,c,a,a,c,a


# Variable Filtering

In [6]:
# Filter on an independent (deep) copy so `dfRaw1` keeps every row and column.
dfRaw2 = dfRaw1.copy(deep=True)

## Row Filtering

In [7]:
# Keep only rows where 'Open' is not 0 and 'Sales' is strictly positive.
dfRaw2 = dfRaw2.loc[dfRaw2['Open'].ne(0) & dfRaw2['Sales'].gt(0)]

## Column Filtering

In [8]:
# Columns removed from the filtered frame.
toDrop = ['Customers', 'Open', 'PromoInterval', 'MonthMap']
# Rebind the result rather than using `inplace=True`: dfRaw2 is the output of a
# boolean filter at this point, and mutating such a frame in place is the
# pattern pandas discourages (no speed benefit, and it blocks method chaining).
dfRaw2 = dfRaw2.drop(columns=toDrop)

In [9]:
# Display the remaining column labels to confirm the drop above took effect.
dfRaw2.columns

Index(['Store', 'DayOfWeek', 'Date', 'Sales', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'IsPromo', 'Year', 'Month', 'Day',
       'WeekOfYear', 'YearWeek', 'CompetionSinse', 'CompetionTimeMonth',
       'PromoSince', 'PromoTimeWeek'],
      dtype='object')

# Convert DataFrame to .csv

In [10]:
# Write the filtered, feature-engineered frame to disk without the index column.
dfRaw2.to_csv(
    '../../01-Data/Results/01-FirstRoundCRISP/dfFeatureEngineering.csv',
    index=False,
)