# IMPORTS

## Libraries

In [1]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings("ignore")

import seaborn as sns
import matplotlib.pyplot as plt
from IPython.core.display import HTML

from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder

## Helper Functions

In [2]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 16]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [3]:
# Apply the plotting and pandas display settings once for the whole notebook.
jupyter_settings()

Populating the interactive namespace from numpy and matplotlib


## Loading Data

In [4]:
# Load dfFeatureEngineering.csv. `parse_dates` converts the Date column to
# datetime; `low_memory=False` makes pandas infer each column's dtype from the
# whole file instead of chunk by chunk.
dfRaw = pd.read_csv('../../01-Data/Results/01-FirstRoundCRISP/dfFeatureEngineering.csv', low_memory=False, parse_dates=['Date'])

# DATA PREPARATION

In [5]:
# Work on a copy so the loaded dfRaw is not modified by the steps that follow.
dfRaw1 = dfRaw.copy()

## ~~Normalization~~

## Rescaling

In [6]:
# Keep only the int64/float64 columns. select_dtypes returns a new DataFrame,
# so assignments made to numAttributes do not propagate back to dfRaw1.
numAttributes = dfRaw1.select_dtypes(include=['int64', 'float64'])
numAttributes.head()

Unnamed: 0,Store,DayOfWeek,Sales,Promo,SchoolHoliday,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,IsPromo,Year,Month,Day,WeekOfYear,CompetionTimeMonth,PromoTimeWeek
0,1,5,5263,1,1,1270.0,9,2008,0,31,2015,0,2015,7,31,31,84,0
1,2,5,6064,1,1,570.0,11,2007,1,13,2010,1,2015,7,31,31,94,279
2,3,5,8314,1,1,14130.0,12,2006,1,14,2011,1,2015,7,31,31,105,226
3,4,5,13995,1,1,620.0,9,2009,0,31,2015,0,2015,7,31,31,71,0
4,5,5,4822,1,1,29910.0,4,2015,0,31,2015,0,2015,7,31,31,4,0


In [12]:
# Rescaling plan: (column, scaler class).
#   RobustScaler -> centres on the median and scales by the IQR; used for the
#                   columns with well-defined outliers.
#   MinMaxScaler -> maps the column onto [0, 1].
rescalePlan = [
    ('CompetitionDistance', RobustScaler),  # presence of well-defined outliers
    ('CompetionTimeMonth', RobustScaler),   # presence of well-defined outliers
    ('PromoTimeWeek', MinMaxScaler),
    ('Year', MinMaxScaler),
]

# One fitted scaler per column, so the fitted parameters of every column stay
# available (a single shared instance would be overwritten by each refit).
fittedScalers = {}
for column, scalerClass in rescalePlan:
    scaler = scalerClass()
    scaledValues = scaler.fit_transform(dfRaw1[[column]].values).ravel()

    # Write the result into dfRaw1: this is the frame that gets exported.
    # numAttributes is a separate copy, so updating only it would leave the
    # exported data unscaled. It is still updated to stay in sync for inspection.
    dfRaw1[column] = scaledValues
    numAttributes[column] = scaledValues
    fittedScalers[column] = scaler

# Keep the original names bound to the same final fits as before
# (rs was last fitted on CompetionTimeMonth, mms on Year).
rs = fittedScalers['CompetionTimeMonth']
mms = fittedScalers['Year']

# NOTE(review): the scalers are fitted on the full dataset; if a train/test
# split happens later, confirm this does not leak test statistics.

## Transformation

### Encoding

In [17]:
# StateHoliday -> one-hot encoding.
# The StateHoliday column is replaced by one indicator column per category,
# each named with the "StateHoliday" prefix.
dfRaw1 = pd.get_dummies(
    dfRaw1,
    columns=['StateHoliday'],
    prefix=['StateHoliday'],
)

# StoreType -> label encoding (one integer code per distinct store type).
le = LabelEncoder()
dfRaw1['StoreType'] = le.fit_transform(dfRaw1['StoreType'])

# Assortment -> ordinal encoding with an explicit ranking.
# Values that are not keys of the mapping become NaN under .map().
dictAssortment = {'basic': 1, 'extra': 2, 'extended': 3}
dfRaw1['Assortment'] = dfRaw1['Assortment'].map(dictAssortment)

### Response Variable Transformation

In [18]:
# Log-transform the response variable (log1p = log(1 + x), defined at 0).
# Read from the untouched dfRaw rather than from dfRaw1 itself so that
# re-running this cell cannot apply the logarithm twice; the assignment is
# aligned on the shared row index.
dfRaw1['Sales'] = np.log1p(dfRaw['Sales'])

### Nature Transformation

In [21]:
# Cyclical encoding: project each periodic calendar feature onto the unit
# circle, producing "<column>Sin" and "<column>Cos" columns.
# Mapping: source column -> period used for the angle.
# NOTE(review): Day uses period 30 although Day reaches 31, so day 31 gets the
# same encoding as day 1. Left unchanged to preserve existing feature values.
cyclePeriods = {
    'Month': 12,
    'Day': 30,
    'WeekOfYear': 52,
    'DayOfWeek': 7,
}

for column, period in cyclePeriods.items():
    # Vectorised over the whole column instead of a Python-level lambda per row.
    angle = dfRaw1[column] * (2 * np.pi / period)
    dfRaw1[column + 'Sin'] = np.sin(angle)
    dfRaw1[column + 'Cos'] = np.cos(angle)

# Convert DataFrame to .csv

In [23]:
# Write the prepared frame to disk; index=False leaves the row index out of the CSV.
dfRaw1.to_csv('../../01-Data/Results/01-FirstRoundCRISP/dfDataPreparation.csv', index=False)