# 0.0 Imports

In [2]:
pip install xgboost

Collecting xgboost
  Downloading xgboost-1.5.2-py3-none-manylinux2014_x86_64.whl (173.6 MB)
[K     |████████████████████████████████| 173.6 MB 5.2 kB/s eta 0:00:01    |███▌                            | 18.9 MB 4.6 MB/s eta 0:00:34     |████▎                           | 23.4 MB 1.8 MB/s eta 0:01:23     |██████▋                         | 35.6 MB 4.9 MB/s eta 0:00:29     |█████████▎                      | 50.4 MB 5.2 MB/s eta 0:00:24     |███████████▌                    | 62.5 MB 5.1 MB/s eta 0:00:22     |████████████████▊               | 90.4 MB 5.2 MB/s eta 0:00:16     |███████████████████             | 103.4 MB 4.4 MB/s eta 0:00:16     |████████████████████████        | 129.9 MB 5.3 MB/s eta 0:00:09     |████████████████████████▊       | 134.4 MB 5.8 MB/s eta 0:00:07     |█████████████████████████▊      | 139.3 MB 7.4 MB/s eta 0:00:05     |██████████████████████████████▋ | 166.2 MB 5.1 MB/s eta 0:00:02
Installing collected packages: xgboost
Successfully installed xgboost-1.5.2
Note: y

In [3]:
import math
import datetime
import inflection

import numpy as np
import pandas as pd
import xgboost as xgb
import seaborn as sns
import matplotlib.pyplot as plt


from scipy import stats as ss
from boruta import BorutaPy
from tabulate import tabulate
from matplotlib import gridspec
from IPython.display import Image
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression, Lasso
from sklearn.preprocessing import RobustScaler, MinMaxScaler, LabelEncoder
from sklearn.metrics import mean_absolute_error, mean_squared_error ,mean_absolute_percentage_error

  from pandas import MultiIndex, Int64Index


## 0.1 Helper Functions

In [4]:
#V de Cramer
def cramer_v(x, y):
    """Bias-corrected Cramér's V between two categorical variables.

    Parameters
    ----------
    x, y : array-like
        Categorical observations of equal length; they are cross-tabulated
        with ``pd.crosstab``.

    Returns
    -------
    float
        Strength of association, from 0 (none) to 1 (perfect).
    """
    cm = pd.crosstab(x, y).values
    n = cm.sum()
    r, k = cm.shape

    chi2 = ss.chi2_contingency(cm)[0]

    # The bias correction is defined on phi^2 = chi2 / n. Subtracting it from
    # the raw chi2 (as before) under-corrects and lets the statistic exceed 1.
    phi2 = chi2 / n
    phi2corr = max(0, phi2 - (k - 1) * (r - 1) / (n - 1))
    kcorr = k - (k - 1) ** 2 / (n - 1)
    rcorr = r - (r - 1) ** 2 / (n - 1)

    return np.sqrt(phi2corr / min(kcorr - 1, rcorr - 1))


def ml_error(model_name, y, yhat):
    """Summarise regression errors for one model as a single-row DataFrame.

    Parameters
    ----------
    model_name : str
        Label stored in the 'Model Name' column.
    y : array-like
        Observed values.
    yhat : array-like
        Predicted values.

    Returns
    -------
    pandas.DataFrame
        One row (index 0) with the columns 'Model Name', 'MAE', 'MAPE', 'RMSE'.
    """
    metrics = {
        'Model Name': model_name,
        'MAE': mean_absolute_error(y, yhat),
        'MAPE': mean_absolute_percentage_error(y, yhat),
        # RMSE is the square root of the mean squared error
        'RMSE': np.sqrt(mean_squared_error(y, yhat)),
    }
    return pd.DataFrame(metrics, index=[0])


def cross_validation(x_training, kfold, model_name, model, verbose=False):
    """Time-ordered cross-validation over consecutive six-week windows.

    For k = kfold .. 1 the validation window runs from ``k * 6`` weeks before
    the last date in ``x_training`` up to ``(k - 1) * 6`` weeks before it (both
    bounds inclusive); everything strictly before the window is used to train.

    Parameters
    ----------
    x_training : pandas.DataFrame
        Must contain the columns 'date' and 'sales'; the remaining columns
        are passed to the model as features.
    kfold : int
        Number of validation windows.
    model_name : str
        Label written to the 'Model Name' column of the result.
    model : estimator
        Object exposing ``fit(X, y)`` (returning the fitted model) and
        ``predict(X)``.
    verbose : bool, default False
        Print the fold number before each iteration.

    Returns
    -------
    pandas.DataFrame
        One row with 'Model Name' and the 'MAE CV', 'MAPE CV', 'RMSE CV'
        columns, each formatted as "mean +/- std" over the folds.
    """
    def _mean_std(values):
        # "mean +/- std", both rounded to two decimals
        return (np.round(np.mean(values), 2).astype(str)
                + ' +/- '
                + np.round(np.std(values), 2).astype(str))

    scores = {'MAE': [], 'MAPE': [], 'RMSE': []}
    last_date = x_training['date'].max()

    for k in range(kfold, 0, -1):
        if verbose:
            print( f"\nKFold Number: {k}" )

        # validation window for this fold
        window_start = last_date - datetime.timedelta(days=k * 6 * 7)
        window_end = last_date - datetime.timedelta(days=(k - 1) * 6 * 7)

        before_window = x_training['date'] < window_start
        in_window = (x_training['date'] >= window_start) & (x_training['date'] <= window_end)
        training = x_training[before_window]
        validation = x_training[in_window]

        # features / target
        xtraining = training.drop(['date', 'sales'], axis=1)
        ytraining = training['sales']
        xvalidation = validation.drop(['date', 'sales'], axis=1)
        yvalidation = validation['sales']

        # fit and predict
        fitted = model.fit(xtraining, ytraining)
        yhat = fitted.predict(xvalidation)

        # errors are computed after applying expm1 to targets and predictions
        fold_result = ml_error(model_name, np.expm1(yvalidation), np.expm1(yhat))
        for metric in scores:
            scores[metric].append(fold_result[metric])

    return pd.DataFrame({'Model Name': model_name,
                         'MAE CV': _mean_std(scores['MAE']),
                         'MAPE CV': _mean_std(scores['MAPE']),
                         'RMSE CV': _mean_std(scores['RMSE'])}, index=[0])


# Global matplotlib defaults applied to every figure drawn below.
plt.rcParams['figure.figsize'] = (18,10)
plt.style.use( 'bmh' )
# Assigned after style.use(), so this font size is the one left in effect.
plt.rcParams['font.size'] = 15

## 0.2 Loading Data

In [None]:
df_sales_raw = pd.read_csv( 'datasets/train.csv', low_memory=False )
df_store_raw = pd.read_csv( 'datasets/store.csv', low_memory=False )
                           
#merge
df_raw = pd.merge( df_sales_raw, df_store_raw, how='left', on='Store')

# 1.0 Descrição dos Dados

## 1.1 Rename Columns

In [None]:
# Work on a copy so df_raw stays untouched.
df1 = df_raw.copy()

# Original CamelCase column names, in the order they appear in the merged frame.
cols_old = [
    'Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo',
    'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment',
    'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear',
    'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval',
]

# CamelCase -> snake_case
snakecase = lambda x: inflection.underscore( x )
cols_new = [snakecase(name) for name in cols_old]

# Positional rename: relies on cols_old matching the frame's column order.
df1.columns = cols_new

## 1.2 Data Dimension

In [None]:
# The labels previously ran straight into the numbers (e.g. "Number of Rows1017209").
print(f'Number of Rows: {df1.shape[0]}')
print(f'Number of Cols: {df1.shape[1]}')

## 1.3 Data Types

In [None]:
df1['date'] = pd.to_datetime( df1['date'])
df1.dtypes

## 1.4 Check NA

In [None]:
df1.isna().sum()

## 1.5 Fillout NA

In [None]:
df1['competition_distance'].max()

In [None]:
# competition_distance: a missing value is replaced by a very large distance
# (presumably chosen to exceed the maximum inspected in the previous cell,
# i.e. "no competitor nearby" - confirm).
df1['competition_distance'] = df1['competition_distance'].fillna(200000.0)

# competition_open_since_month / _year: fall back to the month / year of the sale date
df1['competition_open_since_month'] = df1['competition_open_since_month'].fillna(df1['date'].dt.month)
df1['competition_open_since_year'] = df1['competition_open_since_year'].fillna(df1['date'].dt.year)

# promo2_since_week / _year: fall back to the ISO week / year of the sale date
# (Timestamp.week, used before, is the ISO week as well)
df1['promo2_since_week'] = df1['promo2_since_week'].fillna(df1['date'].dt.isocalendar().week.astype('float64'))
df1['promo2_since_year'] = df1['promo2_since_year'].fillna(df1['date'].dt.year)

# promo_interval: 0 marks "no interval"; is_promo is 1 when the sale month is
# listed in the store's promo_interval
month_map = {1: 'Jan', 2: 'Feb', 3: 'Mar', 4: 'Apr', 5: 'May', 6: 'Jun', 7: 'Jul', 8: 'Aug', 9: 'Sep', 10: 'Oct', 11: 'Nov', 12: 'Dec' }
# assign back instead of the chained inplace fillna, which may operate on a temporary copy
df1['promo_interval'] = df1['promo_interval'].fillna(0)
df1['month_map'] = df1['date'].dt.month.map( month_map )
df1['is_promo'] = df1[['promo_interval', 'month_map']].apply( lambda x: 0 if x['promo_interval'] == 0 else 1 if x['month_map'] in x['promo_interval'].split( ',' ) else 0, axis=1 )

df1.isna().sum()

## 1.6 Change Types

In [None]:
df1['competition_open_since_month'] = df1['competition_open_since_month'].astype( int )
df1['competition_open_since_year'] = df1['competition_open_since_year'].astype( int )

df1['promo2_since_week'] = df1['promo2_since_week'].astype( int )
df1['promo2_since_year'] = df1['promo2_since_year'].astype( int )

df1.dtypes

In [None]:
# Write the checkpoint that the next section reads back, so the notebook
# also runs from a fresh kernel on a clean checkout.
df1.to_csv('df1.csv', index_label=False )

## 1.7 Descriptive Statistics

In [None]:
df2 = pd.read_csv( 'df1.csv', low_memory=False )

In [None]:
num_attributes = df2.select_dtypes( include = ['int64', 'float64'])
cat_attributes = df2.select_dtypes( exclude = ['int64', 'float64', 'datetime64[ns]'])

### 1.7.1 Numerical Attributes

In [None]:
# Central tendency - mean, median
ct1 = pd.DataFrame( num_attributes.apply( np.mean )).T
ct2 = pd.DataFrame( num_attributes.apply( np.median )).T

# dispersion - std, min, max, range, skew, kurtosis
d1 = pd.DataFrame( num_attributes.apply( np.std )).T
d2 = pd.DataFrame( num_attributes.apply( min )).T
d3 = pd.DataFrame( num_attributes.apply( max )).T
d4 = pd.DataFrame( num_attributes.apply( lambda x: x.max() - x.min() )).T
d5 = pd.DataFrame( num_attributes.apply( lambda x: x.skew() )).T
d6 = pd.DataFrame( num_attributes.apply( lambda x: x.kurtosis() )).T

# one row per attribute, one column per statistic
m = pd.concat( [d2, d3, d4, ct1, ct2, d1, d5, d6] ).T.reset_index()
m.columns = ['attributes', 'min', 'max', 'range', 'mean', 'median', 'std', 'skew', 'kurtosis']

# display the table (it was built but never shown)
m

In [None]:
sns.distplot( df2['competition_distance'] )

### 1.7.2 Categorical Attributes

In [None]:
cat_attributes.apply( lambda x: x.unique().shape[0])

In [None]:
aux1 = df2[(df2['state_holiday'] !='0') & (df2['sales']>0)]

plt.subplot( 1, 3, 1)
sns.boxplot( x= 'state_holiday', y= 'sales', data=aux1)

plt.subplot( 1, 3, 2)
sns.boxplot( x= 'store_type', y= 'sales', data=aux1)

plt.subplot( 1, 3, 3)
sns.boxplot( x= 'assortment', y= 'sales', data=aux1)

# 2.0 Feature Engineering

## 2.1 Mapa Mental de Hipóteses

In [None]:
Image( 'img/MindMapHypothesis.png' )

## 2.2 Criação das Hipóteses

### 2.2.1 Hipóteses Loja

    -1 Lojas com maior quadro de funcionários deveriam vender mais
    
    -2 Lojas com maior estoque deveriam vender mais
    
    -3 Lojas com maior porte deveriam vender mais
    
    -4 Lojas com maior sortimento deveriam vender mais
    
    -5 Lojas com competidores mais próximos deveriam vender menos
    
    -6 Lojas com competidores há mais tempo deveriam vender mais

### 2.2.2 Hipóteses Produto

    -1 Lojas que investem mais em marketing deveriam vender mais
    
    -2 Lojas que expõem mais os produtos nas vitrines deveriam vender mais
    
    -3 Lojas que tem preços menores nos produtos deveriam vender mais
    
    -4 Lojas que possuem preços menores por mais tempo deveriam vender mais 
    
    -5 Lojas com descontos maiores deveriam vender mais
    
    -6 Lojas com promoções ativas por mais tempo deveriam vender mais
    
    -7 Lojas com mais dias de promoção deveriam vender mais
    
    -8 Lojas com mais promoções consecutivas deveriam vender mais

### 2.2.3 Hipóteses Tempo

    -1 Lojas abertas durante o feriado de natal deveriam vender mais
    
    -2 Lojas deveriam vender mais ao longo dos anos
    
    -3 Lojas deveriam vender mais no segundo semestre do ano
    
    -4 Lojas deveriam vender mais depois do dia 10 do mês
    
    -5 Lojas deveriam vender menos aos finais de semana
    
    -6 Lojas deveriam vender menos durante os feriados escolares

## 2.3 Lista Final de Hipóteses

    -1 Lojas com maior sortimento deveriam vender mais
     
    -2 Lojas com competidores mais próximos deveriam vender menos
    
    -3 Lojas com competidores há mais tempo deveriam vender mais
     
    -4 Lojas com promoções ativas por mais tempo deveriam vender mais

    -5 Lojas com mais dias de promoção deveriam vender mais

    -6 Lojas com mais promoções consecutivas deveriam vender mais
      
    -7 Lojas abertas durante o feriado de natal deveriam vender mais
   
    -8 Lojas deveriam vender mais ao longo dos anos
    
    -9 Lojas deveriam vender mais no segundo semestre do ano
    
    -10 Lojas deveriam vender mais depois do dia 10 do mês
    
    -11 Lojas deveriam vender menos aos finais de semana
    
    -12 Lojas deveriam vender menos durante os feriados escolares    

## 2.4 Feature Engineering

In [None]:
df2.head().T

In [None]:
df2['date'] = pd.to_datetime( df2['date'])
df2.dtypes

In [None]:
# year
df2['year'] = df2['date'].dt.year

# month
df2['month'] = df2['date'].dt.month

# day
df2['day'] = df2['date'].dt.day

# week of year (ISO-8601). Series.dt.weekofyear is deprecated since pandas 1.1
# and removed in 2.0; isocalendar().week gives the same numbers (as UInt32).
df2['week_of_year'] = df2['date'].dt.isocalendar().week.astype('int64')

# year week
df2['year_week'] = df2['date'].dt.strftime( '%Y-%W' )

# competition since: first day of the month in which the competitor opened
df2['competition_since'] = df2.apply(lambda x: datetime.datetime( year=x['competition_open_since_year'], month=x['competition_open_since_month'], day=1), axis=1)
# elapsed time in 30-day "months"; computed once and stored under both names
# because later cells reference both columns
competition_months = ((df2['date'] - df2['competition_since'])/30).dt.days.astype( int )
df2['competition_since_month'] = competition_months
df2['competition_time_month'] = competition_months

# promo since: Monday of the promo2 start week, shifted back by one week
df2['promo_since'] = df2['promo2_since_year'].astype(str) + '-' + df2['promo2_since_week'].astype(str)
df2['promo_since'] = df2['promo_since'].apply(lambda x: datetime.datetime.strptime( x + '-1', '%Y-%W-%w' ) - datetime.timedelta( days=7 ))
# elapsed time in weeks
df2['promo_time_week'] = ( ( df2['date'] - df2['promo_since'] )/7 ).dt.days.astype( int )

# assortment / a=basic, b=extra, c=extended
df2['assortment'] = df2['assortment'].apply(lambda x: 'basic' if x == 'a' else 'extra' if x == 'b' else 'extended')

# state holiday / a=public holiday, b=easter holiday, c=christmas
df2['state_holiday'] = df2['state_holiday'].apply(lambda x: 'public_holiday' if x == 'a' else 'easter_holiday' if x == 'b' else 'christmas' if x =='c' else 'regular_day')


In [None]:
df2.head()

In [None]:
df2.to_csv('df2.csv', index_label=False)

# 3.0 Filtragem de Variáveis

In [None]:
df3 = pd.read_csv( 'df2.csv', low_memory=False )

## 3.1 Filtragem das Linhas

In [None]:
df3 = df3[(df3['open'] != 0) & (df3['sales'] > 0)]

## 3.2 Seleção das Colunas

In [None]:
df3 = df3.drop( columns = ['customers', 'open', 'promo_interval', 'month_map'] )

# 4.0 Análise Exploratória dos Dados

## 4.1 Análise Univariada

### 4.1.1 Response Variable

In [None]:
sns.distplot(df3['sales'])

### 4.1.2 Numerical Variable

In [None]:
num_attributes.hist(bins=25);

### 4.1.3 Categorical Variable

In [None]:
cat_attributes.head()

In [None]:
# state holiday
plt.subplot(3,2,1)
a = df3[df3['state_holiday'] != 'regular_day']
sns.countplot(a['state_holiday'])

plt.subplot(3,2,2)
sns.kdeplot(df3[df3['state_holiday'] == 'public_holiday']['sales'], label = 'public_holiday', shade=True)
sns.kdeplot(df3[df3['state_holiday'] == 'easter_holiday']['sales'], label = 'easter_holiday', shade=True)
sns.kdeplot(df3[df3['state_holiday'] == 'christmas']['sales'], label = 'christmas', shade=True)

# store type
plt.subplot(3,2,3)
sns.countplot(df3['store_type'])

plt.subplot(3,2,4)
sns.kdeplot(df3[df3['store_type'] == 'a']['sales'], label = 'a', shade=True)
sns.kdeplot(df3[df3['store_type'] == 'b']['sales'], label = 'b', shade=True)
sns.kdeplot(df3[df3['store_type'] == 'c']['sales'], label = 'c', shade=True)
sns.kdeplot(df3[df3['store_type'] == 'd']['sales'], label = 'd', shade=True)

#assortment
plt.subplot(3,2,5)
sns.countplot(df3['assortment'])

plt.subplot(3,2,6)
sns.kdeplot(df3[df3['assortment'] == 'extended']['sales'], label = 'extended', shade=True)
sns.kdeplot(df3[df3['assortment'] == 'basic']['sales'], label = 'basic', shade=True)
sns.kdeplot(df3[df3['assortment'] == 'extra']['sales'], label = 'extra', shade=True)



## 4.2 Análise Bivariada

### H1.Lojas com maior sortimento deveriam vender mais
**Falsa** Lojas com maior sortimento na verdade vendem menos

In [None]:
aux1 = df3[['assortment', 'sales']].groupby( 'assortment' ).sum().reset_index()
sns.barplot( x='assortment', y='sales', data = aux1);

aux2 = df3[['year_week', 'assortment', 'sales']].groupby( ['year_week','assortment'] ).sum().reset_index()
aux2.pivot( index='year_week',columns='assortment', values='sales').plot()

aux3 = aux2[aux2['assortment'] == 'extra']
aux3.pivot( index='year_week',columns='assortment', values='sales').plot()

### H2.Lojas com competidores mais próximos deveriam vender menos
**Falsa** Lojas com competidores mais próximos vendem mais

In [None]:
# total sales per competition distance
aux1 = df3[['competition_distance', 'sales']].groupby('competition_distance').sum().reset_index()

plt.subplot(1,3,1)
sns.scatterplot(x= 'competition_distance', y='sales', data=aux1);

# same data, summed in 1000-unit distance bins (0 to 19000)
plt.subplot(1,3,2)
bins = list(np.arange(0, 20000, 1000))
aux1['competition_distance_binned'] = pd.cut( aux1['competition_distance'], bins=bins)
aux2 = aux1[['competition_distance_binned', 'sales']].groupby('competition_distance_binned').sum().reset_index()
sns.barplot( x='competition_distance_binned', y='sales', data=aux2);

plt.subplot(1,3,3)
# Correlate the numeric columns only: the binned column is categorical and
# makes DataFrame.corr raise on recent pandas versions.
sns.heatmap( aux1[['competition_distance', 'sales']].corr(method='pearson'), annot=True);

### H3.Lojas com competidores há mais tempo deveriam vender mais
**Falsa** Lojas com competidores há mais tempo vendem menos

In [None]:
plt.subplot(1, 3, 1)
aux1 = df3[['competition_time_month', 'sales']].groupby('competition_time_month').sum().reset_index()
aux2 = aux1 [( aux1['competition_time_month'] < 120 ) & ( aux1['competition_time_month'] !=0 )]
sns.barplot(x='competition_time_month', y='sales', data=aux2);
plt.xticks( rotation=90);

plt.subplot(1, 3, 2)
sns.regplot(x='competition_time_month', y='sales', data=aux2);

plt.subplot(1, 3, 3)
sns.heatmap( aux1.corr( method='pearson'), annot=True)

### H4.Lojas com promoções ativas por mais tempo deveriam vender mais
**Falsa** Lojas com promoções ativas por mais tempo vendem menos, depois de um certo tempo de promoção

In [None]:
aux1 = df3[['promo_time_week', 'sales']].groupby('promo_time_week').sum().reset_index()

grid = gridspec.GridSpec(2, 3)

plt.subplot( grid[0, 0] )
aux2 = aux1[aux1['promo_time_week'] > 0] # promo extended
sns.barplot(x='promo_time_week', y='sales', data=aux2);
plt.xticks( rotation=90 );

plt.subplot( grid[0, 1] )
sns.regplot(x='promo_time_week', y='sales', data=aux2);

plt.subplot( grid[1, 0] )
aux3 = aux1[aux1['promo_time_week'] < 0] # promo regular
sns.barplot(x='promo_time_week', y='sales', data=aux3);
plt.xticks( rotation=90 );

plt.subplot( grid[1, 1] )
sns.regplot(x='promo_time_week', y='sales', data=aux3);

plt.subplot( grid[:, 2] )
sns.heatmap( aux1.corr( method='pearson'), annot=True);

### <s> H5.Lojas com mais dias de promoção deveriam vender mais </s>

### H6.Lojas com mais promoções consecutivas deveriam vender mais
**Falsa** Lojas com mais promoções consecutivas vendem menos

In [None]:
df3[['promo', 'promo2', 'sales']].groupby(['promo', 'promo2']).sum().reset_index()

In [None]:
# weekly sales of stores running both the regular promo and promo2 (extended)
aux1 = df3[(df3['promo'] == 1) & (df3['promo2'] == 1)][['year_week', 'sales']].groupby( 'year_week').sum().reset_index()
ax = aux1.plot()

# weekly sales of stores running the regular promo only (promo2 == 0)
aux2 = df3[(df3['promo'] == 1) & (df3['promo2'] == 0)][['year_week', 'sales']].groupby( 'year_week').sum().reset_index()
aux2.plot( ax=ax )

# The second series is filtered to promo2 == 0, so it is the regular-only group;
# it was previously labelled 'Extended'.
ax.legend( labels=['Regular e Extended', 'Regular'])

### H7.Lojas abertas durante o feriado de natal deveriam vender mais
**Falsa** Lojas abertas durante o Natal vendem menos

In [None]:
aux = df3[df3['state_holiday'] != 'regular_day']

plt.subplot( 1, 2, 1 )
aux1 = aux[['state_holiday', 'sales']].groupby('state_holiday').sum().reset_index()
sns.barplot( x='state_holiday', y='sales', data=aux1);

plt.subplot( 1, 2, 2 )
aux2 = aux[['year', 'state_holiday', 'sales']].groupby(['year', 'state_holiday']).sum().reset_index()
sns.barplot( x='year', y='sales', hue='state_holiday', data=aux2)

### H8.Lojas deveriam vender mais ao longo dos anos
**Falsa** Lojas vendem menos ao longo dos anos

In [None]:
aux1 = df3[['year', 'sales']].groupby( 'year' ).sum().reset_index()

plt.subplot( 1, 3, 1 )
sns.barplot(x='year', y='sales', data=aux1);

plt.subplot( 1, 3, 2 )
sns.regplot(x='year', y='sales', data=aux1);

plt.subplot( 1, 3, 3 )
sns.heatmap(aux1.corr(method='pearson'), annot=True);

### H9. Lojas deveriam vender mais no segundo semestre do ano
**Falsa** Lojas vendem menos no segundo semestre do ano

In [None]:
aux1 = df3[['month', 'sales']].groupby( 'month' ).sum().reset_index()

plt.subplot( 1, 3, 1 )
sns.barplot(x='month', y='sales', data=aux1);

plt.subplot( 1, 3, 2 )
sns.regplot(x='month', y='sales', data=aux1);

plt.subplot( 1, 3, 3 )
sns.heatmap(aux1.corr(method='pearson'), annot=True);

### H10.Lojas deveriam vender mais depois do dia 10 do mês
**Verdadeira** As lojas vendem mais depois do dia 10 de cada mês

In [None]:
aux1 = df3[['day', 'sales']].groupby( 'day' ).sum().reset_index()

plt.subplot( 2, 2, 1 )
sns.barplot(x='day', y='sales', data=aux1);

plt.subplot( 2, 2, 2 )
sns.regplot(x='day', y='sales', data=aux1);

plt.subplot( 2, 2, 3 )
sns.heatmap(aux1.corr(method='pearson'), annot=True);

plt.subplot( 2, 2, 4 )
aux1['before_after'] = aux1['day'].apply(lambda x: 'before_10_days' if x<= 10 else 'after_10_days')
aux2 = aux1[['before_after', 'sales']].groupby( 'before_after' ).sum().reset_index()
sns.barplot(x='before_after', y='sales', data=aux2);

### H11.Lojas deveriam vender menos aos finais de semana
**Verdadeira** Lojas vendem menos no final de semana

In [None]:
aux1 = df3[['day_of_week', 'sales']].groupby( 'day_of_week' ).sum().reset_index()

plt.subplot( 1, 3, 1 )
sns.barplot(x='day_of_week', y='sales', data=aux1);

plt.subplot( 1, 3, 2 )
sns.regplot(x='day_of_week', y='sales', data=aux1);

plt.subplot( 1, 3, 3 )
sns.heatmap(aux1.corr(method='pearson'), annot=True);

### H12.Lojas deveriam vender menos durante os feriados escolares  
**Verdadeira** Lojas vendem menos durante os feriados escolares, exceto em Julho e Agosto

In [None]:
aux1 = df3[['school_holiday', 'sales']].groupby('school_holiday').sum().reset_index()

plt.subplot(2,1,1)
sns.barplot( x='school_holiday', y='sales', data=aux1);

aux2 = df3[['month', 'school_holiday', 'sales']].groupby(['month', 'school_holiday']).sum().reset_index()

plt.subplot(2,1,2)
sns.barplot( x='month', y='sales', hue='school_holiday', data=aux2);

### Resumo das Hipóteses

In [None]:
tab = [['Hipóteses', 'Conclusão', 'Relevância'],
       ['H1', 'Falsa', 'Baixa'],
       ['H2', 'Falsa', 'Média'],
       ['H3', 'Falsa', 'Média'],
       ['H4', 'Falsa', 'Baixa'],
       ['H5', '-----', '-----'],
       ['H6', 'Falsa', 'Baixa'],
       ['H7', 'Falsa', 'Média'],
       ['H8', 'Falsa', 'Alta'],
       ['H9', 'Falsa', 'Alta'],
       ['H10', 'Verdadeira', 'Alta'],
       ['H11', 'Verdadeira', 'Alta'],
       ['H12', 'Verdadeira', 'Baixa'],
      ]
print( tabulate( tab, headers='firstrow'))

## 4.3 Análise Multivariada

### 4.3.1 Numerical Attributes

In [None]:
correlation = num_attributes.corr( method = 'pearson' )
sns.heatmap(correlation, annot=True);

### 4.3.2 Categorical Attributes

In [None]:
df3['competition_since'] = pd.to_datetime( df3['competition_since'])
df3['promo_since'] = pd.to_datetime( df3['promo_since'])

In [None]:
# 

In [None]:
# Keep only the object-typed (categorical) columns
a = df3.select_dtypes( include= 'object' )

# Pairwise Cramér's V between the three categorical features.
# Each column of the result holds cramer_v(<column feature>, <row feature>).
cat_cols = ['state_holiday', 'store_type', 'assortment']
d = pd.DataFrame({
    col: [cramer_v(a[col], a[other]) for other in cat_cols]
    for col in cat_cols
})

# Label the rows with the same feature names as the columns
d = d.set_index( d.columns )

sns.heatmap(d, annot=True)

# 5.0 Preparação dos Dados

In [None]:
# Write the checkpoint that the next cell reads back, so the notebook
# also runs from a fresh kernel on a clean checkout.
df3.to_csv('df3.csv', index_label=False )

In [None]:
df4 = pd.read_csv( 'df3.csv', low_memory=False )

## 5.1 Normalização

Na seção 4.1.2 Numerical Variable, não há nada que possa ser normalizado

## 5.2 Rescaling

In [None]:
# Rescale four numeric features in place on df4.
# Each fit_transform call re-fits the shared scaler object, so after this cell
# `rs` and `mms` only hold the parameters of the last column they were fitted on.
# NOTE(review): the scalers are fitted on all of df4, i.e. before the train/test
# split done later in section 6.1 - confirm that this is intended.
rs = RobustScaler()
mms= MinMaxScaler()

# competition distance
df4['competition_distance'] = rs.fit_transform(df4[['competition_distance']].values)

# year
df4['year'] = mms.fit_transform(df4[['year']].values)

# competition time month
df4['competition_time_month'] = rs.fit_transform(df4[['competition_time_month']].values)

#promo time week
df4['promo_time_week'] = mms.fit_transform(df4[['promo_time_week']].values)


## 5.3 Transformação

### 5.3.1 Encoding

In [None]:
#state_holiday - onehot encoding
df4 = pd.get_dummies( df4, prefix=['state_holiday'], columns=['state_holiday'])

#store_type - label encoding
le = LabelEncoder()
df4['store_type'] = le.fit_transform(df4['store_type'])

#assortment - ordinal encoding
assortment_dict = {'basic': 1, 'extra': 2, 'extended': 3}
df4['assortment'] = df4['assortment'].map( assortment_dict )

### 5.3.2 Response Variable Transformation

In [None]:
df4['sales'] = np.log1p(df4['sales'])

In [None]:
sns.distplot(df4['sales'])

### 5.3.3 Nature Transformation

In [None]:
# Encode each calendar feature as a point on a circle (sin / cos pair).
# The second value is the cycle length used for that feature.
cyclical_features = (
    ('day', 30),
    ('day_of_week', 7),
    ('month', 12),
    ('week_of_year', 52),
)

for column, cycle in cyclical_features:
    df4[column + '_sin'] = df4[column].apply( lambda x, c=cycle: np.sin( x * (2 * np.pi/c)))
    df4[column + '_cos'] = df4[column].apply( lambda x, c=cycle: np.cos( x * (2 * np.pi/c)))

# 6.0 Feature Selection

In [None]:
# Write the checkpoint that the next cell reads back, so the notebook
# also runs from a fresh kernel on a clean checkout.
df4.to_csv('df4.csv', index_label=False )

In [5]:
df5 = pd.read_csv( 'df4.csv', low_memory=False )

## 6.1 Split dataframe into training and test dataset

In [6]:
# 'date' comes back from the CSV as text; restore the datetime dtype
df5['date'] = pd.to_datetime( df5['date'])

# Drop the raw calendar / helper columns. errors='ignore' keeps the cell
# re-runnable once the columns are already gone.
cols_drop = ['week_of_year', 'day', 'month', 'day_of_week', 'promo_since', 'competition_since','year_week']
df5 = df5.drop( columns=cols_drop, errors='ignore' )

# first date available per store
df5[['store', 'date']].groupby( 'store' ).min().reset_index()

In [8]:
df5[['store', 'date']].groupby( 'store' ).max().reset_index()['date'][0] - datetime.timedelta( days=6*7 )

Timestamp('2015-06-19 00:00:00')

In [9]:
# Hold out the last six weeks of data as the test set. The cutoff is derived
# from the data instead of being typed in by hand (it evaluates to 2015-06-19
# for the dates printed below).
split_date = df5['date'].max() - datetime.timedelta( days=6*7 )

# training dataset
X_train = df5[df5['date'] < split_date]
Y_train = X_train['sales']

# test dataset
X_test = df5[df5['date'] >= split_date]
Y_test = X_test['sales']

print( f"Training Min Date: { X_train['date'].min()}")
print( f"Training Max Date: { X_train['date'].max()}")

print( f"\nTest Min Date: { X_test['date'].min()}")
print( f"Test Max Date: { X_test['date'].max()}")

Training Min Date: 2013-01-01 00:00:00
Training Max Date: 2015-06-18 00:00:00

Test Min Date: 2015-06-19 00:00:00
Test Max Date: 2015-07-31 00:00:00


## 6.2 Boruta as Feature Selector

In [None]:
# training and test dataset for Boruta
X_train_n = X_train.drop(['date', 'sales'], axis=1).values
Y_train_n = Y_train.values.ravel()


# define RandomForest Regressor
rf = RandomForestRegressor( n_jobs=-1 )

# define boruta
boruta = BorutaPy( rf, n_estimators='auto', verbose=2, random_state=42 ).fit( X_train_n, Y_train_n)

### 6.2.1 Best Features from Boruta

In [None]:
cols_selected = boruta.support_.tolist()

# best features
X_train_fs = X_train.drop( ['date', 'sales'], axis=1 )
cols_selected_boruta = X_train_fs.iloc[:, cols_selected].columns.to_list()

cols_not_selected_boruta = list(np.setdiff1d(X_train_fs.columns, cols_selected_boruta))

## 6.3 Manual Feature Selection
##### **A seleção das colunas foi realizada pelo Boruta usando o Google Colab**

In [13]:
# Feature columns kept for modelling (hard-coded result of the Boruta run).
cols_selected_boruta = [
    # store attributes and promotions
    'store',
    'promo',
    'store_type',
    'assortment',
    # competition
    'competition_distance',
    'competition_open_since_month',
    'competition_open_since_year',
    # extended promotion (promo2)
    'promo2',
    'promo2_since_week',
    'promo2_since_year',
    # derived durations
    'competition_since_month',
    'competition_time_month',
    'promo_time_week',
    # cyclical calendar encodings
    'day_sin',
    'day_cos',
    'day_of_week_sin',
    'day_of_week_cos',
    'month_sin',
    'month_cos',
    'week_of_year_sin',
    'week_of_year_cos',
]

# columns to add back: 'date' and the target 'sales'
feat_to_add = ['date', 'sales']

# final features: a new list, so cols_selected_boruta itself stays feature-only
cols_selected_boruta_full = [*cols_selected_boruta, *feat_to_add]

# 7.0 Machine Learning Modelling 

In [14]:
# feature matrices restricted to the selected columns
x_train = X_train[ cols_selected_boruta ]
x_test = X_test[ cols_selected_boruta ]

# The model cells below use the lower-case names y_train / y_test, which were
# never defined (only Y_train / Y_test exist) - a NameError on a fresh kernel.
y_train = Y_train
y_test = Y_test

# selected features plus 'date' and 'sales', as required by cross_validation
x_training = X_train[cols_selected_boruta_full]

## 7.1 Average Model

In [26]:
aux1 = x_test.copy()
aux1['sales'] = y_test.copy()

# prediction
aux2 = aux1[['store', 'sales']].groupby( 'store' ).mean().reset_index().rename( columns={'sales': 'predictions'})
aux1 = pd.merge( aux1, aux2, how='left', on='store')
yhat_baseline = aux1['predictions']

#performance
baseline_result = ml_error('Average Model', np.expm1(y_test), np.expm1(yhat_baseline))
baseline_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Average Model,1354.800353,0.455051,1835.135542


## 7.2 Linear Regression Model

In [29]:
#model
lr = LinearRegression().fit( x_train, y_train)

#prediction
yhat_lr = lr.predict( x_test )

#performance
lr_result = ml_error( 'Linear Regression', np.expm1( y_test ), np.expm1( yhat_lr))
lr_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Linear Regression,1867.072608,0.292689,2671.050063


### 7.2.1 Linear Regression Model - Cross Validation

In [None]:
lr = LinearRegression()
lr_result_cv = cross_validation( x_training, 5, 'Linear Regression', lr)
lr_result_cv

## 7.3 Linear Regression Regularized Model - Lasso

In [32]:
#model
lrr = Lasso( alpha=0.001 ).fit( x_train, y_train)

#prediction
yhat_lrr = lrr.predict( x_test )

#performance
lrr_result = ml_error( 'Linear Regression - Lasso', np.expm1( y_test ), np.expm1( yhat_lrr))
lrr_result

Unnamed: 0,Model Name,MAE,MAPE,RMSE
0,Linear Regression - Lasso,1869.372232,0.292653,2677.110638


### 7.3.1 Linear Regression Regularized Model - Cross Validation

In [None]:
# Use the same alpha as the single-run Lasso model in section 7.3, so the
# cross-validated score describes the model it is compared with.
lrr = Lasso( alpha=0.001 )
lrr_result_cv = cross_validation( x_training, 5, 'Lasso', lrr)
lrr_result_cv

## 7.4 Random Forest Regressor

In [None]:
# Re-enabled: rf_result is required by the comparison table in section 7.6.1.

# model
rf = RandomForestRegressor( n_estimators=100, n_jobs=-1, random_state=42 ).fit( x_train, Y_train)

# prediction
yhat_rf = rf.predict( x_test )

# performance (sales were log1p-transformed, so map back with expm1 before scoring)
rf_result = ml_error( 'Random Forest Regressor', np.expm1( Y_test ), np.expm1( yhat_rf))
rf_result

### 7.4.1 Random Forest Regressor Model - Cross Validation

In [None]:
rf = RandomForestRegressor( n_estimators=100, n_jobs=-1, random_state=42 )
rf_result_cv = cross_validation( x_training, 5, 'Random Forest Regressor', rf, verbose=True)
rf_result_cv

## 7.5 XGBoost Regressor

In [None]:
# Re-enabled: xgb_result is required by the comparison table in section 7.6.1.

# model
model_xgb = xgb.XGBRegressor( objective='reg:squarederror',
                              n_estimators=100,
                              eta=0.01,
                              max_depth=10,
                              subsample=0.7,
                              colsample_bytree=0.9 ).fit( x_train, Y_train)

# prediction
yhat_xgb = model_xgb.predict( x_test )

# performance (sales were log1p-transformed, so map back with expm1 before scoring)
xgb_result = ml_error( 'XGBoost Regressor', np.expm1( Y_test ), np.expm1( yhat_xgb))
xgb_result

### 7.5.1 XGBoost Regressor Model - Cross Validation

In [None]:
model_xgb = xgb.XGBRegressor( objective='reg:squarederror',
                              n_estimators=100, 
                              eta=0.01,
                              max_depth=10,
                              subsample=0.7,
                              colsample_bytree=0.9 )
xgb_result_cv = cross_validation( x_training, 5, 'XGBoost Regressor', model_xgb, verbose=True)
xgb_result_cv

## 7.6 Compare Model's Performance

### 7.6.1 Single Performance

In [None]:
modelling_result = pd.concat( [baseline_result, lr_result, lrr_result, rf_result, xgb_result] )
modelling_result.sort_values( 'RMSE' )

### 7.6.2 Real Performance - Cross Validation

In [None]:
modelling_result_cv = pd.concat( [lr_result_cv, lrr_result_cv, rf_result_cv, xgb_result_cv] )
modelling_result_cv