# 0.0. IMPORTS

In [1]:
import math
import numpy  as np
import pandas as pd
import random
import pickle
import warnings
import inflection
import seaborn as sns
import xgboost as xgb
import matplotlib

from scipy.stats           import kurtosis
from scipy                 import stats  as ss
from boruta                import BorutaPy
from matplotlib            import pyplot as plt
from IPython.display       import Image
from IPython.core.display  import HTML



## 0.1. Helper functions

In [2]:
def jupyter_settings():
    %matplotlib inline
    %pylab inline
    
    plt.style.use( 'bmh' )
    plt.rcParams['figure.figsize'] = [25, 12]
    plt.rcParams['font.size'] = 24
    
    display( HTML( '<style>.container { width:100% !important; }</style>') )
    pd.options.display.max_columns = None
    pd.options.display.max_rows = None
    pd.set_option( 'display.expand_frame_repr', False )
    
    sns.set()

In [3]:
# Apply the notebook-wide plotting and pandas display settings defined above.
jupyter_settings();

%pylab is deprecated, use %matplotlib inline and import the required libraries.
Populating the interactive namespace from numpy and matplotlib


`%matplotlib` prevents importing * from pylab and numpy
  warn("pylab import has clobbered these variables: %s"  % clobbered +


## 0.2. Loading data

In [4]:
# Read the training set. low_memory=False makes pandas parse the file in one
# pass instead of in chunks, which avoids mixed-type inference within a column.
df_train_raw = pd.read_csv(
    '../data/train.csv',
    low_memory=False,
)

# Loading of the test set is currently disabled.
#df_test_raw = pd.read_csv('../data/test.csv' , low_memory=False)

In [5]:
# Peek at one random row; the fixed seed keeps the displayed row stable across
# "Restart & Run All" executions.
df_train_raw.sample( random_state=42 )

Unnamed: 0,date,store,item,sales
889481,2013-08-08,8,49,29


# 1.0. DATA DESCRIPTION

In [6]:
# Work on a copy so the raw frame stays untouched by the transformations below.
df1 = df_train_raw.copy()

In [7]:
# List the column names of the working copy.
df1.columns

Index(['date', 'store', 'item', 'sales'], dtype='object')

## 1.2. Data dimensions

In [8]:
# Report the size of the working frame: rows first, then columns.
print('Number of rows: {}'.format(len(df1)))
print('Number of columns: {}'.format(len(df1.columns)))

Number of rows: 913000
Number of columns: 4


## 1.3. Data types

In [9]:
# Preview the first rows of the working copy.
df1.head()

Unnamed: 0,date,store,item,sales
0,2013-01-01,1,1,13
1,2013-01-02,1,1,11
2,2013-01-03,1,1,14
3,2013-01-04,1,1,13
4,2013-01-05,1,1,10


In [10]:
# Check the inferred dtypes before converting any column.
df1.dtypes

date     object
store     int64
item      int64
sales     int64
dtype: object

In [11]:
# Convert `date` to datetime and `sales` to float64 in one step, producing a
# new frame instead of writing the columns back one at a time.
df1 = df1.assign(
    date=lambda frame: pd.to_datetime( frame['date'] ),
    sales=lambda frame: frame['sales'].astype( 'float64' ),
)

In [12]:
# Confirm the dtype conversions took effect.
df1.dtypes

date     datetime64[ns]
store             int64
item              int64
sales           float64
dtype: object

## 1.4. Check NA

In [13]:
# Count missing values per column.
df1.isna().sum()

date     0
store    0
item     0
sales    0
dtype: int64

## 1.5. Descriptive statistics

In [14]:
# Split the columns into numerical and categorical attributes.
# Datetime columns are left out of both groups.
numeric_dtypes = ['int64', 'float64']

num_attributes = df1.select_dtypes( include=numeric_dtypes )
cat_attributes = df1.select_dtypes( exclude=numeric_dtypes + ['datetime64[ns]'] )

In [15]:
# Descriptive statistics for the numerical attributes, one row per attribute.
#
# pandas' own reductions replace `apply(np.mean)`, `apply(min)`, `apply(max)`
# and friends: the result no longer depends on which `min` / `max` happens to
# be bound in the notebook namespace, and no Python-level pass over each column
# is needed. The values match the previous table for this NaN-free data.
medidas = (
    pd.DataFrame( {
        # central tendency
        'mean'    : num_attributes.mean(),
        'median'  : num_attributes.median(),
        # dispersion
        'min'     : num_attributes.min(),
        'max'     : num_attributes.max(),
        # ddof=0 keeps the population standard deviation that np.std computed
        'std'     : num_attributes.std( ddof=0 ),
        'range'   : num_attributes.max() - num_attributes.min(),
        'skew'    : num_attributes.skew(),
        'kurtosis': num_attributes.kurtosis(),
    } )
    # attribute names move from the index into a regular 'attributes' column
    .rename_axis( 'attributes' )
    .reset_index()
)

In [16]:
# Display the summary table built in the previous cell.
medidas

Unnamed: 0,attributes,mean,median,min,max,std,range,skew,kurtosis
0,store,5.5,5.5,1.0,10.0,2.872281,9.0,0.0,-1.224243
1,item,25.5,25.5,1.0,50.0,14.43087,49.0,0.0,-1.20096
2,sales,52.250287,47.0,0.0,231.0,28.801128,231.0,0.867112,0.50907


In [17]:
# Cross-check against pandas' built-in summary of the numeric columns.
df1.describe()

Unnamed: 0,store,item,sales
count,913000.0,913000.0,913000.0
mean,5.5,25.5,52.250287
std,2.872283,14.430878,28.801144
min,1.0,1.0,0.0
25%,3.0,13.0,30.0
50%,5.5,25.5,47.0
75%,8.0,38.0,70.0
max,10.0,50.0,231.0


In [None]:
# Distribution of sales: histogram with a kernel density estimate on top.
sns.displot( data = df1['sales'], kde=True );

In [None]:
# Horizontal box plot of sales to inspect spread and potential outliers.
sns.boxplot( data = df1['sales'], orient="h", color='skyblue' );

In [None]:
# Number of distinct values per numerical attribute (a NaN, if present, counts
# as one value, matching len(unique())).
num_attributes.nunique( dropna=False )

In [None]:
# Keep rows with a non-zero store id and positive sales. `store` is an integer
# column, so the previous comparison against the string '0' could never exclude
# a row; the filter now compares against the integer 0.
aux = df1[(df1['store'] != 0) & (df1['sales'] > 0)]

# One box per store showing the distribution of its sales.
plt.subplot();
sns.boxplot( x='store', y='sales', data=aux );

# 2.0. FEATURE ENGINEERING