# 0.0 IMPORTS

## 0.1 Libraries

In [1]:
import warnings
import inflection
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import dask.dataframe as dd

from dask.distributed import Client, progress

In [21]:
# import datetime
# import numpy as np
# import scipy.stats  as ss

# from IPython.display         import Image
# from IPython.core.display    import HTML

# import matplotlib.pyplot as plt
# import seaborn as sns

## 0.2 Notebook Config

In [2]:
# Suppress every warning for the rest of the session.
# NOTE(review): with no category given this also hides deprecation warnings
# raised by the libraries used below.
warnings.filterwarnings("ignore")

In [3]:
# Create a Dask distributed client with default arguments and keep a handle to it.
client = Client()
# Bare expression so the notebook renders the client/cluster summary as output.
client

0,1
Client  Scheduler: tcp://127.0.0.1:51403  Dashboard: http://127.0.0.1:8787/status,Cluster  Workers: 4  Cores: 12  Memory: 16.98 GB


In [4]:
# def jupyter_settings():
#     %matplotlib inline
    
#     plt.style.use('bmh')
#     plt.rcParams['figure.figsize'] = [25, 12]
#     plt.rcParams['font.size'] = 24
    
#     display( HTML('<style>.container { width:100% !important; }</style>'))
    
#     warnings.filterwarnings("ignore")
    
#     sns.set()

In [5]:
# seed = 42
# np.random.seed(seed)

# jupyter_settings()

## 0.3 Helper Functions

In [15]:
def rename_columns(df):
    """Rename the columns of ``df`` to snake_case, in place.

    Parameters
    ----------
    df : dataframe-like
        Object whose ``columns`` attribute is read and then reassigned.

    Returns
    -------
    The ``columns`` attribute of ``df`` as it stands after the rename.
    """
    # Convert every label with inflection.underscore and assign the result
    # back onto the same object, so the caller's frame is modified.
    df.columns = [inflection.underscore(label) for label in df.columns]
    return df.columns

## 0.4 Load Data

### 0.4.1 Load Raw Data

#### 0.4.1.1 train

In [6]:
# Columns to load from train.csv, each mapped to the unsigned integer dtype it is read as.
types = dict(
    Semana=np.uint8,
    Agencia_ID=np.uint16,
    Canal_ID=np.uint8,
    Ruta_SAK=np.uint16,
    Cliente_ID=np.uint32,
    Producto_ID=np.uint16,
    Demanda_uni_equil=np.uint32,
)

# Read only the mapped columns, with the dtypes above, through dask.dataframe.
data_train_raw = dd.read_csv(
    '../00-Data/csv_data/train.csv',
    dtype=types,
    usecols=types.keys(),
)

#### 0.4.1.2 producto_tabla

In [7]:
# Columns to load from producto_tabla.csv: the product id and its free-text name.
types = dict(
    Producto_ID=np.uint16,
    NombreProducto='object',
)

# Read only the mapped columns, with the dtypes above, through dask.dataframe.
data_producto_raw = dd.read_csv(
    '../00-Data/csv_data/producto_tabla.csv',
    dtype=types,
    usecols=types.keys(),
)

#### 0.4.1.3 cliente_tabla

In [8]:
# Columns to load from cliente_tabla.csv: the client id and its free-text name.
types = dict(
    Cliente_ID=np.uint32,
    NombreCliente='object',
)

# Read only the mapped columns, with the dtypes above, through dask.dataframe.
data_cliente_raw = dd.read_csv(
    '../00-Data/csv_data/cliente_tabla.csv',
    dtype=types,
    usecols=types.keys(),
)

#### 0.4.1.4 town_state

In [9]:
# Columns to load from town_state.csv: the agency id plus its town and state labels.
types = dict(
    Agencia_ID=np.uint16,
    Town='object',
    State='object',
)

# Read only the mapped columns, with the dtypes above, through dask.dataframe.
data_town_state_raw = dd.read_csv(
    '../00-Data/csv_data/town_state.csv',
    dtype=types,
    usecols=types.keys(),
)

In [10]:
# data_train_raw = dd.read_csv('../00-Data/csv_data/train.csv')
# data_producto_raw = dd.read_csv('../00-Data/csv_data/producto_tabla.csv')
# data_cliente_raw = dd.read_csv('../00-Data/csv_data/cliente_tabla.csv')
# data_town_state_raw = dd.read_csv('../00-Data/csv_data/town_state.csv')

### 0.4.2 Merge Dataset

In [11]:
# data_raw = dd.merge(data_train_raw, data_producto_raw, how='left', on='Producto_ID')
# data_raw = dd.merge(data_raw, data_cliente_raw, how='left', on='Cliente_ID')
# data_raw = dd.merge(data_raw, data_town_state_raw, how='left', on='Agencia_ID')
# data_raw = data_raw.drop(['Producto_ID', 'Cliente_ID', 'Agencia_ID'], axis=1)
# data_raw.head()

# 1.0 DATA DESCRIPTION

## 1.1 <font color='blue'>Training</font>

In [12]:
# Take a copy of the raw training frame to work on in this section.
df_train_01 = data_train_raw.copy()

### 1.1.1 Columns

In [14]:
# Column names as loaded from the CSV, before any renaming.
df_train_01.columns

Index(['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Cliente_ID',
       'Producto_ID', 'Demanda_uni_equil'],
      dtype='object')

#### 1.1.2.1 Rename Columns

In [16]:
# rename_columns reassigns df_train_01.columns in place and returns the new
# column index, which is what this cell displays.
rename_columns(df_train_01)

Index(['semana', 'agencia_id', 'canal_id', 'ruta_sak', 'cliente_id',
       'producto_id', 'demanda_uni_equil'],
      dtype='object')

In [17]:
# Re-display the columns to confirm the rename was applied to df_train_01 itself.
df_train_01.columns

Index(['semana', 'agencia_id', 'canal_id', 'ruta_sak', 'cliente_id',
       'producto_id', 'demanda_uni_equil'],
      dtype='object')

### 1.2.2 New Name Columns

In [15]:
# NOTE(review): `df01` is not assigned in any cell visible in this notebook, so
# this would raise NameError on a fresh kernel unless it is created elsewhere —
# confirm where it should come from.
df01.columns

Index(['semana', 'canal_id', 'ruta_sak', 'demanda_uni_equil',
       'nombre_producto', 'nombre_cliente', 'town', 'state'],
      dtype='object')

## 1.3 Data Dimensions

In [16]:
# Report the frame's dimensions. The row count of a Dask dataframe is lazy and
# needs .compute(); the column count is available directly.
# NOTE(review): `df01` is not assigned in any cell visible in this notebook.
print(
    f'Number of Rows: {df01.shape[0].compute()}\n'
    f'Number of Columns: {df01.shape[1]}'
)

Number of Rows: 74773833
Number of Columns: 8


## 1.4 Data Types

In [17]:
# Per-column dtypes of df01.
# NOTE(review): `df01` is not assigned in any cell visible in this notebook.
df01.dtypes

semana                uint8
canal_id              uint8
ruta_sak             uint16
demanda_uni_equil    uint32
nombre_producto      object
nombre_cliente       object
town                 object
state                object
dtype: object

## 1.5 Missing Values

In [18]:
# Count missing values per column: null mask summed column-wise, then computed.
# NOTE(review): the only assignments to `data_raw` in the visible notebook are
# commented out (section 0.4.2), and the neighbouring cells use `df01` —
# confirm which frame is intended here.
data_raw.isnull().sum().compute()

Semana               0
Canal_ID             0
Ruta_SAK             0
Demanda_uni_equil    0
NombreProducto       0
NombreCliente        0
Town                 0
State                0
dtype: int64

### 1.5.2 Fillout NA

There are no NA values.

## 1.6 Descriptive Statistics

In [19]:
# Keep only the columns whose dtype is not `object` (the numeric attributes).
# NOTE(review): `df01` is not assigned in any cell visible in this notebook.
num_attributes = df01.select_dtypes(exclude=['object'])

In [20]:
# Build the descriptive-statistics table in one chain: compute describe() for the
# numeric attributes, transpose so each attribute is a row, add the derived
# measures, then order and title the columns for display.
describe = (
    num_attributes.describe().compute().T
    .assign(**{
        # Coefficient of variation: standard deviation relative to the mean.
        'Relative Std': lambda stats: stats['std'] / stats['mean'],
        'range': lambda stats: stats['max'] - stats['min'],
        # Interquartile range.
        'IQR': lambda stats: stats['75%'] - stats['25%'],
    })
    .loc[:, ['min', 'max', 'range', 'mean', 'std', '50%', 'IQR', '25%', '75%', 'Relative Std']]
    .rename(columns={
        'min': 'Min',
        'max': 'Max',
        'range': 'Range',
        'mean': 'Mean',
        'std': 'Std',
        '50%': 'Median',
    })
)
describe

Unnamed: 0,Min,Max,Range,Mean,Std,Median,IQR,25%,75%,Relative Std
semana,3.0,9.0,6.0,5.950119,2.013151,6.0,4.0,4.0,8.0,0.338338
canal_id,1.0,11.0,10.0,1.388785,1.470505,1.0,0.0,1.0,1.0,1.058843
ruta_sak,1.0,9991.0,9990.0,2120.527163,1494.363905,1614.0,1691.0,1213.0,2904.0,0.704713
demanda_uni_equil,0.0,5000.0,5000.0,7.25529,21.97233,4.0,6.0,2.0,8.0,3.028457


In [24]:
# Derive product attributes by parsing the free-text product name.
# All regex patterns are raw strings so that \D, \s and \d are passed to the
# regex engine as written instead of being invalid string escape sequences.
products = data_producto_raw.copy()
# Leading run of non-digit characters, i.e. the text before the first digit.
products['short_name'] = products.NombreProducto.str.extract(r'^(\D*)', expand=False)
# Non-digit text sitting right before the trailing number at the end of the name.
products['brand'] = products.NombreProducto.str.extract(r'^.+\s(\D+) \d+$', expand=False)
# Number immediately followed by a "Kg" or "g" unit, split into (amount, unit).
w = products.NombreProducto.str.extract(r'(\d+)(Kg|g)', expand=True)
# Normalise the weight to grams; rows without a match stay missing.
products['weight'] = w[0].astype('float')*w[1].map({'Kg':1000, 'g':1})
# Number immediately followed by "p " (piece count); missing when absent.
products['pieces'] =  products.NombreProducto.str.extract(r'(\d+)p ', expand=False).astype('float')
products.head()

## To be complemented with ideas from:
## https://github.com/hepnerthomas/Grupo-Bimbo-Inventory-Demand/blob/master/exploratory%20analysis.ipynb
## https://www.kaggle.com/namra42/product-preprocessing

Unnamed: 0,Producto_ID,NombreProducto,short_name,brand,weight,pieces
0,0,NO IDENTIFICADO 0,NO IDENTIFICADO,IDENTIFICADO,,
1,9,Capuccino Moka 750g NES 9,Capuccino Moka,NES,750.0,
2,41,Bimbollos Ext sAjonjoli 6p 480g BIM 41,Bimbollos Ext sAjonjoli,BIM,480.0,6.0
3,53,Burritos Sincro 170g CU LON 53,Burritos Sincro,LON,170.0,
4,72,Div Tira Mini Doradita 4p 45g TR 72,Div Tira Mini Doradita,TR,45.0,4.0


In [26]:
# Frequency of each extracted short name; dropna=False keeps missing values
# as their own entry in the counts.
products.short_name.value_counts(dropna=False).compute()

Pan Blanco                        39
Tortillinas                       37
Deliciosas Vainilla               35
Medias Noches                     25
Wonder                            22
                                  ..
Healthy Multi Grain                1
Hamburguesa Clasica                1
Granel Oreja                       1
Granel Classica Choco Bicolor      1
Wonderbutter                       1
Name: short_name, Length: 1014, dtype: int64

In [None]:
### Customers per route
## Product popularity

## https://github.com/seitin/bimbo/blob/master/initial_ml.ipynb

## lag
## https://github.com/siskaj/Bimbo/blob/master/bimbo_test.ipynb