# 0.0 IMPORTS

## 01. Libraries

In [1]:
import datetime
import pandas as pd
import numpy as np

# 0.2 Notebook Config

In [2]:
seed = 42
np.random.seed(seed)

## 0.3 Helper Functions

In [3]:
# Time Decorator
def timer(function):
    def wrapper(get):
        try:
            start = datetime.datetime.now()
            function(get)
            end = datetime.datetime.now()
            delta  = end - start
            print('runtime: ', delta)
            #return delta
        except:
            pass
    return wrapper

## 0.4 Load Data

### 0.4.1 Load Raw Data

In [4]:
data_train_raw = pd.read_feather('../00-Data/feather_data/train.feather')
data_producto_raw = pd.read_feather('../00-Data/feather_data/producto_tabla.feather')
data_cliente_raw = pd.read_feather('../00-Data/feather_data/cliente_tabla.feather')
data_town_state_raw = pd.read_feather('../00-Data/feather_data/town_state.feather')

### 0.4.2 Merge Dataset

In [9]:
data_raw = pd.merge(data_train_raw, data_producto_raw, how='left', on='Producto_ID')
data_raw = pd.merge(data_raw, data_cliente_raw, how='left', on='Cliente_ID')
data_raw = pd.merge(data_raw, data_town_state_raw, how='left', on='Agencia_ID')
data_raw = data_raw.drop(['Producto_ID', 'Cliente_ID', 'Agencia_ID'], axis=1)
data_raw.head()

Unnamed: 0,Semana,Agencia_ID,Canal_ID,Ruta_SAK,Cliente_ID,Producto_ID,Venta_uni_hoy,Venta_hoy,Dev_uni_proxima,Dev_proxima,Demanda_uni_equil,NombreProducto,NombreCliente,Town,State
0,3,1110,7,3301,15766,1212,3,25.14,0,0.0,3,Roles Canela 2p 120g BIM 1212,PUESTO DE PERIODICOS LAZARO,2008 AG. LAGO FILT,"MÉXICO, D.F."
1,3,1110,7,3301,15766,1216,4,33.52,0,0.0,4,Roles Glass 2p 135g BIM 1216,PUESTO DE PERIODICOS LAZARO,2008 AG. LAGO FILT,"MÉXICO, D.F."
2,3,1110,7,3301,15766,1238,4,39.32,0,0.0,4,Panquecito Gota Choc 2p 140g BIM 1238,PUESTO DE PERIODICOS LAZARO,2008 AG. LAGO FILT,"MÉXICO, D.F."
3,3,1110,7,3301,15766,1240,4,33.52,0,0.0,4,Mantecadas Vainilla 4p 125g BIM 1240,PUESTO DE PERIODICOS LAZARO,2008 AG. LAGO FILT,"MÉXICO, D.F."
4,3,1110,7,3301,15766,1242,3,22.92,0,0.0,3,Donitas Espolvoreadas 6p 105g BIM 1242,PUESTO DE PERIODICOS LAZARO,2008 AG. LAGO FILT,"MÉXICO, D.F."


# 1.0 DATA DESCRIPTION

In [10]:
df01 = data_raw.copy()

## 1.1 Columns

In [11]:
df01.columns

Index(['Semana', 'Agencia_ID', 'Canal_ID', 'Ruta_SAK', 'Cliente_ID',
       'Producto_ID', 'Venta_uni_hoy', 'Venta_hoy', 'Dev_uni_proxima',
       'Dev_proxima', 'Demanda_uni_equil', 'NombreProducto', 'NombreCliente',
       'Town', 'State'],
      dtype='object')

### 1.2.1 Rename Columns

In [12]:
import inflection
snakecase = lambda col: inflection.underscore(col)

new_columns = list(map(snakecase, df01.columns))

# rename
df01.columns = new_columns

### 1.2.2 New Name Columns

In [13]:
df01.columns

Index(['semana', 'agencia_id', 'canal_id', 'ruta_sak', 'cliente_id',
       'producto_id', 'venta_uni_hoy', 'venta_hoy', 'dev_uni_proxima',
       'dev_proxima', 'demanda_uni_equil', 'nombre_producto', 'nombre_cliente',
       'town', 'state'],
      dtype='object')

## 1.3 Data Dimensions

In [14]:
print(f'Number of Rows: {df01.shape[0]}')
print(f'Number of Columns: {df01.shape[1]}')

Number of Rows: 74773833
Number of Columns: 15


## 1.4 Data Types

In [15]:
df01.dtypes

semana                 int64
agencia_id             int64
canal_id               int64
ruta_sak               int64
cliente_id             int64
producto_id            int64
venta_uni_hoy          int64
venta_hoy            float64
dev_uni_proxima        int64
dev_proxima          float64
demanda_uni_equil      int64
nombre_producto       object
nombre_cliente        object
town                  object
state                 object
dtype: object

## 1.5 Missing Values

In [18]:
for col in df01.columns.tolist():
    print(col, df01[col].isnull().sum())

semana 0
agencia_id 0
canal_id 0
ruta_sak 0
cliente_id 0
producto_id 0
venta_uni_hoy 0
venta_hoy 0
dev_uni_proxima 0
dev_proxima 0
demanda_uni_equil 0
nombre_producto 0
nombre_cliente 0
town 0
state 0


### 1.5.2 ~~Fillout NA~~

there is no NA values

## 1.6 Descriptive Statistical

In [None]:
# Numerical Attributes
num_attributes = df01.select_dtypes(include=['int64', 'float64'])
not_numerial = ['row_number', 'customer_id']
num_attributes_cols = list(np.setdiff1d(num_attributes.columns, not_numerial))