<a href="https://colab.research.google.com/github/DavidScience/AB_InBev_/blob/main/%5B3%5DDesaf%C3%ADo_AB_InBev_DF_Entrenamiento%26Validacion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## [0] Librerías necesarias

In [1]:
# Conectamos Google Drive con Google Colab
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [2]:
# Librerías necesarias para el desarrollo
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import multiprocessing as mp
from joblib import Parallel, delayed
import pickle

## [1] Lectura de datos

In [3]:
# Lectura de datos preparados para el entrenamiento del algoritmo
datos = pd.read_csv('/gdrive/My Drive/AB_InBev/datos/data_train.csv')

In [4]:
# Inspección de las fechas más antiguas y recientes
print(datos["Date"].min())
print(datos["Date"].max())

2022-01-04
2022-06-30


In [5]:
# Convertimos el campo Date a fecha
datos['Date'] = pd.to_datetime(datos['Date'])

In [6]:
# Inspección de las fechas más antiguas y recientes
print(datos["Date"].min())
print(datos["Date"].max())

2022-01-04 00:00:00
2022-06-30 00:00:00


In [7]:
# Inspeccionamos los primeros registros
datos.head()

Unnamed: 0,Date,Account_id,Product_id,Category,Quantity,Year,Month
0,2022-01-04,33217773,8324,Gaseosas,712,2022,1
1,2022-01-04,33217773,8326,Gaseosas,890,2022,1
2,2022-01-04,33217773,9090,Gaseosas,445,2022,1
3,2022-01-04,33217773,12620,Gaseosas,890,2022,1
4,2022-01-04,33217776,8206,Gaseosas,178,2022,1


## [2] Preparación de los datos

In [8]:
# Creamos un indicador que concatena los indicadores de cliente y producto
datos["product"] = datos['Account_id'].astype("str") + '_' + datos['Product_id'].astype("str")

In [9]:
# Filtramos los datos a ser modelados
datos_mod = datos[["Date", "product", "Quantity"]]

In [10]:
# Agregamos los datos a nivel día
datos_groupby = datos_mod.groupby(["Date", "product"]) \
    .agg({"Quantity": "sum"}) \
    .rename(columns={"sum(link_cards)": "sum_link_cards"}).reset_index().sort_values(by = "Quantity", ascending = False)

In [11]:
# Filtramos los datos a ser modelados
datos_groupby_quan = datos_groupby[["Date", "product", "Quantity"]]

In [12]:
# Creamos una copia de los datos a ser modelados
features_names = ['Quantity']
datos_groupby_2 = datos_groupby_quan.copy()

In [13]:
# Creamos variables autoregresoras
for i in tqdm(range(1, 31)):
    expressions = {column: f"{column}_lag_{i}" for column in features_names}
    temporary_df = datos_groupby_2.assign(Date=lambda x: x["Date"] + pd.Timedelta(days=i)) \
                               .rename(columns=expressions)
    datos_groupby_quan = datos_groupby_quan.merge(temporary_df, on=["Date", "product"], how="left")

100%|██████████| 30/30 [00:43<00:00,  1.45s/it]


In [14]:
# Analizamos los datos iniciales
datos_groupby_quan.head()

Unnamed: 0,Date,product,Quantity,Quantity_lag_1,Quantity_lag_2,Quantity_lag_3,Quantity_lag_4,Quantity_lag_5,Quantity_lag_6,Quantity_lag_7,...,Quantity_lag_21,Quantity_lag_22,Quantity_lag_23,Quantity_lag_24,Quantity_lag_25,Quantity_lag_26,Quantity_lag_27,Quantity_lag_28,Quantity_lag_29,Quantity_lag_30
0,2022-06-30,34636806_23752,1200,,,,,,,,...,,,,,,,,,,
1,2022-06-30,33254127_14014,1200,,,,,,,,...,,,,,,,,,,
2,2022-06-30,36792027_16578,1200,,,,,,,,...,,,,,,,,,,
3,2022-06-30,40205586_11910,1200,,,,,,,,...,,,,,,,,,,
4,2022-06-30,33226635_14014,1200,,,,,,,,...,1068.0,,,,,,,,,


In [15]:
# Separamos los datos para el entrenamiento y validación de los algoritmos
train = datos_groupby_quan[datos_groupby_quan["Date"] <  "2022-06-01"].reset_index(drop=True).fillna(-1)
train = train[train["Date"] >= train['Date'].min() + np.timedelta64(30, 'D')].reset_index(drop=True)
test  = datos_groupby_quan[datos_groupby_quan["Date"] >= "2022-06-01"].reset_index(drop=True).fillna(-1)

In [16]:
# Inspeccionamos los primeros registros de los datos de testeo
test.head(3)

Unnamed: 0,Date,product,Quantity,Quantity_lag_1,Quantity_lag_2,Quantity_lag_3,Quantity_lag_4,Quantity_lag_5,Quantity_lag_6,Quantity_lag_7,...,Quantity_lag_21,Quantity_lag_22,Quantity_lag_23,Quantity_lag_24,Quantity_lag_25,Quantity_lag_26,Quantity_lag_27,Quantity_lag_28,Quantity_lag_29,Quantity_lag_30
0,2022-06-30,34636806_23752,1200,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,2022-06-30,33254127_14014,1200,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
2,2022-06-30,36792027_16578,1200,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [17]:
# Inspeccionamos los primeros registros de los datos de entrenamiento
train.head(3)

Unnamed: 0,Date,product,Quantity,Quantity_lag_1,Quantity_lag_2,Quantity_lag_3,Quantity_lag_4,Quantity_lag_5,Quantity_lag_6,Quantity_lag_7,...,Quantity_lag_21,Quantity_lag_22,Quantity_lag_23,Quantity_lag_24,Quantity_lag_25,Quantity_lag_26,Quantity_lag_27,Quantity_lag_28,Quantity_lag_29,Quantity_lag_30
0,2022-04-13,33226677_14014,1157,-1.0,-1.0,-1.0,-1.0,-1.0,890.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,2022-04-27,38306295_16578,1157,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,178.0,-1.0
2,2022-05-25,33743865_8156,1157,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,623.0,-1.0,-1.0


In [18]:
# Validamos la volumetría de los datos
train.shape, test.shape

((650363, 33), (154360, 33))

In [19]:
# Variables analizables
test_keys = test[["Date", "product", "Quantity"]]

In [20]:
# Datos de entrenamiento y validación
X_train = train.filter(regex='^Quantity_lag')
y_train = train["Quantity"]


X_test  = test.filter(regex='^Quantity_lag')
y_test  = test["Quantity"]

## [3] Entrenamiento del algoritmo

In [21]:
# Definimos los datasets necesarios para entrenar / validar un algoritmo LightGBM
dtrain = lgb.Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train))
dtest = lgb.Dataset(pd.DataFrame(X_test), pd.DataFrame(y_test), reference=dtrain)

In [22]:
# Definimos los parámetros del algoritmo
params = {'task': 'train',
 'is_unbalance': False,
 'boosting_type': 'gbdt', # "dart" "gbdt"
 'objective': 'regression',
 'metric': 'rmse',
 'learning_rate': 0.01,
 'verbose': -1,
 'seed': 2023,
 'bagging_fraction': 0.85,
 'bagging_freq': 3,
 'feature_fraction': 0.85,
 'lambda_l1': 0.76,
 'lambda_l2': 0.21,
 'max_depth': 5,
 'min_data_in_leaf': 50,
 'num_leaves': 25,
 'weight': 1.24}

In [23]:
# Entrenamos el algoritmo
gbm = lgb.train(params,
                dtrain,
                verbose_eval = 100,
                num_boost_round = 2000,
                valid_sets=[dtrain,dtest],
                early_stopping_rounds=200)

Please use weight argument of the Dataset constructor to pass this parameter.
Please use weight argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[100]	training's rmse: 240.819	valid_1's rmse: 227.173
[200]	training's rmse: 233.672	valid_1's rmse: 220.043
[300]	training's rmse: 230.077	valid_1's rmse: 216.679
[400]	training's rmse: 227.712	valid_1's rmse: 214.51
[500]	training's rmse: 226.089	valid_1's rmse: 213.017
[600]	training's rmse: 224.851	valid_1's rmse: 211.902
[700]	training's rmse: 223.822	valid_1's rmse: 210.993
[800]	training's rmse: 222.874	valid_1's rmse: 210.147
[900]	training's rmse: 222.169	valid_1's rmse: 209.557
[1000]	training's rmse: 221.506	valid_1's rmse: 208.978
[1100]	training's rmse: 220.957	valid_1's rmse: 208.509
[1200]	training's rmse: 220.458	valid_1's rmse: 208.097
[1300]	training's rmse: 220.058	valid_1's rmse: 207.789
[1400]	training's rmse: 219.714	valid_1's rmse: 207.517
[1500]	training's rmse: 219.423	valid_1's rmse: 207.293
[1600]	training's rmse: 219.163	valid_1's rmse: 207.091
[1700]	training's rmse: 218.946	valid_1's rmse: 206.

In [24]:
# Importancia variables
importancia=gbm.feature_importance(importance_type='gain')

In [25]:
# Guardamos el modelo
filename = '/gdrive/My Drive/AB_InBev/models/finalized_model.sav'
pickle.dump(gbm, open(filename, 'wb'))

## [4] Validación del algoritmo

In [26]:
# Scoring de los datos de testeo
scoreo = gbm.predict(pd.DataFrame(X_test), ntree_limit=gbm.best_iteration)

In [28]:
# Incluímos la variable predicha
test_keys["y_pred"] = scoreo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_keys["y_pred"] = scoreo


In [29]:
def mape(y_true, y_pred):
    n = len(y_true)
    return 100 * (1/n) * np.sum(np.abs((y_true - y_pred) / y_true))

In [30]:
products = test_keys["product"].unique().tolist()
mapes = []

In [31]:
for product in tqdm(products):
      mapes.append(mape(test_keys[test_keys["product"] == product]["Quantity"],
                        test_keys[test_keys["product"] == product]["y_pred"]))

100%|██████████| 105197/105197 [2:10:08<00:00, 13.47it/s]


In [32]:
result = pd.concat([pd.DataFrame({"product": products}), pd.DataFrame({"MAPE":mapes})], axis = 1) \
                    .sort_values(by = "MAPE") \
                    .reset_index(drop=True)

In [33]:
# Exportamos los datos
result.to_csv('/gdrive/My Drive/AB_InBev/datos/data_mapes_df.csv',index=False)

In [5]:
# Evaluamos los primeros resultados
result.head(5)

Unnamed: 0,product,MAPE
0,33231594_14014,0.021843
1,33973395_11910,0.021843
2,33221382_8262,0.021843
3,33252462_14014,0.021843
4,34207521_8350,0.021843


In [7]:
# Descriptivos del MAPE
result.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
MAPE,105197.0,inf,,0.021843,40.045317,70.022659,199.773415,inf
