## [0] Librerías necesarias

In [None]:
# Conectamos Google Drive con Google Colab
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


In [None]:
# Librerías necesarias para el desarrollo
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from tqdm import tqdm
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder
import multiprocessing as mp

In [None]:
from joblib import Parallel, delayed

## [1] Lectura de datos

In [None]:
datos = pd.read_csv('/gdrive/My Drive/AB_InBev/datos/data_train.csv')

In [None]:
print(datos["Date"].min())
print(datos["Date"].max())

2022-01-04
2022-06-30


In [None]:
datos['Date'] = pd.to_datetime(datos['Date'])

In [None]:
print(datos["Date"].min())
print(datos["Date"].max())

2022-01-04 00:00:00
2022-06-30 00:00:00


In [None]:
datos.head()

Unnamed: 0,Date,Account_id,Product_id,Category,Quantity,Year,Month
0,2022-01-04,33217773,8324,Gaseosas,712,2022,1
1,2022-01-04,33217773,8326,Gaseosas,890,2022,1
2,2022-01-04,33217773,9090,Gaseosas,445,2022,1
3,2022-01-04,33217773,12620,Gaseosas,890,2022,1
4,2022-01-04,33217776,8206,Gaseosas,178,2022,1


## Preparación del datos

In [None]:
datos["product"] = datos['Account_id'].astype("str") + '_' + datos['Product_id'].astype("str")

In [None]:
datos_mod = datos[["Date", "product", "Quantity"]]

In [None]:
datos_groupby = datos_mod.groupby(["Date", "product"]) \
    .agg({"Quantity": "sum"}) \
    .rename(columns={"sum(link_cards)": "sum_link_cards"}).reset_index().sort_values(by = "Quantity", ascending = False)

In [None]:
datos_groupby_quan = datos_groupby[["Date", "product", "Quantity"]]

In [None]:
features_names = ['Quantity']
datos_groupby_2 = datos_groupby_quan.copy()

In [None]:
for i in tqdm(range(1, 31)):
    expressions = {column: f"{column}_lag_{i}" for column in features_names}
    temporary_df = datos_groupby_2.assign(Date=lambda x: x["Date"] + pd.Timedelta(days=i)) \
                               .rename(columns=expressions)
    datos_groupby_quan = datos_groupby_quan.merge(temporary_df, on=["Date", "product"], how="left")

100%|██████████| 30/30 [00:34<00:00,  1.15s/it]


In [None]:
datos_groupby_quan.head()

Unnamed: 0,Date,product,Quantity,Quantity_lag_1,Quantity_lag_2,Quantity_lag_3,Quantity_lag_4,Quantity_lag_5,Quantity_lag_6,Quantity_lag_7,...,Quantity_lag_21,Quantity_lag_22,Quantity_lag_23,Quantity_lag_24,Quantity_lag_25,Quantity_lag_26,Quantity_lag_27,Quantity_lag_28,Quantity_lag_29,Quantity_lag_30
0,2022-06-30,34636806_23752,1200,,,,,,,,...,,,,,,,,,,
1,2022-06-30,33254127_14014,1200,,,,,,,,...,,,,,,,,,,
2,2022-06-30,36792027_16578,1200,,,,,,,,...,,,,,,,,,,
3,2022-06-30,40205586_11910,1200,,,,,,,,...,,,,,,,,,,
4,2022-06-30,33226635_14014,1200,,,,,,,,...,1068.0,,,,,,,,,


In [None]:
train = datos_groupby_quan[datos_groupby_quan["Date"] <  "2022-05-01"].reset_index(drop=True).fillna(-1)
train = train[train["Date"] >= train['Date'].min() + np.timedelta64(30, 'D')].reset_index(drop=True)
test  = datos_groupby_quan[datos_groupby_quan["Date"] >= "2022-05-01"].reset_index(drop=True).fillna(-1)

In [None]:
test.head(2)

Unnamed: 0,Date,product,Quantity,Quantity_lag_1,Quantity_lag_2,Quantity_lag_3,Quantity_lag_4,Quantity_lag_5,Quantity_lag_6,Quantity_lag_7,...,Quantity_lag_21,Quantity_lag_22,Quantity_lag_23,Quantity_lag_24,Quantity_lag_25,Quantity_lag_26,Quantity_lag_27,Quantity_lag_28,Quantity_lag_29,Quantity_lag_30
0,2022-06-30,34636806_23752,1200,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,2022-06-30,33254127_14014,1200,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0


In [None]:
train.head(2)

Unnamed: 0,Date,product,Quantity,Quantity_lag_1,Quantity_lag_2,Quantity_lag_3,Quantity_lag_4,Quantity_lag_5,Quantity_lag_6,Quantity_lag_7,...,Quantity_lag_21,Quantity_lag_22,Quantity_lag_23,Quantity_lag_24,Quantity_lag_25,Quantity_lag_26,Quantity_lag_27,Quantity_lag_28,Quantity_lag_29,Quantity_lag_30
0,2022-04-13,33226677_14014,1157,-1.0,-1.0,-1.0,-1.0,-1.0,890.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0
1,2022-04-27,38306295_16578,1157,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,178.0,-1.0


In [None]:
train.shape, test.shape

((487741, 33), (316982, 33))

In [None]:
test_keys = test[["Date", "product", "Quantity"]]

In [None]:
X_train = train.filter(regex='^Quantity_lag')
y_train = train["Quantity"]


X_test  = test.filter(regex='^Quantity_lag')
y_test  = test["Quantity"]

## [3] Entrenamiento del algoritmo

In [None]:
dtrain = lgb.Dataset(pd.DataFrame(X_train), pd.DataFrame(y_train))
dtest = lgb.Dataset(pd.DataFrame(X_test), pd.DataFrame(y_test), reference=dtrain)

In [None]:
params = {'task': 'train',
 'is_unbalance': False,
 'boosting_type': 'gbdt', # "dart" "gbdt"
 'objective': 'regression',
 'metric': 'rmse',
 'learning_rate': 0.01,
 'verbose': -1,
 'seed': 2023,
 'bagging_fraction': 0.85,
 'bagging_freq': 3,
 'feature_fraction': 0.85,
 'lambda_l1': 0.76,
 'lambda_l2': 0.21,
 'max_depth': 5,
 'min_data_in_leaf': 50,
 'num_leaves': 25,
 'weight': 1.24}

In [None]:
gbm = lgb.train(params,
                dtrain,
                verbose_eval = 100,
                num_boost_round = 2000,
                valid_sets=[dtrain,dtest],
                early_stopping_rounds=200)

Please use weight argument of the Dataset constructor to pass this parameter.
Please use weight argument of the Dataset constructor to pass this parameter.


Training until validation scores don't improve for 200 rounds
[100]	training's rmse: 244.304	valid_1's rmse: 229.081
[200]	training's rmse: 237.052	valid_1's rmse: 221.986
[300]	training's rmse: 233.471	valid_1's rmse: 218.474
[400]	training's rmse: 231.164	valid_1's rmse: 216.205
[500]	training's rmse: 229.556	valid_1's rmse: 214.633
[600]	training's rmse: 228.295	valid_1's rmse: 213.432
[700]	training's rmse: 227.244	valid_1's rmse: 212.442
[800]	training's rmse: 226.274	valid_1's rmse: 211.546
[900]	training's rmse: 225.528	valid_1's rmse: 210.859
[1000]	training's rmse: 224.819	valid_1's rmse: 210.213
[1100]	training's rmse: 224.313	valid_1's rmse: 209.765
[1200]	training's rmse: 223.828	valid_1's rmse: 209.342
[1300]	training's rmse: 223.422	valid_1's rmse: 208.994
[1400]	training's rmse: 223.092	valid_1's rmse: 208.713
[1500]	training's rmse: 222.803	valid_1's rmse: 208.479
[1600]	training's rmse: 222.556	valid_1's rmse: 208.273
[1700]	training's rmse: 222.345	valid_1's rmse: 208

In [None]:
importancia=gbm.feature_importance(importance_type='gain')

## [4] Validación del algoritmo

In [None]:
scoreo = gbm.predict(pd.DataFrame(X_test), ntree_limit=gbm.best_iteration)

In [None]:
test_keys["y_pred"] = scoreo

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_keys["y_pred"] = scoreo


In [None]:
def mape(y_true, y_pred):
    n = len(y_true)
    return 100 * (1/n) * np.sum(np.abs((y_true - y_pred) / y_true))

In [None]:
products = test_keys["product"].unique().tolist()
mapes = []

In [None]:
# Parallelizing with Pool.starmap()

# Step 1: Init multiprocessing.Pool()

pool = mp.Pool(mp.cpu_count())
pool

<multiprocessing.pool.Pool state=RUN pool_size=2>

In [None]:
# Step 2: 'pool.starmap' the 'mape'
results = pool.starmap(mape, [(test_keys[test_keys["product"] == product]["Quantity"], test_keys[test_keys["product"] == product]["y_pred"]) for product in tqdm(products)])
# Step 3: Don't forget to close
pool.close()

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
 40%|████      | 63449/157120 [2:19:09<3:00:25,  8.65it/s][A
 40%|████      | 63450/157120 [2:19:09<3:00:25,  8.65it/s][A
 40%|████      | 63451/157120 [2:19:09<2:58:29,  8.75it/s][A
 40%|████      | 63452/157120 [2:19:09<2:56:18,  8.85it/s][A
 40%|████      | 63453/157120 [2:19:09<2:54:46,  8.93it/s][A
 40%|████      | 63454/157120 [2:19:10<2:55:46,  8.88it/s][A
 40%|████      | 63455/157120 [2:19:10<2:55:51,  8.88it/s][A
 40%|████      | 63456/157120 [2:19:10<3:00:01,  8.67it/s][A
 40%|████      | 63457/157120 [2:19:10<2:57:05,  8.81it/s][A
 40%|████      | 63458/157120 [2:19:10<2:56:44,  8.83it/s][A
 40%|████      | 63459/157120 [2:19:10<3:00:10,  8.66it/s][A
 40%|████      | 63460/157120 [2:19:10<3:00:15,  8.66it/s][A
 40%|████      | 63461/157120 [2:19:10<2:58:54,  8.73it/s][A
 40%|████      | 63462/157120 [2:19:10<2:57:27,  8.80it/s][A
 40%|████      | 63463/157120 [2:19:11<2:57:37,  8.79it/s][A
 40%|

In [None]:
#results = Parallel(n_jobs=-1, prefer="threads")(delayed(mape)(test_keys[test_keys["product"] == product]["Quantity"], test_keys[test_keys["product"] == product]["y_pred"]) for product in tqdm(products))

 15%|█▌        | 23658/157120 [41:48<3:31:08, 10.54it/s]

In [None]:
for product in tqdm(products):
      mapes.append(mape(test_keys[test_keys["product"] == product]["Quantity"],
                        test_keys[test_keys["product"] == product]["y_pred"]))

 38%|███▊      | 59538/157120 [1:45:13<2:52:27,  9.43it/s]


KeyboardInterrupt: ignored

In [None]:
result = pd.concat([pd.DataFrame({"product": products}), pd.DataFrame({"MAPE":mapes})], axis = 1) \
                    .sort_values(by = "MAPE") \
                    .reset_index(drop=True)

In [None]:
result

In [None]:
train_quan = train[["Date", "product", "Quantity"]]

In [None]:
features_names = ['Quantity']
train_quan_2 = train_quan.copy()

In [None]:
train_quan.head()