In [1]:
!pip install -q lightgbm optuna


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [16]:
import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna
from sklearn.model_selection import TimeSeriesSplit
from datetime import date

  from .autonotebook import tqdm as notebook_tqdm


In [7]:
# selecionando só algumas colunas na leitura para economizar RAM
cols = ['pdv','produto','premise','transaction_date','quantity','gross_value','gross_profit',
        'net_value','discount','taxes','categoria','marca','fabricante','categoria_pdv','zipcode']
df_merge = pd.read_parquet("artifacts/run_2025_01/df_merge_clean.parquet")

In [8]:
df_merge['transaction_date'] = pd.to_datetime(df_merge['transaction_date'], errors='coerce')
for c in ['quantity','gross_value','gross_profit','net_value','discount','taxes']:
    df_merge[c] = pd.to_numeric(df_merge[c], errors='coerce').astype('float32')

In [9]:
df_merge.info()
df_merge.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6424477 entries, 0 to 6424476
Data columns (total 25 columns):
 #   Column            Dtype         
---  ------            -----         
 0   pdv               object        
 1   produto           object        
 2   distributor_id    object        
 3   transaction_date  datetime64[ns]
 4   reference_date    object        
 5   quantity          float32       
 6   gross_value       float32       
 7   net_value         float32       
 8   gross_profit      float32       
 9   discount          float32       
 10  taxes             float32       
 11  produto_prod      object        
 12  categoria         object        
 13  descricao         object        
 14  tipos             object        
 15  label             object        
 16  subcategoria      object        
 17  marca             object        
 18  fabricante        object        
 19  premise           object        
 20  categoria_pdv     object        
 21  zipcode 

Unnamed: 0,pdv,produto,distributor_id,transaction_date,reference_date,quantity,gross_value,net_value,gross_profit,discount,...,label,subcategoria,marca,fabricante,premise,categoria_pdv,zipcode,iso_year,iso_week,price_per_unit
0,7384367747233276219,328903483604537190,9,2022-07-13,2022-07-01,1.0,38.125,37.890625,10.042625,3.95,...,Core,Specialty,Bud Light Chelada Fuego,AB Anheuser Busch Inc,Off Premise,Package/Liquor,80905.0,2022,28,37.890625
1,3536908514005606262,5418855670645487653,5,2022-03-21,2022-03-01,6.0,107.25,106.440002,24.732002,17.1,...,Core,Lager,Michelob Ultra,AB Anheuser Busch Inc,Off Premise,Package/Liquor,80239.0,2022,12,17.74
2,3138231730993449825,1087005562675741887,6,2022-09-06,2022-09-01,3.0,56.625,56.220001,14.124002,5.25,...,Core,Lager,Bud Light Lime,AB Anheuser Busch Inc,Off Premise,Package/Liquor,80634.0,2022,36,18.74
3,3681167389484217654,1401422983880045188,5,2022-09-11,2022-09-01,129.0,1037.160034,1037.160034,156.348022,479.880005,...,,Liqueurs & Cordials,99 Butterscotch,Sazerac Spirits,Off Premise,Package/Liquor,80226.0,2022,36,8.04
4,7762413312337359369,6614994347738381720,4,2022-02-18,2022-02-01,1.0,26.23,23.950241,6.550241,0.0,...,Core,IPA,New Belgium Voodoo Ranger Imperial IPA,NB New Belgium,Off Premise,Convenience,30096.0,2022,7,23.950241


In [12]:
# Agrega semanalmente (PDV×SKU×premise)

weekly = (
    df_merge.groupby(['premise','pdv','produto','iso_year','iso_week'], as_index=False)
            .agg(qty=('quantity','sum'),
                 gv =('gross_value','sum'),
                 gp =('gross_profit','sum'),
                 price=('price_per_unit','median'))
)

# inicio da semana - iso
def week_start(y, w): 
    return pd.Timestamp(date.fromisocalendar(int(y), int(w), 1))
weekly['week_start'] = [week_start(y, w) for y, w in zip(weekly.iso_year, weekly.iso_week)]
weekly['year_week'] = weekly['iso_year']*100 + weekly['iso_week']

In [13]:
# Baseline considerando a última semana conhecida para validar pipeline

w22 = weekly[weekly['iso_year']==2022].copy()
last_w = w22['iso_week'].max()
baseline_key = ['premise','pdv','produto']

last_obs = (w22[w22['iso_week']==last_w]
            [baseline_key+['qty']]
            .rename(columns={'qty':'qty_last'}))

In [14]:
# prox. 5 semanas de janeiro - simulado

future_weeks = pd.DataFrame({
    'iso_year': 2023,
    'iso_week': list(range(1, 6))
})
future_weeks['week_start'] = [week_start(2023, w) for w in future_weeks['iso_week']]

grid = (w22[baseline_key].drop_duplicates()
        .merge(future_weeks, how='cross'))

In [15]:
# baseline naïve, repetindo a quantidade da última semana de 2022

pred_baseline = grid.merge(last_obs, on=baseline_key, how='left')
pred_baseline['pred_qty'] = pred_baseline['qty_last'].fillna(0.0)

#### Weekly Agg - PDVxSKUxPremise

In [17]:
KEY  = ['premise','pdv','produto']
TIME = ['iso_year','iso_week']

weekly = (df_merge
          .groupby(KEY + TIME, as_index=False)
          .agg(qty=('quantity','sum'),
               gv =('gross_value','sum'),
               gp =('gross_profit','sum')))

# preço médio por unidade (seguro para /0)
weekly['price'] = (weekly['gv'] / weekly['qty']).replace([np.inf, -np.inf], np.nan)

#### Feature Engineeering

In [18]:
# Criação de lags/rollings sem vazamento + sazonais + densidade de SKU.

def add_lag_feats(df_in: pd.DataFrame) -> pd.DataFrame:
    df = df_in.sort_values(KEY + TIME).copy()
    g  = df.groupby(KEY, sort=False)

    # Lags e rollings de quantidade
    for L in [1,2,3,4,8,12]:
        df[f'qty_lag{L}'] = g['qty'].shift(L)
    for W in [4,8,12]:
        df[f'qty_mean_{W}'] = g['qty'].shift(1).rolling(W, min_periods=1).mean()

    # Lags de preço e de gv/gp (somente lags!)
    for L in [1,4]:
        df[f'price_lag{L}'] = g['price'].shift(L)
        df[f'gv_lag{L}']    = g['gv'].shift(L)
        df[f'gp_lag{L}']    = g['gp'].shift(L)

    # Densidade de SKU ativos por PDV/semana
    df['sku_active_pdv'] = df.groupby(KEY + TIME)['produto'].transform('size')

    # Sazonalidade (semana ISO cíclica)
    df['w_sin'] = np.sin(2*np.pi*df['iso_week']/53.0)
    df['w_cos'] = np.cos(2*np.pi*df['iso_week']/53.0)

    # Categóricas (deixe o LGBM detectar automaticamente)
    for col in ['premise','categoria','marca','fabricante','tipos','categoria_pdv']:
        if col in df.columns:
            df[col] = df[col].astype('category')

    # Chave ordenável
    df['year_week'] = (df['iso_year']*100 + df['iso_week']).astype('int32')
    return df

feat = add_lag_feats(weekly)

#### Treino (somente 2022) e limpeza

In [19]:
train = feat[feat['iso_year']==2022].copy()

# Remove linhas sem histórico mínimo (lags necessários)
need_cols = [c for c in train.columns if c.startswith(('qty_lag','qty_mean','price_lag','gv_lag','gp_lag'))]
train = train.dropna(subset=need_cols).sort_values(['year_week'] + KEY)

X_cols = (
    [c for c in train.columns if c.startswith(('qty_lag','qty_mean','price_lag','gv_lag','gp_lag','w_','sku_active_pdv'))]
    + [c for c in ['premise','categoria','marca','fabricante','tipos','categoria_pdv'] if c in train.columns]
)
y_col = 'qty'

In [22]:
#### Optuna + CV temporal - CV por ordem temporal dos registros (ordenados por year_week)

ts = TimeSeriesSplit(n_splits=4)

def objective(trial):
    params = {
        'objective': 'regression',
        'metric': 'mae',                 # WMAPE calculamos fora
        'boosting_type': trial.suggest_categorical('boosting', ['gbdt','goss']),
        'num_leaves': trial.suggest_int('num_leaves', 31, 255),
        'max_depth': trial.suggest_int('max_depth', -1, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.6, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 0, 7),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 20, 300),
        'lambda_l1': trial.suggest_float('lambda_l1', 0.0, 5.0),
        'lambda_l2': trial.suggest_float('lambda_l2', 0.0, 5.0),
        'verbosity': -1,
        'seed': 42,
    }

    wmape_folds = []

    # índices já na ordem temporal (train foi ordenado)
    idx = np.arange(len(train))
    for tr_idx, va_idx in ts.split(idx):
        X_tr, y_tr = train.iloc[tr_idx][X_cols], train.iloc[tr_idx][y_col]
        X_va, y_va = train.iloc[va_idx][X_cols], train.iloc[va_idx][y_col]

        dtr = lgb.Dataset(X_tr, y_tr, free_raw_data=False)  # categ auto
        dva = lgb.Dataset(X_va, y_va, reference=dtr, free_raw_data=False)

        model = lgb.train(
            params,
            dtr,
            num_boost_round=5000,
            valid_sets=[dtr, dva],
            valid_names=['train','valid'],
            early_stopping_rounds=200,
            verbose_eval=False
        )

        trial.set_user_attr('best_iteration', model.best_iteration)
        y_hat = model.predict(X_va, num_iteration=model.best_iteration)
        wmape_folds.append(wmape_np(y_va.values, y_hat))

    return float(np.mean(wmape_folds))

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=40, show_progress_bar=False)

best_params = study.best_params
best_params.update({'objective':'regression','metric':'mae','verbosity':-1,'seed':42})
best_iter = study.best_trial.user_attrs.get('best_iteration', 2000)

[I 2025-09-22 18:41:07,123] A new study created in memory with name: no-name-5c4e3c77-260b-4901-bc01-1b7f450874f1
[W 2025-09-22 18:41:07,220] Trial 0 failed with parameters: {'boosting': 'gbdt', 'num_leaves': 238, 'max_depth': 3, 'learning_rate': 0.07537992624266947, 'feature_fraction': 0.6811167874647934, 'bagging_fraction': 0.9050403170434728, 'bagging_freq': 2, 'min_data_in_leaf': 141, 'lambda_l1': 4.037938541525785, 'lambda_l2': 2.16263073368237} because of the following error: TypeError("train() got an unexpected keyword argument 'early_stopping_rounds'").
Traceback (most recent call last):
  File "/Users/fabiooliveira/.pyenv/versions/forecast-env/lib/python3.11/site-packages/optuna/study/_optimize.py", line 201, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/var/folders/tt/cg9n1vqn42v1lvq9j7my8_g40000gn/T/ipykernel_3840/548430446.py", line 34, in objective
    model = lgb.train(
            ^^^^^^^^^^
TypeError: train() got an unexpecte

TypeError: train() got an unexpected keyword argument 'early_stopping_rounds'

In [21]:
!pip install -U lightgbm 


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [23]:
weeks = np.sort(train['year_week'].unique())
parts = np.array_split(weeks, 5)           # 5 blocos ~iguais de semanas (2022)
folds = []
for i in range(4):                         # 4 folds
    tr_weeks = np.concatenate(parts[:i+1])
    va_weeks = parts[i+1]
    tr_idx = train['year_week'].isin(tr_weeks).values
    va_idx = train['year_week'].isin(va_weeks).values
    folds.append((np.where(tr_idx)[0], np.where(va_idx)[0]))

def wmape_np(y, yhat):
    den = np.abs(y).sum()
    return float(np.abs(y - yhat).sum() / den) if den>0 else 0.0

##### Optuna e LGBM

In [24]:
import lightgbm as lgb
from lightgbm import LGBMRegressor

def objective(trial):
    params = dict(
        boosting_type       = trial.suggest_categorical('boosting_type', ['gbdt','goss']),
        num_leaves          = trial.suggest_int('num_leaves', 31, 255),
        max_depth           = trial.suggest_int('max_depth', -1, 12),
        learning_rate       = trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        subsample           = trial.suggest_float('subsample', 0.6, 1.0),
        colsample_bytree    = trial.suggest_float('colsample_bytree', 0.6, 1.0),
        min_child_samples   = trial.suggest_int('min_child_samples', 20, 300),
        reg_alpha           = trial.suggest_float('reg_alpha', 0.0, 5.0),
        reg_lambda          = trial.suggest_float('reg_lambda', 0.0, 5.0),
        n_estimators        = 5000,
        objective           = 'regression',
        metric              = 'mae',
        verbosity           = -1,
        random_state        = 42,
        enable_categorical  = True          # usa dtype 'category' direto
    )

    scores = []
    for tr_idx, va_idx in folds:
        X_tr, y_tr = train.iloc[tr_idx][X_cols], train.iloc[tr_idx][y_col]
        X_va, y_va = train.iloc[va_idx][X_cols], train.iloc[va_idx][y_col]

        model = LGBMRegressor(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
        )
        y_hat = model.predict(X_va, num_iteration=model.best_iteration_)
        scores.append(wmape_np(y_va.values, y_hat))
    return float(np.mean(scores))

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=40, show_progress_bar=False)

best_params = study.best_params.copy()
best_params.update(dict(objective='regression', metric='mae',
                        verbosity=-1, random_state=42, enable_categorical=True))

[I 2025-09-22 18:47:07,846] A new study created in memory with name: no-name-4cfe3b8d-391b-4181-aa71-8df8f9cfbfb7
[I 2025-09-22 18:51:12,032] Trial 0 finished with value: 0.37441362512684867 and parameters: {'boosting_type': 'goss', 'num_leaves': 91, 'max_depth': 8, 'learning_rate': 0.014711498598955396, 'subsample': 0.6553842994762533, 'colsample_bytree': 0.8898528543784643, 'min_child_samples': 275, 'reg_alpha': 3.3910692815882375, 'reg_lambda': 4.590328095601477}. Best is trial 0 with value: 0.37441362512684867.
[W 2025-09-22 18:51:29,644] Trial 1 failed with parameters: {'boosting_type': 'goss', 'num_leaves': 122, 'max_depth': 5, 'learning_rate': 0.01117747845539332, 'subsample': 0.6926189450508761, 'colsample_bytree': 0.6260777006778964, 'min_child_samples': 288, 'reg_alpha': 3.4432508057873283, 'reg_lambda': 2.9027668895671956} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "/Users/fabiooliveira/.pyenv/versions/forecast-env/lib/pyth

KeyboardInterrupt: 

In [25]:
# ---- métrica: WMAPE (micro) ----
def lgb_wmape(y_true, y_pred):
    den = np.abs(y_true).sum()
    val = float(np.abs(y_true - y_pred).sum() / max(den, 1e-9))
    return 'wmape', val, False  # menor é melhor

In [26]:
# Optuna com early stopping em WMAPE 
import optuna
import lightgbm as lgb
from lightgbm import LGBMRegressor
from sklearn.model_selection import TimeSeriesSplit
import numpy as np

ts = TimeSeriesSplit(n_splits=4)
folds = list(ts.split(train['year_week'].values))

# --------- objetivo Optuna ----------
def objective(trial):
    params = dict(
        boosting_type   = trial.suggest_categorical('boosting_type', ['gbdt','goss']),
        num_leaves      = trial.suggest_int('num_leaves', 31, 255),
        max_depth       = trial.suggest_int('max_depth', -1, 12),
        learning_rate   = trial.suggest_float('learning_rate', 0.01, 0.2, log=True),
        subsample       = trial.suggest_float('subsample', 0.6, 1.0),      # bagging
        colsample_bytree= trial.suggest_float('colsample_bytree', 0.6, 1.0),
        min_child_samples=trial.suggest_int('min_child_samples', 20, 300),
        reg_alpha       = trial.suggest_float('reg_alpha', 0.0, 5.0),
        reg_lambda      = trial.suggest_float('reg_lambda', 0.0, 5.0),
        n_estimators    = 5000,
        objective       = 'regression',
        verbosity       = -1,
        random_state    = 42,
        enable_categorical = True,
    )

    scores = []
    for tr_idx, va_idx in folds:
        X_tr, y_tr = train.iloc[tr_idx][X_cols], train.iloc[tr_idx][y_col]
        X_va, y_va = train.iloc[va_idx][X_cols], train.iloc[va_idx][y_col]

        model = LGBMRegressor(**params)
        model.fit(
            X_tr, y_tr,
            eval_set=[(X_va, y_va)],
            eval_metric=lgb_wmape,
            callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
        )
        scores.append(model.best_score_['valid_0']['wmape'])

    return float(np.mean(scores))

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=40, show_progress_bar=False)

best_params = study.best_params | dict(
    objective='regression', verbosity=-1, random_state=42, enable_categorical=True
)

[I 2025-09-22 18:58:19,912] A new study created in memory with name: no-name-a4dcc9d1-75bf-4243-b0c8-8f3c4ee8ddf2
[I 2025-09-22 18:58:39,468] Trial 0 finished with value: 0.37620896872132303 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 196, 'max_depth': 4, 'learning_rate': 0.036600001183410356, 'subsample': 0.6277752567060072, 'colsample_bytree': 0.8440168506820853, 'min_child_samples': 273, 'reg_alpha': 3.1593422665349333, 'reg_lambda': 4.913321686184848}. Best is trial 0 with value: 0.37620896872132303.
[I 2025-09-22 18:58:56,839] Trial 1 finished with value: 0.37382493939828504 and parameters: {'boosting_type': 'gbdt', 'num_leaves': 127, 'max_depth': 6, 'learning_rate': 0.099930017378788, 'subsample': 0.7835066221777065, 'colsample_bytree': 0.6333838534135205, 'min_child_samples': 248, 'reg_alpha': 1.4143087392627396, 'reg_lambda': 0.6266263033025493}. Best is trial 1 with value: 0.37382493939828504.
[I 2025-09-22 18:59:29,835] Trial 2 finished with value: 0.3729981464641

KeyboardInterrupt: 

In [27]:
study.best_value, study.best_params

(0.3710644937607641,
 {'boosting_type': 'goss',
  'num_leaves': 187,
  'max_depth': 8,
  'learning_rate': 0.015208794858682362,
  'subsample': 0.8710263156715767,
  'colsample_bytree': 0.7372351374705659,
  'min_child_samples': 290,
  'reg_alpha': 1.8444567479147946,
  'reg_lambda': 3.1541462618384895})

In [28]:
import json, pathlib
pathlib.Path("optuna_best.json").write_text(json.dumps(study.best_params, indent=2))

285

In [29]:
import json
with open("optuna_best.json") as f:
    best_params = json.load(f)

best_params.update({
    "objective": "regression",
    "verbosity": -1,
    "random_state": 42,
    "enable_categorical": True
})

In [30]:
best_params = {
  'boosting_type': 'goss',
  'num_leaves': 187,
  'max_depth': 8,
  'learning_rate': 0.015208794858682362,
  'subsample': 0.8710263156715767,
  'colsample_bytree': 0.7372351374705659,
  'min_child_samples': 290,
  'reg_alpha': 1.8444567479147946,
  'reg_lambda': 3.1541462618384895,
  'objective': 'regression',
  'verbosity': -1,
  'random_state': 42,
  'enable_categorical': True
}

In [31]:
import lightgbm as lgb
def lgb_wmape(y_true, y_pred):
    den = (abs(y_true)).sum()
    val = (abs(y_true - y_pred)).sum() / max(den, 1e-9)
    return ('wmape', float(val), False)

In [32]:
weeks = sorted(train['year_week'].unique())
va_weeks = set(weeks[-8:])
mask_va = train['year_week'].isin(va_weeks)

X_tr, y_tr = train.loc[~mask_va, X_cols], train.loc[~mask_va, 'qty']
X_va, y_va = train.loc[ mask_va, X_cols], train.loc[ mask_va, 'qty']

In [34]:
# fit com early stopping para achar n_estimators ideal
model_tmp = lgb.LGBMRegressor(**best_params, n_estimators=5000)

model_tmp.fit(
    X_tr, y_tr,
    eval_set=[(X_va, y_va)],
    eval_metric=lgb_wmape,
    callbacks=[
        lgb.early_stopping(200),         # early stopping
        lgb.log_evaluation(period=0)     # sem logs
    ]
)

best_iter = model_tmp.best_iteration_
best_iter

Training until validation scores don't improve for 200 rounds
Early stopping, best iteration is:
[386]	valid_0's l2: 13.7462	valid_0's wmape: 0.361038


386

In [35]:
model = lgb.LGBMRegressor(**best_params, n_estimators=best_iter)
model.fit(train[X_cols], train['qty'])

0,1,2
,boosting_type,'goss'
,num_leaves,187
,max_depth,8
,learning_rate,0.015208794858682362
,n_estimators,386
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
# Final 

def forecast_jan_2023(base_weekly, model, X_cols):
    hist = base_weekly.copy()
    outs = []
    for wk in [1,2,3,4,5]:
        grid = (hist[hist['iso_year']==2022][['premise','pdv','produto']].drop_duplicates()
                .assign(iso_year=2023, iso_week=wk))
        tmp  = pd.concat([hist, grid], ignore_index=True)
        tmp  = add_lag_feats(tmp)
        cur  = tmp[(tmp['iso_year']==2023) & (tmp['iso_week']==wk)].copy()

        yhat = model.predict(cur[X_cols])
        cur['qty'] = np.clip(yhat, 0, None)

        outs.append(cur[['premise','pdv','produto','iso_year','iso_week','qty']])
        # alimenta histórico para o próximo passo
        hist = pd.concat([hist, cur[hist.columns]], ignore_index=True)
    return pd.concat(outs, ignore_index=True)

preds_jan = forecast_jan_2023(feat[feat['iso_year']==2022], model, X_cols)

# === 2) montar arquivo no formato do hackathon ===
sub = (preds_jan.rename(columns={'iso_week':'semana','qty':'quantidade'})
                 [['semana','pdv','produto','quantidade']]
                 .sort_values(['semana','pdv','produto'])
                 .reset_index(drop=True))

# tipos e sanidade
sub['semana'] = sub['semana'].astype('int16')
sub['pdv']    = pd.to_numeric(sub['pdv'], errors='raise', downcast='integer')
sub['produto']= pd.to_numeric(sub['produto'], errors='raise', downcast='integer')
sub['quantidade'] = np.rint(sub['quantidade']).astype('int32')

#sub.to_csv('submission.csv', sep=';', index=False, encoding='utf-8')
sub.to_parquet('submission.parquet', index=False)

sub.head(), sub.shape

  g  = df.groupby(KEY, sort=False)
  df['sku_active_pdv'] = df.groupby(KEY + TIME)['produto'].transform('size')


In [2]:
import pandas as pd, numpy as np, gc
from lightgbm import LGBMRegressor

PATH = "artifacts/run_2025_01/df_merge_clean.parquet" 

use_cols = ['premise','pdv','produto','transaction_date',
            'quantity','gross_value','gross_profit','net_value',
            'categoria','marca','fabricante','tipos','categoria_pdv']
df = pd.read_parquet(PATH, columns=use_cols)
df['transaction_date'] = pd.to_datetime(df['transaction_date'])
iso = df['transaction_date'].dt.isocalendar()
df['iso_year'] = iso.year.astype('int16')
df['iso_week'] = iso.week.astype('int16')
del iso; gc.collect()

913

In [3]:
KEY  = ['premise','pdv','produto']
TIME = ['iso_year','iso_week']

w22 = (df.query('iso_year==2022')
         .groupby(KEY+TIME, as_index=False)
         .agg(qty=('quantity','sum'),
              gv =('gross_value','sum'),
              gp =('gross_profit','sum'),
              nv =('net_value','sum')))

# preço por unidade (proxy) – evita divisão por zero
w22['price'] = (w22['nv'] / w22['qty'].replace(0, np.nan)).fillna(0).astype('float32')
w22 = w22.drop(columns='nv')

In [4]:
def add_lag_feats(df_in: pd.DataFrame) -> pd.DataFrame:
    df = df_in.sort_values(KEY+TIME).copy()
    g  = df.groupby(KEY, sort=False)

    for L in [1,2,4,8,12]:
        df[f'qty_lag{L}'] = g['qty'].shift(L)
    for W in [4,8,12]:
        df[f'qty_mean_{W}'] = g['qty'].shift(1).rolling(W, min_periods=1).mean()

    for L in [1,4]:
        df[f'price_lag{L}'] = g['price'].shift(L)
        df[f'gv_lag{L}']    = g['gv'].shift(L)
        df[f'gp_lag{L}']    = g['gp'].shift(L)

    df['sku_active_pdv'] = df.groupby(KEY+TIME)['produto'].transform('size')
    df['w_sin'] = np.sin(2*np.pi*df['iso_week']/53.0)
    df['w_cos'] = np.cos(2*np.pi*df['iso_week']/53.0)
    df['year_week'] = (df['iso_year']*100 + df['iso_week']).astype('int32')

    for c in ['premise','categoria','marca','fabricante','tipos','categoria_pdv']:
        if c in df.columns: df[c] = df[c].astype('category')
    return df

feat = add_lag_feats(w22)
need = [c for c in feat.columns if c.startswith(('qty_lag','qty_mean','price_lag','gv_lag','gp_lag'))]
train = feat.dropna(subset=need)
X_cols = [c for c in train.columns if c.startswith(('qty_lag','qty_mean','price_lag','gv_lag','gp_lag','w_','sku_active_pdv'))] + \
         [c for c in ['premise','categoria','marca','fabricante','tipos','categoria_pdv'] if c in train.columns]

In [5]:
best_params = {
  'boosting_type': 'goss', 'num_leaves': 187, 'max_depth': 8,
  'learning_rate': 0.015208794858682362, 'subsample': 0.8710263156715767,
  'colsample_bytree': 0.7372351374705659, 'min_child_samples': 290,
  'reg_alpha': 1.8444567479147946, 'reg_lambda': 3.1541462618384895,
  'objective': 'regression', 'verbosity': -1, 'random_state': 42,
  'enable_categorical': True
}
best_iter = 386  # do seu treino anterior

model = LGBMRegressor(**best_params, n_estimators=best_iter)
model.fit(train[X_cols], train['qty'])

0,1,2
,boosting_type,'goss'
,num_leaves,187
,max_depth,8
,learning_rate,0.015208794858682362
,n_estimators,386
,subsample_for_bin,200000
,objective,'regression'
,class_weight,
,min_split_gain,0.0
,min_child_weight,0.001


In [None]:
def forecast_jan_2023(base22, model):
    hist = add_lag_feats(base22.copy())  # lags prontos até W52/2022
    out = []
    for wk in [1,2,3,4,5]:
        grid = (hist[hist['iso_year']==2022][KEY].drop_duplicates()
                .assign(iso_year=2023, iso_week=wk, qty=0.0, gv=0.0, gp=0.0, price=0.0))
        tmp  = pd.concat([hist, grid], ignore_index=True)
        cur  = add_lag_feats(tmp)
        cur  = cur[(cur['iso_year']==2023)&(cur['iso_week']==wk)]
        X    = cur[X_cols]
        cur['qty'] = model.predict(X).clip(min=0.0)
        out.append(cur[KEY+['iso_week','qty']])
       
        hist = pd.concat([hist, cur[hist.columns.intersection(hist.columns)]], ignore_index=True)
        del tmp, cur; gc.collect()
    return pd.concat(out, ignore_index=True)

preds = forecast_jan_2023(w22, model)

  g  = df.groupby(KEY, sort=False)
  df['sku_active_pdv'] = df.groupby(KEY+TIME)['produto'].transform('size')
