In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import json
import lightgbm as lgb
import holidays
import itertools
from tqdm import tqdm

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
import pickle

In [2]:
def generate_oot_folds(df, date_col='date', n_folds=5, test_window=7):
    """
    Gera folds OOT com base na coluna de data, aplicável a todas as séries.
    """
    df = df.copy()
    df[date_col] = pd.to_datetime(df[date_col])
    unique_dates = sorted(df[date_col].unique())

    folds = []
    for i in range(n_folds):
        test_end_idx = len(unique_dates) - i * test_window
        test_start_idx = test_end_idx - test_window
        train_end_idx = test_start_idx

        if train_end_idx <= 0:
            break

        train_dates = unique_dates[:train_end_idx]
        test_dates = unique_dates[test_start_idx:test_end_idx]

        folds.append((train_dates, test_dates))

    return folds


In [3]:
def smape_metric(y_true, y_pred):
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    diff = np.abs(y_true - y_pred) / denominator
    diff[denominator == 0] = 0  # Evita divisão por zero
    return np.mean(diff) * 100

# Read Data
---

In [4]:
df = pd.read_parquet("./sample_trat_curva_D.parquet")

In [5]:
df_valid = pd.read_csv("./valid.csv")

In [6]:
df = pd.concat([
    df.reset_index(drop=True),
    df_valid.query('curva == "D"').drop(columns=['estoque']).reset_index(drop=True)
], ignore_index=True)

# Processing
---

## changing_names


In [7]:
renames = {
    "loja_id": "merchant_id",
    'produto_id': 'product_id',
    'data': 'date',
    'categoria_id': 'category_id',
    'is_medicamento': 'is_medicine',
    'curva': 'sales_curve',
    'estoque_final': 'ending_stock',
    'venda': 'sales',
    'custo': 'cost',
    'preco': 'price',
    'estoque_inicial': 'starting_stock',
    'estoque_final_anterior': 'previous_ending_stock',
    'reposicao': 'restock',
}

df = df.rename(columns=renames)

## droping columns

In [8]:
to_drop = [
    'ending_stock',
    'starting_stock',
    'previous_ending_stock',
    'restock',
    'sales_curve'
]

df = df.drop(columns=to_drop)

## New Features

## Date 

In [9]:
df['date'] = pd.to_datetime(df['date'])

# Extracting date features
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['day_of_week'] = df['date'].dt.dayofweek
df['is_weekend'] = df['day_of_week'] >= 5
df['week_of_year'] = df['date'].dt.isocalendar().week

# Brazilian holidays
br_holidays = holidays.Brazil()
df['is_brazilian_holiday'] = df['date'].isin(br_holidays)

In [10]:
df.head()

Unnamed: 0,merchant_id,product_id,date,category_id,is_medicine,sales,cost,price,day,month,year,day_of_week,is_weekend,week_of_year,is_brazilian_holiday
0,1,6,2022-01-01,4.0,False,0.0,0.055587,0.07722,1,1,2022,5,True,52,False
1,1,6,2022-01-02,4.0,False,0.0,0.055587,0.07722,2,1,2022,6,True,52,False
2,1,6,2022-01-03,4.0,False,0.0,0.055587,0.07722,3,1,2022,0,False,1,False
3,1,6,2022-01-04,4.0,False,0.0,0.055587,0.07722,4,1,2022,1,False,1,False
4,1,6,2022-01-05,4.0,False,0.0,0.055587,0.07722,5,1,2022,2,False,1,False


## lags

In [11]:
# Criar colunas de semana anterior
df['prev_week'] = df['week_of_year'] - 1
df['prev_year'] = df['year']

# Ajustar quando a semana for 1 (voltar para última semana do ano anterior)
df.loc[df['week_of_year'] == 1, 'prev_week'] = 52  # ou 53 dependendo do calendário, pode ajustar se necessário
df.loc[df['week_of_year'] == 1, 'prev_year'] = df['year'] - 1

# Agrupar para calcular média de cost e price por semana
weekly_avg = df.groupby(
    ['product_id', 'merchant_id', 'year', 'week_of_year']
)[['cost', 'price', 'sales']].mean().reset_index()

# Merge com base na semana anterior
df = df.merge(
    weekly_avg,
    left_on=['product_id', 'merchant_id', 'prev_year', 'prev_week'],
    right_on=['product_id', 'merchant_id', 'year', 'week_of_year'],
    how='left',
    suffixes=('', '_prev_week')
)

# Renomear colunas de média da semana anterior
df.rename(columns={
    'cost_prev_week': 'prev_week_cost_avg',
    'sales_prev_week': 'prev_week_sales_avg',
    'price_prev_week': 'prev_week_price_avg'
}, inplace=True)

# Remover colunas auxiliares
df = df.drop(columns=[
        'year_prev_week', 
        'week_of_year_prev_week', 
        'prev_year', 
        'prev_week'
    ])

df = df.drop(columns=['cost', 'price'])

In [12]:
print(df.shape)
df = df.dropna(subset=['prev_week_cost_avg', 'prev_week_price_avg', 'prev_week_sales_avg'])
print(df.shape)

(15565344, 16)
(15460514, 16)


## Change types

In [13]:
df["date"] = pd.to_datetime(df["date"])

In [14]:
change_types = {
    'category_id': 'int64',
    'is_medicine': 'int64',
    'is_weekend': 'int64',
    'is_brazilian_holiday': 'int64',
}

df = df.astype(change_types)

In [15]:
df.head()

Unnamed: 0,merchant_id,product_id,date,category_id,is_medicine,sales,day,month,year,day_of_week,is_weekend,week_of_year,is_brazilian_holiday,prev_week_cost_avg,prev_week_price_avg,prev_week_sales_avg
0,1,6,2022-01-01,4,0,0.0,1,1,2022,5,1,52,0,0.061576,0.08712,0.0
1,1,6,2022-01-02,4,0,0.0,2,1,2022,6,1,52,0,0.061576,0.08712,0.0
9,1,6,2022-01-10,4,0,0.0,10,1,2022,0,0,2,0,0.055587,0.07722,0.0
10,1,6,2022-01-11,4,0,0.0,11,1,2022,1,0,2,0,0.055587,0.07722,0.0
11,1,6,2022-01-12,4,0,0.0,12,1,2022,2,0,2,0,0.055587,0.07722,0.0


# Train

In [16]:
# model_params = {
#     'Ridge': {
#         'model': Ridge(),
#         'params': {
#             'alpha': [10.0, 30.0, 50.0]
#         }
#     },
#     'Lasso': {
#         'model': Lasso(),
#         'params': {
#             'alpha': [1.0, 3.0, 5.0, 10.0]
#         }
#     },
#     'DecisionTreeRegressor': {
#         'model': DecisionTreeRegressor(),
#         'params': {
#             'max_depth': [None, 25, 30],           # Testa profundidade levemente restrita
#             'min_samples_split': [2, 3, 4],        # Mantém o ótimo e testa restrições suaves
#             'min_samples_leaf': [1, 2, 3]          # Mantém 1, mas testa suavizações
#         }
#     },
#     'RandomForestRegressor': {
#         'model': RandomForestRegressor(),
#         'params': {
#             'n_estimators': [20],
#             'max_depth': [None],
#             'min_samples_split': [2],
#             'min_samples_leaf': [1]
#         }
#     }
#     # ,
#     # 'XGBRegressor': {
#     #     'model': xgb.XGBRegressor(),
#     #     'params': {
#     #         'n_estimators': [60, 100, 150, 200],
#     #         'learning_rate': [0.01, 0.1, 0.5],
#     #         'subsample': [0.9]
#     #     }
#     # },
#     # 'LGBMRegressor': {
#     #     'model': LGBMRegressor(),
#     #     'params': {
#     #         'n_estimators': [40, 50, 100, 150, 200],
#     #         'learning_rate': [0.1, 0.2, 0.3],
#     #         'num_leaves': [40, 63, 70]
#     #     }
#     # }
# }

In [17]:
model_params = {
    'DecisionTreeRegressor': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': [None],           # Testa profundidade levemente restrita
            'min_samples_split': [2],        # Mantém o ótimo e testa restrições suaves
            'min_samples_leaf': [1]          # Mantém 1, mas testa suavizações
        }
    }
}

In [18]:
# 1. Separar a última semana para validação final
max_date = df['date'].max()
val_start = max_date - pd.Timedelta(days=6)

df_val = df[df['date'] >= val_start]
df_train_full = df[df['date'] < val_start]

print(f"Validação final: de {df_val['date'].min().date()} até {df_val['date'].max().date()}")
print(f"Treino + Folds: até {df_train_full['date'].max().date()}")

Validação final: de 2025-02-22 até 2025-02-28
Treino + Folds: até 2025-02-21


In [19]:
folds = generate_oot_folds(df_train_full, date_col='date', n_folds=5, test_window=7)

# Vamos armazenar os conjuntos separados
fold_data = []

for i, (train_dates, test_dates) in enumerate(folds):
    train_df = df_train_full[df_train_full['date'].isin(train_dates)]
    test_df = df_train_full[df_train_full['date'].isin(test_dates)]

    fold_data.append({
        'fold': i + 1,
        'train': train_df,
        'test': test_df
    })

    print(f"Fold {i + 1} -> Treino: {train_df['date'].min().date()} até {train_df['date'].max().date()} | "
          f"Teste: {test_df['date'].min().date()} até {test_df['date'].max().date()}")


Fold 1 -> Treino: 2022-01-01 até 2025-02-14 | Teste: 2025-02-15 até 2025-02-21
Fold 2 -> Treino: 2022-01-01 até 2025-02-07 | Teste: 2025-02-08 até 2025-02-14
Fold 3 -> Treino: 2022-01-01 até 2025-01-31 | Teste: 2025-02-01 até 2025-02-07
Fold 4 -> Treino: 2022-01-01 até 2025-01-24 | Teste: 2025-01-25 até 2025-01-31
Fold 5 -> Treino: 2022-01-01 até 2025-01-17 | Teste: 2025-01-18 até 2025-01-24


In [20]:
# import os

# # Cria a pasta 'models' se ela não existir
# os.makedirs('./models', exist_ok=True)

# model_best_params = {}
# best_preds = 0

# for model_name, config in model_params.items():
#     print(f"Running manual grid search for {model_name}...")

#     model_class = config['model']
#     param_grid = config['params']

#     keys, values = zip(*param_grid.items())
#     param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

#     best_score = float('inf')
#     best_params = None
#     best_model = None

#     for params in tqdm(param_combinations):
#         fold_scores = []

#         for fold in fold_data:
#             train = fold['train']
#             test = fold['test']

#             X_train = train.drop(columns=["date", "sales"])
#             Y_train = train["sales"]
#             X_test = test.drop(columns=["date", "sales"])
#             Y_test = test["sales"]

#             model = model_class.set_params(**params)
#             model.fit(X_train, Y_train)

#             preds = model.predict(X_test)
#             preds = np.maximum(np.round(preds), 0)

#             smape = smape_metric(Y_test, preds)
#             # fold_scores.append(smape)

#         mean_smape = np.mean(fold_scores)

#         if mean_smape < best_score:
#             best_score = mean_smape
#             best_params = params
#             best_model = model
#             best_preds = preds

#     print(f'Best Parameters: {best_params}')
#     print(f'Best SMAPE (mean across folds): {best_score:.6f}\n\n')

#     model_best_params[model_name] = {
#         'model': model_name,
#         'best_param': best_params,
#         'best_score': best_score
#     }

#     with open('model_best_params.json', 'w') as f:
#         json.dump(model_best_params, f, indent=4)

#     with open(f'./models/{model_name}.pkl', 'wb') as f:
#         pickle.dump(best_model, f)

#     del model, best_model, preds


In [21]:
# Separa X e y
X_train = df_train_full.drop(columns=["date", "sales"])
Y_train = df_train_full["sales"]
X_test = df_val.drop(columns=["date", "sales"])
Y_test = df_val["sales"]

# Ajusta modelo final
final_model = DecisionTreeRegressor(max_depth=None, min_samples_split=2, min_samples_leaf=1)
final_model.fit(X_train, Y_train)

# Faz predições na validação
val_preds = np.maximum(np.round(final_model.predict(X_test)), 0)

# Avalia desempenho
val_mae = smape_metric(Y_test, val_preds)
print(f"SMAPE na validação final (última semana): {val_mae:.4f}")

SMAPE na validação final (última semana): 7.3919


In [22]:
# Cria sub-DataFrame com as colunas desejadas
df_val_preds = df_val[["merchant_id", "product_id", "date", "sales"]].copy()
df_val_preds["predict_sales"] = val_preds

# Salva em CSV
df_val_preds.to_csv("val_preds_D.csv", index=False)

# Exibe as primeiras linhas (opcional)
df_val_preds.head()

Unnamed: 0,merchant_id,product_id,date,sales,predict_sales
15440443,3,134,2025-02-22,0.0,0.0
15440444,8,134,2025-02-22,0.0,1.0
15440445,1,134,2025-02-22,0.0,0.0
15440446,2,134,2025-02-22,0.0,0.0
15440447,4,134,2025-02-22,0.0,0.0
