In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import json
import lightgbm as lgb
import holidays
import itertools
from tqdm import tqdm

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, HuberRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
import pickle

# Read Data
---

In [2]:
df = pd.read_parquet("./data/sample_trat_curva_C.parquet")

# Processing
---

## changing_names


In [3]:
renames = {
    "loja_id": "merchant_id",
    'produto_id': 'product_id',
    'data': 'date',
    'categoria_id': 'category_id',
    'is_medicamento': 'is_medicine',
    'curva': 'sales_curve',
    'estoque_final': 'ending_stock',
    'venda': 'sales',
    'custo': 'cost',
    'preco': 'price',
    'estoque_inicial': 'starting_stock',
    'estoque_final_anterior': 'previous_ending_stock',
    'reposicao': 'restock',
}

df = df.rename(columns=renames)

## droping columns

In [4]:
to_drop = [
    'ending_stock',
    'starting_stock',
    'previous_ending_stock',
    'restock',
    'sales_curve'
]

df = df.drop(columns=to_drop)

## New Features

## Date 

In [5]:
df['date'] = pd.to_datetime(df['date'])

# Extracting date features
df['day'] = df['date'].dt.day
df['month'] = df['date'].dt.month
df['year'] = df['date'].dt.year
df['day_of_week'] = df['date'].dt.dayofweek
df['is_weekend'] = df['day_of_week'] >= 5
df['week_of_year'] = df['date'].dt.isocalendar().week

# Brazilian holidays
br_holidays = holidays.Brazil()
df['is_brazilian_holiday'] = df['date'].isin(br_holidays)

In [6]:
df.head()

Unnamed: 0,merchant_id,product_id,date,category_id,is_medicine,sales,cost,price,day,month,year,day_of_week,is_weekend,week_of_year,is_brazilian_holiday
35464798,1,3,2023-06-20,9.0,True,0.0,0.024664,0.073309,20,6,2023,1,False,25,False
35464799,1,3,2023-06-21,9.0,True,0.0,0.024664,0.073309,21,6,2023,2,False,25,False
35464800,1,3,2023-06-22,9.0,True,0.0,0.024664,0.073309,22,6,2023,3,False,25,False
35464801,1,3,2023-06-23,9.0,True,0.0,0.024664,0.073309,23,6,2023,4,False,25,False
35464802,1,3,2023-06-24,9.0,True,0.0,0.024664,0.073309,24,6,2023,5,True,25,False


## lags

In [7]:
# Criar colunas de semana anterior
df['prev_week'] = df['week_of_year'] - 1
df['prev_year'] = df['year']

# Ajustar quando a semana for 1 (voltar para última semana do ano anterior)
df.loc[df['week_of_year'] == 1, 'prev_week'] = 52  # ou 53 dependendo do calendário, pode ajustar se necessário
df.loc[df['week_of_year'] == 1, 'prev_year'] = df['year'] - 1

# Agrupar para calcular média de cost e price por semana
weekly_avg = df.groupby(
    ['product_id', 'merchant_id', 'year', 'week_of_year']
)[['cost', 'price', 'sales']].mean().reset_index()

# Merge com base na semana anterior
df = df.merge(
    weekly_avg,
    left_on=['product_id', 'merchant_id', 'prev_year', 'prev_week'],
    right_on=['product_id', 'merchant_id', 'year', 'week_of_year'],
    how='left',
    suffixes=('', '_prev_week')
)

# Renomear colunas de média da semana anterior
df.rename(columns={
    'cost_prev_week': 'prev_week_cost_avg',
    'sales_prev_week': 'prev_week_sales_avg',
    'price_prev_week': 'prev_week_price_avg'
}, inplace=True)

# Remover colunas auxiliares
df = df.drop(columns=[
        'year_prev_week', 
        'week_of_year_prev_week', 
        'prev_year', 
        'prev_week'
    ])

df = df.drop(columns=['cost', 'price'])

In [8]:
print(df.shape)
df = df.dropna(subset=['prev_week_cost_avg', 'prev_week_price_avg', 'prev_week_sales_avg'])
print(df.shape)

(9904630, 16)
(9839122, 16)


## Change types

In [9]:
df["date"] = pd.to_datetime(df["date"])

In [10]:
change_types = {
    'category_id': 'int64',
    'is_medicine': 'int64',
    'is_weekend': 'int64',
    'is_brazilian_holiday': 'int64',
}

df = df.astype(change_types)

# Train Split

In [11]:
df["date"] = pd.to_datetime(df["date"])
df = df.sort_values("date").reset_index(drop=True)

split_index = int(0.8 * len(df))
cutoff_date = df.loc[split_index, "date"]

X = df[df["date"] <= cutoff_date].reset_index(drop=True)
Y = df[df["date"] > cutoff_date].reset_index(drop=True)

del df

print(f"Tamanho do X: {len(X)}")
print(f"Tamanho do Y: {len(Y)}")
print(f"Data de corte: {cutoff_date}")

Tamanho do X: 7880432
Tamanho do Y: 1958690
Data de corte: 2024-08-19 00:00:00


In [12]:
X_train = X.drop(columns=["date", "sales"])
Y_train = X["sales"]
X_test = Y.drop(columns=["date", "sales"])
Y_test = Y["sales"]

print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

(7880432, 14)
(7880432,)
(1958690, 14)
(1958690,)


# Train

In [None]:
model_params = {
    'Ridge': {
        'model': Ridge(),
        'params': {
            'alpha': [10.0, 30.0, 50.0]
        }
    },
    'Lasso': {
        'model': Lasso(),
        'params': {
            'alpha': [1.0, 3.0, 5.0, 10.0]
        }
    },
    'DecisionTreeRegressor': {
        'model': DecisionTreeRegressor(),
        'params': {
            'max_depth': [None, 10, 20, 30],
            'min_samples_split': [2, 5, 10],
            'min_samples_leaf': [1, 2, 4]
        }
    },
    'RandomForestRegressor': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [20, 30]
        }
    },
    'XGBRegressor': {
        'model': xgb.XGBRegressor(),
        'params': {
            'n_estimators': [60, 100, 150, 200],
            'learning_rate': [0.01, 0.1, 0.5],
            'subsample': [0.9]
        }
    },
    'LGBMRegressor': {
        'model': LGBMRegressor(),
        'params': {
            'n_estimators': [40, 50, 100, 150, 200],
            'learning_rate': [0.1, 0.2, 0.3],
            'num_leaves': [40, 63, 70]
        }
    }
}

: 

In [None]:
model_best_params = {}
best_preds = 0

for model_name, config in model_params.items():
    print(f"Running manual grid search for {model_name}...")

    model_class = config['model']
    param_grid = config['params']

    keys, values = zip(*param_grid.items())
    param_combinations = [dict(zip(keys, v)) for v in itertools.product(*values)]

    best_score = float('inf')
    best_params = None
    best_model = None

    for params in tqdm(param_combinations):
        model = model_class.set_params(**params)
        model.fit(X_train, Y_train)

        preds = model.predict(X_test)
        preds = np.maximum(preds, 0)
        
        mae = mean_absolute_error(Y_test, preds)

        if mae < best_score:
            best_score = mae
            best_params = params
            best_model = model
            best_preds = preds

    print(f'Best Parameters: {best_params}')
    print(f'Best MAE: {best_score:.6f}\n\n')

    model_best_params[model_name] = {
        'model': model_name,
        'best_param': best_params,
        'best_score': best_score
    }

    with open('model_best_params.json', 'w') as f:
        json.dump(model_best_params, f, indent=4)

    with open(f'./models/{model_name}.pkl', 'wb') as f:
        pickle.dump(best_model, f)
        
    del model, best_model, preds

Running manual grid search for Ridge...


100%|██████████| 3/3 [00:04<00:00,  1.52s/it]


Best Parameters: {'alpha': 10.0}
Best MAE: 0.115071


Running manual grid search for Lasso...


100%|██████████| 4/4 [00:21<00:00,  5.27s/it]


Best Parameters: {'alpha': 1.0}
Best MAE: 0.126124


Running manual grid search for DecisionTreeRegressor...


100%|██████████| 36/36 [45:15<00:00, 75.44s/it]


Best Parameters: {'max_depth': 10, 'min_samples_split': 5, 'min_samples_leaf': 1}
Best MAE: 0.115860


Running manual grid search for RandomForestRegressor...


100%|██████████| 2/2 [57:37<00:00, 1728.82s/it]


Best Parameters: {'n_estimators': 20}
Best MAE: 0.174171


