**Forecasting Sticker Sales** - Kaggle Competition

Author: Tihoc Andrei

In [41]:
# Imports

import pandas as pd
import optuna
import warnings

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_percentage_error
from xgboost import XGBRegressor

warnings.filterwarnings("ignore")


In [42]:
# Preprocessor class to handle date and categorical feature transformations

class Preprocessor:
    """Builds the feature pipeline: calendar features from the date, one-hot for categoricals."""

    def __init__(self):
        # Name of the raw date column and the columns to one-hot encode.
        self.date_col = "date"
        self.cat_features = ["country", "store", "product"]

    def transform_dates(self, X):
        """Swap the date column for year, month, day, weekday and is_weekend columns.

        Args:
            X: DataFrame containing the column named by ``self.date_col``.

        Returns:
            A new DataFrame without the date column and with the five calendar
            columns added; the input frame is left untouched.
        """
        stamps = pd.to_datetime(X[self.date_col])
        weekday = stamps.dt.weekday
        calendar = {
            "year": stamps.dt.year,
            "month": stamps.dt.month,
            "day": stamps.dt.day,
            "weekday": weekday,
            # weekday 5 and 6 are Saturday and Sunday
            "is_weekend": weekday.isin([5, 6]).astype(int),
        }
        return X.drop(columns=[self.date_col]).assign(**calendar)

    def build_pipeline(self):
        """Return an unfitted Pipeline: date expansion, then one-hot encoding.

        Columns not listed in ``self.cat_features`` are passed through unchanged.
        """
        encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
        column_stage = ColumnTransformer(
            transformers=[("cat", encoder, self.cat_features)],
            remainder="passthrough",
        )
        date_stage = FunctionTransformer(self.transform_dates, validate=False)
        return Pipeline([
            ("date_features", date_stage),
            ("column_trans", column_stage),
        ])


In [43]:
# SalesForecaster class for Optuna tuning, model training and prediction

class SalesForecaster:
    """Tunes, trains and applies an XGBoost regressor behind a preprocessing pipeline.

    Attributes:
        X: Training features, passed to ``cross_val_score`` and ``fit``.
        y: Training target.
        pipeline: Preprocessing transformer placed in front of the regressor.
        model: The fitted ``Pipeline`` once ``train_best_model`` has run;
            ``None`` until then.
    """

    # Settings shared by every regressor this class builds.
    _FIXED_MODEL_PARAMS = {"random_state": 42, "tree_method": "hist"}

    def __init__(self, X, y, pipeline):
        self.X = X
        self.y = y
        self.pipeline = pipeline
        # Defined up front so predict() can detect an untrained forecaster
        # instead of failing with an AttributeError.
        self.model = None

    def _make_pipeline(self, params):
        """Chain the preprocessing step with an XGBRegressor built from ``params``."""
        model = XGBRegressor(**params, **self._FIXED_MODEL_PARAMS)
        return Pipeline([
            ('preprocess', self.pipeline),
            ('model', model)
        ])

    def objective(self, trial):
        """Optuna objective: mean MAPE over a 3-split ``TimeSeriesSplit``.

        Args:
            trial: The Optuna trial used to sample hyperparameters.

        Returns:
            The mean absolute percentage error averaged over the folds
            (lower is better).
        """
        params = {
            "n_estimators": trial.suggest_int("n_estimators", 300, 1000),
            "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
            "max_depth": trial.suggest_int("max_depth", 3, 10),
            "subsample": trial.suggest_float("subsample", 0.6, 1.0),
            "colsample_bytree": trial.suggest_float("colsample_bytree", 0.6, 1.0),
        }

        full_pipeline = self._make_pipeline(params)

        cv = TimeSeriesSplit(n_splits=3)
        # greater_is_better=False makes the scorer return negated MAPE.
        scorer = make_scorer(mean_absolute_percentage_error, greater_is_better=False)
        scores = cross_val_score(full_pipeline, self.X, self.y, scoring=scorer, cv=cv, n_jobs=-1)

        # Undo the scorer's negation so the study minimizes a positive error.
        return float(-sum(scores) / len(scores))

    def tune_model(self, n_trials=30, seed=42):
        """Run an Optuna study and return the best hyperparameters found.

        Args:
            n_trials: Number of trials to run.
            seed: Seed for the TPE sampler so repeated runs propose the same
                trials. Pass ``None`` for an unseeded sampler.

        Returns:
            Dict of the best hyperparameters (``study.best_params``).
        """
        sampler = optuna.samplers.TPESampler(seed=seed)
        study = optuna.create_study(direction="minimize", sampler=sampler)
        study.optimize(self.objective, n_trials=n_trials)
        return study.best_params

    def train_best_model(self, best_params):
        """Fit preprocessing + regressor on all of ``X``/``y`` and store it in ``self.model``.

        Args:
            best_params: Keyword arguments for ``XGBRegressor``, e.g. the dict
                returned by ``tune_model``.
        """
        full_pipeline = self._make_pipeline(best_params)
        full_pipeline.fit(self.X, self.y)
        self.model = full_pipeline

    def predict(self, X_test):
        """Predict with the trained pipeline.

        Args:
            X_test: Features in the same raw layout as the training features.

        Returns:
            Whatever the fitted pipeline's ``predict`` returns for ``X_test``.

        Raises:
            RuntimeError: If ``train_best_model`` has not been called yet.
        """
        if self.model is None:
            raise RuntimeError(
                "Model is not trained yet; call train_best_model() before predict()."
            )
        return self.model.predict(X_test)


In [44]:
# Read the competition files, then keep only training rows with a usable target

train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

# A usable target is neither missing nor +/- infinity
train = (
    train
    .loc[lambda df: df["num_sold"].notna()]
    .loc[lambda df: ~df["num_sold"].isin([float("inf"), float("-inf")])]
    .reset_index(drop=True)
)


In [45]:
# Split the training frame into target and features; the test frame only loses its id

y = train["num_sold"]
X = train.drop(columns=["id", "num_sold"])
X_test = test.drop(columns="id")


In [46]:
# Create the (still unfitted) feature-engineering pipeline

feature_builder = Preprocessor()
preprocessor = feature_builder.build_pipeline()


In [48]:
# Search XGBoost hyperparameters with Optuna (10 trials)

forecaster = SalesForecaster(X=X, y=y, pipeline=preprocessor)
best_params = forecaster.tune_model(n_trials=10)
print("Best parameters found by Optuna:", best_params)


[I 2025-04-07 06:16:25,174] A new study created in memory with name: no-name-66f6a2cd-0c86-4ba6-9e84-655e27b836ae
[I 2025-04-07 06:16:56,518] Trial 0 finished with value: 0.1547757342756399 and parameters: {'n_estimators': 445, 'learning_rate': 0.20301062635355238, 'max_depth': 10, 'subsample': 0.8742777149615502, 'colsample_bytree': 0.6119883387624582}. Best is trial 0 with value: 0.1547757342756399.
[I 2025-04-07 06:17:14,448] Trial 1 finished with value: 0.2648089242434666 and parameters: {'n_estimators': 699, 'learning_rate': 0.02950964010108599, 'max_depth': 5, 'subsample': 0.9729935620928176, 'colsample_bytree': 0.6052404640070188}. Best is trial 0 with value: 0.1547757342756399.
[I 2025-04-07 06:17:33,617] Trial 2 finished with value: 0.19594900534738824 and parameters: {'n_estimators': 584, 'learning_rate': 0.15951128070874998, 'max_depth': 6, 'subsample': 0.7763904588227672, 'colsample_bytree': 0.8876194839015616}. Best is trial 0 with value: 0.1547757342756399.
[I 2025-04-07 

Best parameters found by Optuna: {'n_estimators': 589, 'learning_rate': 0.020172541521716283, 'max_depth': 10, 'subsample': 0.6286112437952457, 'colsample_bytree': 0.9417359764841966}


In [49]:
# Fit the full pipeline on all training data with the tuned hyperparameters

forecaster.train_best_model(best_params=best_params)


In [51]:
# Predict on the test set and write the submission file

y_pred = forecaster.predict(X_test)
# A regression model's raw output can dip below zero, but a sales count cannot;
# floor at zero before converting to whole units.
y_pred = y_pred.clip(min=0).round().astype(int)
submission = pd.DataFrame({
    "id": test["id"],
    "num_sold": y_pred
})
submission.to_csv("submission.csv", index=False)
submission.head()


Unnamed: 0,id,num_sold
0,230130,144
1,230131,838
2,230132,786
3,230133,394
4,230134,460
