In [None]:
import numpy as np
import pandas as pd
from pathlib import Path

from sklearn.metrics import mean_absolute_error, mean_squared_error
from lightgbm import LGBMRegressor

import warnings
warnings.filterwarnings("ignore")
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

DATA_DIR = Path(".")

In [None]:
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")
sample = pd.read_csv(DATA_DIR / "sample_submission.csv")
if "Unnamed: 0" in train.columns:
    train = train.rename(columns={"Unnamed: 0": "id"})

print("Train shape:", train.shape)
print("Test shape:", test.shape)

In [None]:
train["period_start_dt"] = pd.to_datetime(
    train["period_start_dt"], format="%Y-%m-%d", errors="coerce"
)
test["period_start_dt"] = pd.to_datetime(
    test["period_start_dt"], format="%d.%m.%Y", errors="coerce"
)

for df_ in (train, test):
    df_["series_id"] = (
        df_["product_rk"].astype(str) + "_" + df_["store_location_rk"].astype(str)
    )

In [None]:
promo_mode = (
    train["PROMO1_FLAG"].mode().iloc[0]
    if "PROMO1_FLAG" in train.columns
    else 0
)
train["PROMO1_FLAG"] = train.get("PROMO1_FLAG", promo_mode).fillna(promo_mode)
if "PROMO1_FLAG" in test.columns:
    test["PROMO1_FLAG"] = test["PROMO1_FLAG"].fillna(promo_mode)
else:
    test["PROMO1_FLAG"] = promo_mode

base_cols = [
    "PRICE_REGULAR",
    "PRICE_AFTER_DISC",
    "AUTORIZATION_FLAG",
    "PROMO2_FLAG",
    "NUM_CONSULTANT",
]

for col in base_cols:
    if col in train.columns:
        train[col] = train.groupby(["product_rk", "store_location_rk"])[col].transform(
            lambda s: s.ffill().bfill()
        )
        prod_med = train.groupby("product_rk")[col].transform("median")
        train[col] = train[col].fillna(prod_med)
        prod_med_map = train.groupby("product_rk")[col].median().to_dict()
        test[col] = test["product_rk"].map(prod_med_map).fillna(0.0).values
    else:
        test[col] = 0.0


In [None]:
test = sample[["id"]].merge(test, on="id", how="left")

test["product_rk"] = (
    test["product_rk"].fillna(method="ffill").fillna(method="bfill")
)
test["store_location_rk"] = (
    test["store_location_rk"].fillna(method="ffill").fillna(method="bfill")
)
test["series_id"] = (
    test["product_rk"].astype(str) + "_" + test["store_location_rk"].astype(str)
)

In [None]:
key_cols = ["product_rk", "store_location_rk", "period_start_dt"]
test_new = test.merge(
    train[key_cols].drop_duplicates().assign(_in_train=1),
    on=key_cols,
    how="left",
)
test_new = test_new[test_new["_in_train"].isna()].drop(columns=["_in_train"])

df = pd.concat([train, test_new], sort=False).reset_index(drop=True)
df = df.sort_values(["series_id", "period_start_dt"]).reset_index(drop=True)

In [None]:
df["week"] = df["period_start_dt"].dt.isocalendar().week.astype("Int64")
df["month"] = df["period_start_dt"].dt.month.astype("Int64")
df["weekday"] = df["period_start_dt"].dt.weekday.astype("Int64")
df["year"] = df["period_start_dt"].dt.year.astype("Int64")

df["week_sin"] = np.sin(2 * np.pi * df["week"] / 52.0)
df["week_cos"] = np.cos(2 * np.pi * df["week"] / 52.0)

In [None]:
df["demand"] = df["demand"].astype(float)
df["demand_log"] = np.log1p(df["demand"].clip(lower=0))

df["demand_rel"] = (
    df.groupby("series_id")["demand"]
    .pct_change()
    .replace([np.inf, -np.inf], 0)
    .fillna(0)
)

In [None]:
lags = [1, 2, 3, 4, 8, 12, 26, 52]

for lag in lags:
    df[f"lag_{lag}"] = df.groupby("series_id")["demand"].shift(lag)
    df[f"log_lag_{lag}"] = df.groupby("series_id")["demand_log"].shift(lag)
    df[f"rel_lag_{lag}"] = df.groupby("series_id")["demand_rel"].shift(lag)

for w in [4, 8, 12]:
    df[f"roll_mean_{w}"] = (
        df.groupby("series_id")["demand"]
        .shift(1)
        .rolling(w, min_periods=1)
        .mean()
    )
    df[f"roll_median_{w}"] = (
        df.groupby("series_id")["demand"]
        .shift(1)
        .rolling(w, min_periods=1)
        .median()
    )
    df[f"rel_roll_mean_{w}"] = (
        df.groupby("series_id")["demand_rel"]
        .shift(1)
        .rolling(w, min_periods=1)
        .mean()
    )

In [None]:
if "PRICE_REGULAR" in df.columns and "PRICE_AFTER_DISC" in df.columns:
    df["PRICE_REGULAR"] = df["PRICE_REGULAR"].replace(0, np.nan)
    df["price_ratio"] = df["PRICE_AFTER_DISC"] / df["PRICE_REGULAR"]
    df["price_ratio"] = (
        df["price_ratio"].replace([np.inf, -np.inf], 1.0).fillna(1.0)
    )
else:
    df["price_ratio"] = 1.0

df["promo_discount"] = df["PROMO1_FLAG"] * (
    1.0 - df["price_ratio"].clip(upper=1.5)
)

df["promo_prev_mean_4"] = (
    df.groupby("series_id")["PROMO1_FLAG"]
    .shift(1)
    .rolling(4, min_periods=1)
    .mean()
)

In [None]:
series_stats = (
    train.groupby("series_id")["demand"]
    .agg(["median", "mean", "count"])
    .rename(
        columns={
            "median": "series_median",
            "mean": "series_mean",
            "count": "series_count",
        }
    )
)
df = df.merge(series_stats, on="series_id", how="left")

prod_stats = (
    train.groupby("product_rk")["demand"]
    .agg(["median", "mean"])
    .rename(
        columns={
            "median": "prod_median",
            "mean": "prod_mean",
        }
    )
)
df = df.merge(prod_stats, on="product_rk", how="left")

global_med = train["demand"].median()
df["series_median"] = df["series_median"].fillna(global_med)
df["series_mean"] = df["series_mean"].fillna(global_med)
df["prod_median"] = df["prod_median"].fillna(global_med)
df["prod_mean"] = df["prod_mean"].fillna(global_med)
df["series_count"] = df["series_count"].fillna(0)

In [None]:
base_fill_cols = [
    "month",
    "weekday",
    "week",
    "year",
    "week_sin",
    "week_cos",
    "PRICE_REGULAR",
    "PRICE_AFTER_DISC",
    "price_ratio",
    "PROMO1_FLAG",
    "PROMO2_FLAG",
    "AUTORIZATION_FLAG",
    "NUM_CONSULTANT",
    "promo_prev_mean_4",
    "promo_discount",
    "series_median",
    "series_mean",
    "prod_median",
    "prod_mean",
]

lag_cols = (
    [f"lag_{l}" for l in lags]
    + [f"log_lag_{l}" for l in lags]
    + [f"rel_lag_{l}" for l in lags]
    + [f"roll_mean_{w}" for w in [4, 8, 12]]
    + [f"roll_median_{w}" for w in [4, 8, 12]]
    + [f"rel_roll_mean_{w}" for w in [4, 8, 12]]
)

feature_cols = base_fill_cols + lag_cols

for c in feature_cols:
    if c in df.columns:
        df[c] = df.groupby("series_id")[c].transform(
            lambda s: s.ffill().bfill()
        )
        df[c] = df[c].fillna(df["prod_median"])
        df[c] = df[c].fillna(df["series_median"])
        df[c] = df[c].fillna(0.0)
    else:
        df[c] = 0.0

In [None]:
train_proc = df[df["demand"].notna()].copy()
test_proc = df[df["demand"].isna()].copy()

print(f"Processed: train rows {len(train_proc)}, test rows {len(test_proc)}")

X_all = train_proc[feature_cols].copy()
y_all = train_proc["demand"].astype(float)

In [None]:
train_proc_sorted = train_proc.sort_values("period_start_dt")
split_date = train_proc_sorted["period_start_dt"].quantile(0.85)

mask_tr = train_proc_sorted["period_start_dt"] < split_date
mask_val = ~mask_tr

X_tr = train_proc_sorted.loc[mask_tr, feature_cols]
y_tr = train_proc_sorted.loc[mask_tr, "demand"].astype(float)

X_val = train_proc_sorted.loc[mask_val, feature_cols]
y_val = train_proc_sorted.loc[mask_val, "demand"].astype(float)

y_tr_log = np.log1p(y_tr)
y_val_log = np.log1p(y_val)
y_all_log = np.log1p(y_all)

In [None]:
lgb_params = {
    "objective": "regression",
    "learning_rate": 0.04,
    "n_estimators": 2000,
    "num_leaves": 80,
    "min_data_in_leaf": 20,
    "feature_fraction": 0.9,
    "bagging_fraction": 0.8,
    "bagging_freq": 4,
    "lambda_l1": 0.1,
    "lambda_l2": 0.2,
    "random_state": RANDOM_STATE,
}

lgb_model = LGBMRegressor(**lgb_params)

print("Training LightGBM...")
lgb_model.fit(X_tr, y_tr_log)
print("LightGBM training done.")

val_pred_log = lgb_model.predict(X_val)
val_pred = np.expm1(val_pred_log)

val_mae = mean_absolute_error(y_val, val_pred)
val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))

print(f"[LGBM] Validation MAE:  {val_mae:.3f}")
print(f"[LGBM] Validation RMSE: {val_rmse:.3f}")

In [None]:
lgb_final = LGBMRegressor(**lgb_params)
print("Training final LGBM on full train...")
lgb_final.fit(X_all, y_all_log)
print("Final training done.")

In [None]:
X_test = test_proc[feature_cols].copy()
test_pred_log = lgb_final.predict(X_test)
test_pred = np.expm1(test_pred_log)

prod_q = train_proc.groupby("product_rk")["demand"].quantile(0.995).to_dict()
prod_upper = (
    test_proc["product_rk"]
    .map(prod_q)
    .fillna(train_proc["demand"].quantile(0.99))
    .values
)
test_pred = np.minimum(test_pred, prod_upper)

short_mask = test_proc["series_count"].fillna(0) < 2
test_pred[short_mask.values] = (
    test_proc.loc[short_mask, "prod_median"]
    .fillna(global_med)
    .values
)

test_pred = np.maximum(0, test_pred)
test_pred = np.round(test_pred).astype(int)

In [2]:
submission = test_proc[["id"]].copy().reset_index(drop=True)
submission["predicted"] = test_pred

submission = sample[["id"]].merge(submission, on="id", how="left")
submission["predicted"] = (
    submission["predicted"].fillna(global_med).astype(int)
)

out_path = DATA_DIR / "submission_lgb_final.csv"
submission.to_csv(out_path, index=False)
print("Saved:", out_path)
print(submission.head())

Train shape: (35344, 11)
Test shape: (1404, 5)
Processed: train rows 34144, test rows 1200
Training LightGBM...
LightGBM training done.
[LGBM] Validation MAE:  4.591
[LGBM] Validation RMSE: 7.569
Training final LGBM on full train...
Final training done.
Saved: submission_lgb_final.csv
    id  predicted
0  908          6
1  909          8
2  910          5
3  911          3
4  912          2
