benchmark_1.csv
48.267

benckmark_2.csv
52.759

Public score: 89.656

Private score: 86.008

In [215]:
from google.colab import drive
drive.mount('/content/drive')

!pip install catboost -q

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [216]:
!pip -q install --upgrade --force-reinstall --no-deps kaggle > log  # upgrade kaggle package (to avoid a warning)
!mkdir -p ~/.kaggle                                           # .kaggle folder must contain kaggle.json for kaggle executable to properly authenticate you to Kaggle.com
!cp /content/drive/MyDrive/kaggle.json ~/.kaggle/kaggle.json >log  # First, download kaggle.json from kaggle.com (in Account page) and place it in the root of mounted Google Drive
!cp kaggle.json ~/.kaggle/kaggle.json > log                   # Alternative location of kaggle.json (without a connection to Google Drive)
!chmod 600 ~/.kaggle/kaggle.json                      # give only the owner full read/write access to kaggle.json
!kaggle config set -n competition -v  dscs-25-hw3       # set the competition context for the next few kaggle API calls. !kaggle config view - shows current settings
!kaggle competitions download >> log                          # download competition dataset as a zip file
!unzip -o *.zip >> log                                        # Kaggle dataset is copied as a single file and needs to be unzipped.
lb =!kaggle competitions leaderboard --show                   # print public leaderboard

cp: cannot stat 'kaggle.json': No such file or directory
- competition is now set to: dscs-25-hw3


In [217]:
%%time
%%capture
%reset -f
from IPython.core.interactiveshell import InteractiveShell as IS; IS.ast_node_interactivity = "all"
import numpy as np, pandas as pd, time, warnings
from collections import defaultdict, deque
warnings.filterwarnings('ignore')

class Timer():
    def __init__(self, lim=60):
        self.t0, self.lim = time.time(), lim
        print(f'⏳ started. You have {lim} sec. Good luck!')
    def ShowTime(self):
        elapsed = time.time() - self.t0
        msg = f'Runtime is {elapsed:.0f} sec'
        if elapsed > self.lim:
            print(f'\033[91m\033[1m{msg} > {self.lim} sec limit!!!\033[0m')
        else:
            print(msg)

np.set_printoptions(linewidth=100, precision=2, suppress=True)
pd.set_option('display.max_columns', 20, 'display.precision', 2)

CPU times: user 310 ms, sys: 18.3 ms, total: 328 ms
Wall time: 538 ms


In [218]:
tmr = Timer(120)

⏳ started. You have 120 sec. Good luck!


In [219]:
from catboost import CatBoostRegressor
from sklearn.metrics import mean_absolute_error

In [220]:
def smape(y_true, y_pred):
    y_true = np.array(y_true, dtype=float)
    y_pred = np.array(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    diff = np.abs(y_true - y_pred) / denom
    diff[denom == 0] = 0.0
    return 100 * np.mean(diff)

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
sample_sub = pd.read_csv('sample_submission.csv')

train["period_start_dt"] = pd.to_datetime(train["period_start_dt"])
try:
    test["period_start_dt"] = pd.to_datetime(test["period_start_dt"])
except:
    test["period_start_dt"] = pd.to_datetime(test["period_start_dt"], format="%d.%m.%Y")

In [221]:
for df in [train, test]:
    df["year"] = df["period_start_dt"].dt.year
    df["month"] = df["period_start_dt"].dt.month
    df["weekofyear"] = df["period_start_dt"].dt.isocalendar().week.astype(int)
    df["quarter"] = df["period_start_dt"].dt.quarter
    df["dayofyear"] = df["period_start_dt"].dt.dayofyear
    df["dow"] = df["period_start_dt"].dt.dayofweek
    df["month_sin"] = np.sin(2 * np.pi * df["month"] / 12)
    df["month_cos"] = np.cos(2 * np.pi * df["month"] / 12)
    df["week_sin"] = np.sin(2 * np.pi * df["weekofyear"] / 52)
    df["week_cos"] = np.cos(2 * np.pi * df["weekofyear"] / 52)
    df["doy_sin"] = np.sin(2 * np.pi * df["dayofyear"] / 365)
    df["doy_cos"] = np.cos(2 * np.pi * df["dayofyear"] / 365)
    df["is_december"] = (df["month"] == 12).astype(int)
    df["is_january"] = (df["month"] == 1).astype(int)

In [222]:
global_mean = train['demand'].mean()
global_std = train['demand'].std()

agg_configs = [
    (["product_rk"], "p"),
    (["store_location_rk"], "s"),
    (["product_rk", "store_location_rk"], "ps"),
    (["product_rk", "weekofyear"], "pw"),
    (["store_location_rk", "weekofyear"], "sw"),
    (["product_rk", "month"], "pm"),
]

for keys, prefix in agg_configs:
    agg = train.groupby(keys)['demand'].agg(['mean', 'median', 'std']).reset_index()
    agg.columns = keys + [f"{prefix}_mean", f"{prefix}_median", f"{prefix}_std"]
    train = train.merge(agg, on=keys, how='left')
    test = test.merge(agg, on=keys, how='left')

for col in train.columns:
    if col.endswith(('_mean', '_median', '_std')):
        fill_val = global_mean if not col.endswith('_std') else global_std
        train[col] = train[col].fillna(fill_val)
        test[col] = test[col].fillna(fill_val)

print(f"After aggregations - Train: {train.shape}, Test: {test.shape}")

After aggregations - Train: (35344, 43), Test: (1404, 37)


In [223]:
train = train.sort_values(['product_rk', 'store_location_rk', 'period_start_dt']).reset_index(drop=True)
train['demand_raw'] = train['demand']

lags = [1, 2, 3, 7, 14, 28]
windows = [7, 14, 28]

for lag in lags:
    train[f'lag_{lag}'] = train.groupby(['product_rk', 'store_location_rk'])['demand_raw'].shift(lag)

for w in windows:
    train[f'roll_mean_{w}'] = train.groupby(['product_rk', 'store_location_rk'])['demand_raw'].transform(
        lambda x: x.shift(1).rolling(w, min_periods=1).mean()
    )
    train[f'roll_std_{w}'] = train.groupby(['product_rk', 'store_location_rk'])['demand_raw'].transform(
        lambda x: x.shift(1).rolling(w, min_periods=1).std().fillna(0)
    )

for alpha in [0.1, 0.3]:
    train[f'ema_{int(alpha*10)}'] = train.groupby(['product_rk', 'store_location_rk'])['demand_raw'].transform(
        lambda x: x.shift(1).ewm(alpha=alpha).mean()
    )

train['trend'] = train.groupby(['product_rk', 'store_location_rk']).cumcount()

In [224]:
lag_cols = [f'lag_{l}' for l in lags]
roll_cols = [f'roll_mean_{w}' for w in windows] + [f'roll_std_{w}' for w in windows]
ema_cols = ['ema_1', 'ema_3']

feature_columns = [
    'product_rk', 'store_location_rk', 'year', 'month', 'weekofyear', 'quarter',
    'month_sin', 'month_cos', 'week_sin', 'week_cos', 'doy_sin', 'doy_cos',
    'is_december', 'is_january', 'dow',
    'p_mean', 'p_median', 'p_std',
    's_mean', 's_median', 's_std',
    'ps_mean', 'ps_median', 'ps_std',
    'pw_mean', 'sw_mean', 'pm_mean',
    'trend'
] + lag_cols + roll_cols + ema_cols

feature_columns = [f for f in feature_columns if f in train.columns]
cat_features = ['product_rk', 'store_location_rk', 'year', 'month', 'weekofyear', 'quarter', 'dow']
cat_features = [c for c in cat_features if c in feature_columns]

for col in lag_cols + roll_cols + ema_cols:
    if col in train.columns:
        group_fill = train.groupby(['product_rk', 'store_location_rk'])[col].transform('mean')
        train[col] = train[col].fillna(group_fill)
        train[col] = train[col].fillna(global_mean)

for col in feature_columns:
    if col in train.columns:
        if train[col].isna().any():
            if col in cat_features:
                train[col] = train[col].fillna(0).astype(int)
            else:
                train[col] = train[col].fillna(global_mean)

if 'demand_raw' not in train.columns:
    train['demand_raw'] = train['demand']
train['demand_raw'] = train['demand_raw'].fillna(global_mean)

print(f"Features: {len(feature_columns)}, Training samples: {len(train)}")
print(f"NaN check - Target: {train['demand_raw'].isna().sum()}, Features: {train[feature_columns].isna().sum().sum()}")

Features: 42, Training samples: 35344
NaN check - Target: 0, Features: 0


In [225]:
VALID_DAYS = 28
max_date = train["period_start_dt"].max()
val_start = max_date - pd.Timedelta(days=VALID_DAYS)

train_ds = train[train["period_start_dt"] < val_start].reset_index(drop=True)
val_ds = train[train["period_start_dt"] >= val_start].reset_index(drop=True)

train_ds = train_ds[train_ds['demand_raw'].notna()].reset_index(drop=True)
val_ds = val_ds[val_ds['demand_raw'].notna()].reset_index(drop=True)

for col in feature_columns:
    if col in train_ds.columns:
        train_ds[col] = train_ds[col].fillna(global_mean)
    if col in val_ds.columns:
        val_ds[col] = val_ds[col].fillna(global_mean)

print(f"Train size: {len(train_ds)}, Val size: {len(val_ds)}")
print(f"Train NaNs - Target: {train_ds['demand_raw'].isna().sum()}, Features: {train_ds[feature_columns].isna().sum().sum()}")
print(f"Val NaNs - Target: {val_ds['demand_raw'].isna().sum()}, Features: {val_ds[feature_columns].isna().sum().sum()}")

Train size: 34144, Val size: 1200
Train NaNs - Target: 0, Features: 0
Val NaNs - Target: 0, Features: 0


In [226]:
model = CatBoostRegressor(
    loss_function='MAE',
    iterations=2000,
    learning_rate=0.02,
    depth=9,
    l2_leaf_reg=4,
    random_strength=1.2,
    bootstrap_type='Bayesian',
    bagging_temperature=1.0,
    random_seed=42,
    early_stopping_rounds=100,
    verbose=False
)

model.fit(
    train_ds[feature_columns],
    train_ds['demand_raw'],
    eval_set=(val_ds[feature_columns], val_ds['demand_raw']),
    cat_features=cat_features,
    verbose=False
)

val_pred = model.predict(val_ds[feature_columns])
print(f"Validation MAE: {mean_absolute_error(val_ds['demand_raw'], val_pred):.4f}")
print(f"Validation SMAPE: {smape(val_ds['demand_raw'], val_pred):.4f}")

<catboost.core.CatBoostRegressor at 0x7d8f4f4870e0>

Validation MAE: 11.4318
Validation SMAPE: 77.6268


In [227]:
max_history = max(lags + windows + [56])

queues = defaultdict(lambda: deque(maxlen=max_history))
for (p, s), grp in train.sort_values("period_start_dt").groupby(['product_rk', 'store_location_rk']):
    vals = list(grp['demand_raw'].tail(max_history))
    q = deque(maxlen=max_history)
    for v in vals:
        q.append(float(v))
    queues[(p, s)] = q

group_means = train.groupby(['product_rk', 'store_location_rk'])['demand_raw'].mean().to_dict()
prod_means = train.groupby('product_rk')['demand_raw'].mean().to_dict()

print(f"Initialized {len(queues)} time series queues")

Initialized 245 time series queues


In [228]:
def build_features_from_queue(p, s, row, queues, group_means, prod_means, global_mean):
    q = queues.get((p, s), deque([], maxlen=max_history))
    arr = list(q)

    fallback = group_means.get((p, s), prod_means.get(p, global_mean))

    feat = {}

    for lag in lags:
        if len(arr) >= lag:
            feat[f'lag_{lag}'] = arr[-lag]
        else:
            feat[f'lag_{lag}'] = fallback

    for w in windows:
        take = arr[-w:] if len(arr) >= 1 else []
        feat[f'roll_mean_{w}'] = np.mean(take) if len(take) else fallback
        feat[f'roll_std_{w}'] = np.std(take, ddof=0) if len(take) > 1 else 0.0

    feat['ema_1'] = arr[-1] if len(arr) else fallback

    feat['trend'] = len(arr)

    return feat

In [229]:
test = test.sort_values('period_start_dt').reset_index(drop=True)
test['forecast'] = np.nan
unique_dates = sorted(test['period_start_dt'].unique())

for current_date in unique_dates:
    mask = test['period_start_dt'] == current_date
    batch = test[mask].copy()

    if batch.shape[0] == 0:
        continue

    X_rows = []
    idxs = []

    for idx, row in batch.iterrows():
        p = row['product_rk']
        s = row['store_location_rk']

        feat = build_features_from_queue(p, s, row, queues, group_means, prod_means, global_mean)

        for f in feature_columns:
            if f not in feat:
                feat[f] = row.get(f, global_mean if 'mean' in f else 0)

        X_rows.append(feat)
        idxs.append(idx)

    X_df = pd.DataFrame(X_rows)[feature_columns]

    for c in cat_features:
        if c in X_df.columns:
            X_df[c] = X_df[c].astype(int)

    preds = model.predict(X_df)

    for i, pval in zip(idxs, preds):
        pval = max(0, pval)
        test.loc[i, 'forecast'] = pval
        key = (test.loc[i, 'product_rk'], test.loc[i, 'store_location_rk'])
        queues[key].append(float(pval))

print(f"Generated {test['forecast'].notna().sum()} predictions")

Generated 1404 predictions


In [230]:
test['forecast'] = test['forecast'].fillna(global_mean).clip(lower=0)
test['forecast'] = np.round(test['forecast']).astype(int)

submission = sample_sub.copy()
submission = submission.merge(test[['id', 'forecast']], on='id', how='left')
submission['predicted'] = submission['forecast'].fillna(int(round(global_mean))).astype(int)
submission[['id', 'predicted']].to_csv('submission.csv', index=False)

print("Submission file created!")
print(f"Predictions range: [{submission['predicted'].min()}, {submission['predicted'].max()}]")
print(f"Mean prediction: {submission['predicted'].mean():.2f}")

!kaggle competitions submit -c dscs-25-hw3 -f submission.csv -m "Message"

Submission file created!
Predictions range: [3, 37]
Mean prediction: 11.47
100% 9.57k/9.57k [00:00<00:00, 16.1kB/s]
Successfully submitted to DSCS_25_HW3

In [231]:
tmr.ShowTime()

Runtime is 43 sec
