# Deal Prediction Modelling 

Importing required libraries

In [1]:
!pip -q install xgboost

import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from datetime import timedelta
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from pathlib import Path
import sys, subprocess

In [2]:
# Loading the dataset
output = Path(r"C:/Users/pmayr/Downloads/Output")
data_path = output/"staged_features_events_brands_size.csv"

In [3]:
df = pd.read_csv(data_path)
if "scrape_date" not in df.columns:
    df["scrape_date"] = pd.to_datetime(df["scrape_date_str"], errors = "coerce")

  df = pd.read_csv(data_path)


In [4]:
#Keeping only yhe rows withe the sku and  date
df = df.dropna(subset=["sku", "scrape_date"]).copy()

# fill discounts safely
if "discount_percentage" in df.columns:
    df["discount_percentage"] = df["discount_percentage"].fillna(0.0)
if "discount_pct_filled" in df.columns:
    df["discount_pct_filled"] = df["discount_pct_filled"].fillna(0.0)

# cast some categoricals (optional)
for col in ["brand_tier", "size_band", "season", "category"]:
    if col in df.columns:
        df[col] = df[col].astype("category")

print(df.shape, df[["sku","scrape_date","category"]].head(3))

(47503, 36)        sku scrape_date category
0  8371390  2025-04-02   Easter
1  7473849  2025-04-02   Easter
2  5726070  2025-04-02   Easter


In [18]:
# ===== LABELS with fallback to next observation =====
def build_forward_labels_with_fallback(sub: pd.DataFrame) -> pd.DataFrame:
    sub = sub.sort_values("scrape_date").reset_index(drop=True)
    n = len(sub)
    next_days = np.full(n, np.nan, dtype=float)
    next_pct  = np.full(n, np.nan, dtype=float)

    # Define promo condition robustly
    has_disc_col = "discount_pct_filled" in sub.columns
    on_promo = (
        (sub["is_on_promo"] == 1)
        | (sub["discount_percentage"].fillna(0) > 0 if "discount_percentage" in sub.columns else False)
        | (sub["discount_pct_filled"].fillna(0) > 0 if has_disc_col else False)
    ).to_numpy()

    promo_idx = np.where(on_promo)[0]

    for i in range(n - 1):  # last row can’t have a future label
        # 1) try next promo strictly after i
        j_candidates = promo_idx[promo_idx > i]
        if len(j_candidates) > 0:
            j = j_candidates[0]
        else:
            # 2) fallback to the very next observation
            j = i + 1

        # days until j
        next_days[i] = (sub.loc[j, "scrape_date"] - sub.loc[i, "scrape_date"]).days

        # discount % at j (prefer filled)
        if has_disc_col and pd.notna(sub.loc[j, "discount_pct_filled"]):
            next_pct[i] = float(sub.loc[j, "discount_pct_filled"])
        elif "discount_percentage" in sub.columns and pd.notna(sub.loc[j, "discount_percentage"]):
            next_pct[i] = float(sub.loc[j, "discount_percentage"])
        else:
            next_pct[i] = 0.0  # safe fallback

    sub["y_days_to_next_discount"] = next_days
    sub["y_next_discount_pct"]     = next_pct
    return sub

# rebuild panel with the new labels
panel = df.groupby("sku", group_keys=False).apply(build_forward_labels_with_fallback)
print("[labels] non-null rates:",
      "days", panel["y_days_to_next_discount"].notna().mean(),
      "| pct", panel["y_next_discount_pct"].notna().mean())


[labels] non-null rates: days 0.49889480664379093 | pct 0.49889480664379093


  panel = df.groupby("sku", group_keys=False).apply(build_forward_labels_with_fallback)


In [10]:
# ===== 4) FEATURES: NUMERIC + LAGS + COMPACT CAT CODES (no one-hot) =====
# numeric features to use
num_cols = [c for c in [
    "b_price","item_price","original_price","b_unit_price","item_unit_price",
    "price_gap","unit_price_gap",
    "discount_pct_filled"
] if c in panel.columns]

# create lags
panel = panel.sort_values(["sku","scrape_date"]).copy()
if "item_price" in panel.columns:
    panel["item_price_lag1"]    = panel.groupby("sku")["item_price"].shift(1)
    panel["pct_chg_item_price"] = panel["item_price"] / panel["item_price_lag1"] - 1.0
    num_cols += ["item_price_lag1","pct_chg_item_price"]

if "discount_pct_filled" in panel.columns:
    panel["disc_pct_filled_lag1"] = panel.groupby("sku")["discount_pct_filled"].shift(1)
    num_cols += ["disc_pct_filled_lag1"]

# ---- compact categorical encoding ----
cat_cols = [c for c in ["brand_tier","size_band","season","category"] if c in panel.columns]

def add_compact_codes(df, col, top_k=30):
    # keep only top_k frequent levels; others -> "OTHER"
    s = df[col].astype(str)
    vc = s.value_counts(dropna=False)
    keep = set(vc.head(top_k).index)
    safe = s.where(s.isin(keep), "OTHER").astype("category")
    code_col = f"{col}_code"
    df[code_col] = safe.cat.codes.astype("int16")   # compact integer code
    return code_col

code_cols = []
for c in cat_cols:
    code_cols.append(add_compact_codes(panel, c, top_k=30))

num_cols += code_cols

# build X/Y with compact numeric features only (no get_dummies)
X = panel[num_cols].astype("float32")   # float32 keeps memory low
y_days = panel["y_days_to_next_discount"]
y_disc = panel["y_next_discount_pct"]

# sanity
assert X.index.equals(panel.index), "X and panel indices must match."
print("[X shape]", X.shape)
print("Categorical codes added:", code_cols)


[X shape] (47503, 13)
Categorical codes added: ['brand_tier_code', 'size_band_code', 'season_code', 'category_code']


In [14]:
# ===== 5) TEMPORAL SPLIT =====
last_date   = panel["scrape_date"].max()
val_cutoff  = last_date - pd.Timedelta(days=14)
test_cutoff = last_date - pd.Timedelta(days=7)

mask_train = panel["scrape_date"] <  val_cutoff
mask_val   = (panel["scrape_date"] >= val_cutoff) & (panel["scrape_date"] < test_cutoff)
mask_test  = panel["scrape_date"] >= test_cutoff

m_days = y_days.notna()
m_disc = y_disc.notna()

# indices (aligned to panel.index)
mask_tr_days  = (mask_train & m_days).to_numpy()
mask_va_days  = (mask_val   & m_days).to_numpy()
mask_te_days  = (mask_test  & m_days).to_numpy()

mask_tr_disc  = (mask_train & m_disc).to_numpy()
mask_va_disc  = (mask_val   & m_disc).to_numpy()
mask_te_disc  = (mask_test  & m_disc).to_numpy()

print("Days mask counts:",
      mask_tr_days.sum(), mask_va_days.sum(), mask_te_days.sum())
print("Pct  mask counts:",
      mask_tr_disc.sum(), mask_va_disc.sum(), mask_te_disc.sum())

Days mask counts: 6282 36 125
Pct  mask counts: 6282 36 125


In [15]:
def sanitize_with_mask(X_df: pd.DataFrame, y_ser: pd.Series, mask: np.ndarray):
    # 1) quick boolean slice by mask (no index alignment cost)
    Xc = X_df[mask]
    yc = y_ser[mask]

    # 2) drop rows with non-finite labels
    yv = yc.to_numpy()
    m_y = np.isfinite(yv)
    Xc = Xc.iloc[m_y]
    yc = yc.iloc[m_y]

    # 3) now cast to float32 (AFTER filtering to save memory)
    Xc = Xc.astype(np.float32, copy=False)
    yc = yc.astype(np.float32, copy=False)

    # 4) replace inf -> NaN then drop any row with NaN/inf in X
    Xc = Xc.replace([np.inf, -np.inf], np.nan)
    yc = yc.replace([np.inf, -np.inf], np.nan)

    mX = np.isfinite(Xc.to_numpy()).all(axis=1)
    Xc = Xc.iloc[mX]
    yc = yc.iloc[mX]

    # 5) safety
    assert np.isfinite(yc.to_numpy()).all(), "Label still non-finite."
    assert np.isfinite(Xc.to_numpy()).all(), "Features still non-finite."
    return Xc, yc

# Build clean matrices using masks (NO .loc with giant indexers)
Xd_tr, yd_tr = sanitize_with_mask(X, y_days, mask_tr_days)
Xd_va, yd_va = sanitize_with_mask(X, y_days, mask_va_days)
Xd_te, yd_te = sanitize_with_mask(X, y_days, mask_te_days)

Xr_tr, yr_tr = sanitize_with_mask(X, y_disc, mask_tr_disc)
Xr_va, yr_va = sanitize_with_mask(X, y_disc, mask_va_disc)
Xr_te, yr_te = sanitize_with_mask(X, y_disc, mask_te_disc)

print("Days (clean):", Xd_tr.shape, Xd_va.shape, Xd_te.shape)
print("Pct  (clean):", Xr_tr.shape, Xr_va.shape, Xr_te.shape)

Days (clean): (0, 13) (0, 13) (0, 13)
Pct  (clean): (0, 13) (0, 13) (0, 13)


In [17]:
# ===== 7) TRAIN & EVALUATE XGBOOST =====
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error, r2_score

xgb_days = XGBRegressor(
    n_estimators=600, max_depth=8, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
    tree_method="hist", random_state=42
)
xgb_disc = XGBRegressor(
    n_estimators=600, max_depth=8, learning_rate=0.05,
    subsample=0.8, colsample_bytree=0.8, reg_lambda=1.0,
    tree_method="hist", random_state=42
)

if len(yd_tr) == 0 or len(yr_tr) == 0:
    print("No training rows after cleaning; check label creation or split windows.")

xgb_days.fit(Xd_tr, yd_tr)
pred_days_va = np.clip(xgb_days.predict(Xd_va), 0, None)
pred_days_te = np.clip(xgb_days.predict(Xd_te), 0, None)

print("\n[Time-to-next-discount]")
print(" VAL  MAE:", mean_absolute_error(yd_va, pred_days_va).round(2),
      "| R2:", r2_score(yd_va, pred_days_va).round(3))
print(" TEST MAE:", mean_absolute_error(yd_te, pred_days_te).round(2),
      "| R2:", r2_score(yd_te, pred_days_te).round(3))

xgb_disc.fit(Xr_tr, yr_tr)
pred_disc_va = np.clip(xgb_disc.predict(Xr_va), 0, 100)
pred_disc_te = np.clip(xgb_disc.predict(Xr_te), 0, 100)

print("\n[Next-discount-%]")
print(" VAL  MAE:", mean_absolute_error(yr_va, pred_disc_va).round(2),
      "| R2:", r2_score(yr_va, pred_disc_va).round(3))
print(" TEST MAE:", mean_absolute_error(yr_te, pred_disc_te).round(2),
      "| R2:", r2_score(yr_te, pred_disc_te).round(3))


No training rows after cleaning; check label creation or split windows.


  bst.update(dtrain, iteration=i, fobj=obj)



[Time-to-next-discount]


ValueError: Found array with 0 sample(s) (shape=(0,)) while a minimum of 1 is required.