
# Smart Product Pricing — **Enhanced Text-Only+Stacking** Notebook ✅

This notebook upgrades your baseline to target **SMAPE ≤ 47** by implementing:
- **Per-unit target** training (then rescale to price).
- A parallel **TF-IDF → Ridge** branch and **blend** with your MiniLM+numeric model.
- **Brand target encoding (fold-wise)**, **keyword flags**, and improved **quantity parsing** (ranges, counts, bulk cues).
- **Stratified CV on log(per-unit)** bins.
- Tuned **XGBoost** params for the MiniLM+numeric branch.
- Optional **MiniLM KMeans clusters** as weak categories (one-hot bucketization).

> Paths are configurable; the code expects `dataset/train.csv` (and optional `dataset/test.csv` with the same columns but without `price`).


In [None]:

# =========================
# 0) Environment & Installs
# =========================
# (Run only once per environment)

import sys
# Lightweight core libs
!pip -q install numpy pandas scikit-learn scipy xgboost==2.1.1

# Text models & vectorizers
# Adjust CUDA wheel as needed (or remove extra-index-url for CPU-only)
!pip -q install sentence-transformers==3.0.1 transformers==4.44.2 torch --extra-index-url https://download.pytorch.org/whl/cu121

# Optional progress bars
!pip -q install tqdm


In [None]:

# =============================
# 1) Imports, Config & Utilities
# =============================
import os, re, math, gc, json, warnings
from pathlib import Path
from typing import Tuple, List, Dict
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.cluster import MiniBatchKMeans
from scipy import sparse
import xgboost as xgb
from tqdm import tqdm

# Silence every library warning so the notebook output stays readable.
warnings.filterwarnings("ignore")

# ---------- Paths ----------
DATA_DIR = Path("dataset")
# columns: sample_id, catalog_content, image_link, price
TRAIN_CSV = DATA_DIR.joinpath("train.csv")
# optional: same columns except price
TEST_CSV = DATA_DIR.joinpath("test.csv")

# Generated artifacts are written here; the directory is created on first run.
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Seed and cross-validation fold count.
RANDOM_STATE = 42
N_SPLITS = 5

def smape(y_true, y_pred, eps=1e-9):
    """Return the symmetric mean absolute percentage error, in percent.

    Each absolute error is divided by the mean of ``|y_true|`` and
    ``|y_pred|``; ``eps`` is added to that sum before halving so the
    denominator is never zero.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    half_scale = (np.abs(actual) + np.abs(forecast) + eps) / 2.0
    ratios = np.abs(forecast - actual) / half_scale
    return 100.0 * np.mean(ratios)

# Reached only if the imports, configuration and helper definitions above ran without error.
print("Environment ready.")


In [None]:

# ==================
# 2) Load the data 📥
# ==================
# Fail fast with a real exception: an `assert` is stripped under `python -O`,
# which would turn a missing file into a less obvious pandas error.
if not TRAIN_CSV.exists():
    raise FileNotFoundError(f"Missing {TRAIN_CSV}")
train_df = pd.read_csv(TRAIN_CSV)

# These columns are read unconditionally by the cells below.
_missing_cols = {"sample_id", "catalog_content", "price"} - set(train_df.columns)
if _missing_cols:
    raise ValueError(f"{TRAIN_CSV} is missing required columns: {sorted(_missing_cols)}")

# The test split is optional; later cells branch on `has_test`.
has_test = TEST_CSV.exists()
test_df  = pd.read_csv(TEST_CSV) if has_test else None

print(train_df.head(3))
print("Train shape:", train_df.shape)
if has_test:
    print("Test shape:", test_df.shape)


In [None]:

# =============================================
# 3) Basic text normalization & brand extraction
# =============================================
def normalize_text(s: str) -> str:
    """Collapse every whitespace run in *s* to one space and trim both ends.

    Anything that is not a ``str`` (``None``, NaN, numbers) yields ``""``.
    """
    if isinstance(s, str):
        # `\s+` already covers "\n" and "\r", so a single substitution is enough.
        return re.sub(r"\s+", " ", s).strip()
    return ""

def extract_brand(text: str) -> str:
    """Derive a heuristic, lower-cased brand key from a catalog entry.

    The key is the text following ``item name:`` (or the whole text when that
    marker is absent), cut at the first ``" by "`` and the first comma,
    stripped of characters other than ASCII letters, digits, ``&``, ``+``,
    ``-`` and whitespace, and limited to 50 characters.

    The key is always lower-case. Previously the fallback path (no
    ``item name:`` marker) kept the original casing while the matched path was
    lower-cased, so the same brand could end up under two different keys.

    Returns ``"~na~"`` for non-string input or when nothing usable remains.
    """
    if not isinstance(text, str):
        return "~na~"
    lowered = text.lower()
    m = re.search(r"item name:\s*(.+)$", lowered, flags=re.MULTILINE)
    cand = lowered if m is None else m.group(1)
    cand = cand.split(" by ")[0].split(",")[0]
    cand = re.sub(r"[^a-z0-9&+\-\s]", " ", cand)
    cand = re.sub(r"\s+", " ", cand).strip()
    # The 50-char cut can land right after a space; trim it so keys compare equal.
    cand = cand[:50].rstrip()
    return cand or "~na~"

# Clean the catalog text and derive the brand key for every available split.
for df in [train_df] + ([test_df] if has_test else []):
    # Fill missing values *before* casting: `astype(str)` alone turns NaN into
    # the literal text "nan", which would then be embedded and tokenized as if
    # it were real catalog content.
    df["catalog_content"] = df["catalog_content"].fillna("").astype(str).apply(normalize_text)
    df["brand"] = df["catalog_content"].apply(extract_brand)

train_df.head(3)


In [None]:

# ===================================
# 4) Quantity parsing → total_units 🧪
# ===================================
# Conversion factors into the base units used for `total_units_base`:
# millilitres for volumes and grams for weights.
VOL_MAP = {"ml": 1.0, "l": 1000.0, "fl oz": 29.5735, "oz": 29.5735}  # bare "oz" is treated as fl oz
WT_MAP  = {"g": 1.0, "kg": 1000.0, "lb": 453.592, "ounce": 28.3495}
COUNT_TOKENS = {"count","ct","pcs","pieces","tabs","caps","pack"}

_re_num     = r"(\d+(?:[\.,]\d+)?)"
# Hyphen or en dash (U+2013) between the two ends of a range such as "10-12 oz".
_re_range   = rf"{_re_num}\s*[-\u2013]\s*{_re_num}"
# Longest alternatives first, an optional plural "s" and a trailing word
# boundary: without them "5 lb" matched the unit "l" (litres) and "2 large"
# was read as two litres.
_re_unit    = r"(fl\.?\s*oz|ounce|pieces|count|caps|tabs|pcs|ml|kg|lb|oz|ct|g|l)s?\b"
# "pack of 6" / "pack 6" / "6 pack" / "6-pack"; exactly one of the two groups matches.
_re_pack    = rf"\bpacks?(?:\s*of)?\s*{_re_num}|{_re_num}[\s-]*packs?\b"

# Compiled once; parse_total_units() runs on every catalog row.
_QTY_RANGE_RE  = re.compile(rf"{_re_range}\s*{_re_unit}")
_QTY_SINGLE_RE = re.compile(rf"{_re_num}\s*{_re_unit}")
_PACK_RE       = re.compile(_re_pack)


def _to_float(s):
    """Convert *s* to float, accepting "," as decimal separator; NaN on failure."""
    try:
        return float(str(s).replace(",", "."))
    except (TypeError, ValueError):
        return np.nan


def unit_to_base(val: float, unit: str):
    """Convert ``val`` expressed in ``unit`` to a base amount.

    Returns ``(amount, kind)`` where ``kind`` is ``"volume_ml"``,
    ``"weight_g"`` or ``"count"``. Unknown units give ``(nan, "~none~")``.
    Spelling variants of fluid ounces ("floz", "fl. oz", "fl  oz") and simple
    plurals ("lbs", "ounces") are accepted.
    """
    u = re.sub(r"[\s.]+", " ", unit.lower()).strip()
    if u.replace(" ", "") == "floz":
        u = "fl oz"
    known = VOL_MAP.keys() | WT_MAP.keys() | COUNT_TOKENS
    if u not in known and u.endswith("s") and u[:-1] in known:
        u = u[:-1]
    if u in VOL_MAP:
        return VOL_MAP[u] * val, "volume_ml"
    if u in WT_MAP:
        return WT_MAP[u] * val, "weight_g"
    if u in COUNT_TOKENS:
        return val, "count"
    return np.nan, "~none~"


def _pack_info(t: str):
    """Return ``(pack_count, span_of_its_digits)`` or ``(nan, None)`` if absent."""
    pm = _PACK_RE.search(t)
    if pm is None:
        return np.nan, None
    return _to_float(pm.group(pm.lastindex)), pm.span(pm.lastindex)


def _pack_multiplier(pack_count, pack_span, qty_match):
    """Return the factor by which a matched quantity is scaled for multi-packs."""
    if pack_span is None or np.isnan(pack_count):
        return 1.0
    # In "value pack 12 oz" the same digits feed both patterns; treating them
    # as a pack count as well would square the quantity.
    if pack_span[0] < qty_match.end() and qty_match.start() < pack_span[1]:
        return 1.0
    return pack_count


def parse_total_units(text: str):
    """Extract the total quantity described in a catalog text.

    Tries, in order: a ranged quantity ("10-12 oz", averaged), a single
    quantity ("12 fl oz"), a count token, and finally a bare pack count.
    A quantity is multiplied by the pack count when one is found elsewhere in
    the text.

    Returns ``(amount, kind)`` with ``kind`` in ``{"volume_ml", "weight_g",
    "count", "~none~"}``; ``(nan, "~none~")`` when nothing is recognised or
    ``text`` is not a string.
    """
    if not isinstance(text, str):
        return np.nan, "~none~"
    t = text.lower()

    pack_count, pack_span = _pack_info(t)

    # range like "10-12 oz"
    m = _QTY_RANGE_RE.search(t)
    if m:
        low, high = _to_float(m.group(1)), _to_float(m.group(2))
        amt, unit_type = unit_to_base((low + high) / 2.0, m.group(3))
        if not np.isnan(amt):
            return amt * _pack_multiplier(pack_count, pack_span, m), unit_type

    # single "12 fl oz"
    m = _QTY_SINGLE_RE.search(t)
    if m:
        amt, unit_type = unit_to_base(_to_float(m.group(1)), m.group(2))
        if not np.isnan(amt):
            return amt * _pack_multiplier(pack_count, pack_span, m), unit_type

    # Count-only fallbacks. "pack" is excluded because it is already captured
    # as `pack_count` (matching it here turned "6 pack" into 6 * 6 = 36), and
    # the tokens are sorted because set iteration order varies between runs.
    for tok in sorted(COUNT_TOKENS - {"pack"}):
        m = re.search(rf"{_re_num}\s*{re.escape(tok)}\b", t)
        if m:
            cnt = _to_float(m.group(1))
            if not np.isnan(cnt):
                return cnt * _pack_multiplier(pack_count, pack_span, m), "count"

    if not np.isnan(pack_count):
        return pack_count, "count"

    return np.nan, "~none~"

# Regex-based keyword flags derived from the catalog text.
# - `(?:...)` instead of `(...)`: `str.contains` only tests for a match, and a
#   capturing group makes pandas emit a "match groups" warning.
# - `\u2013` is the en dash, written as an escape so the character class
#   survives any re-encoding of this file.
_FLAG_PATTERNS = {
    "is_value_pack": r"\b(?:value pack|bulk|family size)\b",
    "is_refill":     r"\brefill\b",
    "is_variety":    r"\bvariety\b",
    "has_range":     r"\d+\s*[-\u2013]\s*\d+",
}

for df in [train_df] + ([test_df] if has_test else []):
    # parse_total_units returns an (amount, kind) pair per row.
    parsed = df["catalog_content"].apply(parse_total_units)
    df["total_units_base"] = parsed.apply(lambda x: x[0])
    df["unit_kind"] = parsed.apply(lambda x: x[1])
    for k, pattern in _FLAG_PATTERNS.items():
        df[k] = df["catalog_content"].str.contains(pattern, case=False, na=False).astype(int)

train_df[["catalog_content","brand","total_units_base","unit_kind","is_value_pack","is_refill","is_variety","has_range"]].head(5)


In [None]:

# ===================================
# 5) Target: per-unit, then rescale 🎯
# ===================================
def build_targets(df: pd.DataFrame):
    """Return ``(price, per_unit_price, total_units)`` as NumPy arrays.

    Missing ``total_units_base`` values count as one unit and all values are
    floored at ``1e-6`` before the division ``price / total_units``.
    """
    units = df["total_units_base"].fillna(1.0).clip(lower=1e-6).astype(float)
    price = df["price"].astype(float)
    per_unit = (price / units).astype(float)
    return price.values, per_unit.values, units.values

# Raw price, per-unit price (price / total units) and the unit totals used to
# rescale predictions back to a price.
y_price, y_unit, tu_train = build_targets(train_df)

from sklearn.preprocessing import StandardScaler
# Models are fitted on the standardized log1p(per-unit price).
logy = np.log1p(y_unit).reshape(-1,1)
scaler_y = StandardScaler().fit(logy)
y_std = scaler_y.transform(logy).ravel()

# Quantile-bin codes used as stratification labels for the CV splits.
# `labels=False` yields integer codes; the default Interval labels become an
# object array that StratifiedKFold rejects as an "unknown" target type.
bins = pd.qcut(np.log1p(y_unit), q=20, labels=False, duplicates="drop")
print("Targets prepared. Example:", y_price[:3], y_unit[:3])


In [None]:

# ===================================================
# 6) Numeric features (incl. brand enc. placeholder) 🔢
# ===================================================
def prepare_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Build the numeric feature frame for one data split.

    Columns, in order: median-imputed ``total_units_base``, one indicator per
    unit kind, the four keyword flags, and three clipped text statistics
    (character count, word count, digit count). The result shares ``df``'s
    index.
    """
    unit_kinds = ("volume_ml", "weight_g", "count", "~none~")
    flag_cols = ("is_value_pack", "is_refill", "is_variety", "has_range")

    features = pd.DataFrame(index=df.index)

    # The median is taken over the frame passed in, not over a fixed reference split.
    units = df["total_units_base"]
    features["total_units_base"] = units.fillna(units.median())

    for kind in unit_kinds:
        features[f"unit_{kind}"] = (df["unit_kind"] == kind).astype(int)

    for flag in flag_cols:
        features[flag] = df[flag].astype(int)

    text = df["catalog_content"].fillna("")
    features["len_chars"] = text.str.len().clip(0, 2000)
    features["len_words"] = text.apply(lambda entry: len(entry.split())).clip(0, 400)
    features["num_digits"] = text.str.count(r"\d").clip(0, 100)
    return features

# Numeric feature frames; the test frame exists only when test.csv was loaded.
X_num_train = prepare_numeric(train_df)
if has_test:
    X_num_test = prepare_numeric(test_df)
else:
    X_num_test = None
X_num_train.head(3), X_num_train.shape


In [None]:

# =====================================
# 7) Brand target encoding (fold-wise) 🏷️
# =====================================
brands = train_df["brand"].fillna("~na~").astype(str).values

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
brand_te = np.zeros(len(train_df), dtype=np.float32)

# Out-of-fold encoding: each row is encoded with brand means computed on the
# other folds only, so a row's own target never enters its feature.
for tr_idx, va_idx in skf.split(np.zeros(len(bins)), bins):
    mean_tr = y_unit[tr_idx].mean()
    gmean = pd.Series(y_unit[tr_idx]).groupby(pd.Series(brands[tr_idx])).mean()
    # Brands unseen in the training folds fall back to the folds' overall mean.
    enc = pd.Series(brands[va_idx]).map(gmean).fillna(mean_tr).values
    brand_te[va_idx] = enc

X_num_train["brand_te_per_unit"] = brand_te

# One-hot indicators for the most frequent brands. All columns are built at
# once and attached with a single concat: inserting 500 columns one by one
# fragments the frame and rebuilt a Series of all brands on every iteration.
topN = 500
vc = pd.Series(brands).value_counts().head(topN).index.tolist()
brand_oh_train = pd.DataFrame(
    {f"brand_{b}": (brands == b).astype(int) for b in vc},
    index=X_num_train.index,
)
# Dropping first keeps the cell idempotent when it is re-run in the notebook.
X_num_train = pd.concat(
    [X_num_train.drop(columns=brand_oh_train.columns, errors="ignore"), brand_oh_train],
    axis=1,
)

if has_test:
    brands_test = test_df["brand"].fillna("~na~").astype(str)
    # For the test split the brand means come from the full training set.
    full_gmean = pd.Series(y_unit).groupby(pd.Series(brands)).mean()
    test_te = brands_test.map(full_gmean).fillna(y_unit.mean()).values
    X_num_test["brand_te_per_unit"] = test_te
    brands_test_values = brands_test.values
    brand_oh_test = pd.DataFrame(
        {f"brand_{b}": (brands_test_values == b).astype(int) for b in vc},
        index=X_num_test.index,
    )
    X_num_test = pd.concat(
        [X_num_test.drop(columns=brand_oh_test.columns, errors="ignore"), brand_oh_test],
        axis=1,
    )

print("Numeric features with brand TE:", X_num_train.shape)


In [None]:

# ==================================
# 8) MiniLM sentence embeddings 🔤🧠
# ==================================
import torch
from sentence_transformers import SentenceTransformer

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Run on the GPU when torch reports one, otherwise on the CPU.
_st_device = "cuda" if torch.cuda.is_available() else "cpu"
st_model = SentenceTransformer(MODEL_NAME, device=_st_device)

def encode_texts(texts: List[str], batch_size: int = 256):
    """Embed ``texts`` with the module-level ``st_model``.

    Embeddings are requested normalized (``normalize_embeddings=True``) and
    returned as a ``float32`` NumPy array; a progress bar is shown while
    encoding.
    """
    vectors = st_model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    return np.asarray(vectors, dtype=np.float32)

# Embed the training texts; the test texts only when a test split was loaded.
train_texts = train_df["catalog_content"].fillna("").tolist()
minilm_train = encode_texts(train_texts)

minilm_test = None
if has_test:
    test_texts  = test_df["catalog_content"].fillna("").tolist()
    minilm_test = encode_texts(test_texts)

print(
    "MiniLM shapes:",
    minilm_train.shape,
    minilm_test.shape if minilm_test is not None else None,
)


In [None]:

# ======================================
# 9) Optional: KMeans clusters on MiniLM
# ======================================
# Cluster the MiniLM embeddings and one-hot encode the cluster ids.
K_CLUST = 100
kmeans = MiniBatchKMeans(
    n_clusters=K_CLUST,
    random_state=RANDOM_STATE,
    batch_size=4096,
)
clus_train = kmeans.fit_predict(minilm_train)
# Indexing an identity matrix by cluster id gives one one-hot row per sample.
clus_train_oh = np.eye(K_CLUST, dtype=np.float32)[clus_train]

clus_test_oh = None
if has_test:
    # Test rows are assigned to the centroids fitted on the training embeddings.
    clus_test = kmeans.predict(minilm_test)
    clus_test_oh = np.eye(K_CLUST, dtype=np.float32)[clus_test]

print("Cluster one-hots:", clus_train_oh.shape)


In [None]:

# =======================================================
# 10) Assemble design matrices for MiniLM + numeric branch
# =======================================================
def hstack_safe(*arrs):
    """Column-stack the given blocks into one ``float32`` array.

    ``None`` entries are skipped and pandas objects are converted to NumPy.
    When at least one block is 2-D, any 1-D block (for example a
    ``pd.Series``) is treated as a single column; previously that combination
    made ``np.hstack`` fail on mismatched dimensions. Inputs that are all 1-D
    are still concatenated end to end, as before.

    Raises:
        ValueError: if every argument is ``None`` (or no argument is given).
    """
    mats = []
    for a in arrs:
        if a is None:
            continue
        if isinstance(a, (pd.DataFrame, pd.Series)):
            a = a.to_numpy()
        mats.append(np.asarray(a))
    if not mats:
        raise ValueError("hstack_safe() needs at least one non-None input")
    if any(m.ndim > 1 for m in mats):
        mats = [m.reshape(-1, 1) if m.ndim == 1 else m for m in mats]
    return np.hstack(mats).astype(np.float32)

# Dense design matrix: MiniLM embeddings | numeric features | cluster one-hots.
X_mini_num_train = hstack_safe(minilm_train, X_num_train.values, clus_train_oh)
if has_test:
    X_mini_num_test = hstack_safe(minilm_test, X_num_test.values, clus_test_oh)
else:
    X_mini_num_test = None

print("MiniLM+num train shape:", X_mini_num_train.shape)


In [None]:

# =============================================
# 11) TF-IDF (word + char) and Ridge OOF branch
# =============================================
# Word-level and character-level TF-IDF views of the catalog text.
tfw = TfidfVectorizer(ngram_range=(1,2), max_features=150_000, min_df=3)
tfc = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), max_features=80_000, min_df=5)

train_text = train_df["catalog_content"].fillna("")
X_tfw = tfw.fit_transform(train_text)
X_tfc = tfc.fit_transform(train_text)
X_tfidf = sparse.hstack([X_tfw, X_tfc]).tocsr()

oof_tfidf = np.zeros(len(train_df), dtype=np.float32)
ridge_models = []

# Use the shared N_SPLITS rather than a literal 5: with the same seed and
# labels these folds are then the same as the ones used for the fold-wise
# brand target encoding, and stay so if N_SPLITS is changed.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
for fold, (tr, va) in enumerate(skf.split(np.zeros(len(bins)), bins), 1):
    m = Ridge(alpha=2.0, random_state=RANDOM_STATE)
    m.fit(X_tfidf[tr], y_std[tr])  # standardized log per-unit
    oof_tfidf[va] = m.predict(X_tfidf[va])
    ridge_models.append(m)
    print(f"[Ridge fold {fold}] done.")

if has_test:
    # Reuse the vocabularies fitted on the training text.
    test_text = test_df["catalog_content"].fillna("")
    Xt_tfw = tfw.transform(test_text)
    Xt_tfc = tfc.transform(test_text)
    Xt_tfidf = sparse.hstack([Xt_tfw, Xt_tfc]).tocsr()
else:
    Xt_tfidf = None


In [None]:

# =====================================
# 12) XGBoost for MiniLM+numeric branch
# =====================================
params = dict(
    objective="reg:squarederror",
    tree_method="hist",
    max_depth=8,
    min_child_weight=4.0,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=1e-2,
    reg_lambda=1.0,
    learning_rate=0.03,
    nthread=-1,
    random_state=RANDOM_STATE,
)

oof_mini = np.zeros(len(train_df), dtype=np.float32)
xgb_models = []

# Use the shared N_SPLITS rather than a literal 5 so these folds match the
# ones used for the fold-wise brand target encoding (same seed, same labels).
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
for fold, (tr, va) in enumerate(skf.split(np.zeros(len(bins)), bins), 1):
    dtr = xgb.DMatrix(X_mini_num_train[tr], label=y_std[tr])
    dva = xgb.DMatrix(X_mini_num_train[va], label=y_std[va])
    # Early stopping monitors the last entry of `evals`, i.e. the validation fold.
    watch = [(dtr, "train"), (dva, "valid")]
    m = xgb.train(
        params=params,
        dtrain=dtr,
        num_boost_round=4000,
        evals=watch,
        verbose_eval=200,
        early_stopping_rounds=200
    )
    # Predict with the trees up to and including the best iteration only.
    oof_mini[va] = m.predict(dva, iteration_range=(0, m.best_iteration+1))
    xgb_models.append(m)
    print(f"[XGB fold {fold}] best iters:", m.best_iteration+1)

print("Branches OOF ready.")


In [None]:

# ===================================
# 13) Blend, evaluate OOF (train fold)
# ===================================
# Fixed blend weights for the two branches (tune if needed).
w_mini, w_tfidf = 0.6, 0.4
oof_std_blend = w_mini*oof_mini + w_tfidf*oof_tfidf

# Undo the target transform: standardized log1p(per-unit) → log1p(per-unit) → per-unit.
oof_log_per_unit = scaler_y.inverse_transform(oof_std_blend.reshape(-1,1)).ravel()
oof_per_unit     = np.expm1(oof_log_per_unit)

# Price = per-unit price × total units, floored at 0.01.
oof_price = (oof_per_unit * tu_train).clip(0.01)

cv_smape = smape(train_df["price"].values, oof_price)
print(f"CV SMAPE (OOF): {cv_smape:.3f}%")

# Persist the out-of-fold predictions next to the true prices.
oof_frame = pd.DataFrame({
    "sample_id": train_df["sample_id"],
    "price_true": train_df["price"].values,
    "price_pred": oof_price,
    "per_unit_pred": oof_per_unit,
})
oof_frame.to_csv(OUTPUT_DIR / "oof_predictions.csv", index=False)
print("Saved:", OUTPUT_DIR / "oof_predictions.csv")


In [None]:

# ===============================
# 14) Inference & submission file
# ===============================
if not has_test:
    print("No test.csv detected; skipped submission.")
else:
    # TF-IDF branch: average the per-fold Ridge predictions.
    preds_tfidf = np.mean([m.predict(Xt_tfidf) for m in ridge_models], axis=0)

    # MiniLM+numeric branch: average the per-fold boosters, each cut at its best iteration.
    dtest = xgb.DMatrix(X_mini_num_test)
    preds_mini = np.mean(
        [m.predict(dtest, iteration_range=(0, m.best_iteration+1)) for m in xgb_models],
        axis=0,
    )

    # Blend in standardized log space with the same weights as for the OOF score.
    preds_std_blend = w_mini*preds_mini + w_tfidf*preds_tfidf

    # Back to per-unit price, then to total price.
    log_per_unit = scaler_y.inverse_transform(preds_std_blend.reshape(-1,1)).ravel()
    per_unit = np.expm1(log_per_unit)

    # Same unit handling as build_targets: missing → 1 unit, floor at 1e-6.
    tu_test = test_df["total_units_base"].fillna(1.0).clip(lower=1e-6).astype(float).values
    price_pred = (per_unit * tu_test).clip(0.01)

    sub = pd.DataFrame({
        "sample_id": test_df["sample_id"],
        "price": price_pred
    })
    sub_path = OUTPUT_DIR / "submission.csv"
    sub.to_csv(sub_path, index=False)
    print("Saved submission:", sub_path)



## Notes & Tips
- If GPU is available for `sentence-transformers`, encoding will be much faster (`device='cuda'`).
- Tune `w_mini`/`w_tfidf` to minimize SMAPE on OOF (`np.linspace(0.1,0.9,9)` grid is fine).
- If your per-unit parsing is still noisy for some categories, try clipping `y_unit` at the 99.5th percentile **when fitting the scaler only** (keep raw for SMAPE).
- You can also try LightGBM for the MiniLM+numeric branch — it sometimes improves a bit with high-dimensional sparse inputs.
