# Smart Product Pricing — **Text+MiniLM + DNN (No XGBoost)** ✅

In [None]:

# 0) Installs
# Notebook shell escape (IPython `!` syntax, not plain Python): quietly installs
# the third-party packages used by the cells below. sentence-transformers and
# transformers are pinned to exact versions; the other packages are unpinned.
!pip -q install numpy pandas scikit-learn scipy sentence-transformers==3.0.1 transformers==4.44.2 tqdm torch


In [None]:

# 1) Imports & Config
import re, warnings
from pathlib import Path
import numpy as np, pandas as pd
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import Ridge
from sklearn.cluster import MiniBatchKMeans
from scipy import sparse

# Silence every warning raised from here on.
warnings.filterwarnings("ignore")

# --- Filesystem layout ---
DATA_DIR = Path("dataset")
TRAIN_CSV, TEST_CSV = DATA_DIR / "train.csv", DATA_DIR / "test.csv"
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)  # created eagerly so later cells can write into it

# --- Cross-validation / reproducibility ---
RANDOM_STATE = 42
N_SPLITS = 5

# --- DNN training hyper-parameters ---
BATCH_SIZE = 1024
EPOCHS = 30
PATIENCE = 5        # epochs without validation improvement before stopping
LR = 2e-3           # AdamW learning rate
WD = 1e-4           # AdamW weight decay
DROPOUT = 0.2
HIDDEN = [1024, 512, 256]   # widths of the hidden layers, input side first

# Prefer the GPU whenever torch can see one.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

def smape(y_true, y_pred, eps=1e-9):
    """Return the symmetric mean absolute percentage error, in percent.

    Each term is ``|pred - true| / ((|true| + |pred| + eps) / 2)``; the mean
    over all terms is multiplied by 100. ``eps`` keeps the denominator
    non-zero when both values are 0.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predictions, broadcastable against ``y_true``.
        eps: Small constant added to the denominator.
    """
    actual = np.asarray(y_true, dtype=float)
    forecast = np.asarray(y_pred, dtype=float)
    half_scale = (np.abs(actual) + np.abs(forecast) + eps) / 2.0
    ratios = np.abs(forecast - actual) / half_scale
    return 100.0 * np.mean(ratios)

# Show the value of DEVICE so the run log records where the DNN will execute.
print("Device:", DEVICE)


In [None]:

# 2) Load
# The training CSV is mandatory; the test CSV is optional and every later
# cell checks `has_test` before touching `test_df`.
has_test = TEST_CSV.exists()
train_df = pd.read_csv(TRAIN_CSV)
if has_test:
    test_df = pd.read_csv(TEST_CSV)
else:
    test_df = None
print(train_df.head(2))


In [None]:

# 3) Normalize text & brand
def normalize_text(s):
    """Return ``s`` with every whitespace run collapsed to one space.

    Leading and trailing whitespace is removed. Newlines and carriage
    returns are whitespace too, so they are folded by the same substitution.
    Anything that is not a ``str`` yields the empty string.
    """
    if isinstance(s, str):
        return re.sub(r"\s+", " ", s).strip()
    return ""

def extract_brand(text):
    """Derive a short, lower-cased brand-like key from a catalog description.

    The key is the text following ``item name:`` (or the whole description
    when that marker is absent), cut at the first ``" by "`` and the first
    comma, with characters other than letters, digits, ``&``, ``+``, ``-``
    and whitespace blanked out, whitespace collapsed, and truncated to 50
    characters.

    Both paths work on the lower-cased text. Previously only the
    ``item name:`` path was lower-cased while the fallback kept the original
    casing, so the same brand could produce two different keys.

    Args:
        text: Catalog description; non-strings are treated as missing.

    Returns:
        The brand key, or ``"~na~"`` when nothing usable remains.
    """
    if not isinstance(text, str):
        return "~na~"
    lowered = text.lower()
    m = re.search(r"item name:\s*(.+)$", lowered, flags=re.IGNORECASE | re.MULTILINE)
    # Fall back to the lower-cased description so casing matches the matched path.
    cand = lowered if m is None else m.group(1)
    cand = cand.split(" by ")[0].split(",")[0]
    cand = re.sub(r"[^a-zA-Z0-9&+\-\s]", " ", cand).strip()
    cand = re.sub(r"\s+", " ", cand)
    return cand[:50] if cand else "~na~"

for df in [train_df] + ([test_df] if has_test else []):
    # Replace missing descriptions with "" *before* the string cast:
    # astype(str) on its own turns NaN into the literal text "nan", which
    # would then be embedded, vectorised and used as a brand like real content.
    df["catalog_content"] = df["catalog_content"].fillna("").astype(str).apply(normalize_text)
    df["brand"] = df["catalog_content"].apply(extract_brand)


In [None]:

# 4) Quantity parsing
# Conversion factors into the base units returned by unit_to_base:
# millilitres for volume, grams for weight. A bare "oz" is converted as a
# fluid ounce (volume); only the spelled-out "ounce" is converted as weight.
VOL_MAP = {"ml": 1.0, "l": 1000.0, "fl oz": 29.5735, "oz": 29.5735}
WT_MAP = {"g": 1.0, "kg": 1000.0, "lb": 453.592, "ounce": 28.3495}
COUNT_TOKENS = {"count", "ct", "pcs", "pieces", "tabs", "caps", "pack"}

# Extra spellings recognised by _re_unit, mapped onto the canonical keys above.
_UNIT_ALIASES = {
    "floz": "fl oz",
    "lt": "l",
    "ltr": "l",
    "liter": "l",
    "litre": "l",
    "gm": "g",
    "gram": "g",
    "capsule": "caps",
}

_re_num = r"(\d+(?:[\.,]\d+)?)"
_re_range = rf"{_re_num}\s*[-–]\s*{_re_num}"
# Exactly one capturing group (the unit token). The optional plural "s" and
# the trailing \b sit outside the group, so the token must be a whole word:
# "5 lb" is pounds rather than the "l" prefix, and "10 large" is not a unit.
_re_unit = (
    r"(ml|ltr|lt|liter|litre|l|fl\s*oz|ounce|oz|gram|gm|g|kg|lb"
    r"|ct|count|pcs|pieces|tabs|capsule|caps)s?\b"
)
# "pack of 6" / "pack 6" (group 1) or "6 pack" / "6-pack" (group 2). The word
# boundaries stop "backpack 15" from being read as a pack count.
_re_pack = rf"\bpacks?(?:\s*of)?\s*{_re_num}|{_re_num}[\s-]*packs?\b"


def _to_float(s):
    """Parse ``s`` as a float, accepting a decimal comma; NaN when unparsable.

    NOTE(review): every comma is read as a decimal separator, so "1,000"
    parses as 1.0 -- confirm the catalog never uses thousands separators.
    """
    try:
        return float(str(s).replace(",", "."))
    except (TypeError, ValueError):
        return np.nan


def unit_to_base(val, unit):
    """Convert ``val`` expressed in ``unit`` to a base amount.

    Args:
        val: Numeric quantity.
        unit: Unit token; case and inner whitespace are ignored and the
            spellings in ``_UNIT_ALIASES`` are accepted.

    Returns:
        ``(amount, kind)`` where ``kind`` is ``"volume_ml"``, ``"weight_g"``
        or ``"count"``; ``(nan, "~none~")`` for an unknown unit.
    """
    u = " ".join(unit.lower().split())
    u = _UNIT_ALIASES.get(u, u)
    if u in VOL_MAP:
        return val * VOL_MAP[u], "volume_ml"
    if u in WT_MAP:
        return val * WT_MAP[u], "weight_g"
    if u in COUNT_TOKENS:
        return val, "count"
    return np.nan, "~none~"


def _scale_by_pack(amount, pack_count):
    """Multiply ``amount`` by ``pack_count`` unless the pack count is NaN."""
    return amount if np.isnan(pack_count) else amount * pack_count


def parse_total_units(text):
    """Extract the total quantity described in a catalog text.

    Search order: a pack count ("pack of 6", "6-pack"), then a ranged
    quantity ("10-12 oz", using the midpoint), then a single quantity
    ("500 ml"). A quantity is multiplied by the pack count when one was
    found. If only a pack count exists it is returned as a count.
    Non-positive quantities (e.g. "0g trans fat") are skipped and the next
    match is tried.

    A bare "6 pack" is counted once. It used to be matched a second time
    through the "pack" entry of ``COUNT_TOKENS`` and multiplied by itself.

    Args:
        text: Catalog description; non-strings are treated as missing.

    Returns:
        ``(amount, kind)`` with ``kind`` in ``"volume_ml"``, ``"weight_g"``,
        ``"count"``; ``(nan, "~none~")`` when nothing could be parsed.
    """
    if not isinstance(text, str):
        return np.nan, "~none~"
    t = text.lower()

    pack_count = np.nan
    pm = re.search(_re_pack, t)
    if pm:
        pack_count = _to_float(pm.group(1) or pm.group(2))
        if not pack_count > 0:  # also true for NaN
            pack_count = np.nan

    for m in re.finditer(rf"{_re_range}\s*{_re_unit}", t):
        low, high = _to_float(m.group(1)), _to_float(m.group(2))
        amt, kind = unit_to_base((low + high) / 2.0, m.group(3))
        if amt > 0:
            return _scale_by_pack(amt, pack_count), kind

    # The count tokens (ct, count, pcs, ...) are part of _re_unit, so this
    # pass also covers plain counts.
    for m in re.finditer(rf"{_re_num}\s*{_re_unit}", t):
        amt, kind = unit_to_base(_to_float(m.group(1)), m.group(2))
        if amt > 0:
            return _scale_by_pack(amt, pack_count), kind

    if not np.isnan(pack_count):
        return pack_count, "count"
    return np.nan, "~none~"

# Keyword flags: output column -> case-insensitive regex searched in the text.
keyword_flags = {
    "is_value_pack": r"\b(value pack|bulk|family size)\b",
    "is_refill": r"\brefill\b",
    "is_variety": r"\bvariety\b",
    "has_range": r"\d+\s*[-–]\s*\d+",
}

frames = [train_df, test_df] if has_test else [train_df]
for df in frames:
    content = df["catalog_content"]
    # parse_total_units yields an (amount, kind) pair per row; split it in two.
    parsed = content.apply(parse_total_units)
    df["total_units_base"] = parsed.apply(lambda pair: pair[0])
    df["unit_kind"] = parsed.apply(lambda pair: pair[1])
    for column, pattern in keyword_flags.items():
        df[column] = content.str.contains(pattern, case=False, na=False).astype(int)


In [None]:

# 5) Targets & strat labels
def build_targets(df: pd.DataFrame):
    """Return ``(price, price_per_unit, total_units)`` as NumPy arrays.

    Missing ``total_units_base`` values count as one unit and every unit
    total is floored at 1e-6 before dividing, so the per-unit price is
    always defined.

    Args:
        df: Frame with ``price`` and ``total_units_base`` columns.
    """
    price = df["price"].astype(float)
    units = df["total_units_base"].fillna(1.0).clip(lower=1e-6).astype(float)
    per_unit = (price / units).astype(float)
    return price.values, per_unit.values, units.values

y_price, y_unit, tu_train = build_targets(train_df)

# The scaler's mean/std come from the per-unit price clipped to its 1st-99th
# percentile range; the transform is then applied to the *unclipped* values.
lo_cut, hi_cut = np.percentile(y_unit, [1, 99])
y_unit_clip = np.clip(y_unit, lo_cut, hi_cut)
scaler_y = StandardScaler()
scaler_y.fit(np.log1p(y_unit_clip)[:, None])
y_std = scaler_y.transform(np.log1p(y_unit)[:, None]).ravel()

def make_strat_labels(y, max_bins=20, n_splits=5, min_count_per_class=5):
    """Bin a continuous target into integer labels for stratified K-fold.

    Quantile-bins ``log1p(y)``, starting at ``max_bins`` bins and removing
    two bins at a time until every bin holds at least
    ``min_count_per_class`` samples. If no bin count down to
    ``max(3, n_splits)`` satisfies that, falls back to ``2 * n_splits``
    equal-width rank bins.

    The fallback uses zero-based ranks. Ranks start at 1, so dividing the
    raw rank used to push the top-ranked sample into an extra class of its
    own, giving ``2 * n_splits + 1`` classes.

    Args:
        y: Array-like target; ``log1p`` is applied to it.
        max_bins: Number of quantile bins tried first.
        n_splits: Number of CV folds the labels are intended for.
        min_count_per_class: Minimum samples required in every bin.

    Returns:
        Integer NumPy array of bin labels, one per element of ``y``.
    """
    y = np.asarray(y, dtype=float)
    log_y = np.log1p(y)

    q = max_bins
    while q >= max(3, n_splits):
        bins = pd.qcut(log_y, q=q, duplicates="drop")
        labels = pd.Series(bins).cat.codes.to_numpy()
        _, cnt = np.unique(labels, return_counts=True)
        if cnt.min() >= min_count_per_class:
            return labels
        q -= 2

    # Fallback: zero-based ranks mapped onto exactly n_bins classes.
    n_bins = n_splits * 2
    ranks = pd.Series(log_y).rank().to_numpy()
    labels = np.floor((ranks - 1.0) * n_bins / len(y)).astype(int)
    return np.clip(labels, 0, n_bins - 1)

# Stratification labels for every K-fold split below, built from the per-unit
# price; the bin search starts at 20 bins and is tied to N_SPLITS folds.
y_strat = make_strat_labels(y_unit, max_bins=20, n_splits=N_SPLITS)


In [None]:

# 6) Numeric features
def prepare_numeric(df: pd.DataFrame) -> pd.DataFrame:
    """Assemble the hand-crafted numeric feature frame for ``df``.

    Columns, in order: the parsed unit total (missing values replaced by
    this frame's own median), a 0/1 indicator per unit kind, the four
    keyword flags, and three text-size statistics capped at 2000 characters,
    400 words and 100 digits.

    Args:
        df: Frame holding ``total_units_base``, ``unit_kind``,
            ``catalog_content`` and the ``is_*`` / ``has_range`` flags.

    Returns:
        A new frame sharing ``df``'s index.
    """
    units = df["total_units_base"]
    text = df["catalog_content"].fillna("")

    num = pd.DataFrame(index=df.index)
    num["total_units_base"] = units.fillna(units.median())
    for kind in ("volume_ml", "weight_g", "count", "~none~"):
        num[f"unit_{kind}"] = df["unit_kind"].eq(kind).astype(int)
    for flag in ("is_value_pack", "is_refill", "is_variety", "has_range"):
        num[flag] = df[flag].astype(int)
    num["len_chars"] = text.str.len().clip(0, 2000)
    num["len_words"] = text.map(lambda doc: len(doc.split())).clip(0, 400)
    num["num_digits"] = text.str.count(r"\d").clip(0, 100)
    return num

# Numeric feature frames; the test frame only exists when test.csv was found.
X_num_train = prepare_numeric(train_df)
X_num_test = None
if has_test:
    X_num_test = prepare_numeric(test_df)


In [None]:

# 7) Brand target encoding
brands = train_df["brand"].fillna("~na~").astype(str).values
brand_series = pd.Series(brands)

# Out-of-fold target encoding: each validation row receives the mean per-unit
# price of its brand computed on the other folds only, falling back to that
# training fold's overall mean for brands not seen there.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
brand_te = np.zeros(len(train_df), dtype=np.float32)
for tr, va in skf.split(np.zeros(len(y_strat)), y_strat):
    fold_target = pd.Series(y_unit[tr])
    gmean = fold_target.groupby(pd.Series(brands[tr])).mean()
    brand_te[va] = pd.Series(brands[va]).map(gmean).fillna(y_unit[tr].mean()).values
X_num_train["brand_te_per_unit"] = brand_te

# 0/1 indicator columns for the most frequent training brands.
topN = 500
vc = brand_series.value_counts().head(topN).index.tolist()
for b in vc:
    X_num_train[f"brand_{b}"] = brand_series.eq(b).astype(int).values

if has_test:
    # Test rows use brand means from the full training set and the same
    # indicator columns as the training frame.
    brands_test = test_df["brand"].fillna("~na~").astype(str)
    full_gmean = pd.Series(y_unit).groupby(brand_series).mean()
    test_te = brands_test.map(full_gmean).fillna(y_unit.mean()).values
    X_num_test["brand_te_per_unit"] = test_te
    for b in vc:
        X_num_test[f"brand_{b}"] = brands_test.eq(b).astype(int).values


In [None]:

# 8) MiniLM embeddings + optional clusters
from sentence_transformers import SentenceTransformer

MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
# Use the notebook-wide DEVICE instead of re-deriving it here, so changing
# DEVICE in the config cell also moves the encoder.
st_model = SentenceTransformer(MODEL_NAME, device=DEVICE)

def encode_texts(texts, batch_size=256):
    """Embed ``texts`` with the module-level ``st_model`` as a float32 array.

    Embeddings are requested with ``normalize_embeddings=True`` and a
    progress bar is shown while encoding.

    Args:
        texts: Sequence of strings to embed.
        batch_size: Batch size passed to ``st_model.encode``.
    """
    vectors = st_model.encode(
        texts,
        batch_size=batch_size,
        show_progress_bar=True,
        normalize_embeddings=True,
    )
    return np.asarray(vectors, dtype=np.float32)

# Sentence embeddings for the training (and optional test) descriptions.
train_texts = train_df["catalog_content"].fillna("").tolist()
minilm_train = encode_texts(train_texts)
minilm_test = None
if has_test:
    test_texts = test_df["catalog_content"].fillna("").tolist()
    minilm_test = encode_texts(test_texts)

from sklearn.cluster import MiniBatchKMeans

# Cluster the training embeddings and expose cluster membership as one-hot rows.
K_CLUST = 100
kmeans = MiniBatchKMeans(n_clusters=K_CLUST, random_state=RANDOM_STATE, batch_size=4096)
clus_train = kmeans.fit_predict(minilm_train)
cluster_eye = np.eye(K_CLUST, dtype=np.float32)
clus_train_oh = cluster_eye[clus_train]
clus_test_oh = None
if minilm_test is not None:
    # Test rows are assigned to the clusters fitted on the training embeddings.
    clus_test = kmeans.predict(minilm_test)
    clus_test_oh = cluster_eye[clus_test]

def hstack_safe(*arrs):
    """Horizontally stack the given blocks into one float32 array.

    ``None`` arguments are skipped; pandas DataFrames and Series contribute
    their ``.values``; everything else is passed to ``np.hstack`` as is.
    """
    present = (block for block in arrs if block is not None)
    mats = [
        block.values if isinstance(block, (pd.DataFrame, pd.Series)) else block
        for block in present
    ]
    return np.hstack(mats).astype(np.float32)

# Dense DNN inputs: embeddings | numeric features | cluster one-hots.
X_mini_num_train = hstack_safe(minilm_train, X_num_train.values, clus_train_oh)
X_mini_num_test = None
if minilm_test is not None:
    X_mini_num_test = hstack_safe(minilm_test, X_num_test.values, clus_test_oh)


In [None]:

# 9) TF-IDF + Ridge
train_corpus = train_df["catalog_content"].fillna("")

# Word uni/bi-grams plus character 3-5-grams, both fitted on the training text.
tfw = TfidfVectorizer(ngram_range=(1,2), max_features=150_000, min_df=3)
tfc = TfidfVectorizer(analyzer="char_wb", ngram_range=(3,5), max_features=80_000, min_df=5)
word_block = tfw.fit_transform(train_corpus)
char_block = tfc.fit_transform(train_corpus)
X_tfidf = sparse.hstack([word_block, char_block]).tocsr()

# One Ridge per fold on the standardised target; validation predictions fill
# the out-of-fold vector and every model is kept for test-time averaging.
oof_tfidf = np.zeros(len(train_df), dtype=np.float32)
ridge_models = []
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
fold_indices = skf.split(np.zeros(len(y_strat)), y_strat)
for fold, (tr, va) in enumerate(fold_indices, start=1):
    m = Ridge(alpha=2.0, random_state=RANDOM_STATE).fit(X_tfidf[tr], y_std[tr])
    oof_tfidf[va] = m.predict(X_tfidf[va])
    ridge_models.append(m)
    print(f"[Ridge fold {fold}] done.")

# The test matrix reuses the vocabularies fitted above.
Xt_tfidf = None
if has_test:
    test_corpus = test_df["catalog_content"].fillna("")
    Xt_tfidf = sparse.hstack([tfw.transform(test_corpus),
                              tfc.transform(test_corpus)]).tocsr()


In [None]:

# 10) DNN on MiniLM+numeric
# Standardise the dense features; statistics come from the training matrix only.
scaler_X = StandardScaler(with_mean=True, with_std=True)
X_mini_num_train_std = scaler_X.fit_transform(X_mini_num_train)
X_mini_num_test_std = None
if X_mini_num_test is not None:
    X_mini_num_test_std = scaler_X.transform(X_mini_num_test)

class TabularDataset(Dataset):
    """Map-style dataset over a NumPy feature matrix and its target vector.

    Both arrays are converted to float32 tensors once, at construction;
    indexing returns the ``(features, target)`` pair for a row.
    """

    def __init__(self, X, y):
        self.X = torch.from_numpy(X).float()
        self.y = torch.from_numpy(y).float()

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, i):
        return self.X[i], self.y[i]

class MLP(nn.Module):
    """Feed-forward regressor producing one scalar per input row.

    Every hidden width contributes Linear -> BatchNorm1d -> ReLU -> Dropout;
    a final Linear maps to a single output whose last dimension is squeezed
    away in ``forward``.

    Args:
        in_dim: Number of input features.
        hidden: Iterable of hidden-layer widths, input side first.
        dropout: Dropout probability used after every hidden layer.
    """

    def __init__(self, in_dim, hidden, dropout):
        super().__init__()
        widths = [in_dim, *hidden]
        layers = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            layers.extend([
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(dropout),
            ])
        layers.append(nn.Linear(widths[-1], 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x).squeeze(-1)

def train_fold(Xtr, ytr, Xva, yva, in_dim):
    """Train one MLP on a CV fold with early stopping and return it.

    Optimises MSE with AdamW for at most ``EPOCHS`` epochs. After each epoch
    the validation loss is computed; an improvement of more than 1e-5
    snapshots the weights, and ``PATIENCE`` consecutive epochs without one
    stop training. Progress is printed every third epoch.

    Fixes over the previous version:
      * a trailing training batch of exactly one row is dropped, because
        ``BatchNorm1d`` raises on a single-row batch in training mode;
      * the best snapshot is restored only if one was taken, so a run whose
        validation loss never improves (e.g. NaN) no longer crashes in
        ``load_state_dict(None)``;
      * the returned model is explicitly put in eval mode.

    Args:
        Xtr, ytr: Training features / targets as NumPy arrays.
        Xva, yva: Validation features / targets as NumPy arrays.
        in_dim: Number of input features.

    Returns:
        The trained ``MLP`` in eval mode, holding the best validation
        weights when a snapshot exists.
    """
    model = MLP(in_dim, HIDDEN, DROPOUT).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
    loss_fn = nn.MSELoss()

    # Only drop the last batch when it would hold a single row.
    drop_last = len(Xtr) > BATCH_SIZE and len(Xtr) % BATCH_SIZE == 1
    tr_dl = DataLoader(TabularDataset(Xtr, ytr), batch_size=BATCH_SIZE, shuffle=True, drop_last=drop_last)
    va_dl = DataLoader(TabularDataset(Xva, yva), batch_size=BATCH_SIZE, shuffle=False)

    best = float("inf")
    patience = 0
    best_state = None
    for ep in range(1, EPOCHS + 1):
        model.train()
        tr_loss, n_seen = 0.0, 0
        for xb, yb in tr_dl:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            opt.zero_grad()
            loss = loss_fn(model(xb), yb)
            loss.backward()
            opt.step()
            tr_loss += loss.item() * xb.size(0)
            n_seen += xb.size(0)
        # Average over the rows actually seen (one fewer when drop_last is set).
        tr_loss /= max(n_seen, 1)

        model.eval()
        va_loss = 0.0
        with torch.no_grad():
            for xb, yb in va_dl:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                va_loss += loss_fn(model(xb), yb).item() * xb.size(0)
        va_loss /= max(len(va_dl.dataset), 1)

        if va_loss < best - 1e-5:
            best = va_loss
            patience = 0
            # Keep the snapshot on CPU so it does not hold device memory.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        else:
            patience += 1
        if ep % 3 == 0:
            print(f"Epoch {ep:02d} | train {tr_loss:.4f} | valid {va_loss:.4f} | best {best:.4f}")
        if patience >= PATIENCE:
            break

    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    return model

# Same stratified folds as the Ridge stage (identical seed and labels).
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=RANDOM_STATE)
oof_dnn = np.zeros(len(train_df), dtype=np.float32)
dnn_models = []
n_features = X_mini_num_train_std.shape[1]
for fold, (tr, va) in enumerate(skf.split(np.zeros(len(y_strat)), y_strat), start=1):
    X_tr, X_va = X_mini_num_train_std[tr], X_mini_num_train_std[va]
    model = train_fold(X_tr, y_std[tr], X_va, y_std[va], n_features)
    dnn_models.append(model)
    # Out-of-fold predictions for this fold's validation rows, in one pass.
    with torch.no_grad():
        xva = torch.from_numpy(X_va).float().to(DEVICE)
        oof_dnn[va] = model(xva).cpu().numpy()
    print(f"[DNN fold {fold}] done.")


In [None]:

# 11) Blend OOF & evaluate
# Fixed blend weights, applied in standardised log-per-unit space.
w_dnn, w_tfidf = 0.7, 0.3
oof_std_blend = w_dnn * oof_dnn + w_tfidf * oof_tfidf

# Undo the target transform: un-standardise, expm1, then multiply by units.
oof_log_per_unit = scaler_y.inverse_transform(oof_std_blend[:, None]).ravel()
oof_per_unit = np.expm1(oof_log_per_unit)
oof_price = np.clip(oof_per_unit * tu_train, 0.01, None)

cv = smape(train_df["price"].values, oof_price)
print(f"CV SMAPE (OOF): {cv:.3f}%")

oof_path = OUTPUT_DIR/"oof_predictions_dnn.csv"
oof_frame = pd.DataFrame({
    "sample_id": train_df["sample_id"],
    "price_true": train_df["price"],
    "price_pred": oof_price,
})
oof_frame.to_csv(oof_path, index=False)
print("Saved:", oof_path)


In [None]:

# 12) Inference
if has_test:
    # Ridge: average the fold models; contributes 0.0 when no test matrix exists.
    if 'Xt_tfidf' in globals() and Xt_tfidf is not None:
        preds_tfidf = np.mean([m.predict(Xt_tfidf) for m in ridge_models], axis=0)
    else:
        preds_tfidf = 0.0

    # DNN: average the fold models on the standardised test matrix.
    with torch.no_grad():
        xt = torch.from_numpy(X_mini_num_test_std).float().to(DEVICE)
        preds_dnn_folds = [model(xt).cpu().numpy() for model in dnn_models]
    preds_dnn = np.mean(preds_dnn_folds, axis=0)

    # Same blend and inverse transform as the OOF evaluation.
    preds_std_blend = w_dnn * preds_dnn + w_tfidf * preds_tfidf
    log_per_unit = scaler_y.inverse_transform(preds_std_blend[:, None]).ravel()
    per_unit = np.expm1(log_per_unit)
    tu_test = test_df["total_units_base"].fillna(1.0).clip(lower=1e-6).astype(float).values
    price_pred = np.clip(per_unit * tu_test, 0.01, None)

    submission_path = OUTPUT_DIR/"submission_dnn.csv"
    sub = pd.DataFrame({"sample_id": test_df["sample_id"], "price": price_pred})
    sub.to_csv(submission_path, index=False)
    print("Saved submission:", submission_path)
else:
    print("No test.csv detected; skipped submission.")
