# Smart Pricing — **RoBERTa Embeddings + DNN (100 epochs)**

In [1]:
# Installs
!pip -q install numpy pandas scikit-learn scipy sentence-transformers==3.0.1 transformers==4.44.2 tqdm torch

In [19]:
# Imports & config
import re
from pathlib import Path
import numpy as np, pandas as pd
import torch, torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

DATA_DIR = Path("dataset"); TRAIN_CSV = DATA_DIR/"train.csv"; TEST_CSV = DATA_DIR/"test.csv"
OUTPUT_DIR = Path("OUTPUT-3"); OUTPUT_DIR.mkdir(exist_ok=True, parents=True)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
SEED=42; EPOCHS=500; BATCH_SIZE=1024; LR=2e-3; WD=1e-4; DROPOUT=0.2; HIDDEN=[1024,512,256]
print("Device:", DEVICE)

Device: cuda


In [3]:
# Load
train_df = pd.read_csv(TRAIN_CSV); has_test = TEST_CSV.exists(); test_df = pd.read_csv(TEST_CSV) if has_test else None
def normalize_text(s): 
    if not isinstance(s,str): return ""
    return re.sub(r"\s+"," ", s.replace("\n"," ").replace("\r"," ")).strip()
train_df["catalog_content"] = train_df["catalog_content"].astype(str).apply(normalize_text)
if has_test: test_df["catalog_content"] = test_df["catalog_content"].astype(str).apply(normalize_text)
print(train_df.head(2))

   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   

                                          image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12  


In [5]:
# 3) RoBERTa sentence embeddings (robust loader with fallbacks)
import os, numpy as np
from tqdm import tqdm

HF_TOKEN = os.getenv("HF_TOKEN", None)  # optional: set this in your shell if your org requires it

def try_sentence_transformer(ids, device):
    from sentence_transformers import SentenceTransformer
    last_err = None
    for mid in ids:
        try:
            print(f"Trying SentenceTransformer: {mid}")
            return SentenceTransformer(mid, device=device, use_auth_token=HF_TOKEN)
        except Exception as e:
            print(f"  -> failed: {e.__class__.__name__}: {e}")
            last_err = e
    raise last_err

def mean_pool_last_hidden(model_outputs, attention_mask):
    # mean pooling excluding padding tokens
    token_embeddings = model_outputs.last_hidden_state  # [B, T, H]
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    summed = (token_embeddings * input_mask_expanded).sum(dim=1)
    counts = input_mask_expanded.sum(dim=1).clamp(min=1e-9)
    return (summed / counts)

def build_roberta_fallback(device):
    # plain transformers fallback (no sentence-transformers)
    from transformers import AutoTokenizer, AutoModel
    tok = AutoTokenizer.from_pretrained("roberta-base", use_auth_token=HF_TOKEN)
    mdl = AutoModel.from_pretrained("roberta-base", use_auth_token=HF_TOKEN).to(device)
    mdl.eval()
    def embed(texts, batch_size=64, normalize=True):
        embs = []
        for i in tqdm(range(0, len(texts), batch_size), desc="Encoding (roberta-base)"):
            batch = texts[i:i+batch_size]
            enc = tok(batch, padding=True, truncation=True, max_length=256, return_tensors="pt")
            with torch.no_grad():
                out = mdl(enc["input_ids"].to(device), attention_mask=enc["attention_mask"].to(device))
                pooled = mean_pool_last_hidden(out, enc["attention_mask"].to(device))  # [B, H]
                vec = pooled
                if normalize:
                    vec = torch.nn.functional.normalize(vec, p=2, dim=1)
                embs.append(vec.cpu().numpy())
        return np.vstack(embs).astype(np.float32)
    return embed

RO_BERTA_CANDIDATES = [
    # public, RoBERTa-based sentence-transformers (no token needed in most setups)
    "sentence-transformers/paraphrase-roberta-base-v1",
    "sentence-transformers/all-distilroberta-v1",
    # the one you tried (kept last, sometimes requires auth in certain mirrors)
    "sentence-transformers/all-roberta-base-v1",
]

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

embed_fn = None
try:
    # First try sentence-transformers variants
    st_model = try_sentence_transformer(RO_BERTA_CANDIDATES, DEVICE)
    def embed_fn(texts, batch_size=256, normalize=True):
        # sentence_transformers already returns normalized if normalize_embeddings=True
        embs = st_model.encode(texts, batch_size=batch_size, show_progress_bar=True,
                               normalize_embeddings=True)
        return np.asarray(embs, dtype=np.float32)
except Exception as e:
    print("All sentence-transformers RoBERTa attempts failed; falling back to plain roberta-base.")
    embed_fn = build_roberta_fallback(DEVICE)

def encode_texts(texts, batch_size=256):
    return embed_fn(texts, batch_size=batch_size, normalize=True)

# Build embeddings
train_texts = train_df["catalog_content"].fillna("").tolist()
X_train_emb = encode_texts(train_texts, batch_size=128)  # lower batch if OOM

X_test_emb = None
if has_test:
    test_texts = test_df["catalog_content"].fillna("").tolist()
    X_test_emb = encode_texts(test_texts, batch_size=128)

print("Embedding shapes:", X_train_emb.shape, None if X_test_emb is None else X_test_emb.shape)


Trying SentenceTransformer: sentence-transformers/paraphrase-roberta-base-v1


No sentence-transformers model found with name sentence-transformers/paraphrase-roberta-base-v1. Creating a new one with mean pooling.


  -> failed: OSError: sentence-transformers/paraphrase-roberta-base-v1 is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`
Trying SentenceTransformer: sentence-transformers/all-distilroberta-v1


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Batches: 100%|██████████| 586/586 [1:15:14<00:00,  7.70s/it]   
Batches: 100%|██████████| 586/586 [19:04<00:00,  1.95s/it]


Embedding shapes: (75000, 768) (75000, 768)


In [6]:
# # RoBERTa embeddings
# from sentence_transformers import SentenceTransformer
# MODEL_NAME = "sentence-transformers/all-roberta-base-v1"
# st_model = SentenceTransformer(MODEL_NAME, device=DEVICE)
# def encode_texts(texts, batch_size=256):
#     embs = st_model.encode(texts, batch_size=batch_size, show_progress_bar=True, normalize_embeddings=True)
#     return np.asarray(embs, dtype=np.float32)
# X_train_emb = encode_texts(train_df["catalog_content"].fillna("").tolist())
# X_test_emb = encode_texts(test_df["catalog_content"].fillna("").tolist()) if has_test else None
# print("Shapes:", X_train_emb.shape, None if X_test_emb is None else X_test_emb.shape)

In [20]:
# Targets
y = train_df["price"].astype(float).values
y_clip = np.clip(y, np.percentile(y,1), np.percentile(y,99))
scaler_y = StandardScaler().fit(np.log1p(y_clip).reshape(-1,1))
y_std = scaler_y.transform(np.log1p(y).reshape(-1,1)).ravel()
X_tr, X_va, y_tr, y_va = train_test_split(X_train_emb, y_std, test_size=0.1, random_state=SEED)

In [21]:
# DNN
class TabDS(Dataset):
    def __init__(self, X, y=None): self.X=torch.from_numpy(X).float(); self.y=None if y is None else torch.from_numpy(y).float()
    def __len__(self): return self.X.shape[0]
    def __getitem__(self,i): return (self.X[i], self.y[i]) if self.y is not None else self.X[i]

# class MLP(nn.Module):
#     def __init__(self, in_dim, hidden=[1024,512,256], dropout=0.2):
#         super().__init__(); layers=[]; d=in_dim
#         for h in hidden: layers += [nn.Linear(d,h), nn.BatchNorm1d(h), nn.ReLU(), nn.Dropout(dropout)]; d=h
#         layers += [nn.Linear(d,1)]; self.net=nn.Sequential(*layers)
#     def forward(self,x): return self.net(x).squeeze(-1)

class MLP(nn.Module):
    def __init__(self, in_dim, hidden=[768, 512, 256], dropout=0.2):
        super().__init__()
        self.in_norm = nn.LayerNorm(in_dim)  # stabilize input scale
        layers, d = [], in_dim
        for h in hidden:
            layers += [nn.Linear(d, h), nn.GELU(), nn.Dropout(dropout)]
            d = h
        layers += [nn.Linear(d, 1)]
        self.net = nn.Sequential(*layers)
    def forward(self, x):
        return self.net(self.in_norm(x)).squeeze(-1)


def train_model(X_tr, y_tr, X_va, y_va, in_dim):
    model = MLP(in_dim).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)
    loss_fn = nn.SmoothL1Loss(beta=0.5)  # Huber
    tr_dl = DataLoader(TabDS(X_tr, y_tr), batch_size=BATCH_SIZE, shuffle=True)
    va_dl = DataLoader(TabDS(X_va, y_va), batch_size=BATCH_SIZE, shuffle=False)

    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        opt, max_lr=2e-3, epochs=EPOCHS, steps_per_epoch=len(tr_dl)
    )
    CLIP_NORM = 1.0

    for ep in range(1, EPOCHS+1):
        model.train(); tr_loss = 0.0
        for xb, yb in tr_dl:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            opt.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
            opt.step()
            scheduler.step()
            tr_loss += loss.item() * xb.size(0)
        tr_loss /= len(tr_dl.dataset)

        # validation
        model.eval(); va_loss = 0.0
        with torch.no_grad():
            for xb, yb in va_dl:
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                pred = model(xb)
                va_loss += loss_fn(pred, yb).item() * xb.size(0)
        va_loss /= len(va_dl.dataset)
        if ep % 10 == 0 or ep <= 5:
            print(f"Epoch {ep:03d} | train {tr_loss:.4f} | valid {va_loss:.4f}")

    return model


model = train_model(X_tr,y_tr,X_va,y_va,X_train_emb.shape[1])

Epoch 001 | train 0.5422 | valid 0.5214
Epoch 002 | train 0.4924 | valid 0.4891
Epoch 003 | train 0.4702 | valid 0.4724
Epoch 004 | train 0.4556 | valid 0.4607
Epoch 005 | train 0.4431 | valid 0.4510
Epoch 010 | train 0.4009 | valid 0.4300
Epoch 020 | train 0.3474 | valid 0.4162
Epoch 030 | train 0.2934 | valid 0.4177
Epoch 040 | train 0.2414 | valid 0.4198
Epoch 050 | train 0.1963 | valid 0.4194
Epoch 060 | train 0.1665 | valid 0.4203
Epoch 070 | train 0.1448 | valid 0.4351
Epoch 080 | train 0.1321 | valid 0.4322
Epoch 090 | train 0.1199 | valid 0.4297
Epoch 100 | train 0.1121 | valid 0.4404
Epoch 110 | train 0.1033 | valid 0.4342
Epoch 120 | train 0.1003 | valid 0.4298
Epoch 130 | train 0.0932 | valid 0.4327
Epoch 140 | train 0.0878 | valid 0.4332
Epoch 150 | train 0.0830 | valid 0.4354
Epoch 160 | train 0.0775 | valid 0.4309
Epoch 170 | train 0.0720 | valid 0.4320
Epoch 180 | train 0.0698 | valid 0.4315
Epoch 190 | train 0.0666 | valid 0.4369
Epoch 200 | train 0.0643 | valid 0.4389


In [22]:
# Retrain on full set for 100 epochs
model_full = MLP(X_train_emb.shape[1], HIDDEN, DROPOUT).to(DEVICE)
opt=torch.optim.AdamW(model_full.parameters(), lr=LR, weight_decay=WD); loss_fn=nn.MSELoss()
dl=DataLoader(TabDS(X_train_emb, y_std), batch_size=BATCH_SIZE, shuffle=True)
for ep in range(1, EPOCHS+1):
    model_full.train(); loss_sum=0.0
    for xb,yb in dl:
        xb,yb=xb.to(DEVICE), yb.to(DEVICE); opt.zero_grad(); pred=model_full(xb); loss=loss_fn(pred,yb); loss.backward(); opt.step(); loss_sum+=loss.item()*xb.size(0)
    loss_sum/=len(dl.dataset)
    if ep%10==0 or ep<=5: print(f"[FULL] Epoch {ep:03d} | loss {loss_sum:.4f}")

[FULL] Epoch 001 | loss 0.8150
[FULL] Epoch 002 | loss 0.6799
[FULL] Epoch 003 | loss 0.6301
[FULL] Epoch 004 | loss 0.5868
[FULL] Epoch 005 | loss 0.5505
[FULL] Epoch 010 | loss 0.3901
[FULL] Epoch 020 | loss 0.2247
[FULL] Epoch 030 | loss 0.1686
[FULL] Epoch 040 | loss 0.1424
[FULL] Epoch 050 | loss 0.1275
[FULL] Epoch 060 | loss 0.1165
[FULL] Epoch 070 | loss 0.1085
[FULL] Epoch 080 | loss 0.1009
[FULL] Epoch 090 | loss 0.0938
[FULL] Epoch 100 | loss 0.0914
[FULL] Epoch 110 | loss 0.0880
[FULL] Epoch 120 | loss 0.0823
[FULL] Epoch 130 | loss 0.0819
[FULL] Epoch 140 | loss 0.0787
[FULL] Epoch 150 | loss 0.0765
[FULL] Epoch 160 | loss 0.0736
[FULL] Epoch 170 | loss 0.0704
[FULL] Epoch 180 | loss 0.0700
[FULL] Epoch 190 | loss 0.0673
[FULL] Epoch 200 | loss 0.0655
[FULL] Epoch 210 | loss 0.0645
[FULL] Epoch 220 | loss 0.0647
[FULL] Epoch 230 | loss 0.0610
[FULL] Epoch 240 | loss 0.0623
[FULL] Epoch 250 | loss 0.0587
[FULL] Epoch 260 | loss 0.0585
[FULL] Epoch 270 | loss 0.0578
[FULL] E

In [23]:
# --- Compute SMAPE on validation set ---
def smape(y_true, y_pred, eps=1e-9):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    diff = np.abs(y_pred - y_true) / denom
    return 100.0 * np.mean(diff)

model.eval()
with torch.no_grad():
    xva = torch.from_numpy(X_va).float().to(DEVICE)
    pred_std = model(xva).cpu().numpy()

# inverse standardization + log transform
log_pred = scaler_y.inverse_transform(pred_std.reshape(-1,1)).ravel()
log_true = scaler_y.inverse_transform(y_va.reshape(-1,1)).ravel()

# back to price scale
y_pred_price = np.expm1(log_pred)
y_true_price = np.expm1(log_true)

val_smape = smape(y_true_price, y_pred_price)
print(f"Validation SMAPE: {val_smape:.3f}%")

Validation SMAPE: 56.311%


In [24]:
# Predict & save submission
if (DATA_DIR/"test.csv").exists():
    model_full.eval()
    with torch.no_grad():
        xt=torch.from_numpy(X_test_emb).float().to(DEVICE); pred_std=model_full(xt).cpu().numpy()
    log_price = scaler_y.inverse_transform(pred_std.reshape(-1,1)).ravel()
    price = np.expm1(log_price)
    sub = pd.DataFrame({"sample_id": pd.read_csv(DATA_DIR/'test.csv')["sample_id"], "price": price})
    outp = OUTPUT_DIR/"submission_roberta_dnn.csv"; sub.to_csv(outp, index=False); print("Saved:", outp)
else:
    print("No test.csv found; skip submission.")

Saved: OUTPUT-3\submission_roberta_dnn.csv


In [18]:
# ====== Helpers ======
import numpy as np, torch, torch.nn as nn
from torch.utils.data import DataLoader
from sklearn.model_selection import StratifiedKFold
import pandas as pd
from pathlib import Path

def smape(y_true, y_pred, eps=1e-9):
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    denom = (np.abs(y_true) + np.abs(y_pred) + eps) / 2.0
    diff = np.abs(y_pred - y_true) / denom
    return 100.0 * np.mean(diff)

def make_strat_labels(y, max_bins=20, n_splits=5):
    import pandas as pd, numpy as np
    y = np.asarray(y, float)
    bins = pd.qcut(np.log1p(y), q=max_bins, duplicates="drop")
    return pd.Series(bins).cat.codes.to_numpy()

def train_one_fold(X_tr, y_tr, X_va, y_va, in_dim):
    model = MLP(in_dim, hidden=[768,512,256], dropout=DROPOUT).to(DEVICE)  # mild tweak
    loss_fn = nn.SmoothL1Loss(beta=0.5)  # Huber
    opt = torch.optim.AdamW(model.parameters(), lr=LR, weight_decay=WD)

    tr_dl = DataLoader(TabDS(X_tr, y_tr), batch_size=BATCH_SIZE, shuffle=True)
    va_dl = DataLoader(TabDS(X_va, y_va), batch_size=BATCH_SIZE, shuffle=False)

    scheduler = torch.optim.lr_scheduler.OneCycleLR(
        opt, max_lr=LR, epochs=EPOCHS, steps_per_epoch=len(tr_dl)
    )
    CLIP_NORM = 1.0

    for ep in range(1, EPOCHS+1):
        model.train(); tr_loss = 0.0
        for xb, yb in tr_dl:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            opt.zero_grad()
            pred = model(xb)
            loss = loss_fn(pred, yb)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), CLIP_NORM)
            opt.step()
            scheduler.step()
            tr_loss += loss.item() * xb.size(0)
        tr_loss /= len(tr_dl.dataset)

        # (optional) quick valid print every 10 epochs
        if ep % 10 == 0 or ep <= 5:
            model.eval(); va_loss = 0.0
            with torch.no_grad():
                for xb, yb in va_dl:
                    xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                    va_loss += loss_fn(model(xb), yb).item() * xb.size(0)
            va_loss /= len(va_dl.dataset)
            print(f"[Fold train] epoch {ep:03d} | train {tr_loss:.4f} | valid {va_loss:.4f}")
    return model

# ====== 5-fold CV training ======
y_strat = make_strat_labels(y, n_splits=5)
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

oof_std = np.zeros(len(train_df), dtype=np.float32)
test_std_folds = []
models = []

for fold, (tr, va) in enumerate(skf.split(X_train_emb, y_strat), 1):
    print(f"\n========== Fold {fold} ==========")
    model = train_one_fold(
        X_train_emb[tr], y_std[tr],
        X_train_emb[va], y_std[va],
        in_dim=X_train_emb.shape[1]
    )
    models.append(model)

    # OOF predictions (standardized log scale)
    model.eval()
    with torch.no_grad():
        xva = torch.from_numpy(X_train_emb[va]).float().to(DEVICE)
        oof_std[va] = model(xva).cpu().numpy()

        if 'X_test_emb' in globals() and X_test_emb is not None:
            xt = torch.from_numpy(X_test_emb).float().to(DEVICE)
            test_std_folds.append(model(xt).cpu().numpy())

# ====== CV SMAPE on price scale ======
oof_log = scaler_y.inverse_transform(oof_std.reshape(-1,1)).ravel()
oof_price = np.expm1(oof_log)
cv_smape = smape(train_df["price"].values, oof_price)
print(f"\n5-fold CV SMAPE: {cv_smape:.3f}%")

# ====== Test preds & submission (if test available) ======
if 'X_test_emb' in globals() and X_test_emb is not None:
    pred_std = np.mean(test_std_folds, axis=0)  # average on standardized log scale
    log_price = scaler_y.inverse_transform(pred_std.reshape(-1,1)).ravel()
    price = np.expm1(log_price)

    submission = pd.DataFrame({
        "sample_id": test_df["sample_id"].values,
        "price": price
    })

    OUTPUT_DIR = Path("outputs"); OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
    outp = OUTPUT_DIR / "submission_roberta_dnn_cv.csv"
    submission.to_csv(outp, index=False)
    print("Saved submission:", outp)
else:
    print("No test embeddings found; skipped submission.")



[Fold train] epoch 001 | train 0.5485 | valid 0.5173
[Fold train] epoch 002 | train 0.4978 | valid 0.4841
[Fold train] epoch 003 | train 0.4714 | valid 0.4631
[Fold train] epoch 004 | train 0.4537 | valid 0.4526
[Fold train] epoch 005 | train 0.4373 | valid 0.4477
[Fold train] epoch 010 | train 0.3750 | valid 0.4286
[Fold train] epoch 020 | train 0.2645 | valid 0.4300
[Fold train] epoch 030 | train 0.1888 | valid 0.4339
[Fold train] epoch 040 | train 0.1314 | valid 0.4380
[Fold train] epoch 050 | train 0.1000 | valid 0.4345
[Fold train] epoch 060 | train 0.0797 | valid 0.4333
[Fold train] epoch 070 | train 0.0620 | valid 0.4329
[Fold train] epoch 080 | train 0.0521 | valid 0.4290
[Fold train] epoch 090 | train 0.0458 | valid 0.4287
[Fold train] epoch 100 | train 0.0440 | valid 0.4289

[Fold train] epoch 001 | train 0.5460 | valid 0.5181
[Fold train] epoch 002 | train 0.4973 | valid 0.4883
[Fold train] epoch 003 | train 0.4740 | valid 0.4693
[Fold train] epoch 004 | train 0.4563 | vali