# Streamlined Multimodal Price Prediction Pipeline
Focus: Simplicity + Effectiveness

In [None]:
# Notebook setup: imports, project path, environment-driven paths, seeding.
# (Line breaks restored: the statements had been fused onto a single line,
# which is not valid Python and hid everything after the first '#'.)
import os
import sys
import json
import re
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from pathlib import Path
from dotenv import load_dotenv
import warnings

warnings.filterwarnings('ignore')

# Add project root to Python path so the local `src` package is importable.
# The root is taken to be the parent of the current working directory.
project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from src.utils.seed import seed_everything
from src.training.metrics import smape_np
from src.data.load import read_csvs, make_folds
from src.utils.device import setup_device, get_optimal_batch_sizes

# Read overrides (TRAIN_CSV, TEST_CSV, IMG_DIR, OUT_DIR, HF_HOME) from a .env file.
load_dotenv()
SEED = 42

# Auto-detect: CUDA (GCP/GPU) > MPS (Apple Silicon) > CPU
DEVICE = setup_device(verbose=True, benchmark=False)

# All locations are resolved relative to the project root.
TRAIN_CSV = str(project_root / os.getenv("TRAIN_CSV", "dataset/train.csv"))
TEST_CSV = str(project_root / os.getenv("TEST_CSV", "dataset/test.csv"))
IMG_DIR = project_root / Path(os.getenv("IMG_DIR", "data/processed/images"))
OUT_DIR = project_root / Path(os.getenv("OUT_DIR", "outputs"))
HF_HOME = project_root / Path(os.getenv("HF_HOME", ".hf_cache"))

# Make sure every output/cache directory exists before anything writes to it.
for p in [IMG_DIR, OUT_DIR, HF_HOME, OUT_DIR/"oof", OUT_DIR/"test_preds"]:
    p.mkdir(parents=True, exist_ok=True)

seed_everything(SEED)
print("Device:", DEVICE)

# Load Data + Smart Feature Engineering

In [None]:
from sklearn.model_selection import StratifiedShuffleSplit

# Load the raw CSVs and attach a 5-fold assignment to the training frame
train_df, test_df = read_csvs(TRAIN_CSV, TEST_CSV)
train_df = make_folds(train_df, n_folds=5, seed=SEED)

# Carve out a 10% holdout, stratified on 20 quantile bins of log1p(price)
y_log = np.log1p(train_df["price"].clip(lower=0.0))
price_bins = pd.qcut(y_log, q=20, duplicates="drop")
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.10, random_state=SEED)
# The splitter only needs the row count of X, so a zero vector stands in for it
placeholder_X = np.zeros(len(y_log))
holdout_tr_idx, holdout_va_idx = next(sss.split(placeholder_X, price_bins))
# The split yields positional indices, hence .iloc
hold_tr_df = train_df.iloc[holdout_tr_idx].copy()
hold_va_df = train_df.iloc[holdout_va_idx].copy()

print(f"\nData shapes:")
print(f"  Train: {len(train_df)} | Test: {len(test_df)}")
print(f"  Holdout train: {len(hold_tr_df)} | Holdout val: {len(hold_va_df)}")

# Quantity & Unit Parsing (Critical for Price Prediction)

In [None]:
def parse_quantity_features(s):
    """Parse quantity, pack-size and simple text statistics from product text.

    The input is stringified and lower-cased before matching.

    Returns a dict with:
        weight_g    -- largest weight mentioned, in grams (kg is scaled by 1000); 0 if none
        volume_ml   -- largest volume mentioned, in millilitres (l is scaled by 1000); 0 if none
        pack_count  -- largest "N pack" / "pack of N" / "N ct" / "N count" value, at least 1
        total_units -- largest value among all weights and volumes; 0 if none
        word_count  -- number of whitespace-separated tokens
        num_count   -- number of digit runs
        is_premium  -- 1 if any premium keyword occurs as a substring, else 0
        is_bulk     -- 1 if any bulk keyword occurs as a substring, else 0
        has_image   -- constant 1 placeholder
    """
    text = str(s).lower()

    def _collect(specs):
        # Each spec is (regex, multiplier); results keep pattern order, then match order.
        found = []
        for regex, multiplier in specs:
            found.extend(float(m.group(1)) * multiplier for m in re.finditer(regex, text))
        return found

    # Weights normalised to grams
    weights_g = _collect([
        (r'(\d+\.?\d*)\s*kg', 1000),
        (r'(\d+\.?\d*)\s*g\b', 1),
    ])

    # Volumes normalised to millilitres
    volumes_ml = _collect([
        (r'(\d+\.?\d*)\s*l\b', 1000),
        (r'(\d+\.?\d*)\s*ml', 1),
    ])

    # Pack count: the biggest number attached to any pack-style phrase (floor of 1)
    pack_regexes = (
        r'(\d+)\s*pack', r'pack\s*of\s*(\d+)',
        r'(\d+)\s*ct\b', r'(\d+)\s*count',
    )
    pack_hits = [int(hit) for regex in pack_regexes for hit in re.findall(regex, text)]
    pack_count = max([1] + pack_hits)

    # Keyword flags use plain substring matching
    premium_words = ('organic', 'premium', 'gourmet', 'natural')
    bulk_words = ('bulk', 'value', 'economy', 'family')
    premium_flag = 1 if any(word in text for word in premium_words) else 0
    bulk_flag = 1 if any(word in text for word in bulk_words) else 0

    all_amounts = weights_g + volumes_ml

    features = {}
    features['weight_g'] = max(weights_g) if weights_g else 0
    features['volume_ml'] = max(volumes_ml) if volumes_ml else 0
    features['pack_count'] = pack_count
    features['total_units'] = max(all_amounts) if all_amounts else 0
    features['word_count'] = len(text.split())
    features['num_count'] = len(re.findall(r'\d+', text))
    features['is_premium'] = premium_flag
    features['is_bulk'] = bulk_flag
    features['has_image'] = 1  # Will be updated later
    return features

def extract_features(df):
    """Return a copy of ``df`` with the parsed quantity features appended.

    Each ``catalog_content`` value (missing values treated as '') is run
    through ``parse_quantity_features``; the resulting keys become new
    columns, followed by log1p-transformed versions of the three quantity
    columns. The input frame is not modified.

    Args:
        df: DataFrame with a ``catalog_content`` column.

    Returns:
        A new DataFrame with the original columns plus the feature columns.
    """
    df = df.copy()

    # Build the feature table in one DataFrame construction. The previous
    # ``.apply(pd.Series)`` created a separate Series per row, which is very
    # slow on large catalogs; the values produced are the same.
    records = [parse_quantity_features(text) for text in df['catalog_content'].fillna('')]
    features = pd.DataFrame(records, index=df.index)
    df = pd.concat([df, features], axis=1)

    # Log transform large quantities (helps with scale)
    df['log_weight'] = np.log1p(df['weight_g'])
    df['log_volume'] = np.log1p(df['volume_ml'])
    df['log_total_units'] = np.log1p(df['total_units'])

    return df

print("Extracting features...")
# Apply the same feature extraction to every frame, in a fixed order
train_df, test_df, hold_tr_df, hold_va_df = (
    extract_features(frame)
    for frame in (train_df, test_df, hold_tr_df, hold_va_df)
)

# Columns fed to the models; the order here fixes the column order of the feature matrix
_raw_quantity_cols = ['weight_g', 'volume_ml', 'pack_count', 'total_units']
_log_quantity_cols = ['log_weight', 'log_volume', 'log_total_units']
_text_flag_cols = ['word_count', 'num_count', 'is_premium', 'is_bulk', 'has_image']
feature_cols = _raw_quantity_cols + _log_quantity_cols + _text_flag_cols

print(f"Extracted {len(feature_cols)} features")

# Map Image Paths

In [None]:
from urllib.parse import urlparse

def to_local_path(url: str) -> str:
    """Map an image URL to a POSIX-style path inside ``IMG_DIR``.

    The file name is the basename of the URL's path component; when that is
    empty (for example an empty URL) the name "na.jpg" is used instead.
    """
    url_path = urlparse(str(url)).path
    filename = os.path.basename(url_path)
    if not filename:
        filename = "na.jpg"
    return (IMG_DIR / filename).as_posix()

# Attach the expected local image path to every frame and record whether the
# file is actually present on disk.
for df in (train_df, test_df, hold_tr_df, hold_va_df):
    df["image_path"] = df["image_link"].fillna("").map(to_local_path)
    # `has_image` was created as a constant-1 placeholder ("Will be updated
    # later") and never updated, so it carried no information. Fill it in now:
    # 1 when the image file exists locally, 0 otherwise.
    df["has_image"] = df["image_path"].map(os.path.isfile).astype(int)

# Text Embeddings: all-mpnet-base-v2 Sentence-Transformer (Mean-Pooled)

In [None]:
from transformers import AutoTokenizer, AutoModel
from tqdm import trange

# Sentence-embedding checkpoint used for the catalog text.
# Alternative: "sentence-transformers/all-MiniLM-L6-v2" (smaller, faster)
TEXT_MODEL = "sentence-transformers/all-mpnet-base-v2"
_text_cache_dir = HF_HOME.as_posix()  # downloaded weights are cached here

print(f"Loading text encoder: {TEXT_MODEL}")
text_tokenizer = AutoTokenizer.from_pretrained(TEXT_MODEL, cache_dir=_text_cache_dir)
text_encoder = AutoModel.from_pretrained(TEXT_MODEL, cache_dir=_text_cache_dir)
text_encoder = text_encoder.to(DEVICE)
# Inference only: switch off dropout and other train-time behaviour
text_encoder.eval()

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the positions selected by the mask.

    ``model_output[0]`` holds the token embeddings. ``attention_mask`` is
    broadcast over the embedding dimension, so masked-out positions add
    nothing to the sum. The divisor is clamped to 1e-9 so an all-zero mask
    yields zeros instead of a division by zero.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_sum = (hidden_states * mask).sum(1)
    token_counts = mask.sum(1).clamp(min=1e-9)
    return masked_sum / token_counts

@torch.no_grad()
def encode_text(texts, batch_size=64, max_len=128, cache_path=None):
    """Encode texts into L2-normalised, mean-pooled embeddings.

    Args:
        texts: list of strings to encode.
        batch_size: number of texts tokenised and encoded per forward pass.
        max_len: token limit; longer inputs are truncated.
        cache_path: optional ``.npy`` file. If it exists and holds one row
            per input text it is returned directly; otherwise the embeddings
            are computed and written to this path.

    Returns:
        float32 array with one row per text, in input order.
    """
    if cache_path and Path(cache_path).exists():
        cached = np.load(cache_path)
        # A cache written for a different dataset or split would silently
        # misalign embeddings with rows, so only trust it when the row count
        # matches the request.
        if len(cached) == len(texts):
            print(f"Loading cached embeddings from {cache_path}")
            return cached
        print(f"Ignoring stale cache {cache_path}: "
              f"{len(cached)} rows cached, {len(texts)} texts requested")

    embeddings = []
    for i in trange(0, len(texts), batch_size, desc="Encoding text"):
        batch = texts[i:i+batch_size]
        encoded = text_tokenizer(
            batch, padding=True, truncation=True,
            max_length=max_len, return_tensors="pt"
        )
        encoded = {k: v.to(DEVICE) for k, v in encoded.items()}

        outputs = text_encoder(**encoded)
        # Pool over real tokens only, then scale each vector to unit length
        pooled = mean_pooling(outputs, encoded['attention_mask'])
        pooled = torch.nn.functional.normalize(pooled, p=2, dim=1)
        embeddings.append(pooled.cpu().numpy())

    result = np.vstack(embeddings).astype(np.float32)
    if cache_path:
        np.save(cache_path, result)
    return result

# Embedding cache location.
# NOTE(review): this path is relative to the current working directory, unlike
# IMG_DIR/OUT_DIR which are anchored at project_root - confirm that is intended.
CACHE_DIR = Path("data/processed/streamlined_cache")
CACHE_DIR.mkdir(parents=True, exist_ok=True)


def _catalog_texts(frame):
    """Return the catalog text of every row as a list of str (missing -> '')."""
    return frame["catalog_content"].fillna("").astype(str).tolist()


# Raw text for each split
train_texts = _catalog_texts(train_df)
test_texts = _catalog_texts(test_df)
holdtr_texts = _catalog_texts(hold_tr_df)
holdva_texts = _catalog_texts(hold_va_df)

print("\nEncoding text embeddings...")
# One cache file per split, so re-runs skip the encoder
text_train = encode_text(train_texts, cache_path=CACHE_DIR / "text_train.npy")
text_test = encode_text(test_texts, cache_path=CACHE_DIR / "text_test.npy")
text_holdtr = encode_text(holdtr_texts, cache_path=CACHE_DIR / "text_holdtr.npy")
text_holdva = encode_text(holdva_texts, cache_path=CACHE_DIR / "text_holdva.npy")

print(f"Text embedding shape: {text_train.shape}")

# Image Embeddings: CLIP ViT-B/32 (Standard)

In [None]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from tqdm import tqdm

# CLIP checkpoint used for image embeddings
CLIP_MODEL = "openai/clip-vit-base-patch32"
print(f"Loading CLIP: {CLIP_MODEL}")

_clip_cache_dir = HF_HOME.as_posix()  # downloaded weights are cached here
clip_processor = CLIPProcessor.from_pretrained(CLIP_MODEL, cache_dir=_clip_cache_dir)
clip_model = CLIPModel.from_pretrained(CLIP_MODEL, cache_dir=_clip_cache_dir)
clip_model = clip_model.to(DEVICE)
# Inference only: switch off train-time behaviour
clip_model.eval()

@torch.no_grad()
def encode_images(paths, batch_size=32, cache_path=None):
    """Encode images with CLIP into L2-normalised embeddings.

    Row ``i`` of the result always corresponds to ``paths[i]``. Images that
    cannot be opened or decoded get an all-zero row.

    Note: the previous implementation appended the zero row for an unreadable
    image immediately, while readable images were still waiting in the current
    batch, so rows could end up in a different order than ``paths``. Cache
    files written by that version may be misaligned and should be deleted.

    Args:
        paths: iterable of image file paths.
        batch_size: number of images per CLIP forward pass.
        cache_path: optional ``.npy`` file. If it exists and holds one row
            per path it is returned directly; otherwise the embeddings are
            computed and written to this path.

    Returns:
        float32 array of shape (len(paths), projection_dim).
    """
    paths = list(paths)

    if cache_path and Path(cache_path).exists():
        cached = np.load(cache_path)
        # Only trust a cache whose row count matches the request
        if len(cached) == len(paths):
            print(f"Loading cached embeddings from {cache_path}")
            return cached
        print(f"Ignoring stale cache {cache_path}: "
              f"{len(cached)} rows cached, {len(paths)} images requested")

    # Pre-filled with zeros: rows for unreadable images are simply never written.
    result = np.zeros((len(paths), clip_model.config.projection_dim), dtype=np.float32)
    pending_rows = []    # destination row of each image waiting in the batch
    pending_images = []  # the decoded images themselves

    def process_batch():
        """Encode the pending images and store them at their original rows."""
        if not pending_images:
            return
        inputs = clip_processor(images=pending_images, return_tensors="pt")
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        feats = clip_model.get_image_features(**inputs)
        feats = torch.nn.functional.normalize(feats, p=2, dim=1)
        result[pending_rows] = feats.cpu().numpy()
        pending_rows.clear()
        pending_images.clear()

    for row, path in enumerate(tqdm(paths, desc="Encoding images")):
        try:
            # The context manager closes the file handle once pixels are converted
            with Image.open(path) as handle:
                img = handle.convert("RGB")
        except Exception:
            # Best effort: missing or corrupt image keeps its zero row.
            # (Unlike a bare `except:`, this lets KeyboardInterrupt through.)
            continue

        pending_rows.append(row)
        pending_images.append(img)
        if len(pending_images) >= batch_size:
            process_batch()

    process_batch()  # flush the final partial batch

    if cache_path:
        np.save(cache_path, result)
    return result

print("\nEncoding image embeddings...")


def _image_paths(frame):
    """Return the local image path of every row as a list."""
    return frame["image_path"].tolist()


# One cache file per split, so re-runs skip the CLIP forward passes
img_train = encode_images(_image_paths(train_df), cache_path=CACHE_DIR / "img_train.npy")
img_test = encode_images(_image_paths(test_df), cache_path=CACHE_DIR / "img_test.npy")
img_holdtr = encode_images(_image_paths(hold_tr_df), cache_path=CACHE_DIR / "img_holdtr.npy")
img_holdva = encode_images(_image_paths(hold_va_df), cache_path=CACHE_DIR / "img_holdva.npy")

print(f"Image embedding shape: {img_train.shape}")

# Simple but Effective Model Architecture

In [None]:
class SimpleRegressor(nn.Module):
    """Plain feed-forward regressor producing one scalar per input row.

    Layout: BatchNorm on the raw input, then two hidden stages of
    Linear -> BatchNorm -> ReLU -> Dropout (widths ``hidden`` and
    ``hidden // 2``), then a Linear head with a single output.
    There is no residual/skip connection.
    """

    def __init__(self, in_dim, hidden=256, dropout=0.2):
        super().__init__()
        narrow = hidden // 2

        # Input normalisation
        self.bn0 = nn.BatchNorm1d(in_dim)

        # First hidden stage
        self.fc1 = nn.Linear(in_dim, hidden)
        self.bn1 = nn.BatchNorm1d(hidden)
        self.dropout1 = nn.Dropout(dropout)

        # Second hidden stage (half width)
        self.fc2 = nn.Linear(hidden, narrow)
        self.bn2 = nn.BatchNorm1d(narrow)
        self.dropout2 = nn.Dropout(dropout)

        # Scalar output head
        self.fc3 = nn.Linear(narrow, 1)

    def forward(self, x):
        """Map a (batch, in_dim) input to a (batch,) tensor of predictions."""
        h = self.bn0(x)

        h = self.fc1(h)
        h = self.bn1(h)
        h = torch.relu(h)
        h = self.dropout1(h)

        h = self.fc2(h)
        h = self.bn2(h)
        h = torch.relu(h)
        h = self.dropout2(h)

        out = self.fc3(h)
        # Drop the trailing size-1 dimension
        return out.squeeze(1)


class HuberLoss(nn.Module):
    """Mean Huber loss: quadratic for small errors, linear beyond ``delta``."""

    def __init__(self, delta=1.0):
        super().__init__()
        self.delta = delta

    def forward(self, pred, target):
        """Return the mean of 0.5*q**2 + delta*(|e| - q), with q = min(|e|, delta)."""
        magnitude = (pred - target).abs()
        # Part of each error that falls inside the quadratic zone
        capped = magnitude.clamp(max=self.delta)
        # Remainder beyond delta, penalised linearly
        overflow = magnitude - capped
        per_sample = 0.5 * capped**2 + self.delta * overflow
        return per_sample.mean()


def train_model(X_tr, y_tr, X_va, y_va, device,
                epochs=40, batch_size=256, lr=1e-3,
                hidden=256, dropout=0.2, patience=10):
    """Train a SimpleRegressor with early stopping on validation SMAPE.

    Targets are expected in log1p(price) space: predictions and validation
    targets are mapped back with expm1 (floored at 0) before scoring with
    ``smape_np``.

    Args:
        X_tr, y_tr: training inputs and targets (tensors already on ``device``).
        X_va, y_va: validation inputs and targets (tensors already on ``device``).
        device: device the model is moved to.
        epochs: maximum number of passes over the training data.
        batch_size: mini-batch size.
        lr: initial AdamW learning rate (cosine-annealed over ``epochs``).
        hidden: width of the first hidden layer.
        dropout: dropout probability of both hidden stages.
        patience: number of consecutive non-improving epochs before stopping.

    Returns:
        (model, best_smape): the model in eval mode with the best-scoring
        weights restored, and that best validation SMAPE.
    """

    model = SimpleRegressor(X_tr.shape[1], hidden=hidden, dropout=dropout).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
    criterion = HuberLoss(delta=0.5)

    best_smape = float('inf')
    best_state = None
    patience_counter = 0

    for epoch in range(epochs):
        model.train()
        # Fresh shuffle of the training rows every epoch
        indices = torch.randperm(len(X_tr), device=device)

        for i in range(0, len(indices), batch_size):
            batch_idx = indices[i:i+batch_size]
            # BatchNorm1d raises in training mode when given a single sample
            # (it cannot estimate batch statistics), which happens whenever
            # len(X_tr) % batch_size == 1. Skip such a leftover batch.
            if batch_idx.numel() < 2:
                continue
            X_batch = X_tr[batch_idx]
            y_batch = y_tr[batch_idx]

            pred = model(X_batch)
            loss = criterion(pred, y_batch)

            optimizer.zero_grad()
            loss.backward()
            # Cap the gradient norm to keep updates stable
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()

        scheduler.step()

        # Validate
        model.eval()
        with torch.no_grad():
            pred_va = model(X_va)
            # Convert to price space for SMAPE
            pred_price = torch.expm1(pred_va).clamp(min=0).cpu().numpy()
            true_price = torch.expm1(y_va).clamp(min=0).cpu().numpy()
            val_smape = smape_np(true_price, pred_price)

        if val_smape < best_smape:
            best_smape = val_smape
            # Snapshot on CPU so later updates cannot alter the saved weights
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            patience_counter = 0
        else:
            patience_counter += 1
            if patience_counter >= patience:
                break

    # Load best weights (none exist if no epoch ever improved on +inf, e.g. NaN scores)
    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    return model, best_smape

# Train Models with 5-Fold CV

In [None]:
from sklearn.preprocessing import StandardScaler

# 5-fold cross-validation of three neural models. For every fold the dense
# features are standardised on the training part, the three input variants are
# assembled, and each model is trained in turn (text, image, fusion). Validation
# predictions fill the out-of-fold arrays; test predictions are accumulated and
# averaged over folds afterwards. All stored predictions are in price space.
print("\n" + "="*60)
print("TRAINING MODELS")
print("="*60)

# We'll train 3 models:
# 1. Text + Features
# 2. Image + Features  
# 3. Text + Image + Features (Full fusion)

# Out-of-fold predictions (one slot per training row)
oof_text = np.zeros(len(train_df))
oof_image = np.zeros(len(train_df))
oof_fusion = np.zeros(len(train_df))

# Running sums of test predictions; divided by the fold count after the loop
test_text = np.zeros(len(test_df))
test_image = np.zeros(len(test_df))
test_fusion = np.zeros(len(test_df))

for fold in range(5):
    print(f"\n{'='*60}")
    print(f"FOLD {fold+1}/5")
    print('='*60)
    
    # Split data
    # NOTE(review): these are index *labels*, yet below they are also used as
    # positions into the NumPy embedding arrays. That is only correct if
    # train_df has a default 0..n-1 index - confirm against make_folds.
    tr_idx = train_df[train_df.fold != fold].index
    va_idx = train_df[train_df.fold == fold].index
    
    # Scale features (fit on train fold only)
    scaler = StandardScaler()
    feat_tr = scaler.fit_transform(train_df.loc[tr_idx, feature_cols])
    feat_va = scaler.transform(train_df.loc[va_idx, feature_cols])
    feat_te = scaler.transform(test_df[feature_cols])
    
    # Build input matrices
    X_text_tr = np.hstack([text_train[tr_idx], feat_tr]).astype(np.float32)
    X_text_va = np.hstack([text_train[va_idx], feat_va]).astype(np.float32)
    X_text_te = np.hstack([text_test, feat_te]).astype(np.float32)
    
    X_img_tr = np.hstack([img_train[tr_idx], feat_tr]).astype(np.float32)
    X_img_va = np.hstack([img_train[va_idx], feat_va]).astype(np.float32)
    X_img_te = np.hstack([img_test, feat_te]).astype(np.float32)
    
    X_fusion_tr = np.hstack([text_train[tr_idx], img_train[tr_idx], feat_tr]).astype(np.float32)
    X_fusion_va = np.hstack([text_train[va_idx], img_train[va_idx], feat_va]).astype(np.float32)
    X_fusion_te = np.hstack([text_test, img_test, feat_te]).astype(np.float32)
    
    # Convert to tensors
    X_text_tr_t = torch.from_numpy(X_text_tr).to(DEVICE)
    X_text_va_t = torch.from_numpy(X_text_va).to(DEVICE)
    X_text_te_t = torch.from_numpy(X_text_te).to(DEVICE)
    
    X_img_tr_t = torch.from_numpy(X_img_tr).to(DEVICE)
    X_img_va_t = torch.from_numpy(X_img_va).to(DEVICE)
    X_img_te_t = torch.from_numpy(X_img_te).to(DEVICE)
    
    X_fusion_tr_t = torch.from_numpy(X_fusion_tr).to(DEVICE)
    X_fusion_va_t = torch.from_numpy(X_fusion_va).to(DEVICE)
    X_fusion_te_t = torch.from_numpy(X_fusion_te).to(DEVICE)
    
    # Targets (log space)
    # NOTE(review): unlike the holdout split, price is not clipped at 0 before
    # log1p here - presumably prices are non-negative; verify.
    y_tr = torch.from_numpy(np.log1p(train_df.loc[tr_idx, 'price'].values).astype(np.float32)).to(DEVICE)
    y_va = torch.from_numpy(np.log1p(train_df.loc[va_idx, 'price'].values).astype(np.float32)).to(DEVICE)
    
    # Train text model
    print("\nTraining Text Model...")
    model_text, smape_text = train_model(
        X_text_tr_t, y_tr, X_text_va_t, y_va, DEVICE,
        epochs=40, batch_size=256, lr=1e-3, hidden=256
    )
    # Back-transform log predictions to prices, floored at 0
    with torch.no_grad():
        oof_text[va_idx] = torch.expm1(model_text(X_text_va_t)).clamp(min=0).cpu().numpy()
        test_text += torch.expm1(model_text(X_text_te_t)).clamp(min=0).cpu().numpy()
    print(f"  Validation SMAPE: {smape_text:.4f}")
    
    # Train image model
    print("\nTraining Image Model...")
    model_img, smape_img = train_model(
        X_img_tr_t, y_tr, X_img_va_t, y_va, DEVICE,
        epochs=40, batch_size=256, lr=1e-3, hidden=256
    )
    with torch.no_grad():
        oof_image[va_idx] = torch.expm1(model_img(X_img_va_t)).clamp(min=0).cpu().numpy()
        test_image += torch.expm1(model_img(X_img_te_t)).clamp(min=0).cpu().numpy()
    print(f"  Validation SMAPE: {smape_img:.4f}")
    
    # Train fusion model
    # (wider network, slightly lower learning rate and more dropout than the
    # single-modality models)
    print("\nTraining Fusion Model...")
    model_fusion, smape_fusion = train_model(
        X_fusion_tr_t, y_tr, X_fusion_va_t, y_va, DEVICE,
        epochs=40, batch_size=256, lr=8e-4, hidden=384, dropout=0.25
    )
    with torch.no_grad():
        oof_fusion[va_idx] = torch.expm1(model_fusion(X_fusion_va_t)).clamp(min=0).cpu().numpy()
        test_fusion += torch.expm1(model_fusion(X_fusion_te_t)).clamp(min=0).cpu().numpy()
    print(f"  Validation SMAPE: {smape_fusion:.4f}")

# Average test predictions
test_text /= 5
test_image /= 5
test_fusion /= 5

# Calculate OOF scores
# NOTE(review): each fold's model is early-stopped on the same fold it is
# scored on, so these OOF numbers are likely somewhat optimistic.
smape_text_oof = smape_np(train_df['price'].values, oof_text)
smape_img_oof = smape_np(train_df['price'].values, oof_image)
smape_fusion_oof = smape_np(train_df['price'].values, oof_fusion)

print(f"\n{'='*60}")
print("OOF SCORES (5-Fold CV)")
print('='*60)
print(f"Text Model:   {smape_text_oof:.4f}")
print(f"Image Model:  {smape_img_oof:.4f}")
print(f"Fusion Model: {smape_fusion_oof:.4f}")

# Train LightGBM (Tree-based model for diversity)

In [None]:
# Import LightGBM, installing it into the running interpreter on first use.
try:
    import lightgbm as lgb
except ImportError:
    import subprocess
    # sys.executable ensures pip targets the same environment as this kernel;
    # check_call raises CalledProcessError if the installation fails.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "lightgbm", "-q"])
    import lightgbm as lgb

# 5-fold LightGBM on the concatenated text + image + dense features, trained
# on log1p(price). Predictions are stored in price space.
print("\n" + "="*60)
print("TRAINING LIGHTGBM")
print("="*60)

# Build feature matrix for GBM (no scaling needed)
dense_train = train_df[feature_cols].values
dense_test = test_df[feature_cols].values

X_gbm_train = np.hstack([text_train, img_train, dense_train]).astype(np.float32)
X_gbm_test = np.hstack([text_test, img_test, dense_test]).astype(np.float32)

oof_lgb = np.zeros(len(train_df))
test_lgb = np.zeros(len(test_df))

for fold in range(5):
    print(f"\nFold {fold+1}/5...")
    # NOTE(review): index labels double as positions into the NumPy arrays;
    # this assumes train_df has a default 0..n-1 index - confirm.
    tr_idx = train_df[train_df.fold != fold].index
    va_idx = train_df[train_df.fold == fold].index

    model = lgb.LGBMRegressor(
        n_estimators=1000,
        learning_rate=0.05,
        num_leaves=31,
        max_depth=-1,
        subsample=0.8,
        # Row subsampling is only applied when subsample_freq > 0; with the
        # default of 0 the subsample=0.8 setting above had no effect.
        subsample_freq=1,
        colsample_bytree=0.8,
        min_child_samples=20,
        reg_alpha=0.1,
        reg_lambda=1.0,
        random_state=SEED + fold,  # different seed per fold
        verbose=-1
    )

    # Early stopping monitors the validation fold; log_evaluation(0) silences
    # per-iteration output.
    model.fit(
        X_gbm_train[tr_idx], np.log1p(train_df.loc[tr_idx, 'price'].values),
        eval_set=[(X_gbm_train[va_idx], np.log1p(train_df.loc[va_idx, 'price'].values))],
        callbacks=[lgb.early_stopping(100, verbose=False), lgb.log_evaluation(0)]
    )

    # Back-transform to price space. expm1 of a negative log-prediction is a
    # negative price, so floor at 0 - the same treatment the neural models get
    # via .clamp(min=0).
    oof_lgb[va_idx] = np.clip(np.expm1(model.predict(X_gbm_train[va_idx])), 0, None)
    test_lgb += np.clip(np.expm1(model.predict(X_gbm_test)), 0, None)

# Average the accumulated test predictions over the folds
test_lgb /= 5
smape_lgb_oof = smape_np(train_df['price'].values, oof_lgb)
print(f"\nLightGBM OOF SMAPE: {smape_lgb_oof:.4f}")

# Simple Ensemble (Best 2-Model Weighted Blend)

In [None]:
print("\n" + "="*60)
print("ENSEMBLE OPTIMIZATION")
print("="*60)

# Out-of-fold predictions of every candidate model
models = {
    'fusion': oof_fusion,
    'text': oof_text,
    'lgb': oof_lgb,
    'image': oof_image,
}

# Matching out-of-fold SMAPE of each candidate
scores = {
    'fusion': smape_fusion_oof,
    'text': smape_text_oof,
    'lgb': smape_lgb_oof,
    'image': smape_img_oof,
}

print("\nIndividual model scores:")
# Best (lowest SMAPE) first
for name in sorted(scores, key=scores.get):
    score = scores[name]
    print(f"  {name:10s}: {score:.4f}")

# Exhaustive search over every model pair and a 0.0..1.0 weight grid (step 0.1);
# the pair's weights always sum to 1 and the first best combination is kept.
y_true = train_df['price'].values
best_smape = float('inf')
best_weights = None
best_models = None

model_names = list(models.keys())
weight_grid = np.arange(0, 1.05, 0.1)
for i, name1 in enumerate(model_names):
    for name2 in model_names[i + 1:]:
        oof_first, oof_second = models[name1], models[name2]

        for w1 in weight_grid:
            w2 = 1 - w1
            pred = w1 * oof_first + w2 * oof_second
            smape = smape_np(y_true, pred)

            if not smape < best_smape:
                continue
            best_smape = smape
            best_weights = (w1, w2)
            best_models = (name1, name2)

print(f"\nBest 2-model ensemble:")
print(f"  Models: {best_models[0]} + {best_models[1]}")
print(f"  Weights: {best_weights[0]:.2f} + {best_weights[1]:.2f}")
print(f"  OOF SMAPE: {best_smape:.4f}")

# Holdout Validation

In [None]:
# Holdout check: train the fusion architecture on the 90% holdout-train split
# and score it on the 10% holdout-validation split.
print("\n" + "="*60)
print("HOLDOUT VALIDATION")
print("="*60)

# Scale features for holdout (statistics come from the holdout-train rows only)
scaler_hold = StandardScaler()
feat_hold_tr = scaler_hold.fit_transform(hold_tr_df[feature_cols])
feat_hold_va = scaler_hold.transform(hold_va_df[feature_cols])

# Build inputs: text embedding | image embedding | scaled dense features
X_fusion_hold_tr = np.hstack([text_holdtr, img_holdtr, feat_hold_tr]).astype(np.float32)
X_fusion_hold_va = np.hstack([text_holdva, img_holdva, feat_hold_va]).astype(np.float32)

X_fusion_hold_tr_t = torch.from_numpy(X_fusion_hold_tr).to(DEVICE)
X_fusion_hold_va_t = torch.from_numpy(X_fusion_hold_va).to(DEVICE)

# Targets in log1p(price) space, as train_model expects
y_hold_tr = torch.from_numpy(np.log1p(hold_tr_df['price'].values).astype(np.float32)).to(DEVICE)
y_hold_va = torch.from_numpy(np.log1p(hold_va_df['price'].values).astype(np.float32)).to(DEVICE)

# Train on holdout train
# dropout=0.25 matches the fusion model trained in cross-validation; it was
# previously omitted here, so the holdout run silently used the 0.2 default
# and did not evaluate the same configuration.
# NOTE(review): early stopping inside train_model selects the best epoch on
# this same holdout set, so the score below is likely somewhat optimistic.
print("Training on holdout train set...")
model_hold, _ = train_model(
    X_fusion_hold_tr_t, y_hold_tr, X_fusion_hold_va_t, y_hold_va, DEVICE,
    epochs=40, batch_size=256, lr=8e-4, hidden=384, dropout=0.25
)

# Predict in price space, floored at 0
with torch.no_grad():
    pred_hold = torch.expm1(model_hold(X_fusion_hold_va_t)).clamp(min=0).cpu().numpy()

smape_holdout = smape_np(hold_va_df['price'].values, pred_hold)

print(f"\n{'='*60}")
print(f"HOLDOUT SMAPE: {smape_holdout:.4f}")
print('='*60)

# Compare against the target threshold of 30
if smape_holdout < 30:
    print("✅ TARGET ACHIEVED!")
else:
    print(f"⚠️  Close! Gap to 30: {smape_holdout - 30:.2f}")

# Generate Final Submission

In [None]:
print("\n" + "="*60)
print("GENERATING SUBMISSION")
print("="*60)

# Fold-averaged test predictions of every candidate model
test_preds = {
    'fusion': test_fusion,
    'text': test_text,
    'lgb': test_lgb,
    'image': test_image
}

# Blend the two models chosen on out-of-fold data, using the same weights
_first_name, _second_name = best_models
_first_weight, _second_weight = best_weights
final_pred = _first_weight * test_preds[_first_name] + _second_weight * test_preds[_second_name]

# Keep predictions inside the 0.1%-99.9% quantile range of training prices
_train_prices = train_df['price']
lower = _train_prices.quantile(0.001)
upper = _train_prices.quantile(0.999)
final_pred = np.clip(final_pred, lower, upper)

# Assemble and write the submission file
submission = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_pred
})

_submission_path = OUT_DIR / "test_preds" / "streamlined_ensemble.csv"
submission.to_csv(_submission_path, index=False)

print(f"\n✅ Submission saved!")
print(f"\nFinal Summary:")
print(f"  Best OOF SMAPE: {best_smape:.4f}")
print(f"  Holdout SMAPE:  {smape_holdout:.4f}")
print(f"  Models used:    {best_models[0]} + {best_models[1]}")
print(f"  Weights:        {best_weights[0]:.2f} + {best_weights[1]:.2f}")
print("="*60)