In [None]:
# --- Notebook setup: imports, project path, configuration, output folders ---
# The statements below were previously fused onto a single physical line, which
# is not valid Python; they are restored one per line in their original order.
import os
import sys
import json
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from pathlib import Path
from dotenv import load_dotenv
import warnings

# Silences every warning for the rest of the session.
warnings.filterwarnings('ignore')

# The project root is taken to be the parent of the current working directory;
# it is put on sys.path so that the `src.*` imports below resolve.
project_root = Path().resolve().parent
if str(project_root) not in sys.path:
    sys.path.insert(0, str(project_root))

from src.utils.seed import seed_everything
from src.training.metrics import smape_np
from src.data.load import read_csvs, make_folds
from src.utils.device import setup_device, get_optimal_batch_sizes

# Load variables from a .env file (if any) before the os.getenv lookups below.
load_dotenv()

SEED = 42

# Auto-detect: CUDA (GCP/GPU) > MPS (Apple Silicon) > CPU
DEVICE = setup_device(verbose=True, benchmark=False)

# Every path is anchored on project_root; the environment may override the
# relative part for the CSVs, the image folder and the Hugging Face cache.
TRAIN_CSV = str(project_root / os.getenv("TRAIN_CSV", "dataset/train.csv"))
TEST_CSV = str(project_root / os.getenv("TEST_CSV", "dataset/test.csv"))
IMG_DIR = project_root / Path(os.getenv("IMG_DIR", "data/processed/images"))
OUT_DIR_V2 = project_root / Path("outputs_v2")  # V2 output directory
HF_HOME = project_root / Path(os.getenv("HF_HOME", ".hf_cache"))

# Create every folder the later cells write into.
for p in [IMG_DIR, OUT_DIR_V2, HF_HOME, OUT_DIR_V2/"oof_v2", OUT_DIR_V2/"test_preds_v2"]:
    p.mkdir(parents=True, exist_ok=True)

seed_everything(SEED)
print(f"Device: {DEVICE}")

In [None]:
# Read both CSV splits, then let make_folds process the training frame.
# Later cells select rows through a `fold` column with values 0, 1 and 2.
train_df, test_df = read_csvs(TRAIN_CSV, TEST_CSV)
train_df = make_folds(train_df, seed=SEED, n_folds=3)  # Only 3 folds for speed

# Simple feature extraction
def quick_features(df):
    """Return a copy of ``df`` with four cheap text features added.

    The features are derived from the ``catalog_content`` column, with
    missing values treated as empty strings:

    * ``text_len``   - number of characters
    * ``word_count`` - number of whitespace-separated tokens
    * ``has_pack``   - 1 if the standalone word "pack" occurs (any case), else 0
    * ``has_number`` - 1 if at least one digit occurs, else 0

    The input frame is left untouched.
    """
    content = df['catalog_content'].fillna('')
    pack_flag = content.str.contains(r'\bpack\b', case=False, regex=True)
    digit_flag = content.str.contains(r'\d+', regex=True)
    return df.assign(
        text_len=content.str.len(),
        word_count=content.str.split().str.len(),
        has_pack=pack_flag.astype(int),
        has_number=digit_flag.astype(int),
    )

# Add the hand-crafted text features to both splits.
train_df, test_df = quick_features(train_df), quick_features(test_df)

# NOTE(review): 'has_image' is not produced by quick_features; presumably it is
# already present in the frames returned by read_csvs/make_folds - confirm,
# otherwise the column selection below raises KeyError.
feature_cols = ['text_len', 'word_count', 'has_pack', 'has_number', 'has_image']

from sklearn.preprocessing import StandardScaler

# Global scaler: statistics come from the full training split and are reused
# unchanged for the test split.
scaler = StandardScaler().fit(train_df[feature_cols])
train_feats = scaler.transform(train_df[feature_cols])
test_feats = scaler.transform(test_df[feature_cols])

print(f"Train: {len(train_df)} | Test: {len(test_df)}")
print(f"Features: {len(feature_cols)}")

In [None]:
from urllib.parse import urlparse

def to_local_path(url: str) -> str:
    """Map an image URL to a POSIX-style file path inside ``IMG_DIR``.

    Only the last component of the URL's path is kept as the file name. When
    that component is empty (for example for an empty URL), the placeholder
    name ``na.jpg`` is used.
    """
    filename = os.path.basename(urlparse(str(url)).path)
    if not filename:
        filename = "na.jpg"
    return (IMG_DIR / filename).as_posix()

# Give both splits an `image_path` column; rows without a link are first
# turned into empty strings, which to_local_path maps to the placeholder file.
for frame in (train_df, test_df):
    frame["image_path"] = frame["image_link"].fillna("").map(to_local_path)

In [None]:
from transformers import AutoTokenizer, AutoModel
from tqdm import trange

# Text encoder: tokenizer and model are fetched into (or read from) HF_HOME,
# the model is moved to DEVICE and switched to evaluation mode.
MINILM = "sentence-transformers/all-MiniLM-L6-v2"  # Smaller, faster
tok = AutoTokenizer.from_pretrained(MINILM, cache_dir=HF_HOME.as_posix())
enc = AutoModel.from_pretrained(MINILM, cache_dir=HF_HOME.as_posix()).to(DEVICE).eval()

@torch.no_grad()
def encode_text_fast(texts, batch_size=256, max_len=128, cache_path=None):
    """Encode texts into L2-normalised, mean-pooled embeddings.

    Uses the module-level tokenizer ``tok`` and model ``enc`` on ``DEVICE``.

    Args:
        texts: Sequence of strings to encode.
        batch_size: Number of texts tokenised and encoded per forward pass.
        max_len: Token length at which each text is truncated.
        cache_path: Optional ``.npy`` file. If it exists and holds a 2-D array
            with one row per text it is returned without running the model;
            otherwise the embeddings are computed and saved there.

    Returns:
        ``np.float32`` array with one row per input text.
    """
    texts = list(texts)
    n_texts = len(texts)

    if cache_path and Path(cache_path).exists():
        cached = np.load(cache_path)
        # A cache written for a different number of rows would silently
        # misalign embeddings with the dataframe, so it is not trusted.
        if cached.ndim == 2 and cached.shape[0] == n_texts:
            return cached
        print(f"Text cache {cache_path} has shape {cached.shape}, "
              f"expected {n_texts} rows; re-encoding.")

    if n_texts == 0:
        # np.vstack cannot stack an empty list; return an empty matrix instead.
        return np.zeros((0, enc.config.hidden_size), dtype=np.float32)

    out = []
    for i in trange(0, n_texts, batch_size, desc="Text encoding"):
        batch = texts[i:i+batch_size]
        enc_dict = tok(batch, padding=True, truncation=True,
                       max_length=max_len, return_tensors="pt")
        enc_dict = {k: v.to(DEVICE) for k, v in enc_dict.items()}

        hidden = enc(**enc_dict).last_hidden_state
        # Mean over real (non-padding) tokens only; the clamp guards the
        # division should a row ever have an all-zero attention mask.
        mask = enc_dict["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        embeddings = (hidden * mask).sum(1) / mask.sum(1).clamp(min=1e-9)
        embeddings = torch.nn.functional.normalize(embeddings, p=2, dim=1)
        out.append(embeddings.cpu().numpy())

    arr = np.vstack(out).astype(np.float32)
    if cache_path:
        np.save(cache_path, arr)
    return arr

# Embedding cache folder. Anchored on project_root like every other path in
# this notebook, so its location no longer depends on the working directory.
# (Caches written earlier under the working directory are not picked up.)
CACHE_V2 = project_root / "data/processed/cache_v2"
CACHE_V2.mkdir(parents=True, exist_ok=True)

# Missing descriptions become empty strings so every row gets an embedding.
train_texts = train_df["catalog_content"].fillna("").astype(str).tolist()
test_texts = test_df["catalog_content"].fillna("").astype(str).tolist()

print("Encoding text (MiniLM)...")
txt_train = encode_text_fast(train_texts, cache_path=CACHE_V2/"txt_train_v2.npy")
txt_test = encode_text_fast(test_texts, cache_path=CACHE_V2/"txt_test_v2.npy")
print(f"Text embeddings: {txt_train.shape}")

In [None]:
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
from tqdm import tqdm

# Image encoder: processor and CLIP model are fetched into (or read from)
# HF_HOME; the model is moved to DEVICE and switched to evaluation mode.
CLIP_NAME = "openai/clip-vit-base-patch32"
processor = CLIPProcessor.from_pretrained(CLIP_NAME, cache_dir=HF_HOME.as_posix())
clip_model = CLIPModel.from_pretrained(CLIP_NAME, cache_dir=HF_HOME.as_posix()).to(DEVICE).eval()

@torch.no_grad()
def encode_images_fast(paths, batch_size=128, cache_path=None):
    """Encode image files into L2-normalised CLIP image embeddings (no TTA).

    Uses the module-level ``processor`` and ``clip_model`` on ``DEVICE``.
    Row ``i`` of the result always belongs to ``paths[i]``; images that cannot
    be opened or decoded keep an all-zero row.

    Args:
        paths: Sequence of image file paths.
        batch_size: Number of successfully loaded images per forward pass.
        cache_path: Optional ``.npy`` file. If it exists and holds a 2-D array
            with one row per path it is returned without running the model;
            otherwise the embeddings are computed and saved there.

    Returns:
        ``np.float32`` array of shape ``(len(paths), embedding_dim)``.
    """
    paths = list(paths)
    n_paths = len(paths)

    if cache_path and Path(cache_path).exists():
        cached = np.load(cache_path)
        # NOTE: a cache with the right row count is trusted as-is. Caches
        # written by the earlier version of this function may have zero rows
        # in the wrong positions and should be deleted once.
        if cached.ndim == 2 and cached.shape[0] == n_paths:
            return cached
        print(f"Image cache {cache_path} has shape {cached.shape}, "
              f"expected {n_paths} rows; re-encoding.")

    dim = clip_model.visual_projection.out_features
    # Pre-filled with zeros: rows of unreadable images are simply never written.
    arr = np.zeros((n_paths, dim), dtype=np.float32)

    rows, batch = [], []  # output row index and decoded image, kept in step

    def flush():
        """Embed the pending images and write each result to its own row."""
        if not batch:
            return
        inputs = processor(images=batch, return_tensors="pt")["pixel_values"].to(DEVICE)
        f = clip_model.get_image_features(pixel_values=inputs)
        f = torch.nn.functional.normalize(f, p=2, dim=1)
        arr[rows] = f.cpu().numpy()
        rows.clear()
        batch.clear()

    for row, p in enumerate(tqdm(paths, desc="Image encoding")):
        try:
            # The context manager closes the file; convert() returns an
            # independent in-memory image.
            with Image.open(p) as im:
                img = im.convert("RGB")
        except Exception:
            # Missing or corrupt image: leave the zero vector in place.
            continue

        rows.append(row)
        batch.append(img)
        if len(batch) >= batch_size:
            flush()
    flush()

    if cache_path:
        np.save(cache_path, arr)
    return arr

print("Encoding images (CLIP)...")
# One embedding row per dataframe row, cached next to the text embeddings.
train_image_paths = train_df["image_path"].tolist()
test_image_paths = test_df["image_path"].tolist()
img_train = encode_images_fast(train_image_paths, cache_path=CACHE_V2/"img_train_v2.npy")
img_test = encode_images_fast(test_image_paths, cache_path=CACHE_V2/"img_test_v2.npy")
print(f"Image embeddings: {img_train.shape}")

In [None]:
class FastRegressor(nn.Module):
    """Small three-layer MLP regressor.

    Layout: Linear(input_dim -> hidden) -> ReLU -> Dropout
            -> Linear(hidden -> hidden // 2) -> ReLU -> Dropout
            -> Linear(hidden // 2 -> 1).
    """

    def __init__(self, input_dim, hidden=256, drop=0.2):
        super().__init__()
        half = hidden // 2
        # Layers are created in forward order so parameter names
        # (net.0, net.3, net.6) and initialisation order stay fixed.
        layers = [
            nn.Linear(input_dim, hidden),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(hidden, half),
            nn.ReLU(),
            nn.Dropout(drop),
            nn.Linear(half, 1),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return one prediction per row; the trailing size-1 axis is dropped."""
        out = self.net(x)
        return out.squeeze(-1)

def train_fast(Xtr, ytr_log, Xva, yva_log, epochs=15, bs=512, lr=1e-3, patience=3):
    """Train a FastRegressor with Huber loss and SMAPE-based early stopping.

    Targets are expected in log1p space: predictions and validation targets
    are mapped back with ``expm1`` before SMAPE is computed.

    Args:
        Xtr: Training feature tensor (rows are samples), already on DEVICE.
        ytr_log: Training targets in log1p space, already on DEVICE.
        Xva: Validation feature tensor, already on DEVICE.
        yva_log: Validation targets in log1p space, already on DEVICE.
        epochs: Maximum number of passes over the training data.
        bs: Mini-batch size.
        lr: AdamW learning rate.
        patience: Number of consecutive epochs without a better validation
            SMAPE after which training stops.

    Returns:
        ``(model, p_va)``: the model restored to its best validation epoch and
        its validation predictions, still in log1p space.
    """
    model = FastRegressor(Xtr.shape[1], hidden=256, drop=0.2).to(DEVICE)
    opt = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    loss_fn = nn.HuberLoss(delta=1.0)

    # The validation targets never change, so convert them to prices once.
    with torch.no_grad():
        y_va_price = torch.expm1(yva_log).cpu().numpy()

    best_smape = float("inf")
    best_state = None
    patience_count = 0
    n_train = Xtr.shape[0]

    for _ in range(epochs):
        model.train()
        idx = torch.randperm(n_train, device=DEVICE)
        for i in range(0, n_train, bs):
            b = idx[i:i+bs]
            pred = model(Xtr[b])
            loss = loss_fn(pred, ytr_log[b])
            opt.zero_grad()
            loss.backward()
            opt.step()

        # Early stopping on SMAPE, measured on prices clamped at zero.
        model.eval()
        with torch.no_grad():
            p_va_price = torch.expm1(model(Xva)).clamp_min(0).cpu().numpy()
        sm = smape_np(y_va_price, p_va_price)

        if sm < best_smape:
            best_smape = sm
            # Snapshot on CPU so the best weights do not occupy device memory.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            patience_count = 0
        else:
            patience_count += 1
            if patience_count >= patience:
                break

    # best_state stays None when no epoch ever improved on infinity (for
    # example when SMAPE is NaN); the last weights are then kept.
    if best_state is not None:
        model.load_state_dict(best_state)

    model.eval()
    with torch.no_grad():
        p_va = model(Xva)
    return model, p_va

In [None]:
print("\n" + "="*60)
print("Training Multimodal Model (3-Fold CV)")
print("="*60)

# Out-of-fold predictions (price scale) and the running sum of test
# predictions over folds; the sum is divided by the fold count when saved.
oof = np.zeros(len(train_df), dtype=np.float32)
test_accum = np.zeros(len(test_df), dtype=np.float32)

# The embedding matrices are plain NumPy arrays, so rows must be selected by
# position. Index labels only coincide with positions for a default
# RangeIndex, hence positions are computed explicitly from the fold column.
fold_ids = train_df["fold"].to_numpy()

for fold in range(3):
    print(f"\nFold {fold+1}/3...")
    tr_pos = np.flatnonzero(fold_ids != fold)
    va_pos = np.flatnonzero(fold_ids == fold)
    tr = train_df.iloc[tr_pos]
    va = train_df.iloc[va_pos]

    # Fold-local scaling: statistics come from this fold's training rows only.
    scaler_f = StandardScaler()
    feats_tr = scaler_f.fit_transform(tr[feature_cols])
    feats_va = scaler_f.transform(va[feature_cols])
    feats_te = scaler_f.transform(test_df[feature_cols])

    # Combine: text + image + features
    Xtr = np.hstack([txt_train[tr_pos], img_train[tr_pos], feats_tr]).astype(np.float32)
    Xva = np.hstack([txt_train[va_pos], img_train[va_pos], feats_va]).astype(np.float32)
    Xte = np.hstack([txt_test, img_test, feats_te]).astype(np.float32)

    Xtr = torch.from_numpy(Xtr).to(DEVICE)
    Xva = torch.from_numpy(Xva).to(DEVICE)
    Xte = torch.from_numpy(Xte).to(DEVICE)

    # Train in log1p(price) space; negative prices are clipped to zero first.
    ytr = torch.from_numpy(np.log1p(tr.price.values.clip(min=0)).astype(np.float32)).to(DEVICE)
    yva = torch.from_numpy(np.log1p(va.price.values.clip(min=0)).astype(np.float32)).to(DEVICE)

    model, p_va = train_fast(Xtr, ytr, Xva, yva, epochs=15, bs=512, lr=1e-3)

    with torch.no_grad():
        p_te = model(Xte)

    # Back to the price scale before storing.
    oof[va_pos] = torch.expm1(p_va).cpu().numpy()
    test_accum += torch.expm1(p_te).cpu().numpy()

sm = smape_np(train_df.price.values, np.clip(oof, 0, None))
print(f"\n{'='*60}")
print(f"OOF SMAPE: {sm:.4f}")
print(f"{'='*60}")

In [None]:
# Persist the out-of-fold predictions of the neural model.
oof_path = OUT_DIR_V2/"oof_v2"/"fast_v2.npy"
np.save(oof_path, oof)

# test_accum holds the sum over the 3 folds: average it, then floor at zero.
final_sub = pd.DataFrame({
    "sample_id": test_df["sample_id"],
    "price": (test_accum / 3.0).clip(min=0),
})
final_sub.to_csv(OUT_DIR_V2/"test_preds_v2"/"fast_v2.csv", index=False)

print(f"\n✅ Saved to: {OUT_DIR_V2}/test_preds_v2/fast_v2.csv")
print(f"\nSubmission preview:")
print(final_sub.head())

In [None]:
print("\n" + "="*60)
print("Optional: Adding XGBoost")
print("="*60)

# Only the import itself is guarded: an ImportError raised later, during
# training, must not be reported as "XGBoost not installed".
try:
    import xgboost as xgb
except ImportError:
    print("XGBoost not installed. Skipping ensemble.")
else:
    # Combine all features for GBM (hand-crafted features use the global scaler)
    X_full_train = np.hstack([txt_train, img_train, train_feats]).astype(np.float32)
    X_full_test = np.hstack([txt_test, img_test, test_feats]).astype(np.float32)

    oof_xgb = np.zeros(len(train_df), dtype=np.float32)
    test_xgb_accum = np.zeros(len(test_df), dtype=np.float32)

    # NumPy arrays are indexed by position, so fold membership is computed as
    # positions instead of reusing dataframe index labels.
    fold_ids = train_df["fold"].to_numpy()
    # Clip negative prices before log1p, matching the neural-network targets.
    log_price = np.log1p(train_df["price"].to_numpy().clip(min=0))

    for fold in range(3):
        print(f"Fold {fold+1}/3...", end=" ")
        tr_pos = np.flatnonzero(fold_ids != fold)
        va_pos = np.flatnonzero(fold_ids == fold)

        # Named xgb_model so the neural `model` from the previous cell is not
        # overwritten by an object of a different kind.
        xgb_model = xgb.XGBRegressor(
            n_estimators=500,  # Fewer trees for speed
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            random_state=SEED + fold,
            tree_method='hist',
            n_jobs=-1
        )

        xgb_model.fit(
            X_full_train[tr_pos],
            log_price[tr_pos],
            eval_set=[(X_full_train[va_pos], log_price[va_pos])],
            verbose=False
        )

        # Predictions are in log1p space; convert back to prices.
        oof_xgb[va_pos] = np.expm1(xgb_model.predict(X_full_train[va_pos]))
        test_xgb_accum += np.expm1(xgb_model.predict(X_full_test))
        print("Done")

    sm_xgb = smape_np(train_df.price.values, np.clip(oof_xgb, 0, None))
    print(f"XGBoost OOF SMAPE: {sm_xgb:.4f}")

    # Simple 50/50 ensemble (both test sums are averaged over the 3 folds first)
    oof_blend = (oof + oof_xgb) / 2
    test_blend = (test_accum/3.0 + test_xgb_accum/3.0) / 2

    sm_blend = smape_np(train_df.price.values, np.clip(oof_blend, 0, None))
    print(f"Blended OOF SMAPE: {sm_blend:.4f}")

    # Save blended
    pd.DataFrame({
        "sample_id": test_df.sample_id,
        "price": np.clip(test_blend, 0, None)
    }).to_csv(OUT_DIR_V2/"test_preds_v2"/"blended_v2.csv", index=False)

    print(f"✅ Saved blended to: {OUT_DIR_V2}/test_preds_v2/blended_v2.csv")

# Closing banner: where the outputs live and how this run differs from the
# full pipeline. All lines are emitted through a single print call.
print(
    "\n" + "="*60,
    "🎉 FAST V2 PIPELINE COMPLETE!",
    "="*60,
    f"\nAll outputs saved to: {OUT_DIR_V2}",
    "\nKey differences from full pipeline:",
    "  ✓ 3 folds instead of 5 (40% faster)",
    "  ✓ Smaller model (256 hidden vs 512-768)",
    "  ✓ No TTA on images",
    "  ✓ Fewer features",
    "  ✓ Faster text encoder (MiniLM-L6)",
    "  ✓ Only 15 epochs with early stopping",
    "="*60,
    sep="\n",
)