# Validation Predictions & Weight Tuning

In [15]:
# ============================================================
# 📦 ENSEMBLE VALIDATION (MULTITHREADED, RESUMABLE & MEMORY-SAFE)
# ============================================================
import os
import sys
import json
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from transformers import CLIPProcessor, CLIPModel
import joblib
from concurrent.futures import ThreadPoolExecutor
from PIL import Image
from scipy.sparse import csr_matrix

# ============================================================
# ⚙️ CONFIG / PATHS
# ============================================================
# Root folder on Google Drive; every other path below is derived from it.
WORK_DIR = '/content/drive/MyDrive/Amazon_ML_Challenge/student_resource'

# Sub-directories: cached text artefacts, product images, model checkpoints.
TEXT_CACHE_DIR, IMAGE_DIR, MODEL_DIR = (
    os.path.join(WORK_DIR, sub_dir) for sub_dir in ("cache", "images", "models")
)

# Training table (read later for the `sample_id` column).
TRAIN_CSV = os.path.join(WORK_DIR, "dataset", "train.csv")

# Files that make the image-prediction step resumable across sessions.
VAL_PROGRESS_FILE = os.path.join(WORK_DIR, "ensemble_val_progress.json")
IMAGE_VAL_PRED_FILE = os.path.join(WORK_DIR, "image_val_preds.npy")

# Run on the GPU when one is visible to torch, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# ============================================================
# 1️⃣ LOAD TEXT DATA & MODEL
# ============================================================
# Rebuild the sparse training matrix from its CSR components stored in the
# .npz archive. The archive is opened in a `with` block so its file handle is
# released as soon as the arrays have been read.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays; only
# load archives that were produced by this project.
with np.load(os.path.join(TEXT_CACHE_DIR, "X_train_step3.npz"), allow_pickle=True) as npz_data:
    X_train_text = csr_matrix(
        (npz_data['data'], npz_data['indices'], npz_data['indptr']),
        shape=tuple(npz_data['shape'])
    )
y_train = np.load(os.path.join(TEXT_CACHE_DIR, "y_train_step3.npy"))

# Explicit check instead of `assert`, which is removed when Python runs with -O.
if X_train_text.shape[0] != len(y_train):
    raise ValueError(
        f"Feature/target row mismatch: X_train_text has {X_train_text.shape[0]} rows, "
        f"y_train has {len(y_train)}"
    )
print("✅ X_train_text shape:", X_train_text.shape)
print("✅ y_train shape:", y_train.shape)

# NOTE(review): joblib.load unpickles the file; only load model files you trust.
text_model_path = os.path.join(TEXT_CACHE_DIR, "best_model_lightgbm.pkl")
text_model = joblib.load(text_model_path)

# Split by row position so that the text features, the targets and the image
# sample ids all come from ONE split. Drawing the image ids with a separate
# `DataFrame.sample` call only lines up with `train_test_split` if both
# libraries happen to select the same rows for the same seed.
row_positions = np.arange(X_train_text.shape[0])
X_tr, X_val_text, y_tr, y_val, train_rows, val_rows = train_test_split(
    X_train_text, y_train, row_positions, test_size=0.1, random_state=42
)
text_val_preds = text_model.predict(X_val_text)

# ============================================================
# 2️⃣ LOAD & PREPROCESS VALIDATION IMAGES (BATCHED, RESUMABLE)
# ============================================================
train_df = pd.read_csv(TRAIN_CSV)
if len(train_df) != X_train_text.shape[0]:
    raise ValueError(
        f"train.csv has {len(train_df)} rows but the cached text features have "
        f"{X_train_text.shape[0]}; cannot map validation rows to sample ids"
    )

# NOTE(review): assumes row k of the cached feature matrix was built from row k
# of train.csv — confirm against the feature-extraction notebook.
# NOTE(review): if this changes which ids are selected, delete the cached
# image validation predictions/progress files so they are regenerated.
val_ids = train_df["sample_id"].values[val_rows]
val_img_paths = [os.path.join(IMAGE_DIR, f"{sid}.jpg") for sid in val_ids]

# Pretrained CLIP checkpoint shared by the preprocessor and the encoder.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

# Preprocessor that turns PIL images into `pixel_values` tensors.
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT, use_fast=True)

# Encoder used further down to extract image features; moved to the selected
# device and switched to evaluation mode.
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_model = clip_model.to(device)
clip_model.eval()

def load_image(path):
    """Load the image at *path* as an RGB PIL image.

    Args:
        path: Filesystem path of the image file.

    Returns:
        The decoded RGB image, or ``None`` when the file is missing or cannot
        be opened/decoded. Callers are expected to filter out the ``None``
        entries.
    """
    try:
        # `convert` decodes the pixels into a new image object, so the source
        # file can be closed as soon as the `with` block exits.
        with Image.open(path) as img:
            return img.convert("RGB")
    except Exception:
        # Best-effort loader: any failure to read one file is reported as None.
        # `Exception` (not a bare `except`) keeps Ctrl-C / SystemExit working.
        return None

# Multithreaded image loading (executor.map preserves the order of val_img_paths)
with ThreadPoolExecutor(max_workers=8) as executor:
    val_images = list(tqdm(executor.map(load_image, val_img_paths),
                           total=len(val_img_paths),
                           desc="🖼️ Loading validation images"))

# Filter out failed images
valid_idx = [i for i, img in enumerate(val_images) if img is not None]
n_failed = len(val_images) - len(valid_idx)
if n_failed:
    print(f"⚠️ {n_failed} validation images could not be loaded and are skipped")
val_images = [val_images[i] for i in valid_idx]

# Drop the same positions from everything that is compared row-by-row with the
# image predictions later on; otherwise every row after the first failed image
# would be paired with the wrong target / text prediction.
# NOTE(review): assumes y_val and text_val_preds are in the same order as
# val_img_paths — confirm.
keep_positions = np.asarray(valid_idx, dtype=np.intp)
val_ids = val_ids[keep_positions]
text_val_preds = np.asarray(text_val_preds)[keep_positions]
y_val = np.asarray(y_val)[keep_positions]

# ============================================================
# 3️⃣ LOAD IMAGE MODEL (PriceRegressor)
# ============================================================
# The regressor class lives in a module inside WORK_DIR, so that folder has to
# be on the import path before the import below.
sys.path.append(WORK_DIR)
from price_regressor import PriceRegressor


def _checkpoint_mtime(file_name):
    """Return the modification time of a checkpoint file inside MODEL_DIR."""
    return os.path.getmtime(os.path.join(MODEL_DIR, file_name))


# Candidate checkpoints are all *.pt files in MODEL_DIR.
model_ckpts = [entry for entry in os.listdir(MODEL_DIR) if entry.endswith(".pt")]
if len(model_ckpts) == 0:
    raise FileNotFoundError(f"No .pt model checkpoint found in {MODEL_DIR}")

# Use the most recently modified checkpoint.
latest_ckpt = max(model_ckpts, key=_checkpoint_mtime)
ckpt_path = os.path.join(MODEL_DIR, latest_ckpt)

image_model = PriceRegressor().to(device)
ckpt = torch.load(ckpt_path, map_location=device)

# A checkpoint is either a dict wrapping the weights under "model_state_dict"
# or the state dict itself.
wrapped = isinstance(ckpt, dict) and "model_state_dict" in ckpt
image_model.load_state_dict(ckpt["model_state_dict"] if wrapped else ckpt)
image_model.eval()
print("✅ Loaded image model checkpoint:", ckpt_path)

# ============================================================
# 4️⃣ LOAD OR INITIALIZE PROGRESS
# ============================================================
# Progress record: {"last_index": <next image index>, "image_val_done": <bool>}
if os.path.exists(VAL_PROGRESS_FILE):
    with open(VAL_PROGRESS_FILE, "r") as f:
        progress = json.load(f)
else:
    progress = {"last_index": 0, "image_val_done": False}

# Predictions written by a previous (possibly interrupted) run, if any.
if os.path.exists(IMAGE_VAL_PRED_FILE):
    image_val_preds = list(np.load(IMAGE_VAL_PRED_FILE))
else:
    image_val_preds = []

# ============================================================
# 5️⃣ GENERATE IMAGE VALIDATION PREDICTIONS (RESUMABLE)
# ============================================================
BATCH_SIZE_PROC = 64

# Trust the "done" flag only when the cache really holds one prediction per
# loaded validation image; otherwise start over rather than feed a missing or
# mismatched cache into the weight tuning.
flagged_done = bool(progress.get("image_val_done", False))
cache_complete = flagged_done and len(image_val_preds) == len(val_images)
if flagged_done and not cache_complete:
    print("⚠️ Cached image validation predictions are missing or have the wrong length; regenerating.")
    progress = {"last_index": 0, "image_val_done": False}
    image_val_preds = []

if not cache_complete:
    # The prediction file and the progress file are written separately, so they
    # can disagree after a crash or a deleted file. Resume from the point both
    # agree on and drop anything saved beyond it, so no batch is duplicated or
    # skipped.
    start_idx = max(0, min(int(progress.get("last_index", 0)),
                           len(image_val_preds),
                           len(val_images)))
    image_val_preds = image_val_preds[:start_idx]
    print(f"🔹 Resuming from index {start_idx}...")

    with torch.no_grad():
        for i in tqdm(range(start_idx, len(val_images), BATCH_SIZE_PROC), desc="🎯 Image Val Predictions"):
            batch_imgs = val_images[i:i+BATCH_SIZE_PROC]
            inputs = processor(images=batch_imgs, return_tensors="pt", padding=True)
            batch_pixels = inputs["pixel_values"].to(device)

            # --- Extract CLIP embeddings first ---
            batch_embeds = clip_model.get_image_features(pixel_values=batch_pixels)

            # Feed embeddings to PriceRegressor
            preds = image_model(batch_embeds).cpu().numpy().flatten()
            image_val_preds.extend(preds)

            # Save partial predictions & progress
            np.save(IMAGE_VAL_PRED_FILE, np.array(image_val_preds))
            progress["last_index"] = min(i + BATCH_SIZE_PROC, len(val_images))
            with open(VAL_PROGRESS_FILE, "w") as f:
                json.dump(progress, f)

    progress["image_val_done"] = True
    with open(VAL_PROGRESS_FILE, "w") as f:
        json.dump(progress, f)
else:
    print("🔹 Loading cached image validation predictions...")
    start_idx = len(image_val_preds)
    image_val_preds = np.array(image_val_preds)

# ============================================================
# 6️⃣ ENSEMBLE WEIGHT TUNING (SMAPE)
# ============================================================
def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, in percent.

    Computes ``100 * mean(2 * |y_pred - y_true| / (|y_true| + |y_pred| + 1e-8))``.
    The small constant keeps the ratio defined when both values are zero.

    Args:
        y_true: Array-like of reference values.
        y_pred: Array-like of predictions with the same shape as ``y_true``.

    Returns:
        The SMAPE as a float (0 for a perfect prediction).

    Raises:
        ValueError: If the inputs are empty or their shapes differ. A shape
            mismatch such as ``(n,)`` vs ``(n, 1)`` would otherwise broadcast
            silently and give a meaningless score.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    if y_true.shape != y_pred.shape:
        raise ValueError(
            f"smape: shape mismatch, y_true {y_true.shape} vs y_pred {y_pred.shape}"
        )
    if y_true.size == 0:
        raise ValueError("smape: at least one sample is required")

    denominator = np.abs(y_true) + np.abs(y_pred) + 1e-8
    return float(100.0 * np.mean(2.0 * np.abs(y_pred - y_true) / denominator))

# Convert to numpy array
image_val_preds = np.array(image_val_preds)

# The two prediction vectors are compared position by position, so a length
# difference means some rows are not paired with their own target. Report it
# instead of truncating silently.
if len(text_val_preds) != len(image_val_preds):
    print(
        f"⚠️ Length mismatch: {len(text_val_preds)} text vs {len(image_val_preds)} image "
        "validation predictions; only the common prefix is used and rows may be misaligned."
    )
n_val = min(len(text_val_preds), len(image_val_preds))
if n_val == 0:
    raise ValueError("No validation predictions available for ensemble weight tuning.")

# Now ensemble tuning: grid-search the text weight over 0.00, 0.05, ..., 1.00.
# Weights are built as step/20 so they are free of accumulated float drift.
N_WEIGHT_STEPS = 20
best_smape = float("inf")
best_weight = 0.5

for step in range(N_WEIGHT_STEPS + 1):
    w = step / N_WEIGHT_STEPS
    val_preds = w * text_val_preds[:n_val] + (1 - w) * image_val_preds[:n_val]
    score = smape(y_val[:n_val], val_preds)
    if score < best_smape:
        best_smape = float(score)
        best_weight = w

print(f"✅ Best Weight → Text: {best_weight:.2f}, Image: {1-best_weight:.2f}")
print(f"📊 SMAPE: {best_smape:.4f}")


# Persist the weights; rounding removes float noise such as 0.30000000000000004.
with open(os.path.join(WORK_DIR, "best_ensemble_weight.json"), "w") as f:
    json.dump({"text": round(best_weight, 2), "image": round(1 - best_weight, 2)}, f)


Using device: cuda
✅ X_train_text shape: (75000, 2390)
✅ y_train shape: (75000,)


🖼️ Loading validation images: 100%|██████████| 7500/7500 [00:18<00:00, 414.99it/s]


✅ Loaded image model checkpoint: /content/drive/MyDrive/Amazon_ML_Challenge/student_resource/models/model_epoch_20.pt
🔹 Loading cached image validation predictions...
✅ Best Weight → Text: 1.00, Image: 0.00
📊 SMAPE: 21.0106


# Final Test Predictions & Submission

In [19]:
# Notebook 2: final_submission_multithread.ipynb
import os
import json
import torch
import numpy as np
import pandas as pd
from tqdm import tqdm
from torch.utils.data import Dataset, DataLoader
import joblib
from concurrent.futures import ThreadPoolExecutor

# ------------------- #
# CONFIG / PATHS
# Project root on Google Drive.
WORK_DIR = '/content/drive/MyDrive/Amazon_ML_Challenge/student_resource'


def _in_work_dir(*parts):
    """Return the path obtained by joining *parts* below WORK_DIR."""
    return os.path.join(WORK_DIR, *parts)


# Input locations.
TEXT_CACHE_DIR = _in_work_dir("cache")
IMAGE_DIR = _in_work_dir("images")
TRAIN_CSV = _in_work_dir("dataset", "train.csv")
TEST_CSV = _in_work_dir("dataset", "test.csv")
WEIGHT_FILE = _in_work_dir("best_ensemble_weight.json")

# Output / cache locations.
SUBMISSION_PATH = _in_work_dir("submission_final.csv")
IMAGE_TEST_PRED_FILE = _in_work_dir("image_test_preds.npy")
PROGRESS_FILE = _in_work_dir("ensemble_test_progress.json")
VAL_PROGRESS_FILE = _in_work_dir("ensemble_val_progress.json")
IMAGE_VAL_PRED_FILE = _in_work_dir("image_val_preds.npy")


# ------------------- #
# DEVICE
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# ------------------- #
# 1️⃣ Load best ensemble weight
# Read the weight file and unpack its "text" and "image" entries.
with open(WEIGHT_FILE, "r") as weight_fh:
    weight_data = json.load(weight_fh)
TEXT_WEIGHT, IMAGE_WEIGHT = weight_data["text"], weight_data["image"]

# ------------------- #
# 2️⃣ Load text test predictions
# This cell's import list does not provide csr_matrix, so import it here; the
# cell then also runs in a fresh session.
from scipy.sparse import csr_matrix

# Rebuild the sparse test matrix from its CSR components. The archive is opened
# in a `with` block so its file handle is released after the arrays are read.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays; only
# load archives that were produced by this project.
with np.load(os.path.join(TEXT_CACHE_DIR, "X_test_step3.npz"), allow_pickle=True) as npz_data:
    X_test_text = csr_matrix((npz_data['data'], npz_data['indices'], npz_data['indptr']),
                             shape=tuple(npz_data['shape']))

# NOTE(review): joblib.load unpickles the file; only load model files you trust.
text_model_path = os.path.join(TEXT_CACHE_DIR, "best_model_lightgbm.pkl")
text_model = joblib.load(text_model_path)
text_test_preds = text_model.predict(X_test_text)

# ------------------- #
# 3️⃣ Load image test embeddings using multithreading
def load_npz_file(path):
    """Read one embedding archive.

    Args:
        path: Path of an ``.npz`` archive containing the arrays ``"emb"`` and
            ``"ids"``.

    Returns:
        ``(emb, ids)`` as read from the archive, or ``(None, None)`` when the
        file cannot be read; the failure is printed, not raised, so one bad
        file does not abort a bulk load.
    """
    try:
        # The context manager closes the archive's file handle once both
        # arrays have been read into memory.
        with np.load(path) as data:
            return data["emb"], data["ids"]
    except Exception as e:
        print(f"⚠️ Failed to load {path}: {e}")
        return None, None

IMAGE_EMB_DIR = os.path.join(WORK_DIR, "test_embeddings")  # adjust path

# All .npz archives in the embedding folder, in sorted (deterministic) order.
image_emb_files = sorted(
    os.path.join(IMAGE_EMB_DIR, file_name)
    for file_name in os.listdir(IMAGE_EMB_DIR)
    if file_name.endswith(".npz")
)

all_embs = []
all_ids = []

# Read the archives on a thread pool; executor.map yields results in the same
# order as image_emb_files.
with ThreadPoolExecutor(max_workers=8) as executor:
    emb_results = executor.map(load_npz_file, image_emb_files)
    for emb, ids in tqdm(emb_results, total=len(image_emb_files), desc="Loading image embeddings"):
        # Archives that failed to load come back as (None, None) and are skipped.
        if emb is None or ids is None:
            continue
        all_embs.append(emb)
        all_ids.extend(ids)

# Stack the per-file embedding blocks row-wise into a single matrix.
image_embs = np.vstack(all_embs)

# Dataset & loader
class ImageEmbDataset(Dataset):
    """Dataset pairing each image embedding with its sample id.

    Args:
        embs: Array-like of embeddings, one row per sample; stored as a
            ``float32`` tensor.
        ids: Sequence of sample ids, one per row of ``embs``.

    Raises:
        ValueError: If ``embs`` and ``ids`` differ in length. Without this
            check the mismatch would only surface as an ``IndexError`` part-way
            through iteration.
    """

    def __init__(self, embs, ids):
        if len(embs) != len(ids):
            raise ValueError(
                f"embs and ids must have the same length, got {len(embs)} and {len(ids)}"
            )
        self.embs = torch.tensor(embs, dtype=torch.float32)
        self.ids = ids

    def __len__(self):
        """Return the number of samples."""
        return len(self.embs)

    def __getitem__(self, idx):
        """Return ``(embedding, sample_id)`` for position ``idx``."""
        return self.embs[idx], self.ids[idx]

# Wrap the stacked embeddings and their ids, then iterate them in their
# original order (no shuffling) in batches of 128.
image_dataset = ImageEmbDataset(embs=image_embs, ids=all_ids)
image_loader = DataLoader(dataset=image_dataset, shuffle=False, batch_size=128)

# ------------------- #
# 4️⃣ Load image model
# price_regressor.py lives in WORK_DIR; make sure that folder is importable
# even when this cell runs in a fresh session.
import sys
if WORK_DIR not in sys.path:
    sys.path.append(WORK_DIR)
from price_regressor import PriceRegressor


MODEL_DIR = os.path.join(WORK_DIR, "models")
model_ckpts = [f for f in os.listdir(MODEL_DIR) if f.endswith(".pt")]
if not model_ckpts:
    raise FileNotFoundError(f"No .pt model checkpoint found in {MODEL_DIR}")

# Pick the most recently modified checkpoint, the same rule the validation
# cell uses. A plain max() over file names compares them as strings, so
# "model_epoch_9.pt" would win over "model_epoch_20.pt" and the ensemble
# weight could end up applied to a different model than it was tuned for.
image_model_ckpt = max(
    model_ckpts, key=lambda name: os.path.getmtime(os.path.join(MODEL_DIR, name))
)
ckpt = torch.load(os.path.join(MODEL_DIR, image_model_ckpt), map_location=device)

image_model = PriceRegressor().to(device)
# A checkpoint is either a dict wrapping the weights under "model_state_dict"
# or the state dict itself.
if isinstance(ckpt, dict) and "model_state_dict" in ckpt:
    image_model.load_state_dict(ckpt["model_state_dict"])
else:
    image_model.load_state_dict(ckpt)
image_model.eval()

# ------------------- #
# 5️⃣ Resume or generate image test predictions (resumable & aligned)
# Progress record: {"image_test_done": <bool>}
if os.path.exists(PROGRESS_FILE):
    with open(PROGRESS_FILE, "r") as f:
        progress = json.load(f)
else:
    progress = {"image_test_done": False}

# Load previously saved predictions if exists
image_test_preds_dict = {}
if os.path.exists(IMAGE_TEST_PRED_FILE):
    try:
        # np.save stores a dict as a 0-d object array, hence allow_pickle.
        # NOTE(review): unpickling is only safe for files written by this notebook.
        loaded = np.load(IMAGE_TEST_PRED_FILE, allow_pickle=True)
        is_pickled_object = (
            isinstance(loaded, np.ndarray) and loaded.dtype == object and loaded.ndim == 0
        )
        cached = loaded.item() if is_pickled_object else None
        if isinstance(cached, dict):
            image_test_preds_dict = cached
        else:
            # Anything else (e.g. a plain array of predictions without ids)
            # cannot be aligned by sample_id and is discarded.
            print("⚠️ Warning: IMAGE_TEST_PRED_FILE is not a dict; predictions will be overwritten.")
            image_test_preds_dict = {}
    except Exception as e:
        print(f"⚠️ Failed to load {IMAGE_TEST_PRED_FILE}: {e}")
        image_test_preds_dict = {}

# Trust the "done" flag only when predictions were actually loaded. Otherwise
# every sample would be missing from the dict and silently receive the
# fallback value downstream.
flagged_done = bool(progress.get("image_test_done", False))
cache_usable = flagged_done and len(image_test_preds_dict) > 0
if flagged_done and not cache_usable:
    print("⚠️ Progress file says done but no cached image test predictions were loaded; regenerating.")

if not cache_usable:
    print("🔹 Generating image test predictions...")

    # Checkpoint periodically rather than after every batch: each save rewrites
    # the whole, growing dict.
    SAVE_EVERY_N_BATCHES = 50

    with torch.no_grad():
        for batch_no, (x, ids) in enumerate(tqdm(image_loader, desc="Image Test Predictions"), start=1):
            x = x.to(device)
            y_pred = image_model(x).cpu().numpy().flatten()
            for sid, pred in zip(ids, y_pred):
                image_test_preds_dict[int(sid)] = float(pred)

            if batch_no % SAVE_EVERY_N_BATCHES == 0:
                np.save(IMAGE_TEST_PRED_FILE, image_test_preds_dict)

    # Final save so the file always holds the complete result before the
    # progress flag is set.
    np.save(IMAGE_TEST_PRED_FILE, image_test_preds_dict)

    progress["image_test_done"] = True
    with open(PROGRESS_FILE, "w") as f:
        json.dump(progress, f)
else:
    print("🔹 Loading cached image test predictions...")

# ------------------- #
# 6️⃣ Weighted ensemble & save submission (aligned by sample_id)
test_df = pd.read_csv(TEST_CSV)

# Text predictions, one per row of test.csv.
# NOTE(review): assumes text_test_preds is in test.csv row order — confirm
# against the feature-extraction step.
text_preds_series = pd.Series(text_test_preds, index=test_df['sample_id'])
text_preds = text_preds_series.to_numpy(dtype=float)

# Image predictions aligned by sample_id; ids without a prediction become NaN.
image_preds = test_df['sample_id'].map(image_test_preds_dict).to_numpy(dtype=float)
missing_image = np.isnan(image_preds)
if missing_image.any():
    print(
        f"⚠️ {int(missing_image.sum())} test rows have no image prediction; "
        "using the text prediction alone for them."
    )

# Weighted ensemble. Rows without an image prediction fall back to the text
# prediction instead of blending with 0, which would pull them toward zero.
blended = TEXT_WEIGHT * text_preds + IMAGE_WEIGHT * np.where(missing_image, 0.0, image_preds)
final_preds = np.where(missing_image, text_preds, blended)

# Revert log1p if used during training
# NOTE(review): this is applied unconditionally; confirm both models were
# trained on log1p(price).
final_preds = np.expm1(final_preds)

# Save submission
submission = test_df[['sample_id']].copy()
submission['price'] = final_preds
submission.to_csv(SUBMISSION_PATH, index=False)
print(f"✅ Submission saved at {SUBMISSION_PATH}")


Using device: cuda


Loading image embeddings: 100%|██████████| 75/75 [00:02<00:00, 36.19it/s]


🔹 Loading cached image test predictions...
✅ Submission saved at /content/drive/MyDrive/Amazon_ML_Challenge/student_resource/submission_final.csv
