# In [None]:
# ============================================================================
# CELL 1: Setup and Imports
# ============================================================================
import numpy as np
import pandas as pd
import re
import os
import gc
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# ML Libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.linear_model import Ridge
import lightgbm as lgb
import catboost as cb
from scipy import sparse
from category_encoders import TargetEncoder

# Deep Learning
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import timm
from torchvision import transforms

# Image Processing
from PIL import Image
import requests
from io import BytesIO
import multiprocessing
from functools import partial
from tqdm import tqdm
import urllib

# Report that the imports succeeded and which PyTorch / timm builds are in use,
# including whether a CUDA device is visible.
print("✅ All libraries imported!")
print(f"PyTorch: {torch.__version__} | CUDA: {torch.cuda.is_available()}")
print(f"TIMM version: {timm.__version__}")

# ============================================================================
# CELL 2: Configuration
# ============================================================================
class Config:
    """Central, class-level constants for the whole pipeline.

    Nothing here is instantiated; other cells read the attributes directly
    (e.g. ``Config.SEED``). The two ``*_PARAMS`` dicts are passed straight to
    ``cb.CatBoostRegressor`` and ``lgb.train`` respectively.
    """
    # Paths
    DATA_PATH = '/kaggle/input/amazon-smart-pricing-challenge-2025'
    IMAGE_FOLDER = '/kaggle/working/product_images'
    
    # Model parameters
    SEED = 42
    N_FOLDS = 5
    MAX_TEXT_LENGTH = 256  # token budget handed to the tokenizer for the text-only model
    IMAGE_SIZE = 300  # EfficientNet-B3 optimal size
    BATCH_SIZE = 32
    
    # Feature engineering
    TFIDF_MAX_FEATURES = 60000
    SVD_COMPONENTS = 150
    
    # Ensemble weights (optimized through CV)
    # The six weights below add up to 1.00.
    CATBOOST_WEIGHT = 0.28
    LGBM_WEIGHT = 0.22
    RIDGE_WEIGHT = 0.08
    NN_TEXT_WEIGHT = 0.15
    MULTIMODAL_WEIGHT = 0.20
    STACKING_WEIGHT = 0.07
    
    # CatBoost params (enhanced)
    # NOTE(review): 'od_type'/'od_wait' configure early stopping here, and the
    # CatBoost cell also passes early_stopping_rounds=150 to fit(); confirm the
    # two do not conflict.
    # NOTE(review): 'min_data_in_leaf' and 'bagging_temperature' may only be
    # accepted for some grow policies / bootstrap types — verify against the
    # CatBoost parameter reference for both the CPU and GPU task types.
    CATBOOST_PARAMS = {
        'iterations': 4000,
        'learning_rate': 0.025,
        'depth': 9,
        'l2_leaf_reg': 2.5,
        'min_data_in_leaf': 15,
        'random_strength': 0.4,
        'bagging_temperature': 0.25,
        'od_type': 'Iter',
        'od_wait': 150,
        'random_seed': SEED,
        'verbose': 300,
        # Evaluated once, when the class body runs at import time.
        'task_type': 'GPU' if torch.cuda.is_available() else 'CPU',
        'loss_function': 'RMSE',
        'eval_metric': 'MAE'
    }
    
    # LightGBM params (enhanced)
    LGBM_PARAMS = {
        'objective': 'regression',
        'metric': 'mae',
        'boosting_type': 'gbdt',
        'num_leaves': 150,
        'max_depth': 9,
        'learning_rate': 0.015,
        'feature_fraction': 0.75,
        'bagging_fraction': 0.75,
        'bagging_freq': 5,
        'min_child_samples': 15,
        'reg_alpha': 0.05,
        'reg_lambda': 0.05,
        'verbose': -1,
        'random_state': SEED
    }

def set_seed(seed=42):
    """Seed every random number generator the pipeline can draw from.

    Seeds Python's built-in ``random`` module, NumPy's global generator and
    PyTorch (CPU and every CUDA device).

    Args:
        seed: Integer seed shared by all generators.
    """
    # Local import: the file's import block does not bring `random` into scope.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    
set_seed(Config.SEED)

def smape(y_true, y_pred):
    """Return the symmetric mean absolute percentage error, as a percentage.

    Each element contributes ``|y_true - y_pred| / ((|y_true| + |y_pred|) / 2)``;
    pairs where both values are zero contribute 0 instead of a division by
    zero.

    Args:
        y_true: Array-like of actual values (list, ndarray, Series, ...).
        y_pred: Array-like of predictions, broadcastable against ``y_true``.
            Values are paired by position, not by pandas index.

    Returns:
        The mean of the per-element ratios multiplied by 100.
    """
    # Convert up front so lists and pandas objects behave like plain arrays.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    abs_error = np.abs(y_true - y_pred)

    # Divide only where the denominator is non-zero; the rest keeps the 0.0
    # written into `out`, so no divide-by-zero warning or NaN is produced.
    ratio = np.divide(
        abs_error,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100  # Return as percentage

print("✅ Configuration set!")

# ============================================================================
# CELL 3: Load Data
# ============================================================================
print("📊 Loading data...")
# Both CSVs are read from Config.DATA_PATH; the code below relies on a 'price'
# column being present in the training file.
train_df = pd.read_csv(f'{Config.DATA_PATH}/train.csv')
test_df = pd.read_csv(f'{Config.DATA_PATH}/test.csv')

print(f"Train: {train_df.shape} | Test: {test_df.shape}")
print(f"\n💰 Price stats:\n{train_df['price'].describe()}")

# Log transform target
# log1p here is undone with np.expm1 wherever predictions are scored.
train_df['log_price'] = np.log1p(train_df['price'])

# Create price bins for stratified sampling
# Ten quantile bins; duplicates='drop' merges bins whose edges coincide.
# NOTE(review): the folds used in the visible cells come from a plain KFold,
# so this column looks unused here — confirm whether a later cell needs it.
train_df['price_bin'] = pd.qcut(train_df['price'], q=10, labels=False, duplicates='drop')

print("✅ Data loaded!")

# ============================================================================
# CELL 4: Advanced Feature Engineering with Interaction Features
# ============================================================================
print("🔧 Extracting ADVANCED features...")

def extract_features(df):
    """Derive hand-crafted numeric, categorical and flag features from text.

    Every feature is computed from ``df['catalog_content']``. Missing
    descriptions are treated as empty strings, so the function never fails on
    NaN rows.

    Args:
        df: DataFrame with a ``catalog_content`` column.

    Returns:
        A DataFrame sharing ``df``'s index with text statistics, the parsed
        ``Value:`` number, pack quantity, the string-valued columns
        ``unit_type``, ``brand``, ``category``, ``quality_tier`` and
        ``pack_size_category``, plus binary flags and keyword counts.
    """
    # Normalise once so each helper below can assume a real ``str``.
    content = df['catalog_content'].fillna('').astype(str)
    features = pd.DataFrame(index=df.index)

    # === NUMERICAL FEATURES ===
    features['text_length'] = content.str.len()
    features['word_count'] = content.str.split().str.len()
    # "+ 1" in the denominators guards against division by zero on empty text.
    features['avg_word_length'] = features['text_length'] / (features['word_count'] + 1)
    features['capital_ratio'] = content.apply(
        lambda text: sum(ch.isupper() for ch in text) / (len(text) + 1)
    )
    features['digit_count'] = content.str.count(r'\d')
    features['special_char_count'] = content.str.count(r'[!@#$%^&*(),.?":{}|<>]')
    features['sentence_count'] = content.str.count(r'\.')
    features['exclamation_count'] = content.str.count(r'!')
    features['question_count'] = content.str.count(r'\?')

    # Extract value
    value_pattern = re.compile(r'Value:\s*(\d+\.?\d*)')

    def extract_value(text):
        match = value_pattern.search(text)
        return float(match.group(1)) if match else 0

    features['extracted_value'] = content.apply(extract_value)
    features['log_value'] = np.log1p(features['extracted_value'])

    # Extract quantity/pack: the first pattern (in this order) that matches wins.
    quantity_patterns = [
        re.compile(pattern, re.IGNORECASE)
        for pattern in (
            r'Pack of (\d+)', r'(\d+)\s*Pack', r'(\d+)\s*Count',
            r'Set of (\d+)', r'\((\d+)\s*Pack\)', r'Quantity:\s*(\d+)',
            r'(\d+)\s*Pieces?', r'(\d+)\s*Units?',
        )
    ]

    def extract_quantity(text):
        for pattern in quantity_patterns:
            match = pattern.search(text)
            if match:
                return int(match.group(1))
        return 1

    features['pack_quantity'] = content.apply(extract_quantity)
    features['log_pack_quantity'] = np.log1p(features['pack_quantity'])
    features['sqrt_pack_quantity'] = np.sqrt(features['pack_quantity'])

    # Value per pack
    features['value_per_pack'] = features['extracted_value'] / (features['pack_quantity'] + 1)
    features['log_value_per_pack'] = np.log1p(features['value_per_pack'])

    # === CATEGORICAL FEATURES ===

    # Unit type
    # Units are matched as standalone tokens rather than raw substrings:
    # substring matching made 'oz' fire on "frozen" and the single letters
    # 'g' / 'l' fire on practically every description.
    unit_names = {
        'fluid ounce': 'fluid_ounce', 'fl oz': 'fluid_ounce',
        'ounce': 'ounce', 'oz': 'ounce',
        'pound': 'pound', 'lbs': 'pound', 'lb': 'pound',
        'kilogram': 'kilogram', 'kg': 'kilogram',
        'milliliter': 'milliliter', 'ml': 'milliliter',
        'gram': 'gram', 'liter': 'liter',
        'count': 'count', 'ct': 'count', 'piece': 'piece',
    }
    # Single-letter units are only trusted directly after a number ("500g", "2 l").
    short_unit_names = {'g': 'gram', 'l': 'liter'}
    # Longest alias first so "fl oz" is preferred over the "oz" inside it.
    word_alternation = '|'.join(
        re.escape(alias) for alias in sorted(unit_names, key=len, reverse=True)
    )
    unit_pattern = re.compile(
        r'(?<![a-z])(' + word_alternation + r')s?(?![a-z])'
        r'|(?<=\d)\s?([gl])(?![a-z])'
    )

    def extract_unit(text):
        match = unit_pattern.search(text.lower())
        if match is None:
            return 'unknown'
        if match.group(1):
            return unit_names[match.group(1)]
        return short_unit_names[match.group(2)]

    features['unit_type'] = content.apply(extract_unit)

    # Brand: first capitalised alphanumeric token after "Item Name:".
    brand_pattern = re.compile(r'Item Name:\s*([A-Z][a-zA-Z0-9]+)')

    def extract_brand(text):
        match = brand_pattern.search(text)
        return match.group(1) if match else 'unknown'

    features['brand'] = content.apply(extract_brand)

    # Category: first category (in this order) with any keyword present.
    # NOTE(review): keywords are matched as substrings, so e.g. 'tea' also
    # hits "steak" and 'pet' hits "carpet" — confirm this is acceptable.
    categories = {
        'food': ['food', 'sauce', 'spice', 'cookie', 'snack', 'candy', 'chocolate', 'coffee', 'tea'],
        'health': ['vitamin', 'supplement', 'health', 'protein', 'probiotic'],
        'beauty': ['beauty', 'cream', 'shampoo', 'lotion', 'soap', 'skin'],
        'tools': ['tool', 'equipment', 'device', 'gadget'],
        'household': ['cleaner', 'detergent', 'laundry', 'paper', 'towel'],
        'pet': ['pet', 'dog', 'cat', 'animal'],
    }

    def categorize_product(text):
        text_lower = text.lower()
        for cat, keywords in categories.items():
            if any(word in text_lower for word in keywords):
                return cat
        return 'other'

    features['category'] = content.apply(categorize_product)

    # Quality tier: tiers are checked in priority order premium > organic > economy.
    # NOTE(review): 'value' is an economy keyword and is matched as a
    # substring, so a literal "Value:" field label also triggers 'economy'.
    quality_tiers = (
        ('premium', ('premium', 'luxury', 'gourmet', 'artisan', 'finest')),
        ('organic', ('organic', 'natural', 'pure', 'fresh')),
        ('economy', ('value', 'economy', 'basic', 'budget')),
    )

    def get_quality_tier(text):
        text_lower = text.lower()
        for tier, keywords in quality_tiers:
            if any(word in text_lower for word in keywords):
                return tier
        return 'standard'

    features['quality_tier'] = content.apply(get_quality_tier)

    # Pack size category
    def get_pack_size_category(qty):
        if qty == 1:
            return 'single'
        if qty <= 3:
            return 'small_pack'
        if qty <= 6:
            return 'medium_pack'
        if qty <= 12:
            return 'large_pack'
        return 'bulk'

    features['pack_size_category'] = features['pack_quantity'].apply(get_pack_size_category)

    # Binary flags
    features['has_value'] = content.str.contains('Value:', case=False).astype(int)
    features['has_unit'] = content.str.contains('Unit:', case=False).astype(int)
    features['has_brand'] = content.str.contains(r'\b[A-Z][a-z]+\b').astype(int)
    features['premium_keywords'] = content.str.count(
        r'(?i)(premium|organic|natural|gourmet|artisan|luxury|handmade|finest)'
    )
    features['sale_keywords'] = content.str.count(
        r'(?i)(save|discount|deal|offer|promo)'
    )

    return features

# Extract features
# Build the hand-crafted feature frames for both splits with the same function
# so their columns line up.
print("Extracting train features...")
train_features = extract_features(train_df)
print("Extracting test features...")
test_features = extract_features(test_df)

# String-valued columns produced by extract_features; later cells target-encode
# them and hand their positions to CatBoost as categorical features.
categorical_features = ['unit_type', 'brand', 'category', 'quality_tier', 'pack_size_category']

print(f"✅ Extracted {train_features.shape[1]} features")

# ============================================================================
# CELL 5: Interaction Features (KEY for pricing!)
# ============================================================================
print("🔗 Creating interaction features...")

def create_interactions(features):
    """Build interaction features from the output of ``extract_features``.

    Categorical columns are turned into integer codes from fixed vocabularies
    (alphabetical order). Fitting a fresh ``LabelEncoder`` per call gave the
    same label different codes in the train and test frames whenever the two
    frames did not contain exactly the same set of labels.

    Args:
        features: DataFrame with ``pack_quantity``, ``extracted_value``,
            ``word_count``, ``premium_keywords``, ``unit_type``,
            ``quality_tier`` and ``category`` columns.

    Returns:
        A DataFrame of interaction columns sharing ``features``' index.
    """
    unit_levels = (
        'count', 'fluid_ounce', 'gram', 'kilogram', 'liter',
        'milliliter', 'ounce', 'piece', 'pound', 'unknown',
    )
    quality_levels = ('economy', 'organic', 'premium', 'standard')
    category_levels = ('beauty', 'food', 'health', 'household', 'other', 'pet', 'tools')

    def encode(labels, levels):
        # Labels outside the vocabulary share one extra code after the known ones.
        codes = {label: code for code, label in enumerate(levels)}
        return labels.map(codes).fillna(len(levels)).astype(int)

    inter = pd.DataFrame(index=features.index)

    # Pack quantity * value interactions
    inter['pack_value_interaction'] = features['pack_quantity'] * features['extracted_value']
    inter['log_pack_value'] = np.log1p(inter['pack_value_interaction'])

    # Unit type * pack quantity (encoded)
    inter['unit_pack_interaction'] = (
        encode(features['unit_type'], unit_levels) * features['pack_quantity']
    )

    # Quality * value
    inter['quality_value_interaction'] = (
        encode(features['quality_tier'], quality_levels) * features['extracted_value']
    )

    # Category * pack
    inter['category_pack_interaction'] = (
        encode(features['category'], category_levels) * features['pack_quantity']
    )

    # Value per word ("+ 1" avoids dividing by zero for empty descriptions)
    inter['value_per_word'] = features['extracted_value'] / (features['word_count'] + 1)

    # Premium score: keyword hits weighted 2, premium tier +3, organic tier +2
    inter['premium_score'] = (
        features['premium_keywords'] * 2 +
        (features['quality_tier'] == 'premium').astype(int) * 3 +
        (features['quality_tier'] == 'organic').astype(int) * 2
    )

    return inter

# Interaction columns are computed per split and then appended column-wise;
# this relies on each interaction frame sharing its source frame's index.
train_interactions = create_interactions(train_features)
test_interactions = create_interactions(test_features)

# Combine
train_features = pd.concat([train_features, train_interactions], axis=1)
test_features = pd.concat([test_features, test_interactions], axis=1)

print(f"✅ Total features: {train_features.shape[1]}")

# ============================================================================
# CELL 6: Target Encoding with CV (Prevents Overfitting!)
# ============================================================================
print("🎯 Applying target encoding...")

# This splitter is reused by every model cell below, so the folds are
# identical across models (fixed seed, shuffle=True).
kf = KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.SEED)

# Target encode categorical features
target_encoded_features = []

for cat_col in categorical_features:
    te = TargetEncoder(smoothing=1.0)
    train_encoded = np.zeros(len(train_df))
    
    # CV encoding for train
    # Each validation row is encoded by an encoder fitted on the other folds
    # only, so a row's own log_price never feeds its own encoding.
    for fold, (train_idx, val_idx) in enumerate(kf.split(train_df)):
        # The column is passed as an (n, 1) array because the encoder is
        # given a single feature at a time.
        te.fit(train_features[cat_col].iloc[train_idx].values.reshape(-1, 1), 
               train_df['log_price'].iloc[train_idx])
        train_encoded[val_idx] = te.transform(
            train_features[cat_col].iloc[val_idx].values.reshape(-1, 1)
        ).ravel()
    
    # Fit on full train for test
    te.fit(train_features[cat_col].values.reshape(-1, 1), train_df['log_price'])
    test_encoded = te.transform(test_features[cat_col].values.reshape(-1, 1)).ravel()
    
    # The raw string column is kept; the encoding is added as '<col>_target_enc'.
    train_features[f'{cat_col}_target_enc'] = train_encoded
    test_features[f'{cat_col}_target_enc'] = test_encoded
    target_encoded_features.append(f'{cat_col}_target_enc')

print(f"✅ Added {len(target_encoded_features)} target-encoded features")

# ============================================================================
# CELL 7: TF-IDF + SVD
# ============================================================================
print("📝 Creating TF-IDF features...")

def preprocess_text(text):
    """Normalise raw text for vectorisation.

    Lowercases the input, replaces every character that is not an ASCII
    letter, digit or whitespace with a space, collapses whitespace runs into
    single spaces and trims both ends.
    """
    alnum_only = re.sub(r'[^a-zA-Z0-9\s]', ' ', text.lower())
    return re.sub(r'\s+', ' ', alnum_only).strip()

train_df['clean_text'] = train_df['catalog_content'].apply(preprocess_text)
test_df['clean_text'] = test_df['catalog_content'].apply(preprocess_text)

# TF-IDF
# The vocabulary is learned from the training text only and reused for test.
tfidf = TfidfVectorizer(
    max_features=Config.TFIDF_MAX_FEATURES,
    ngram_range=(1, 3),  # trigrams
    min_df=2,
    max_df=0.95,
    sublinear_tf=True
)

train_tfidf = tfidf.fit_transform(train_df['clean_text'])
test_tfidf = tfidf.transform(test_df['clean_text'])

# SVD
# Dense low-rank projection of the sparse TF-IDF matrix for the tree models;
# the sparse matrices themselves are kept for the Ridge model.
svd = TruncatedSVD(n_components=Config.SVD_COMPONENTS, random_state=Config.SEED)
train_svd = svd.fit_transform(train_tfidf)
test_svd = svd.transform(test_tfidf)

# Add to features
# Attach all components in a single concat instead of inserting the columns one
# at a time, which fragments the DataFrame. Any svd_* columns left over from a
# previous run of this cell are dropped first so re-running stays idempotent.
svd_columns = [f'svd_{i}' for i in range(Config.SVD_COMPONENTS)]
train_features = pd.concat(
    [
        train_features.drop(columns=svd_columns, errors='ignore'),
        pd.DataFrame(train_svd, columns=svd_columns, index=train_features.index),
    ],
    axis=1,
)
test_features = pd.concat(
    [
        test_features.drop(columns=svd_columns, errors='ignore'),
        pd.DataFrame(test_svd, columns=svd_columns, index=test_features.index),
    ],
    axis=1,
)

print(f"✅ TF-IDF: {train_tfidf.shape} → SVD: {train_svd.shape}")
print(f"✅ Total features now: {train_features.shape[1]}")

# ============================================================================
# CELL 8: Download Images
# ============================================================================
print("🖼️ Downloading images...")

def download_image(image_link, savefolder):
    """Download one image into ``savefolder`` on a best-effort basis.

    The file is stored under the last path component of ``image_link``.
    Links that are not strings (e.g. NaN) and files that already exist are
    skipped. A failed download never raises; any partially written file is
    removed so a later run does not mistake it for a finished download.

    Args:
        image_link: URL of the image, or a non-string placeholder.
        savefolder: Existing directory that receives the file.

    Returns:
        None.
    """
    # Import the submodule explicitly: a bare `import urllib` does not
    # guarantee that `urllib.request` is loaded.
    import urllib.request

    if not isinstance(image_link, str):
        return

    image_save_path = os.path.join(savefolder, Path(image_link).name)
    if os.path.exists(image_save_path):
        return

    try:
        urllib.request.urlretrieve(image_link, image_save_path)
    except Exception:
        # Deliberately best-effort: a missing image is handled downstream.
        # Unlike a bare `except`, this lets KeyboardInterrupt/SystemExit through.
        if os.path.exists(image_save_path):
            try:
                os.remove(image_save_path)
            except OSError:
                pass

if not os.path.exists(Config.IMAGE_FOLDER):
    os.makedirs(Config.IMAGE_FOLDER)

# Download ALL images (parallel)
# De-duplicate across train and test so each link is fetched at most once.
all_image_links = pd.concat([train_df['image_link'], test_df['image_link']]).unique()
print(f"Downloading {len(all_image_links)} unique images...")

# Bind the destination folder so the pool only has to pass the link.
download_partial = partial(download_image, savefolder=Config.IMAGE_FOLDER)
# NOTE(review): this starts 100 worker *processes* for what looks like
# network-bound work; a thread pool may be lighter — confirm before changing.
with multiprocessing.Pool(100) as pool:
    # list(...) drains the lazy imap iterator so tqdm can track progress.
    list(tqdm(pool.imap(download_partial, all_image_links), total=len(all_image_links)))

print("✅ Images downloaded!")

# ============================================================================
# CELL 9: Extract Image Features (EfficientNet-B3)
# ============================================================================
print("🎨 Extracting image features with EfficientNet-B3...")

# Load pretrained EfficientNet-B3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# NOTE(review): num_classes=0 presumably removes the classification head so
# the model returns pooled embeddings (timm convention) — confirm.
efficientnet = timm.create_model('efficientnet_b3', pretrained=True, num_classes=0)
efficientnet = efficientnet.to(device)
# Inference only: eval() switches off dropout / batch-norm updates.
efficientnet.eval()

# Image transforms
# Resize to a square of Config.IMAGE_SIZE pixels, convert to a tensor and
# normalise each channel with the mean/std constants below.
image_transform = transforms.Compose([
    transforms.Resize((Config.IMAGE_SIZE, Config.IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

def extract_image_features_batch(image_links, batch_size=64):
    """Extract EfficientNet features for a sequence of image links, in batches.

    Each link is resolved to a file in ``Config.IMAGE_FOLDER`` by its last
    path component. Links that are missing or not strings, and images that
    cannot be opened or decoded, are replaced by an all-zero input image so
    the output keeps one row per link.

    Args:
        image_links: Sequence of image URLs; entries may be NaN.
        batch_size: Number of images pushed through the network at once.

    Returns:
        A 2-D NumPy array with one feature row per link, in input order. An
        empty input yields an array of shape ``(0, 0)``.
    """
    if len(image_links) == 0:
        # np.vstack refuses an empty list, so answer the empty case directly.
        return np.empty((0, 0), dtype=np.float32)

    # Stand-in input for unreadable images; torch.stack copies it per use.
    placeholder = torch.zeros(3, Config.IMAGE_SIZE, Config.IMAGE_SIZE)
    all_features = []

    for start in tqdm(range(0, len(image_links), batch_size)):
        batch_images = []

        for link in image_links[start:start + batch_size]:
            try:
                # Path() is inside the try so a NaN link falls back to the
                # placeholder instead of raising.
                img_path = os.path.join(Config.IMAGE_FOLDER, Path(link).name)
                # The context manager closes the file handle once decoded.
                with Image.open(img_path) as img:
                    batch_images.append(image_transform(img.convert('RGB')))
            except Exception:
                # Best-effort fallback, but KeyboardInterrupt still propagates.
                batch_images.append(placeholder)

        batch_tensor = torch.stack(batch_images).to(device)

        with torch.no_grad():
            features = efficientnet(batch_tensor)
            all_features.append(features.cpu().numpy())

        del batch_tensor
        torch.cuda.empty_cache()

    return np.vstack(all_features)

print("Extracting train image features...")
train_image_features = extract_image_features_batch(train_df['image_link'].values)

print("Extracting test image features...")
test_image_features = extract_image_features_batch(test_df['image_link'].values)

# Add to features
# Attach every embedding column in one concat: inserting them one at a time
# fragments the DataFrame and is slow for this many columns. Columns left over
# from a previous run of this cell are dropped first so re-running is safe.
img_feature_cols = [f'img_feat_{i}' for i in range(train_image_features.shape[1])]
train_features = pd.concat(
    [
        train_features.drop(columns=img_feature_cols, errors='ignore'),
        pd.DataFrame(train_image_features, columns=img_feature_cols, index=train_features.index),
    ],
    axis=1,
)
test_features = pd.concat(
    [
        test_features.drop(columns=img_feature_cols, errors='ignore'),
        pd.DataFrame(test_image_features, columns=img_feature_cols, index=test_features.index),
    ],
    axis=1,
)

print(f"✅ Image features: {train_image_features.shape}")
print(f"✅ Total features: {train_features.shape[1]}")

gc.collect()

# ============================================================================
# CELL 10: Train CatBoost (Model 1)
# ============================================================================
print("\n🚀 Training CatBoost...")

# CatBoost is told which columns are categorical by position, so the indices
# are looked up from the current column order of train_features.
cat_feature_indices = [train_features.columns.get_loc(col) for col in categorical_features]

# Test predictions are averaged over folds; OOF predictions are filled in per
# validation fold. Both are on the log1p(price) scale.
catboost_preds = np.zeros(len(test_df))
oof_catboost = np.zeros(len(train_df))

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df), 1):
    print(f"\n{'='*60}\nFold {fold}/{Config.N_FOLDS}\n{'='*60}")
    
    X_tr, X_val = train_features.iloc[train_idx], train_features.iloc[val_idx]
    y_tr, y_val = train_df['log_price'].iloc[train_idx], train_df['log_price'].iloc[val_idx]
    
    train_pool = cb.Pool(X_tr, y_tr, cat_features=cat_feature_indices)
    val_pool = cb.Pool(X_val, y_val, cat_features=cat_feature_indices)
    
    model = cb.CatBoostRegressor(**Config.CATBOOST_PARAMS)
    # NOTE(review): early stopping is configured both here and through
    # 'od_type'/'od_wait' in Config.CATBOOST_PARAMS — confirm they agree.
    model.fit(train_pool, eval_set=val_pool, verbose=300, early_stopping_rounds=150)
    
    oof_catboost[val_idx] = model.predict(X_val)
    catboost_preds += model.predict(test_features) / Config.N_FOLDS
    
    # expm1 undoes the log1p target transform before scoring on real prices.
    fold_smape = smape(train_df['price'].iloc[val_idx], np.expm1(oof_catboost[val_idx]))
    print(f"Fold {fold} SMAPE: {fold_smape:.2f}")
    
    # Free the fold's model and pools before the next fold allocates its own.
    del model, train_pool, val_pool
    gc.collect()

oof_smape_cb = smape(train_df['price'], np.expm1(oof_catboost))
print(f"\n🏆 CatBoost OOF SMAPE: {oof_smape_cb:.2f}")

# ============================================================================
# CELL 11: Train LightGBM (Model 2)
# ============================================================================
print("\n🚀 Training LightGBM...")

# LightGBM is fed a plain float32 matrix, so the raw string categorical
# columns must be left out: casting them with astype(np.float32) raises
# ValueError. Their '<col>_target_enc' counterparts stay in as numeric features.
lgb_feature_cols = [col for col in train_features.columns if col not in categorical_features]
train_features_lgb = train_features[lgb_feature_cols].values.astype(np.float32)
test_features_lgb = test_features[lgb_feature_cols].values.astype(np.float32)

# Test predictions are averaged over folds; OOF predictions are filled in per
# validation fold. Both are on the log1p(price) scale.
lgbm_preds = np.zeros(len(test_df))
oof_lgbm = np.zeros(len(train_df))

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df), 1):
    print(f"\nFold {fold}/{Config.N_FOLDS}")
    
    X_tr, X_val = train_features_lgb[train_idx], train_features_lgb[val_idx]
    y_tr, y_val = train_df['log_price'].iloc[train_idx], train_df['log_price'].iloc[val_idx]
    
    train_data = lgb.Dataset(X_tr, label=y_tr)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    model = lgb.train(
        Config.LGBM_PARAMS,
        train_data,
        num_boost_round=3000,
        valid_sets=[train_data, val_data],
        valid_names=['train', 'valid'],
        callbacks=[lgb.early_stopping(200), lgb.log_evaluation(300)]
    )
    
    oof_lgbm[val_idx] = model.predict(X_val)
    lgbm_preds += model.predict(test_features_lgb) / Config.N_FOLDS
    
    # expm1 undoes the log1p target transform before scoring on real prices.
    fold_smape = smape(train_df['price'].iloc[val_idx], np.expm1(oof_lgbm[val_idx]))
    print(f"Fold {fold} SMAPE: {fold_smape:.2f}")
    
    del model
    gc.collect()

oof_smape_lgb = smape(train_df['price'], np.expm1(oof_lgbm))
print(f"\n🏆 LightGBM OOF SMAPE: {oof_smape_lgb:.2f}")

# ============================================================================
# CELL 12: Ridge Regression (Model 3 - Sparse Features)
# ============================================================================
print("\n🚀 Training Ridge Regression...")

# Use TF-IDF + numeric features
# format='csr' is requested explicitly: the fold loop below selects rows by
# index, which a COO result (what hstack may return otherwise) does not support.
ridge_numeric_cols = ['pack_quantity', 'extracted_value', 'log_pack_quantity']
ridge_train = sparse.hstack(
    [train_tfidf, sparse.csr_matrix(train_features[ridge_numeric_cols].values)],
    format='csr',
)
ridge_test = sparse.hstack(
    [test_tfidf, sparse.csr_matrix(test_features[ridge_numeric_cols].values)],
    format='csr',
)

# Test predictions are averaged over folds; OOF predictions are filled in per
# validation fold. Both are on the log1p(price) scale.
ridge_preds = np.zeros(len(test_df))
oof_ridge = np.zeros(len(train_df))

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df), 1):
    X_tr = ridge_train[train_idx]
    y_tr = train_df['log_price'].iloc[train_idx]
    X_val = ridge_train[val_idx]
    
    model = Ridge(alpha=10.0, random_state=Config.SEED)
    model.fit(X_tr, y_tr)
    
    oof_ridge[val_idx] = model.predict(X_val)
    ridge_preds += model.predict(ridge_test) / Config.N_FOLDS
    
    del model
    gc.collect()

# expm1 undoes the log1p target transform before scoring on real prices.
oof_smape_ridge = smape(train_df['price'], np.expm1(oof_ridge))
print(f"🏆 Ridge OOF SMAPE: {oof_smape_ridge:.2f}")

# ============================================================================
# CELL 13: Neural Network - Text Only (Model 4)
# ============================================================================
print("\n🧠 Training Neural Network...")

class TextDataset(Dataset):
    """Dataset that tokenizes one text per item, optionally with a target.

    ``texts`` and ``prices`` are indexed by position. When ``prices`` is
    ``None`` the returned items carry no ``'price'`` entry.
    """

    def __init__(self, texts, prices=None, tokenizer=None, max_length=256):
        self.texts, self.prices = texts, prices
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize on access, padded/truncated to exactly max_length tokens.
        tokens = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # flatten() drops the leading batch axis added by return_tensors='pt'.
        sample = {key: tokens[key].flatten() for key in ('input_ids', 'attention_mask')}
        if self.prices is None:
            return sample
        sample['price'] = torch.tensor(self.prices[idx], dtype=torch.float)
        return sample

class TextModel(nn.Module):
    """DistilBERT encoder followed by a small MLP regression head.

    The encoder's embedding parameters are frozen; all other weights train.
    ``forward`` returns one value per input row, shape ``(batch, 1)``.
    """

    def __init__(self):
        super().__init__()
        self.bert = AutoModel.from_pretrained('distilbert-base-uncased')
        # Freeze only the embedding layer.
        self.bert.embeddings.requires_grad_(False)

        head_layers = [
            nn.Dropout(0.3),
            nn.Linear(768, 256),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1),
        ]
        self.regressor = nn.Sequential(*head_layers)

    def forward(self, input_ids, attention_mask):
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The hidden state at the first token position summarises the sequence.
        first_token_state = encoded.last_hidden_state[:, 0]
        return self.regressor(first_token_state)

tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')

# Test predictions are averaged over folds; OOF predictions are filled in per
# validation fold. Both are on the log1p(price) scale.
nn_preds = np.zeros(len(test_df))
oof_nn = np.zeros(len(train_df))

# The test set is the same for every fold, so build its dataset/loader once.
test_dataset = TextDataset(test_df['catalog_content'].values, None, tokenizer, Config.MAX_TEXT_LENGTH)
test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=2)

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df), 1):
    print(f"\nNN Fold {fold}/{Config.N_FOLDS}")
    
    train_dataset = TextDataset(
        train_df['catalog_content'].iloc[train_idx].values,
        train_df['log_price'].iloc[train_idx].values,
        tokenizer, Config.MAX_TEXT_LENGTH
    )
    val_dataset = TextDataset(
        train_df['catalog_content'].iloc[val_idx].values,
        train_df['log_price'].iloc[val_idx].values,
        tokenizer, Config.MAX_TEXT_LENGTH
    )
    
    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=2)
    
    model = TextModel().to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    criterion = nn.HuberLoss()
    
    best_loss = float('inf')
    best_state = None  # weights of the best epoch so far, kept on the CPU
    patience_counter = 0
    
    for epoch in range(5):
        model.train()
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            prices = batch['price'].to(device)
            
            optimizer.zero_grad()
            # squeeze(-1) removes only the output axis; a bare squeeze() would
            # also collapse a final batch of size 1 into a 0-d tensor.
            outputs = model(input_ids, attention_mask).squeeze(-1)
            loss = criterion(outputs, prices)
            loss.backward()
            optimizer.step()
        
        model.eval()
        val_loss = 0
        val_preds = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                prices = batch['price'].to(device)
                outputs = model(input_ids, attention_mask).squeeze(-1)
                loss = criterion(outputs, prices)
                val_loss += loss.item()
                val_preds.extend(outputs.cpu().numpy())
        
        avg_val_loss = val_loss / len(val_loader)
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            patience_counter = 0
            oof_nn[val_idx] = val_preds
            # Snapshot the weights so the test predictions come from the same
            # epoch as the OOF predictions stored above.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            # Stop after two consecutive epochs without improvement.
            if patience_counter >= 2:
                break
    
    # Test predictions
    # Restore the best epoch first; otherwise the last (possibly worse) epoch
    # would be used.
    if best_state is not None:
        model.load_state_dict(best_state)
    
    model.eval()
    fold_test_preds = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            outputs = model(input_ids, attention_mask).squeeze(-1)
            fold_test_preds.extend(outputs.cpu().numpy())
    
    nn_preds += np.array(fold_test_preds) / Config.N_FOLDS
    
    del model, best_state, train_dataset, val_dataset
    torch.cuda.empty_cache()
    gc.collect()

# expm1 undoes the log1p target transform before scoring on real prices.
oof_smape_nn = smape(train_df['price'], np.expm1(oof_nn))
print(f"\n🏆 Neural Network OOF SMAPE: {oof_smape_nn:.2f}")

# ============================================================================
# CELL 14: Multimodal Model (Text + Image) - Model 5
# ============================================================================
print("\n🔥 Training Multimodal Model (Text + EfficientNet-B3)...")

class MultimodalDataset(Dataset):
    """Dataset pairing tokenized text with a precomputed image feature row.

    ``texts``, ``image_features`` and ``prices`` are indexed by position.
    When ``prices`` is ``None`` the returned items carry no ``'price'`` entry.
    """

    def __init__(self, texts, image_features, prices=None, tokenizer=None, max_length=128):
        self.texts, self.image_features = texts, image_features
        self.prices = prices
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Tokenize on access, padded/truncated to exactly max_length tokens.
        tokens = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        # flatten() drops the leading batch axis added by return_tensors='pt'.
        sample = {key: tokens[key].flatten() for key in ('input_ids', 'attention_mask')}
        sample['image_features'] = torch.tensor(self.image_features[idx], dtype=torch.float)
        if self.prices is None:
            return sample
        sample['price'] = torch.tensor(self.prices[idx], dtype=torch.float)
        return sample

class MultimodalModel(nn.Module):
    """Regressor that fuses a DistilBERT text vector with image features.

    The first-token hidden state of the text encoder is concatenated with a
    256-d projection of the image feature vector and passed through an MLP
    that emits one value per sample (output shape ``(batch, 1)``).

    Args:
        image_feat_dim: Width of the incoming image feature vectors
            (defaults to 1536, the EfficientNet-B3 output size).
    """

    def __init__(self, image_feat_dim=1536):  # EfficientNet-B3 output
        super().__init__()

        # Text branch: pretrained encoder with its embedding layer frozen.
        self.text_encoder = AutoModel.from_pretrained('distilbert-base-uncased')
        self.text_encoder.embeddings.requires_grad_(False)

        # Image branch: image_feat_dim -> 512 -> 256.
        image_layers = [
            nn.Linear(image_feat_dim, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256),
        ]
        self.image_proj = nn.Sequential(*image_layers)

        # Fusion head over [text | image]; the 768 assumes the encoder's
        # hidden size is 768.
        fusion_layers = [
            nn.Linear(768 + 256, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 1),
        ]
        self.fusion = nn.Sequential(*fusion_layers)

    def forward(self, input_ids, attention_mask, image_features):
        """Return the fused prediction for a batch of text/image pairs."""
        encoded = self.text_encoder(input_ids=input_ids, attention_mask=attention_mask)
        # Hidden state at sequence position 0 represents the whole text.
        first_token_state = encoded.last_hidden_state[:, 0]

        projected_image = self.image_proj(image_features)

        fused_input = torch.cat((first_token_state, projected_image), dim=1)
        return self.fusion(fused_input)

# K-fold training of the multimodal model.
#   oof_multimodal   - out-of-fold log-price predictions for every training row
#   multimodal_preds - test log-price predictions averaged over the folds
multimodal_preds = np.zeros(len(test_df))
oof_multimodal = np.zeros(len(train_df))

# Token window used by the multimodal model (shorter than Config.MAX_TEXT_LENGTH).
MM_MAX_LENGTH = 128

# The test set is identical for every fold, so build its dataset/loader once.
test_dataset = MultimodalDataset(
    test_df['catalog_content'].values, test_image_features, None, tokenizer, MM_MAX_LENGTH
)
test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=2)

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df), 1):
    print(f"\nMultimodal Fold {fold}/{Config.N_FOLDS}")

    train_dataset = MultimodalDataset(
        train_df['catalog_content'].iloc[train_idx].values,
        train_image_features[train_idx],
        train_df['log_price'].iloc[train_idx].values,
        tokenizer, MM_MAX_LENGTH
    )
    val_dataset = MultimodalDataset(
        train_df['catalog_content'].iloc[val_idx].values,
        train_image_features[val_idx],
        train_df['log_price'].iloc[val_idx].values,
        tokenizer, MM_MAX_LENGTH
    )

    train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=2)
    val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=2)

    model = MultimodalModel(image_feat_dim=train_image_features.shape[1]).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
    criterion = nn.HuberLoss()

    best_loss = float('inf')
    best_state = None  # CPU snapshot of the weights from the best validation epoch
    patience_counter = 0

    for epoch in range(4):
        model.train()
        for batch in train_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            image_features = batch['image_features'].to(device)
            prices = batch['price'].to(device)

            optimizer.zero_grad()
            # squeeze(-1) drops only the trailing output dimension. A bare
            # squeeze() would also drop the batch dimension when the last
            # batch holds a single sample, breaking the loss shapes and the
            # list.extend() calls further down.
            outputs = model(input_ids, attention_mask, image_features).squeeze(-1)
            loss = criterion(outputs, prices)
            loss.backward()
            optimizer.step()

        model.eval()
        val_loss = 0.0
        val_preds = []
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                image_features = batch['image_features'].to(device)
                prices = batch['price'].to(device)
                outputs = model(input_ids, attention_mask, image_features).squeeze(-1)
                val_loss += criterion(outputs, prices).item()
                val_preds.extend(outputs.cpu().numpy())

        avg_val_loss = val_loss / len(val_loader)
        if avg_val_loss < best_loss:
            best_loss = avg_val_loss
            patience_counter = 0
            oof_multimodal[val_idx] = val_preds
            # Snapshot the weights that produced these OOF predictions so the
            # test predictions below come from the same epoch.
            best_state = {
                name: tensor.detach().cpu().clone()
                for name, tensor in model.state_dict().items()
            }
        else:
            patience_counter += 1
            if patience_counter >= 2:
                break

    # Restore the best-epoch weights; without this the test predictions would
    # use the last (possibly worse) epoch while the OOF score uses the best one.
    if best_state is not None:
        model.load_state_dict(best_state)

    # Test predictions
    model.eval()
    fold_test_preds = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            image_features = batch['image_features'].to(device)
            outputs = model(input_ids, attention_mask, image_features).squeeze(-1)
            fold_test_preds.extend(outputs.cpu().numpy())

    multimodal_preds += np.array(fold_test_preds) / Config.N_FOLDS

    # Release per-fold objects before the next model is allocated.
    del model, optimizer, best_state, train_loader, val_loader, train_dataset, val_dataset
    torch.cuda.empty_cache()
    gc.collect()

# Predictions are in log1p space, so invert before scoring against raw prices.
oof_smape_mm = smape(train_df['price'], np.expm1(oof_multimodal))
print(f"\n🏆 Multimodal OOF SMAPE: {oof_smape_mm:.2f}")

# ============================================================================
# CELL 15: Pseudo-Labeling (Use test predictions to retrain)
# ============================================================================
print("\n🔄 Applying Pseudo-Labeling...")

# Blend the five base models into one log-price prediction per test row.
# The stacking model is not trained yet at this point, so its weight is not
# part of this blend. The remaining Config weights therefore need not sum to
# 1, and an un-normalised weighted sum would scale every pseudo-label in log
# space. Dividing by the total weight makes this a true weighted average.
pseudo_blend = [
    (Config.CATBOOST_WEIGHT, catboost_preds),
    (Config.LGBM_WEIGHT, lgbm_preds),
    (Config.RIDGE_WEIGHT, ridge_preds),
    (Config.NN_TEXT_WEIGHT, nn_preds),
    (Config.MULTIMODAL_WEIGHT, multimodal_preds),
]
pseudo_weight_total = sum(weight for weight, _ in pseudo_blend)
if pseudo_weight_total <= 0:
    raise ValueError("Base-model ensemble weights must sum to a positive value")
test_avg_preds = sum(weight * preds for weight, preds in pseudo_blend) / pseudo_weight_total

# Select "confident" rows: those whose blended prediction lies in the
# interquartile range (middle 50%) of all test predictions.
q25 = np.percentile(test_avg_preds, 25)
q75 = np.percentile(test_avg_preds, 75)
confident_mask = (test_avg_preds >= q25) & (test_avg_preds <= q75)
confident_indices = np.where(confident_mask)[0]

print(f"Selected {len(confident_indices)} confident pseudo-labels")

# Create pseudo-labeled dataset
pseudo_features = test_features.iloc[confident_indices].copy()
pseudo_labels = test_avg_preds[confident_indices]

# Combine with training data (features and log-price targets stay row-aligned)
combined_features = pd.concat([train_features, pseudo_features], ignore_index=True)
combined_labels = np.concatenate([train_df['log_price'].values, pseudo_labels])

# Retrain CatBoost on combined data
print("Retraining CatBoost with pseudo-labels...")

# Number of differently-seeded refits that are averaged (kept small for time).
N_PSEUDO_SEEDS = 2

catboost_pseudo_preds = np.zeros(len(test_df))

for seed_offset in range(N_PSEUDO_SEEDS):
    model = cb.CatBoostRegressor(
        iterations=2000,
        learning_rate=0.02,
        depth=8,
        random_seed=Config.SEED + seed_offset,
        verbose=0,
        task_type='GPU' if torch.cuda.is_available() else 'CPU'
    )

    model.fit(
        combined_features, combined_labels,
        cat_features=cat_feature_indices,
        verbose=False
    )

    # Average the test predictions over the seeded refits.
    catboost_pseudo_preds += model.predict(test_features) / N_PSEUDO_SEEDS

    del model
    gc.collect()

print("✅ Pseudo-labeling complete!")

# ============================================================================
# CELL 16: Stacking Meta-Model
# ============================================================================
print("\n🎭 Training Stacking Meta-Model...")

# Level-1 inputs: one column per base model. Training rows use out-of-fold
# predictions; test rows use the fold-averaged test predictions.
meta_train = np.column_stack((oof_catboost, oof_lgbm, oof_ridge, oof_nn, oof_multimodal))
meta_test = np.column_stack((catboost_preds, lgbm_preds, ridge_preds, nn_preds, multimodal_preds))

# A few raw features are passed through alongside the base predictions.
passthrough_cols = ['pack_quantity', 'extracted_value', 'log_pack_quantity']
meta_train_full = np.column_stack((meta_train, train_features[passthrough_cols].values))
meta_test_full = np.column_stack((meta_test, test_features[passthrough_cols].values))

# Ridge meta-model, cross-validated with the same splitter as the base models.
stacking_preds = np.zeros(len(test_df))
oof_stacking = np.zeros(len(train_df))
log_price_target = train_df['log_price']

for fold, (train_idx, val_idx) in enumerate(kf.split(train_df), 1):
    meta_model = Ridge(alpha=5.0)
    meta_model.fit(meta_train_full[train_idx], log_price_target.iloc[train_idx])

    # Held-out rows feed the OOF vector; the test prediction is fold-averaged.
    oof_stacking[val_idx] = meta_model.predict(meta_train_full[val_idx])
    stacking_preds += meta_model.predict(meta_test_full) / Config.N_FOLDS

# Meta predictions are in log1p space; invert before scoring on raw prices.
oof_smape_stack = smape(train_df['price'], np.expm1(oof_stacking))
print(f"🏆 Stacking OOF SMAPE: {oof_smape_stack:.2f}")

# ============================================================================
# CELL 17: Final Ensemble
# ============================================================================
banner = "=" * 70
print("\n" + banner)
print("🎯 FINAL ENSEMBLE")
print(banner)

# One weight per ensemble member, in the same order as the prediction tuples.
member_weights = (
    Config.CATBOOST_WEIGHT,
    Config.LGBM_WEIGHT,
    Config.RIDGE_WEIGHT,
    Config.NN_TEXT_WEIGHT,
    Config.MULTIMODAL_WEIGHT,
    Config.STACKING_WEIGHT,
)
test_members = (catboost_preds, lgbm_preds, ridge_preds, nn_preds, multimodal_preds, stacking_preds)
oof_members = (oof_catboost, oof_lgbm, oof_ridge, oof_nn, oof_multimodal, oof_stacking)

# Weighted ensemble of the test predictions (log-price space)
weighted_test_log = sum(w * p for w, p in zip(member_weights, test_members))

# Add pseudo-labeled CatBoost with small weight
final_preds_log = 0.92 * weighted_test_log + 0.08 * catboost_pseudo_preds

# Convert to actual prices
final_preds = np.expm1(final_preds_log)

# Same weighted blend on the out-of-fold predictions. The pseudo-label model
# has no OOF predictions, so it is not part of this estimate.
oof_ensemble_log = sum(w * p for w, p in zip(member_weights, oof_members))
oof_ensemble = np.expm1(oof_ensemble_log)
final_oof_smape = smape(train_df['price'], oof_ensemble)

# Print results
print(f"\n📊 Individual Model Performance:")
print(f"   CatBoost:      {oof_smape_cb:.2f} (weight: {Config.CATBOOST_WEIGHT})")
print(f"   LightGBM:      {oof_smape_lgb:.2f} (weight: {Config.LGBM_WEIGHT})")
print(f"   Ridge:         {oof_smape_ridge:.2f} (weight: {Config.RIDGE_WEIGHT})")
print(f"   Neural Net:    {oof_smape_nn:.2f} (weight: {Config.NN_TEXT_WEIGHT})")
print(f"   Multimodal:    {oof_smape_mm:.2f} (weight: {Config.MULTIMODAL_WEIGHT})")
print(f"   Stacking:      {oof_smape_stack:.2f} (weight: {Config.STACKING_WEIGHT})")
print(f"\n{'='*70}")
print(f"🏆 FINAL ENSEMBLE OOF SMAPE: {final_oof_smape:.2f}")
print(f"{'='*70}")

# ============================================================================
# CELL 18: Post-Processing & Calibration
# ============================================================================
print("\n🔧 Post-processing...")

# Floor the predictions at 0.1 so every price is strictly positive.
final_preds = np.maximum(final_preds, 0.1)

# Ratio of the training-price median to the predicted median.
train_median = train_df['price'].median()
pred_median = np.median(final_preds)
calibration_factor = train_median / pred_median

# Gentle calibration: apply only 20% of the correction implied by the ratio.
gentle_scale = 0.8 + 0.2 * calibration_factor
calibrated_preds = final_preds * gentle_scale

# Clip extremes to a band derived from the 1st/99th training-price percentiles.
q01, q99 = train_df['price'].quantile([0.01, 0.99])
lower_bound = q01 * 0.3
upper_bound = q99 * 2.0
calibrated_preds = np.clip(calibrated_preds, lower_bound, upper_bound)

print(f"Calibration factor: {calibration_factor:.3f}")
print(f"Price range: ${calibrated_preds.min():.2f} - ${calibrated_preds.max():.2f}")

# ============================================================================
# CELL 19: Create Submissions
# ============================================================================
print("\n📝 Creating submissions...")

sample_ids = test_df['sample_id']

# Main submission: uncalibrated ensemble prices.
submission = pd.DataFrame({'sample_id': sample_ids, 'price': final_preds})

# Calibrated submission: median-calibrated and clipped prices.
submission_calibrated = pd.DataFrame({'sample_id': sample_ids, 'price': calibrated_preds})

# Verify: one row per test sample, and every price strictly positive
# (the comparison is False for NaN, so NaNs fail this check too).
assert len(submission) == len(test_df)
assert (submission['price'] > 0).all()

# Save both files without the index column.
outputs = (
    (submission, 'submission.csv'),
    (submission_calibrated, 'submission_calibrated.csv'),
)
for frame, filename in outputs:
    frame.to_csv(filename, index=False)

print("\n✅ Submissions saved!")
print(f"   - submission.csv")
print(f"   - submission_calibrated.csv")


  