In [1]:
# Python Libraries
import pandas as pd
import numpy as np
import os
import re
from pathlib import Path
from tqdm.notebook import tqdm
import lightgbm as lgb
import warnings

# ML Libraries
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
import timm
from src.utils import download_images # Make sure utils.py is in src/ folder

# Configuration
warnings.filterwarnings('ignore')
DATA_DIR = Path('./data')

TRAIN_IMAGE_DIR = DATA_DIR / 'test_images'  # Folder with TRAIN photos
TEST_IMAGE_DIR = DATA_DIR / 'train_images'   # Folder with TEST photos

# Confirm the directories exist
TRAIN_IMAGE_DIR.mkdir(parents=True, exist_ok=True)
TEST_IMAGE_DIR.mkdir(parents=True, exist_ok=True)

# Device Selection for Mac M1 GPU
if torch.cuda.is_available():
    DEVICE = "cuda"
elif torch.backends.mps.is_available():
    DEVICE = "mps"
else:
    DEVICE = "cpu"

print(f"Setup Complete. Using device: {DEVICE}")
print("\nPath Corrections")
print(f"TRAIN_IMAGE_DIR is now set to: {TRAIN_IMAGE_DIR}")
print(f"TEST_IMAGE_DIR is now set to:  {TEST_IMAGE_DIR}")

Setup Complete. Using device: mps

Path Corrections
TRAIN_IMAGE_DIR is now set to: data/test_images
TEST_IMAGE_DIR is now set to:  data/train_images


In [2]:
# Load

df_train = pd.read_csv(DATA_DIR / 'train.csv')
df_test = pd.read_csv(DATA_DIR / 'test.csv')

print(f"Train data shape: {df_train.shape}")
print(f"Test data shape: {df_test.shape}")

print("\nTrain data sample:")
df_train.head()

Train data shape: (75000, 4)
Test data shape: (75000, 3)

Train data sample:


Unnamed: 0,sample_id,catalog_content,image_link,price
0,33127,"Item Name: La Victoria Green Taco Sauce Mild, ...",https://m.media-amazon.com/images/I/51mo8htwTH...,4.89
1,198967,"Item Name: Salerno Cookies, The Original Butte...",https://m.media-amazon.com/images/I/71YtriIHAA...,13.12
2,261251,"Item Name: Bear Creek Hearty Soup Bowl, Creamy...",https://m.media-amazon.com/images/I/51+PFEe-w-...,1.97
3,55858,Item Name: Judee’s Blue Cheese Powder 11.25 oz...,https://m.media-amazon.com/images/I/41mu0HAToD...,30.34
4,292686,"Item Name: kedem Sherry Cooking Wine, 12.7 Oun...",https://m.media-amazon.com/images/I/41sA037+Qv...,66.49


In [3]:
# Define paths that match your actual folder names
TRAIN_IMG_DIR = DATA_DIR / 'train_images'
TEST_IMG_DIR = DATA_DIR / 'test_images'

# This will just confirm the folders exist and won't create new ones
TRAIN_IMG_DIR.mkdir(exist_ok=True, parents=True)
TEST_IMG_DIR.mkdir(exist_ok=True, parents=True)

print(f"Paths are now correctly set:")
print(f"Train images path: {TRAIN_IMG_DIR}")
print(f"Test images path: {TEST_IMG_DIR}")

Paths are now correctly set:
Train images path: data/train_images
Test images path: data/test_images


In [4]:
#import requests
#from PIL import Image
#from io import BytesIO
#from tqdm import tqdm
#import os

#def download_single_image(row, img_dir):
#    """Download a single image given a row with sample_id and image_link"""
#    sample_id = row['sample_id']
#    img_url = row['image_link']
#    img_path = os.path.join(img_dir, f"{sample_id}.jpg")
    
    # Skip if already downloaded
#    if os.path.exists(img_path):
#        return True
    
#    try:
#        response = requests.get(img_url, timeout=10)
 #       if response.status_code == 200:
#            img = Image.open(BytesIO(response.content))
#            img = img.convert('RGB')
#            img.save(img_path, 'JPEG')
#            return True
#    except Exception as e:
#        return False
    
#    return False

#print("Downloading training images...")
#print("This will take a while. Progress will be shown below.")

# Download without multiprocessing (more stable on Mac)
#successful = 0
#failed = 0

#for idx, row in tqdm(df_train.iterrows(), total=len(df_train), desc="Train images"):
#    if download_single_image(row, str(TRAIN_IMG_DIR)):
#       successful += 1
#    else:
#        failed += 1
    
    # Optional: Add a small delay every 100 images to avoid throttling
#   if idx % 100 == 0 and idx > 0:
#      time.sleep(0.5)

#print(f"\nTraining images download complete!")
#print(f"Successful: {successful}, Failed: {failed}")
print(f"Total files in train_images: {len(list(TRAIN_IMG_DIR.glob('*.jpg')))}")

# Cell 6 (CORRECTED): Download test images
#("Downloading test images...")

#successful = 0
#failed = 0

#for idx, row in tqdm(df_test.iterrows(), total=len(df_test), desc="Test images"):
#    if download_single_image(row, str(TEST_IMG_DIR)):
#        successful += 1
#   else:
#        failed += 1
    
#    if idx % 100 == 0 and idx > 0:
#        time.sleep(0.5)

#print(f"\nTest images download complete!")
#print(f"Successful: {successful}, Failed: {failed}")
print(f"Total files in test_images: {len(list(TEST_IMG_DIR.glob('*.jpg')))}")

Total files in train_images: 74999
Total files in test_images: 74999


In [5]:
# Clean the wrong Images

from tqdm import tqdm

print(f"Cleaning the test images folder: {TEST_IMAGE_DIR}")

# Create a set of valid sample IDs for the test set for fast lookups
valid_test_ids = set(df_test['sample_id'].astype(str))

files_to_delete = []
# Find all files that do NOT belong in the test set
for f_path in TEST_IMAGE_DIR.glob('*.jpg'):
    if f_path.stem not in valid_test_ids:
        files_to_delete.append(f_path)

if not files_to_delete:
    print("No extra files found. Folder is already clean!")
else:
    print(f"Found {len(files_to_delete)} extra files to delete. Deleting now...")
    for f in tqdm(files_to_delete, desc="Cleaning"):
        f.unlink() # This deletes the file
    print("Cleaning complete.")

# --- Final Verification ---
train_count = len(list(TRAIN_IMAGE_DIR.glob('*.jpg')))
test_count = len(list(TEST_IMAGE_DIR.glob('*.jpg')))

print("\n--- Final File Counts ---")
print(f"Images in TRAIN folder ({TRAIN_IMAGE_DIR.name}): {train_count}")
print(f"Images in TEST folder ({TEST_IMAGE_DIR.name}):  {test_count}")

Cleaning the test images folder: data/train_images
No extra files found. Folder is already clean!

--- Final File Counts ---
Images in TRAIN folder (test_images): 74999
Images in TEST folder (train_images):  74999


In [6]:
# Cell for FAST and EFFICIENT Downloading

# Import the main download function from your updated utils.py
from src.utils import download_images
from pathlib import Path

def run_smart_download(df, img_dir):
    """
    Checks for missing files and calls the official download script only for them.
    """
    img_dir_path = Path(img_dir)
    print(f"--- Verifying images in '{img_dir_path.name}' ---")

    expected_ids = set(df['sample_id'].astype(str))
    existing_ids = {f.stem for f in img_dir_path.glob('*.jpg')}
    missing_ids = expected_ids - existing_ids
    
    print(f"Found {len(existing_ids)} of {len(expected_ids)} expected images.")

    if not missing_ids:
        print("All images are present. No download needed.")
        return

    print(f"{len(missing_ids)} missing image(s) detected. Preparing to download.")
    
    # Filter the DataFrame to get only the rows for the missing images
    df_missing = df[df['sample_id'].astype(str).isin(missing_ids)]
    
    # Create the list of tasks [(sample_id, image_link), ...]
    tasks_to_run = list(zip(df_missing['sample_id'], df_missing['image_link']))
    
    # Call the download function from utils.py
    download_images(tasks_to_run, str(img_dir_path))
    
    print("\nDownload attempt complete.")

# --- Execute the download for both sets ---
# Make sure TRAIN_IMAGE_DIR and TEST_IMAGE_DIR are set correctly from your setup cell!
run_smart_download(df_train, TRAIN_IMAGE_DIR)
print("-" * 30)
run_smart_download(df_test, TEST_IMAGE_DIR)

--- Verifying images in 'test_images' ---
Found 74999 of 75000 expected images.
1 missing image(s) detected. Preparing to download.


Downloading to test_images: 100%|██████████| 1/1 [00:00<00:00,  3.38it/s]



Download attempt complete.
------------------------------
--- Verifying images in 'train_images' ---
Found 74999 of 75000 expected images.
1 missing image(s) detected. Preparing to download.


Downloading to train_images: 100%|██████████| 1/1 [00:00<00:00,  3.74it/s]


Download attempt complete.





In [7]:
# Cell 7: Parse Catalog Content
import re

def parse_content(content_string):
    """
    Parses the raw catalog_content string into separate, clean features.
    """
    if not isinstance(content_string, str):
        content_string = ""
        
    lines = content_string.strip().split('\n')
    
    # Default values
    item_name = ""
    bullet_points = []
    prod_desc = ""
    value = 1.0  # Default to 1 if not found
    unit = "Unknown"

    for line in lines:
        if line.lower().startswith("item name:"):
            item_name = line[len("item name:"):].strip()
        elif line.lower().startswith("bullet point"):
            bp_text = re.sub(r'Bullet Point \d+:', '', line, flags=re.IGNORECASE).strip()
            bullet_points.append(bp_text)
        elif line.lower().startswith("product description:"):
            prod_desc = line[len("product description:"):].strip()
        elif line.lower().startswith("value:"):
            try:
                value = float(line[len("value:"):].strip())
            except (ValueError, TypeError):
                value = 1.0 # Keep default if parsing fails
        elif line.lower().startswith("unit:"):
            unit = line[len("unit:"):].strip()
            
    # Combine all text fields into a single 'clean_text' feature
    clean_text = " ".join([item_name] + bullet_points + [prod_desc]).strip()
    
    return pd.Series([clean_text, value, unit], index=['clean_text', 'quantity', 'unit'])

# --- Apply the function to both train and test dataframes ---
print("Parsing training data...")
df_train_parsed = df_train['catalog_content'].apply(parse_content)
df_train = pd.concat([df_train, df_train_parsed], axis=1)

print("Parsing test data...")
df_test_parsed = df_test['catalog_content'].apply(parse_content)
df_test = pd.concat([df_test, df_test_parsed], axis=1)

# Display the new columns to verify
print("\nNew features created successfully!")
df_train[['clean_text', 'quantity', 'unit']].head()

Parsing training data...
Parsing test data...

New features created successfully!


Unnamed: 0,clean_text,quantity,unit
0,"La Victoria Green Taco Sauce Mild, 12 Ounce (P...",72.0,Fl Oz
1,"Salerno Cookies, The Original Butter Cookies, ...",32.0,Ounce
2,"Bear Creek Hearty Soup Bowl, Creamy Chicken wi...",11.4,Ounce
3,Judee’s Blue Cheese Powder 11.25 oz - Gluten-F...,11.25,Ounce
4,"kedem Sherry Cooking Wine, 12.7 Ounce - 12 per...",12.0,Count


In [8]:
# Cell 8: Generate and Save Text Embeddings

# import torch
# import numpy as np
# from tqdm import tqdm
# from pathlib import Path
# from sentence_transformers import SentenceTransformer

# # --- Device setup ---
# DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
# print(f"Using device: {DEVICE}")

# # --- Parameters ---
# TEXT_BATCH_SIZE = 33
# SAVE_DIR = Path("embeddings")
# SAVE_DIR.mkdir(exist_ok=True, parents=True)

# # --- Text Embedding Generation ---
# print("Loading text model...")
# text_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2", device=DEVICE)

# def generate_text_embeddings(df, column, prefix="train"):
#     """Generates and saves text embeddings in chunks."""
#     print(f"\n--- Generating Text Embeddings for '{prefix}' set ---")
#     texts = df[column].tolist()
#     EMB_CHUNK = 15000  # Process 10,000 texts at a time

#     for start in range(0, len(texts), EMB_CHUNK):
#         end = min(start + EMB_CHUNK, len(texts))
#         print(f"Processing samples {start} to {end}...")
#         batch_texts = texts[start:end]
        
#         embeds = text_model.encode(
#             batch_texts, 
#             batch_size=TEXT_BATCH_SIZE, 
#             show_progress_bar=True, 
#             convert_to_numpy=True
#         )
        
#         np.save(SAVE_DIR / f"{prefix}_text_embeds_{start}_{end}.npy", embeds)
#         print(f"Saved chunk: {prefix}_text_embeds_{start}_{end}.npy")
#         if DEVICE == "mps":
#             torch.mps.empty_cache()

# # --- Execute Text Embedding Generation ---
# #generate_text_embeddings(df_train, "clean_text", prefix="train")
# #generate_text_embeddings(df_test, "clean_text", prefix="test")

# # --- IMPORTANT: Clear model from memory ---
# #del text_model
# #if DEVICE == "mps":
 #    torch.mps.empty_cache()
print("\nText model cleared from memory.")


Text model cleared from memory.


In [9]:
# import ssl
# import torch
# import numpy as np
# from tqdm import tqdm
# from pathlib import Path
# import clip
# from PIL import Image

# # Cell 9 (UPDATED): More Memory-Efficient Image Embeddings

# import gc # Import the garbage collector

# # --- SSL Fix (just in case) ---
# ssl._create_default_https_context = ssl._create_unverified_context

# # --- Device setup ---
# DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
# print(f"Using device: {DEVICE}")

# # --- Parameters ---
# # REDUCED BATCH SIZE to lower memory usage
# IMAGE_BATCH_SIZE = 16 
# SAVE_DIR = Path("embeddings")

# # --- Image Embedding Generation ---
# print("Loading CLIP model...")
# # This is the line that was causing the crash
# clip_model, clip_preprocess = clip.load("ViT-B/32", device=DEVICE) 
# print("CLIP model loaded successfully.")

# def generate_image_embeddings(df, image_dir, prefix="train"):
#     """Generates and saves image embeddings in chunks with aggressive memory cleaning."""
#     print(f"\n--- Generating Image Embeddings for '{prefix}' set ---")
#     image_paths = [Path(image_dir) / f"{sid}.jpg" for sid in df['sample_id'].astype(str)]
#     EMB_CHUNK = 5000  # Process 5,000 images at a time

#     for start in range(0, len(image_paths), EMB_CHUNK):
#         end = min(start + EMB_CHUNK, len(image_paths))
#         print(f"Processing samples {start} to {end}...")
#         batch_paths = image_paths[start:end]
        
#         # Inner loop to process smaller batches
#         chunk_embeds = []
#         for i in tqdm(range(0, len(batch_paths), IMAGE_BATCH_SIZE), desc="Image batches"):
#             inner_batch_paths = batch_paths[i:i+IMAGE_BATCH_SIZE]
#             images = []
#             for p in inner_batch_paths:
#                 try:
#                     img = Image.open(p).convert("RGB")
#                     images.append(clip_preprocess(img))
#                 except (FileNotFoundError, OSError):
#                     images.append(torch.zeros(3, 224, 224)) # Placeholder for missing images
            
#             image_batch = torch.stack(images).to(DEVICE)

#             with torch.no_grad():
#                 embeds = clip_model.encode_image(image_batch).cpu().numpy()
#                 chunk_embeds.append(embeds)

#             # Forcefully clear memory
#             del image_batch
#             del images
#             if DEVICE == "mps":
#                 torch.mps.empty_cache()
#             gc.collect()

#         # Save the collected embeddings for the entire chunk
#         chunk_embeds_full = np.vstack(chunk_embeds)
#         np.save(SAVE_DIR / f"{prefix}_image_embeds_{start}_{end}.npy", chunk_embeds_full)
#         print(f"Saved chunk: {prefix}_image_embeds_{start}_{end}.npy")

# # --- Execute Image Embedding Generation ---
# generate_image_embeddings(df_train, TRAIN_IMAGE_DIR, prefix="train")

# # --- Final Cleanup ---
# del clip_model
# gc.collect()
# if DEVICE == "mps":
#     torch.mps.empty_cache()

# print("\nAll image embeddings have been generated and saved.")


In [10]:
# import ssl
# import torch
# import numpy as np
# from tqdm import tqdm
# from pathlib import Path
# import clip
# from PIL import Image
# import gc
# import pandas as pd

# # --- This script does ONLY ONE THING: generates test image embeddings ---

# print("--- Starting Test Image Embedding Generation ---")

# # --- Setup Paths ---
# DATA_DIR = Path('./data')
# TEST_IMAGE_DIR = DATA_DIR / 'train_images' # As per our corrected path setup
# SAVE_DIR = Path("embeddings")
# SAVE_DIR.mkdir(exist_ok=True, parents=True)

# # --- SSL Fix ---
# ssl._create_default_https_context = ssl._create_unverified_context

# # --- Device setup ---
# DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
# print(f"Using device: {DEVICE}")

# # --- Parameters ---
# IMAGE_BATCH_SIZE = 16 
# EMB_CHUNK = 5000

# # --- Load ONLY the necessary data ---
# print("Loading test.csv...")
# df_test = pd.read_csv(DATA_DIR / 'test.csv')

# # --- Image Embedding Generation Function ---
# def generate_image_embeddings(df, image_dir, prefix="test"):
#     print(f"\n--- Generating Image Embeddings for '{prefix}' set ---")
#     image_paths = [Path(image_dir) / f"{sid}.jpg" for sid in df['sample_id'].astype(str)]

#     for start in range(0, len(image_paths), EMB_CHUNK):
#         end = min(start + EMB_CHUNK, len(image_paths))
#         print(f"Processing samples {start} to {end}...")
#         batch_paths = image_paths[start:end]
        
#         chunk_embeds = []
#         for i in tqdm(range(0, len(batch_paths), IMAGE_BATCH_SIZE), desc="Image batches"):
#             inner_batch_paths = batch_paths[i:i+IMAGE_BATCH_SIZE]
#             images = []
#             for p in inner_batch_paths:
#                 try:
#                     img = Image.open(p).convert("RGB")
#                     images.append(clip_preprocess(img))
#                 except (FileNotFoundError, OSError):
#                     images.append(torch.zeros(3, 224, 224))
            
#             image_batch = torch.stack(images).to(DEVICE)

#             with torch.no_grad():
#                 embeds = clip_model.encode_image(image_batch).cpu().numpy()
#                 chunk_embeds.append(embeds)

#             # Forcefully clear memory
#             del image_batch, images
#             if DEVICE == "mps":
#                 torch.mps.empty_cache()
#             gc.collect()

#         chunk_embeds_full = np.vstack(chunk_embeds)
#         np.save(SAVE_DIR / f"{prefix}_image_embeds_{start}_{end}.npy", chunk_embeds_full)
#         print(f"Saved chunk: {prefix}_image_embeds_{start}_{end}.npy")

# # --- Main Execution ---
# if __name__ == "__main__":
#     print("\nLoading CLIP model...")
#     clip_model, clip_preprocess = clip.load("ViT-B/32", device=DEVICE) 
#     print("CLIP model loaded successfully.")

#     # Execute ONLY for the TEST set
#     #generate_image_embeddings(df_test, TEST_IMAGE_DIR, prefix="test")

#     print("\nAll test image embeddings have been generated and saved!")

In [11]:
# # Cell: Generate LAST CHUNK of TEST IMAGE Embeddings

# import ssl
# import torch
# import numpy as np
# from tqdm import tqdm
# from pathlib import Path
# import clip
# from PIL import Image
# import gc

# # --- SSL Fix ---
# ssl._create_default_https_context = ssl._create_unverified_context

# # --- Device setup ---
# DEVICE = "mps" if torch.backends.mps.is_available() else "cpu"
# print(f"Using device: {DEVICE}")

# # --- Parameters ---
# IMAGE_BATCH_SIZE = 16 
# EMB_CHUNK = 5000
# SAVE_DIR = Path("embeddings")

# # --- Image Embedding Generation ---
# print("Loading CLIP model...")
# clip_model, clip_preprocess = clip.load("ViT-B/32", device=DEVICE) 
# print("CLIP model loaded successfully.")

# def generate_image_embeddings(df, image_dir, prefix="test", start_from=0):
#     print(f"\n--- Generating Image Embeddings for '{prefix}' set, starting from index {start_from} ---")
#     image_paths = [Path(image_dir) / f"{sid}.jpg" for sid in df['sample_id'].astype(str)]

#     # This loop will now start from 70000
#     for start in range(start_from, len(image_paths), EMB_CHUNK):
#         end = min(start + EMB_CHUNK, len(image_paths))
#         print(f"Processing samples {start} to {end}...")
#         batch_paths = image_paths[start:end]
        
#         chunk_embeds = []
#         for i in tqdm(range(0, len(batch_paths), IMAGE_BATCH_SIZE), desc="Image batches"):
#             inner_batch_paths = batch_paths[i:i+IMAGE_BATCH_SIZE]
#             images = []
#             for p in inner_batch_paths:
#                 try:
#                     img = Image.open(p).convert("RGB")
#                     images.append(clip_preprocess(img))
#                 except (FileNotFoundError, OSError):
#                     images.append(torch.zeros(3, 224, 224))
            
#             image_batch = torch.stack(images).to(DEVICE)
#             with torch.no_grad():
#                 embeds = clip_model.encode_image(image_batch).cpu().numpy()
#                 chunk_embeds.append(embeds)

#             del image_batch, images
#             if DEVICE == "mps": torch.mps.empty_cache()
#             gc.collect()

#         chunk_embeds_full = np.vstack(chunk_embeds)
#         np.save(SAVE_DIR / f"{prefix}_image_embeds_{start}_{end}.npy", chunk_embeds_full)
#         print(f"Saved chunk: {prefix}_image_embeds_{start}_{end}.npy")

# # --- Execute for ONLY the last chunk of the TEST set ---
# generate_image_embeddings(df_test, TEST_IMAGE_DIR, prefix="test", start_from=70000)

# del clip_model
# gc.collect()
# if DEVICE == "mps":
#     torch.mps.empty_cache()

# print("\n✅ Final image chunk generated.")

In [12]:
# Cell 10: Build and Save Final Datasets

import numpy as np
import pandas as pd
from pathlib import Path
import gc

SAVE_DIR = Path("embeddings")

# --- First, handle the categorical 'unit' features to get consistent columns ---
print("Preparing categorical 'unit' features...")
train_units_df = pd.get_dummies(df_train['unit'], prefix='unit')
test_units_df = pd.get_dummies(df_test['unit'], prefix='unit')
train_units_aligned, test_units_aligned = train_units_df.align(test_units_df, join='outer', axis=1, fill_value=0)
print("Categorical features prepared.")

def combine_and_save(prefix, df, unit_features):
    """Loads, combines, and saves the final feature set for one split (train/test)."""
    print(f"\n--- Processing '{prefix}' set ---")
    
    text_files = sorted(SAVE_DIR.glob(f"{prefix}_text_embeds_*.npy"))
    image_files = sorted(SAVE_DIR.glob(f"{prefix}_image_embeds_*.npy"))
    
    if not text_files or not image_files:
        raise FileNotFoundError(f"Missing embedding files for '{prefix}'. Please check the 'embeddings' folder.")

    print(f"Loading and combining {len(text_files)} text chunks...")
    text_embeds = np.vstack([np.load(f) for f in text_files])
    
    print(f"Loading and combining {len(image_files)} image chunks...")
    image_embeds = np.vstack([np.load(f) for f in image_files])

    print("Horizontally stacking all features...")
    final_X = np.hstack([
        text_embeds,
        image_embeds,
        df['quantity'].values.reshape(-1, 1),
        unit_features.values
    ])
    
    save_path = SAVE_DIR / f"final_X_{prefix}.npy"
    np.save(save_path, final_X)
    print(f"Saved final feature array to: {save_path} with shape {final_X.shape}")
    
    # Clean up to free RAM for the next step
    del text_embeds, image_embeds, final_X
    gc.collect()

# --- Execute for train and test sets sequentially ---
combine_and_save("train", df_train, train_units_aligned)
combine_and_save("test", df_test, test_units_aligned)

print("\nAll final feature arrays have been created.")

Preparing categorical 'unit' features...
Categorical features prepared.

--- Processing 'train' set ---
Loading and combining 5 text chunks...
Loading and combining 15 image chunks...
Horizontally stacking all features...
Saved final feature array to: embeddings/final_X_train.npy with shape (75000, 1032)

--- Processing 'test' set ---
Loading and combining 5 text chunks...
Loading and combining 15 image chunks...
Horizontally stacking all features...
Saved final feature array to: embeddings/final_X_test.npy with shape (75000, 1032)

All final feature arrays have been created.


In [13]:
# Cell 11: Load Final Data, Train Model, and Create Submission

# import numpy as np
# import pandas as pd
# from pathlib import Path
# import lightgbm as lgb
# import gc

# SAVE_DIR = Path("embeddings")

# # --- Load the final, pre-combined feature arrays ---
# print("Loading final training and testing data...")
# X_train = np.load(SAVE_DIR / "final_X_train.npy", allow_pickle=True)
# X_test = np.load(SAVE_DIR / "final_X_test.npy", allow_pickle=True)

# # Prepare the target variable (this uses very little memory)
# y_train = np.log1p(df_train['price'])

# print(f"Data loaded successfully!")
# print(f"X_train shape: {X_train.shape}")
# print(f"X_test shape: {X_test.shape}")

# # --- Train the Model ---
# print("\nTraining LightGBM model...")
# lgbm = lgb.LGBMRegressor(
#     objective='regression_l1',
#     metric='rmse',
#     n_estimators=2000,
#     learning_rate=0.01,
#     feature_fraction=0.8,
#     bagging_fraction=0.8,
#     n_jobs=-1,
#     seed=42,
#     verbose=-1,
# )

# lgbm.fit(X_train, y_train)
# print("Model training complete.")

# # --- We no longer need the large X_train array in memory ---
# del X_train
# gc.collect()

# # --- Generate Predictions ---
# print("\nGenerating predictions on the test set...")
# predictions_log = lgbm.predict(X_test)
# predictions = np.expm1(predictions_log)
# predictions[predictions < 0] = 0

# # --- Create Submission File ---
# submission_df = pd.DataFrame({'sample_id': df_test['sample_id'], 'price': predictions})
# submission_df.to_csv('test_out.csv', index=False)

# print("\nSubmission file 'test_out.csv' created successfully!")
# print("Here are the first 5 predictions:")
# print(submission_df.head())

In [14]:
# # Cell: Advanced Ensemble Training for Top Score
# 
# import numpy as np
# import pandas as pd
# from pathlib import Path
# import lightgbm as lgb
# from sklearn.model_selection import KFold
# from sklearn.metrics import mean_squared_error
# from sklearn.linear_model import Ridge
# from tqdm import tqdm
# import gc
# import warnings
# warnings.filterwarnings('ignore')
# 
# # --- 1. Load Data ---
# print("="*60)
# print("🚀 Advanced Ensemble Model - Starting Training")
# print("="*60)
# 
# # Define paths for Colab
# DATA_DIR = Path('data/')
# SAVE_DIR = Path('embeddings')
# 
# print("\n[1/5] Loading pre-computed feature arrays...")
# X_train = np.load(SAVE_DIR / "final_X_train.npy", allow_pickle=True).astype(np.float32)
# X_test = np.load(SAVE_DIR / "final_X_test.npy", allow_pickle=True).astype(np.float32)
# 
# df_train = pd.read_csv(DATA_DIR / "train.csv")
# df_test = pd.read_csv(DATA_DIR / "test.csv")
# 
# # Target variable (log-transformed)
# y_train_log = np.log1p(df_train['price'].values)
# 
# print(f"✓ Data loaded. X_train shape: {X_train.shape}")
# 
# # --- 2. Setup Advanced Training Strategy ---
# print("\n[2/5] Setting up training strategy...")
# N_FOLDS = 5               # Use 5 folds for robust validation
# N_BAGS = 3                # Train 3 models with different seeds per fold
# SEEDS = [42, 2024, 777]    # Different random seeds for bagging
# 
# KF = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
# 
# # Arrays to store predictions
# oof_preds = np.zeros(X_train.shape[0])
# test_preds = np.zeros(X_test.shape[0])
# 
# # --- 3. Define the LightGBM Model ---
# # These are carefully tuned hyperparameters for high performance
# lgbm_params = {
#     "objective": "regression_l1",
#     "metric": "rmse",
#     "boosting_type": "gbdt",
#     "n_estimators": 5000,
#     "learning_rate": 0.01,
#     "num_leaves": 40,
#     "max_depth": 12,
#     "feature_fraction": 0.8,
#     "bagging_fraction": 0.8,
#     "bagging_freq": 1,
#     "lambda_l1": 2.0,
#     "lambda_l2": 2.0,
#     "min_child_samples": 20,
#     "verbose": -1,
#     "n_jobs": -1,
# }
# 
# print(f"✓ Using {N_FOLDS}-Fold CV with {N_BAGS} bags each.")
# 
# # --- 4. The Training Loop ---
# print("\n[3/5] Starting model training...")
# # Wrap the main loop with tqdm for a master progress bar
# fold_iterator = tqdm(enumerate(KF.split(X_train, y_train_log)), total=N_FOLDS, desc="Total Folds")
# 
# for fold, (train_idx, val_idx) in fold_iterator:
#     X_tr, X_val = X_train[train_idx], X_train[val_idx]
#     y_tr, y_val = y_train_log[train_idx], y_train_log[val_idx]
#     
#     val_preds_fold = np.zeros(len(val_idx))
#     test_preds_fold = np.zeros(len(X_test))
#     
#     # Inner loop for bagging
#     for b_idx in range(N_BAGS):
#         print(f"  > Fold {fold+1}, Bag {b_idx+1}/{N_BAGS}")
#         
#         # Update seed for this bag
#         params = lgbm_params.copy()
#         params['seed'] = SEEDS[b_idx] * (fold + 1)
#         
#         model = lgb.LGBMRegressor(**params)
#         model.fit(
#             X_tr, y_tr,
#             eval_set=[(X_val, y_val)],
#             callbacks=[lgb.early_stopping(stopping_rounds=200, verbose=False)]
#         )
#         
#         val_preds_fold += model.predict(X_val, num_iteration=model.best_iteration_) / N_BAGS
#         test_preds_fold += model.predict(X_test, num_iteration=model.best_iteration_) / N_BAGS
#         
#         del model; gc.collect()
# 
#     oof_preds[val_idx] = val_preds_fold
#     test_preds += test_preds_fold / N_FOLDS
#     
#     fold_rmse = np.sqrt(mean_squared_error(y_val, val_preds_fold))
#     fold_iterator.set_postfix(last_fold_rmse=fold_rmse)
# 
# # --- 5. Post-Processing and Submission ---
# print("\n[4/5] Evaluating and calibrating predictions...")
# 
# # Calculate overall Out-of-Fold (OOF) SMAPE score
# oof_prices_actual = np.expm1(y_train_log)
# oof_prices_pred = np.expm1(oof_preds)
# denominator = (oof_prices_actual + oof_prices_pred) / 2
# overall_oof_smape = np.mean(np.abs(oof_prices_pred - oof_prices_actual) / denominator) * 100
# 
# print(f"\n{'='*30}")
# print(f"📊 Overall OOF SMAPE Score: {overall_oof_smape:.4f}%")
# print(f"{'='*30}")
# 
# # Optional but powerful: Calibrate predictions with a simple linear model
# print("Calibrating test predictions on OOF results...")
# lr = Ridge(alpha=5.0)
# lr.fit(oof_preds.reshape(-1, 1), y_train_log)
# calibrated_test_preds_log = lr.predict(test_preds.reshape(-1, 1))
# 
# # Final predictions
# final_predictions = np.expm1(calibrated_test_preds_log)
# final_predictions = np.clip(final_predictions, 0, None) # Ensure no negative prices
# 
# # Create submission file
# print("\n[5/5] Creating submission file...")
# submission_df = pd.DataFrame({'sample_id': df_test['sample_id'], 'price': final_predictions})
# submission_df.to_csv('test_out.csv', index=False)
# 
# print("\n" + "="*60)
# print("✅ DONE! Submission file 'test_out.csv' is ready!")
# print("="*60)
# print("Sample of final predictions:")
# print(submission_df.head())

In [None]:
# ===================================================================
# TRAIN DIVERSE BASE MODELS (Level 1)
# ===================================================================
print("\n[5/8] Training Level 1 base models (7-fold)...")

oof_models = {
    'lgb_huber': np.zeros(len(X_train_qt)),
    'lgb_quantile_10': np.zeros(len(X_train_qt)),
    'lgb_quantile_50': np.zeros(len(X_train_qt)),
    'lgb_quantile_90': np.zeros(len(X_train_qt)),
    'xgb_huber': np.zeros(len(X_train_qt)),
    'cat_mae': np.zeros(len(X_train_qt)),
    'cat_quantile': np.zeros(len(X_train_qt)),
    'hist_huber': np.zeros(len(X_train_qt)),
}

test_preds = {k: np.zeros(len(X_test_qt)) for k in oof_models.keys()}

for fold, (train_idx, val_idx) in enumerate(skf.split(X_train_qt, price_bins), 1):
    print(f"\n{'─'*70}")
    print(f"📊 FOLD {fold}/{N_FOLDS}")
    
    X_tr, X_val = X_train_qt[train_idx], X_train_qt[val_idx]
    y_tr, y_val = y_train_transformed[train_idx], y_train_transformed[val_idx]
    y_val_orig = y_train[val_idx]
    
    # LightGBM Huber (Main Model)
    print("  🔵 LightGBM Huber...")
    lgb1 = lgb.LGBMRegressor(objective='huber', **best_params, n_estimators=5000, 
                             random_state=42+fold, n_jobs=-1, verbose=-1)
    lgb1.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
             callbacks=[lgb.early_stopping(200, verbose=False)])
    oof_models['lgb_huber'][val_idx] = lgb1.predict(X_val)
    test_preds['lgb_huber'] += lgb1.predict(X_test_qt) / N_FOLDS
    
    # LightGBM Quantile 10th
    print("  🔵 LightGBM Quantile-10...")
    lgb2 = lgb.LGBMRegressor(objective='quantile', alpha=0.1, **best_params, 
                             n_estimators=3000, random_state=42+fold, n_jobs=-1, verbose=-1)
    lgb2.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
             callbacks=[lgb.early_stopping(150, verbose=False)])
    oof_models['lgb_quantile_10'][val_idx] = lgb2.predict(X_val)
    test_preds['lgb_quantile_10'] += lgb2.predict(X_test_qt) / N_FOLDS
    
    # LightGBM Quantile 50th (Median)
    print("  🔵 LightGBM Quantile-50...")
    lgb3 = lgb.LGBMRegressor(objective='quantile', alpha=0.5, **best_params, 
                             n_estimators=3000, random_state=42+fold, n_jobs=-1, verbose=-1)
    lgb3.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
             callbacks=[lgb.early_stopping(150, verbose=False)])
    oof_models['lgb_quantile_50'][val_idx] = lgb3.predict(X_val)
    test_preds['lgb_quantile_50'] += lgb3.predict(X_test_qt) / N_FOLDS
    
    # LightGBM Quantile 90th
    print("  🔵 LightGBM Quantile-90...")
    lgb4 = lgb.LGBMRegressor(objective='quantile', alpha=0.9, **best_params, 
                             n_estimators=3000, random_state=42+fold, n_jobs=-1, verbose=-1)
    lgb4.fit(X_tr, y_tr, eval_set=[(X_val, y_val)],
             callbacks=[lgb.early_stopping(150, verbose=False)])
    oof_models['lgb_quantile_90'][val_idx] = lgb4.predict(X_val)
    test_preds['lgb_quantile_90'] += lgb4.predict(X_test_qt) / N_FOLDS
    
    # XGBoost Pseudo-Huber
    print("  🟠 XGBoost Pseudo-Huber...")
    xgb1 = XGBRegressor(
        objective='reg:pseudohubererror', 
        n_estimators=3000,
        learning_rate=0.01, 
        max_depth=10, 
        subsample=0.8,
        colsample_bytree=0.8, 
        reg_alpha=0.5, 
        reg_lambda=0.5,
        early_stopping_rounds=150,  # Move it here as a parameter
        random_state=42+fold, 
        n_jobs=-1, 
        tree_method='hist'
    )
    xgb1.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)
    oof_models['xgb_huber'][val_idx] = xgb1.predict(X_val)
    test_preds['xgb_huber'] += xgb1.predict(X_test_qt) / N_FOLDS
    
    # CatBoost MAE
    print("  🟢 CatBoost MAE...")
    cat1 = CatBoostRegressor(loss_function='MAE', iterations=3000, 
                             learning_rate=0.01, depth=10, l2_leaf_reg=3,
                             random_seed=42+fold, verbose=False, thread_count=-1)
    cat1.fit(X_tr, y_tr, eval_set=(X_val, y_val), 
             early_stopping_rounds=150, verbose=False)
    oof_models['cat_mae'][val_idx] = cat1.predict(X_val)
    test_preds['cat_mae'] += cat1.predict(X_test_qt) / N_FOLDS
    
    # CatBoost Quantile
    print("  🟢 CatBoost Quantile...")
    cat2 = CatBoostRegressor(loss_function='Quantile:alpha=0.5', iterations=3000,
                             learning_rate=0.01, depth=10, l2_leaf_reg=3,
                             random_seed=42+fold, verbose=False, thread_count=-1)
    cat2.fit(X_tr, y_tr, eval_set=(X_val, y_val), 
             early_stopping_rounds=150, verbose=False)
    oof_models['cat_quantile'][val_idx] = cat2.predict(X_val)
    test_preds['cat_quantile'] += cat2.predict(X_test_qt) / N_FOLDS
    
    # HistGradientBoosting (Scikit-learn fast)
    print("  🟣 HistGradientBoosting...")
    hist1 = HistGradientBoostingRegressor(loss='huber', max_iter=500,
                                          learning_rate=0.05, max_depth=12,
                                          random_state=42+fold)
    hist1.fit(X_tr, y_tr)
    oof_models['hist_huber'][val_idx] = hist1.predict(X_val)
    test_preds['hist_huber'] += hist1.predict(X_test_qt) / N_FOLDS
    
    # Calculate fold SMAPE
    fold_avg = np.mean([oof_models[k][val_idx] for k in oof_models.keys()], axis=0)
    fold_avg_price = delta * (fold_avg + np.sqrt(fold_avg**2 + delta**2))
    fold_smape = np.mean(2 * np.abs(fold_avg_price - y_val_orig) / 
                         (np.abs(y_val_orig) + np.abs(fold_avg_price) + 1e-8)) * 100
    print(f"  📈 Fold {fold} SMAPE: {fold_smape:.4f}%")
    
    del X_tr, X_val, y_tr, y_val, lgb1, lgb2, lgb3, lgb4, xgb1, cat1, cat2, hist1
    gc.collect()

# ===================================================================
# LEVEL 2: HILL CLIMBING ENSEMBLE (Kaggle Winner Technique)
# ===================================================================
print("\n[6/8] Hill climbing ensemble optimization...")

# Convert OOF predictions back to original scale
oof_original_scale = {}
for k, v in oof_models.items():
    oof_original_scale[k] = delta * (v + np.sqrt(v**2 + delta**2))

def smape_loss(weights, predictions, y_true):
    weights = np.abs(weights) / np.abs(weights).sum()
    pred = sum(w * p for w, p in zip(weights, predictions))
    return np.mean(2 * np.abs(pred - y_true) / (np.abs(y_true) + np.abs(pred) + 1e-8)) * 100

# Prepare data
pred_list = [oof_original_scale[k] for k in oof_models.keys()]
initial_weights = np.ones(len(oof_models)) / len(oof_models)

# Optimize
result = minimize(
    lambda w: smape_loss(w, pred_list, y_train),
    initial_weights,
    method='Nelder-Mead',
    options={'maxiter': 500}
)
optimal_weights = np.abs(result.x) / np.abs(result.x).sum()

print("\n✓ Optimal weights found:")
for name, weight in zip(oof_models.keys(), optimal_weights):
    print(f"  {name:20s}: {weight:.4f}")

# ===================================================================
# LEVEL 3: META-MODEL STACKING
# ===================================================================
print("\n[7/8] Training meta-model...")

# Stack OOF predictions
stack_train = np.column_stack([oof_original_scale[k] for k in oof_models.keys()])

# Quantile meta-learner (robust)
meta_models = []
for alpha in [0.1, 0.3, 0.5, 0.7, 0.9]:
    meta = QuantileRegressor(quantile=alpha, alpha=0.1, solver='highs')
    meta.fit(stack_train, y_train)
    meta_models.append(meta)

print(f"✓ Trained {len(meta_models)} quantile meta-models")

# ===================================================================
# FINAL PREDICTIONS
# ===================================================================
print("\n[8/8] Creating final predictions...")

# Stack test predictions
test_stack_original = np.column_stack([
    delta * (test_preds[k] + np.sqrt(test_preds[k]**2 + delta**2))
    for k in oof_models.keys()
])

# Hill climbing ensemble
pred_hill = sum(w * test_stack_original[:, i] 
                for i, w in enumerate(optimal_weights))

# Meta-model ensemble
pred_meta = np.mean([meta.predict(test_stack_original) 
                     for meta in meta_models], axis=0)

# Final blend (60% hill climbing, 40% meta)
final_predictions = 0.6 * pred_hill + 0.4 * pred_meta
final_predictions = np.clip(final_predictions, 0.1, np.percentile(y_train, 99.8))

# Calculate final OOF SMAPE
oof_hill = sum(w * pred_list[i] for i, w in enumerate(optimal_weights))
oof_meta = np.mean([meta.predict(stack_train) for meta in meta_models], axis=0)
oof_final = 0.6 * oof_hill + 0.4 * oof_meta
final_smape = np.mean(2 * np.abs(oof_final - y_train) / 
                      (np.abs(y_train) + np.abs(oof_final) + 1e-8)) * 100

print("\n" + "="*70)
print("🏆 FINAL RESULTS")
print("="*70)
print(f"  OOF SMAPE: {final_smape:.4f}%")
print(f"  Expected Test SMAPE: ~{final_smape * 1.02:.4f}%")
print("="*70)

# ===================================================================
# CREATE SUBMISSION
# ===================================================================
submission = pd.DataFrame({
    'sample_id': df_test['sample_id'],
    'price': final_predictions
})
submission.to_csv('ultimate_submission.csv', index=False)

print("\nSUBMISSION CREATED: ultimate_submission.csv")
print("\n First 10 predictions:")
print(submission.head(10))
print("\n" + "="*70)
print("DONE! This should achieve <20% SMAPE!")
print("="*70)


[5/8] Training Level 1 base models (7-fold)...

──────────────────────────────────────────────────────────────────────
📊 FOLD 1/7
  🔵 LightGBM Huber...
  🔵 LightGBM Quantile-10...
  🔵 LightGBM Quantile-50...
  🔵 LightGBM Quantile-90...
  🟠 XGBoost Pseudo-Huber...
  🟢 CatBoost MAE...
  🟢 CatBoost Quantile...


KeyboardInterrupt: 

In [8]:
# ===================================================================
# OPTIMIZED MLP FUSION FOR TEXT + IMAGE EMBEDDINGS (COMPLETE & CORRECTED)
# ===================================================================

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm
import gc
import pandas as pd
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

print("="*70)
print("🚀 MLP FUSION: Text + Image Embeddings (Corrected)")
print("="*70)

# ===================================================================
# ADVANCED MLP ARCHITECTURE
# ===================================================================
class MultimodalFusionMLP(nn.Module):
    """
    Advanced fusion with separate encoders and attention.
    """
    def __init__(self, text_dim, image_dim, other_dim, hidden_dim=512, dropout=0.3):
        super().__init__()
        
        self.text_encoder = nn.Sequential(
            nn.Linear(text_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(dropout)
        )
        self.image_encoder = nn.Sequential(
            nn.Linear(image_dim, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(dropout)
        )
        self.other_encoder = nn.Sequential( # Encoder for quantity, brand, etc.
            nn.Linear(other_dim, 64), nn.LayerNorm(64), nn.ReLU(), nn.Dropout(dropout * 0.5)
        )
        
        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim, num_heads=8, dropout=0.1, batch_first=True)
        
        # Fusion layers to combine all encoded parts
        self.fusion = nn.Sequential(
            nn.Linear(hidden_dim * 2 + 64, hidden_dim * 2), nn.LayerNorm(hidden_dim * 2), nn.ReLU(), nn.Dropout(dropout),
            nn.Linear(hidden_dim * 2, hidden_dim), nn.LayerNorm(hidden_dim), nn.ReLU(), nn.Dropout(dropout * 0.7),
            nn.Linear(hidden_dim, hidden_dim // 2), nn.LayerNorm(hidden_dim // 2), nn.ReLU(), nn.Dropout(dropout * 0.5),
            nn.Linear(hidden_dim // 2, 1)
        )
        self._init_weights()
    
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None: nn.init.constant_(m.bias, 0)
    
    def forward(self, text_emb, image_emb, other_emb):
        text_enc = self.text_encoder(text_emb)
        image_enc = self.image_encoder(image_emb)
        other_enc = self.other_encoder(other_emb)
        
        # Cross-attention: text attends to image
        attended, _ = self.attention(text_enc.unsqueeze(1), image_enc.unsqueeze(1), image_enc.unsqueeze(1))
        attended = attended.squeeze(1)
        
        # Concatenate attended text, original image, and other features
        fused = torch.cat([attended, image_enc, other_enc], dim=1)
        output = self.fusion(fused)
        return output

# ===================================================================
# TRAINING FUNCTION (UPDATED FOR 3 INPUTS)
# ===================================================================
def train_mlp_fusion(X_text_tr, X_image_tr, X_other_tr, y_tr, 
                     X_text_val, X_image_val, X_other_val, y_val,
                     epochs=100, batch_size=256, lr=5e-4):
    device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
    print(f"  Using device: {device}")
    
    text_dim, image_dim, other_dim = X_text_tr.shape[1], X_image_tr.shape[1], X_other_tr.shape[1]
    
    model = MultimodalFusionMLP(text_dim, image_dim, other_dim).to(device)
    
    def pseudo_huber_loss(pred, target, delta=1.0):
        residual = pred - target
        return torch.mean(delta**2 * (torch.sqrt(1 + (residual/delta)**2) - 1))
    
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2)
    
    train_dataset = torch.utils.data.TensorDataset(torch.FloatTensor(X_text_tr), torch.FloatTensor(X_image_tr), torch.FloatTensor(X_other_tr), torch.FloatTensor(y_tr).unsqueeze(1))
    val_dataset = torch.utils.data.TensorDataset(torch.FloatTensor(X_text_val), torch.FloatTensor(X_image_val), torch.FloatTensor(X_other_val), torch.FloatTensor(y_val).unsqueeze(1))
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    best_val_loss = float('inf'); patience, patience_counter = 15, 0
    
    for epoch in range(epochs):
        model.train(); train_loss = 0
        for text_b, image_b, other_b, y_b in train_loader:
            text_b, image_b, other_b, y_b = text_b.to(device), image_b.to(device), other_b.to(device), y_b.to(device)
            optimizer.zero_grad()
            output = model(text_b, image_b, other_b)
            loss = pseudo_huber_loss(output, y_b)
            loss.backward(); torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0); optimizer.step()
            train_loss += loss.item()
        
        model.eval(); val_loss = 0
        with torch.no_grad():
            for text_b, image_b, other_b, y_b in val_loader:
                text_b, image_b, other_b, y_b = text_b.to(device), image_b.to(device), other_b.to(device), y_b.to(device)
                output = model(text_b, image_b, other_b)
                val_loss += pseudo_huber_loss(output, y_b).item()
        
        train_loss /= len(train_loader); val_loss /= len(val_loader); scheduler.step(val_loss)
        
        if val_loss < best_val_loss:
            best_val_loss, best_model_state, patience_counter = val_loss, model.state_dict(), 0
        else:
            patience_counter += 1
        
        if patience_counter >= patience: print(f"    Early stopping at epoch {epoch+1}"); break
        if (epoch + 1) % 10 == 0: print(f"    Epoch {epoch+1}: train_loss={train_loss:.5f}, val_loss={val_loss:.5f}")
            
    model.load_state_dict(best_model_state)
    return model

# ===================================================================
# PREDICTION FUNCTION (UPDATED FOR 3 INPUTS)
# ===================================================================
def predict_mlp(model, X_text, X_image, X_other, batch_size=256):
    device = next(model.parameters()).device
    model.eval(); predictions = []
    with torch.no_grad():
        for i in tqdm(range(0, len(X_text), batch_size), desc="Predicting", leave=False):
            end_idx = min(i + batch_size, len(X_text))
            text_b = torch.FloatTensor(X_text[i:end_idx]).to(device)
            image_b = torch.FloatTensor(X_image[i:end_idx]).to(device)
            other_b = torch.FloatTensor(X_other[i:end_idx]).to(device)
            output = model(text_b, image_b, other_b)
            predictions.append(output.cpu().numpy())
    return np.vstack(predictions).flatten()

# ===================================================================
# --- CORRECTED DATA LOADING AND SLICING ---
# ===================================================================
print("\n[1/4] Loading and slicing combined embeddings...")
SAVE_DIR = Path("embeddings_medium") # Use the correct folder
df_train = pd.read_csv('data/train.csv')
y_train_log = np.log1p(df_train['price'].values)

# Load the SINGLE, COMBINED feature files
X_train_full = np.load(SAVE_DIR / "final_X_train_medium_with_brand.npy", allow_pickle=False)
X_test_full = np.load(SAVE_DIR / "final_X_test_medium_with_brand.npy", allow_pickle=False)

# Define the dimensions of your features
text_dim = 384 # From SentenceTransformer
image_dim = 512 # From ViT-B/16

# Slice the combined arrays into their constituent parts
train_text = X_train_full[:, :text_dim]
train_image = X_train_full[:, text_dim:text_dim+image_dim]
train_other = X_train_full[:, text_dim+image_dim:]

test_text = X_test_full[:, :text_dim]
test_image = X_test_full[:, text_dim:text_dim+image_dim]
test_other = X_test_full[:, text_dim+image_dim:]

print(f"✓ Text: train{train_text.shape}, test{test_text.shape}")
print(f"✓ Image: train{train_image.shape}, test{test_image.shape}")
print(f"✓ Other: train{train_other.shape}, test{test_other.shape}")
del X_train_full, X_test_full; gc.collect()

# ===================================================================
# SCALE FEATURES
# ===================================================================
print("\n[2/4] Scaling features...")
text_scaler, image_scaler, other_scaler = RobustScaler(), RobustScaler(), RobustScaler()

train_text_scaled = text_scaler.fit_transform(train_text); test_text_scaled = text_scaler.transform(test_text)
train_image_scaled = image_scaler.fit_transform(train_image); test_image_scaled = image_scaler.transform(test_image)
train_other_scaled = other_scaler.fit_transform(train_other); test_other_scaled = other_scaler.transform(test_other)

print("✓ Features scaled")
del train_text, train_image, train_other, test_text, test_image, test_other; gc.collect()

# ===================================================================
# K-FOLD TRAINING
# ===================================================================
print("\n[3/4] Training MLP with K-Fold...")
N_FOLDS = 5; kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)
oof_preds = np.zeros(len(train_text_scaled)); test_preds = np.zeros(len(test_text_scaled))

for fold, (train_idx, val_idx) in enumerate(kf.split(train_text_scaled), 1):
    print(f"\n{'─'*70}\n📊 FOLD {fold}/{N_FOLDS}")
    
    model = train_mlp_fusion(
        train_text_scaled[train_idx], train_image_scaled[train_idx], train_other_scaled[train_idx], y_train_log[train_idx],
        train_text_scaled[val_idx], train_image_scaled[val_idx], train_other_scaled[val_idx], y_train_log[val_idx]
    )
    
    oof_preds[val_idx] = predict_mlp(model, train_text_scaled[val_idx], train_image_scaled[val_idx], train_other_scaled[val_idx])
    test_preds += predict_mlp(model, test_text_scaled, test_image_scaled, test_other_scaled) / N_FOLDS
    
    val_pred_price = np.expm1(oof_preds[val_idx]); val_actual_price = np.expm1(y_train_log[val_idx])
    fold_smape = np.mean(2 * np.abs(val_pred_price - val_actual_price) / (np.abs(val_actual_price) + np.abs(val_pred_price) + 1e-8)) * 100
    print(f"  📈 Fold {fold} SMAPE: {fold_smape:.4f}%")
    
    del model; gc.collect(); torch.mps.empty_cache() if torch.backends.mps.is_available() else None

# ===================================================================
# FINAL EVALUATION & SUBMISSION
# ===================================================================
print("\n[4/4] Final evaluation and submission...")
oof_prices = np.expm1(oof_preds); actual_prices = df_train['price'].values
overall_smape = np.mean(2 * np.abs(oof_prices - actual_prices) / (np.abs(actual_prices) + np.abs(oof_prices) + 1e-8)) * 100
print("\n" + "="*70 + f"\n📊 FINAL OOF SMAPE: {overall_smape:.4f}%\n" + "="*70)

final_predictions = np.expm1(test_preds); final_predictions = np.clip(final_predictions, 0.01, None)
df_test = pd.read_csv('data/test.csv')
submission = pd.DataFrame({'sample_id': df_test['sample_id'],'price': final_predictions})
submission.to_csv('mlp_fusion_submission.csv', index=False)

print("\n✅ Submission created: mlp_fusion_submission.csv")
print("\n📋 First 10 predictions:"); print(submission.head(10))

🚀 MLP FUSION: Text + Image Embeddings (Corrected)

[1/4] Loading and slicing combined embeddings...
✓ Text: train(75000, 384), test(75000, 384)
✓ Image: train(75000, 512), test(75000, 512)
✓ Other: train(75000, 156), test(75000, 156)

[2/4] Scaling features...
✓ Features scaled

[3/4] Training MLP with K-Fold...

──────────────────────────────────────────────────────────────────────
📊 FOLD 1/5
  Using device: mps
    Epoch 10: train_loss=0.16026, val_loss=0.20175
    Epoch 20: train_loss=0.12822, val_loss=0.19922
    Early stopping at epoch 27


                                                              

  📈 Fold 1 SMAPE: 52.4386%

──────────────────────────────────────────────────────────────────────
📊 FOLD 2/5
  Using device: mps
    Epoch 10: train_loss=0.16276, val_loss=0.19371
    Epoch 20: train_loss=0.12914, val_loss=0.19446
    Epoch 30: train_loss=0.10848, val_loss=0.20044
    Early stopping at epoch 34


                                                              

  📈 Fold 2 SMAPE: 51.9549%

──────────────────────────────────────────────────────────────────────
📊 FOLD 3/5
  Using device: mps
    Epoch 10: train_loss=0.16391, val_loss=0.19008
    Epoch 20: train_loss=0.12803, val_loss=0.19433
    Early stopping at epoch 27


                                                              

  📈 Fold 3 SMAPE: 52.2093%

──────────────────────────────────────────────────────────────────────
📊 FOLD 4/5
  Using device: mps
    Epoch 10: train_loss=0.16471, val_loss=0.18909
    Epoch 20: train_loss=0.12821, val_loss=0.18717
    Early stopping at epoch 29


                                                              

  📈 Fold 4 SMAPE: 51.3790%

──────────────────────────────────────────────────────────────────────
📊 FOLD 5/5
  Using device: mps
    Epoch 10: train_loss=0.16349, val_loss=0.19048
    Epoch 20: train_loss=0.12911, val_loss=0.19032
    Early stopping at epoch 29


                                                              

  📈 Fold 5 SMAPE: 51.4858%

[4/4] Final evaluation and submission...

📊 FINAL OOF SMAPE: 51.8935%

✅ Submission created: mlp_fusion_submission.csv

📋 First 10 predictions:
   sample_id      price
0     100179  11.339684
1     245611  20.092656
2     146263  19.950173
3      95658   6.729285
4      36806  15.766823
5     148239   5.283599
6      92659   6.564321
7       3780  16.190256
8     196940   8.071885
9      20472   8.899274


In [33]:
# =============================================================================
# Improved Enhanced MLP Fusion (MPS-ready) - Run in one cell
# Uses filenames from your code:
#   final_X_train_medium_with_brand.npy
#   final_X_test_medium_with_brand.npy
#   train.csv, test.csv
# Output:
#   enhanced_mlp_fusion_submission_improved.csv
#   saved fold models: fold_model_1.pth ...
# =============================================================================

import os
import random
import gc
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, QuantileTransformer

# ---------------------------
# Reproducibility
# ---------------------------
SEED = 42
def set_seed(seed=SEED):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    try:
        torch.use_deterministic_algorithms(True)
    except Exception:
        pass

set_seed(SEED)

# ---------------------------
# Device
# ---------------------------
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print("Using device:", DEVICE)

# ---------------------------
# Utility: SMAPE
# ---------------------------
def smape_np(y_true, y_pred, eps=1e-8):
    num = np.abs(y_true - y_pred)
    denom = (np.abs(y_true) + np.abs(y_pred)) / 2.0
    return np.mean(num / np.maximum(denom, eps)) * 100.0

def smape_torch(y_true, y_pred, eps=1e-8):
    num = torch.abs(y_true - y_pred)
    denom = (torch.abs(y_true) + torch.abs(y_pred)) / 2.0
    return torch.mean(num / torch.clamp(denom, min=eps)) * 100.0

# ---------------------------
# Load data + embeddings (names you used)
# ---------------------------
print("Loading CSVs and embeddings...")
df_train = pd.read_csv("data/train.csv")
df_test  = pd.read_csv("data/test.csv")

X_full = np.load("embeddings_medium/final_X_train_medium_with_brand.npy", allow_pickle=False)
X_test_full = np.load("embeddings_medium/final_X_test_medium_with_brand.npy", allow_pickle=False)

# your code used text_dim=384, image_dim=512
TEXT_DIM = 384
IMAGE_DIM = 512

# slice as you did
train_text = X_full[:, :TEXT_DIM]
train_image = X_full[:, TEXT_DIM:TEXT_DIM + IMAGE_DIM]
train_other_base = X_full[:, TEXT_DIM + IMAGE_DIM:]

test_text  = X_test_full[:, :TEXT_DIM]
test_image = X_test_full[:, TEXT_DIM:TEXT_DIM + IMAGE_DIM]
test_other_base = X_test_full[:, TEXT_DIM + IMAGE_DIM:]

del X_full, X_test_full
gc.collect()

# Extra engineered features from your code may already be in train_other_base; 
# if not, you can compute additional ones and append. Here we follow your pipeline:
def extract_advanced_features(df, is_train=True):
    features = {}
    txt = df['catalog_content'].astype(str)
    features['title_len'] = txt.str.len().fillna(0).values
    features['word_count'] = txt.str.split().str.len().fillna(0).values
    features['avg_word_len'] = features['title_len'] / (features['word_count'] + 1)
    features['has_digits'] = txt.str.contains(r'\d', regex=True).astype(int).values
    features['has_special_chars'] = txt.str.contains(r'[^a-zA-Z0-9\s]', regex=True).astype(int).values
    features['num_numbers'] = txt.str.findall(r'\d+').str.len().fillna(0).values
    # basic keywords
    price_keywords = ['premium', 'luxury', 'pro', 'plus', 'max', 'ultra', 'deluxe']
    budget_keywords = ['basic', 'mini', 'lite', 'eco', 'value']
    features['has_premium_word'] = txt.str.lower().str.contains('|'.join(price_keywords)).astype(int).values
    features['has_budget_word'] = txt.str.lower().str.contains('|'.join(budget_keywords)).astype(int).values
    return np.vstack([features[k] for k in sorted(features.keys())]).T, list(sorted(features.keys()))

train_extra, train_extra_cols = extract_advanced_features(df_train, is_train=True)
test_extra, test_extra_cols   = extract_advanced_features(df_test, is_train=False)

# Combine "other" features
train_other = np.hstack([train_other_base, train_extra])
test_other  = np.hstack([test_other_base, test_extra])

print("train_text", train_text.shape, "train_image", train_image.shape, "train_other", train_other.shape)

# ---------------------------
# Scale features (robust)
# ---------------------------
print("Scaling features...")
text_scaler = QuantileTransformer(n_quantiles=1000, output_distribution='normal', random_state=SEED)
img_scaler  = QuantileTransformer(n_quantiles=1000, output_distribution='normal', random_state=SEED+1)
other_scaler = RobustScaler()

train_text_s = text_scaler.fit_transform(train_text)
test_text_s  = text_scaler.transform(test_text)

train_img_s = img_scaler.fit_transform(train_image)
test_img_s  = img_scaler.transform(test_image)

train_other_s = other_scaler.fit_transform(train_other)
test_other_s  = other_scaler.transform(test_other)

del train_text, train_image, train_other, test_text, test_image, test_other
gc.collect()

# ---------------------------
# Target
# ---------------------------
y = df_train['price'].values  # original prices
y_log = np.log1p(y)           # train on log space as you did

# ---------------------------
# Model definition (refined and lighter)
# ---------------------------
class FusionMLP(nn.Module):
    def __init__(self, text_dim, image_dim, other_dim, hidden=512, dropout=0.18):
        super().__init__()
        # small per-modality encoders
        self.text_proj = nn.Sequential(
            nn.Linear(text_dim, hidden//2),
            nn.LayerNorm(hidden//2),
            nn.GELU(),
            nn.Dropout(dropout)
        )
        self.img_proj = nn.Sequential(
            nn.Linear(image_dim, hidden//2),
            nn.LayerNorm(hidden//2),
            nn.GELU(),
            nn.Dropout(dropout)
        )
        self.other_proj = nn.Sequential(
            nn.Linear(other_dim, hidden//4),
            nn.LayerNorm(hidden//4),
            nn.GELU(),
            nn.Dropout(dropout*0.7),
            nn.Linear(hidden//4, hidden//4),
            nn.LayerNorm(hidden//4),
            nn.GELU()
        )
        fusion_dim = hidden//2 + hidden//2 + hidden//4
        self.gate = nn.Sequential(nn.Linear(fusion_dim, fusion_dim), nn.Sigmoid())
        self.fuse = nn.Sequential(
            nn.Linear(fusion_dim, hidden),
            nn.LayerNorm(hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, hidden//2),
            nn.LayerNorm(hidden//2),
            nn.GELU(),
            nn.Dropout(dropout*0.7)
        )
        self.out = nn.Linear(hidden//2, 1)
        # init
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity='leaky_relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0.0)

    def forward(self, t, i, o):
        t_enc = self.text_proj(t)
        i_enc = self.img_proj(i)
        o_enc = self.other_proj(o)
        fused = torch.cat([t_enc, i_enc, o_enc], dim=1)
        gated = self.gate(fused) * fused
        x = self.fuse(gated)
        return self.out(x)

# ---------------------------
# Training helpers
# ---------------------------
def train_one_epoch(model, loader, optimizer, device, loss_fn, max_grad_norm=1.0):
    model.train()
    total_loss = 0.0
    for t, img, oth, yb in loader:
        t = t.to(device); img = img.to(device); oth = oth.to(device); yb = yb.to(device)
        optimizer.zero_grad()
        preds = model(t, img, oth)
        loss = loss_fn(preds, yb)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        total_loss += loss.item() * t.shape[0]
    return total_loss / len(loader.dataset)

def valid_one_epoch(model, loader, device, loss_fn):
    model.eval()
    total_loss = 0.0
    preds = []
    trues = []
    with torch.no_grad():
        for t, img, oth, yb in loader:
            t = t.to(device); img = img.to(device); oth = oth.to(device); yb = yb.to(device)
            out = model(t, img, oth)
            total_loss += loss_fn(out, yb).item() * t.shape[0]
            preds.append(out.cpu().numpy())
            trues.append(yb.cpu().numpy())
    preds = np.concatenate([p.reshape(-1) for p in preds])
    trues = np.concatenate([t.reshape(-1) for t in trues])

    return total_loss / (len(loader.dataset)), preds, trues

# Combined loss: SmoothL1 in log space + small SMAPE penalty (torch)
def combined_loss_torch(pred, target, alpha=0.6):
    pred = pred.contiguous().view_as(target)
    # pred, target are in log-space
    huber = nn.SmoothL1Loss()(pred, target)
    # compute small smape penalty in log space (using exponentials)
    pred_exp = torch.exp(pred)
    target_exp = torch.exp(target)
    smape_term = torch.mean(torch.abs(pred_exp - target_exp) / ( (torch.abs(pred_exp)+torch.abs(target_exp))/2.0 + 1e-8 ))
    return alpha * huber + (1 - alpha) * smape_term

# ---------------------------
# Training loop with warmup + cosine restarts + early stopping
# ---------------------------
def fit_fold(X_text, X_img, X_other, y_log, X_text_val, X_img_val, X_other_val, y_log_val,
             fold_id, device, model_dir="models", epochs=80, batch_size=256, lr=3e-4):
    os.makedirs(model_dir, exist_ok=True)
    model = FusionMLP(X_text.shape[1], X_img.shape[1], X_other.shape[1], hidden=512, dropout=0.18).to(device)

    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-5)
    # warmup for 5 epochs linearly then cosine restarts
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, T_mult=2, eta_min=1e-6)
    # We'll implement a manual warmup multiplier
    warmup_epochs = 5

    train_ds = TensorDataset(torch.FloatTensor(X_text), torch.FloatTensor(X_img), torch.FloatTensor(X_other), torch.FloatTensor(y_log))
    val_ds   = TensorDataset(torch.FloatTensor(X_text_val), torch.FloatTensor(X_img_val), torch.FloatTensor(X_other_val), torch.FloatTensor(y_log_val))

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0)

    best_score = 1e9
    best_state = None
    patience = 12
    wait = 0
    # relative-improvement threshold (small)
    rel_improve = 0.001

    for epoch in range(1, epochs+1):
        # manual warmup scale
        if epoch <= warmup_epochs:
            warmup_scale = epoch / warmup_epochs
            for param_group in optimizer.param_groups:
                param_group['lr'] = lr * warmup_scale
        else:
            # scheduler step
            scheduler.step(epoch + fold_id * 0.0001)  # small offset to differ folds

        train_loss = train_one_epoch(model, train_loader, optimizer, device, combined_loss_torch)
        val_loss, val_preds_log, val_trues_log = valid_one_epoch(model, val_loader, device, combined_loss_torch)

        # compute SMAPE on raw prices
        val_preds_price = np.expm1(val_preds_log)
        val_trues_price = np.expm1(val_trues_log)
        val_smape = smape_np(val_trues_price, val_preds_price)

        # relative improvement check on val_smape
        if val_smape + 1e-9 < best_score * (1 - rel_improve):
            best_score = val_smape
            best_state = {k: v.cpu() for k, v in model.state_dict().items()}
            wait = 0
        else:
            wait += 1

        # print progress
        if epoch % 1 == 0:
            print(f"Fold {fold_id} Epoch {epoch:03d} | train_loss: {train_loss:.5f} | val_loss: {val_loss:.5f} | val_smape: {val_smape:.4f}% | best_smape: {best_score:.4f}% | wait: {wait}")

        if wait >= patience:
            print(f"Early stopping (wait >= {patience}) at epoch {epoch}")
            break

        # cleanup
        gc.collect()
        if torch.backends.mps.is_available():
            torch.mps.empty_cache()

    # save best model state
    if best_state is not None:
        model_path = os.path.join(model_dir, f"fold_model_{fold_id}.pth")
        torch.save(best_state, model_path)
        print(f"Saved fold model to {model_path}")
        # load best into model
        model.load_state_dict(best_state)
    else:
        print("No improvement recorded; saving last model.")
        model_path = os.path.join(model_dir, f"fold_model_{fold_id}_last.pth")
        torch.save(model.state_dict(), model_path)

    return model

# ---------------------------
# K-Fold training
# ---------------------------
N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=SEED)

oof = np.zeros(train_text_s.shape[0], dtype=np.float32)
test_preds = np.zeros(test_text_s.shape[0], dtype=np.float32)
fold_smape_list = []

for fold, (trn_idx, val_idx) in enumerate(kf.split(train_text_s), 1):
    print("\n" + "="*80)
    print(f"Starting Fold {fold}/{N_FOLDS}")
    # training slice
    X_t_text = train_text_s[trn_idx]
    X_t_img  = train_img_s[trn_idx]
    X_t_other= train_other_s[trn_idx]
    y_t_log  = y_log[trn_idx]

    X_v_text = train_text_s[val_idx]
    X_v_img  = train_img_s[val_idx]
    X_v_other= train_other_s[val_idx]
    y_v_log  = y_log[val_idx]

    model = fit_fold(X_t_text, X_t_img, X_t_other, y_t_log,
                     X_v_text, X_v_img, X_v_other, y_v_log,
                     fold_id=fold, device=DEVICE, model_dir="models",
                     epochs=80, batch_size=256, lr=3e-4)

    # OOF preds for val set
    oof_preds_log = []
    model.eval()
    with torch.no_grad():
        for i in range(0, X_v_text.shape[0], 512):
            t = torch.FloatTensor(X_v_text[i:i+512]).to(DEVICE)
            im = torch.FloatTensor(X_v_img[i:i+512]).to(DEVICE)
            oth = torch.FloatTensor(X_v_other[i:i+512]).to(DEVICE)
            out = model(t, im, oth).cpu().numpy().reshape(-1)
            oof_preds_log.append(out)
    oof_preds_log = np.concatenate(oof_preds_log)
    oof[val_idx] = np.expm1(oof_preds_log)  # store price space

    # test preds for fold
    fold_test_log = []
    with torch.no_grad():
        for i in range(0, test_text_s.shape[0], 512):
            t = torch.FloatTensor(test_text_s[i:i+512]).to(DEVICE)
            im = torch.FloatTensor(test_img_s[i:i+512]).to(DEVICE)
            oth = torch.FloatTensor(test_other_s[i:i+512]).to(DEVICE)
            out = model(t, im, oth).cpu().numpy().reshape(-1)
            fold_test_log.append(out)
    fold_test_log = np.concatenate(fold_test_log)
    test_preds += np.expm1(fold_test_log) / N_FOLDS

    # fold SMAPE
    fold_smape = smape_np(df_train['price'].values[val_idx], oof[val_idx])
    fold_smape_list.append(fold_smape)
    print(f"Fold {fold} SMAPE: {fold_smape:.4f}%")
    # cleanup
    del model
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()

# ---------------------------
# Final evaluation + submission
# ---------------------------
overall_smape = smape_np(df_train['price'].values, oof)
print("\n" + "="*80)
print("FOLD SMAPEs:", fold_smape_list)
print(f"OOF SMAPE: {overall_smape:.4f}%")
print("="*80)

# Save submission
final_preds = np.clip(test_preds, 0.01, None)
submission = pd.DataFrame({
    'sample_id': df_test['sample_id'],
    'price': final_preds
})
submission.to_csv("enhanced_mlp_fusion_submission_improved.csv", index=False)
print("Saved submission -> enhanced_mlp_fusion_submission_improved.csv")
print("Prediction stats: min %.2f / mean %.2f / max %.2f" % (final_preds.min(), final_preds.mean(), final_preds.max()))


Using device: mps
Loading CSVs and embeddings...
train_text (75000, 384) train_image (75000, 512) train_other (75000, 164)
Scaling features...

Starting Fold 1/5
Fold 1 Epoch 001 | train_loss: 0.54104 | val_loss: 0.42435 | val_smape: 64.2822% | best_smape: 64.2822% | wait: 0
Fold 1 Epoch 002 | train_loss: 0.40997 | val_loss: 0.37918 | val_smape: 58.9091% | best_smape: 58.9091% | wait: 0
Fold 1 Epoch 003 | train_loss: 0.37570 | val_loss: 0.36009 | val_smape: 56.5821% | best_smape: 56.5821% | wait: 0
Fold 1 Epoch 004 | train_loss: 0.35291 | val_loss: 0.34443 | val_smape: 54.6148% | best_smape: 54.6148% | wait: 0
Fold 1 Epoch 005 | train_loss: 0.33377 | val_loss: 0.34094 | val_smape: 54.2507% | best_smape: 54.2507% | wait: 0
Fold 1 Epoch 006 | train_loss: 0.31384 | val_loss: 0.33043 | val_smape: 52.8749% | best_smape: 52.8749% | wait: 0
Fold 1 Epoch 007 | train_loss: 0.30513 | val_loss: 0.32900 | val_smape: 52.7561% | best_smape: 52.7561% | wait: 0
Fold 1 Epoch 008 | train_loss: 0.30056 |

In [1]:
# ===================================================================
# OPTIMIZED MLP FUSION WITH ADVANCED TECHNIQUES
# Target: SMAPE < 40% with Maximum Performance
# ===================================================================

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, QuantileTransformer, StandardScaler
from tqdm import tqdm
import gc
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

print("="*70)
print("🚀 OPTIMIZED MLP FUSION: Advanced Price Prediction")
print("="*70)

# ===================================================================
# ENHANCED FEATURE ENGINEERING WITH MORE SIGNALS
# ===================================================================
def extract_advanced_features(df, is_train=True, price_stats=None):
    """Extract comprehensive features from all available data"""
    print(f"  Extracting advanced features...")
   
    features = {}
   
    # ============ TEXT FEATURES ============
    catalog = df['catalog_content'].fillna('')
   
    # Length and structure features
    features['title_len'] = catalog.str.len()
    features['word_count'] = catalog.str.split().str.len()
    features['avg_word_len'] = features['title_len'] / (features['word_count'] + 1)
    features['char_count'] = catalog.str.len()
   
    # Text pattern features
    features['has_digits'] = catalog.str.contains(r'\d', regex=True).astype(int)
    features['num_digits'] = catalog.str.count(r'\d')
    features['has_special_chars'] = catalog.str.contains(r'[^a-zA-Z0-9\s]', regex=True).astype(int)
    features['uppercase_ratio'] = catalog.str.count(r'[A-Z]') / (features['title_len'] + 1)
    features['digit_ratio'] = features['num_digits'] / (features['title_len'] + 1)
   
    # Extract numeric values from text
    features['num_numbers'] = catalog.str.findall(r'\d+').str.len().fillna(0)
    features['max_number_in_text'] = catalog.str.extractall(r'(\d+)')[0].astype(float).groupby(level=0).max().reindex(df.index).fillna(0)
   
    # Price indicators
    price_keywords = ['premium', 'luxury', 'pro', 'plus', 'max', 'ultra', 'deluxe', 'professional', 'elite']
    budget_keywords = ['basic', 'mini', 'lite', 'eco', 'value', 'budget', 'economy', 'standard']
   
    features['has_premium_word'] = catalog.str.lower().str.contains('|'.join(price_keywords)).astype(int)
    features['has_budget_word'] = catalog.str.lower().str.contains('|'.join(budget_keywords)).astype(int)
    features['premium_word_count'] = sum(catalog.str.lower().str.count(kw) for kw in price_keywords)
    features['budget_word_count'] = sum(catalog.str.lower().str.count(kw) for kw in budget_keywords)
   
    # ============ BRAND FEATURES ============
    if 'brand' in df.columns:
        brand_col = df['brand'].fillna('unknown')
       
        # Brand frequency
        brand_freq = brand_col.value_counts()
        features['brand_freq'] = brand_col.map(brand_freq).fillna(0)
        features['brand_log_freq'] = np.log1p(features['brand_freq'])
       
        # Is rare brand
        features['is_rare_brand'] = (features['brand_freq'] < 5).astype(int)
        features['is_common_brand'] = (features['brand_freq'] > 50).astype(int)
       
        if is_train and 'price' in df.columns:
            # Brand statistics
            brand_stats = df.groupby('brand')['price'].agg(['mean', 'std', 'median', 'min', 'max']).to_dict()
           
            features['brand_mean_price'] = brand_col.map(brand_stats['mean']).fillna(df['price'].median())
            features['brand_std_price'] = brand_col.map(brand_stats['std']).fillna(df['price'].std())
            features['brand_median_price'] = brand_col.map(brand_stats['median']).fillna(df['price'].median())
            features['brand_price_range'] = brand_col.map(lambda x: brand_stats['max'].get(x, 0) - brand_stats['min'].get(x, 0)).fillna(0)
           
            price_stats_to_return = {
                'brand_mean': brand_stats['mean'],
                'brand_std': brand_stats['std'],
                'brand_median': brand_stats['median'],
                'brand_min': brand_stats['min'],
                'brand_max': brand_stats['max']
            }
        elif price_stats is not None:
            # Use pre-computed statistics for test set
            features['brand_mean_price'] = brand_col.map(price_stats['brand_mean']).fillna(price_stats['global_median'])
            features['brand_std_price'] = brand_col.map(price_stats['brand_std']).fillna(price_stats['global_std'])
            features['brand_median_price'] = brand_col.map(price_stats['brand_median']).fillna(price_stats['global_median'])
            features['brand_price_range'] = brand_col.map(lambda x: price_stats['brand_max'].get(x, 0) - price_stats['brand_min'].get(x, 0)).fillna(0)
   
    # ============ QUANTITY FEATURES ============
    if 'quantity' in df.columns:
        qty = df['quantity'].fillna(1)
        features['quantity'] = qty
        features['log_quantity'] = np.log1p(qty)
        features['sqrt_quantity'] = np.sqrt(qty)
        features['quantity_squared'] = qty ** 2
        features['quantity_cubed'] = qty ** 3
        features['inv_quantity'] = 1 / (qty + 0.1)
       
        # Quantity bins
        features['qty_is_one'] = (qty == 1).astype(int)
        features['qty_small'] = (qty <= 5).astype(int)
        features['qty_medium'] = ((qty > 5) & (qty <= 20)).astype(int)
        features['qty_large'] = (qty > 20).astype(int)
   
    # ============ INTERACTION FEATURES ============
    if 'brand_freq' in features and 'word_count' in features:
        features['brand_freq_x_words'] = features['brand_freq'] * features['word_count']
   
    if 'quantity' in features and 'brand_mean_price' in features:
        features['qty_x_brand_price'] = features['quantity'] * features['brand_mean_price']
   
    # ============ POLYNOMIAL FEATURES ============
    if 'word_count' in features:
        features['word_count_squared'] = features['word_count'] ** 2
        features['word_count_log'] = np.log1p(features['word_count'])
   
    result_df = pd.DataFrame(features)
   
    if is_train and 'price' in df.columns:
        price_stats_to_return = price_stats_to_return if 'brand' in df.columns else {}
        price_stats_to_return['global_median'] = df['price'].median()
        price_stats_to_return['global_std'] = df['price'].std()
        return result_df, price_stats_to_return
   
    return result_df

# ===================================================================
# IMPROVED MLP ARCHITECTURE WITH BETTER DESIGN
# ===================================================================
class OptimizedMultimodalFusionMLP(nn.Module):
    """
    Optimized architecture with:
    - Better initialization
    - Skip connections
    - Improved attention
    - Ensemble-ready design
    """
    def __init__(self, text_dim, image_dim, other_dim, hidden_dim=768, dropout=0.25):
        super().__init__()
       
        # Modality-specific encoders with skip connections
        self.text_encoder = nn.Sequential(
            nn.Linear(text_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout * 0.5)
        )
       
        self.image_encoder = nn.Sequential(
            nn.Linear(image_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout * 0.5)
        )
       
        self.other_encoder = nn.Sequential(
            nn.Linear(other_dim, 256),
            nn.BatchNorm1d(256),
            nn.SiLU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(256, 256),
            nn.BatchNorm1d(256),
            nn.SiLU(),
            nn.Dropout(dropout * 0.3)
        )
       
        # Cross-modal attention
        self.cross_attention = nn.MultiheadAttention(
            embed_dim=hidden_dim, num_heads=8, dropout=0.1, batch_first=True
        )
       
        # Fusion gate
        fusion_input_dim = hidden_dim * 2 + 256
        self.fusion_gate = nn.Sequential(
            nn.Linear(fusion_input_dim, fusion_input_dim),
            nn.Sigmoid()
        )
       
        # Main fusion pathway with residual connections
        self.fusion_block1 = nn.Sequential(
            nn.Linear(fusion_input_dim, hidden_dim * 2),
            nn.BatchNorm1d(hidden_dim * 2),
            nn.SiLU(),
            nn.Dropout(dropout)
        )
       
        self.fusion_block2 = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout * 0.5)
        )
       
        self.fusion_block3 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.BatchNorm1d(hidden_dim // 2),
            nn.SiLU(),
            nn.Dropout(dropout * 0.3)
        )
       
        self.fusion_block4 = nn.Sequential(
            nn.Linear(hidden_dim // 2, hidden_dim // 4),
            nn.BatchNorm1d(hidden_dim // 4),
            nn.SiLU()
        )
       
        self.output_layer = nn.Linear(hidden_dim // 4, 1)
       
        self._init_weights()
   
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.xavier_uniform_(m.weight, gain=0.5)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
   
    def forward(self, text_emb, image_emb, other_emb):
        # Encode modalities
        text_enc = self.text_encoder(text_emb)
        image_enc = self.image_encoder(image_emb)
        other_enc = self.other_encoder(other_emb)
       
        # Cross-attention between text and image
        text_q = text_enc.unsqueeze(1)
        image_kv = image_enc.unsqueeze(1)
       
        text_attended, _ = self.cross_attention(text_q, image_kv, image_kv)
        text_attended = text_attended.squeeze(1)
       
        # Residual connection
        text_final = text_enc + 0.3 * text_attended
       
        # Concatenate all modalities
        fused = torch.cat([text_final, image_enc, other_enc], dim=1)
       
        # Apply gating
        gate = self.fusion_gate(fused)
        fused = fused * gate
       
        # Fusion pathway
        x = self.fusion_block1(fused)
        x = self.fusion_block2(x)
        x = self.fusion_block3(x)
        x = self.fusion_block4(x)
       
        output = self.output_layer(x)
        return output

# ===================================================================
# IMPROVED LOSS FUNCTIONS
# ===================================================================
def smape_loss(pred, target, epsilon=0.1):
    """Improved SMAPE loss with better numerical stability"""
    pred_exp = torch.exp(pred)
    target_exp = torch.exp(target)
   
    numerator = torch.abs(pred_exp - target_exp)
    denominator = (torch.abs(target_exp) + torch.abs(pred_exp)) / 2.0 + epsilon
   
    return torch.mean(numerator / denominator)

def combined_loss(pred, target, alpha=0.5, beta=0.3):
    """Multi-component loss for better optimization"""
    # SMAPE component
    smape = smape_loss(pred, target)
   
    # MSE in log space
    mse = nn.MSELoss()(pred, target)
   
    # Huber loss for robustness
    huber = nn.SmoothL1Loss()(pred, target)
   
    return alpha * smape + beta * mse + (1 - alpha - beta) * huber

# ===================================================================
# OPTIMIZED TRAINING WITH BETTER TECHNIQUES
# ===================================================================
def train_optimized_mlp(X_text_tr, X_image_tr, X_other_tr, y_tr,
                        X_text_val, X_image_val, X_other_val, y_val,
                        epochs=200, batch_size=128, lr=8e-5):
   
    device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
    print(f"  Using device: {device}")
   
    text_dim = X_text_tr.shape[1]
    image_dim = X_image_tr.shape[1]
    other_dim = X_other_tr.shape[1]
   
    model = OptimizedMultimodalFusionMLP(text_dim, image_dim, other_dim).to(device)
   
    # Better optimizer configuration
    optimizer = optim.AdamW(
        model.parameters(),
        lr=lr,
        weight_decay=1e-5,
        betas=(0.9, 0.999),
        eps=1e-8
    )
   
    # Improved scheduler
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=lr * 10,
        epochs=epochs,
        steps_per_epoch=len(X_text_tr) // batch_size + 1,
        pct_start=0.1,
        anneal_strategy='cos',
        div_factor=25.0,
        final_div_factor=1000.0
    )
   
    # Create datasets
    train_dataset = torch.utils.data.TensorDataset(
        torch.FloatTensor(X_text_tr),
        torch.FloatTensor(X_image_tr),
        torch.FloatTensor(X_other_tr),
        torch.FloatTensor(y_tr).unsqueeze(1)
    )
   
    val_dataset = torch.utils.data.TensorDataset(
        torch.FloatTensor(X_text_val),
        torch.FloatTensor(X_image_val),
        torch.FloatTensor(X_other_val),
        torch.FloatTensor(y_val).unsqueeze(1)
    )
   
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=batch_size * 2, shuffle=False, num_workers=0)
   
    best_val_loss = float('inf')
    patience = 30
    patience_counter = 0
    best_model_state = None
   
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
       
        for text_b, image_b, other_b, y_b in train_loader:
            text_b = text_b.to(device)
            image_b = image_b.to(device)
            other_b = other_b.to(device)
            y_b = y_b.to(device)
           
            optimizer.zero_grad()
            output = model(text_b, image_b, other_b)
            loss = combined_loss(output, y_b)
           
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
            optimizer.step()
            scheduler.step()
           
            train_loss += loss.item()
       
        # Validation
        model.eval()
        val_loss = 0
       
        with torch.no_grad():
            for text_b, image_b, other_b, y_b in val_loader:
                text_b = text_b.to(device)
                image_b = image_b.to(device)
                other_b = other_b.to(device)
                y_b = y_b.to(device)
               
                output = model(text_b, image_b, other_b)
                val_loss += combined_loss(output, y_b).item()
       
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
       
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1
       
        if patience_counter >= patience:
            print(f"    Early stopping at epoch {epoch+1}")
            break
       
        if (epoch + 1) % 15 == 0:
            print(f"    Epoch {epoch+1}: train_loss={train_loss:.5f}, val_loss={val_loss:.5f}")
   
    model.load_state_dict(best_model_state)
    return model

# ===================================================================
# PREDICTION FUNCTION
# ===================================================================
def predict_optimized(model, X_text, X_image, X_other, batch_size=512):
    device = next(model.parameters()).device
    model.eval()
    predictions = []
   
    with torch.no_grad():
        for i in range(0, len(X_text), batch_size):
            end_idx = min(i + batch_size, len(X_text))
           
            text_b = torch.FloatTensor(X_text[i:end_idx]).to(device)
            image_b = torch.FloatTensor(X_image[i:end_idx]).to(device)
            other_b = torch.FloatTensor(X_other[i:end_idx]).to(device)
           
            output = model(text_b, image_b, other_b)
            predictions.append(output.cpu().numpy())
   
    return np.vstack(predictions).flatten()

# ===================================================================
# MAIN EXECUTION
# ===================================================================
print("\n[1/6] Loading data and embeddings...")
df_train = pd.read_csv('data/train.csv')
df_test = pd.read_csv('data/test.csv')

# Load embeddings
X_train_full = np.load("embeddings_medium/final_X_train_medium_with_brand.npy", allow_pickle=False)
X_test_full = np.load("embeddings_medium/final_X_test_medium_with_brand.npy", allow_pickle=False)

# Define dimensions
text_dim = 384
image_dim = 512

# Slice embeddings
train_text = X_train_full[:, :text_dim]
train_image = X_train_full[:, text_dim:text_dim+image_dim]
train_other_base = X_train_full[:, text_dim+image_dim:]

test_text = X_test_full[:, :text_dim]
test_image = X_test_full[:, text_dim:text_dim+image_dim]
test_other_base = X_test_full[:, text_dim+image_dim:]

print(f"✓ Loaded embeddings")
del X_train_full, X_test_full
gc.collect()

# ===================================================================
# ADVANCED FEATURE ENGINEERING
# ===================================================================
print("\n[2/6] Engineering advanced features...")
train_extra_features, price_stats = extract_advanced_features(df_train, is_train=True)
test_extra_features = extract_advanced_features(df_test, is_train=False, price_stats=price_stats)

# Combine features
train_other = np.hstack([train_other_base, train_extra_features.values])
test_other = np.hstack([test_other_base, test_extra_features.values])

print(f"✓ Enhanced features: {train_other.shape[1]} dimensions")
del train_other_base, test_other_base
gc.collect()

# Target transformation with Box-Cox inspired approach
y_train_log = np.log1p(df_train['price'].values)

# ===================================================================
# OPTIMIZED SCALING STRATEGY
# ===================================================================
print("\n[3/6] Applying optimized scaling...")

# Use different scalers for different modalities
text_scaler = StandardScaler()
image_scaler = StandardScaler()
other_scaler = RobustScaler()

train_text_scaled = text_scaler.fit_transform(train_text)
test_text_scaled = text_scaler.transform(test_text)

train_image_scaled = image_scaler.fit_transform(train_image)
test_image_scaled = image_scaler.transform(test_image)

train_other_scaled = other_scaler.fit_transform(train_other)
test_other_scaled = other_scaler.transform(test_other)

print("✓ Scaling complete")
del train_text, train_image, train_other, test_text, test_image, test_other
gc.collect()

# ===================================================================
# K-FOLD CROSS-VALIDATION
# ===================================================================
print("\n[4/6] Training with K-Fold CV...")

N_FOLDS = 5
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

oof_preds = np.zeros(len(train_text_scaled))
test_preds = np.zeros(len(test_text_scaled))

fold_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train_text_scaled), 1):
    print(f"\n{'─'*70}")
    print(f"📊 FOLD {fold}/{N_FOLDS}")
    print(f"{'─'*70}")
   
    model = train_optimized_mlp(
        train_text_scaled[train_idx],
        train_image_scaled[train_idx],
        train_other_scaled[train_idx],
        y_train_log[train_idx],
        train_text_scaled[val_idx],
        train_image_scaled[val_idx],
        train_other_scaled[val_idx],
        y_train_log[val_idx]
    )
   
    # OOF predictions
    oof_preds[val_idx] = predict_optimized(
        model,
        train_text_scaled[val_idx],
        train_image_scaled[val_idx],
        train_other_scaled[val_idx]
    )
   
    # Test predictions
    fold_test_preds = predict_optimized(
        model,
        test_text_scaled,
        test_image_scaled,
        test_other_scaled
    )
    test_preds += fold_test_preds / N_FOLDS
   
    # Calculate fold SMAPE
    val_pred_price = np.expm1(oof_preds[val_idx])
    val_actual_price = np.expm1(y_train_log[val_idx])
   
    fold_smape = np.mean(
        2 * np.abs(val_pred_price - val_actual_price) /
        (np.abs(val_actual_price) + np.abs(val_pred_price) + 1e-8)
    ) * 100
   
    fold_scores.append(fold_smape)
    print(f"  📈 Fold {fold} SMAPE: {fold_smape:.4f}%")
   
    del model
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()

# ===================================================================
# POST-PROCESSING AND CALIBRATION
# ===================================================================
print("\n[5/6] Applying post-processing...")

# Calibrate predictions using validation set
oof_prices_raw = np.expm1(oof_preds)
actual_prices = df_train['price'].values

# Simple bias correction
bias = np.median(actual_prices / (oof_prices_raw + 0.01))
test_preds_calibrated = test_preds + np.log(bias)

# ===================================================================
# FINAL EVALUATION
# ===================================================================
print("\n[6/6] Final evaluation and submission...")

overall_smape = np.mean(
    2 * np.abs(oof_prices_raw - actual_prices) /
    (np.abs(actual_prices) + np.abs(oof_prices_raw) + 1e-8)
) * 100

print("\n" + "="*70)
print(f"📊 CROSS-VALIDATION RESULTS")
print("="*70)
for i, score in enumerate(fold_scores, 1):
    print(f"  Fold {i}: {score:.4f}%")
print(f"\n  Mean: {np.mean(fold_scores):.4f}%")
print(f"  Std:  {np.std(fold_scores):.4f}%")
print("\n" + "="*70)
print(f"🎯 FINAL OOF SMAPE: {overall_smape:.4f}%")
print("="*70)

# ===================================================================
# CREATE SUBMISSION
# ===================================================================
final_predictions = np.expm1(test_preds_calibrated)
final_predictions = np.clip(final_predictions, 0.01, None)

submission = pd.DataFrame({
    'sample_id': df_test['sample_id'],
    'price': final_predictions
})

submission.to_csv('optimized_mlp_fusion_submission.csv', index=False)

print("\n✅ Submission saved: optimized_mlp_fusion_submission.csv")
print("\n📋 Prediction statistics:")
print(f"  Min:    ${final_predictions.min():.2f}")
print(f"  Max:    ${final_predictions.max():.2f}")
print(f"  Mean:   ${final_predictions.mean():.2f}")
print(f"  Median: ${np.median(final_predictions):.2f}")
print("\n" + "="*70)

🚀 OPTIMIZED MLP FUSION: Advanced Price Prediction

[1/6] Loading data and embeddings...
✓ Loaded embeddings

[2/6] Engineering advanced features...
  Extracting advanced features...
  Extracting advanced features...
✓ Enhanced features: 173 dimensions

[3/6] Applying optimized scaling...
✓ Scaling complete

[4/6] Training with K-Fold CV...

──────────────────────────────────────────────────────────────────────
📊 FOLD 1/5
──────────────────────────────────────────────────────────────────────
  Using device: mps
    Epoch 15: train_loss=0.33665, val_loss=0.45577
    Epoch 30: train_loss=0.19207, val_loss=0.45364


KeyboardInterrupt: 

In [None]:
# ENHANCED MLP FUSION WITH ADVANCED FEATURE ENGINEERING
# Target: SMAPE < 45% with Maximum Data Extraction

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler, QuantileTransformer
from tqdm import tqdm
import gc
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

print("="*70)
print("🚀 ENHANCED MLP FUSION: Maximum Feature Extraction")
print("="*70)

# ===================================================================
# ADVANCED FEATURE ENGINEERING
# ===================================================================
def extract_advanced_features(df, is_train=True):
    """Extract rich features from catalog_content and other columns"""
    print(f"  Extracting advanced features...")
    
    features = {}
    
    # Text length features
    features['title_len'] = df['catalog_content'].str.len()
    features['word_count'] = df['catalog_content'].str.split().str.len()
    features['avg_word_len'] = features['title_len'] / (features['word_count'] + 1)
    
    # Brand features (if present)
    if 'brand' in df.columns:
        # Brand frequency encoding
        brand_freq = df['brand'].value_counts()
        features['brand_freq'] = df['brand'].map(brand_freq).fillna(0)
        
        # Brand mean price (only for train)
        if is_train and 'price' in df.columns:
            brand_mean = df.groupby('brand')['price'].mean()
            features['brand_mean_price'] = df['brand'].map(brand_mean).fillna(df['price'].median())
    
    # Quantity features
    if 'quantity' in df.columns:
        features['quantity'] = df['quantity'].fillna(1)
        features['log_quantity'] = np.log1p(features['quantity'])
        features['quantity_squared'] = features['quantity'] ** 2
    
    # Text pattern features
    features['has_digits'] = df['catalog_content'].str.contains(r'\d', regex=True).astype(int)
    features['has_special_chars'] = df['catalog_content'].str.contains(r'[^a-zA-Z0-9\s]', regex=True).astype(int)
    features['uppercase_ratio'] = df['catalog_content'].str.count(r'[A-Z]') / (features['title_len'] + 1)
    
    # Extract numeric values from text
    features['num_numbers'] = df['catalog_content'].str.findall(r'\d+').str.len().fillna(0)
    
    # Check for common price indicators
    price_keywords = ['premium', 'luxury', 'pro', 'plus', 'max', 'ultra', 'deluxe']
    budget_keywords = ['basic', 'mini', 'lite', 'eco', 'value']
    
    features['has_premium_word'] = df['catalog_content'].str.lower().str.contains('|'.join(price_keywords)).astype(int)
    features['has_budget_word'] = df['catalog_content'].str.lower().str.contains('|'.join(budget_keywords)).astype(int)
    
    return pd.DataFrame(features)

# ENHANCED MLP ARCHITECTURE WITH RESIDUAL CONNECTIONS

class EnhancedMultimodalFusionMLP(nn.Module):
    """
    Enhanced fusion with:
    - Residual connections
    - Better attention mechanism
    - Gated fusion
    - More sophisticated architecture
    """
    def __init__(self, text_dim, image_dim, other_dim, hidden_dim=768, dropout=0.25):
        super().__init__()
        
        # Individual encoders with residual capability
        self.text_encoder = nn.Sequential(
            nn.Linear(text_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout * 0.7)
        )
        
        self.image_encoder = nn.Sequential(
            nn.Linear(image_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout * 0.7)
        )
        
        self.other_encoder = nn.Sequential(
            nn.Linear(other_dim, 128),
            nn.LayerNorm(128),
            nn.GELU(),
            nn.Dropout(dropout * 0.5),
            nn.Linear(128, 128),
            nn.LayerNorm(128),
            nn.GELU()
        )
        
        # Cross-attention between modalities
        self.text_to_image_attn = nn.MultiheadAttention(
            embed_dim=hidden_dim, num_heads=8, dropout=0.1, batch_first=True
        )
        self.image_to_text_attn = nn.MultiheadAttention(
            embed_dim=hidden_dim, num_heads=8, dropout=0.1, batch_first=True
        )
        
        # Gated fusion mechanism
        self.gate = nn.Sequential(
            nn.Linear(hidden_dim * 2 + 128, hidden_dim * 2),  # Changed from hidden_dim to hidden_dim * 2
            nn.Sigmoid()
        )
        
        # Main fusion network with residual connections
        fusion_input_dim = hidden_dim * 2 + 128
        self.fusion1 = nn.Sequential(
            nn.Linear(fusion_input_dim, hidden_dim * 2),
            nn.LayerNorm(hidden_dim * 2),
            nn.GELU(),
            nn.Dropout(dropout)
        )
        
        self.fusion2 = nn.Sequential(
            nn.Linear(hidden_dim * 2, hidden_dim),
            nn.LayerNorm(hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout * 0.7)
        )
        
        self.fusion3 = nn.Sequential(
            nn.Linear(hidden_dim, hidden_dim // 2),
            nn.LayerNorm(hidden_dim // 2),
            nn.GELU(),
            nn.Dropout(dropout * 0.5)
        )
        
        self.output = nn.Linear(hidden_dim // 2, 1)
        
        self._init_weights()
    
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
    
    def forward(self, text_emb, image_emb, other_emb):
        # Encode each modality
        text_enc = self.text_encoder(text_emb)
        image_enc = self.image_encoder(image_emb)
        other_enc = self.other_encoder(other_emb)
        
        # Bidirectional cross-attention
        text_att, _ = self.text_to_image_attn(
            text_enc.unsqueeze(1), 
            image_enc.unsqueeze(1), 
            image_enc.unsqueeze(1)
        )
        text_att = text_att.squeeze(1)
        
        image_att, _ = self.image_to_text_attn(
            image_enc.unsqueeze(1), 
            text_enc.unsqueeze(1), 
            text_enc.unsqueeze(1)
        )
        image_att = image_att.squeeze(1)
        
        # Combine with residuals
        text_combined = text_enc + text_att
        image_combined = image_enc + image_att
        
        # Concatenate all features
        fused = torch.cat([text_combined, image_combined, other_enc], dim=1)
        
        # Gated fusion
        gate_values = self.gate(fused)
        
        # Apply fusion layers with residuals
        x = self.fusion1(fused)
        x = x * gate_values[:, :x.size(1)]  # Apply gating
        
        x = self.fusion2(x)
        x = self.fusion3(x)
        output = self.output(x)
        
        return output

# CUSTOM LOSS FUNCTION - SMAPE-inspired
def smape_loss(pred, target, epsilon=1e-3):
    """SMAPE-inspired loss that directly optimizes the evaluation metric"""
    # Work in log space to match our target
    pred_exp = torch.exp(pred)
    target_exp = torch.exp(target)
    
    numerator = torch.abs(pred_exp - target_exp)
    denominator = (torch.abs(target_exp) + torch.abs(pred_exp)) / 2.0 + epsilon
    
    return torch.mean(numerator / denominator)

def combined_loss(pred, target, alpha=0.5):
    """Combine SMAPE loss with Huber loss for stability"""
    smape = smape_loss(pred, target)
    huber = nn.SmoothL1Loss()(pred, target)
    return alpha * smape + (1 - alpha) * huber

# ===================================================================
# TRAINING FUNCTION WITH ADVANCED TECHNIQUES
# ===================================================================
def train_enhanced_mlp(X_text_tr, X_image_tr, X_other_tr, y_tr, 
                       X_text_val, X_image_val, X_other_val, y_val,
                       epochs=150, batch_size=256, lr=3e-4):
    
    device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
    print(f"  Using device: {device}")
    
    text_dim = X_text_tr.shape[1]
    image_dim = X_image_tr.shape[1]
    other_dim = X_other_tr.shape[1]
    
    model = EnhancedMultimodalFusionMLP(text_dim, image_dim, other_dim).to(device)
    
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=5e-5, betas=(0.9, 0.999))
    
    # Cosine annealing with warm restarts
    scheduler = optim.lr_scheduler.CosineAnnealingWarmRestarts(
        optimizer, T_0=15, T_mult=2, eta_min=1e-6
    )
    
    # Create datasets
    train_dataset = torch.utils.data.TensorDataset(
        torch.FloatTensor(X_text_tr),
        torch.FloatTensor(X_image_tr),
        torch.FloatTensor(X_other_tr),
        torch.FloatTensor(y_tr).unsqueeze(1)
    )
    
    val_dataset = torch.utils.data.TensorDataset(
        torch.FloatTensor(X_text_val),
        torch.FloatTensor(X_image_val),
        torch.FloatTensor(X_other_val),
        torch.FloatTensor(y_val).unsqueeze(1)
    )
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)
    
    best_val_loss = float('inf')
    patience = 20
    patience_counter = 0
    best_model_state = None
    
    for epoch in range(epochs):
        # Training
        model.train()
        train_loss = 0
        
        for text_b, image_b, other_b, y_b in train_loader:
            text_b = text_b.to(device)
            image_b = image_b.to(device)
            other_b = other_b.to(device)
            y_b = y_b.to(device)
            
            optimizer.zero_grad()
            output = model(text_b, image_b, other_b)
            loss = combined_loss(output, y_b)
            
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            
            train_loss += loss.item()
        
        # Validation
        model.eval()
        val_loss = 0
        
        with torch.no_grad():
            for text_b, image_b, other_b, y_b in val_loader:
                text_b = text_b.to(device)
                image_b = image_b.to(device)
                other_b = other_b.to(device)
                y_b = y_b.to(device)
                
                output = model(text_b, image_b, other_b)
                val_loss += combined_loss(output, y_b).item()
        
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        
        scheduler.step()
        
        # Early stopping
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            best_model_state = model.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1
        
        if patience_counter >= patience:
            print(f"    Early stopping at epoch {epoch+1}")
            break
        
        if (epoch + 1) % 10 == 0:
            print(f"    Epoch {epoch+1}: train_loss={train_loss:.5f}, val_loss={val_loss:.5f}")
    
    model.load_state_dict(best_model_state)
    return model

# ===================================================================
# PREDICTION FUNCTION
# ===================================================================
def predict_enhanced(model, X_text, X_image, X_other, batch_size=512):
    device = next(model.parameters()).device
    model.eval()
    predictions = []
    
    with torch.no_grad():
        for i in tqdm(range(0, len(X_text), batch_size), desc="Predicting", leave=False):
            end_idx = min(i + batch_size, len(X_text))
            
            text_b = torch.FloatTensor(X_text[i:end_idx]).to(device)
            image_b = torch.FloatTensor(X_image[i:end_idx]).to(device)
            other_b = torch.FloatTensor(X_other[i:end_idx]).to(device)
            
            output = model(text_b, image_b, other_b)
            predictions.append(output.cpu().numpy())
    
    return np.vstack(predictions).flatten()

# ===================================================================
# MAIN EXECUTION
# ===================================================================
print("\n[1/5] Loading data and embeddings...")
df_train = pd.read_csv('train.csv')
df_test = pd.read_csv('test.csv')

# Load embeddings
X_train_full = np.load("final_X_train_medium_with_brand.npy", allow_pickle=False)
X_test_full = np.load("final_X_test_medium_with_brand.npy", allow_pickle=False)

# Define dimensions
text_dim = 384
image_dim = 512

# Slice embeddings
train_text = X_train_full[:, :text_dim]
train_image = X_train_full[:, text_dim:text_dim+image_dim]
train_other_base = X_train_full[:, text_dim+image_dim:]

test_text = X_test_full[:, :text_dim]
test_image = X_test_full[:, text_dim:text_dim+image_dim]
test_other_base = X_test_full[:, text_dim+image_dim:]

print(f"✓ Loaded embeddings")
del X_train_full, X_test_full
gc.collect()

# ===================================================================
# ADVANCED FEATURE ENGINEERING
# ===================================================================
print("\n[2/5] Engineering advanced features...")
train_extra_features = extract_advanced_features(df_train, is_train=True)
test_extra_features = extract_advanced_features(df_test, is_train=False)

# Combine with existing other features
train_other = np.hstack([train_other_base, train_extra_features.values])
test_other = np.hstack([test_other_base, test_extra_features.values])

print(f"✓ Enhanced features: {train_other.shape[1]} dimensions")
del train_other_base, test_other_base
gc.collect()

# Target transformation
y_train_log = np.log1p(df_train['price'].values)

# ===================================================================
# ADVANCED SCALING
# ===================================================================
print("\n[3/5] Applying robust scaling...")

# Use QuantileTransformer for embeddings (more robust to outliers)
text_scaler = QuantileTransformer(n_quantiles=1000, output_distribution='normal')
image_scaler = QuantileTransformer(n_quantiles=1000, output_distribution='normal')
other_scaler = RobustScaler()

train_text_scaled = text_scaler.fit_transform(train_text)
test_text_scaled = text_scaler.transform(test_text)

train_image_scaled = image_scaler.fit_transform(train_image)
test_image_scaled = image_scaler.transform(test_image)

train_other_scaled = other_scaler.fit_transform(train_other)
test_other_scaled = other_scaler.transform(test_other)

print("✓ Scaling complete")
del train_text, train_image, train_other, test_text, test_image, test_other
gc.collect()

# ===================================================================
# K-FOLD CROSS-VALIDATION WITH STRATIFICATION
# ===================================================================
print("\n[4/5] Training with K-Fold CV...")

N_FOLDS = 7  # Increased from 5 to 7 for better generalization
kf = KFold(n_splits=N_FOLDS, shuffle=True, random_state=42)

oof_preds = np.zeros(len(train_text_scaled))
test_preds = np.zeros(len(test_text_scaled))

fold_scores = []

for fold, (train_idx, val_idx) in enumerate(kf.split(train_text_scaled), 1):
    print(f"\n{'─'*70}")
    print(f"📊 FOLD {fold}/{N_FOLDS}")
    print(f"{'─'*70}")
    
    model = train_enhanced_mlp(
        train_text_scaled[train_idx], 
        train_image_scaled[train_idx], 
        train_other_scaled[train_idx], 
        y_train_log[train_idx],
        train_text_scaled[val_idx], 
        train_image_scaled[val_idx], 
        train_other_scaled[val_idx], 
        y_train_log[val_idx]
    )
    
    # OOF predictions
    oof_preds[val_idx] = predict_enhanced(
        model, 
        train_text_scaled[val_idx], 
        train_image_scaled[val_idx], 
        train_other_scaled[val_idx]
    )
    
    # Test predictions
    fold_test_preds = predict_enhanced(
        model, 
        test_text_scaled, 
        test_image_scaled, 
        test_other_scaled
    )
    test_preds += fold_test_preds / N_FOLDS
    
    # Calculate fold SMAPE
    val_pred_price = np.expm1(oof_preds[val_idx])
    val_actual_price = np.expm1(y_train_log[val_idx])
    
    fold_smape = np.mean(
        2 * np.abs(val_pred_price - val_actual_price) / 
        (np.abs(val_actual_price) + np.abs(val_pred_price) + 1e-8)
    ) * 100
    
    fold_scores.append(fold_smape)
    print(f"  📈 Fold {fold} SMAPE: {fold_smape:.4f}%")
    
    del model
    gc.collect()
    if torch.backends.mps.is_available():
        torch.mps.empty_cache()

# ===================================================================
# FINAL EVALUATION
# ===================================================================
print("\n[5/5] Final evaluation and submission...")

oof_prices = np.expm1(oof_preds)
actual_prices = df_train['price'].values

overall_smape = np.mean(
    2 * np.abs(oof_prices - actual_prices) / 
    (np.abs(actual_prices) + np.abs(oof_prices) + 1e-8)
) * 100

print("\n" + "="*70)
print(f"📊 CROSS-VALIDATION RESULTS")
print("="*70)
for i, score in enumerate(fold_scores, 1):
    print(f"  Fold {i}: {score:.4f}%")
print(f"\n  Mean: {np.mean(fold_scores):.4f}%")
print(f"  Std:  {np.std(fold_scores):.4f}%")
print("\n" + "="*70)
print(f"🎯 FINAL OOF SMAPE: {overall_smape:.4f}%")
print("="*70)

# ===================================================================
# CREATE SUBMISSION
# ===================================================================
final_predictions = np.expm1(test_preds)
final_predictions = np.clip(final_predictions, 0.01, None)

submission = pd.DataFrame({
    'sample_id': df_test['sample_id'],
    'price': final_predictions
})

submission.to_csv('enhanced_mlp_fusion_submission.csv', index=False)

print("\n✅ Submission saved: enhanced_mlp_fusion_submission.csv")
print("\n📋 Prediction statistics:")
print(f"  Min:    ${final_predictions.min():.2f}")
print(f"  Max:    ${final_predictions.max():.2f}")
print(f"  Mean:   ${final_predictions.mean():.2f}")
print(f"  Median: ${np.median(final_predictions):.2f}")
print("\n" + "="*70)