<a href="https://colab.research.google.com/github/munnurumahesh03-coder/Amazon-ML-Hackathon-2025/blob/main/Final_Experiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Imports**

---



In [1]:
# =============================================================================
# SECTION 1: SETUP, DATA LOADING, AND CONSTANTS
# =============================================================================
import pandas as pd
import numpy as np
import re
import warnings
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import time

# Suppress every Python warning for the rest of the session.
warnings.filterwarnings('ignore')
print("All libraries imported successfully.")

# --- Constants ---
# Input CSV files, resolved relative to the working directory.
TRAIN_FILE_PATH, TEST_FILE_PATH = "train.csv", "test.csv"

# Column names: the first three are read from the CSVs, BRAND_COL is the name
# given to the brand column derived by the feature factory.
SAMPLE_ID_COL, TEXT_COL, TARGET_COL = 'sample_id', 'catalog_content', 'price'
BRAND_COL = 'brand'

# Modelling settings.
RANDOM_STATE = 42
TFIDF_MAX_FEATURES = 8000  # Increased for more text detail
N_SPLITS = 5  # Using 5-fold CV for robust validation

# --- Load Data ---
# Every later cell needs both frames, so a missing file is fatal: report it and
# re-raise rather than carrying on with train_df / test_df undefined.
try:
    train_df = pd.read_csv(TRAIN_FILE_PATH)
    test_df = pd.read_csv(TEST_FILE_PATH)
except FileNotFoundError:
    print("ERROR: Data files not found. Please check file paths.")
    raise
else:
    print(f"Data loaded successfully. Train shape: {train_df.shape}, Test shape: {test_df.shape}")

# --- Define SMAPE Metric ---
def smape(y_true, y_pred, epsilon=1e-8):
    """Symmetric mean absolute percentage error, expressed in percent.

    Computes ``mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2 + epsilon)) * 100``.

    Args:
        y_true: Array-like of ground-truth values (list, ndarray, Series, ...).
        y_pred: Array-like of predictions with the same number of elements.
        epsilon: Small constant added to the denominator so that a pair where
            both values are zero contributes 0 instead of dividing by zero.

    Returns:
        The SMAPE as a float in the range [0, 200].

    Both inputs are converted to flat float arrays and compared by position.
    This lets plain lists work and prevents a (n,) vector and a (n, 1) column
    from silently broadcasting into an n x n error matrix.
    """
    y_true = np.asarray(y_true, dtype=float).ravel()
    y_pred = np.asarray(y_pred, dtype=float).ravel()
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / (denominator + epsilon)) * 100

print("Setup complete.")


All libraries imported successfully.
Data loaded successfully. Train shape: (75000, 4), Test shape: (75000, 3)
Setup complete.


# **THE ULTIMATE FEATURE FACTORY**

---



In [2]:
# =============================================================================
# SECTION 2: THE ULTIMATE FEATURE FACTORY
# =============================================================================
print("--- Starting the Ultimate Feature Factory ---")

def create_all_features(df, train_df_for_stats=None):
    """
    Build all engineered features for ``df`` and return them on a copy.

    The item name is the text captured after the first ``Item Name:`` marker in
    ``df[TEXT_COL]`` (empty string when the marker is absent). From it the
    function derives:

    * ``BRAND_COL``: first word of the name, title-cased with commas removed;
      if that word is a generic one (e.g. "The", "Organic") the second word is
      used instead. ``'unknown'`` when the name has no words.
    * ``weight_grams``: first number followed by g / gram / grams.
    * ``pack_count``: the N in "pack of N" / "count of N" / "pk of N", or in
      "N ct" / "N count" / "N pack".
    * ``name_char_count``, ``name_word_count``, ``name_all_caps_word_count``.
    * ``is_organic``, ``is_gluten_free``, ``is_case``, ``is_kosher``: 0/1 flags.

    Args:
        df: DataFrame containing at least ``TEXT_COL``.
        train_df_for_stats: Optional DataFrame that already has ``BRAND_COL``,
            ``TARGET_COL`` and ``SAMPLE_ID_COL``. When given, per-brand
            ``brand_avg_price`` and ``brand_product_count`` are computed from
            it and left-joined onto the result (NaN for brands it lacks).

    Returns:
        A new DataFrame; ``df`` itself is not modified. When
        ``train_df_for_stats`` is given the result comes out of ``pd.merge``
        and therefore has a fresh 0..n-1 index, in the original row order.
    """
    df_copy = df.copy()

    # --- 1. Core Feature Extraction (from original notebook) ---
    item_name = df_copy[TEXT_COL].str.extract(r'Item Name:\s*(.*)', flags=re.IGNORECASE).iloc[:, 0].fillna('')

    # Brand
    NON_BRAND_WORDS = {'The', 'A', 'An', 'Organic', 'Gluten-Free', 'Natural', 'Pure', 'Food', 'Gourmet', 'Simply', 'And', 'For', 'With', 'Pack', 'To', 'Of', 'In', 'From', 'By', 'On', 'At', 'Is', 'It'}

    def get_brand(name):
        """Return the title-cased brand guess for one item name."""
        if not isinstance(name, str):
            return 'unknown'
        words = name.split()
        if not words:  # empty or whitespace-only name
            return 'unknown'
        first_word = words[0].title().replace(',', '')
        if first_word in NON_BRAND_WORDS and len(words) > 1:
            return words[1].title().replace(',', '')
        return first_word

    df_copy[BRAND_COL] = item_name.apply(get_brand)

    # Weight and pack count
    df_copy['weight_grams'] = item_name.str.extract(r'(\d+\.?\d*)\s*(?:g|gram|grams)\b', flags=re.IGNORECASE)[0].astype(float)

    # Two spellings are recognised: "<pack|count|pk> of N" and "N <ct|count|pack>".
    # The previous single pattern required a trailing ct/count/pack after the
    # number in every case, so a plain "Pack of 12" could never match.
    # The look-behind keeps the second form from matching inside a longer token
    # such as the "5" of "1.5 ct".
    pack_match = item_name.str.extract(
        r'(?:pack of|count of|pk of)\s*(\d+)|(?<![\w.])(\d+)\s*(?:ct|count|pack)',
        flags=re.IGNORECASE,
    )
    df_copy['pack_count'] = pack_match[0].fillna(pack_match[1]).astype(float)

    # --- 2. Advanced Text Statistic Features ---
    df_copy['name_char_count'] = item_name.str.len()
    df_copy['name_word_count'] = item_name.str.split().str.len()
    df_copy['name_all_caps_word_count'] = item_name.str.findall(r'\b[A-Z]{2,}\b').str.len()

    # --- 3. Advanced Keyword Flag Features ---
    df_copy['is_organic'] = item_name.str.contains('organic', case=False).astype(int)
    df_copy['is_gluten_free'] = item_name.str.contains('gluten free|gluten-free', case=False).astype(int)
    df_copy['is_case'] = item_name.str.contains(r'\bcase\b', case=False).astype(int)
    df_copy['is_kosher'] = item_name.str.contains('kosher', case=False).astype(int)

    # --- 4. Brand-Based Statistical Features ---
    # NOTE(review): if train_df_for_stats holds the same rows as df, each row's
    # brand_avg_price includes that row's own target value (target leakage).
    # Out-of-fold statistics would avoid this.
    if train_df_for_stats is not None:
        brand_stats = train_df_for_stats.groupby(BRAND_COL).agg(
            brand_avg_price=(TARGET_COL, 'mean'),
            brand_product_count=(SAMPLE_ID_COL, 'count')
        ).reset_index()
        df_copy = pd.merge(df_copy, brand_stats, on=BRAND_COL, how='left')

    return df_copy

# --- Apply the Feature Factory ---
# Pass 1: run the factory on the raw training rows (no stats argument) so the
# brand column exists and per-brand statistics can be aggregated from it.
train_for_stats = create_all_features(train_df.copy())

# Pass 2: build the final frames; both splits get their brand statistics
# from the training data produced in pass 1.
train_featured, test_featured = (
    create_all_features(raw_frame.copy(), train_for_stats)
    for raw_frame in (train_df, test_df)
)

print("Ultimate Feature Factory complete.")
print("New training data shape:", train_featured.shape)


--- Starting the Ultimate Feature Factory ---
Ultimate Feature Factory complete.
New training data shape: (75000, 16)


# **DATA CLEANING AND PREPROCESSING PIPELINE**

---



In [3]:
# =============================================================================
# SECTION 3: DATA CLEANING AND PREPROCESSING
# =============================================================================
print("--- Starting Data Cleaning and Preprocessing ---")

# --- 1. Data Cleaning ---
# Remove placeholder images identified in previous notebooks: any image link
# shared by more than PLACEHOLDER_MIN_REPEATS training rows is treated as one.
PLACEHOLDER_MIN_REPEATS = 10
image_counts = train_featured['image_link'].value_counts()
placeholders = image_counts[image_counts > PLACEHOLDER_MIN_REPEATS].index

# .copy() makes train_cleaned an independent frame, so the column assignment
# below is not a chained assignment on a slice of train_featured.
train_cleaned = train_featured.loc[~train_featured['image_link'].isin(placeholders)].copy()

# Clean target variable (price): unparseable values become NaN and are dropped.
train_cleaned[TARGET_COL] = pd.to_numeric(train_cleaned[TARGET_COL], errors='coerce')
train_cleaned = train_cleaned.dropna(subset=[TARGET_COL])
print(f"Data cleaned. Final training rows: {len(train_cleaned)}")

# --- 2. Define Feature Lists for the Pipeline ---
numeric_features = [
    'weight_grams',
    'pack_count',
    'name_char_count',
    'name_word_count',
    'name_all_caps_word_count',
    'is_organic',
    'is_gluten_free',
    'is_case',
    'is_kosher',
    'brand_avg_price',
    'brand_product_count',
]
categorical_features = [BRAND_COL]
text_feature = TEXT_COL

# --- 3. Build the Preprocessing Pipeline ---
# Numeric columns: fill gaps with the column median.
numeric_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
])

# Brand: fill gaps with 'unknown', then one-hot encode; categories unseen at
# fit time are ignored at transform time.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='unknown')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Raw catalog text: uni- and bi-gram TF-IDF capped at TFIDF_MAX_FEATURES terms.
text_vectorizer = TfidfVectorizer(
    max_features=TFIDF_MAX_FEATURES,
    stop_words='english',
    ngram_range=(1, 2),
)

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_pipeline, numeric_features),
        ('cat', categorical_pipeline, categorical_features),
        ('text', text_vectorizer, text_feature),
    ],
    remainder='drop',
)

# with_mean=False scales without centering.
transform_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaler', StandardScaler(with_mean=False)),  # Scale all features
])

# --- 4. Prepare Data for Modeling ---
# The same column selection is applied to both splits.
model_columns = numeric_features + categorical_features + [text_feature]
X = train_cleaned[model_columns]
X_test = test_featured[model_columns]

# The models are trained on log1p(price).
y = np.log1p(train_cleaned[TARGET_COL])
y_numpy = y.values

# Fit the pipeline on the training data only, then reuse it for the test data.
X_sparse = transform_pipeline.fit_transform(X)
X_test_sparse = transform_pipeline.transform(X_test)

print("Preprocessing complete. Data is ready for modeling.")
print(f"Final feature matrix shape: {X_sparse.shape}")


--- Starting Data Cleaning and Preprocessing ---
Data cleaned. Final training rows: 74891
Preprocessing complete. Data is ready for modeling.
Final feature matrix shape: (74891, 17349)


# **THE ULTIMATE MLP MODEL (TRAINING & FINE-TUNING)**

---



In [4]:
# =============================================================================
# SECTION 4: THE ULTIMATE MLP MODEL (TRAINING & FINE-TUNING)
# =============================================================================
print("--- Starting Ultimate MLP Model Training ---")

# --- 1. Setup (Device, Dataset, Model, Loss) ---
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
class SparseDataset(Dataset):
    """Dataset that densifies one row of ``X`` at a time.

    ``X`` must support row indexing that returns an object with ``.toarray()``
    (e.g. a SciPy sparse matrix); ``y`` must be indexable by the same integer.
    Only the requested row is converted to dense form.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        return self.X.shape[0]

    def __getitem__(self, idx):
        # Returns (features, target), both as float32 tensors.
        dense_row = self.X[idx].toarray().flatten()
        feature_tensor = torch.tensor(dense_row, dtype=torch.float32)
        target_tensor = torch.tensor(self.y[idx], dtype=torch.float32)
        return feature_tensor, target_tensor

class MLP(nn.Module):
    """Feed-forward regressor: input -> 512 -> 256 -> 1.

    Each hidden layer is followed by ReLU and Dropout(0.4); the output layer
    is a plain linear unit.
    """

    def __init__(self, input_size):
        super().__init__()
        hidden_1, hidden_2, drop_p = 512, 256, 0.4
        stack = [
            nn.Linear(input_size, hidden_1), nn.ReLU(), nn.Dropout(drop_p),
            nn.Linear(hidden_1, hidden_2), nn.ReLU(), nn.Dropout(drop_p),
            nn.Linear(hidden_2, 1),
        ]
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        return self.layers(x)

class SmapeLoss(nn.Module):
    """SMAPE (in percent) between predictions and targets given in log1p space.

    Both inputs are mapped back with ``expm1`` before the error is computed,
    so the network can regress ``log1p(price)`` while being optimised on the
    price-scale metric.

    Args:
        epsilon: Small constant added to the denominator to avoid division by
            zero when prediction and target are both zero.
    """

    def __init__(self, epsilon=1e-8):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, y_pred_log, y_true_log):
        """Return the mean SMAPE over all elements as a scalar tensor.

        Both tensors are flattened and compared element by element. Without
        this, a (N, 1) prediction and a (N,) target would broadcast to an
        (N, N) matrix and silently produce a wrong loss.
        """
        y_pred = torch.expm1(y_pred_log.reshape(-1))
        y_true = torch.expm1(y_true_log.reshape(-1))
        num = torch.abs(y_pred - y_true)
        den = (torch.abs(y_true) + torch.abs(y_pred)) / 2
        return torch.mean(num / (den + self.epsilon)) * 100

# Seed PyTorch before the model is created and the loader starts shuffling, so
# weight initialisation and batch order are repeatable across runs.
torch.manual_seed(RANDOM_STATE)

train_loader = DataLoader(SparseDataset(X_sparse, y_numpy), batch_size=256, shuffle=True)
model = MLP(X_sparse.shape[1]).to(device)
criterion = SmapeLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def run_training_phase(model, loader, criterion, optimizer, n_epochs, log_prefix='Epoch'):
    """Train ``model`` for ``n_epochs`` passes over ``loader``.

    Args:
        model: Network to optimise; batches are moved to the device of its
            first parameter.
        loader: Iterable yielding ``(features, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)`` with labels
            reshaped to a column.
        optimizer: Optimiser stepping ``model``'s parameters.
        n_epochs: Number of full passes over ``loader``.
        log_prefix: Text printed in front of the ``[epoch/total]`` counter.

    Returns:
        List with the mean batch loss of every epoch.
    """
    target_device = next(model.parameters()).device
    history = []
    for epoch in range(n_epochs):
        model.train()
        epoch_loss = 0.0
        for features, labels in loader:
            features = features.to(target_device)
            labels = labels.to(target_device).view(-1, 1)
            loss = criterion(model(features), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        mean_loss = epoch_loss / len(loader)
        history.append(mean_loss)
        print(f'{log_prefix} [{epoch+1}/{n_epochs}], SMAPE Loss: {mean_loss:.4f}')
    return history


# --- 2. Initial Training Phase ---
print("\n--- Phase 1: Initial Training ---")
initial_epochs = 60
run_training_phase(model, train_loader, criterion, optimizer, initial_epochs, log_prefix='Epoch')

# --- 3. Fine-Tuning Phase ---
# Same optimiser (its Adam state is kept), learning rate lowered tenfold.
print("\n--- Phase 2: Fine-Tuning ---")
for g in optimizer.param_groups:
    g['lr'] = 0.0001
fine_tune_epochs = 30
run_training_phase(model, train_loader, criterion, optimizer, fine_tune_epochs, log_prefix='Fine-Tuning Epoch')

print("\n--- Model Training and Fine-Tuning Complete ---")


--- Starting Ultimate MLP Model Training ---

--- Phase 1: Initial Training ---
Epoch [1/60], SMAPE Loss: 71.8600
Epoch [2/60], SMAPE Loss: 61.0244
Epoch [3/60], SMAPE Loss: 57.5179
Epoch [4/60], SMAPE Loss: 54.5232
Epoch [5/60], SMAPE Loss: 52.9310
Epoch [6/60], SMAPE Loss: 50.9526
Epoch [7/60], SMAPE Loss: 49.3879
Epoch [8/60], SMAPE Loss: 47.8896
Epoch [9/60], SMAPE Loss: 46.8077
Epoch [10/60], SMAPE Loss: 45.9035
Epoch [11/60], SMAPE Loss: 44.3808
Epoch [12/60], SMAPE Loss: 43.1526
Epoch [13/60], SMAPE Loss: 42.0085
Epoch [14/60], SMAPE Loss: 41.3769
Epoch [15/60], SMAPE Loss: 40.1740
Epoch [16/60], SMAPE Loss: 39.7088
Epoch [17/60], SMAPE Loss: 38.7813
Epoch [18/60], SMAPE Loss: 37.9051
Epoch [19/60], SMAPE Loss: 37.3687
Epoch [20/60], SMAPE Loss: 37.0439
Epoch [21/60], SMAPE Loss: 36.4929
Epoch [22/60], SMAPE Loss: 35.9272
Epoch [23/60], SMAPE Loss: 35.2846
Epoch [24/60], SMAPE Loss: 34.7915
Epoch [25/60], SMAPE Loss: 34.1412
Epoch [26/60], SMAPE Loss: 34.2182
Epoch [27/60], SMAP

# **SUBMISSION**

---



In [5]:
# =============================================================================
# SECTION 5: GENERATE FINAL SUBMISSION
# =============================================================================
print("\n--- Generating Final Submission File ---")
model.eval()
batch_size = 512

# Predict in slices so only batch_size rows are densified at a time.
log_prediction_batches = []
with torch.no_grad():
    for i in range(0, X_test_sparse.shape[0], batch_size):
        X_batch_sparse = X_test_sparse[i:i + batch_size]
        X_batch_dense = torch.tensor(X_batch_sparse.toarray(), dtype=torch.float32).to(device)
        outputs = model(X_batch_dense)
        log_prediction_batches.append(outputs.cpu())

# The model predicts log1p(price); map back to the price scale.
test_predictions_log = torch.cat(log_prediction_batches).numpy()
final_predictions = np.expm1(test_predictions_log)

# Replace NaN / +-inf with the median of the finite predictions. The median is
# taken over finite values only, because np.nanmedian skips NaN but not inf and
# could otherwise return a non-finite replacement value.
invalid_mask = ~np.isfinite(final_predictions)
if invalid_mask.any():
    finite_predictions = final_predictions[~invalid_mask]
    median_pred = float(np.median(finite_predictions)) if finite_predictions.size else 0.0
    final_predictions[invalid_mask] = median_pred
final_predictions = final_predictions.clip(min=0)

submission_df = pd.DataFrame({'sample_id': test_df[SAMPLE_ID_COL], 'price': final_predictions.flatten()})
submission_df.to_csv("submission_final_experiment.csv", index=False)

print("\nSubmission file 'submission_final_experiment.csv' created successfully.")
print("This represents the culmination of all learned techniques.")



--- Generating Final Submission File ---

Submission file 'submission_final_experiment.csv' created successfully.
This represents the culmination of all learned techniques.


In [8]:
# =============================================================================
# THE FINAL MOVE: WEIGHTED ENSEMBLE OF CHAMPIONS (CORRECTED)
# =============================================================================
# This script creates a weighted average of the predictions from your two best
# models: the champion Fine-Tuned MLP and the champion CatBoost model.

import pandas as pd
import numpy as np

print("--- Creating the Final Weighted Ensemble Submission ---")

# --- 1. Define the paths to your champion submission files ---
mlp_champion_file = "submission_mlp_finetuned.csv"      # Score: 51.584
catboost_champion_file = "submission_catboost_pipeline_gpu.csv" # Score: 53.670

# --- 2. Define the weights for the blend ---
mlp_weight = 0.80
catboost_weight = 0.20

print(f"Blending models with weights: MLP={mlp_weight}, CatBoost={catboost_weight}")

try:
    # --- 3/4. Load each submission and sort it by sample_id so that rows
    # line up by position ---
    df_mlp, df_catboost = (
        pd.read_csv(submission_path).sort_values(by='sample_id').reset_index(drop=True)
        for submission_path in (mlp_champion_file, catboost_champion_file)
    )

    # Refuse to blend unless both files hold exactly the same IDs.
    ids_aligned = df_mlp['sample_id'].equals(df_catboost['sample_id'])
    if not ids_aligned:
        raise ValueError("Sample IDs do not match! Cannot create ensemble.")

    # --- 5. Create the Ensemble Prediction (weighted arithmetic mean) ---
    ensemble_price = (df_mlp['price'] * mlp_weight) + (df_catboost['price'] * catboost_weight)

    # --- 6. Create and save the final submission file ---
    # pandas Series.clip takes 'lower' (not 'min'); prices are floored at 0.
    df_ensemble = pd.DataFrame({
        'sample_id': df_mlp['sample_id'],
        'price': ensemble_price.clip(lower=0),
    })

    ensemble_filename = "submission_ensemble_final_80_20.csv"
    df_ensemble.to_csv(ensemble_filename, index=False)

    print(f"\nEnsemble submission file '{ensemble_filename}' created successfully!")
    print("This is your highest-probability final submission. Good luck!")

except FileNotFoundError as e:
    print(f"\nERROR: Could not find a submission file. Please check the file paths.")
    print(f"Missing file: {e.filename}")



--- Creating the Final Weighted Ensemble Submission ---
Blending models with weights: MLP=0.8, CatBoost=0.2

Ensemble submission file 'submission_ensemble_final_80_20.csv' created successfully!
This is your highest-probability final submission. Good luck!


# **RESNET**

---



In [9]:
# =============================================================================
# SECTION 1: SETUP FOR TABULAR RESNET EXPERIMENT
# =============================================================================
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
import time

print("--- ResNet Experiment: Libraries Imported ---")

# NOTE: no data preparation runs in this cell. The sections below are
# placeholders only, and the final print reads `X_sparse`, which must already
# exist in the kernel (in this notebook it is created in Section 3). On a fresh
# kernel this cell raises NameError unless the earlier cells have been run.

# --- Constants & Data Loading (same as before) ---
# ... (Copy the data loading and constants from the earlier section) ...
# Make sure train_df, test_df are loaded.

# --- Feature Factory (same as before) ---
# ... (Copy the 'create_all_features' function here) ...

# --- Apply Feature Factory (same as before) ---
# ... (Copy the code that creates train_featured and test_featured) ...

# --- Data Cleaning and Preprocessing Pipeline (same as before) ---
# ... (Copy the code for cleaning and for creating 'transform_pipeline') ...

# --- Prepare Data for Modeling (same as before) ---
# ... (Copy the code that creates X_sparse, y_numpy, and X_test_sparse) ...

print("Setup and Data Preparation Complete. Data is ready for the ResNet model.")
print(f"Final feature matrix shape: {X_sparse.shape}")


--- ResNet Experiment: Libraries Imported ---
Setup and Data Preparation Complete. Data is ready for the ResNet model.
Final feature matrix shape: (74891, 17349)


In [10]:
# =============================================================================
# SECTION 2: THE TABULAR RESNET ARCHITECTURE
# =============================================================================
print("--- Defining the Tabular ResNet Model ---")

# --- 1. Define the Core Residual Block ---
class ResBlock(nn.Module):
    """Pre-activation residual block for flat feature vectors.

    Main path: BatchNorm -> ReLU -> Linear -> Dropout -> BatchNorm -> ReLU ->
    Linear. The block input is added to the main-path output; when
    ``input_size`` differs from ``output_size`` the input first goes through a
    linear projection so the two can be summed.

    Args:
        input_size: Number of input features.
        output_size: Number of output features.
        dropout_rate: Dropout probability applied after the first linear layer.
    """

    def __init__(self, input_size, output_size, dropout_rate=0.4):
        super().__init__()
        self.batch_norm1 = nn.BatchNorm1d(input_size)
        self.linear1 = nn.Linear(input_size, output_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(dropout_rate)

        self.batch_norm2 = nn.BatchNorm1d(output_size)
        self.linear2 = nn.Linear(output_size, output_size)

        # Projection for the skip path, only needed when the width changes.
        if input_size != output_size:
            self.shortcut = nn.Linear(input_size, output_size)
        else:
            self.shortcut = None

    def forward(self, x):
        # Main path, first half.
        hidden = self.linear1(self.relu(self.batch_norm1(x)))
        hidden = self.dropout(hidden)

        # Main path, second half.
        hidden = self.linear2(self.relu(self.batch_norm2(hidden)))

        # Skip path: identity, or the projection when widths differ.
        skip = x if self.shortcut is None else self.shortcut(x)
        return hidden + skip

# --- 2. Define the Full Tabular ResNet Model ---
class TabularResNet(nn.Module):
    """Residual network for flat feature vectors.

    Layout: a linear stem (input -> 512, ReLU, Dropout 0.2), three ResBlocks
    (512 -> 512, 512 -> 256, 256 -> 128) and a linear head producing one value
    per row.
    """

    def __init__(self, input_size):
        super().__init__()
        # Stem: first projection of the raw input.
        stem_layers = [nn.Linear(input_size, 512), nn.ReLU(), nn.Dropout(0.2)]
        self.stem = nn.Sequential(*stem_layers)

        # Residual trunk, described as (in_features, out_features, dropout).
        block_specs = [(512, 512, 0.4), (512, 256, 0.4), (256, 128, 0.3)]
        self.res_blocks = nn.Sequential(
            *(ResBlock(n_in, n_out, dropout_rate=rate) for n_in, n_out, rate in block_specs)
        )

        # Head: single regression output.
        self.head = nn.Linear(128, 1)

    def forward(self, x):
        return self.head(self.res_blocks(self.stem(x)))

print("TabularResNet model architecture defined successfully.")


--- Defining the Tabular ResNet Model ---
TabularResNet model architecture defined successfully.


In [11]:
# =============================================================================
# SECTION 3: TRAINING THE TABULAR RESNET
# =============================================================================
print("--- Starting Tabular ResNet Training ---")

# --- 1. Setup (Device, Dataset, Loss) ---
# Use the GPU when PyTorch reports one as available, otherwise the CPU.
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
# Re-use the memory-efficient SparseDataset from our previous notebook
class SparseDataset(Dataset):
    """Row-wise view over a feature matrix ``X`` and a target sequence ``y``.

    Each item is produced by indexing one row of ``X``, calling ``.toarray()``
    on it and flattening the result, so ``X`` can stay in sparse form and only
    the requested row is densified.
    """

    def __init__(self, X, y):
        self.X = X
        self.y = y

    def __len__(self):
        n_rows = self.X.shape[0]
        return n_rows

    def __getitem__(self, idx):
        # Both elements of the returned pair are float32 tensors.
        row_values = self.X[idx].toarray().flatten()
        return (
            torch.tensor(row_values, dtype=torch.float32),
            torch.tensor(self.y[idx], dtype=torch.float32),
        )

# Re-use the custom SMAPE Loss
class SmapeLoss(nn.Module):
    """SMAPE (in percent) between predictions and targets given in log1p space.

    Both inputs are mapped back with ``expm1`` before the error is computed,
    so the network can regress ``log1p(price)`` while being optimised on the
    price-scale metric.

    Args:
        epsilon: Small constant added to the denominator to avoid division by
            zero when prediction and target are both zero.
    """

    def __init__(self, epsilon=1e-8):
        super().__init__()
        self.epsilon = epsilon

    def forward(self, y_pred_log, y_true_log):
        """Return the mean SMAPE over all elements as a scalar tensor.

        Both tensors are flattened and compared element by element. Without
        this, a (N, 1) prediction and a (N,) target would broadcast to an
        (N, N) matrix and silently produce a wrong loss.
        """
        y_pred = torch.expm1(y_pred_log.reshape(-1))
        y_true = torch.expm1(y_true_log.reshape(-1))
        num = torch.abs(y_pred - y_true)
        den = (torch.abs(y_true) + torch.abs(y_pred)) / 2
        return torch.mean(num / (den + self.epsilon)) * 100

# --- 2. Initialize and Train ---
# Seed PyTorch before the model is created and the loader starts shuffling, so
# weight initialisation and batch order are repeatable across runs.
torch.manual_seed(RANDOM_STATE)

train_loader = DataLoader(SparseDataset(X_sparse, y_numpy), batch_size=256, shuffle=True)
model = TabularResNet(X_sparse.shape[1]).to(device) # <-- The only change is here!
criterion = SmapeLoss().to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)


def run_training_phase(model, loader, criterion, optimizer, n_epochs, log_prefix='Epoch'):
    """Train ``model`` for ``n_epochs`` passes over ``loader``.

    Defined in this cell so the ResNet section does not depend on a helper
    from another section.

    Args:
        model: Network to optimise; batches are moved to the device of its
            first parameter.
        loader: Iterable yielding ``(features, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)`` with labels
            reshaped to a column.
        optimizer: Optimiser stepping ``model``'s parameters.
        n_epochs: Number of full passes over ``loader``.
        log_prefix: Text printed in front of the ``[epoch/total]`` counter.

    Returns:
        List with the mean batch loss of every epoch.
    """
    target_device = next(model.parameters()).device
    history = []
    for epoch in range(n_epochs):
        model.train()
        epoch_loss = 0.0
        for features, labels in loader:
            features = features.to(target_device)
            labels = labels.to(target_device).view(-1, 1)
            loss = criterion(model(features), labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()
        mean_loss = epoch_loss / len(loader)
        history.append(mean_loss)
        print(f'{log_prefix} [{epoch+1}/{n_epochs}], SMAPE Loss: {mean_loss:.4f}')
    return history


# --- 3. Training Loop (Initial + Fine-Tuning) ---
print("\n--- Phase 1: Initial Training ---")
initial_epochs = 50 # We can start with slightly fewer epochs as ResNet can learn faster
run_training_phase(model, train_loader, criterion, optimizer, initial_epochs, log_prefix='Epoch')

# Same optimiser (its Adam state is kept), learning rate lowered tenfold.
print("\n--- Phase 2: Fine-Tuning ---")
for g in optimizer.param_groups:
    g['lr'] = 0.0001
fine_tune_epochs = 25
run_training_phase(model, train_loader, criterion, optimizer, fine_tune_epochs, log_prefix='Fine-Tuning Epoch')

print("\n--- ResNet Model Training and Fine-Tuning Complete ---")


--- Starting Tabular ResNet Training ---

--- Phase 1: Initial Training ---
Epoch [1/50], SMAPE Loss: 63.2917
Epoch [2/50], SMAPE Loss: 50.7816
Epoch [3/50], SMAPE Loss: 45.5575
Epoch [4/50], SMAPE Loss: 42.0091
Epoch [5/50], SMAPE Loss: 39.6669
Epoch [6/50], SMAPE Loss: 37.5176
Epoch [7/50], SMAPE Loss: 35.6128
Epoch [8/50], SMAPE Loss: 33.9023
Epoch [9/50], SMAPE Loss: 32.5285
Epoch [10/50], SMAPE Loss: 31.7153
Epoch [11/50], SMAPE Loss: 29.9250
Epoch [12/50], SMAPE Loss: 28.9597
Epoch [13/50], SMAPE Loss: 28.7841
Epoch [14/50], SMAPE Loss: 27.4491
Epoch [15/50], SMAPE Loss: 26.3818
Epoch [16/50], SMAPE Loss: 25.9854
Epoch [17/50], SMAPE Loss: 25.0657
Epoch [18/50], SMAPE Loss: 24.5474
Epoch [19/50], SMAPE Loss: 24.4491
Epoch [20/50], SMAPE Loss: 23.4317
Epoch [21/50], SMAPE Loss: 22.9125
Epoch [22/50], SMAPE Loss: 22.9072
Epoch [23/50], SMAPE Loss: 22.0674
Epoch [24/50], SMAPE Loss: 21.9748
Epoch [25/50], SMAPE Loss: 21.2685
Epoch [26/50], SMAPE Loss: 21.3625
Epoch [27/50], SMAPE Lo

# **Submission**

---



In [13]:
# =============================================================================
# FINAL STEP: GENERATE SUBMISSION FROM TRAINED RESNET MODEL (CORRECTED)
# =============================================================================

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader

print("\n--- Generating Final Submission File for ResNet Model ---")

# --- 1. Set the model to evaluation mode ---
model.eval()

# --- 2. Create a DataLoader for the test set ---
# SparseDataset needs a target array, so zeros are passed as unused placeholders.
# shuffle=False keeps predictions in the same row order as the test data.
test_dataset = SparseDataset(X_test_sparse, np.zeros(X_test_sparse.shape[0]))
test_loader = DataLoader(dataset=test_dataset, batch_size=512, shuffle=False)

# --- 3. Generate Predictions in Batches ---
final_predictions_log = []
with torch.no_grad():
    for features, _ in test_loader:
        features = features.to(device)
        outputs = model(features)
        final_predictions_log.append(outputs.cpu())

test_predictions_log = torch.cat(final_predictions_log).numpy()
print("Predictions generated successfully.")

# --- 4. Post-process the Predictions ---
# The model predicts log1p(price); map back to the price scale.
final_predictions = np.expm1(test_predictions_log)

# --- 5. Safety Checks ---
# Replace NaN / +-inf with the median of the finite predictions. The median is
# taken over finite values only, because np.nanmedian skips NaN but not inf and
# could otherwise return a non-finite replacement value.
invalid_mask = ~np.isfinite(final_predictions)
if invalid_mask.any():
    print("WARNING: Invalid values (NaN or infinity) found in predictions.")
    finite_predictions = final_predictions[~invalid_mask]
    median_pred = float(np.median(finite_predictions)) if finite_predictions.size else 0.0
    final_predictions[invalid_mask] = median_pred
    print(f"Replaced invalid values with the median prediction: {median_pred:.4f}")

# NumPy's ndarray.clip takes 'min' (pandas' Series.clip takes 'lower').
final_predictions = final_predictions.clip(min=0)
print("Clipped all predictions to be non-negative.")

# --- 6. Create and Save the Submission DataFrame ---
submission_df_resnet = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_predictions.flatten()
})

submission_filename = "submission_tabular_resnet.csv"
submission_df_resnet.to_csv(submission_filename, index=False)

print(f"\nSubmission file '{submission_filename}' created successfully!")



--- Generating Final Submission File for ResNet Model ---
Predictions generated successfully.
Replaced invalid values with the median prediction: 12.9172
Clipped all predictions to be non-negative.

Submission file 'submission_tabular_resnet.csv' created successfully!


# **TabNet**

---



In [14]:
# %pip installs into the running kernel's environment; the version is pinned to
# the release this notebook was last executed with.
%pip install -q pytorch-tabnet==4.1.0

Collecting pytorch-tabnet
  Downloading pytorch_tabnet-4.1.0-py3-none-any.whl.metadata (15 kB)
Downloading pytorch_tabnet-4.1.0-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.5/44.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [15]:
# =============================================================================
# EXPERIMENT 2: THE TABNET CHALLENGER
# =============================================================================
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pytorch_tabnet.tab_model import TabNetRegressor
import torch

print("--- Starting TabNet Experiment ---")

# --- This code assumes 'train_cleaned' and 'test_featured' dataframes are already loaded and prepared ---
# NOTE: this cell rebinds `categorical_features` and `X_test`, which Section 3
# also defines; re-run Section 3 before reusing those names for the MLP/ResNet.

# --- 1. Data Preparation for TabNet ---
print("Preparing data specifically for TabNet...")
df_train = train_cleaned.copy()
df_test = test_featured.copy()

categorical_features = ['brand']
numerical_features = [
    'weight_grams', 'pack_count', 'name_char_count', 'name_word_count',
    'name_all_caps_word_count', 'is_organic', 'is_gluten_free', 'is_case', 'is_kosher',
    'brand_avg_price', 'brand_product_count'
]

# Handle missing values robustly.
# Assign the filled column back instead of calling fillna(inplace=True) on
# df[col]: that is a chained in-place update on an intermediate Series and is
# not guaranteed to write through to the DataFrame.
# Test-set gaps are filled with the TRAINING median.
for col in numerical_features:
    median_val = df_train[col].median()
    df_train[col] = df_train[col].fillna(median_val)
    df_test[col] = df_test[col].fillna(median_val)
brand_col = categorical_features[0]
df_train[brand_col] = df_train[brand_col].fillna("unknown")
df_test[brand_col] = df_test[brand_col].fillna("unknown")

# Label Encode Categorical Features
# The encoder is fitted on train and test values together so that every test
# category has an integer code.
cat_encoders = {}
for col in categorical_features:
    encoder = LabelEncoder()
    combined_series = pd.concat([df_train[col], df_test[col]], axis=0)
    encoder.fit(combined_series)
    df_train[col] = encoder.transform(df_train[col])
    df_test[col] = encoder.transform(df_test[col])
    cat_encoders[col] = encoder

# Create Final Data Matrices
features = numerical_features + categorical_features
X_train = df_train[features].values
# TabNetRegressor is given a 2-D target, hence the reshape to a column.
y_train = np.log1p(df_train['price']).values.reshape(-1, 1)
X_test = df_test[features].values

# Get Categorical Indices and Dimensions for TabNet
cat_idxs = [features.index(col) for col in categorical_features]
cat_dims = [len(cat_encoders[col].classes_) for col in categorical_features]

print("Data for TabNet is ready.")

# --- 2. Training the TabNet Model ---
print("\n--- Training the TabNetRegressor ---")
tabnet_device = 'cuda' if torch.cuda.is_available() else 'cpu'
tabnet_params = {
    # Categorical columns and the size of their embeddings
    'cat_dims': cat_dims,
    'cat_idxs': cat_idxs,
    'cat_emb_dim': 4,
    # Architecture
    'n_d': 16,
    'n_a': 16,
    'n_steps': 5,
    'gamma': 1.5,
    'n_independent': 2,
    'n_shared': 2,
    'mask_type': 'sparsemax',
    # Optimisation: Adam with a step-wise decaying learning rate
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': {'lr': 2e-2},
    'scheduler_fn': torch.optim.lr_scheduler.StepLR,
    'scheduler_params': {"step_size": 10, "gamma": 0.9},
    'device_name': tabnet_device,
}

model_tabnet = TabNetRegressor(**tabnet_params)

# Create a validation set for early stopping (20% of the training rows)
X_train_fold, X_val_fold, y_train_fold, y_val_fold = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

fit_options = {
    'eval_set': [(X_val_fold, y_val_fold)],
    'eval_name': ['validation'],
    'eval_metric': ['rmse'],
    'max_epochs': 100,
    'patience': 20,  # Increased patience
    'batch_size': 1024,
    'drop_last': False,
}
model_tabnet.fit(X_train=X_train_fold, y_train=y_train_fold, **fit_options)

print("TabNet training complete.")

# --- 3. Generate and Save Submission ---
print("\n--- Generating TabNet Submission File ---")

# Predictions come back in log1p space; expm1 maps them to prices.
test_predictions_log = model_tabnet.predict(X_test)
final_predictions = np.expm1(test_predictions_log)

submission_df_tabnet = pd.DataFrame({
    'sample_id': df_test['sample_id'],
    'price': final_predictions.flatten(),
})
# Prices are floored at zero.
non_negative_price = submission_df_tabnet['price'].clip(lower=0)
submission_df_tabnet['price'] = non_negative_price
submission_df_tabnet.to_csv("submission_tabnet.csv", index=False)

print("Submission file 'submission_tabnet.csv' created successfully!")


--- Starting TabNet Experiment ---
Preparing data specifically for TabNet...
Data for TabNet is ready.

--- Training the TabNetRegressor ---
epoch 0  | loss: 0.95825 | validation_rmse: 0.90799 |  0:00:09s
epoch 1  | loss: 0.57886 | validation_rmse: 1.15395 |  0:00:13s
epoch 2  | loss: 0.56321 | validation_rmse: 0.79169 |  0:00:17s
epoch 3  | loss: 0.55209 | validation_rmse: 0.76093 |  0:00:21s
epoch 4  | loss: 0.54167 | validation_rmse: 0.8718  |  0:00:25s
epoch 5  | loss: 0.54042 | validation_rmse: 0.78851 |  0:00:29s
epoch 6  | loss: 0.53129 | validation_rmse: 0.76944 |  0:00:33s
epoch 7  | loss: 0.53157 | validation_rmse: 0.83935 |  0:00:37s
epoch 8  | loss: 0.53146 | validation_rmse: 0.81536 |  0:00:40s
epoch 9  | loss: 0.52785 | validation_rmse: 0.90442 |  0:00:45s
epoch 10 | loss: 0.52382 | validation_rmse: 0.89486 |  0:00:49s
epoch 11 | loss: 0.52339 | validation_rmse: 0.79696 |  0:00:52s
epoch 12 | loss: 0.52149 | validation_rmse: 0.82034 |  0:00:57s
epoch 13 | loss: 0.51627 | 

In [2]:
# =============================================================================
# THE ULTIMATE ENSEMBLE: ResNet + MLP + CatBoost
# =============================================================================
# This script creates a weighted blend of our three champion models.

import pandas as pd
import numpy as np

print("--- Creating the Ultimate Three-Model Ensemble Submission ---")

# --- 1. Define the paths to your champion submission files ---
resnet_file = "submission_tabular_resnet.csv"  # Score: 50.861
mlp_file = "submission_mlp_finetuned.csv"      # Score: 51.584
catboost_file = "submission_catboost_pipeline_gpu.csv" # Score: 53.670

# --- 2. Define the weights for the blend ---
# We give the most weight to our best single model, the ResNet.
# The MLP is second best, and CatBoost is third, but still valuable for its diversity.
resnet_weight = 0.50
mlp_weight = 0.30
catboost_weight = 0.20

print(f"Blending models with weights: ResNet={resnet_weight}, MLP={mlp_weight}, CatBoost={catboost_weight}")

try:
    # --- 3. Load and align the submission files ---
    # Sorting by sample_id and resetting the index makes rows line up by position.
    df_resnet = pd.read_csv(resnet_file).sort_values(by='sample_id').reset_index(drop=True)
    df_mlp = pd.read_csv(mlp_file).sort_values(by='sample_id').reset_index(drop=True)
    df_catboost = pd.read_csv(catboost_file).sort_values(by='sample_id').reset_index(drop=True)

    # Refuse to blend unless every file holds exactly the same IDs. The price
    # columns are combined by row position, so a mismatch would silently mix
    # predictions of different samples (or yield NaN when lengths differ).
    for model_label, df_other in (("MLP", df_mlp), ("CatBoost", df_catboost)):
        if not df_resnet['sample_id'].equals(df_other['sample_id']):
            raise ValueError(
                f"Sample IDs of the {model_label} file do not match the ResNet file! Cannot create ensemble."
            )

    # --- 4. Create the Ensemble Prediction (weighted arithmetic mean) ---
    ensemble_price = (df_resnet['price'] * resnet_weight) + \
                     (df_mlp['price'] * mlp_weight) + \
                     (df_catboost['price'] * catboost_weight)

    # --- 5. Create and save the final submission file ---
    df_ensemble = pd.DataFrame({
        'sample_id': df_resnet['sample_id'],
        'price': ensemble_price
    })
    df_ensemble['price'] = df_ensemble['price'].clip(lower=0)

    ensemble_filename = "submission_ultimate_ensemble.csv"
    df_ensemble.to_csv(ensemble_filename, index=False)

    print(f"\nUltimate Ensemble submission file '{ensemble_filename}' created successfully!")
    print("This is your best and final shot. Good luck!")

except FileNotFoundError as e:
    print(f"\nERROR: Could not find a submission file. Please check the file paths.")
    print(f"Missing file: {e.filename}")

--- Creating the Ultimate Three-Model Ensemble Submission ---
Blending models with weights: ResNet=0.5, MLP=0.3, CatBoost=0.2

Ultimate Ensemble submission file 'submission_ultimate_ensemble.csv' created successfully!
This is your best and final shot. Good luck!
