In [16]:
# --- Core Libraries ---
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
import os

# --- Machine Learning ---
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Deep Learning (for Embeddings) ---
import torch
import timm
from sentence_transformers import SentenceTransformer
from PIL import Image
from torchvision import transforms

# --- Setup ---
# Set up the device (GPU or CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tqdm.pandas()

print(f"Setup complete. Using device: {device}")

Setup complete. Using device: cuda


In [17]:
# Load the datasets
try:
    train_df = pd.read_csv('dataset/train.csv')
    test_df = pd.read_csv('dataset/test.csv')
except FileNotFoundError:
    print("Please make sure train.csv and test.csv are in a 'dataset' folder.")
    # Create dummy dataframes to allow the rest of the notebook to run for demonstration
    train_df = pd.DataFrame() 
    test_df = pd.DataFrame()

print("--- Training Data ---")
print(f"Shape: {train_df.shape}")
print(train_df.head())

print("\n--- Test Data ---")
print(f"Shape: {test_df.shape}")
print(test_df.head())

--- Training Data ---
Shape: (75000, 4)
   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   
2     261251  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   
3      55858  Item Name: Judee’s Blue Cheese Powder 11.25 oz...   
4     292686  Item Name: kedem Sherry Cooking Wine, 12.7 Oun...   

                                          image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12  
2  https://m.media-amazon.com/images/I/51+PFEe-w-...   1.97  
3  https://m.media-amazon.com/images/I/41mu0HAToD...  30.34  
4  https://m.media-amazon.com/images/I/41sA037+Qv...  66.49  

--- Test Data ---
Shape: (75000, 3)
   sample_id                                    catalog_content  \
0     100179  Item Name: Rani 14-Spice Eshamaya's Mango Chut...   
1     245611  

In [18]:
def parse_catalog_content(text):
    # Pack Size: Look for (Pack of X)
    pack_size_match = re.search(r'\(Pack of (\d+)\)', text, re.IGNORECASE)
    pack_size = int(pack_size_match.group(1)) if pack_size_match else 1

    # Value: Look for Value: X
    value_match = re.search(r'Value: ([\d.]+)', text)
    value = float(value_match.group(1)) if value_match else np.nan

    # Unit: Look for Unit: X (specifically letters to avoid capturing numbers)
    unit_match = re.search(r'Unit: ([a-zA-Z]+)', text) # Corrected Regex
    unit = unit_match.group(1) if unit_match else 'Unknown'

    return pack_size, value, unit

# Apply the parsing function to both dataframes
print("Parsing catalog_content for train and test sets...")
train_df[['Pack_Size', 'Value', 'Unit']] = train_df['catalog_content'].apply(
    lambda x: pd.Series(parse_catalog_content(x))
)
test_df[['Pack_Size', 'Value', 'Unit']] = test_df['catalog_content'].apply(
    lambda x: pd.Series(parse_catalog_content(x))
)
print("Parsing complete.")

# --- FIX: Fit the encoder on ALL possible units ---
print("Encoding the 'Unit' feature...")
# Combine units from both train and test sets to learn all possible labels
all_units = pd.concat([train_df['Unit'], test_df['Unit']]).astype(str).unique()

unit_encoder = LabelEncoder()
unit_encoder.fit(all_units) # Fit on all unique units

# Now transform train and test sets
train_df['Unit_Encoded'] = unit_encoder.transform(train_df['Unit'].astype(str))
test_df['Unit_Encoded'] = unit_encoder.transform(test_df['Unit'].astype(str))
print("Encoding complete.")


print("\n--- Training Data After Parsing ---")
print(train_df[['sample_id', 'Pack_Size', 'Value', 'Unit', 'Unit_Encoded']].head())

print("\n--- Test Data After Parsing ---")
print(test_df[['sample_id', 'Pack_Size', 'Value', 'Unit', 'Unit_Encoded']].head())

Parsing catalog_content for train and test sets...
Parsing complete.
Encoding the 'Unit' feature...
Encoding complete.

--- Training Data After Parsing ---
   sample_id  Pack_Size  Value   Unit  Unit_Encoded
0      33127          6  72.00     Fl            20
1     198967          4  32.00  Ounce            40
2     261251          6  11.40  Ounce            40
3      55858          1  11.25  Ounce            40
4     292686          1  12.00  Count            16

--- Test Data After Parsing ---
   sample_id  Pack_Size  Value   Unit  Unit_Encoded
0     100179          1   10.5  Ounce            40
1     245611          1    2.0     Fl            20
2     146263          1   32.0  Ounce            40
3      95658          2    2.0  Count            16
4      36806          1   32.0     Fl            20


In [19]:
# Cell 3b: Create Log-Transformed Numerical Features

# Apply log transform to skewed numerical features to help the model
# We use log1p which handles zeros safely (log(1+x))
for col in ['Pack_Size', 'Value']:
    train_df[f'{col}_log'] = np.log1p(train_df[col])
    test_df[f'{col}_log'] = np.log1p(test_df[col])

print("--- Created Log-Transformed Features ---")
print(train_df[['Pack_Size', 'Pack_Size_log', 'Value', 'Value_log']].head())

--- Created Log-Transformed Features ---
   Pack_Size  Pack_Size_log  Value  Value_log
0          6       1.945910  72.00   4.290459
1          4       1.609438  32.00   3.496508
2          6       1.945910  11.40   2.517696
3          1       0.693147  11.25   2.505526
4          1       0.693147  12.00   2.564949


In [20]:
# Cell 3c: Create a Clean Text Column for Embeddings

def clean_text(text):
    # Remove boilerplate patterns
    text = re.sub(r'Item Name:', '', text)
    text = re.sub(r'Bullet Point \d+:', '', text)
    text = re.sub(r'Value: [\d.]+', '', text)
    text = re.sub(r'Unit: \w+', '', text)
    text = re.sub(r'Product Description:', '', text)
    # Remove extra whitespace and newlines
    text = ' '.join(text.split())
    return text

print("Creating clean text column...")
train_df['clean_catalog_content'] = train_df['catalog_content'].apply(clean_text)
test_df['clean_catalog_content'] = test_df['catalog_content'].apply(clean_text)

print("--- Sample Cleaned Text ---")
print(train_df['clean_catalog_content'].iloc[0])

Creating clean text column...
--- Sample Cleaned Text ---
La Victoria Green Taco Sauce Mild, 12 Ounce (Pack of 6) Oz


In [23]:
# Cell 3d: Advanced Feature Engineering (Brand, Item Size, Text Length) - CORRECTED

# --- 1. Extract Brand Name (with robust error handling) ---

def extract_brand(item_name):
    """Extracts the first word of an item name as the brand."""
    try:
        brand = item_name.split()[0].upper()
        return brand
    except (IndexError, AttributeError):
        # Handles empty or invalid item_name strings
        return 'UNKNOWN'

def get_brand_from_catalog(catalog_text):
    """Safely finds the 'Item Name' section and extracts the brand."""
    # THIS IS THE FIX: Check if 'Item Name:' exists first
    if 'Item Name:' in catalog_text:
        # If it exists, proceed with the original logic
        item_name_section = catalog_text.split('Item Name:')[1]
        item_name = item_name_section.split('\n')[0].strip()
        return extract_brand(item_name)
    else:
        # If it doesn't exist, return a default value
        return 'UNKNOWN'

print("Extracting brand names...")
# Apply our new, safer function to the catalog_content
train_df['brand'] = train_df['catalog_content'].apply(get_brand_from_catalog)
test_df['brand'] = test_df['catalog_content'].apply(get_brand_from_catalog)

# Encode the new 'brand' feature
all_brands = pd.concat([train_df['brand'], test_df['brand']]).astype(str).unique()
brand_encoder = LabelEncoder().fit(all_brands)
train_df['brand_encoded'] = brand_encoder.transform(train_df['brand'].astype(str))
test_df['brand_encoded'] = brand_encoder.transform(test_df['brand'].astype(str))
print("Brand feature created.")


# --- 2. Create Interaction and Text Statistic Features (no changes here) ---
print("Creating interaction and text statistic features...")
train_df['item_size'] = train_df['Value'] / (train_df['Pack_Size'] + 1e-6)
test_df['item_size'] = test_df['Value'] / (test_df['Pack_Size'] + 1e-6)
train_df['desc_length'] = train_df['clean_catalog_content'].str.len()
test_df['desc_length'] = test_df['clean_catalog_content'].str.len()
print("Additional features created.")


print("\n--- Sample of New Features ---")
print(train_df[['brand', 'brand_encoded', 'item_size', 'desc_length']].head())

Extracting brand names...
Brand feature created.
Creating interaction and text statistic features...
Additional features created.

--- Sample of New Features ---
     brand  brand_encoded  item_size  desc_length
0       LA           6746  11.999998           58
1  SALERNO          10361   7.999998          395
2     BEAR           1351   1.900000          212
3  JUDEE’S           6241  11.249989         1180
4    KEDEM           6389  11.999988          119


In [6]:
# Load the sentence transformer model
text_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Generate embeddings for the training data
print("Generating text embeddings for training data...")
train_text_embeddings = text_model.encode(train_df['catalog_content'].tolist(), show_progress_bar=True)

# Generate embeddings for the test data
print("Generating text embeddings for test data...")
test_text_embeddings = text_model.encode(test_df['catalog_content'].tolist(), show_progress_bar=True)

# Convert to DataFrames
train_text_embed_df = pd.DataFrame(train_text_embeddings, columns=[f'txt_{i}' for i in range(train_text_embeddings.shape[1])])
test_text_embed_df = pd.DataFrame(test_text_embeddings, columns=[f'txt_{i}' for i in range(test_text_embeddings.shape[1])])


print(f"\nText embedding shape for training data: {train_text_embed_df.shape}")
print("Sample text embedding DataFrame:")
print(train_text_embed_df.head())

Generating text embeddings for training data...


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

Generating text embeddings for test data...


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]


Text embedding shape for training data: (75000, 384)
Sample text embedding DataFrame:
      txt_0     txt_1     txt_2     txt_3     txt_4     txt_5     txt_6  \
0  0.033260  0.010840 -0.056023  0.088746 -0.000599  0.023403  0.090809   
1 -0.042282  0.031434 -0.047481  0.068237  0.012065 -0.059747  0.098276   
2 -0.033634  0.003008  0.016667  0.083491  0.001591  0.052957  0.075054   
3 -0.107012 -0.079453  0.012586 -0.057814  0.087116  0.024524  0.051916   
4  0.054223  0.072425 -0.043134  0.051806 -0.111483  0.008939  0.054103   

      txt_7     txt_8     txt_9  ...   txt_374   txt_375   txt_376   txt_377  \
0  0.043927  0.066413 -0.065356  ... -0.009825 -0.031103  0.042088 -0.018127   
1  0.060108 -0.034569 -0.070540  ... -0.002846 -0.061159  0.005115  0.020050   
2 -0.051422  0.032575 -0.090444  ...  0.011707  0.032602 -0.012787 -0.041831   
3  0.117095 -0.001588 -0.057859  ... -0.005596 -0.037883 -0.032917 -0.006095   
4  0.001732  0.045346 -0.095735  ... -0.014247  0.006413  0.02

In [7]:
# Cell 5: Generate Image Embeddings from Local Files

# --- Check if the image folder exists ---
IMAGE_FOLDER = 'images' 
if not os.path.exists(IMAGE_FOLDER):
    print(f" ERROR: The '{IMAGE_FOLDER}' directory was not found.")
    print("Please run the image download cell first.")
else:
    print(f" Image folder '{IMAGE_FOLDER}' found. Proceeding with embedding.")

# --- Image Model Setup (remains the same) ---
img_model = timm.create_model('efficientnet_b0', pretrained=True, num_classes=0, global_pool='avg').to(device)
img_model.eval()
config = img_model.default_cfg
transform = transforms.Compose([
    transforms.Resize(config['input_size'][1:]),
    transforms.CenterCrop(config['input_size'][1:]),
    transforms.ToTensor(),
    transforms.Normalize(mean=config['mean'], std=config['std']),
])

# --- Image Feature Extraction Function ---
def get_image_embedding(sample_id, model, device, transform, image_folder=IMAGE_FOLDER):
    image_path = os.path.join(image_folder, f"{sample_id}.jpg")
    
    if not os.path.exists(image_path):
        return np.zeros(1280)
        
    try:
        img = Image.open(image_path).convert('RGB')
        batch_img = transform(img).unsqueeze(0).to(device)
        with torch.no_grad():
            embedding = model(batch_img)
        return embedding.cpu().numpy().flatten()
    except Exception as e:
        return np.zeros(1280)

# --- Generate Image Embeddings ---
print("\nGenerating image embeddings for training data...")
train_image_embeddings = train_df['sample_id'].progress_apply(
    lambda x: get_image_embedding(x, img_model, device, transform)
)

print("Generating image embeddings for test data...")
test_image_embeddings = test_df['sample_id'].progress_apply(
    lambda x: get_image_embedding(x, img_model, device, transform)
)

# --- Convert to DataFrames ---
train_img_embed_df = pd.DataFrame(train_image_embeddings.to_list(), columns=[f'img_{i}' for i in range(1280)])
test_img_embed_df = pd.DataFrame(test_image_embeddings.to_list(), columns=[f'img_{i}' for i in range(1280)])

print(f"\nImage embedding shape: {train_img_embed_df.shape}")
print("Sample of new image embedding DataFrame (should NOT be all zeros):")
print(train_img_embed_df.head())

 Image folder 'images' found. Proceeding with embedding.

Generating image embeddings for training data...


  0%|          | 0/75000 [00:00<?, ?it/s]

Generating image embeddings for test data...


  0%|          | 0/75000 [00:00<?, ?it/s]


Image embedding shape: (75000, 1280)
Sample of new image embedding DataFrame (should NOT be all zeros):
      img_0     img_1     img_2     img_3     img_4     img_5     img_6  \
0  0.769687 -0.132709 -0.178352 -0.177337  0.413674 -0.205476 -0.154951   
1 -0.155376  0.089499 -0.086478  0.102381 -0.097783 -0.173205 -0.103229   
2  0.270712  0.376103 -0.093395 -0.087051  0.131627 -0.128294 -0.039503   
3  0.339731 -0.096009 -0.031545  0.292460 -0.157695 -0.199246  0.413247   
4  0.170393 -0.070643 -0.174141 -0.099730 -0.123365 -0.222346  0.000466   

      img_7     img_8     img_9  ...  img_1270  img_1271  img_1272  img_1273  \
0 -0.125823  0.476310 -0.152682  ... -0.144723 -0.140334 -0.075375 -0.212962   
1 -0.035747 -0.054104 -0.101846  ... -0.159194 -0.084343 -0.022761  0.758022   
2 -0.057429  0.657255  0.008079  ...  0.136017 -0.120255 -0.096028  0.234427   
3 -0.047835  0.613992 -0.070078  ... -0.079402 -0.084615  0.010236 -0.222005   
4  0.010950  0.440438 -0.104971  ... -0.1695

In [None]:
# # Select the engineered features from the original dataframes
# #dont run nowww (old one)

# numerical_features = ['Pack_Size', 'Value', 'Unit_Encoded']
# X_train_base = train_df[numerical_features].fillna(0)
# X_test_base = test_df[numerical_features].fillna(0)

# # Combine all features
# X_train = pd.concat([X_train_base, train_text_embed_df, train_img_embed_df], axis=1)
# X_test = pd.concat([X_test_base, test_text_embed_df, test_img_embed_df], axis=1)

# # Prepare target variable - using log transform for better performance
# y_train = train_df['price']
# y_train_log = np.log1p(y_train)

# print("--- Final Combined Datasets ---")
# print(f"X_train shape: {X_train.shape}")
# print(f"X_test shape: {X_test.shape}")
# print(f"y_train_log shape: {y_train_log.shape}")

In [None]:
# Cell 6: Combine All Features (Using Log Features)

# Use our new log-transformed numerical features
numerical_features = ['Pack_Size_log', 'Value_log', 'Unit_Encoded', 'brand_encoded', 'item_size','desc_length']
X_train_base = train_df[numerical_features]
X_test_base = test_df[numerical_features]

# Combine all features (assuming embed dfs are created)
X_train = pd.concat([X_train_base, train_text_embed_df, train_img_embed_df], axis=1)
X_test = pd.concat([X_test_base, test_text_embed_df, test_img_embed_df], axis=1)

# Go back to predicting the log of the original price
y_train = train_df['price']
y_train_log = np.log1p(train_df['price'])

print(f"X_train shape: {X_train.shape}")
print(f"Using features: {numerical_features}")




Combining All Features
X_train shape: (75000, 1670)
X_test shape: (75000, 1670)
y_train_log shape: (75000,)

Using 15 numerical/categorical features.


In [80]:
print("\n" + "="*60)
print("Combining All Features")
print("="*60)

# Define the complete list of all engineered numerical and categorical features
numerical_features = [
    'Pack_Size_log', 
    'Value_log', 
    'Unit_Encoded', 
    'brand_encoded', 
    'item_size',
    'desc_length'
]

# Select the base numerical features, filling any potential NaNs with 0 for safety
X_train_base = train_df[numerical_features].fillna(0)
X_test_base = test_df[numerical_features].fillna(0)


# Combine all feature sets into the final training matrix
# reset_index(drop=True) is used to ensure a clean, continuous index for concatenation
X_train = pd.concat([
    X_train_base.reset_index(drop=True), 
    train_text_embed_df.reset_index(drop=True), 
    train_img_embed_df.reset_index(drop=True)
], axis=1)

# Combine all feature sets into the final test matrix
X_test = pd.concat([
    X_test_base.reset_index(drop=True), 
    test_text_embed_df.reset_index(drop=True), 
    test_img_embed_df.reset_index(drop=True)
], axis=1)

# Prepare the target variable, using a log transform for model stability
y_train = train_df['price']
y_train_log = np.log1p(y_train)

# Print the final shapes and features used to verify everything is correct
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train_log shape: {y_train_log.shape}")
print(f"\nUsing the following {len(numerical_features)} numerical/categorical features:")
print(numerical_features)


Combining All Features
X_train shape: (75000, 1670)
X_test shape: (75000, 1670)
y_train_log shape: (75000,)

Using the following 6 numerical/categorical features:
['Pack_Size_log', 'Value_log', 'Unit_Encoded', 'brand_encoded', 'item_size', 'desc_length']


In [81]:
print("\n" + "="*60)
print("Creating Train/Validation Split")
print("="*60)

X_train_split, X_val, y_train_log_split, y_val_log = train_test_split(
    X_train, y_train_log, test_size=0.2, random_state=42
)

print(f"Training split shape: {X_train_split.shape}")
print(f"Validation split shape: {X_val.shape}")


Creating Train/Validation Split
Training split shape: (60000, 1670)
Validation split shape: (15000, 1670)


In [82]:
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error"""
    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2
    return np.mean(numerator / denominator) * 100

In [83]:
from torch.utils.data import Dataset, DataLoader
class PriceDataset(Dataset):
    """Custom Dataset for price prediction"""
    def __init__(self, X, y=None):
        self.X = torch.FloatTensor(X.values if isinstance(X, pd.DataFrame) else X)
        self.y = torch.FloatTensor(y.values if y is not None else np.zeros(len(X)))
        self.has_labels = y is not None
    
    def __len__(self):
        return len(self.X)
    
    def __getitem__(self, idx):
        if self.has_labels:
            return self.X[idx], self.y[idx]
        return self.X[idx]

In [84]:
import torch.nn as nn
class SimpleMLP(nn.Module):
    """Simple MLP for price prediction"""
    def __init__(self, input_dim, hidden_dims=[512, 256, 128, 64], dropout=0.3):
        super(SimpleMLP, self).__init__()
        
        layers = []
        prev_dim = input_dim
        
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim
        
        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.network(x).squeeze()


class AdvancedMultimodalMLP(nn.Module):
    """Advanced MLP with separate branches for different feature types"""
    def __init__(self, num_features, text_features, img_features):
        super(AdvancedMultimodalMLP, self).__init__()
        
        self.num_features = num_features
        self.text_features = text_features
        self.img_features = img_features
        
        # Numerical features branch
        self.num_net = nn.Sequential(
            nn.Linear(num_features, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32)
        )
        
        # Text features branch
        self.text_net = nn.Sequential(
            nn.Linear(text_features, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128)
        )
        
        # Image features branch
        self.img_net = nn.Sequential(
            nn.Linear(img_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256)
        )
        
        # Fusion layer
        fusion_input_dim = 32 + 128 + 256
        self.fusion = nn.Sequential(
            nn.Linear(fusion_input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    
    def forward(self, x):
        # Split input into different modalities
        num_feat = x[:, :self.num_features]
        text_feat = x[:, self.num_features:self.num_features+self.text_features]
        img_feat = x[:, self.num_features+self.text_features:]
        
        # Process each modality
        num_out = self.num_net(num_feat)
        text_out = self.text_net(text_feat)
        img_out = self.img_net(img_feat)
        
        # Fuse and predict
        combined = torch.cat([num_out, text_out, img_out], dim=1)
        output = self.fusion(combined)
        
        return output.squeeze()

In [85]:
from sklearn.preprocessing import StandardScaler
import torch.optim as optim

def train_mlp(X_train, y_train, X_val, y_val, 
              model_type='simple',
              num_epochs=100, 
              batch_size=256, 
              lr=0.001,
              num_features=3,
              text_features=384,
              img_features=1280):
    """Train MLP model"""
    print(f"\n{'='*60}")
    print(f"Training {model_type.upper()} MLP Model")
    print(f"{'='*60}")
    
    # Scale features
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    # Create datasets
    train_dataset = PriceDataset(X_train_scaled, y_train)
    val_dataset = PriceDataset(X_val_scaled, y_val)
    
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Initialize model
    input_dim = X_train.shape[1]
    
    if model_type == 'simple':
        model = SimpleMLP(input_dim).to(device)
    else:  # advanced
        model = AdvancedMultimodalMLP(num_features, text_features, img_features).to(device)
    
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5
    )
    
    # Training loop
    best_smape = float('inf')
    patience_counter = 0
    patience = 15
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_losses = []
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            predictions = model(X_batch)
            loss = criterion(predictions, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            train_losses.append(loss.item())
        
        # Validation phase
        model.eval()
        val_losses = []
        all_preds = []
        all_targets = []
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                predictions = model(X_batch)
                loss = criterion(predictions, y_batch)
                
                val_losses.append(loss.item())
                all_preds.extend(predictions.cpu().numpy())
                all_targets.extend(y_batch.cpu().numpy())
        
        avg_train_loss = np.mean(train_losses)
        avg_val_loss = np.mean(val_losses)
        val_smape = smape(np.expm1(all_targets), np.expm1(all_preds))
        
        scheduler.step(avg_val_loss)
        
        # Print progress
        if (epoch + 1) % 5 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}] "
                  f"Train Loss: {avg_train_loss:.4f} | "
                  f"Val Loss: {avg_val_loss:.4f} | "
                  f"Val SMAPE: {val_smape:.4f}%")
        
        # Early stopping
        if val_smape < best_smape:
            best_smape = val_smape
            patience_counter = 0
            torch.save({
                'model_state_dict': model.state_dict(),
                'scaler': scaler,
                'best_smape': best_smape,
            }, 'best_mlp_model.pth')
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"\nEarly stopping at epoch {epoch+1}")
                break
    
    # Load best model
    checkpoint = torch.load('best_mlp_model.pth', map_location=device)
    model.load_state_dict(checkpoint['model_state_dict'])
    scaler = checkpoint['scaler']
    
    print(f"\n{'='*60}")
    print(f"Best Validation SMAPE: {best_smape:.4f}%")
    print(f"{'='*60}")
    
    return model, scaler, best_smape

In [86]:
mlp_simple, scaler_simple, smape_simple = train_mlp(
    X_train_split, 
    y_train_log_split, 
    X_val, 
    y_val_log,
    model_type='simple',
    num_epochs=100,
    batch_size=256,
    lr=0.001
)


Training SIMPLE MLP Model
Model parameters: 1,030,017
Epoch [5/100] Train Loss: 0.5277 | Val Loss: 0.5444 | Val SMAPE: 56.1170%
Epoch [10/100] Train Loss: 0.3791 | Val Loss: 0.5226 | Val SMAPE: 53.9202%
Epoch [15/100] Train Loss: 0.2777 | Val Loss: 0.5338 | Val SMAPE: 53.5747%
Epoch [20/100] Train Loss: 0.2011 | Val Loss: 0.5256 | Val SMAPE: 53.0377%
Epoch [25/100] Train Loss: 0.1668 | Val Loss: 0.5223 | Val SMAPE: 52.5137%
Epoch [30/100] Train Loss: 0.1507 | Val Loss: 0.5236 | Val SMAPE: 52.2157%
Epoch [35/100] Train Loss: 0.1391 | Val Loss: 0.5171 | Val SMAPE: 52.0618%
Epoch [40/100] Train Loss: 0.1365 | Val Loss: 0.5192 | Val SMAPE: 52.0492%
Epoch [45/100] Train Loss: 0.1336 | Val Loss: 0.5146 | Val SMAPE: 52.0428%
Epoch [50/100] Train Loss: 0.1299 | Val Loss: 0.5159 | Val SMAPE: 52.0067%
Epoch [55/100] Train Loss: 0.1266 | Val Loss: 0.5165 | Val SMAPE: 51.9169%
Epoch [60/100] Train Loss: 0.1248 | Val Loss: 0.5175 | Val SMAPE: 51.9581%
Epoch [65/100] Train Loss: 0.1242 | Val Loss: 

  checkpoint = torch.load('best_mlp_model.pth', map_location=device)


In [87]:
# Define the numerical features list EXACTLY as you did in the combination cell
numerical_features_list = [
    'Pack_Size_log', 
    'Value_log', 
    'Unit_Encoded', 
    'brand_encoded', 
    'item_size',
    'desc_length'
]

num_features_count = len(numerical_features_list)
text_features_count = train_text_embed_df.shape[1]
img_features_count = train_img_embed_df.shape[1]

print(f"Using feature counts -> Numerical: {num_features_count}, Text: {text_features_count}, Image: {img_features_count}")
# --------------------------------

mlp_advanced, scaler_advanced, smape_advanced = train_mlp(
    X_train_split, 
    y_train_log_split, 
    X_val, 
    y_val_log,
    model_type='advanced',
    num_epochs=100,
    batch_size=256,
    lr=0.001,
    num_features=num_features_count,    # <-- Use the correct count
    text_features=text_features_count,  # <-- Use the correct count
    img_features=img_features_count     # <-- Use the correct count
)


Using feature counts -> Numerical: 6, Text: 384, Image: 1280

Training ADVANCED MLP Model
Model parameters: 1,071,585
Epoch [5/100] Train Loss: 0.4439 | Val Loss: 0.5381 | Val SMAPE: 55.4050%
Epoch [10/100] Train Loss: 0.3196 | Val Loss: 0.5089 | Val SMAPE: 53.1061%
Epoch [15/100] Train Loss: 0.2401 | Val Loss: 0.5139 | Val SMAPE: 52.4647%
Epoch [20/100] Train Loss: 0.1641 | Val Loss: 0.5116 | Val SMAPE: 51.9544%
Epoch [25/100] Train Loss: 0.1341 | Val Loss: 0.5114 | Val SMAPE: 51.4840%
Epoch [30/100] Train Loss: 0.1147 | Val Loss: 0.5113 | Val SMAPE: 51.2936%
Epoch [35/100] Train Loss: 0.1080 | Val Loss: 0.5119 | Val SMAPE: 51.3173%
Epoch [40/100] Train Loss: 0.1006 | Val Loss: 0.5118 | Val SMAPE: 51.3815%
Epoch [45/100] Train Loss: 0.0985 | Val Loss: 0.5113 | Val SMAPE: 51.4854%
Epoch [50/100] Train Loss: 0.0980 | Val Loss: 0.5098 | Val SMAPE: 51.4403%
Epoch [55/100] Train Loss: 0.0960 | Val Loss: 0.5112 | Val SMAPE: 51.3596%
Epoch [60/100] Train Loss: 0.0956 | Val Loss: 0.5102 | Val

  checkpoint = torch.load('best_mlp_model.pth', map_location=device)


In [88]:
print("\n" + "="*60)
print("Generating Final Predictions for Submission")
print("="*60)

# Load the best model and scaler from the last training run (Advanced MLP)
print("Loading best model checkpoint from 'best_mlp_model.pth'...")
checkpoint = torch.load('best_mlp_model.pth', map_location=device)

# Initialize the model architecture with the correct feature dimensions
num_features_count = len(numerical_features)
text_features_count = test_text_embed_df.shape[1]
img_features_count = test_img_embed_df.shape[1]
final_model = AdvancedMultimodalMLP(num_features_count, text_features_count, img_features_count).to(device)

# Load the trained weights and the scaler
final_model.load_state_dict(checkpoint['model_state_dict'])
final_scaler = checkpoint['scaler']
print(f"Model loaded. Best validation SMAPE was: {checkpoint['best_smape']:.4f}%")

# Prepare the test data
print("Scaling test data...")
X_test_scaled = final_scaler.transform(X_test)
X_test_tensor = torch.FloatTensor(X_test_scaled).to(device)

# Generate predictions
print("Generating predictions on the test set...")
final_model.eval()
with torch.no_grad():
    log_predictions = final_model(X_test_tensor)

# Convert log predictions back to original price scale
# .cpu().numpy() moves the tensor from GPU to CPU and converts to a NumPy array
final_predictions = np.expm1(log_predictions.cpu().numpy())

# Ensure prices are non-negative
final_predictions[final_predictions < 0] = 0

# Create the submission DataFrame
submission_df = pd.DataFrame({
    'sample_id': test_df['sample_id'],
    'price': final_predictions
})

# Save the submission file
submission_df.to_csv('submission.csv', index=False)
print("\nSubmission file 'submission.csv' created successfully!")
print("--- Submission File Head ---")
print(submission_df.head())


Generating Final Predictions for Submission
Loading best model checkpoint from 'best_mlp_model.pth'...
Model loaded. Best validation SMAPE was: 51.2146%
Scaling test data...


  checkpoint = torch.load('best_mlp_model.pth', map_location=device)


Generating predictions on the test set...

Submission file 'submission.csv' created successfully!
--- Submission File Head ---
   sample_id      price
0     100179  18.924973
1     245611  25.577932
2     146263  19.809526
3      95658   4.585983
4      36806  24.854664


In [None]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
import numpy as np

# ✅ Split data into training and validation sets
X_train_split, X_val, y_train_log_split, y_val_log = train_test_split(
    X_train, y_train_log, test_size=0.2, random_state=42
)

# ✅ Convert to DMatrix format (recommended for XGBoost)
dtrain = xgb.DMatrix(X_train_split, label=y_train_log_split)
dval = xgb.DMatrix(X_val, label=y_val_log)

# ✅ XGBoost parameters (analogous to LightGBM setup)
params = {
    'objective': 'reg:squarederror',
    'eval_metric': 'mae',
    'learning_rate': 0.03,
    'seed': 42,
    
    # --- Key Changes to Reduce Overfitting ---
    'max_depth': 5,               # DECREASED: The single most important change. Forces simpler trees.
    'subsample': 0.7,             # DECREASED: Use a smaller random sample of data for each tree.
    'colsample_bytree': 0.7,      # DECREASED: Use a smaller random sample of features for each tree.
    'alpha': 2,                   # INCREASED: Stronger L1 regularization penalty.
    'lambda': 2,                  # INCREASED: Stronger L2 regularization penalty.
    'min_child_weight': 5  
}

# ✅ Train model with early stopping
evals = [(dtrain, 'train'), (dval, 'val')]
xgb_model = xgb.train(
    params=params,
    dtrain=dtrain,
    num_boost_round=5000,
    evals=evals,
    early_stopping_rounds=100,
    verbose_eval=100  # show progress every 100 rounds
)

# ✅ Make predictions on validation set
val_preds_log = xgb_model.predict(dval)

# ✅ Convert predictions back from log scale
val_preds = np.expm1(val_preds_log)
y_val_true = np.expm1(y_val_log)

# ✅ Define SMAPE function if not already defined
def smape(y_true, y_pred):
    return 100 * np.mean(2 * np.abs(y_pred - y_true) / (np.abs(y_true) + np.abs(y_pred)))

#  Calculate SMAPE
validation_smape = smape(y_val_true, val_preds)

print("\n-------------------------------------------")
print(f"Validation SMAPE Score (XGBoost): {validation_smape:.4f}%")
print("-------------------------------------------")


[0]	train-mae:0.75604	val-mae:0.76571
[100]	train-mae:0.59833	val-mae:0.62730
[200]	train-mae:0.55978	val-mae:0.60247
[300]	train-mae:0.53402	val-mae:0.58976
[400]	train-mae:0.51438	val-mae:0.58212
[500]	train-mae:0.49768	val-mae:0.57577
[600]	train-mae:0.48293	val-mae:0.57132
[700]	train-mae:0.46946	val-mae:0.56780
[800]	train-mae:0.45662	val-mae:0.56445
[900]	train-mae:0.44502	val-mae:0.56220
[1000]	train-mae:0.43379	val-mae:0.55969
[1100]	train-mae:0.42297	val-mae:0.55762
[1200]	train-mae:0.41295	val-mae:0.55568
[1300]	train-mae:0.40313	val-mae:0.55400
[1400]	train-mae:0.39361	val-mae:0.55266
[1500]	train-mae:0.38414	val-mae:0.55125
[1600]	train-mae:0.37546	val-mae:0.54997
[1700]	train-mae:0.36693	val-mae:0.54871
[1800]	train-mae:0.35870	val-mae:0.54744
[1900]	train-mae:0.35063	val-mae:0.54636
[2000]	train-mae:0.34269	val-mae:0.54512
[2100]	train-mae:0.33532	val-mae:0.54416
[2200]	train-mae:0.32814	val-mae:0.54327
[2300]	train-mae:0.32109	val-mae:0.54237
[2400]	train-mae:0.31430	val

In [26]:
# Cell 6c: Overfitting Diagnosis for XGBoost (Corrected)

print("Diagnosing overfitting...")

# --- THIS IS THE FIX ---
# Convert the training split DataFrame to a DMatrix before predicting
dtrain_split_for_pred = xgb.DMatrix(X_train_split)

# Predict on the TRAINING data using the new DMatrix
train_preds_log = xgb_model.predict(dtrain_split_for_pred, iteration_range=(0, xgb_model.best_iteration))
train_preds = np.expm1(train_preds_log)
y_train_true = np.expm1(y_train_log_split)

# Calculate training SMAPE
training_smape = smape(y_train_true, train_preds)

# --- Overfitting Diagnosis ---
print(f"\nSMAPE on Training Data (80%): {training_smape:.4f}%")
print(f"SMAPE on Validation Data (20%): {validation_smape:.4f}%")
print("---------------------------------")
gap = validation_smape - training_smape
if gap > 15:
    print(f" Result: High Overfitting Detected! Gap: {gap:.2f}%.")
elif gap > 5:
    print(f" Result: Moderate Overfitting. Gap: {gap:.2f}%.")
else:
    print(" Result: Good Fit. The model is generalizing well.")

Diagnosing overfitting...

SMAPE on Training Data (80%): 20.6967%
SMAPE on Validation Data (20%): 53.1493%
---------------------------------
 Result: High Overfitting Detected! Gap: 32.45%.


In [None]:
print("Starting final model training on all data...")

# Create a DMatrix for the full training data
dtrain_full = xgb.DMatrix(X_train, label=y_train_log)

# Use the same parameters from validation, but train for the optimal number of rounds
# 'xgb_model.best_iteration' was found in the validation step (Cell 6b)
best_rounds = xgb_model.best_iteration 
print(f"Training for {best_rounds} rounds...")

# Train the final model
final_xgb_model = xgb.train(
    params=params,
    dtrain=dtrain_full,
    num_boost_round=best_rounds,
    verbose_eval=100 # Optional: shows progress
)

print("Final model training complete.")

In [None]:
# # Cell 7a: Train Final LightGBM Model and Get Predictions

# print("--- Training Final LightGBM Model ---")

# # Use the best parameters you found for LightGBM (e.g., the 'balanced' set)
# lgbm_params = {
#     'objective': 'regression_l1', 'metric': 'mae', 'n_estimators': 5000,
#     'learning_rate': 0.03, 'seed': 42, 'n_jobs': -1, 'verbose': -1,
#     'num_leaves': 51, 'feature_fraction': 0.8, 'bagging_fraction': 0.8,
#     'lambda_l1': 0.5, 'lambda_l2': 0.5, 'min_child_samples': 20
# }

# # We need a temporary validation set just to find the best number of rounds automatically
# X_train_temp, X_val_temp, y_train_log_temp, y_val_log_temp = train_test_split(
#     X_train, y_train_log, test_size=0.1, random_state=42
# )

# temp_lgbm_model = lgb.LGBMRegressor(**lgbm_params)
# temp_lgbm_model.fit(X_train_temp, y_train_log_temp,
#                     eval_set=[(X_val_temp, y_val_log_temp)],
#                     callbacks=[lgb.early_stopping(100, verbose=False)])

# # Get the best number of rounds and retrain a new model on ALL the data
# best_rounds_lgbm = temp_lgbm_model.best_iteration_
# print(f"Retraining LightGBM on all data for {best_rounds_lgbm} rounds...")
# lgbm_params['n_estimators'] = best_rounds_lgbm # Set the optimal number of trees
# final_lgbm_model = lgb.LGBMRegressor(**lgbm_params)
# final_lgbm_model.fit(X_train, y_train_log)

# # Predict on the test set and save the predictions to a file
# lgbm_preds_log = final_lgbm_model.predict(X_test)
# lgbm_preds = np.expm1(lgbm_preds_log)
# np.save('lgbm_test_predictions.npy', lgbm_preds)

# print("LightGBM predictions saved to 'lgbm_test_predictions.npy'")

In [43]:
# Cell 7b: Train Final XGBoost Model and Get Predictions (with Modern GPU Syntax)

print("\n--- Training Final XGBoost Model ---")

# Use the best regularized parameters with the updated device parameter
xgb_params = {
    'objective': 'reg:squarederror', 'eval_metric': 'mae', 'learning_rate': 0.03,
    'max_depth': 5, 'subsample': 0.7, 'colsample_bytree': 0.7,
    'alpha': 2, 'lambda': 2, 'min_child_weight': 5, 'seed': 42,
    
    # --- THIS IS THE UPDATED PART ---
    'tree_method': 'hist',
    'device': 'cuda' if torch.cuda.is_available() else 'cpu'
}

print(f"Using device: {xgb_params['device']}") # Added a print to confirm

# Create DMatrix for the full training data
dtrain_full = xgb.DMatrix(X_train, label=y_train_log)

# Get the optimal number of rounds from your validation step (Cell 6b)
best_rounds_xgb = xgb_model.best_iteration 
print(f"Training XGBoost on all data for {best_rounds_xgb} rounds...")

final_xgb_model = xgb.train(
    params=xgb_params,
    dtrain=dtrain_full,
    num_boost_round=best_rounds_xgb,
    verbose_eval=100 # Keep it clean
)

# Predict on the test set and save the predictions
dtest = xgb.DMatrix(X_test)
xgb_preds_log = final_xgb_model.predict(dtest)
xgb_preds = np.expm1(xgb_preds_log)
np.save('xgb_test_predictions.npy', xgb_preds)

print("XGBoost predictions saved to 'xgb_test_predictions.npy'")


--- Training Final XGBoost Model ---
Using device: cuda
Training XGBoost on all data for 4999 rounds...
XGBoost predictions saved to 'xgb_test_predictions.npy'


In [None]:
# # Cell 8: Create Ensemble Submission

# print("Creating final ensemble submission...")

# # Load the saved predictions from each model
# lgbm_preds = np.load('lgbm_test_predictions.npy')
# xgb_preds = np.load('xgb_test_predictions.npy')

# # --- The Ensemble: Simple Averaging ---
# ensemble_preds = (lgbm_preds + xgb_preds) / 2

# # Ensure all predicted prices are positive floats
# ensemble_preds[ensemble_preds < 0] = 0

# # Create the submission DataFrame
# submission_df = pd.DataFrame({
#     'sample_id': test_df['sample_id'], 
#     'price': ensemble_preds
# })

# # Save the final submission file
# submission_df.to_csv('submission_ensemble.csv', index=False)

# print("\nEnsemble submission file 'submission_ensemble.csv' created successfully.")
# print("--- File Head ---")
# print(submission_df.head())

In [44]:
# # Cell 8: Create Ensemble Submission (with Strict Float Formatting)

# print("Creating final ensemble submission with strict formatting...")

# # Load the saved predictions from each model
# xgb_preds = np.load('xgb_test_predictions.npy')

# # --- The Ensemble: Simple Averaging ---
# ensemble_preds = xgb_preds

# # Ensure all predicted prices are positive floats
# ensemble_preds[ensemble_preds < 0] = 0

# # --- Failsafe Step ---
# # Reload a clean version of the test data to guarantee the sample_id column is correct.
# submission_ids = pd.read_csv('dataset/test.csv')['sample_id']

# # Create the submission DataFrame
# submission_df = pd.DataFrame({
#     'sample_id': submission_ids, 
#     'price': ensemble_preds
# })

# # Explicitly cast datatypes for maximum compatibility
# submission_df['sample_id'] = submission_df['sample_id'].astype(int)
# submission_df['price'] = submission_df['price'].astype(float)

# # --- Crucial Formatting Change ---
# # Save with a specific float format to ensure clean, 6-decimal-place numbers.
# # '%.6f' formats the number as a float with exactly 6 digits after the decimal point.
# submission_df.to_csv(
#     'submission_ensemble.csv', 
#     index=False, 
#     float_format='%.6f' # <-- This is the key change
# )
# # -----------------------------

# print("\nEnsemble submission file created successfully with 6-decimal-place precision.")
# print("--- File Head ---")
# print(submission_df.head())

Creating final ensemble submission with strict formatting...

Ensemble submission file created successfully with 6-decimal-place precision.
--- File Head ---
   sample_id      price
0     100179  16.333675
1     245611  15.051077
2     146263  17.806292
3      95658   6.296509
4      36806  27.668425
