In [None]:
# --- Core Libraries ---
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
import os

# --- Machine Learning ---
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Deep Learning (for Embeddings) ---
import torch
import timm
from sentence_transformers import SentenceTransformer
from PIL import Image
from torchvision import transforms

# --- Setup ---
# Pick the compute device: CUDA when PyTorch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Register tqdm's pandas integration (provides `progress_apply`, used further down).
tqdm.pandas()

print(f"Setup complete. Using device: {device}")

In [None]:
# Read the train and test tables from the local 'dataset' folder.
try:
    train_df, test_df = [
        pd.read_csv(csv_path)
        for csv_path in ('dataset/train.csv', 'dataset/test.csv')
    ]
except FileNotFoundError:
    print("Please make sure train.csv and test.csv are in a 'dataset' folder.")
    # Fall back to empty frames so this cell itself still finishes.
    train_df, test_df = pd.DataFrame(), pd.DataFrame()

# Show the shape and the first rows of each split.
for banner, frame in (("--- Training Data ---", train_df),
                      ("\n--- Test Data ---", test_df)):
    print(banner)
    print(f"Shape: {frame.shape}")
    print(frame.head())

In [None]:
def parse_catalog_content(text):
    """Extract pack size, value and unit from one raw catalog entry.

    Parameters
    ----------
    text : str
        Raw catalog text. Anything that is not a string (for example the NaN
        pandas stores for an empty CSV cell) is treated as containing none of
        the fields, so the defaults are returned instead of raising.

    Returns
    -------
    tuple
        ``(pack_size, value, unit)``:

        * ``pack_size`` -- int from ``(Pack of N)`` (case-insensitive), 1 if absent.
        * ``value`` -- float from ``Value: X``, ``np.nan`` if absent or malformed.
        * ``unit`` -- the letters following ``Unit: ``, ``'Unknown'`` if absent.
    """
    if not isinstance(text, str):
        return 1, np.nan, 'Unknown'

    # Pack Size: look for "(Pack of X)".
    pack_size_match = re.search(r'\(Pack of (\d+)\)', text, re.IGNORECASE)
    pack_size = int(pack_size_match.group(1)) if pack_size_match else 1

    # Value: capture exactly one well-formed decimal ("12", "12.", "12.5", ".5").
    # A loose character class such as [\d.]+ would also capture "." or "1.2.3",
    # which float() rejects with ValueError.
    value_match = re.search(r'Value: (\d+(?:\.\d*)?|\.\d+)', text)
    value = float(value_match.group(1)) if value_match else np.nan

    # Unit: letters only, so a number after "Unit:" is not mistaken for a unit.
    unit_match = re.search(r'Unit: ([a-zA-Z]+)', text)
    unit = unit_match.group(1) if unit_match else 'Unknown'

    return pack_size, value, unit

# Derive the Pack_Size / Value / Unit columns on both splits.
print("Parsing catalog_content for train and test sets...")
parsed_columns = ['Pack_Size', 'Value', 'Unit']
for frame in (train_df, test_df):
    frame[parsed_columns] = frame['catalog_content'].apply(
        lambda raw: pd.Series(parse_catalog_content(raw))
    )
print("Parsing complete.")

# Integer-encode the units. The encoder is fitted on the union of train and
# test units so that transforming either split never meets an unseen label.
print("Encoding the 'Unit' feature...")
all_units = pd.concat([train_df['Unit'], test_df['Unit']]).astype(str).unique()

unit_encoder = LabelEncoder()
unit_encoder.fit(all_units)

for frame in (train_df, test_df):
    frame['Unit_Encoded'] = unit_encoder.transform(frame['Unit'].astype(str))
print("Encoding complete.")


# Preview the parsed columns for both splits.
preview_columns = ['sample_id', 'Pack_Size', 'Value', 'Unit', 'Unit_Encoded']

print("\n--- Training Data After Parsing ---")
print(train_df[preview_columns].head())

print("\n--- Test Data After Parsing ---")
print(test_df[preview_columns].head())

In [None]:
# Cell 3b: Create Log-Transformed Numerical Features

# np.log1p computes log(1 + x), so zero-valued entries map to 0 rather than -inf.
# The transformed copy is stored next to the source column as "<name>_log".
for frame in (train_df, test_df):
    for source_col in ('Pack_Size', 'Value'):
        frame[source_col + '_log'] = np.log1p(frame[source_col])

print("--- Created Log-Transformed Features ---")
print(train_df[['Pack_Size', 'Pack_Size_log', 'Value', 'Value_log']].head())

In [None]:
# Cell 3c: Create a Clean Text Column for Embeddings

def clean_text(text):
    """Strip catalog boilerplate from ``text`` and normalise its whitespace.

    The field labels ``Item Name:``, ``Bullet Point N:`` and
    ``Product Description:`` are removed, as are the whole ``Value: <number>``
    and ``Unit: <word>`` fields. Every run of whitespace (including newlines)
    is then collapsed to a single space.

    Parameters
    ----------
    text : str
        Raw catalog text. Non-string input (e.g. the NaN pandas stores for an
        empty CSV cell) yields an empty string instead of raising TypeError.

    Returns
    -------
    str
        The cleaned, single-line text.
    """
    if not isinstance(text, str):
        return ''

    # Applied one after another, in this order: removing an earlier pattern can
    # expose a match for a later one, so they are not merged into one regex.
    boilerplate_patterns = (
        r'Item Name:',
        r'Bullet Point \d+:',
        r'Value: [\d.]+',
        r'Unit: \w+',
        r'Product Description:',
    )
    for pattern in boilerplate_patterns:
        text = re.sub(pattern, '', text)

    # split() with no argument drops all whitespace runs, newlines included.
    return ' '.join(text.split())

print("Creating clean text column...")
# Store the boilerplate-free text alongside the raw column on both splits.
for frame in (train_df, test_df):
    frame['clean_catalog_content'] = frame['catalog_content'].map(clean_text)

print("--- Sample Cleaned Text ---")
first_cleaned = train_df['clean_catalog_content'].iloc[0]
print(first_cleaned)

In [None]:
# Cell 3d: Advanced Feature Engineering (Brand, Item Size, Text Length) - CORRECTED

# --- 1. Extract Brand Name (with robust error handling) ---

def extract_brand(item_name):
    """Return the upper-cased first whitespace-separated token of ``item_name``.

    ``'UNKNOWN'`` is returned when there is no first token (empty or
    whitespace-only string) or when ``item_name`` does not support the string
    operations used here (for example ``None`` or a number).
    """
    try:
        first_token = item_name.split()[0]
        result = first_token.upper()
    except (IndexError, AttributeError):
        # IndexError: nothing to split out; AttributeError: not string-like.
        return 'UNKNOWN'
    return result

def get_brand_from_catalog(catalog_text):
    """Find the 'Item Name:' field in a catalog entry and extract its brand.

    The brand is whatever ``extract_brand`` returns for the rest of the line
    that follows the first ``Item Name:`` marker.

    Parameters
    ----------
    catalog_text : str
        Raw catalog text. Non-string input (e.g. ``None`` or the NaN pandas
        stores for an empty CSV cell) is treated like text without the marker.

    Returns
    -------
    str
        The brand, or ``'UNKNOWN'`` when the input is not a string, the marker
        is absent, or ``extract_brand`` finds no usable first word.
    """
    # Guard first: `'Item Name:' in <non-string>` would raise TypeError.
    if not isinstance(catalog_text, str) or 'Item Name:' not in catalog_text:
        return 'UNKNOWN'

    # Text after the first marker, cut at the end of that line.
    item_name_section = catalog_text.split('Item Name:')[1]
    item_name = item_name_section.split('\n', 1)[0].strip()
    return extract_brand(item_name)

print("Extracting brand names...")
# Pull the brand out of the raw catalog text for both splits.
for frame in (train_df, test_df):
    frame['brand'] = frame['catalog_content'].map(get_brand_from_catalog)

# Integer-encode the brands; the encoder sees train and test brands together
# so that transforming either split never meets an unseen label.
all_brands = pd.concat([train_df['brand'], test_df['brand']]).astype(str).unique()
brand_encoder = LabelEncoder()
brand_encoder.fit(all_brands)
for frame in (train_df, test_df):
    frame['brand_encoded'] = brand_encoder.transform(frame['brand'].astype(str))
print("Brand feature created.")


# --- 2. Create Interaction and Text Statistic Features ---
print("Creating interaction and text statistic features...")
for frame in (train_df, test_df):
    # Value per pack unit; the small epsilon keeps the denominator non-zero.
    frame['item_size'] = frame['Value'] / (frame['Pack_Size'] + 1e-6)
    # Character count of the cleaned catalog text.
    frame['desc_length'] = frame['clean_catalog_content'].str.len()
print("Additional features created.")


print("\n--- Sample of New Features ---")
new_feature_columns = ['brand', 'brand_encoded', 'item_size', 'desc_length']
print(train_df[new_feature_columns].head())

In [None]:
# Load the sentence transformer model
text_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Encode the cleaned text, not the raw catalog text. `clean_catalog_content`
# was built for exactly this purpose: it drops the repeated field labels and
# the Value/Unit fields (already captured as numeric features), so the encoder
# input is not spent on boilerplate that is identical across products.
print("Generating text embeddings for training data...")
train_text_embeddings = text_model.encode(
    train_df['clean_catalog_content'].tolist(), show_progress_bar=True
)

print("Generating text embeddings for test data...")
test_text_embeddings = text_model.encode(
    test_df['clean_catalog_content'].tolist(), show_progress_bar=True
)

# Convert to DataFrames with one `txt_<i>` column per embedding dimension.
train_text_embed_df = pd.DataFrame(
    train_text_embeddings,
    columns=[f'txt_{i}' for i in range(train_text_embeddings.shape[1])],
)
test_text_embed_df = pd.DataFrame(
    test_text_embeddings,
    columns=[f'txt_{i}' for i in range(test_text_embeddings.shape[1])],
)


print(f"\nText embedding shape for training data: {train_text_embed_df.shape}")
print("Sample text embedding DataFrame:")
print(train_text_embed_df.head())

In [None]:
# Cell 5: Generate Image Embeddings from Local Files

# --- Report whether the local image folder is present ---
IMAGE_FOLDER = 'images'
if os.path.exists(IMAGE_FOLDER):
    print(f" Image folder '{IMAGE_FOLDER}' found. Proceeding with embedding.")
else:
    print(f" ERROR: The '{IMAGE_FOLDER}' directory was not found.")
    print("Please run the image download cell first.")

# --- Image Model Setup ---
# The model's output is used directly as the image embedding further down.
img_model = timm.create_model(
    'efficientnet_b0', pretrained=True, num_classes=0, global_pool='avg'
)
img_model = img_model.to(device)
img_model.eval()

# Preprocessing follows the model's own default config: its spatial input size
# (input_size without the leading channel entry) and normalisation statistics.
config = img_model.default_cfg
target_hw = config['input_size'][1:]
transform = transforms.Compose([
    transforms.Resize(target_hw),
    transforms.CenterCrop(target_hw),
    transforms.ToTensor(),
    transforms.Normalize(mean=config['mean'], std=config['std']),
])

# --- Image Feature Extraction Function ---
def get_image_embedding(sample_id, model, device, transform, image_folder=IMAGE_FOLDER):
    """Embed the image ``<image_folder>/<sample_id>.jpg`` with ``model``.

    Parameters
    ----------
    sample_id
        Identifier used to build the file name ``f"{sample_id}.jpg"``.
    model
        Callable applied to a single-image batch; its output is moved to the
        CPU and flattened to 1-D.
    device
        Device the preprocessed image batch is moved to before the forward pass.
    transform
        Callable turning an RGB PIL image into a tensor.
    image_folder : str
        Directory that holds the images.

    Returns
    -------
    numpy.ndarray
        The flattened model output, or a zero vector of length 1280 when the
        file is missing or cannot be processed.
    """
    import warnings  # local import: keeps this cell self-contained

    fallback_dim = 1280  # length of the zero vector returned for unusable images
    image_path = os.path.join(image_folder, f"{sample_id}.jpg")

    if not os.path.exists(image_path):
        return np.zeros(fallback_dim)

    try:
        # The context manager closes the file handle deterministically.
        # convert() returns an independent in-memory image, so it stays usable
        # after the file is closed.
        with Image.open(image_path) as raw_img:
            img = raw_img.convert('RGB')
        batch_img = transform(img).unsqueeze(0).to(device)
        with torch.no_grad():
            embedding = model(batch_img)
        return embedding.cpu().numpy().flatten()
    except Exception as exc:
        # Still best-effort (one bad image must not abort the run), but surface
        # the failure instead of silently replacing it with zeros.
        warnings.warn(
            f"Could not embed image '{image_path}' "
            f"({type(exc).__name__}: {exc}); using a zero vector.",
            RuntimeWarning,
            stacklevel=2,
        )
        return np.zeros(fallback_dim)

# --- Generate Image Embeddings ---
def _embed_sample(sample_id):
    """Embed one sample's image with the notebook-level model and transform."""
    return get_image_embedding(sample_id, img_model, device, transform)


print("\nGenerating image embeddings for training data...")
train_image_embeddings = train_df['sample_id'].progress_apply(_embed_sample)

print("Generating image embeddings for test data...")
test_image_embeddings = test_df['sample_id'].progress_apply(_embed_sample)

# --- Convert to DataFrames (one `img_<i>` column per embedding dimension) ---
image_columns = [f'img_{i}' for i in range(1280)]
train_img_embed_df = pd.DataFrame(train_image_embeddings.to_list(), columns=image_columns)
test_img_embed_df = pd.DataFrame(test_image_embeddings.to_list(), columns=image_columns)

print(f"\nImage embedding shape: {train_img_embed_df.shape}")
print("Sample of new image embedding DataFrame (should NOT be all zeros):")
print(train_img_embed_df.head())

In [None]:
print("\n" + "="*60)
print("Combining All Features")
print("="*60)

# Engineered numerical / label-encoded columns that go into the model.
numerical_features = [
    'Pack_Size_log',
    'Value_log',
    'Unit_Encoded',
    'brand_encoded',
    'item_size',
    'desc_length',
]

# --- NOTE: optional keyword features ---
# The names below are only prepared here; they are added to the feature list
# only if the extend() line is uncommented, which requires the kw_* columns
# to exist in train_df / test_df.
keywords = ['organic', 'premium', 'natural', 'gourmet', 'gluten-free', 'sugar-free', 'family size', 'bulk', 'value pack']
keyword_cols = [f'kw_{k}' for k in keywords]
# numerical_features.extend(keyword_cols) # Uncomment this if you have keyword features


# Base tabular features, with missing values replaced by 0.
X_train_base = train_df[numerical_features].fillna(0)
X_test_base = test_df[numerical_features].fillna(0)


# Final matrices: tabular | text embedding | image embedding, side by side.
# Each part gets a fresh 0..n-1 index first so rows line up by position.
train_parts = [X_train_base, train_text_embed_df, train_img_embed_df]
X_train = pd.concat([part.reset_index(drop=True) for part in train_parts], axis=1)

test_parts = [X_test_base, test_text_embed_df, test_img_embed_df]
X_test = pd.concat([part.reset_index(drop=True) for part in test_parts], axis=1)

# Target: the price, plus its log1p transform used for training.
y_train = train_df['price']
y_train_log = np.log1p(y_train)

# Report the final shapes and the tabular features in use.
print(f"X_train shape: {X_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"y_train_log shape: {y_train_log.shape}")
print(f"\nUsing the following {len(numerical_features)} numerical/categorical features:")
print(numerical_features)

In [None]:
print("\n" + "="*60)
print("Creating Train/Validation Split")
print("="*60)

# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable.
split_parts = train_test_split(X_train, y_train_log, test_size=0.2, random_state=42)
X_train_split, X_val, y_train_log_split, y_val_log = split_parts

print(f"Training split shape: {X_train_split.shape}")
print(f"Validation split shape: {X_val.shape}")

In [None]:
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, in percent.

    Computes ``mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)) * 100``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Targets and predictions of matching shape (arrays, lists or scalars).

    Returns
    -------
    float
        The SMAPE. A pair where target and prediction are both zero counts as
        zero error instead of producing 0/0 = NaN, which would otherwise turn
        the whole mean into NaN.
    """
    y_true = np.atleast_1d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_1d(np.asarray(y_pred, dtype=float))

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # Divide only where the denominator is non-zero; elsewhere keep the 0.0
    # that `out` was initialised with (a zero denominator implies both values
    # are zero, i.e. a perfect prediction).
    ratio = np.divide(
        numerator,
        denominator,
        out=np.zeros_like(numerator),
        where=denominator != 0,
    )
    return np.mean(ratio) * 100

In [None]:
from torch.utils.data import Dataset, DataLoader
class PriceDataset(Dataset):
    """Dataset of feature rows with optional regression targets.

    Parameters
    ----------
    X : pandas.DataFrame, numpy.ndarray or nested sequence
        Feature rows; stored as a float32 tensor in ``self.X``.
    y : pandas.Series, numpy.ndarray, sequence or None
        Targets aligned with ``X``; stored as a float32 tensor in ``self.y``.
        When ``None`` the dataset is unlabeled: ``self.y`` is filled with
        zeros and indexing returns the features only.
    """
    def __init__(self, X, y=None):
        self.X = self._as_float_tensor(X)
        self.has_labels = y is not None
        if self.has_labels:
            # Same conversion path as X, so pandas, NumPy and list inputs all work.
            self.y = self._as_float_tensor(y)
        else:
            self.y = torch.zeros(len(self.X), dtype=torch.float32)

    @staticmethod
    def _as_float_tensor(data):
        """Copy ``data`` into a new float32 tensor."""
        # np.array copies, so the tensor never aliases the caller's data.
        return torch.from_numpy(np.array(data, dtype=np.float32))

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        # Labeled: (features, target); unlabeled: features only.
        if self.has_labels:
            return self.X[idx], self.y[idx]
        return self.X[idx]

In [None]:
import torch.nn as nn
class SimpleMLP(nn.Module):
    """Simple MLP for price prediction.

    A stack of ``Linear -> BatchNorm1d -> ReLU -> Dropout`` blocks, one per
    entry of ``hidden_dims``, followed by a single-output linear layer.

    Parameters
    ----------
    input_dim : int
        Number of input features per row.
    hidden_dims : sequence of int
        Width of each hidden block, in order.
    dropout : float
        Dropout probability used in every hidden block.
    """
    def __init__(self, input_dim, hidden_dims=(512, 256, 128, 64), dropout=0.3):
        # The default is an immutable tuple, so it cannot be shared and mutated
        # across instances; lists passed by callers are still accepted.
        super().__init__()

        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim

        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Map a ``(batch, input_dim)`` tensor to ``(batch,)`` predictions."""
        # Squeeze only the trailing size-1 output dimension. A bare squeeze()
        # would also drop the batch dimension for a batch of one and return a
        # 0-d tensor.
        return self.network(x).squeeze(-1)


class AdvancedMultimodalMLP(nn.Module):
    """Advanced MLP with separate branches for different feature types.

    The input row is expected to be laid out as
    ``[numerical | text embedding | image embedding]``. Each slice goes through
    its own small encoder; the three encodings are concatenated and passed to
    a fusion network that outputs one value per row.

    Parameters
    ----------
    num_features : int
        Number of leading numerical columns.
    text_features : int
        Number of text-embedding columns that follow the numerical ones.
    img_features : int
        Number of image-embedding columns; all remaining columns are treated
        as the image slice.
    """
    def __init__(self, num_features, text_features, img_features):
        super().__init__()

        self.num_features = num_features
        self.text_features = text_features
        self.img_features = img_features

        # One encoder per modality (creation order kept: num, text, img, fusion).
        self.num_net = self._branch(num_features, 64, 32)
        self.text_net = self._branch(text_features, 256, 128)
        self.img_net = self._branch(img_features, 512, 256)

        # Fusion network over the concatenated branch outputs.
        fusion_input_dim = 32 + 128 + 256
        self.fusion = nn.Sequential(
            nn.Linear(fusion_input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    @staticmethod
    def _branch(in_dim, hidden_dim, out_dim):
        """Build a Linear -> BatchNorm1d -> ReLU -> Dropout(0.3) -> Linear encoder."""
        return nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(hidden_dim, out_dim)
        )

    def forward(self, x):
        """Map a ``(batch, num + text + img)`` tensor to ``(batch,)`` predictions."""
        # Split the input columns into the three modalities.
        text_start = self.num_features
        img_start = self.num_features + self.text_features
        num_feat = x[:, :text_start]
        text_feat = x[:, text_start:img_start]
        img_feat = x[:, img_start:]

        # Encode each modality, then fuse.
        combined = torch.cat(
            [self.num_net(num_feat), self.text_net(text_feat), self.img_net(img_feat)],
            dim=1,
        )
        output = self.fusion(combined)

        # Squeeze only the trailing size-1 output dimension so a batch of one
        # still yields shape (1,) rather than a 0-d tensor.
        return output.squeeze(-1)

In [None]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch.optim as optim
def train_mlp(X_train, y_train, X_val, y_val,
              model_type='simple',
              num_epochs=100,
              batch_size=256,
              lr=0.001,
              num_features=3,
              text_features=384,
              img_features=1280,
              checkpoint_path='best_mlp_model.pth'):
    """Train an MLP on standardised features and return the best model.

    Features are standardised with a ``StandardScaler`` fitted on ``X_train``.
    The model is trained with MSE loss, AdamW, gradient-norm clipping at 1.0
    and a ReduceLROnPlateau schedule driven by the validation loss. Validation
    SMAPE is computed after ``expm1`` is applied to targets and predictions.
    The weights with the lowest validation SMAPE are kept, and training stops
    early after 15 epochs without improvement.

    Parameters
    ----------
    X_train, X_val
        2-D feature tables accepted by ``StandardScaler`` and exposing ``.shape``.
    y_train, y_val
        Targets handed to ``PriceDataset``.
    model_type : str
        ``'simple'`` builds ``SimpleMLP``; any other value builds
        ``AdvancedMultimodalMLP``.
    num_epochs, batch_size, lr
        Maximum epochs, mini-batch size and AdamW learning rate.
    num_features, text_features, img_features : int
        Column counts per modality; used by the advanced model only.
    checkpoint_path : str
        File that receives the best weights, the scaler and the best SMAPE
        each time the validation SMAPE improves.

    Returns
    -------
    tuple
        ``(model, scaler, best_smape)``, with ``model`` holding the best weights.

    Raises
    ------
    ValueError
        If the advanced model is requested and the three feature counts do not
        add up to the number of columns in ``X_train``.
    """
    print(f"\n{'='*60}")
    print(f"Training {model_type.upper()} MLP Model")
    print(f"{'='*60}")

    # Scale features (statistics come from the training split only).
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Create datasets
    train_dataset = PriceDataset(X_train_scaled, y_train)
    val_dataset = PriceDataset(X_val_scaled, y_val)

    # BatchNorm1d cannot compute batch statistics from one sample in train
    # mode, so a trailing batch of exactly one row is dropped.
    n_train = len(train_dataset)
    drop_single_tail = n_train > batch_size and n_train % batch_size == 1
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_single_tail)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Initialize model
    input_dim = X_train.shape[1]

    if model_type == 'simple':
        model = SimpleMLP(input_dim).to(device)
    else:  # advanced
        expected_dim = num_features + text_features + img_features
        if expected_dim != input_dim:
            # Fail early with a readable message rather than a matrix-shape
            # error on the first forward pass.
            raise ValueError(
                f"num_features + text_features + img_features = {expected_dim}, "
                f"but X_train has {input_dim} columns."
            )
        model = AdvancedMultimodalMLP(num_features, text_features, img_features).to(device)

    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")

    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5)

    # Early-stopping bookkeeping
    best_smape = float('inf')
    best_state = None  # in-memory copy of the best weights seen in THIS run
    patience_counter = 0
    patience = 15

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_losses = []

        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)

            optimizer.zero_grad()
            # reshape(-1) keeps predictions 1-D even if the model returns a
            # 0-d tensor for a single-row batch.
            predictions = model(X_batch).reshape(-1)
            loss = criterion(predictions, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            train_losses.append(loss.item())

        # Validation phase
        model.eval()
        val_losses = []
        all_preds = []
        all_targets = []

        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                predictions = model(X_batch).reshape(-1)
                loss = criterion(predictions, y_batch)

                val_losses.append(loss.item())
                all_preds.extend(predictions.cpu().numpy())
                all_targets.extend(y_batch.cpu().numpy())

        avg_train_loss = np.mean(train_losses)
        avg_val_loss = np.mean(val_losses)
        # expm1 is applied to both targets and predictions before scoring.
        val_smape = smape(np.expm1(all_targets), np.expm1(all_preds))

        scheduler.step(avg_val_loss)

        # Print progress
        if (epoch + 1) % 5 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}] "
                  f"Train Loss: {avg_train_loss:.4f} | "
                  f"Val Loss: {avg_val_loss:.4f} | "
                  f"Val SMAPE: {val_smape:.4f}%")

        # Early stopping
        if val_smape < best_smape:
            best_smape = val_smape
            patience_counter = 0
            # Snapshot the weights; clone() detaches the copy from later updates.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}
            torch.save({
                'model_state_dict': best_state,
                'scaler': scaler,
                'best_smape': best_smape,
            }, checkpoint_path)
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"\nEarly stopping at epoch {epoch+1}")
                break

    # Restore the best weights from memory. Reloading the file instead could
    # pick up a checkpoint written by an earlier run (or a different model) if
    # this run never improved, or fail if the file does not exist.
    if best_state is not None:
        model.load_state_dict(best_state)
    else:
        print("Warning: validation SMAPE never improved; "
              "returning the model as it was after the last epoch.")

    print(f"\n{'='*60}")
    print(f"Best Validation SMAPE: {best_smape:.4f}%")
    print(f"{'='*60}")

    return model, scaler, best_smape

In [None]:

# Train the single-stack baseline on the 80/20 split created above.
mlp_simple, scaler_simple, smape_simple = train_mlp(
    X_train=X_train_split,
    y_train=y_train_log_split,
    X_val=X_val,
    y_val=y_val_log,
    model_type='simple',
    num_epochs=100,
    batch_size=256,
    lr=0.001,
)

In [None]:
# Cell to call the training function for the multimodal model

# Derive the per-modality widths from the objects that built X_train rather
# than from a hand-copied feature list. A copy has to be edited in step with
# the original (e.g. when keyword features are enabled) and silently drifts
# out of sync otherwise.
numerical_features_list = list(numerical_features)

num_features_count = len(numerical_features_list)
text_features_count = train_text_embed_df.shape[1]
img_features_count = train_img_embed_df.shape[1]

# The model slices its input by these counts, so they must add up to the
# width of the training matrix.
expected_width = num_features_count + text_features_count + img_features_count
if expected_width != X_train_split.shape[1]:
    raise ValueError(
        f"Feature counts add up to {expected_width}, "
        f"but X_train_split has {X_train_split.shape[1]} columns."
    )

print(f"Using feature counts -> Numerical: {num_features_count}, Text: {text_features_count}, Image: {img_features_count}")
# --------------------------------

mlp_advanced, scaler_advanced, smape_advanced = train_mlp(
    X_train_split,
    y_train_log_split,
    X_val,
    y_val_log,
    model_type='advanced',
    num_epochs=100,
    batch_size=256,
    lr=0.001,
    num_features=num_features_count,    # leading numerical columns
    text_features=text_features_count,  # text-embedding columns
    img_features=img_features_count     # image-embedding columns
)
