In [1]:
!pip install sentence_transformers




In [None]:
# --- Core Libraries ---
import pandas as pd
import numpy as np
import re
from tqdm.notebook import tqdm
import os

# --- Machine Learning ---
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# --- Deep Learning (for Embeddings) ---
import torch
import timm
from sentence_transformers import SentenceTransformer
from PIL import Image
from torchvision import transforms
from sklearn.preprocessing import LabelEncoder, StandardScaler

# --- Setup ---
# Pick the compute device: CUDA when torch can see a GPU, CPU otherwise
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Register tqdm's pandas integration (provides .progress_apply on Series/DataFrames)
tqdm.pandas()

print(f"Setup complete. Using device: {device}")

Setup complete. Using device: cuda


In [27]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader

# Allow-list StandardScaler for torch's restricted (weights_only=True) unpickler.
# NOTE(review): the class is imported from sklearn's private `_data` module, whose
# location may change between sklearn releases — presumably it is the same class as
# the public sklearn.preprocessing.StandardScaler; confirm before relying on it.
from sklearn.preprocessing._data import StandardScaler as InternalStandardScaler
torch.serialization.add_safe_globals([InternalStandardScaler])

In [3]:
# Read the train/test CSVs from the local 'dataset' directory
try:
    train_df = pd.read_csv('dataset/train.csv')
    test_df = pd.read_csv('dataset/test.csv')
except FileNotFoundError:
    print("Please make sure train.csv and test.csv are in a 'dataset' folder.")
    # Fall back to empty placeholder frames so this cell still finishes
    train_df, test_df = pd.DataFrame(), pd.DataFrame()

# Preview each split: header line, shape, then the first rows
print("--- Training Data ---", f"Shape: {train_df.shape}", train_df.head(), sep="\n")

print("\n--- Test Data ---", f"Shape: {test_df.shape}", test_df.head(), sep="\n")

--- Training Data ---
Shape: (75000, 4)
   sample_id                                    catalog_content  \
0      33127  Item Name: La Victoria Green Taco Sauce Mild, ...   
1     198967  Item Name: Salerno Cookies, The Original Butte...   
2     261251  Item Name: Bear Creek Hearty Soup Bowl, Creamy...   
3      55858  Item Name: Judeeâ€™s Blue Cheese Powder 11.25 oz...   
4     292686  Item Name: kedem Sherry Cooking Wine, 12.7 Oun...   

                                          image_link  price  
0  https://m.media-amazon.com/images/I/51mo8htwTH...   4.89  
1  https://m.media-amazon.com/images/I/71YtriIHAA...  13.12  
2  https://m.media-amazon.com/images/I/51+PFEe-w-...   1.97  
3  https://m.media-amazon.com/images/I/41mu0HAToD...  30.34  
4  https://m.media-amazon.com/images/I/41sA037+Qv...  66.49  

--- Test Data ---
Shape: (75000, 3)
   sample_id                                    catalog_content  \
0     100179  Item Name: Rani 14-Spice Eshamaya's Mango Chut...   
1     245611

In [4]:
def parse_catalog_content(text):
    """Extract pack size, numeric value and unit from one catalog_content string.

    Parameters
    ----------
    text : str
        Raw catalog text. Non-string input (e.g. NaN/None for a missing cell) is
        treated as "nothing found" instead of raising.

    Returns
    -------
    tuple
        ``(pack_size, value, unit)`` where
        - pack_size (int): N from a "(Pack of N)" marker, matched case-insensitively;
          1 when no marker is present.
        - value (float): the number following "Value: "; NaN when absent or when the
          matched text is not a valid number (e.g. "1.2.3" or ".").
        - unit (str): the first run of ASCII letters following "Unit: "; 'Unknown'
          when absent. Only the first word is captured, so "Fl Oz" yields "Fl".
    """
    # Missing cells arrive as float NaN / None; re.search would raise TypeError on them
    if not isinstance(text, str):
        return 1, np.nan, 'Unknown'

    # Pack Size: Look for (Pack of X)
    pack_size_match = re.search(r'\(Pack of (\d+)\)', text, re.IGNORECASE)
    pack_size = int(pack_size_match.group(1)) if pack_size_match else 1

    # Value: Look for Value: X
    value_match = re.search(r'Value: ([\d.]+)', text)
    value = np.nan
    if value_match:
        try:
            value = float(value_match.group(1))
        except ValueError:
            # The character class also matches strings like "." or "1.2.3"
            value = np.nan

    # Unit: Look for Unit: X
    unit_match = re.search(r'Unit: ([a-zA-Z]+)', text)
    unit = unit_match.group(1) if unit_match else 'Unknown'

    return pack_size, value, unit

# Apply the parsing function
print("Parsing catalog_content for train and test sets...")
# Parse each row once and build the three columns with a single DataFrame
# construction. Wrapping every row's result in its own pd.Series (the previous
# apply(lambda x: pd.Series(...)) form) allocates one Series per row and is much slower.
train_df[['Pack_Size', 'Value', 'Unit']] = pd.DataFrame(
    train_df['catalog_content'].map(parse_catalog_content).tolist(),
    index=train_df.index,  # keep row alignment with the target frame
    columns=['Pack_Size', 'Value', 'Unit'],
)
test_df[['Pack_Size', 'Value', 'Unit']] = pd.DataFrame(
    test_df['catalog_content'].map(parse_catalog_content).tolist(),
    index=test_df.index,
    columns=['Pack_Size', 'Value', 'Unit'],
)

# Encode Unit (fit on all units)
# Fitting on train+test together guarantees transform() never meets an unseen label.
all_units = pd.concat([train_df['Unit'], test_df['Unit']]).astype(str).unique()
unit_encoder = LabelEncoder()
unit_encoder.fit(all_units)

train_df['Unit_Encoded'] = unit_encoder.transform(train_df['Unit'].astype(str))
test_df['Unit_Encoded'] = unit_encoder.transform(test_df['Unit'].astype(str))

print("Parsing and encoding complete.")
print(f"\nSample parsed data:")
print(train_df[['sample_id', 'Pack_Size', 'Value', 'Unit', 'Unit_Encoded']].head())


Parsing catalog_content for train and test sets...


Parsing and encoding complete.

Sample parsed data:
   sample_id  Pack_Size  Value   Unit  Unit_Encoded
0      33127          6  72.00     Fl            20
1     198967          4  32.00  Ounce            40
2     261251          6  11.40  Ounce            40
3      55858          1  11.25  Ounce            40
4     292686          1  12.00  Count            16


In [5]:
print("\n" + "="*60, "Generating Text Embeddings", "="*60, sep="\n")

# Sentence-transformer used to embed the raw catalog text
text_model = SentenceTransformer('all-MiniLM-L6-v2', device=device)

# Encode the training split
print("Generating text embeddings for training data...")
train_text_embeddings = text_model.encode(
    train_df['catalog_content'].to_list(),
    batch_size=32,
    show_progress_bar=True,
)

# Encode the test split with the same settings
print("Generating text embeddings for test data...")
test_text_embeddings = text_model.encode(
    test_df['catalog_content'].to_list(),
    batch_size=32,
    show_progress_bar=True,
)

# Wrap the embedding matrices in DataFrames with txt_<k> column names
train_text_embed_df = pd.DataFrame(
    train_text_embeddings,
    columns=[f'txt_{i}' for i in range(train_text_embeddings.shape[1])],
)
test_text_embed_df = pd.DataFrame(
    test_text_embeddings,
    columns=[f'txt_{i}' for i in range(test_text_embeddings.shape[1])],
)

print(f"\nText embedding shape: {train_text_embed_df.shape}")




Generating Text Embeddings


Generating text embeddings for training data...


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]

Generating text embeddings for test data...


Batches:   0%|          | 0/2344 [00:00<?, ?it/s]


Text embedding shape: (75000, 384)


In [6]:
print("\n" + "="*60, "Generating Image Embeddings", "="*60, sep="\n")

# Image Model Setup
# Build the timm EfficientNet-B0 backbone, move it to the target device and switch
# it to inference mode in one chain (.to() and .eval() both return the module).
img_model = timm.create_model(
    'efficientnet_b0',
    pretrained=True,
    num_classes=0,
    global_pool='avg',
).to(device).eval()
config = img_model.default_cfg

# Preprocessing driven by the model's own default config (input size, mean, std)
transform = transforms.Compose([
    transforms.Resize(config['input_size'][1:]),
    transforms.CenterCrop(config['input_size'][1:]),
    transforms.ToTensor(),
    transforms.Normalize(mean=config['mean'], std=config['std']),
])

# Image Feature Extraction Function
def get_image_embedding(sample_id, model, device, transform, image_folder='images', embedding_dim=1280):
    """Embed the image stored as ``<image_folder>/<sample_id>.jpg``.

    Parameters
    ----------
    sample_id : Any
        Identifier used to build the file name ``f"{sample_id}.jpg"``.
    model : callable
        Called on a single-image batch under ``torch.no_grad()``; its output is
        moved to CPU and flattened.
    device : torch.device or str
        Device the preprocessed image batch is moved to.
    transform : callable
        Maps a PIL RGB image to a tensor (a batch dimension is added afterwards).
    image_folder : str
        Directory containing the image files.
    embedding_dim : int
        Length of the zero vector returned when no embedding can be computed.
        Defaults to 1280; presumably this should match the model's output width.

    Returns
    -------
    numpy.ndarray
        1-D embedding, or ``np.zeros(embedding_dim)`` when the file is missing or
        any step of loading/inference fails.
    """
    import warnings  # local import: the notebook's import cell does not provide it

    image_path = os.path.join(image_folder, f"{sample_id}.jpg")
    if not os.path.exists(image_path):
        return np.zeros(embedding_dim)
    try:
        # The context manager closes the underlying file handle; convert() returns
        # a fully loaded copy that stays usable after the file is closed.
        with Image.open(image_path) as raw_img:
            img = raw_img.convert('RGB')
        batch_img = transform(img).unsqueeze(0).to(device)
        with torch.no_grad():
            embedding = model(batch_img)
        return embedding.cpu().numpy().flatten()
    except Exception as exc:
        # Best-effort by design: keep going with a zero vector, but do not hide the
        # failure. The message omits the path so the default warning filter shows
        # it once per exception type rather than once per image.
        warnings.warn(
            f"Failed to embed an image ({type(exc).__name__}); substituting a zero vector.",
            RuntimeWarning,
        )
        return np.zeros(embedding_dim)

# Generate Image Embeddings
# One embedding vector per sample_id, computed row by row with a progress bar
print("Generating image embeddings for training data...")
train_image_embeddings = train_df['sample_id'].progress_apply(
    lambda sid: get_image_embedding(sid, model=img_model, device=device, transform=transform)
)

print("Generating image embeddings for test data...")
test_image_embeddings = test_df['sample_id'].progress_apply(
    lambda sid: get_image_embedding(sid, model=img_model, device=device, transform=transform)
)

# Expand the per-row vectors into DataFrames with img_<k> column names
train_img_embed_df = pd.DataFrame(
    list(train_image_embeddings),
    columns=[f'img_{i}' for i in range(1280)],
)
test_img_embed_df = pd.DataFrame(
    list(test_image_embeddings),
    columns=[f'img_{i}' for i in range(1280)],
)

print(f"\nImage embedding shape: {train_img_embed_df.shape}")




Generating Image Embeddings
Generating image embeddings for training data...


  0%|          | 0/75000 [00:00<?, ?it/s]

Generating image embeddings for test data...


  0%|          | 0/75000 [00:00<?, ?it/s]


Image embedding shape: (75000, 1280)


In [7]:
print("\n" + "="*60, "Combining All Features", "="*60, sep="\n")

# Parsed tabular columns; missing entries are replaced by 0
numerical_features = ['Pack_Size', 'Value', 'Unit_Encoded']
X_train_base = train_df.loc[:, numerical_features].fillna(0)
X_test_base = test_df.loc[:, numerical_features].fillna(0)

# Column-wise concatenation: tabular columns, then text embeddings, then image embeddings.
# Every part gets a fresh 0..n-1 index first so rows line up by position.
X_train = pd.concat(
    [part.reset_index(drop=True)
     for part in (X_train_base, train_text_embed_df, train_img_embed_df)],
    axis=1,
)

X_test = pd.concat(
    [part.reset_index(drop=True)
     for part in (X_test_base, test_text_embed_df, test_img_embed_df)],
    axis=1,
)

# Target: price, modelled on the log1p scale
y_train = train_df['price']
y_train_log = np.log1p(y_train)

print(
    f"X_train shape: {X_train.shape}",
    f"X_test shape: {X_test.shape}",
    f"y_train_log shape: {y_train_log.shape}",
    sep="\n",
)






Combining All Features
X_train shape: (75000, 1667)
X_test shape: (75000, 1667)
y_train_log shape: (75000,)


In [8]:
print("\n" + "="*60, "Creating Train/Validation Split", "="*60, sep="\n")

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable
X_train_split, X_val, y_train_log_split, y_val_log = train_test_split(
    X_train,
    y_train_log,
    random_state=42,
    test_size=0.2,
)

print(
    f"Training split shape: {X_train_split.shape}",
    f"Validation split shape: {X_val.shape}",
    sep="\n",
)


Creating Train/Validation Split
Training split shape: (60000, 1667)
Validation split shape: (15000, 1667)


In [9]:
def smape(y_true, y_pred):
    """Symmetric Mean Absolute Percentage Error, in percent.

    Computes ``mean(|y_pred - y_true| / ((|y_true| + |y_pred|) / 2)) * 100``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Actual and predicted values (lists, NumPy arrays, pandas Series, ...).
        Both are converted to float arrays, so plain Python lists are accepted.

    Returns
    -------
    float
        SMAPE in percent. An element where both the actual and predicted value are
        0 contributes 0 (a perfect prediction) instead of the NaN that 0/0 would
        otherwise inject into the mean.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_true) + np.abs(y_pred)) / 2

    # The denominator is zero only when both values are zero, i.e. the error is zero too.
    # errstate silences the 0/0 warning for entries that np.where then replaces.
    with np.errstate(divide='ignore', invalid='ignore'):
        ratio = np.where(denominator == 0, 0.0, numerator / denominator)

    return np.mean(ratio) * 100

In [10]:
from torch.utils.data import Dataset, DataLoader
class PriceDataset(Dataset):
    """Map-style dataset of feature rows with optional regression targets.

    Args:
        X: 2-D features as a pandas DataFrame, NumPy array or nested sequence.
        y: Optional targets as a pandas Series, NumPy array or sequence with one
            entry per row of ``X``. When omitted, a zero placeholder is stored and
            items are returned without a label.

    Raises:
        ValueError: If ``y`` is given and its length differs from ``X``'s.
    """
    def __init__(self, X, y=None):
        # np.asarray handles DataFrame / ndarray / list uniformly; torch.tensor copies,
        # so the dataset never aliases the caller's array.
        features = np.asarray(X, dtype=np.float32)
        self.X = torch.tensor(features, dtype=torch.float32)

        self.has_labels = y is not None
        if self.has_labels:
            targets = np.asarray(y, dtype=np.float32)
            if len(targets) != len(features):
                raise ValueError(
                    f"X and y must have the same length, got {len(features)} and {len(targets)}"
                )
        else:
            # Placeholder so self.y always exists; never returned by __getitem__
            targets = np.zeros(len(features), dtype=np.float32)
        self.y = torch.tensor(targets, dtype=torch.float32)
    
    def __len__(self):
        """Number of rows."""
        return len(self.X)
    
    def __getitem__(self, idx):
        """Return ``(features, target)`` when labels were given, else ``features`` only."""
        if self.has_labels:
            return self.X[idx], self.y[idx]
        return self.X[idx]

In [35]:
import torch.nn as nn
class SimpleMLP(nn.Module):
    """Feed-forward regressor for price prediction.

    A stack of Linear -> BatchNorm1d -> ReLU -> Dropout blocks, one per entry of
    ``hidden_dims``, followed by a single-output Linear layer.

    Args:
        input_dim: Width of each input row.
        hidden_dims: Output width of each hidden block, in order.
        dropout: Dropout probability used in every hidden block.
    """
    def __init__(self, input_dim, hidden_dims=(512, 256, 128, 64), dropout=0.3):
        super().__init__()
        
        layers = []
        prev_dim = input_dim
        
        for hidden_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, hidden_dim),
                nn.BatchNorm1d(hidden_dim),
                nn.ReLU(),
                nn.Dropout(dropout)
            ])
            prev_dim = hidden_dim
        
        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)
    
    def forward(self, x):
        """Return one prediction per input row, shape ``(batch,)``."""
        # Squeeze only the trailing singleton dimension: a bare .squeeze() would also
        # drop the batch dimension for a batch of one and return a 0-d tensor.
        return self.network(x).squeeze(-1)


class AdvancedMultimodalMLP(nn.Module):
    """MLP with a separate branch per feature group, fused into one regressor.

    The input row is expected to be laid out as
    ``[num_features | text_features | remaining columns (image)]``; each slice goes
    through its own branch and the branch outputs (32 + 128 + 256 wide) are
    concatenated and passed to the fusion head, which emits one value per row.

    Args:
        num_features: Number of leading numerical columns.
        text_features: Number of text-embedding columns following them.
        img_features: Number of image-embedding columns; the image branch receives
            every column after the text slice, so the input width must equal
            ``num_features + text_features + img_features``.
    """
    def __init__(self, num_features, text_features, img_features):
        super().__init__()
        
        self.num_features = num_features
        self.text_features = text_features
        self.img_features = img_features
        
        # Numerical features branch
        self.num_net = nn.Sequential(
            nn.Linear(num_features, 64),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32)
        )
        
        # Text features branch
        self.text_net = nn.Sequential(
            nn.Linear(text_features, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128)
        )
        
        # Image features branch
        self.img_net = nn.Sequential(
            nn.Linear(img_features, 512),
            nn.BatchNorm1d(512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, 256)
        )
        
        # Fusion layer: input width is the sum of the three branch output widths
        fusion_input_dim = 32 + 128 + 256
        self.fusion = nn.Sequential(
            nn.Linear(fusion_input_dim, 256),
            nn.BatchNorm1d(256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.BatchNorm1d(128),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )
    
    def forward(self, x):
        """Return one prediction per input row, shape ``(batch,)``."""
        # Split input into different modalities
        text_end = self.num_features + self.text_features
        num_feat = x[:, :self.num_features]
        text_feat = x[:, self.num_features:text_end]
        img_feat = x[:, text_end:]
        
        # Process each modality
        num_out = self.num_net(num_feat)
        text_out = self.text_net(text_feat)
        img_out = self.img_net(img_feat)
        
        # Fuse and predict
        combined = torch.cat([num_out, text_out, img_out], dim=1)
        output = self.fusion(combined)
        
        # Squeeze only the trailing singleton dimension: a bare .squeeze() would also
        # drop the batch dimension for a batch of one and return a 0-d tensor.
        return output.squeeze(-1)

In [38]:
from sklearn.preprocessing import LabelEncoder, StandardScaler
import torch.optim as optim
def train_mlp(X_train, y_train, X_val, y_val, 
              model_type='simple',
              num_epochs=100, 
              batch_size=256, 
              lr=0.001,
              num_features=3,
              text_features=384,
              img_features=1280,
              checkpoint_path='best_mlp_model.pth',
              patience=15):
    """Train an MLP price regressor with early stopping on validation SMAPE.

    Features are standardised with a StandardScaler fitted on ``X_train`` only. The
    model is optimised with AdamW on MSE loss (targets are expected on the log1p
    scale, since SMAPE is computed after ``np.expm1``). The learning rate is halved
    when the validation loss plateaus. Whenever validation SMAPE improves, the model
    weights and scaler are written to ``checkpoint_path``; the best checkpoint is
    reloaded before returning.

    Args:
        X_train, X_val: 2-D feature tables (DataFrame or array) with equal width.
        y_train, y_val: Targets for the two splits, one per row.
        model_type: 'simple' builds SimpleMLP; any other value builds
            AdvancedMultimodalMLP.
        num_epochs: Maximum number of epochs.
        batch_size: Mini-batch size for both loaders.
        lr: Initial AdamW learning rate.
        num_features, text_features, img_features: Column counts of the three
            feature groups; only used by the advanced model.
        checkpoint_path: File the best checkpoint is written to. Pass a distinct
            path per run to keep one run from overwriting another's checkpoint.
        patience: Number of consecutive epochs without SMAPE improvement after
            which training stops.

    Returns:
        tuple: ``(model, scaler, best_smape)`` — the model with the best weights
        loaded, the fitted scaler, and the best validation SMAPE in percent
        (``inf`` if no epoch produced a finite improvement).

    Raises:
        ValueError: For the advanced model, if the three group sizes do not add up
            to the width of ``X_train``.
    """
    print(f"\n{'='*60}")
    print(f"Training {model_type.upper()} MLP Model")
    print(f"{'='*60}")
    
    input_dim = X_train.shape[1]
    use_simple = model_type == 'simple'
    if not use_simple and num_features + text_features + img_features != input_dim:
        # Fail early with a readable message instead of a shape error inside a branch
        raise ValueError(
            f"num_features + text_features + img_features = "
            f"{num_features + text_features + img_features} does not match "
            f"the input width {input_dim}"
        )
    
    # Scale features (statistics come from the training split only)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    
    # Create datasets
    train_dataset = PriceDataset(X_train_scaled, y_train)
    val_dataset = PriceDataset(X_val_scaled, y_val)
    
    # BatchNorm1d cannot normalise a single-sample batch in training mode, so drop the
    # trailing batch only in the case where it would contain exactly one row.
    n_train = len(train_dataset)
    drop_tail = n_train > batch_size and n_train % batch_size == 1
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_tail)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    
    # Initialize model
    if use_simple:
        model = SimpleMLP(input_dim).to(device)
    else:  # advanced
        model = AdvancedMultimodalMLP(num_features, text_features, img_features).to(device)
    
    print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
    
    # Loss and optimizer
    criterion = nn.MSELoss()
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=0.01)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(
        optimizer, mode='min', factor=0.5, patience=5)
    
    # Training loop state
    best_smape = float('inf')
    patience_counter = 0
    checkpoint_written = False  # True once THIS run has saved a checkpoint
    
    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_losses = []
        
        for X_batch, y_batch in train_loader:
            X_batch, y_batch = X_batch.to(device), y_batch.to(device)
            
            optimizer.zero_grad()
            # reshape(-1) keeps predictions 1-D even if the model squeezed a batch of one
            predictions = model(X_batch).reshape(-1)
            loss = criterion(predictions, y_batch)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            
            train_losses.append(loss.item())
        
        # Validation phase
        model.eval()
        val_losses = []
        pred_chunks = []
        target_chunks = []
        
        with torch.no_grad():
            for X_batch, y_batch in val_loader:
                X_batch, y_batch = X_batch.to(device), y_batch.to(device)
                predictions = model(X_batch).reshape(-1)
                loss = criterion(predictions, y_batch)
                
                val_losses.append(loss.item())
                pred_chunks.append(predictions.cpu().numpy())
                target_chunks.append(y_batch.cpu().numpy())
        
        avg_train_loss = np.mean(train_losses)
        avg_val_loss = np.mean(val_losses)
        # Undo the log1p target transform before scoring
        val_smape = smape(np.expm1(np.concatenate(target_chunks)),
                          np.expm1(np.concatenate(pred_chunks)))
        
        scheduler.step(avg_val_loss)
        
        # Print progress
        if (epoch + 1) % 5 == 0:
            print(f"Epoch [{epoch+1}/{num_epochs}] "
                  f"Train Loss: {avg_train_loss:.4f} | "
                  f"Val Loss: {avg_val_loss:.4f} | "
                  f"Val SMAPE: {val_smape:.4f}%")
        
        # Early stopping (a NaN SMAPE never compares as an improvement)
        if val_smape < best_smape:
            best_smape = val_smape
            patience_counter = 0
            torch.save({
                'model_state_dict': model.state_dict(),
                'scaler': scaler,
                'best_smape': best_smape,
            }, checkpoint_path)
            checkpoint_written = True
        else:
            patience_counter += 1
            if patience_counter >= patience:
                print(f"\nEarly stopping at epoch {epoch+1}")
                break
    
    # Load best model
    if checkpoint_written:
        # SECURITY: weights_only=False performs a full unpickle (the checkpoint holds
        # the sklearn scaler object). This is acceptable only because the file was
        # written by this very call; never load an untrusted file this way.
        checkpoint = torch.load(checkpoint_path, map_location=device, weights_only=False)
        model.load_state_dict(checkpoint['model_state_dict'])
        scaler = checkpoint['scaler']
    else:
        # Nothing was saved in this run (no epochs, or SMAPE was never finite).
        # Reading checkpoint_path here would pick up a stale file from an earlier run.
        print("Warning: no checkpoint was saved during this run; "
              "returning the model in its final state.")
    
    print(f"\n{'='*60}")
    print(f"Best Validation SMAPE: {best_smape:.4f}%")
    print(f"{'='*60}")
    
    return model, scaler, best_smape

In [39]:

# Baseline run: single-stack MLP over the concatenated feature vector
mlp_simple, scaler_simple, smape_simple = train_mlp(
    X_train=X_train_split,
    y_train=y_train_log_split,
    X_val=X_val,
    y_val=y_val_log,
    model_type='simple',
    num_epochs=100, batch_size=256, lr=0.001,
)


Training SIMPLE MLP Model


Model parameters: 1,028,481
Epoch [5/100] Train Loss: 0.6161 | Val Loss: 0.5806 | Val SMAPE: 59.2860%
Epoch [10/100] Train Loss: 0.5143 | Val Loss: 0.5517 | Val SMAPE: 57.4907%
Epoch [15/100] Train Loss: 0.4495 | Val Loss: 0.5367 | Val SMAPE: 56.0163%
Epoch [20/100] Train Loss: 0.3998 | Val Loss: 0.5636 | Val SMAPE: 57.4143%
Epoch [25/100] Train Loss: 0.3444 | Val Loss: 0.5292 | Val SMAPE: 54.7974%
Epoch [30/100] Train Loss: 0.3168 | Val Loss: 0.5297 | Val SMAPE: 54.5436%
Epoch [35/100] Train Loss: 0.2845 | Val Loss: 0.5355 | Val SMAPE: 54.5919%
Epoch [40/100] Train Loss: 0.2765 | Val Loss: 0.5406 | Val SMAPE: 54.7032%
Epoch [45/100] Train Loss: 0.2677 | Val Loss: 0.5452 | Val SMAPE: 54.8856%
Epoch [50/100] Train Loss: 0.2629 | Val Loss: 0.5336 | Val SMAPE: 54.1619%
Epoch [55/100] Train Loss: 0.2599 | Val Loss: 0.6008 | Val SMAPE: 57.7154%
Epoch [60/100] Train Loss: 0.2575 | Val Loss: 0.5331 | Val SMAPE: 54.1026%
Epoch [65/100] Train Loss: 0.2572 | Val Loss: 0.5350 | Val SMAPE: 54.0778

In [40]:
# Multimodal run: separate branches for the numeric, text and image column groups
mlp_advanced, scaler_advanced, smape_advanced = train_mlp(
    X_train=X_train_split,
    y_train=y_train_log_split,
    X_val=X_val,
    y_val=y_val_log,
    model_type='advanced',
    num_epochs=100, batch_size=256, lr=0.001,
    num_features=3,     # Pack_Size, Value, Unit_Encoded
    text_features=384,  # Text embedding size
    img_features=1280,  # Image embedding size
)



Training ADVANCED MLP Model


Model parameters: 1,071,393
Epoch [5/100] Train Loss: 0.5490 | Val Loss: 0.5798 | Val SMAPE: 58.6892%
Epoch [10/100] Train Loss: 0.4702 | Val Loss: 0.5456 | Val SMAPE: 55.8585%
Epoch [15/100] Train Loss: 0.4232 | Val Loss: 0.5557 | Val SMAPE: 57.7974%
Epoch [20/100] Train Loss: 0.3836 | Val Loss: 0.5758 | Val SMAPE: 58.3562%
Epoch [25/100] Train Loss: 0.3377 | Val Loss: 0.5207 | Val SMAPE: 53.3606%
Epoch [30/100] Train Loss: 0.3233 | Val Loss: 0.5275 | Val SMAPE: 53.9032%
Epoch [35/100] Train Loss: 0.3119 | Val Loss: 0.5255 | Val SMAPE: 54.1479%
Epoch [40/100] Train Loss: 0.2892 | Val Loss: 0.5493 | Val SMAPE: 56.0553%

Early stopping at epoch 40

Best Validation SMAPE: 53.3606%
