In [1]:
"""
Optimized DistilBERT Baseline for Amazon ML Challenge
Text → DistilBERT → Dense Layers → Price Prediction
Target: Minimize SMAPE score
"""

import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import DistilBertTokenizer, DistilBertModel, get_cosine_schedule_with_warmup
from sklearn.model_selection import KFold
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

class Config:
    """Central settings for the K-fold DistilBERT price-regression pipeline."""

    # --- File locations (relative to the working directory) ---
    TRAIN_PATH = 'train.csv'
    TEST_PATH = 'test.csv'
    OUTPUT_PATH = 'submission.csv'

    # --- Encoder checkpoint and tokenizer length ---
    MODEL_NAME = 'distilbert-base-uncased'  # original note: Apache 2.0 license
    MAX_LEN = 128  # texts are padded/truncated to this many tokens

    # --- Optimisation hyper-parameters ---
    BATCH_SIZE = 32  # lower this if GPU memory is tight
    EPOCHS = 5
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1  # fraction of total steps used as warmup

    # --- Cross-validation split ---
    N_FOLDS = 5
    SEED = 42

    # --- Compute device: GPU when one is visible, CPU otherwise ---
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # --- Target transform ---
    # Original note: prices range from about 0.13 to 2796, so the target is
    # trained in log1p space to stabilise optimisation.
    USE_LOG_PRICE = True

# ============================================================================
# METRICS
# ============================================================================
def smape(y_true, y_pred):
    """
    Symmetric Mean Absolute Percentage Error (lower is better).

    Formula: (1/n) * Σ(|y_pred - y_true| / ((|y_pred| + |y_true|) / 2))

    Args:
        y_true: Ground-truth values; any array-like (list, tuple, ndarray).
        y_pred: Predicted values; array-like broadcastable against y_true.

    Returns:
        The mean per-element symmetric error as a fraction (not multiplied
        by 100). Pairs where both values are zero contribute 0.
    """
    # Coerce up front so plain Python lists work; list - list would raise.
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)

    numerator = np.abs(y_pred - y_true)
    denominator = (np.abs(y_pred) + np.abs(y_true)) / 2
    # A zero denominator implies both values are zero, so the numerator is
    # zero as well; substituting a tiny value avoids 0/0 and yields 0.
    denominator = np.where(denominator == 0, 1e-8, denominator)
    return np.mean(numerator / denominator)

# ============================================================================
# DATASET
# ============================================================================
class PriceDataset(Dataset):
    """Map-style dataset that tokenizes one text per lookup.

    Every item is a dict with flattened ``input_ids`` and ``attention_mask``
    tensors. When ``prices`` was supplied, a float32 ``price`` tensor for the
    same index is included as well.
    """

    def __init__(self, texts, prices=None, tokenizer=None, max_len=128):
        self.texts, self.prices = texts, prices
        self.tokenizer, self.max_len = tokenizer, max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # str() guards against non-string cells (e.g. NaN read from a CSV).
        encoded = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # The tokenizer returns a leading batch axis of size one; drop it.
        sample = {
            key: encoded[key].flatten()
            for key in ('input_ids', 'attention_mask')
        }

        if self.prices is None:
            return sample

        sample['price'] = torch.tensor(self.prices[idx], dtype=torch.float)
        return sample

# ============================================================================
# MODEL ARCHITECTURE
# ============================================================================
class DistilBERTPricePredictor(nn.Module):
    """DistilBERT encoder followed by an MLP head that regresses one value per text.

    The head's input width is fixed at 768, so the checkpoint named by
    ``model_name`` must produce 768-dimensional hidden states.

    Args:
        model_name: Identifier passed to ``DistilBertModel.from_pretrained``.
        dropout: Dropout probability used after each hidden layer of the head.
    """

    def __init__(self, model_name='distilbert-base-uncased', dropout=0.3):
        super().__init__()

        # Pretrained DistilBERT
        self.bert = DistilBertModel.from_pretrained(model_name)

        # Regression head with progressive dimension reduction
        self.regressor = nn.Sequential(
            nn.Linear(768, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return one prediction per row of ``input_ids`` as a 1-D tensor."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Use the first-token ([CLS]) representation as the sequence summary
        cls_output = outputs.last_hidden_state[:, 0, :]

        price = self.regressor(cls_output)

        # Squeeze only the trailing feature axis. A bare squeeze() would also
        # remove the batch axis when the batch holds a single sample, giving
        # a 0-d tensor that cannot be iterated or matched against targets.
        return price.squeeze(-1)

# ============================================================================
# TRAINING FUNCTIONS
# ============================================================================
def train_epoch(model, dataloader, optimizer, scheduler, device, scaler=None):
    """Run one optimisation pass over ``dataloader`` and return the mean batch loss.

    Each batch is expected to be a dict with ``input_ids``, ``attention_mask``
    and ``price`` tensors. The learning-rate scheduler is advanced once per
    batch. ``scaler`` is accepted for signature compatibility but is not used.
    """
    model.train()

    # Huber loss: quadratic near zero, linear beyond delta, so large errors
    # pull less strongly than they would under MSE.
    criterion = nn.HuberLoss(delta=1.0)
    running_loss = 0

    progress = tqdm(dataloader, desc='Training')
    for batch in progress:
        ids = batch['input_ids'].to(device)
        mask = batch['attention_mask'].to(device)
        targets = batch['price'].to(device)

        optimizer.zero_grad()

        loss = criterion(model(ids, mask), targets)
        loss.backward()

        # Cap the global gradient norm before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

        optimizer.step()
        scheduler.step()

        batch_loss = loss.item()
        running_loss += batch_loss
        progress.set_postfix({'loss': batch_loss})

    return running_loss / len(dataloader)

def validate_epoch(model, dataloader, device, scaler=None):
    """Score ``model`` on ``dataloader`` and return SMAPE in price space.

    Args:
        model: Module called as ``model(input_ids, attention_mask)``.
        dataloader: Yields dict batches with ``input_ids``, ``attention_mask``
            and ``price`` tensors.
        device: Device the batch tensors are moved to.
        scaler: Unused; kept for signature compatibility.

    Returns:
        Tuple ``(smape_score, predictions, actuals)`` where both arrays are in
        the original price scale when ``Config.USE_LOG_PRICE`` is set, and
        predictions are floored at 0.01.
    """
    model.eval()
    pred_chunks = []
    actual_chunks = []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc='Validation'):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            prices = batch['price'].to(device)

            preds = model(input_ids, attention_mask)

            # reshape(-1) keeps a 1-D chunk even if the model returns a 0-d
            # tensor for a single-sample batch; extending a list with a 0-d
            # numpy array would raise "iteration over a 0-d array".
            pred_chunks.append(preds.reshape(-1).cpu().numpy())
            actual_chunks.append(prices.reshape(-1).cpu().numpy())

    predictions = np.concatenate(pred_chunks) if pred_chunks else np.array([])
    actuals = np.concatenate(actual_chunks) if actual_chunks else np.array([])

    # Inverse transform if using log prices
    if Config.USE_LOG_PRICE:
        predictions = np.expm1(predictions)  # exp(x) - 1
        actuals = np.expm1(actuals)

    # Ensure positive predictions
    predictions = np.maximum(predictions, 0.01)

    # Calculate SMAPE
    smape_score = smape(actuals, predictions)

    return smape_score, predictions, actuals

# ============================================================================
# MAIN TRAINING PIPELINE
# ============================================================================
def main():
    """Run K-fold fine-tuning and write fold-averaged test predictions.

    Pipeline:
      1. Read the train/test CSVs named in ``Config``.
      2. Optionally move the ``price`` target into log1p space.
      3. For each fold, train a fresh model with AdamW and a cosine schedule
         with warmup, early-stopping on validation SMAPE (patience of 2), and
         checkpoint the best weights to ``best_model_fold{fold}.pt``.
      4. Reload each fold's best weights, predict the test set, and average
         the per-fold predictions in price space.
      5. Write ``sample_id``/``price`` to ``Config.OUTPUT_PATH``.

    Side effects: prints progress, writes one checkpoint per fold into the
    working directory, and writes the submission CSV.
    """
    print(f"üöÄ Device: {Config.DEVICE}")
    print(f"üìä Using {'log-transformed' if Config.USE_LOG_PRICE else 'raw'} prices\n")

    # Load data
    print("üìÇ Loading data...")
    train_df = pd.read_csv(Config.TRAIN_PATH)
    test_df = pd.read_csv(Config.TEST_PATH)

    print(f"Train shape: {train_df.shape}")
    print(f"Test shape: {test_df.shape}")
    print(f"Price range: {train_df['price'].min():.2f} - {train_df['price'].max():.2f}\n")

    # Prepare prices
    prices = train_df['price'].values
    if Config.USE_LOG_PRICE:
        prices = np.log1p(prices)  # log(1 + x) to handle prices close to 0
        print("‚úÖ Applied log1p transformation to prices")

    # Initialize tokenizer
    print("üî§ Loading tokenizer...")
    tokenizer = DistilBertTokenizer.from_pretrained(Config.MODEL_NAME)

    # The test set is identical for every fold, so build its loader once.
    test_dataset = PriceDataset(test_df['catalog_content'].values, None, tokenizer, Config.MAX_LEN)
    test_loader = DataLoader(test_dataset, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=2)

    # K-Fold Cross-Validation
    kfold = KFold(n_splits=Config.N_FOLDS, shuffle=True, random_state=Config.SEED)
    fold_scores = []
    test_predictions = np.zeros(len(test_df))

    for fold, (train_idx, val_idx) in enumerate(kfold.split(train_df)):
        print(f"\n{'='*60}")
        print(f"üîÑ FOLD {fold + 1}/{Config.N_FOLDS}")
        print(f"{'='*60}")

        # Split data
        train_texts = train_df.iloc[train_idx]['catalog_content'].values
        train_prices = prices[train_idx]
        val_texts = train_df.iloc[val_idx]['catalog_content'].values
        val_prices = prices[val_idx]

        # Create datasets
        train_dataset = PriceDataset(train_texts, train_prices, tokenizer, Config.MAX_LEN)
        val_dataset = PriceDataset(val_texts, val_prices, tokenizer, Config.MAX_LEN)

        train_loader = DataLoader(train_dataset, batch_size=Config.BATCH_SIZE, shuffle=True, num_workers=2)
        val_loader = DataLoader(val_dataset, batch_size=Config.BATCH_SIZE, shuffle=False, num_workers=2)

        # Initialize a fresh model for this fold
        model = DistilBERTPricePredictor(Config.MODEL_NAME).to(Config.DEVICE)

        # Optimizer and scheduler. AdamW is taken from torch.optim because no
        # bare `AdamW` name is imported; the unqualified call raised NameError.
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=Config.LEARNING_RATE,
            weight_decay=Config.WEIGHT_DECAY,
        )

        total_steps = len(train_loader) * Config.EPOCHS
        warmup_steps = int(total_steps * Config.WARMUP_RATIO)
        scheduler = get_cosine_schedule_with_warmup(optimizer, warmup_steps, total_steps)

        # Training loop with early stopping on validation SMAPE
        checkpoint_path = f'best_model_fold{fold}.pt'
        best_smape = float('inf')
        patience = 0
        max_patience = 2

        for epoch in range(Config.EPOCHS):
            print(f"\nüìà Epoch {epoch + 1}/{Config.EPOCHS}")

            train_loss = train_epoch(model, train_loader, optimizer, scheduler, Config.DEVICE)
            val_smape, _, _ = validate_epoch(model, val_loader, Config.DEVICE)

            print(f"Train Loss: {train_loss:.4f} | Val SMAPE: {val_smape:.4f}")

            # Save best model
            if val_smape < best_smape:
                best_smape = val_smape
                torch.save(model.state_dict(), checkpoint_path)
                patience = 0
                print(f"‚úÖ Best model saved! SMAPE: {best_smape:.4f}")
            else:
                patience += 1
                if patience >= max_patience:
                    print(f"‚ö†Ô∏è Early stopping triggered")
                    break

        fold_scores.append(best_smape)
        print(f"\nüéØ Fold {fold + 1} Best SMAPE: {best_smape:.4f}")

        # Predict on test set with this fold's best weights. map_location
        # lets the checkpoint load onto whichever device is active now.
        model.load_state_dict(torch.load(checkpoint_path, map_location=Config.DEVICE))

        model.eval()
        fold_chunks = []
        with torch.no_grad():
            for batch in tqdm(test_loader, desc=f'Predicting test (Fold {fold+1})'):
                input_ids = batch['input_ids'].to(Config.DEVICE)
                attention_mask = batch['attention_mask'].to(Config.DEVICE)
                preds = model(input_ids, attention_mask)
                # reshape(-1) keeps the chunk 1-D even for a one-sample batch.
                fold_chunks.append(preds.reshape(-1).cpu().numpy())

        fold_test_preds = np.concatenate(fold_chunks) if fold_chunks else np.array([])
        if Config.USE_LOG_PRICE:
            fold_test_preds = np.expm1(fold_test_preds)
        fold_test_preds = np.maximum(fold_test_preds, 0.01)

        # Equal-weight average across folds, taken in price space
        test_predictions += fold_test_preds / Config.N_FOLDS

    # Final results
    print(f"\n{'='*60}")
    print(f"üèÜ FINAL RESULTS")
    print(f"{'='*60}")
    print(f"Average SMAPE across folds: {np.mean(fold_scores):.4f} ¬± {np.std(fold_scores):.4f}")
    print(f"Fold scores: {[f'{s:.4f}' for s in fold_scores]}")

    # Create submission
    submission = pd.DataFrame({
        'sample_id': test_df['sample_id'],
        'price': test_predictions
    })
    submission.to_csv(Config.OUTPUT_PATH, index=False)
    print(f"\n‚úÖ Submission saved to {Config.OUTPUT_PATH}")
    print(f"üìä Predicted price range: {submission['price'].min():.2f} - {submission['price'].max():.2f}")

# Script entry point: run the K-fold training and submission pipeline only
# when this code is executed directly, not when it is imported.
if __name__ == "__main__":
    main()

üöÄ Device: cpu
üìä Using log-transformed prices

üìÇ Loading data...


FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

In [None]:
"""
DistilBERT Price Prediction Baseline - Amazon ML Challenge
Formatted for competition submission structure
"""

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================
class Config:
    """Settings shared by single-model training and inference in this cell."""

    # Encoder checkpoint and tokenizer padding/truncation length
    MODEL_NAME = 'distilbert-base-uncased'
    MAX_LEN = 128

    # Batch size used for both training and batched prediction
    BATCH_SIZE = 32

    # GPU when one is visible, CPU otherwise
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

    # Train on log1p(price) and invert with expm1 at prediction time
    USE_LOG_PRICE = True

    # Checkpoint file written after training and reused on later runs
    MODEL_PATH = 'trained_model.pth'

# ============================================================================
# DATASET
# ============================================================================
class PriceDataset(Dataset):
    """Dataset of texts, with optional price targets, tokenized on access.

    ``__getitem__`` returns a dict with 1-D ``input_ids`` and
    ``attention_mask`` tensors, plus a float32 ``price`` tensor when targets
    were provided at construction.
    """

    def __init__(self, texts, prices=None, tokenizer=None, max_len=128):
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.prices = prices
        self.texts = texts

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokenizer_kwargs = {
            'max_length': self.max_len,
            'padding': 'max_length',
            'truncation': True,
            'return_tensors': 'pt',
        }
        # str() tolerates non-string entries such as numbers or NaN.
        encoded = self.tokenizer(str(self.texts[idx]), **tokenizer_kwargs)

        # Drop the size-one batch axis added by return_tensors='pt'.
        sample = {}
        sample['input_ids'] = encoded['input_ids'].flatten()
        sample['attention_mask'] = encoded['attention_mask'].flatten()

        has_targets = self.prices is not None
        if has_targets:
            sample['price'] = torch.tensor(self.prices[idx], dtype=torch.float)

        return sample

# ============================================================================
# MODEL ARCHITECTURE
# ============================================================================
class DistilBERTPricePredictor(nn.Module):
    """DistilBERT encoder with an MLP regression head producing one value per text.

    The head expects 768-dimensional encoder states, so ``model_name`` must
    name a checkpoint with that hidden width.

    Args:
        model_name: Identifier passed to ``DistilBertModel.from_pretrained``.
        dropout: Dropout probability applied after each hidden layer.
    """

    def __init__(self, model_name='distilbert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)

        # 768 -> 512 -> 256 -> 128 -> 1, with LayerNorm/ReLU/Dropout between
        self.regressor = nn.Sequential(
            nn.Linear(768, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return a 1-D tensor with one prediction per input row."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # First-token ([CLS]) hidden state summarises the sequence.
        cls_output = outputs.last_hidden_state[:, 0, :]
        price = self.regressor(cls_output)
        # Remove only the trailing feature axis. A bare squeeze() would also
        # drop the batch axis for a single-sample batch and return a 0-d
        # tensor, which breaks list.extend() and loss shape matching.
        return price.squeeze(-1)

# ============================================================================
# TRAINING FUNCTION
# ============================================================================
def train_model(train_df):
    """
    Fine-tune a DistilBERTPricePredictor on ``train_df`` and checkpoint it.

    Reads the ``catalog_content`` and ``price`` columns, optionally trains on
    log1p(price), runs three epochs of AdamW with Huber loss and gradient
    clipping, then saves the weights and a small config dict to
    ``Config.MODEL_PATH``.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    print("üöÄ Starting Model Training...")
    print(f"Device: {Config.DEVICE}")
    print(f"Training samples: {len(train_df)}")

    # Inputs and regression targets
    catalog_texts = train_df['catalog_content'].values
    targets = train_df['price'].values
    if Config.USE_LOG_PRICE:
        targets = np.log1p(targets)
        print("‚úÖ Applied log1p transformation to prices")

    # Tokenizer and network
    tokenizer = DistilBertTokenizer.from_pretrained(Config.MODEL_NAME)
    model = DistilBERTPricePredictor(Config.MODEL_NAME).to(Config.DEVICE)

    # Shuffled training batches
    train_loader = DataLoader(
        PriceDataset(catalog_texts, targets, tokenizer, Config.MAX_LEN),
        batch_size=Config.BATCH_SIZE,
        shuffle=True,
        num_workers=2,
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
    criterion = nn.HuberLoss(delta=1.0)

    num_epochs = 3  # kept small for faster training
    model.train()

    for epoch_number in range(1, num_epochs + 1):
        print(f"\nüìà Epoch {epoch_number}/{num_epochs}")
        running_loss = 0

        progress = tqdm(train_loader, desc='Training')
        for batch in progress:
            ids = batch['input_ids'].to(Config.DEVICE)
            mask = batch['attention_mask'].to(Config.DEVICE)
            batch_targets = batch['price'].to(Config.DEVICE)

            optimizer.zero_grad()
            loss = criterion(model(ids, mask), batch_targets)
            loss.backward()

            # Cap the global gradient norm before stepping.
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            progress.set_postfix({'loss': batch_loss})

        print(f"Average Loss: {running_loss / len(train_loader):.4f}")

    # Persist weights plus the settings needed to interpret them
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'config': {
            'model_name': Config.MODEL_NAME,
            'use_log_price': Config.USE_LOG_PRICE,
        },
    }
    torch.save(checkpoint, Config.MODEL_PATH)
    print(f"\n‚úÖ Model saved to {Config.MODEL_PATH}")

    return model, tokenizer

# ============================================================================
# PREDICTION FUNCTION (COMPETITION FORMAT)
# ============================================================================
def predictor(sample_id, catalog_content, image_link, model=None, tokenizer=None):
    '''
    Predict the price of a single product from its catalog text.

    Parameters:
    - sample_id: Unique identifier for the sample (not used in the computation)
    - catalog_content: Text containing product title and description
    - image_link: URL to product image (not used in the computation)
    - model: Trained model called as model(input_ids, attention_mask)
    - tokenizer: Tokenizer compatible with the model

    Returns:
    - price: Predicted price rounded to two decimals, floored at 0.01

    Raises:
    - ValueError: if model or tokenizer is not supplied
    '''
    if not (model is not None and tokenizer is not None):
        raise ValueError("Model and tokenizer must be provided")

    # Encode the text as a single-row batch
    tokens = tokenizer(
        str(catalog_content),
        max_length=Config.MAX_LEN,
        padding='max_length',
        truncation=True,
        return_tensors='pt',
    )
    ids = tokens['input_ids'].to(Config.DEVICE)
    mask = tokens['attention_mask'].to(Config.DEVICE)

    # Forward pass without gradient tracking, dropout disabled
    model.eval()
    with torch.no_grad():
        raw_output = model(ids, mask)

    # Undo the log1p target transform when it was used for training
    value = raw_output.cpu().item()
    price = np.expm1(value) if Config.USE_LOG_PRICE else value

    # Floor at one cent, then round to currency precision
    return round(max(price, 0.01), 2)

# ============================================================================
# BATCH PREDICTION (OPTIMIZED FOR SPEED)
# ============================================================================
def batch_predictor(test_df, model, tokenizer, batch_size=32):
    """
    Predict prices for the whole test set in batches (faster than row-by-row).

    Args:
        test_df: DataFrame with a ``catalog_content`` column.
        model: Trained model called as ``model(input_ids, attention_mask)``.
        tokenizer: Tokenizer used to encode the texts.
        batch_size: Number of texts per forward pass.

    Returns:
        1-D numpy array of predicted prices in row order, in the original
        price scale when ``Config.USE_LOG_PRICE`` is set, floored at 0.01 and
        rounded to two decimals.
    """
    print("\nüîÆ Generating Predictions...")

    texts = test_df['catalog_content'].values
    test_dataset = PriceDataset(texts, None, tokenizer, Config.MAX_LEN)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    model.eval()
    chunks = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Predicting'):
            input_ids = batch['input_ids'].to(Config.DEVICE)
            attention_mask = batch['attention_mask'].to(Config.DEVICE)

            preds = model(input_ids, attention_mask)
            # reshape(-1) keeps the chunk 1-D even if the model returns a 0-d
            # tensor for a one-sample final batch; extending a list with a
            # 0-d numpy array would raise "iteration over a 0-d array".
            chunks.append(preds.reshape(-1).cpu().numpy())

    predictions = np.concatenate(chunks) if chunks else np.array([])

    # Inverse transform
    if Config.USE_LOG_PRICE:
        predictions = np.expm1(predictions)

    # Ensure positive prices
    predictions = np.maximum(predictions, 0.01)
    predictions = np.round(predictions, 2)

    return predictions

# ============================================================================
# MAIN EXECUTION
# ============================================================================
# Script entry point: reuse a saved checkpoint if one exists, otherwise train
# a new model, then predict prices for the test CSV and write them to disk.
if __name__ == "__main__":
    # Folder holding train.csv / test.csv; the output CSV is written here too.
    DATASET_FOLDER = 'dataset/'

    print("="*60)
    print("DistilBERT Price Prediction - Amazon ML Challenge")
    print("="*60)

    # Check if model already trained
    if os.path.exists(Config.MODEL_PATH):
        print(f"\n‚úÖ Found existing model at {Config.MODEL_PATH}")
        print("Loading trained model...")

        # Load model: rebuild the architecture, then restore the saved weights.
        # map_location places the checkpoint tensors on the active device.
        checkpoint = torch.load(Config.MODEL_PATH, map_location=Config.DEVICE)
        tokenizer = DistilBertTokenizer.from_pretrained(Config.MODEL_NAME)
        model = DistilBERTPricePredictor(Config.MODEL_NAME).to(Config.DEVICE)
        model.load_state_dict(checkpoint['model_state_dict'])

        print("‚úÖ Model loaded successfully!")

    else:
        print(f"\n‚ö†Ô∏è No trained model found. Training new model...")

        # Load training data; fail early with a clear message if it is missing
        train_path = os.path.join(DATASET_FOLDER, 'train.csv')
        if not os.path.exists(train_path):
            raise FileNotFoundError(f"Training data not found at {train_path}")

        train_df = pd.read_csv(train_path)
        print(f"Loaded {len(train_df)} training samples")

        # Train model (train_model also saves the checkpoint to Config.MODEL_PATH)
        model, tokenizer = train_model(train_df)

    # Load test data
    print(f"\nüìÇ Loading test data...")
    test_path = os.path.join(DATASET_FOLDER, 'test.csv')
    test = pd.read_csv(test_path)
    print(f"Loaded {len(test)} test samples")

    # METHOD 1: Batch Prediction (FASTER - RECOMMENDED)
    print("\nüöÄ Using batch prediction for speed...")
    test['price'] = batch_predictor(test, model, tokenizer, batch_size=Config.BATCH_SIZE)

    # METHOD 2: Row-by-row prediction (slower, but matches competition format)
    # Uncomment below if you want to use row-by-row approach
    # NOTE: the triple-quoted block below is an inert string literal used to
    # disable the code; it is evaluated and discarded, never executed.
    """
    print("\nüöÄ Using row-by-row prediction...")
    test['price'] = test.apply(
        lambda row: predictor(
            row['sample_id'], 
            row['catalog_content'], 
            row['image_link'],
            model=model,
            tokenizer=tokenizer
        ), 
        axis=1
    )
    """

    # Select only required columns
    output_df = test[['sample_id', 'price']]

    # Save predictions
    output_filename = os.path.join(DATASET_FOLDER, 'test_out.csv')
    output_df.to_csv(output_filename, index=False)

    # Summary of what was written, plus a preview of the first ten rows
    print("\n" + "="*60)
    print("‚úÖ PREDICTION COMPLETE")
    print("="*60)
    print(f"üìÅ Predictions saved to: {output_filename}")
    print(f"üìä Total predictions: {len(output_df)}")
    print(f"üí∞ Price range: ‚Çπ{output_df['price'].min():.2f} - ‚Çπ{output_df['price'].max():.2f}")
    print(f"üìà Average price: ‚Çπ{output_df['price'].mean():.2f}")
    print(f"\nüîç Sample predictions:")
    print(output_df.head(10))
    print("="*60)

In [None]:
"""
GPU-OPTIMIZED DistilBERT Training for RTX 3050 (4GB VRAM)
Maximizes throughput with mixed precision, gradient accumulation, and efficient data loading
"""
import os
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:256"
import torch
# ... rest of imports

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertTokenizer, DistilBertModel
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import autocast, GradScaler
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# OPTIMIZED CONFIGURATION FOR RTX 3050
# ============================================================================
class Config:
    """Hyper-parameters and data-loading settings for the mixed-precision training cell."""

    # --- Encoder checkpoint and tokenizer length ---
    MODEL_NAME = 'distilbert-base-uncased'
    MAX_LEN = 128

    # --- Memory / throughput knobs (author's notes target an RTX 3050) ---
    BATCH_SIZE = 48              # author's estimate: about 3.8GB at this size
    ACCUMULATION_STEPS = 1       # raise to 2 if out-of-memory errors occur
    USE_MIXED_PRECISION = True   # run forward passes under autocast

    # --- DataLoader behaviour ---
    NUM_WORKERS = 4              # worker processes for batch preparation
    PIN_MEMORY = True            # page-locked host buffers for host-to-GPU copies
    PREFETCH_FACTOR = 2          # batches queued ahead per worker

    # --- Optimisation schedule ---
    EPOCHS = 3
    LEARNING_RATE = 2e-5
    WEIGHT_DECAY = 0.01
    WARMUP_RATIO = 0.1           # fraction of optimiser steps used for warmup

    # --- Miscellaneous ---
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
    USE_LOG_PRICE = True         # train on log1p(price)
    MODEL_PATH = 'trained_model.pth'

# ============================================================================
# DATASET (WITH OPTIMIZATIONS)
# ============================================================================
class PriceDataset(Dataset):
    """Indexable collection of texts, optionally paired with price targets.

    Tokenization happens lazily inside ``__getitem__``. Each item is a dict of
    1-D tensors: ``input_ids``, ``attention_mask`` and, when targets were
    given, a float32 ``price``.
    """

    _ENCODED_FIELDS = ('input_ids', 'attention_mask')

    def __init__(self, texts, prices=None, tokenizer=None, max_len=128):
        self.texts = texts
        self.prices = prices
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Coerce to str so numeric or missing cells do not break the tokenizer.
        raw_text = str(self.texts[idx])
        encoded = self.tokenizer(
            raw_text,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

        # flatten() removes the size-one batch axis from the tokenizer output.
        sample = {name: encoded[name].flatten() for name in self._ENCODED_FIELDS}

        if self.prices is not None:
            target = torch.tensor(self.prices[idx], dtype=torch.float)
            sample['price'] = target

        return sample

# ============================================================================
# MODEL ARCHITECTURE
# ============================================================================
class DistilBERTPricePredictor(nn.Module):
    """DistilBERT encoder plus an MLP head that outputs one regression value per text.

    The first head layer is ``Linear(768, 512)``, so the checkpoint named by
    ``model_name`` must emit 768-dimensional hidden states.

    Args:
        model_name: Identifier passed to ``DistilBertModel.from_pretrained``.
        dropout: Dropout probability used after each hidden layer of the head.
    """

    def __init__(self, model_name='distilbert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)

        # Progressive reduction 768 -> 512 -> 256 -> 128 -> 1
        self.regressor = nn.Sequential(
            nn.Linear(768, 512),
            nn.LayerNorm(512),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(512, 256),
            nn.LayerNorm(256),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(256, 128),
            nn.LayerNorm(128),
            nn.ReLU(),
            nn.Dropout(dropout),

            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return a 1-D tensor holding one prediction per input row."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # The first-token ([CLS]) state is used as the sequence embedding.
        cls_output = outputs.last_hidden_state[:, 0, :]
        price = self.regressor(cls_output)
        # Squeeze the feature axis only. A bare squeeze() also collapses the
        # batch axis when the batch has one sample, producing a 0-d tensor
        # that downstream list.extend() and loss code cannot handle.
        return price.squeeze(-1)

# ============================================================================
# GPU-OPTIMIZED TRAINING FUNCTION
# ============================================================================
def train_model_optimized(train_df):
    """
    Train the price model with optional mixed precision and gradient accumulation.

    Reads ``catalog_content`` and ``price`` from ``train_df``, optionally
    trains on log1p(price), and optimises with AdamW, a cosine schedule with
    warmup, Huber loss and gradient clipping. The checkpoint is written to
    ``Config.MODEL_PATH``.

    Mixed precision and CUDA memory reporting are used only when CUDA is
    available, so the function also runs on a CPU-only machine.

    Args:
        train_df: DataFrame with ``catalog_content`` and ``price`` columns.

    Returns:
        Tuple ``(model, tokenizer)``.
    """
    cuda_available = torch.cuda.is_available()
    # autocast / GradScaler from torch.cuda.amp only apply on CUDA.
    use_amp = bool(Config.USE_MIXED_PRECISION) and cuda_available
    # Guard against 0 or negative values, which would break the modulo below.
    accum_steps = max(1, int(Config.ACCUMULATION_STEPS))

    print("üöÄ Starting GPU-Optimized Training...")
    print(f"Device: {Config.DEVICE}")
    print(f"GPU: {torch.cuda.get_device_name(0) if cuda_available else 'N/A'}")
    if cuda_available:
        # get_device_properties raises without a CUDA device, so guard it.
        print(f"VRAM Available: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")
    print(f"Training samples: {len(train_df)}")
    print(f"Batch size: {Config.BATCH_SIZE}")
    print(f"Mixed precision: {use_amp}")
    print(f"Gradient accumulation: {accum_steps}")

    # Prepare data
    texts = train_df['catalog_content'].values
    prices = train_df['price'].values

    if Config.USE_LOG_PRICE:
        prices = np.log1p(prices)
        print("‚úÖ Applied log1p transformation to prices")

    # Initialize tokenizer and model
    print("\nüî§ Loading tokenizer and model...")
    tokenizer = DistilBertTokenizer.from_pretrained(Config.MODEL_NAME)
    model = DistilBERTPricePredictor(Config.MODEL_NAME).to(Config.DEVICE)

    # Count parameters
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total parameters: {total_params:,}")
    print(f"Trainable parameters: {trainable_params:,}")

    # Create dataset and dataloader. prefetch_factor and persistent_workers
    # are only valid with worker processes, so add them conditionally.
    train_dataset = PriceDataset(texts, prices, tokenizer, Config.MAX_LEN)
    loader_kwargs = {
        'batch_size': Config.BATCH_SIZE,
        'shuffle': True,
        'num_workers': Config.NUM_WORKERS,
        'pin_memory': Config.PIN_MEMORY,
    }
    if Config.NUM_WORKERS > 0:
        loader_kwargs['prefetch_factor'] = Config.PREFETCH_FACTOR
        loader_kwargs['persistent_workers'] = True  # keep workers alive between epochs
    train_loader = DataLoader(train_dataset, **loader_kwargs)

    # Optimizer and scheduler
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=Config.LEARNING_RATE,
        weight_decay=Config.WEIGHT_DECAY
    )

    # One optimiser step per full accumulation group, plus one for a trailing
    # partial group; this ceiling division matches the loop below.
    num_batches = len(train_loader)
    steps_per_epoch = (num_batches + accum_steps - 1) // accum_steps
    total_steps = steps_per_epoch * Config.EPOCHS
    warmup_steps = int(total_steps * Config.WARMUP_RATIO)

    from transformers import get_cosine_schedule_with_warmup
    scheduler = get_cosine_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # Loss function
    loss_fn = nn.HuberLoss(delta=1.0)

    # With enabled=False the scaler is a transparent pass-through, so one
    # code path serves both precisions.
    scaler = GradScaler(enabled=use_amp)

    # Training loop
    print(f"\nüìà Starting training for {Config.EPOCHS} epochs...")
    model.train()

    for epoch in range(Config.EPOCHS):
        print(f"\n{'='*60}")
        print(f"Epoch {epoch + 1}/{Config.EPOCHS}")
        print(f"{'='*60}")

        total_loss = 0
        optimizer.zero_grad()

        pbar = tqdm(train_loader, desc=f'Epoch {epoch+1}')
        for batch_idx, batch in enumerate(pbar):
            input_ids = batch['input_ids'].to(Config.DEVICE, non_blocking=True)
            attention_mask = batch['attention_mask'].to(Config.DEVICE, non_blocking=True)
            target_prices = batch['price'].to(Config.DEVICE, non_blocking=True)

            with autocast(enabled=use_amp):
                predictions = model(input_ids, attention_mask)
                loss = loss_fn(predictions, target_prices)

            # Divide so that gradients summed over a group average out.
            scaler.scale(loss / accum_steps).backward()

            # Step at the end of each accumulation group, and on the final
            # batch so a trailing partial group is applied, not discarded
            # by the zero_grad() at the start of the next epoch.
            is_last_batch = (batch_idx + 1) == num_batches
            if (batch_idx + 1) % accum_steps == 0 or is_last_batch:
                scaler.unscale_(optimizer)  # clip in true gradient units
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
                scaler.step(optimizer)
                scaler.update()
                scheduler.step()
                optimizer.zero_grad()

            batch_loss = loss.item()
            total_loss += batch_loss

            # Update progress bar
            gpu_mem_gb = torch.cuda.memory_allocated() / 1e9 if cuda_available else 0.0
            pbar.set_postfix({
                'loss': f'{batch_loss:.4f}',
                'lr': f'{scheduler.get_last_lr()[0]:.2e}',
                'gpu_mem': f'{gpu_mem_gb:.1f}GB'
            })

        avg_loss = total_loss / len(train_loader)
        print(f"\nEpoch {epoch+1} Summary:")
        print(f"  Average Loss: {avg_loss:.4f}")
        if cuda_available:
            print(f"  Peak GPU Memory: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
        print(f"  Learning Rate: {scheduler.get_last_lr()[0]:.2e}")

        # Reset peak memory stats so each epoch reports its own peak
        if cuda_available:
            torch.cuda.reset_peak_memory_stats()

    # Save model
    print(f"\nüíæ Saving model...")
    torch.save({
        'model_state_dict': model.state_dict(),
        'config': {
            'model_name': Config.MODEL_NAME,
            'use_log_price': Config.USE_LOG_PRICE,
            'max_len': Config.MAX_LEN
        }
    }, Config.MODEL_PATH)
    print(f"‚úÖ Model saved to {Config.MODEL_PATH}")

    return model, tokenizer

# ============================================================================
# OPTIMIZED BATCH PREDICTION
# ============================================================================
def batch_predictor_optimized(test_df, model, tokenizer):
    """
    GPU-optimized batch prediction with mixed precision

    Args:
        test_df: DataFrame with a 'catalog_content' column holding the text
            to score.
        model: Trained model, called as ``model(input_ids, attention_mask)``.
        tokenizer: Tokenizer handed to PriceDataset.

    Returns:
        1-D float32 NumPy array with one price per row of ``test_df`` (row
        order preserved), floored at 0.01 and rounded to 2 decimals.
    """
    print("\nüîÆ Generating Predictions (GPU-Optimized)...")

    texts = test_df['catalog_content'].values
    test_dataset = PriceDataset(texts, None, tokenizer, Config.MAX_LEN)

    loader_kwargs = {
        'batch_size': Config.BATCH_SIZE * 2,  # Double batch size for inference
        'shuffle': False,
        'num_workers': Config.NUM_WORKERS,
        'pin_memory': Config.PIN_MEMORY,
    }
    # DataLoader only accepts prefetch_factor when worker processes are used;
    # passing it with num_workers == 0 raises a ValueError.
    if Config.NUM_WORKERS > 0:
        loader_kwargs['prefetch_factor'] = Config.PREFETCH_FACTOR
    test_loader = DataLoader(test_dataset, **loader_kwargs)

    model.eval()
    batch_outputs = []

    with torch.no_grad():
        for batch in tqdm(test_loader, desc='Predicting'):
            input_ids = batch['input_ids'].to(Config.DEVICE, non_blocking=True)
            attention_mask = batch['attention_mask'].to(Config.DEVICE, non_blocking=True)

            # Use mixed precision for inference too
            if Config.USE_MIXED_PRECISION:
                with autocast():
                    preds = model(input_ids, attention_mask)
            else:
                preds = model(input_ids, attention_mask)

            # reshape(-1): a one-sample batch can come back as a 0-d tensor if
            # the model squeezes away the batch dimension, and a 0-d array
            # cannot be iterated/extended.
            # float(): autocast can yield float16 outputs; the inverse
            # transform below should run in full precision.
            batch_outputs.append(preds.float().reshape(-1).cpu().numpy())

    if batch_outputs:
        predictions = np.concatenate(batch_outputs)
    else:
        # Empty input frame: return an empty result instead of failing.
        predictions = np.empty(0, dtype=np.float32)

    # Inverse transform
    if Config.USE_LOG_PRICE:
        predictions = np.expm1(predictions)

    # Ensure positive prices
    predictions = np.maximum(predictions, 0.01)
    predictions = np.round(predictions, 2)

    return predictions

# ============================================================================
# MAIN EXECUTION
# ============================================================================
# Script entry point: reuse a saved checkpoint when one exists, otherwise train
# a new model, then score dataset/test.csv and write dataset/test_out.csv.
if __name__ == "__main__":
    # Folder that holds train.csv / test.csv and receives the output file.
    DATASET_FOLDER = 'dataset/'
    
    print("="*60)
    print("GPU-OPTIMIZED DistilBERT Price Prediction")
    print("Optimized for NVIDIA GeForce RTX 3050")
    print("="*60)
    
    # Check GPU availability
    if not torch.cuda.is_available():
        print("‚ö†Ô∏è WARNING: CUDA not available! Running on CPU (very slow)")
    else:
        print(f"‚úÖ GPU Detected: {torch.cuda.get_device_name(0)}")
        print(f"‚úÖ CUDA Version: {torch.version.cuda}")
        print(f"‚úÖ PyTorch Version: {torch.__version__}")
    
    # Check if model already trained
    if os.path.exists(Config.MODEL_PATH):
        print(f"\n‚úÖ Found existing model at {Config.MODEL_PATH}")
        print("Loading trained model...")
        
        # Only the 'model_state_dict' entry of the checkpoint is read here; the
        # architecture and tokenizer are rebuilt from Config.MODEL_NAME.
        checkpoint = torch.load(Config.MODEL_PATH, map_location=Config.DEVICE)
        tokenizer = DistilBertTokenizer.from_pretrained(Config.MODEL_NAME)
        model = DistilBERTPricePredictor(Config.MODEL_NAME).to(Config.DEVICE)
        model.load_state_dict(checkpoint['model_state_dict'])
        
        print("‚úÖ Model loaded successfully!")
        
    else:
        print(f"\n‚ö†Ô∏è No trained model found. Training new model...")
        
        # Fail early with an explicit message when the training file is missing.
        train_path = os.path.join(DATASET_FOLDER, 'train.csv')
        if not os.path.exists(train_path):
            raise FileNotFoundError(f"Training data not found at {train_path}")
        
        print(f"üìÇ Loading training data from {train_path}")
        train_df = pd.read_csv(train_path)
        print(f"‚úÖ Loaded {len(train_df)} training samples")
        
        # Train model with GPU optimization
        model, tokenizer = train_model_optimized(train_df)
    
    # Load test data
    # NOTE(review): unlike train.csv, the test file's existence is not checked
    # first; a missing file surfaces as whatever pd.read_csv raises.
    print(f"\nüìÇ Loading test data...")
    test_path = os.path.join(DATASET_FOLDER, 'test.csv')
    test = pd.read_csv(test_path)
    print(f"‚úÖ Loaded {len(test)} test samples")
    
    # GPU-optimized batch prediction
    # Predictions are assigned positionally as a new 'price' column.
    test['price'] = batch_predictor_optimized(test, model, tokenizer)
    
    # Select only required columns
    output_df = test[['sample_id', 'price']]
    
    # Save predictions
    output_filename = os.path.join(DATASET_FOLDER, 'test_out.csv')
    output_df.to_csv(output_filename, index=False)
    
    # Summary
    print("\n" + "="*60)
    print("‚úÖ PREDICTION COMPLETE")
    print("="*60)
    print(f"üìÅ Predictions saved to: {output_filename}")
    print(f"üìä Total predictions: {len(output_df)}")
    print(f"üí∞ Price range: ‚Çπ{output_df['price'].min():.2f} - ‚Çπ{output_df['price'].max():.2f}")
    print(f"üìà Average price: ‚Çπ{output_df['price'].mean():.2f}")
    print(f"üìâ Median price: ‚Çπ{output_df['price'].median():.2f}")
    
    # GPU memory figures are only queried when CUDA is present.
    if torch.cuda.is_available():
        print(f"\nüéÆ GPU Stats:")
        print(f"  Peak VRAM Used: {torch.cuda.max_memory_allocated() / 1e9:.2f} GB")
        print(f"  Total VRAM: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    
    print(f"\nüîç Sample predictions:")
    print(output_df.head(10))
    print("="*60)

In [None]:
"""
Optimized DistilBERT Price Prediction Baseline (25k subset)
GPU-Accelerated for RTX 3050
"""

import os
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from transformers import DistilBertTokenizerFast, DistilBertModel
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================
class Config:
    """Settings for the quick 25k-subset DistilBERT baseline run."""

    # --- Data subset sizes (rows taken from train / test) ---
    TRAIN_SIZE = 25000
    TEST_SIZE = 25000

    # --- Model and tokenization ---
    MODEL_NAME = 'distilbert-base-uncased'
    MAX_LEN = 128
    MODEL_PATH = 'trained_model_25k.pth'

    # --- Training ---
    EPOCHS = 2                # kept low so iterations stay quick
    BATCH_SIZE = 64           # large batches to keep the GPU busy
    USE_LOG_PRICE = True      # train on log1p(price)
    MIXED_PRECISION = True    # turn on AMP (Automatic Mixed Precision)

    # --- Hardware: CUDA when available, CPU otherwise ---
    DEVICE = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# ============================================================================
# DATASET
# ============================================================================
class PriceDataset(Dataset):
    """Map-style dataset that tokenizes one text per item on access.

    Each item is a dict with 'input_ids' and 'attention_mask' (the tokenizer
    output with its leading dimension squeezed away). When prices are
    supplied, the item also carries a float tensor under 'price'.
    """

    # Tokenizer fields copied into every item.
    _ENCODED_KEYS = ('input_ids', 'attention_mask')

    def __init__(self, texts, prices=None, tokenizer=None, max_len=128):
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.texts = texts
        self.prices = prices

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        # Entries are coerced to str before tokenization.
        encoded = self.tokenizer(
            str(self.texts[idx]),
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )
        sample = {key: encoded[key].squeeze(0) for key in self._ENCODED_KEYS}

        # Unlabeled (inference) data: no target to attach.
        if self.prices is None:
            return sample

        sample['price'] = torch.tensor(self.prices[idx], dtype=torch.float)
        return sample

# ============================================================================
# MODEL ARCHITECTURE
# ============================================================================
class DistilBERTPricePredictor(nn.Module):
    """DistilBERT encoder followed by a small MLP that regresses one value per sample.

    Args:
        model_name: Name or path passed to ``DistilBertModel.from_pretrained``.
        dropout: Dropout probability applied after the first dense layer.
    """

    def __init__(self, model_name='distilbert-base-uncased', dropout=0.3):
        super().__init__()
        self.bert = DistilBertModel.from_pretrained(model_name)
        # Read the encoder width from its config instead of hard-coding 768 so
        # the head matches whichever checkpoint `model_name` selects.
        hidden_size = self.bert.config.hidden_size
        self.regressor = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(256, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, input_ids, attention_mask):
        """Return one prediction per input row as a 1-D tensor of length batch."""
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        # Use the hidden state at sequence position 0 as the sentence summary.
        cls_output = outputs.last_hidden_state[:, 0, :]
        price = self.regressor(cls_output)
        # squeeze(-1) drops only the trailing size-1 feature axis. A bare
        # squeeze() would also drop the batch axis for a one-sample batch and
        # return a 0-d tensor that callers cannot iterate over.
        return price.squeeze(-1)

# ============================================================================
# TRAINING FUNCTION
# ============================================================================
def train_model(train_df):
    """Fine-tune DistilBERTPricePredictor on a random subset of ``train_df``.

    Args:
        train_df: DataFrame with 'catalog_content' (text) and 'price' columns.

    Returns:
        Tuple ``(model, tokenizer)``. The model's state dict is also written
        to ``Config.MODEL_PATH``.

    Raises:
        ValueError: If ``train_df`` has no rows.
    """
    print("üöÄ Training Optimized DistilBERT Model...")
    print(f"Device: {Config.DEVICE}")

    if len(train_df) == 0:
        raise ValueError("train_df is empty; nothing to train on")

    # Subset up to TRAIN_SIZE samples. DataFrame.sample raises when asked for
    # more rows than exist, so cap the request at the frame length.
    n_train = min(Config.TRAIN_SIZE, len(train_df))
    train_df = train_df.sample(n=n_train, random_state=42).reset_index(drop=True)
    texts = train_df['catalog_content'].values
    prices = train_df['price'].values

    if Config.USE_LOG_PRICE:
        prices = np.log1p(prices)
        print("‚úÖ Applied log1p transformation to prices")

    tokenizer = DistilBertTokenizerFast.from_pretrained(Config.MODEL_NAME)
    model = DistilBERTPricePredictor(Config.MODEL_NAME).to(Config.DEVICE)

    # CUDA AMP and pinned host memory only make sense on a CUDA device;
    # requesting them on CPU merely triggers warnings.
    on_cuda = Config.DEVICE.type == 'cuda'
    use_amp = Config.MIXED_PRECISION and on_cuda

    train_dataset = PriceDataset(texts, prices, tokenizer, Config.MAX_LEN)
    train_loader = DataLoader(
        train_dataset,
        batch_size=Config.BATCH_SIZE,
        shuffle=True,
        num_workers=4,
        pin_memory=on_cuda
    )

    optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)
    loss_fn = nn.HuberLoss()
    scaler = torch.cuda.amp.GradScaler(enabled=use_amp)

    model.train()
    for epoch in range(Config.EPOCHS):
        total_loss = 0
        pbar = tqdm(train_loader, desc=f"Epoch {epoch + 1}/{Config.EPOCHS}")
        for batch in pbar:
            input_ids = batch['input_ids'].to(Config.DEVICE, non_blocking=True)
            attention_mask = batch['attention_mask'].to(Config.DEVICE, non_blocking=True)
            target_prices = batch['price'].to(Config.DEVICE, non_blocking=True)

            optimizer.zero_grad()
            with torch.cuda.amp.autocast(enabled=use_amp):
                predictions = model(input_ids, attention_mask)
                # Flatten both sides so a one-sample batch (where the model
                # output may be 0-d) is compared element-for-element rather
                # than through implicit broadcasting.
                loss = loss_fn(predictions.reshape(-1), target_prices.reshape(-1))
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            # Read the scalar once; each .item() call forces a device sync.
            batch_loss = loss.item()
            total_loss += batch_loss
            pbar.set_postfix({'loss': batch_loss})
        avg_loss = total_loss / len(train_loader)
        print(f"üìâ Epoch {epoch+1} | Avg Loss: {avg_loss:.4f}")

    torch.save(model.state_dict(), Config.MODEL_PATH)
    print(f"‚úÖ Model saved to {Config.MODEL_PATH}")
    return model, tokenizer

# ============================================================================
# PREDICTION
# ============================================================================
def batch_predictor(test_df, model, tokenizer):
    """Predict prices for the first ``Config.TEST_SIZE`` rows of ``test_df``.

    Rows are taken from the top of the frame in their existing order, so the
    i-th returned value belongs to ``test_df.iloc[i]``. A random sample here
    would shuffle the rows and detach predictions from the ids of callers
    that attach the result positionally.

    Args:
        test_df: DataFrame with a 'catalog_content' text column.
        model: Trained model, called as ``model(input_ids, attention_mask)``.
        tokenizer: Tokenizer handed to PriceDataset.

    Returns:
        1-D float32 NumPy array of length ``min(Config.TEST_SIZE, len(test_df))``
        with prices floored at 0.01 and rounded to 2 decimals.
    """
    # Cap at the frame length so small test sets do not raise.
    n_test = min(Config.TEST_SIZE, len(test_df))
    texts = test_df.iloc[:n_test]['catalog_content'].values

    # CUDA AMP and pinned host memory are only meaningful on a CUDA device.
    on_cuda = Config.DEVICE.type == 'cuda'
    use_amp = Config.MIXED_PRECISION and on_cuda

    test_dataset = PriceDataset(texts, None, tokenizer, Config.MAX_LEN)
    test_loader = DataLoader(
        test_dataset,
        batch_size=Config.BATCH_SIZE,
        shuffle=False,
        num_workers=4,
        pin_memory=on_cuda
    )

    model.eval()
    batch_outputs = []
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Predicting"):
            input_ids = batch['input_ids'].to(Config.DEVICE, non_blocking=True)
            attention_mask = batch['attention_mask'].to(Config.DEVICE, non_blocking=True)
            with torch.cuda.amp.autocast(enabled=use_amp):
                preds = model(input_ids, attention_mask)
            # reshape(-1): a one-sample batch may yield a 0-d tensor, which
            # cannot be iterated. float(): autocast can produce float16, and
            # the expm1 below should run in full precision.
            batch_outputs.append(preds.float().reshape(-1).cpu().numpy())

    if batch_outputs:
        predictions = np.concatenate(batch_outputs)
    else:
        # No rows to score: return an empty result instead of failing.
        predictions = np.empty(0, dtype=np.float32)

    if Config.USE_LOG_PRICE:
        predictions = np.expm1(predictions)
    predictions = np.maximum(predictions, 0.01)
    predictions = np.round(predictions, 2)
    return predictions

# ============================================================================
# MAIN
# ============================================================================
# Script entry point for the 25k-subset run: load both CSVs, train, predict,
# and write dataset/test_out_25k.csv.
if __name__ == "__main__":
    # Folder that holds train.csv / test.csv and receives the output file.
    DATASET_FOLDER = 'dataset/'

    print("="*60)
    print("DistilBERT Price Prediction (25k Subset - Optimized GPU)")
    print("="*60)

    train_path = os.path.join(DATASET_FOLDER, 'train.csv')
    test_path = os.path.join(DATASET_FOLDER, 'test.csv')

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)
    print(f"üìä Train: {len(train_df)} | Test: {len(test_df)}")

    # This path always trains from scratch; no saved checkpoint is loaded.
    model, tokenizer = train_model(train_df)
    preds = batch_predictor(test_df, model, tokenizer)
    # Predictions are attached positionally to the first len(preds) rows.
    # NOTE(review): this assumes batch_predictor returns values for exactly
    # those leading rows, in order - verify against batch_predictor.
    test_df = test_df.iloc[:len(preds)].copy()
    test_df['price'] = preds

    output_path = os.path.join(DATASET_FOLDER, 'test_out_25k.csv')
    test_df[['sample_id', 'price']].to_csv(output_path, index=False)
    print(f"\n‚úÖ Predictions saved to {output_path}")


DistilBERT Price Prediction (25k Subset - Optimized GPU)
📊 Train: 75000 | Test: 75000
🚀 Training Optimized DistilBERT Model...
Device: cpu
✅ Applied log1p transformation to prices


Epoch 1/2:   0%|                                                                               | 0/391 [00:00<?, ?it/s]