In [6]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import time
import optuna

# Reproducibility: seed the NumPy and PyTorch generators with one shared value.
RANDOM_SEED = 50
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Device configuration: use the GPU when PyTorch can see one, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    # Explicitly seed the current CUDA device as well.
    torch.cuda.manual_seed(RANDOM_SEED)
print(f"Using device: {device}")


Using device: cuda


In [7]:
class DiamondPriceMLP(nn.Module):
    """Multi-layer perceptron that maps a feature vector to one regression output.

    Every entry of ``hidden_dims`` contributes
    ``Linear -> [BatchNorm1d] -> ReLU -> Dropout``; a final ``Linear(prev_dim, 1)``
    produces the prediction.

    Args:
        input_dim: Number of input features per sample.
        hidden_dims: Sequence of hidden-layer widths, applied in order.
        dropout_rate: Probability passed to every ``nn.Dropout`` layer.
        use_batch_norm: When True, insert ``nn.BatchNorm1d`` after each hidden
            ``Linear`` layer.
        target_mean: Optional scalar. When given, the output layer's bias is
            initialised to this value instead of zero.
    """

    def __init__(self, input_dim, hidden_dims, dropout_rate=0.3, use_batch_norm=True, target_mean=None):
        super().__init__()

        layers = []
        prev_dim = input_dim

        for hidden_dim in hidden_dims:
            layers.append(nn.Linear(prev_dim, hidden_dim))

            if use_batch_norm:
                layers.append(nn.BatchNorm1d(hidden_dim))

            layers.append(nn.ReLU())
            layers.append(nn.Dropout(dropout_rate))

            prev_dim = hidden_dim

        # Single-unit regression head.
        layers.append(nn.Linear(prev_dim, 1))
        self.network = nn.Sequential(*layers)

        self._initialize_weights(target_mean)

    def _initialize_weights(self, target_mean):
        """Apply Kaiming-normal weights / zero biases, then optionally set the output bias.

        Args:
            target_mean: Optional scalar written into the final layer's bias.
        """
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_normal_(m.weight, mode='fan_in', nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)
        if target_mean is not None:
            # Must run after the loop above, which zeroes every Linear bias.
            # float() accepts NumPy scalars (e.g. the result of np.mean).
            nn.init.constant_(self.network[-1].bias, float(target_mean))

    def forward(self, x):
        """Return predictions with the trailing size-1 output dimension removed.

        Only the last dimension is squeezed, so a batch holding a single sample
        yields shape ``(1,)`` rather than collapsing to a 0-d scalar.
        """
        return self.network(x).squeeze(-1)

In [8]:
# Pre-split arrays read from disk. The "_log" targets are presumably log1p-scaled
# prices (train_model inverts them with expm1) — confirm against the preprocessing step.
X_train, y_train_log = (
    np.load('data/X_train_final.npy'),
    np.load('data/y_train_log.npy'),
)
X_val, y_val_log = (
    np.load('data/X_val_final.npy'),
    np.load('data/y_val_log.npy'),
)

def _real_price_rmse(model, loader, device, log_min=0.0, log_max=20.0):
    """Return the RMSE of `model` over `loader`, measured after inverting with expm1.

    Squared errors are summed over every sample and divided by the sample count,
    so a smaller final batch carries its proper weight.
    """
    squared_error_sum = nn.MSELoss(reduction='sum')
    total_squared_error = 0.0
    total_samples = 0

    model.eval()
    with torch.no_grad():
        for features, targets in loader:
            features = features.to(device)
            targets_log = targets.to(device).view(-1)

            # Clamp the log-scale predictions so expm1() cannot overflow.
            output_log = torch.clamp(model(features).view(-1), min=log_min, max=log_max)

            total_squared_error += squared_error_sum(
                torch.expm1(output_log), torch.expm1(targets_log)
            ).item()
            total_samples += targets_log.numel()

    return np.sqrt(total_squared_error / total_samples)


def train_model(trial, X_train, y_train, X_val, y_val, device,
                n_epochs=300, early_stopping_patience=40, early_stopping=False,
                hidden_dims=None):
    """
    Train model with hyperparameters suggested by Optuna

    Args:
        trial: Optuna trial used to sample hyperparameters, report the per-epoch
            metric and decide on pruning.
        X_train, X_val: 2-D feature arrays (``X_train.shape[1]`` is the input width).
        y_train, y_val: Targets on the scale the model is trained on; the metric
            inverts them with ``expm1``.
        device: torch device the model and batches are moved to.
        n_epochs: Maximum number of training epochs.
        early_stopping_patience: Epochs without improvement tolerated before
            stopping (only used when ``early_stopping`` is True).
        early_stopping: Enable patience-based early stopping.
        hidden_dims: Optional list of hidden-layer widths; defaults to the fixed
            architecture ``[512, 256, 256, 128, 128, 64, 32]``.

    Returns validation RMSE (lower is better), computed on the expm1-inverted
    scale from the weights at the END of training (not from the best epoch).

    Raises:
        ValueError: If the training or validation set is empty.
        optuna.TrialPruned: If the trial's pruner asks to stop this trial.
    """
    if len(X_train) == 0 or len(X_val) == 0:
        raise ValueError("X_train and X_val must each contain at least one sample")

    # ========================================================================
    # HYPERPARAMETERS TO TUNE
    # ========================================================================

    # 1. Architecture: fixed by default (an earlier Optuna search over depth and
    #    width was disabled); callers may override it via `hidden_dims`.
    if hidden_dims is None:
        hidden_dims = [512, 256, 256, 128, 128, 64, 32]

    # 2. Regularization
    dropout_rate = trial.suggest_float('dropout_rate', 0.0, 0.5)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
    use_batch_norm = trial.suggest_categorical('use_batch_norm', [True, False])

    # 3. Learning rate
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)

    # 4. Batch size
    batch_size = trial.suggest_categorical('batch_size', [128, 256, 512])

    # 5. Huber loss transition point
    huber_delta = trial.suggest_float('huber_delta', 0.05, 1.0)

    # ========================================================================
    # CREATE MODEL AND OPTIMIZER
    # ========================================================================

    model = DiamondPriceMLP(
        input_dim=X_train.shape[1],
        hidden_dims=hidden_dims,
        dropout_rate=dropout_rate,
        use_batch_norm=use_batch_norm,
        # Use the targets passed in, not a notebook-level global.
        target_mean=float(np.mean(y_train)),
    ).to(device)

    optimizer = torch.optim.Adam(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay
    )

    train_criterion = nn.HuberLoss(delta=huber_delta)

    # Create dataloaders
    train_dataset = TensorDataset(
        torch.as_tensor(X_train, dtype=torch.float32),
        torch.as_tensor(y_train, dtype=torch.float32)
    )
    val_dataset = TensorDataset(
        torch.as_tensor(X_val, dtype=torch.float32),
        torch.as_tensor(y_val, dtype=torch.float32)
    )

    # BatchNorm1d cannot normalise a single-sample batch in training mode, so the
    # trailing batch is dropped only when it would contain exactly one sample.
    drop_single_tail = bool(
        use_batch_norm
        and len(train_dataset) > batch_size
        and len(train_dataset) % batch_size == 1
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,
                              drop_last=drop_single_tail)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # ========================================================================
    # TRAINING LOOP
    # ========================================================================

    best_val_rmse = float('inf')
    patience_counter = 0

    for epoch in range(n_epochs):
        # Train
        model.train()
        for features, targets in train_loader:
            features = features.to(device)
            targets = targets.to(device)

            output = model(features)
            loss = train_criterion(output.view(-1), targets.view(-1))

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # Gradient clipping

            optimizer.step()

        # Validate with the same sample-weighted metric that is returned at the end.
        val_real_rmse = _real_price_rmse(model, val_loader, device)

        if val_real_rmse < best_val_rmse:
            best_val_rmse = val_real_rmse
            patience_counter = 0
        else:
            patience_counter += 1

        # Report intermediate value for pruning
        trial.report(val_real_rmse, epoch)

        if early_stopping and patience_counter >= early_stopping_patience:
            print(f"Early stopping at epoch {epoch}")
            break

        # Prune unpromising trials
        if trial.should_prune():
            raise optuna.TrialPruned()

    # ========================================================================
    # FINAL EVALUATION (Real RMSE)
    # ========================================================================

    return _real_price_rmse(model, val_loader, device)

In [9]:

def objective(trial):
    """
    Optuna objective: train one model for `trial` and return its validation RMSE.

    Reads the notebook-level arrays X_train, y_train_log, X_val and y_val_log.
    A NaN or infinite score is replaced by a large finite penalty.
    """
    print(f"\nTrial {trial.number}: Testing hyperparameters...")

    trial_device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Train and evaluate (early stopping stays disabled for the search).
    score = train_model(
        trial, X_train, y_train_log, X_val, y_val_log, trial_device,
        early_stopping=False,
    )

    # Swap any non-finite result for a large penalty.
    if not np.isfinite(score):
        score = 1e10

    print(f"  → Validation RMSE: ${score:,.0f}")

    return score

In [10]:
# Run the hyperparameter search. The study is persisted in SQLite so an
# interrupted run can be resumed by re-executing this cell.
storage_name = "sqlite:///diamond_study.db"
sampler = optuna.samplers.TPESampler(seed=42)
study = optuna.create_study(
    study_name="diamond-mlp",
    storage=storage_name,
    sampler=sampler,  # without this the seeded sampler above is never used
    direction="minimize",
    load_if_exists=True  # Crucial for restarting
)
# catch=(Exception,) keeps the search alive when a single trial crashes; such
# trials are recorded as FAIL and counted below so they do not go unnoticed.
study.optimize(objective, n_trials=100, show_progress_bar=True, catch=(Exception,))

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
failed_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.FAIL]

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))
print("  Number of failed trials: ", len(failed_trials))

# study.best_trial raises when no trial has completed, so guard the summary.
if complete_trials:
    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))
else:
    print("No trial completed successfully; there is no best trial to report.")

[I 2026-01-10 13:24:37,957] A new study created in RDB with name: diamond-mlp


  0%|          | 0/100 [00:00<?, ?it/s]


Trial 0: Testing hyperparameters...
  → Validation RMSE: $14,038
[I 2026-01-10 13:34:57,878] Trial 0 finished with value: 14038.198408047916 and parameters: {'dropout_rate': 0.4835703256465729, 'weight_decay': 1.0031460941124857e-06, 'use_batch_norm': True, 'learning_rate': 0.0004389636493392904, 'batch_size': 512, 'huber_delta': 0.2238223925112907}. Best is trial 0 with value: 14038.198408047916.

Trial 1: Testing hyperparameters...
  → Validation RMSE: $21,808
[I 2026-01-10 13:55:39,997] Trial 1 finished with value: 21808.019355531163 and parameters: {'dropout_rate': 0.31319016592441107, 'weight_decay': 2.0119786395854447e-06, 'use_batch_norm': False, 'learning_rate': 0.0038449571088975606, 'batch_size': 256, 'huber_delta': 0.7148217317548615}. Best is trial 0 with value: 14038.198408047916.

Trial 2: Testing hyperparameters...
  → Validation RMSE: $14,606
[I 2026-01-10 14:05:57,467] Trial 2 finished with value: 14605.811523484776 and parameters: {'dropout_rate': 0.18197955320978676