In [9]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
from sklearn import clone 
warnings.filterwarnings('ignore')

# Machine Learning Libraries
from sklearn.model_selection import train_test_split, KFold, cross_val_score, cross_val_predict
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline

# Advanced Models
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor

# Hyperparameter Optimization
import optuna
from optuna.integration import XGBoostPruningCallback
from optuna.pruners import MedianPruner

# Progress tracking
from tqdm import tqdm
import sys

# Ensemble Methods
from sklearn.base import BaseEstimator, RegressorMixin
import joblib
import time
from datetime import datetime

# Set random seeds for reproducibility
np.random.seed(42)

In [21]:
# Set display options
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)

# Load final datasets
X_train = pd.read_csv('../data/processed/X_train_final.csv')
X_test = pd.read_csv('../data/processed/X_test_final.csv')
target_data = pd.read_csv('../data/processed/y_train_final.csv')

print("\n" + "="*60)
print("Data Import Summary:")
print(f"Features available: {X_train.shape[1]}")
print(f"Training samples: {X_train.shape[0]}")
print(f"Test samples: {X_test.shape[0]}")
print("Target variable: Log-transformed SalePrice")
print("="*60)

# Extract log-transformed target variable (assuming 'SalePrice' column)
y_log = target_data['SalePrice']

print(f"\nTarget Variable Statistics:")
print(f"  Shape: {y_log.shape}")
print(f"  Mean: {y_log.mean():.4f}")
print(f"  Std: {y_log.std():.4f}")
print(f"  Min: {y_log.min():.4f}")
print(f"  Max: {y_log.max():.4f}")

# Verify this is log-transformed (should be roughly 11-13 range for house prices)
if y_log.min() > 10 and y_log.max() < 15:
    print("  ✓ Target appears to be log-transformed")
else:
    print("  ⚠️ Warning: Target may not be log-transformed")

# Validate data quality
print("\nData Quality Validation:")
print(f"Missing values in X_train: {X_train.isnull().sum().sum()}")
print(f"Missing values in X_test: {X_test.isnull().sum().sum()}")
print(f"Missing values in target: {y_log.isnull().sum()}")
print(f"Infinite values in X_train: {np.isinf(X_train.select_dtypes(include=[np.number])).sum().sum()}")
print(f"Infinite values in X_test: {np.isinf(X_test.select_dtypes(include=[np.number])).sum().sum()}")

# Handle any remaining issues
if X_train.isnull().sum().sum() > 0:
    X_train = X_train.fillna(X_train.median())
    X_test = X_test.fillna(X_train.median())  # Use training medians for test
    print("✓ Missing values handled")

if y_log.isnull().sum() > 0:
    print("⚠️ Warning: Missing values in target variable")


Data Import Summary:
Features available: 191
Training samples: 1458
Test samples: 1459
Target variable: Log-transformed SalePrice

Target Variable Statistics:
  Shape: (1458,)
  Mean: 12.0240
  Std: 0.3997
  Min: 10.4602
  Max: 13.5345
  ✓ Target appears to be log-transformed

Data Quality Validation:
Missing values in X_train: 0
Missing values in X_test: 0
Missing values in target: 0
Infinite values in X_train: 0
Infinite values in X_test: 2


In [22]:
# Handle infinite values if they exist
inf_cols_train = X_train.columns[np.isinf(X_train).any()]
inf_cols_test = X_test.columns[np.isinf(X_test).any()]

if len(inf_cols_train) > 0:
    print(f"\nInfinite values found in training columns: {list(inf_cols_train)}")
    for col in inf_cols_train:
        inf_rows = X_train[np.isinf(X_train[col])]
        inf_count = len(inf_rows)
        print(f"  {col}: {inf_count} infinite values")

        # Print corresponding IDs if available
        if 'Id' in inf_rows.columns:
            print("    → Row IDs with infinite values:")
            print(inf_rows['Id'].values)
        elif 'Id' in df_train.columns:
            print("    → Row IDs with infinite values (from df_train):")
            print(df_train.loc[inf_rows.index, 'Id'].values)
        else:
            print("    → Row indices with infinite values:")
            print(inf_rows.index.values)

        # Replace with median of finite values
        finite_values = X_train[col][np.isfinite(X_train[col])]
        if len(finite_values) > 0:
            replacement_value = finite_values.median()
            X_train[col] = X_train[col].replace([np.inf, -np.inf], replacement_value)
            print(f"    → Replaced with median: {replacement_value:.4f}")

if len(inf_cols_test) > 0:
    print(f"\nInfinite values found in test columns: {list(inf_cols_test)}")
    for col in inf_cols_test:
        inf_rows = X_test[np.isinf(X_test[col])]
        inf_count = len(inf_rows)
        print(f"  {col}: {inf_count} infinite values")

        # Print corresponding IDs if available
        if 'Id' in inf_rows.columns:
            print("    → Row IDs with infinite values:")
            print(inf_rows['Id'].values)
        elif 'Id' in df_test.columns:
            print("    → Row IDs with infinite values (from df_test):")
            print(df_test.loc[inf_rows.index, 'Id'].values)
        else:
            print("    → Row indices with infinite values:")
            print(inf_rows.index.values)

        # Replace with median from training
        if col in X_train.columns:
            finite_values = X_train[col][np.isfinite(X_train[col])]
            if len(finite_values) > 0:
                replacement_value = finite_values.median()
                X_test[col] = X_test[col].replace([np.inf, -np.inf], replacement_value)
                print(f"    → Replaced with training median: {replacement_value:.4f}")
            else:
                X_test[col] = X_test[col].replace([np.inf, -np.inf], 0)
                print(f"    → Replaced with 0 (no finite training values)")
        else:
            X_test[col] = X_test[col].replace([np.inf, -np.inf], 0)
            print(f"    → Replaced with 0 (column not in training)")

# Handle any remaining missing values
if X_train.isnull().sum().sum() > 0:
    X_train = X_train.fillna(X_train.median())
    X_test = X_test.fillna(X_train.median())
    print("✓ Missing values handled")

if y_log.isnull().sum() > 0:
    print("⚠️ Warning: Missing values in target variable")

# Final validation
print("\nAfter cleaning:")
print(f"Infinite values in X_train: {np.isinf(X_train.select_dtypes(include=[np.number])).sum().sum()}")
print(f"Infinite values in X_test: {np.isinf(X_test.select_dtypes(include=[np.number])).sum().sum()}")
print("✓ Data quality issues resolved")



Infinite values found in test columns: ['GarageAge', 'HouseAge']
  GarageAge: 1 infinite values
    → Row IDs with infinite values:
[2550]
    → Replaced with training median: 3.4340
  HouseAge: 1 infinite values
    → Row IDs with infinite values:
[2550]
    → Replaced with training median: 3.5835

After cleaning:
Infinite values in X_train: 0
Infinite values in X_test: 0
✓ Data quality issues resolved


In [11]:
# Create train-validation split (ONLY from training data)
print("\nCreating Train-Validation Split:")

X_train_split, X_val, y_train, y_val = train_test_split(
    X_train, y_log, 
    test_size=0.2, 
    random_state=42
)

print(f"Training set: {X_train_split.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")

# Cross-validation setup
cv = KFold(n_splits=5, shuffle=True, random_state=42)
print("✓ 5-fold cross-validation configured")

# Verify feature consistency
train_features = set(X_train_split.columns)
test_features = set(X_test.columns)
if train_features == test_features:
    print("✓ Feature consistency verified between train and test")
else:
    print("⚠️ Feature inconsistency detected")
    missing_in_test = train_features - test_features
    missing_in_train = test_features - train_features
    if missing_in_test:
        print(f"  Missing in test: {missing_in_test}")
    if missing_in_train:
        print(f"  Missing in train: {missing_in_train}")


Creating Train-Validation Split:
Training set: (1166, 191)
Validation set: (292, 191)
Test set: (1459, 191)
✓ 5-fold cross-validation configured
✓ Feature consistency verified between train and test


In [12]:
# Define baseline models
baseline_models = {
    'Ridge': Ridge(alpha=1.0, random_state=42),
    'Lasso': Lasso(alpha=0.01, random_state=42, max_iter=2000),
    'ElasticNet': ElasticNet(alpha=0.01, l1_ratio=0.5, random_state=42, max_iter=2000)
}

def evaluate_model(model, X, y, cv_folds, model_name="Model"):
    """Evaluate model using cross-validation with progress tracking"""
    
    print(f"Evaluating {model_name}...")
    
    # Cross-validation scores with progress bar
    cv_scores = []
    with tqdm(total=cv_folds.n_splits, desc=f"CV {model_name}", leave=False) as pbar:
        for train_idx, val_idx in cv_folds.split(X, y):
            X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
            y_fold_train, y_fold_val = y.iloc[train_idx], y.iloc[val_idx]
            
            # Fit and predict
            model_copy = clone(model)
            model_copy.fit(X_fold_train, y_fold_train)
            pred = model_copy.predict(X_fold_val)
            
            # Calculate RMSE for this fold
            fold_rmse = np.sqrt(mean_squared_error(y_fold_val, pred))
            cv_scores.append(fold_rmse)
            
            pbar.update(1)
    
    cv_rmse = np.array(cv_scores)
    
    # Fit on full training set for validation score
    print(f"  Fitting {model_name} on full training set...")
    model.fit(X, y)
    val_pred = model.predict(X_val)
    val_rmse = np.sqrt(mean_squared_error(y_val, val_pred))
    
    return {
        'model_name': model_name,
        'cv_rmse_mean': cv_rmse.mean(),
        'cv_rmse_std': cv_rmse.std(),
        'val_rmse': val_rmse
    }

print("Baseline Model Performance:")
print("=" * 60)

baseline_results = {}
total_models = len(baseline_models)

with tqdm(total=total_models, desc="Baseline Models") as pbar:
    for i, (name, model) in enumerate(baseline_models.items()):
        pbar.set_description(f"Training {name}")
        result = evaluate_model(model, X_train_split, y_train, cv, name)
        baseline_results[name] = result
        
        print(f"{name:<12}: CV {result['cv_rmse_mean']:.4f} ± {result['cv_rmse_std']:.4f} | "
              f"Val {result['val_rmse']:.4f}")
        
        pbar.update(1)

# Best baseline model
best_baseline = min(baseline_results.items(), key=lambda x: x[1]['cv_rmse_mean'])
print(f"\nBest baseline: {best_baseline[0]} (CV RMSE: {best_baseline[1]['cv_rmse_mean']:.4f})")

Baseline Model Performance:


Training Ridge:   0%|          | 0/3 [00:00<?, ?it/s] 

Evaluating Ridge...


Training Lasso:  33%|███▎      | 1/3 [00:00<00:00, 31.60it/s]

  Fitting Ridge on full training set...
Ridge       : CV 0.1220 ± 0.0090 | Val 0.1200
Evaluating Lasso...


Training ElasticNet:  67%|██████▋   | 2/3 [00:00<00:00, 14.54it/s]

  Fitting Lasso on full training set...
Lasso       : CV 0.1454 ± 0.0068 | Val 0.1488
Evaluating ElasticNet...


Training ElasticNet: 100%|██████████| 3/3 [00:00<00:00, 12.31it/s]

  Fitting ElasticNet on full training set...
ElasticNet  : CV 0.1366 ± 0.0082 | Val 0.1389

Best baseline: Ridge (CV RMSE: 0.1220)





In [13]:
# Test impact of feature scaling
print("\nFeature Scaling Impact Analysis:")

scalers = {
    'None': None,
    'StandardScaler': StandardScaler(),
    'RobustScaler': RobustScaler()
}

scaling_results = {}
total_scalers = len(scalers)

with tqdm(total=total_scalers, desc="Testing Scaling Methods") as pbar:
    for scaler_name, scaler in scalers.items():
        pbar.set_description(f"Testing {scaler_name}")
        
        if scaler is None:
            X_scaled = X_train_split
            X_val_scaled = X_val
        else:
            X_scaled = pd.DataFrame(scaler.fit_transform(X_train_split), 
                                   columns=X_train_split.columns, index=X_train_split.index)
            X_val_scaled = pd.DataFrame(scaler.transform(X_val), 
                                       columns=X_val.columns, index=X_val.index)
        
        # Test with Ridge regression
        ridge_scaled = Ridge(alpha=1.0, random_state=42)
        result = evaluate_model(ridge_scaled, X_scaled, y_train, cv, f"Ridge+{scaler_name}")
        scaling_results[scaler_name] = result
        
        print(f"Ridge + {scaler_name:<15}: {result['cv_rmse_mean']:.4f} ± {result['cv_rmse_std']:.4f}")
        
        pbar.update(1)

best_scaler = min(scaling_results.items(), key=lambda x: x[1]['cv_rmse_mean'])
print(f"Best scaling approach: {best_scaler[0]}")


Feature Scaling Impact Analysis:


Testing None:   0%|          | 0/3 [00:00<?, ?it/s]           

Evaluating Ridge+None...


Testing StandardScaler:  33%|███▎      | 1/3 [00:00<00:00, 25.67it/s]

  Fitting Ridge+None on full training set...
Ridge + None           : 0.1220 ± 0.0090
Evaluating Ridge+StandardScaler...


Testing RobustScaler:  67%|██████▋   | 2/3 [00:00<00:00, 34.06it/s]  

  Fitting Ridge+StandardScaler on full training set...
Ridge + StandardScaler : 0.1303 ± 0.0110
Evaluating Ridge+RobustScaler...


Testing RobustScaler: 100%|██████████| 3/3 [00:00<00:00, 34.14it/s]

  Fitting Ridge+RobustScaler on full training set...
Ridge + RobustScaler   : 0.1218 ± 0.0088
Best scaling approach: RobustScaler





In [14]:
def objective_xgb(trial):
    """Optuna objective function for XGBoost optimization"""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 8000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 6),
        'subsample': trial.suggest_float('subsample', 0.4, 0.8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.8),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0, log=True),
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': 0
    }
    
    model = XGBRegressor(**params)
    
    # Cross-validation with early stopping
    cv_scores = cross_val_score(model, X_train_split, y_train, cv=cv, 
                               scoring='neg_mean_squared_error', n_jobs=-1)
    return np.sqrt(-cv_scores.mean())

print("XGBoost Hyperparameter Optimization:")
print("=" * 50)

# Progress callback for Optuna
class OptunaTqdmCallback:
    def __init__(self, n_trials):
        self.n_trials = n_trials
        self.pbar = None
    
    def __call__(self, study, trial):
        if self.pbar is None:
            self.pbar = tqdm(total=self.n_trials, desc="XGBoost Optimization")
        
        # Update progress bar
        self.pbar.set_postfix({
            'Best RMSE': f'{study.best_value:.4f}',
            'Trial': f'{trial.number + 1}/{self.n_trials}'
        })
        self.pbar.update(1)
        
        # Close progress bar when done
        if trial.number == self.n_trials - 1:
            self.pbar.close()

# Create Optuna study
study_xgb = optuna.create_study(
    direction='minimize',
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=10),
    study_name='xgboost_optimization'
)

# Optimize with progress tracking
n_trials_xgb = 100
callback = OptunaTqdmCallback(n_trials_xgb)
study_xgb.optimize(objective_xgb, n_trials=n_trials_xgb, timeout=3600, callbacks=[callback])

print(f"Best XGBoost RMSE: {study_xgb.best_value:.4f}")
print("Best parameters:")
for key, value in study_xgb.best_params.items():
    print(f"  {key}: {value}")

# Train best XGBoost model
best_xgb_params = study_xgb.best_params
best_xgb = XGBRegressor(**best_xgb_params)
xgb_result = evaluate_model(best_xgb, X_train_split, y_train, cv, "XGBoost_Optimized")

[I 2025-07-12 21:17:32,920] A new study created in memory with name: xgboost_optimization


XGBoost Hyperparameter Optimization:


[I 2025-07-12 21:17:35,547] Trial 0 finished with value: 0.14680990514121617 and parameters: {'n_estimators': 3222, 'learning_rate': 0.013979863774511748, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.6231463707485482, 'colsample_bytree': 0.5389035426975048, 'reg_alpha': 6.197448410934974, 'reg_lambda': 0.019631783955960786}. Best is trial 0 with value: 0.14680990514121617.
XGBoost Optimization:   0%|          | 0/100 [00:00<?, ?it/s, Best RMSE=0.1468, Trial=1/100][I 2025-07-12 21:17:41,226] Trial 1 finished with value: 0.12427973505573868 and parameters: {'n_estimators': 6067, 'learning_rate': 0.009054963274947654, 'max_depth': 10, 'min_child_weight': 4, 'subsample': 0.7953532695521572, 'colsample_bytree': 0.23435404616886213, 'reg_alpha': 0.015144656059246716, 'reg_lambda': 0.16356397385917384}. Best is trial 1 with value: 0.12427973505573868.
XGBoost Optimization:   2%|▏         | 2/100 [00:05<04:38,  2.84s/it, Best RMSE=0.1243, Trial=2/100][I 2025-07-12 21:17:45,247] Trial 

Best XGBoost RMSE: 0.1144
Best parameters:
  n_estimators: 3837
  learning_rate: 0.007605090164667465
  max_depth: 3
  min_child_weight: 5
  subsample: 0.4604894519484403
  colsample_bytree: 0.20079977705221094
  reg_alpha: 0.058834012582946835
  reg_lambda: 0.11926384552972615
Evaluating XGBoost_Optimized...


                                                                   

  Fitting XGBoost_Optimized on full training set...


In [15]:
def objective_catboost(trial):
    """Optuna objective function for CatBoost optimization"""
    params = {
        'iterations': trial.suggest_int('iterations', 1000, 8000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'depth': trial.suggest_int('depth', 3, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1, 10),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        'random_state': 42,
        'verbose': False,
        'thread_count': -1
    }
    
    model = CatBoostRegressor(**params)
    
    cv_scores = cross_val_score(model, X_train_split, y_train, cv=cv, 
                               scoring='neg_mean_squared_error', n_jobs=-1)
    return np.sqrt(-cv_scores.mean())

print("\nCatBoost Hyperparameter Optimization:")
print("=" * 50)

study_catboost = optuna.create_study(
    direction='minimize',
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=10),
    study_name='catboost_optimization'
)

# Optimize with progress tracking
n_trials_catboost = 100
callback_catboost = OptunaTqdmCallback(n_trials_catboost)
callback_catboost.pbar = tqdm(total=n_trials_catboost, desc="CatBoost Optimization")
study_catboost.optimize(objective_catboost, n_trials=n_trials_catboost, timeout=3600, callbacks=[callback_catboost])

print(f"Best CatBoost RMSE: {study_catboost.best_value:.4f}")
print("Best parameters:")
for key, value in study_catboost.best_params.items():
    print(f"  {key}: {value}")

best_catboost_params = study_catboost.best_params
best_catboost = CatBoostRegressor(**best_catboost_params)
catboost_result = evaluate_model(best_catboost, X_train_split, y_train, cv, "CatBoost_Optimized")

[I 2025-07-12 21:22:00,796] A new study created in memory with name: catboost_optimization



CatBoost Hyperparameter Optimization:


CatBoost Optimization:   0%|          | 0/100 [00:00<?, ?it/s][I 2025-07-12 21:22:03,521] Trial 0 finished with value: 0.12379510311064273 and parameters: {'iterations': 3569, 'learning_rate': 0.18437224861975415, 'depth': 3, 'l2_leaf_reg': 7.609098824012879, 'border_count': 162, 'bagging_temperature': 0.6859380682402874}. Best is trial 0 with value: 0.12379510311064273.
CatBoost Optimization:   1%|          | 1/100 [00:02<04:29,  2.72s/it, Best RMSE=0.1238, Trial=1/100][I 2025-07-12 21:22:13,499] Trial 1 finished with value: 0.11743359854015992 and parameters: {'iterations': 5791, 'learning_rate': 0.035900759201151634, 'depth': 5, 'l2_leaf_reg': 7.148092154033178, 'border_count': 239, 'bagging_temperature': 0.6996304249913571}. Best is trial 1 with value: 0.11743359854015992.
CatBoost Optimization:   2%|▏         | 2/100 [00:12<11:25,  7.00s/it, Best RMSE=0.1174, Trial=2/100][I 2025-07-12 21:22:22,058] Trial 2 finished with value: 0.11809978441165363 and parameters: {'iterations': 431

Best CatBoost RMSE: 0.1144
Best parameters:
  iterations: 7312
  learning_rate: 0.007111752233948758
  depth: 4
  l2_leaf_reg: 1.5466861568307504
  border_count: 112
  bagging_temperature: 0.5583051474063194
Evaluating CatBoost_Optimized...


CV CatBoost_Optimized:   0%|          | 0/5 [00:00<?, ?it/s]

0:	learn: 0.3828258	total: 51.9ms	remaining: 6m 19s
1:	learn: 0.3811813	total: 52.4ms	remaining: 3m 11s
2:	learn: 0.3795937	total: 53ms	remaining: 2m 9s
3:	learn: 0.3780345	total: 53.5ms	remaining: 1m 37s
4:	learn: 0.3764883	total: 53.9ms	remaining: 1m 18s
5:	learn: 0.3748950	total: 54.3ms	remaining: 1m 6s
6:	learn: 0.3734990	total: 54.7ms	remaining: 57.1s
7:	learn: 0.3719875	total: 55.1ms	remaining: 50.3s
8:	learn: 0.3704482	total: 55.6ms	remaining: 45.1s
9:	learn: 0.3689457	total: 56ms	remaining: 40.9s
10:	learn: 0.3674321	total: 56.4ms	remaining: 37.4s
11:	learn: 0.3658364	total: 56.8ms	remaining: 34.6s
12:	learn: 0.3643640	total: 57.2ms	remaining: 32.1s
13:	learn: 0.3628834	total: 57.6ms	remaining: 30s
14:	learn: 0.3613853	total: 58.1ms	remaining: 28.3s
15:	learn: 0.3599286	total: 58.5ms	remaining: 26.7s
16:	learn: 0.3585163	total: 59.5ms	remaining: 25.5s
17:	learn: 0.3569995	total: 59.8ms	remaining: 24.2s
18:	learn: 0.3555585	total: 60.4ms	remaining: 23.2s
19:	learn: 0.3540387	tot

CV CatBoost_Optimized:  20%|██        | 1/5 [00:03<00:12,  3.10s/it]

7178:	learn: 0.0372557	total: 2.75s	remaining: 51ms
7179:	learn: 0.0372512	total: 2.75s	remaining: 50.6ms
7180:	learn: 0.0372475	total: 2.75s	remaining: 50.2ms
7181:	learn: 0.0372404	total: 2.75s	remaining: 49.8ms
7182:	learn: 0.0372366	total: 2.75s	remaining: 49.4ms
7183:	learn: 0.0372316	total: 2.75s	remaining: 49ms
7184:	learn: 0.0372285	total: 2.75s	remaining: 48.7ms
7185:	learn: 0.0372245	total: 2.75s	remaining: 48.3ms
7186:	learn: 0.0372202	total: 2.75s	remaining: 47.9ms
7187:	learn: 0.0372162	total: 2.75s	remaining: 47.5ms
7188:	learn: 0.0372122	total: 2.75s	remaining: 47.1ms
7189:	learn: 0.0372101	total: 2.75s	remaining: 46.7ms
7190:	learn: 0.0372043	total: 2.75s	remaining: 46.3ms
7191:	learn: 0.0372002	total: 2.75s	remaining: 46ms
7192:	learn: 0.0371973	total: 2.75s	remaining: 45.6ms
7193:	learn: 0.0371910	total: 2.75s	remaining: 45.2ms
7194:	learn: 0.0371870	total: 2.75s	remaining: 44.8ms
7195:	learn: 0.0371823	total: 2.76s	remaining: 44.4ms
7196:	learn: 0.0371786	total: 2.76

CV CatBoost_Optimized:  40%|████      | 2/5 [00:06<00:09,  3.04s/it]

7244:	learn: 0.0400475	total: 2.7s	remaining: 25ms
7245:	learn: 0.0400358	total: 2.7s	remaining: 24.6ms
7246:	learn: 0.0400321	total: 2.7s	remaining: 24.3ms
7247:	learn: 0.0400258	total: 2.7s	remaining: 23.9ms
7248:	learn: 0.0400192	total: 2.7s	remaining: 23.5ms
7249:	learn: 0.0400139	total: 2.71s	remaining: 23.1ms
7250:	learn: 0.0400089	total: 2.71s	remaining: 22.8ms
7251:	learn: 0.0400026	total: 2.71s	remaining: 22.4ms
7252:	learn: 0.0399977	total: 2.71s	remaining: 22ms
7253:	learn: 0.0399935	total: 2.71s	remaining: 21.6ms
7254:	learn: 0.0399931	total: 2.71s	remaining: 21.3ms
7255:	learn: 0.0399870	total: 2.71s	remaining: 20.9ms
7256:	learn: 0.0399826	total: 2.71s	remaining: 20.5ms
7257:	learn: 0.0399779	total: 2.71s	remaining: 20.1ms
7258:	learn: 0.0399743	total: 2.71s	remaining: 19.8ms
7259:	learn: 0.0399715	total: 2.71s	remaining: 19.4ms
7260:	learn: 0.0399641	total: 2.71s	remaining: 19ms
7261:	learn: 0.0399598	total: 2.71s	remaining: 18.7ms
7262:	learn: 0.0399523	total: 2.71s	rem

CV CatBoost_Optimized:  60%|██████    | 3/5 [00:09<00:06,  3.08s/it]

7239:	learn: 0.0419895	total: 2.84s	remaining: 28.3ms
7240:	learn: 0.0419848	total: 2.84s	remaining: 27.9ms
7241:	learn: 0.0419762	total: 2.84s	remaining: 27.5ms
7242:	learn: 0.0419706	total: 2.84s	remaining: 27.1ms
7243:	learn: 0.0419649	total: 2.84s	remaining: 26.7ms
7244:	learn: 0.0419589	total: 2.84s	remaining: 26.3ms
7245:	learn: 0.0419561	total: 2.84s	remaining: 25.9ms
7246:	learn: 0.0419496	total: 2.84s	remaining: 25.5ms
7247:	learn: 0.0419456	total: 2.84s	remaining: 25.1ms
7248:	learn: 0.0419398	total: 2.84s	remaining: 24.7ms
7249:	learn: 0.0419349	total: 2.84s	remaining: 24.3ms
7250:	learn: 0.0419287	total: 2.85s	remaining: 23.9ms
7251:	learn: 0.0419265	total: 2.85s	remaining: 23.5ms
7252:	learn: 0.0419207	total: 2.85s	remaining: 23.1ms
7253:	learn: 0.0419182	total: 2.85s	remaining: 22.8ms
7254:	learn: 0.0419120	total: 2.85s	remaining: 22.4ms
7255:	learn: 0.0419083	total: 2.85s	remaining: 22ms
7256:	learn: 0.0419039	total: 2.85s	remaining: 21.6ms
7257:	learn: 0.0418981	total: 

CV CatBoost_Optimized:  80%|████████  | 4/5 [00:12<00:03,  3.04s/it]

0:	learn: 0.3951469	total: 1.4ms	remaining: 10.2s
1:	learn: 0.3933868	total: 1.84ms	remaining: 6.75s
2:	learn: 0.3918204	total: 2.69ms	remaining: 6.54s
3:	learn: 0.3902707	total: 3.12ms	remaining: 5.69s
4:	learn: 0.3885845	total: 3.61ms	remaining: 5.28s
5:	learn: 0.3867770	total: 3.96ms	remaining: 4.83s
6:	learn: 0.3851401	total: 4.53ms	remaining: 4.73s
7:	learn: 0.3834905	total: 5.25ms	remaining: 4.8s
8:	learn: 0.3817613	total: 6.24ms	remaining: 5.06s
9:	learn: 0.3801060	total: 6.58ms	remaining: 4.8s
10:	learn: 0.3784653	total: 6.92ms	remaining: 4.59s
11:	learn: 0.3769123	total: 7.27ms	remaining: 4.42s
12:	learn: 0.3753984	total: 7.58ms	remaining: 4.25s
13:	learn: 0.3737963	total: 7.89ms	remaining: 4.11s
14:	learn: 0.3722371	total: 8.89ms	remaining: 4.33s
15:	learn: 0.3706880	total: 9.28ms	remaining: 4.23s
16:	learn: 0.3691313	total: 9.61ms	remaining: 4.12s
17:	learn: 0.3676623	total: 9.94ms	remaining: 4.03s
18:	learn: 0.3660834	total: 10.3ms	remaining: 3.95s
19:	learn: 0.3645389	tota

                                                                    

7268:	learn: 0.0390288	total: 2.88s	remaining: 17ms
7269:	learn: 0.0390261	total: 2.88s	remaining: 16.7ms
7270:	learn: 0.0390240	total: 2.88s	remaining: 16.3ms
7271:	learn: 0.0390184	total: 2.88s	remaining: 15.9ms
7272:	learn: 0.0390144	total: 2.88s	remaining: 15.5ms
7273:	learn: 0.0390101	total: 2.88s	remaining: 15.1ms
7274:	learn: 0.0390087	total: 2.88s	remaining: 14.7ms
7275:	learn: 0.0390030	total: 2.88s	remaining: 14.3ms
7276:	learn: 0.0389982	total: 2.88s	remaining: 13.9ms
7277:	learn: 0.0389923	total: 2.89s	remaining: 13.5ms
7278:	learn: 0.0389889	total: 2.89s	remaining: 13.1ms
7279:	learn: 0.0389845	total: 2.89s	remaining: 12.7ms
7280:	learn: 0.0389771	total: 2.89s	remaining: 12.3ms
7281:	learn: 0.0389766	total: 2.89s	remaining: 11.9ms
7282:	learn: 0.0389719	total: 2.89s	remaining: 11.5ms
7283:	learn: 0.0389696	total: 2.89s	remaining: 11.1ms
7284:	learn: 0.0389665	total: 2.89s	remaining: 10.7ms
7285:	learn: 0.0389660	total: 2.89s	remaining: 10.3ms
7286:	learn: 0.0389630	total: 



337:	learn: 0.1578419	total: 143ms	remaining: 2.94s
338:	learn: 0.1576081	total: 143ms	remaining: 2.94s
339:	learn: 0.1574035	total: 144ms	remaining: 2.94s
340:	learn: 0.1571898	total: 144ms	remaining: 2.94s
341:	learn: 0.1569941	total: 144ms	remaining: 2.94s
342:	learn: 0.1567986	total: 145ms	remaining: 2.94s
343:	learn: 0.1565729	total: 145ms	remaining: 2.94s
344:	learn: 0.1563561	total: 145ms	remaining: 2.93s
345:	learn: 0.1561568	total: 146ms	remaining: 2.93s
346:	learn: 0.1559852	total: 146ms	remaining: 2.93s
347:	learn: 0.1557862	total: 146ms	remaining: 2.93s
348:	learn: 0.1555949	total: 147ms	remaining: 2.92s
349:	learn: 0.1554258	total: 148ms	remaining: 2.94s
350:	learn: 0.1552299	total: 148ms	remaining: 2.94s
351:	learn: 0.1550649	total: 148ms	remaining: 2.93s
352:	learn: 0.1548997	total: 149ms	remaining: 2.93s
353:	learn: 0.1546958	total: 149ms	remaining: 2.93s
354:	learn: 0.1545394	total: 149ms	remaining: 2.93s
355:	learn: 0.1543508	total: 150ms	remaining: 2.92s
356:	learn: 

In [16]:
def objective_lightgbm(trial):
    """Optuna objective function for LightGBM optimization"""
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 1000, 8000),
        'learning_rate': trial.suggest_float('learning_rate', 0.005, 0.3, log=True),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'num_leaves': trial.suggest_int('num_leaves', 10, 300),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'subsample': trial.suggest_float('subsample', 0.4, 0.8),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.8),
        'reg_alpha': trial.suggest_float('reg_alpha', 0.01, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 0.01, 10.0, log=True),
        'random_state': 42,
        'n_jobs': -1,
        'verbosity': -1
    }
    
    model = LGBMRegressor(**params)
    
    cv_scores = cross_val_score(model, X_train_split, y_train, cv=cv, 
                               scoring='neg_mean_squared_error', n_jobs=-1)
    return np.sqrt(-cv_scores.mean())

print("\nLightGBM Hyperparameter Optimization:")
print("=" * 50)

study_lightgbm = optuna.create_study(
    direction='minimize',
    pruner=MedianPruner(n_startup_trials=5, n_warmup_steps=10),
    study_name='lightgbm_optimization'
)

# Optimize with progress tracking
n_trials_lightgbm = 100
callback_lightgbm = OptunaTqdmCallback(n_trials_lightgbm)
callback_lightgbm.pbar = tqdm(total=n_trials_lightgbm, desc="LightGBM Optimization")
study_lightgbm.optimize(objective_lightgbm, n_trials=n_trials_lightgbm, timeout=3600, callbacks=[callback_lightgbm])

print(f"Best LightGBM RMSE: {study_lightgbm.best_value:.4f}")

best_lightgbm_params = study_lightgbm.best_params
best_lightgbm = LGBMRegressor(**best_lightgbm_params)
lightgbm_result = evaluate_model(best_lightgbm, X_train_split, y_train, cv, "LightGBM_Optimized")

[I 2025-07-12 21:38:57,497] A new study created in memory with name: lightgbm_optimization



LightGBM Hyperparameter Optimization:


LightGBM Optimization:   0%|          | 0/100 [00:00<?, ?it/s][I 2025-07-12 21:39:13,947] Trial 0 finished with value: 0.1364786936130577 and parameters: {'n_estimators': 4435, 'learning_rate': 0.005841904931115278, 'max_depth': 4, 'num_leaves': 278, 'min_child_samples': 81, 'subsample': 0.580962517669, 'colsample_bytree': 0.6629739295414206, 'reg_alpha': 0.3063767887006961, 'reg_lambda': 0.8796466369346254}. Best is trial 0 with value: 0.1364786936130577.
LightGBM Optimization:   1%|          | 1/100 [00:16<27:08, 16.45s/it, Best RMSE=0.1365, Trial=1/100][I 2025-07-12 21:39:28,681] Trial 1 finished with value: 0.13792539549737132 and parameters: {'n_estimators': 7904, 'learning_rate': 0.028203448853857883, 'max_depth': 4, 'num_leaves': 64, 'min_child_samples': 96, 'subsample': 0.5520707775112087, 'colsample_bytree': 0.4811893283700854, 'reg_alpha': 0.6483223831960758, 'reg_lambda': 0.1091880133673079}. Best is trial 0 with value: 0.1364786936130577.
LightGBM Optimization:   2%|▏      

Best LightGBM RMSE: 0.1186
Evaluating LightGBM_Optimized...


CV LightGBM_Optimized:   0%|          | 0/5 [00:00<?, ?it/s]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004137 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2771
[LightGBM] [Info] Number of data points in the train set: 932, number of used features: 132
[LightGBM] [Info] Start training from score 12.010125


CV LightGBM_Optimized:  20%|██        | 1/5 [00:11<00:46, 11.62s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001887 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2777
[LightGBM] [Info] Number of data points in the train set: 933, number of used features: 134
[LightGBM] [Info] Start training from score 12.028694


CV LightGBM_Optimized:  40%|████      | 2/5 [00:22<00:34, 11.45s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2796
[LightGBM] [Info] Number of data points in the train set: 933, number of used features: 138
[LightGBM] [Info] Start training from score 12.022843


CV LightGBM_Optimized:  60%|██████    | 3/5 [00:34<00:23, 11.50s/it]

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002357 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2784
[LightGBM] [Info] Number of data points in the train set: 933, number of used features: 136
[LightGBM] [Info] Start training from score 12.026550


CV LightGBM_Optimized:  80%|████████  | 4/5 [00:45<00:11, 11.44s/it]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2795
[LightGBM] [Info] Number of data points in the train set: 933, number of used features: 134
[LightGBM] [Info] Start training from score 12.028549


                                                                    

  Fitting LightGBM_Optimized on full training set...
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002511 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2953
[LightGBM] [Info] Number of data points in the train set: 1166, number of used features: 141
[LightGBM] [Info] Start training from score 12.023355


In [17]:
def objective_lasso(trial):
    """Optuna objective for Lasso optimization"""
    alpha = trial.suggest_float('alpha', 0.0001, 1.0, log=True)
    model = Lasso(alpha=alpha, random_state=42, max_iter=3000)
    
    cv_scores = cross_val_score(model, X_train_split, y_train, cv=cv, 
                               scoring='neg_mean_squared_error', n_jobs=-1)
    return np.sqrt(-cv_scores.mean())

def objective_elasticnet(trial):
    """Optuna objective for ElasticNet optimization"""
    alpha = trial.suggest_float('alpha', 0.0001, 1.0, log=True)
    l1_ratio = trial.suggest_float('l1_ratio', 0.1, 0.9)
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=42, max_iter=3000)
    
    cv_scores = cross_val_score(model, X_train_split, y_train, cv=cv, 
                               scoring='neg_mean_squared_error', n_jobs=-1)
    return np.sqrt(-cv_scores.mean())

print("\nAdvanced Linear Models Optimization:")
print("=" * 50)

# Optimize Lasso with progress
print("Optimizing Lasso...")
study_lasso = optuna.create_study(direction='minimize')
with tqdm(total=50, desc="Lasso Optimization") as pbar:
    def lasso_callback(study, trial):
        pbar.set_postfix({'Best RMSE': f'{study.best_value:.4f}'})
        pbar.update(1)
    
    study_lasso.optimize(objective_lasso, n_trials=50, callbacks=[lasso_callback])

best_lasso_params = study_lasso.best_params
best_lasso = Lasso(**best_lasso_params, random_state=42, max_iter=3000)
lasso_result = evaluate_model(best_lasso, X_train_split, y_train, cv, "Lasso_Optimized")

# Optimize ElasticNet with progress
print("Optimizing ElasticNet...")
study_elasticnet = optuna.create_study(direction='minimize')
with tqdm(total=50, desc="ElasticNet Optimization") as pbar:
    def elasticnet_callback(study, trial):
        pbar.set_postfix({'Best RMSE': f'{study.best_value:.4f}'})
        pbar.update(1)
    
    study_elasticnet.optimize(objective_elasticnet, n_trials=50, callbacks=[elasticnet_callback])

best_elasticnet_params = study_elasticnet.best_params
best_elasticnet = ElasticNet(**best_elasticnet_params, random_state=42, max_iter=3000)
elasticnet_result = evaluate_model(best_elasticnet, X_train_split, y_train, cv, "ElasticNet_Optimized")

print(f"Lasso optimized RMSE: {study_lasso.best_value:.4f}")
print(f"ElasticNet optimized RMSE: {study_elasticnet.best_value:.4f}")

[I 2025-07-12 22:40:03,264] A new study created in memory with name: no-name-5706c71f-57c7-493a-93a1-9e67346da4d1



Advanced Linear Models Optimization:
Optimizing Lasso...


Lasso Optimization:   0%|          | 0/50 [00:00<?, ?it/s][I 2025-07-12 22:40:03,610] Trial 0 finished with value: 0.13714640011524987 and parameters: {'alpha': 0.005492669479285742}. Best is trial 0 with value: 0.13714640011524987.
Lasso Optimization:   2%|▏         | 1/50 [00:00<00:16,  2.91it/s, Best RMSE=0.1371][I 2025-07-12 22:40:03,843] Trial 1 finished with value: 0.17636772658165972 and parameters: {'alpha': 0.03367870882234822}. Best is trial 0 with value: 0.13714640011524987.
Lasso Optimization:   4%|▍         | 2/50 [00:00<00:13,  3.60it/s, Best RMSE=0.1371][I 2025-07-12 22:40:04,859] Trial 2 finished with value: 0.22426911812135134 and parameters: {'alpha': 0.11613771809712}. Best is trial 0 with value: 0.13714640011524987.
Lasso Optimization:   6%|▌         | 3/50 [00:01<00:28,  1.63it/s, Best RMSE=0.1371][I 2025-07-12 22:40:05,023] Trial 3 finished with value: 0.2806114704180849 and parameters: {'alpha': 0.2186779334082228}. Best is trial 0 with value: 0.13714640011524987

Evaluating Lasso_Optimized...


                                                                 

  Fitting Lasso_Optimized on full training set...


[I 2025-07-12 22:40:16,121] A new study created in memory with name: no-name-4b41f4b4-8bcf-4cc3-bec6-1341ba3f5702


Optimizing ElasticNet...


ElasticNet Optimization:   0%|          | 0/50 [00:00<?, ?it/s][I 2025-07-12 22:40:16,286] Trial 0 finished with value: 0.12221918789398262 and parameters: {'alpha': 0.0018690637320991287, 'l1_ratio': 0.7690337612058046}. Best is trial 0 with value: 0.12221918789398262.
ElasticNet Optimization:   2%|▏         | 1/50 [00:00<00:08,  6.06it/s, Best RMSE=0.1222][I 2025-07-12 22:40:16,415] Trial 1 finished with value: 0.14487303627051898 and parameters: {'alpha': 0.014583969248838072, 'l1_ratio': 0.6366876306262411}. Best is trial 0 with value: 0.12221918789398262.
ElasticNet Optimization:   4%|▍         | 2/50 [00:00<00:06,  6.95it/s, Best RMSE=0.1222][I 2025-07-12 22:40:16,558] Trial 2 finished with value: 0.14059001398844623 and parameters: {'alpha': 0.029309789574630735, 'l1_ratio': 0.1921836658521243}. Best is trial 0 with value: 0.12221918789398262.
ElasticNet Optimization:   6%|▌         | 3/50 [00:00<00:06,  6.97it/s, Best RMSE=0.1222][I 2025-07-12 22:40:16,706] Trial 3 finished wit

Evaluating ElasticNet_Optimized...


                                                                      

  Fitting ElasticNet_Optimized on full training set...
Lasso optimized RMSE: 0.1181
ElasticNet optimized RMSE: 0.1182


In [18]:
# Compile all model results
all_results = {
    'XGBoost': xgb_result,
    'CatBoost': catboost_result,
    'LightGBM': lightgbm_result,
    'Lasso': lasso_result,
    'ElasticNet': elasticnet_result
}

print("\nOptimized Model Performance Comparison:")
print("=" * 70)
print(f"{'Model':<15} {'CV RMSE':<12} {'CV Std':<10} {'Val RMSE':<12}")
print("-" * 70)

for name, result in all_results.items():
    print(f"{name:<15} {result['cv_rmse_mean']:<12.4f} {result['cv_rmse_std']:<10.4f} "
          f"{result['val_rmse']:<12.4f}")

# Best individual model
best_individual = min(all_results.items(), key=lambda x: x[1]['cv_rmse_mean'])
print(f"\nBest individual model: {best_individual[0]} (CV RMSE: {best_individual[1]['cv_rmse_mean']:.4f})")

# Performance improvement over baseline
baseline_rmse = best_baseline[1]['cv_rmse_mean']
best_rmse = best_individual[1]['cv_rmse_mean']
improvement = (baseline_rmse - best_rmse) / baseline_rmse * 100

print(f"Improvement over baseline: {improvement:.2f}%")


Optimized Model Performance Comparison:
Model           CV RMSE      CV Std     Val RMSE    
----------------------------------------------------------------------
XGBoost         0.1146       0.0098     0.1250      
CatBoost        0.1139       0.0110     0.1254      
LightGBM        0.1185       0.0111     0.1290      
Lasso           0.1178       0.0080     0.1203      
ElasticNet      0.1179       0.0081     0.1203      

Best individual model: CatBoost (CV RMSE: 0.1139)
Improvement over baseline: 6.69%


In [None]:
class StackingEnsemble(BaseEstimator, RegressorMixin):
    """Custom stacking ensemble with out-of-fold predictions"""
    
    def __init__(self, base_models, meta_model, cv_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.cv_folds = cv_folds
        self.fitted_base_models = {}
        
    def fit(self, X, y):
        """Fit the stacking ensemble"""
        print("Training Stacking Ensemble...")
        
        # Generate out-of-fold predictions for meta-learner
        oof_predictions = np.zeros((X.shape[0], len(self.base_models)))
        
        kf = KFold(n_splits=self.cv_folds, shuffle=True, random_state=42)
        
        for i, (model_name, model) in enumerate(self.base_models.items()):
            print(f"  Processing {model_name}...")
            
            for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
                # Train on fold
                X_fold_train, X_fold_val = X.iloc[train_idx], X.iloc[val_idx]
                y_fold_train = y.iloc[train_idx]
                
                # Clone and fit model
                fold_model = clone(model)
                fold_model.fit(X_fold_train, y_fold_train)
                
                # Predict on validation fold
                oof_predictions[val_idx, i] = fold_model.predict(X_fold_val)
            
            # Fit on full dataset for final predictions
            final_model = clone(model)
            final_model.fit(X, y)
            self.fitted_base_models[model_name] = final_model
        
        # Train meta-learner on out-of-fold predictions
        print("  Training meta-learner...")
        self.meta_model.fit(oof_predictions, y)
        
        return self
    
    def predict(self, X):
        """Generate predictions using stacking ensemble"""
        # Get base model predictions
        base_predictions = np.zeros((X.shape[0], len(self.base_models)))
        
        for i, (model_name, model) in enumerate(self.fitted_base_models.items()):
            base_predictions[:, i] = model.predict(X)
        
        # Meta-learner final prediction
        return self.meta_model.predict(base_predictions)

# Import clone
from sklearn.base import clone

# Select top 3 models for stacking
top_models = sorted(all_results.items(), key=lambda x: x[1]['cv_rmse_mean'])[:3]
print(f"Top 3 models for stacking: {[model[0] for model in top_models]}")

stacking_base_models = {
    'XGBoost': best_xgb,
    'CatBoost': best_catboost,
    'LightGBM': best_lightgbm
}

# Meta-learner
meta_learner = Ridge(alpha=1.0, random_state=42)

# Create and evaluate stacking ensemble
stacking_ensemble = StackingEnsemble(
    base_models=stacking_base_models,
    meta_model=meta_learner,
    cv_folds=5
)

stacking_result = evaluate_model(stacking_ensemble, X_train, y_train, cv, "Stacking_Ensemble")
print(f"\nStacking Ensemble RMSE: {stacking_result['cv_rmse_mean']:.4f} ± {stacking_result['cv_rmse_std']:.4f}")

Top 3 models for stacking: ['CatBoost', 'XGBoost', 'Lasso']
Evaluating Stacking_Ensemble...


                                                           

ValueError: Found input variables with inconsistent numbers of samples: [1458, 1166]

In [None]:
from scipy.optimize import minimize

def optimize_weights(predictions_dict, true_values):
    """Find optimal weights for model averaging using optimization"""
    
    def objective(weights):
        """Objective function to minimize"""
        # Normalize weights
        weights = weights / np.sum(weights)
        
        # Calculate ensemble prediction
        ensemble_pred = sum(w * pred for w, pred in zip(weights, predictions_dict.values()))
        
        # Return RMSE
        return np.sqrt(mean_squared_error(true_values, ensemble_pred))
    
    # Initial equal weights
    n_models = len(predictions_dict)
    initial_weights = np.ones(n_models) / n_models
    
    # Constraints: weights sum to 1 and are non-negative
    constraints = {'type': 'eq', 'fun': lambda w: np.sum(w) - 1}
    bounds = [(0, 1) for _ in range(n_models)]
    
    # Optimize
    result = minimize(objective, initial_weights, method='SLSQP', 
                     bounds=bounds, constraints=constraints)
    
    return result.x / np.sum(result.x)  # Ensure normalization

# Generate predictions for weight optimization
print("Optimizing Weighted Averaging:")

# Get validation predictions from top models
val_predictions = {}
for name, model in stacking_base_models.items():
    model.fit(X_train, y_train)
    val_predictions[name] = model.predict(X_val)

# Optimize weights
optimal_weights = optimize_weights(val_predictions, y_val)

print("Optimal weights:")
for i, (name, weight) in enumerate(zip(stacking_base_models.keys(), optimal_weights)):
    print(f"  {name}: {weight:.4f}")

# Evaluate weighted ensemble using cross-validation
class WeightedEnsemble(BaseEstimator, RegressorMixin):
    """Weighted ensemble of models"""
    
    def __init__(self, models, weights):
        self.models = models
        self.weights = weights
        
    def fit(self, X, y):
        """Fit all models"""
        for model in self.models.values():
            model.fit(X, y)
        return self
        
    def predict(self, X):
        """Generate weighted predictions"""
        predictions = [model.predict(X) for model in self.models.values()]
        return sum(w * pred for w, pred in zip(self.weights, predictions))

weighted_ensemble = WeightedEnsemble(stacking_base_models, optimal_weights)
weighted_result = evaluate_model(weighted_ensemble, X_train, y_train, cv, "Weighted_Ensemble")
print(f"Weighted Ensemble RMSE: {weighted_result['cv_rmse_mean']:.4f} ± {weighted_result['cv_rmse_std']:.4f}")

In [None]:
# Simple average ensemble for comparison
class SimpleEnsemble(BaseEstimator, RegressorMixin):
    """Simple averaging ensemble"""
    
    def __init__(self, models):
        self.models = models
        
    def fit(self, X, y):
        for model in self.models.values():
            model.fit(X, y)
        return self
        
    def predict(self, X):
        predictions = [model.predict(X) for model in self.models.values()]
        return np.mean(predictions, axis=0)

simple_ensemble = SimpleEnsemble(stacking_base_models)
simple_result = evaluate_model(simple_ensemble, X_train, y_train, cv, "Simple_Ensemble")
print(f"Simple Ensemble RMSE: {simple_result['cv_rmse_mean']:.4f} ± {simple_result['cv_rmse_std']:.4f}")

# Ensemble comparison
ensemble_results = {
    'Stacking': stacking_result,
    'Weighted': weighted_result,
    'Simple': simple_result
}

print("\nEnsemble Method Comparison:")
print("-" * 60)
for name, result in ensemble_results.items():
    print(f"{name:<12}: {result['cv_rmse_mean']:.4f} ± {result['cv_rmse_std']:.4f}")

best_ensemble = min(ensemble_results.items(), key=lambda x: x[1]['cv_rmse_mean'])
print(f"\nBest ensemble: {best_ensemble[0]} (RMSE: {best_ensemble[1]['cv_rmse_mean']:.4f})")

In [None]:
# Compile all model performances
all_model_results = {**all_results, **ensemble_results}

print("\nFinal Model Performance Matrix:")
print("=" * 90)
print(f"{'Model':<20} {'CV RMSE':<12} {'CV Std':<10} {'Val RMSE':<12} {'Improvement':<12}")
print("-" * 90)

baseline_cv_rmse = best_baseline[1]['cv_rmse_mean']

for name, result in sorted(all_model_results.items(), key=lambda x: x[1]['cv_rmse_mean']):
    improvement = (baseline_cv_rmse - result['cv_rmse_mean']) / baseline_cv_rmse * 100
    print(f"{name:<20} {result['cv_rmse_mean']:<12.4f} {result['cv_rmse_std']:<10.4f} "
          f"{result['val_rmse']:<12.4f} {improvement:<12.2f}%")

# Select final model
final_model_name = min(all_model_results.items(), key=lambda x: x[1]['cv_rmse_mean'])[0]
final_model_rmse = all_model_results[final_model_name]['cv_rmse_mean']

print(f"\nFinal Model Selected: {final_model_name}")
print(f"Final Model RMSE: {final_model_rmse:.4f}")
print(f"Target RMSE (0.1185): {'✓ ACHIEVED' if final_model_rmse < 0.1185 else '✗ NOT ACHIEVED'}")

# Determine which model to use for final predictions
if final_model_name == "Stacking_Ensemble":
    final_model = stacking_ensemble
elif final_model_name == "Weighted_Ensemble":
    final_model = weighted_ensemble
elif final_model_name == "Simple_Ensemble":
    final_model = simple_ensemble
else:
    # Individual model
    model_map = {
        'XGBoost': best_xgb,
        'CatBoost': best_catboost,
        'LightGBM': best_lightgbm,
        'Lasso': best_lasso,
        'ElasticNet': best_elasticnet
    }
    final_model = model_map[final_model_name]

In [None]:
# Comprehensive validation of final model
print("Final Model Validation:")
print("=" * 50)

# Fit final model on full training data
final_model.fit(X_train_full, y_log)

# Validation set predictions
val_predictions_log = final_model.predict(X_val)
val_predictions_original = np.exp(val_predictions_log)
y_val_original = np.exp(y_val)

# Calculate multiple metrics
val_rmse_log = np.sqrt(mean_squared_error(y_val, val_predictions_log))
val_rmse_original = np.sqrt(mean_squared_error(y_val_original, val_predictions_original))
val_mae_original = mean_absolute_error(y_val_original, val_predictions_original)
val_r2 = r2_score(y_val_original, val_predictions_original)

print(f"Validation Metrics:")
print(f"  RMSE (log scale): {val_rmse_log:.4f}")
print(f"  RMSE (original): ${val_rmse_original:,.0f}")
print(f"  MAE (original): ${val_mae_original:,.0f}")
print(f"  R² Score: {val_r2:.4f}")

# Calculate percentage errors
percentage_errors = np.abs((val_predictions_original - y_val_original) / y_val_original) * 100
print(f"  Mean Absolute Percentage Error: {percentage_errors.mean():.2f}%")
print(f"  Median Absolute Percentage Error: {np.median(percentage_errors):.2f}%")

In [None]:
# Residual analysis
print("\nResidual Analysis:")

residuals_log = y_val - val_predictions_log
residuals_original = y_val_original - val_predictions_original

# Create residual plots
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(15, 10))

# Predicted vs Actual (log scale)
ax1.scatter(val_predictions_log, y_val, alpha=0.6)
ax1.plot([y_val.min(), y_val.max()], [y_val.min(), y_val.max()], 'r--', lw=2)
ax1.set_xlabel('Predicted (log scale)')
ax1.set_ylabel('Actual (log scale)')
ax1.set_title('Predicted vs Actual (Log Scale)')

# Predicted vs Actual (original scale)
ax2.scatter(val_predictions_original, y_val_original, alpha=0.6)
ax2.plot([y_val_original.min(), y_val_original.max()], 
         [y_val_original.min(), y_val_original.max()], 'r--', lw=2)
ax2.set_xlabel('Predicted Price ($)')
ax2.set_ylabel('Actual Price ($)')
ax2.set_title('Predicted vs Actual (Original Scale)')

# Residuals vs Predicted (log scale)
ax3.scatter(val_predictions_log, residuals_log, alpha=0.6)
ax3.axhline(y=0, color='r', linestyle='--')
ax3.set_xlabel('Predicted (log scale)')
ax3.set_ylabel('Residuals (log scale)')
ax3.set_title('Residuals vs Predicted (Log Scale)')

# Residual distribution
ax4.hist(residuals_log, bins=30, alpha=0.7, edgecolor='black')
ax4.set_xlabel('Residuals (log scale)')
ax4.set_ylabel('Frequency')
ax4.set_title('Residual Distribution')

plt.tight_layout()
plt.show()

# Statistical tests on residuals
from scipy.stats import normaltest, shapiro

# Normality test
stat, p_value = normaltest(residuals_log)
print(f"Residual Normality Test (D'Agostino): statistic={stat:.4f}, p-value={p_value:.4f}")

if p_value > 0.05:
    print("  ✓ Residuals appear normally distributed")
else:
    print("  ⚠ Residuals may not be normally distributed")

# Check for systematic bias
print(f"Mean residual: {residuals_log.mean():.6f}")
print(f"Residual standard deviation: {residuals_log.std():.4f}")

In [None]:
# Feature importance analysis for tree-based models
print("\nFeature Importance Analysis:")

if hasattr(final_model, 'feature_importances_'):
    # Direct feature importance
    feature_importance = pd.DataFrame({
        'Feature': X_train_full.columns,
        'Importance': final_model.feature_importances_
    }).sort_values('Importance', ascending=False)
    
elif hasattr(final_model, 'models'):
    # Ensemble model - average importances
    all_importances = []
    for model in final_model.models.values():
        if hasattr(model, 'feature_importances_'):
            all_importances.append(model.feature_importances_)
    
    if all_importances:
        avg_importance = np.mean(all_importances, axis=0)
        feature_importance = pd.DataFrame({
            'Feature': X_train_full.columns,
            'Importance': avg_importance
        }).sort_values('Importance', ascending=False)
    else:
        feature_importance = None

elif hasattr(final_model, 'coef_'):
    # Linear model - use absolute coefficients
    feature_importance = pd.DataFrame({
        'Feature': X_train_full.columns,
        'Importance': np.abs(final_model.coef_)
    }).sort_values('Importance', ascending=False)
else:
    feature_importance = None

if feature_importance is not None:
    print("Top 20 Most Important Features:")
    print(feature_importance.head(20).to_string(index=False))
    
    # Categorize top features
    top_20_features = feature_importance.head(20)['Feature'].tolist()
    engineered_features = [f for f in top_20_features 
                          if any(suffix in f for suffix in ['_ord', '_log', '_TE', 'Total', 'Has', 'Avg', 'Age', 'Years'])]
    
    print(f"\nFeature Engineering Impact in Top 20:")
    print(f"  Engineered features: {len(engineered_features)}/20 ({len(engineered_features)/20*100:.1f}%)")
    print(f"  Original features: {20-len(engineered_features)}/20 ({(20-len(engineered_features))/20*100:.1f}%)")
else:
    print("Feature importance not available for this model type")