In [1]:
import pandas as pd
import numpy as np
import time
import os
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import (
    mean_absolute_error, 
    mean_squared_error, 
    r2_score, 
    median_absolute_error, 
    explained_variance_score, 
    max_error
)
from tqdm.auto import tqdm 

In [2]:
# 1. LOAD TOP FEATURE DATASETS
# Feature matrices stay as DataFrames; each target CSV is flattened to a 1-D array.
X_train_top, X_test_top = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/X_train_top.csv', '../data/X_test_top.csv')
)
y_train, y_test = (
    pd.read_csv(csv_path).to_numpy().ravel()
    for csv_path in ('../data/y_train.csv', '../data/y_test.csv')
)

In [3]:
# 2. DEFINE PIPELINE
def create_pipeline(model):
    """Build a two-step pipeline that standardizes features and then applies `model`.

    Args:
        model: Estimator instance placed in the final step, named 'regressor'.

    Returns:
        A new Pipeline whose steps are 'scaler' (a fresh StandardScaler)
        followed by 'regressor' (the given model).
    """
    steps = [('scaler', StandardScaler()), ('regressor', model)]
    return Pipeline(steps=steps)

# 3. INITIALIZE MODELS
# Candidate regressors keyed by display name. Every estimator that takes a
# random_state is given 42.
models = dict([
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge(alpha=1.0, random_state=42)),
    ('Lasso Regression', Lasso(alpha=0.1, random_state=42, max_iter=10000)),
    ('Decision Tree', DecisionTreeRegressor(max_depth=10, random_state=42)),
    (
        'Random Forest',
        RandomForestRegressor(
            n_estimators=100,
            max_depth=10,
            random_state=42,
            n_jobs=-1,
        ),
    ),
    (
        'Gradient Boosting',
        GradientBoostingRegressor(n_estimators=100, random_state=42),
    ),
])

# 4. TRAIN AND EVALUATE
# For every candidate model: wrap it in the scaling pipeline, fit on the training
# split, predict on the test split, and append one dict of metrics to `results`.
results = []
threshold = 15  # absolute-error tolerance used for the 'Acc ±15 (%)' column

for name, model in tqdm(models.items(), desc="Evaluating Models on Top Features"):
    pipe = create_pipeline(model)

    # Training time. perf_counter() is a monotonic, high-resolution clock meant for
    # measuring intervals; time.time() can jump if the system clock is adjusted.
    start_train = time.perf_counter()
    pipe.fit(X_train_top, y_train)
    training_time = time.perf_counter() - start_train

    # Prediction time (same clock as above so the two columns are comparable).
    start_pred = time.perf_counter()
    y_pred = pipe.predict(X_test_top)
    prediction_time = time.perf_counter() - start_pred

    # Metric calculations on the held-out test split.
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE = sqrt(MSE)
    r2 = r2_score(y_test, y_pred)
    med_ae = median_absolute_error(y_test, y_pred)
    var_score = explained_variance_score(y_test, y_pred)
    m_error = max_error(y_test, y_pred)
    # Percentage of test predictions whose absolute error is at most `threshold`.
    within_threshold = np.mean(np.abs(y_test - y_pred) <= threshold) * 100

    # Values are rounded for display; the rounded numbers are what gets stored.
    results.append({
        'Model': name,
        'MAE': round(mae, 2),
        'RMSE': round(rmse, 2),
        'R2 Score': round(r2, 4),
        'Median AE': round(med_ae, 2),
        'Variance Score': round(var_score, 4),
        'Max Error': round(m_error, 2),
        'Acc ±15 (%)': round(within_threshold, 2),
        'Train Time (s)': round(training_time, 4),
        'Pred Time (s)': round(prediction_time, 4)
    })

# 5. DISPLAY RESULTS
# Build the comparison table (one row per model) and print it as plain text,
# framed above and below by 120-character rules.
performance_df = pd.DataFrame(results)
print(
    "\n" + "=" * 120,
    "MODEL PERFORMANCE ON TOP FEATURES",
    "=" * 120,
    performance_df.to_string(index=False),
    "=" * 120,
    sep="\n",
)

# 6. BEST MODELS BY METRIC
print("\nBEST PERFORMING MODELS:")
print("-" * 50)
metrics_lower_better = ['MAE', 'RMSE', 'Median AE', 'Max Error', 'Train Time (s)', 'Pred Time (s)']
metrics_higher_better = ['R2 Score', 'Variance Score', 'Acc ±15 (%)']

# Each group is paired with the Series methods that locate and return its best
# value: the minimum for error/time metrics, the maximum for score metrics.
for metric_group, find_best_index, find_best_value in (
    (metrics_lower_better, pd.Series.idxmin, pd.Series.min),
    (metrics_higher_better, pd.Series.idxmax, pd.Series.max),
):
    for metric in metric_group:
        metric_column = performance_df[metric]
        best_model = performance_df.loc[find_best_index(metric_column), 'Model']
        best_value = find_best_value(metric_column)
        print(f"  {metric:20s}: {best_model:20s} ({best_value})")

# 7. SAVE RESULTS (Create directory if needed)
results_dir = '../results'
results_csv = '../results/model_comparison_top_features.csv'

os.makedirs(results_dir, exist_ok=True)  # no error if the directory already exists
performance_df.to_csv(results_csv, index=False)  # omit the row index from the file
print("\n✓ Results saved to '../results/model_comparison_top_features.csv'")

Evaluating Models on Top Features:   0%|          | 0/6 [00:00<?, ?it/s]


MODEL PERFORMANCE ON TOP FEATURES
            Model   MAE  RMSE  R2 Score  Median AE  Variance Score  Max Error  Acc ±15 (%)  Train Time (s)  Pred Time (s)
Linear Regression 11.98 16.83    0.3225       7.72          0.3225      80.18        71.03          0.0637         0.0396
 Ridge Regression 11.98 16.83    0.3225       7.72          0.3225      80.18        71.03          0.0669         0.0052
 Lasso Regression 12.00 16.84    0.3222       7.75          0.3222      80.82        71.11          0.1859         0.0060
    Decision Tree 11.59 16.72    0.3315       7.20          0.3315      88.88        73.22          1.3020         0.0076
    Random Forest 11.30 16.04    0.3845       7.12          0.3845      84.41        73.27          7.7947         0.1047
Gradient Boosting 11.90 16.68    0.3348       7.73          0.3348      79.31        71.40         39.4303         0.0672

BEST PERFORMING MODELS:
--------------------------------------------------
  MAE                 : Random Fore