In [5]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Load cleaned data
print("Loading data for modeling...")
df = pd.read_csv('cleaned_infectious_disease.csv')
print(f"Dataset shape: {df.shape}")

# Filter for total population (not gender-specific)
df_total = df[df['Sex'] == 'Total'].copy()
print(f"Total records (excluding gender breakdown): {df_total.shape[0]}")

# 1. PROBLEM FORMULATION
print("\n 1. PROBLEM FORMULATION")
print("="*30)

"""
We will explore three modeling approaches:

1. REGRESSION: Predict incidence rate (Rate) for each county-year combination
2. CLASSIFICATION: Predict high-risk counties (above median rate)
3. TIME SERIES: Forecast future incidence rates

Primary focus: Regression problem to predict incidence rates
"""

Loading data for modeling...
Dataset shape: (141777, 14)
Total records (excluding gender breakdown): 47259

 1. PROBLEM FORMULATION


'\nWe will explore three modeling approaches:\n\n1. REGRESSION: Predict incidence rate (Rate) for each county-year combination\n2. CLASSIFICATION: Predict high-risk counties (above median rate)\n3. TIME SERIES: Forecast future incidence rates\n\nPrimary focus: Regression problem to predict incidence rates\n'

In [6]:
# 2. FEATURE ENGINEERING
print("\n2. FEATURE ENGINEERING")
print("="*30)

# Create features at county-year level
features_df = df_total.copy()

# Basic temporal features
features_df['Year_Since_2000'] = features_df['Year'] - 2000
features_df['Year_Squared'] = features_df['Year_Since_2000'] ** 2

# Lag features (previous year's rate)
features_df = features_df.sort_values(['County', 'Year'])

# Create lagged rate for each county
features_df['Rate_Lag1'] = features_df.groupby('County')['Rate'].shift(1)
features_df['Rate_Lag2'] = features_df.groupby('County')['Rate'].shift(2)
features_df['Rate_Lag3'] = features_df.groupby('County')['Rate'].shift(3)

# Moving averages
features_df['Rate_MA2'] = features_df.groupby('County')['Rate'].rolling(window=2).mean().reset_index(level=0, drop=True)
features_df['Rate_MA3'] = features_df.groupby('County')['Rate'].rolling(window=3).mean().reset_index(level=0, drop=True)

# Rate change features
features_df['Rate_Change_Lag1'] = features_df['Rate'] - features_df['Rate_Lag1']
features_df['Rate_Pct_Change_Lag1'] = (features_df['Rate_Change_Lag1'] / features_df['Rate_Lag1']) * 100

# County-level statistics (using only past data to avoid data leakage)
county_stats = features_df.groupby('County').agg({
    'Rate': ['mean', 'std', 'min', 'max']
}).round(3)
county_stats.columns = ['County_Rate_Mean', 'County_Rate_Std', 'County_Rate_Min', 'County_Rate_Max']

# Merge county statistics
features_df = features_df.merge(county_stats, left_on='County', right_index=True, how='left')

# Create rate z-score relative to county history
features_df['Rate_Z_Score'] = (features_df['Rate'] - features_df['County_Rate_Mean']) / features_df['County_Rate_Std']

# Population density proxy (if we had area data, we'd use actual density)
# Using population percentiles instead
population_percentiles = features_df['Population'].rank(pct=True)
features_df['Population_Percentile'] = population_percentiles

# Create interaction features
features_df['Population_Year_Interaction'] = features_df['Population'] * features_df['Year_Since_2000']
features_df['Rate_Lag1_Population'] = features_df['Rate_Lag1'] * features_df['Population']

# Regional features (group counties by rate patterns)
# First, identify county clusters from EDA
from sklearn.cluster import KMeans
county_features = county_stats.values
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)
county_clusters = kmeans.fit_predict(county_features)
county_cluster_map = dict(zip(county_stats.index, county_clusters))
features_df['County_Cluster'] = features_df['County'].map(county_cluster_map)

# Create dummy variables for categorical features
county_dummies = pd.get_dummies(features_df['County'], prefix='County', drop_first=True)
cluster_dummies = pd.get_dummies(features_df['County_Cluster'], prefix='Cluster')

# Combine all features
X_full = pd.concat([
    features_df[[
        'Year_Since_2000', 'Year_Squared', 'Population', 'Population_Percentile',
        'Rate_Lag1', 'Rate_Lag2', 'Rate_Lag3', 'Rate_MA2', 'Rate_MA3',
        'Rate_Change_Lag1', 'Rate_Pct_Change_Lag1',
        'County_Rate_Mean', 'County_Rate_Std', 'Rate_Z_Score',
        'Population_Year_Interaction', 'Rate_Lag1_Population'
    ]],
    county_dummies,
    cluster_dummies
], axis=1)

# Target variable
y = features_df['Rate']

# Remove rows with missing values (from lag features)
valid_mask = ~X_full.isnull().any(axis=1) & ~y.isnull()
X_full = X_full[valid_mask]
y = y[valid_mask]
features_df = features_df[valid_mask]

print(f"Final dataset shape: {X_full.shape}")
print(f"Feature columns: {list(X_full.columns)}")


2. FEATURE ENGINEERING
Final dataset shape: (23856, 78)
Feature columns: ['Year_Since_2000', 'Year_Squared', 'Population', 'Population_Percentile', 'Rate_Lag1', 'Rate_Lag2', 'Rate_Lag3', 'Rate_MA2', 'Rate_MA3', 'Rate_Change_Lag1', 'Rate_Pct_Change_Lag1', 'County_Rate_Mean', 'County_Rate_Std', 'Rate_Z_Score', 'Population_Year_Interaction', 'Rate_Lag1_Population', 'County_Alpine', 'County_Amador', 'County_Butte', 'County_Calaveras', 'County_California', 'County_Colusa', 'County_Contra Costa', 'County_Del Norte', 'County_El Dorado', 'County_Fresno', 'County_Glenn', 'County_Humboldt', 'County_Imperial', 'County_Inyo', 'County_Kern', 'County_Kings', 'County_Lake', 'County_Lassen', 'County_Los Angeles', 'County_Madera', 'County_Marin', 'County_Mariposa', 'County_Mendocino', 'County_Merced', 'County_Modoc', 'County_Mono', 'County_Monterey', 'County_Napa', 'County_Nevada', 'County_Orange', 'County_Placer', 'County_Plumas', 'County_Riverside', 'County_Sacramento', 'County_San Benito', 'County_

In [8]:
# 3. FEATURE SELECTION
print("\n3. FEATURE SELECTION")
print("="*30)

from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression
from sklearn.feature_selection import RFE
from sklearn.linear_model import LinearRegression

# Split data for feature selection
from sklearn.model_selection import train_test_split
X_temp, X_test, y_temp, y_test = train_test_split(
    X_full, y, test_size=0.2, random_state=42, shuffle=False
)

# Method 1: Correlation with target
correlations = X_temp.corrwith(y_temp).abs().sort_values(ascending=False)
print("Top 10 features by absolute correlation:")
print(correlations.head(10))

# Method 2: SelectKBest using f_regression
selector_kbest = SelectKBest(score_func=f_regression, k=20)
X_kbest = selector_kbest.fit_transform(X_temp, y_temp)
selected_features_kbest = X_temp.columns[selector_kbest.get_support()].tolist()
print(f"\nSelected {len(selected_features_kbest)} features using SelectKBest:")

# Method 3: Recursive Feature Elimination
estimator = LinearRegression()
selector_rfe = RFE(estimator, n_features_to_select=15, step=1)
selector_rfe.fit(X_temp, y_temp)
selected_features_rfe = X_temp.columns[selector_rfe.get_support()].tolist()
print(f"\nSelected {len(selected_features_rfe)} features using RFE:")

# Combine selection methods
selected_features = list(set(selected_features_kbest[:15] + selected_features_rfe))
print(f"\nTotal unique selected features: {len(selected_features)}")
print("Selected features:")
for i, feat in enumerate(sorted(selected_features), 1):
    print(f"  {i:2d}. {feat}")

# Use selected features
X_selected = X_full[selected_features]



3. FEATURE SELECTION
Top 10 features by absolute correlation:
Rate_Z_Score            0.898879
Rate_Change_Lag1        0.707160
Rate_MA2                0.707052
Rate_MA3                0.667718
County_Rate_Mean        0.050621
Cluster_3               0.049963
County_Rate_Std         0.047817
County_San Francisco    0.037481
County_Alpine           0.035517
County_Kern             0.031082
dtype: float64


ValueError: Input X contains infinity or a value too large for dtype('float64').

In [9]:
# 4. TRAIN/VALIDATION/TEST SPLITS
print("\n4. DATA SPLITTING STRATEGY")
print("="*30)

# Time-based splitting (since this is time series data)
# Use first 80% of years for training, next 10% for validation, last 10% for testing

# Get unique years
unique_years = sorted(features_df['Year'].unique())
train_years = unique_years[:int(len(unique_years)*0.8)]  # 2001-2010
val_years = unique_years[int(len(unique_years)*0.8):int(len(unique_years)*0.9)]  # 2011-2012
test_years = unique_years[int(len(unique_years)*0.9):]  # 2013-2014

print(f"Training years: {train_years}")
print(f"Validation years: {val_years}")
print(f"Test years: {test_years}")

# Create masks
train_mask = features_df['Year'].isin(train_years)
val_mask = features_df['Year'].isin(val_years)
test_mask = features_df['Year'].isin(test_years)

# Split data
X_train = X_selected[train_mask]
X_val = X_selected[val_mask]
X_test = X_selected[test_mask]

y_train = y[train_mask]
y_val = y[val_mask]
y_test = y[test_mask]

print(f"\nData split sizes:")
print(f"Training set: {X_train.shape}")
print(f"Validation set: {X_val.shape}")
print(f"Test set: {X_test.shape}")


4. DATA SPLITTING STRATEGY
Training years: [np.int64(2001), np.int64(2002), np.int64(2003), np.int64(2004), np.int64(2005), np.int64(2006), np.int64(2007), np.int64(2008), np.int64(2009), np.int64(2010), np.int64(2011)]
Validation years: [np.int64(2012)]
Test years: [np.int64(2013), np.int64(2014)]


NameError: name 'X_selected' is not defined

In [10]:
# 5. BASELINE MODELS
print("\n5. BASELINE MODELS")
print("="*30)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def evaluate_model(y_true, y_pred, model_name):
    """Calculate and display evaluation metrics."""
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    
    print(f"{model_name}:")
    print(f"  MAE:  {mae:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R²:   {r2:.4f}")
    print(f"  MAPE: {np.mean(np.abs((y_true - y_pred) / y_true)) * 100:.2f}%")
    
    return {'MAE': mae, 'RMSE': rmse, 'R2': r2}

# Baseline 1: Mean predictor (predict mean of training data)
train_mean = y_train.mean()
y_pred_mean = np.full_like(y_val, train_mean)
baseline1_results = evaluate_model(y_val, y_pred_mean, "Baseline 1: Mean Predictor")

# Baseline 2: Last value predictor (for time series)
# For each county, use last available rate
last_rates = features_df[features_df['Year'].isin(train_years)].groupby('County')['Rate'].last()
y_pred_last = features_df[val_mask].apply(
    lambda row: last_rates.get(row['County'], train_mean), axis=1
)
baseline2_results = evaluate_model(y_val, y_pred_last, "Baseline 2: Last Value Predictor")

# Baseline 3: County mean predictor
county_means = features_df[features_df['Year'].isin(train_years)].groupby('County')['Rate'].mean()
y_pred_county_mean = features_df[val_mask].apply(
    lambda row: county_means.get(row['County'], train_mean), axis=1
)
baseline3_results = evaluate_model(y_val, y_pred_county_mean, "Baseline 3: County Mean Predictor")


5. BASELINE MODELS


NameError: name 'y_train' is not defined

In [11]:
# 6. LINEAR MODELS
print("\n6. LINEAR MODELS")
print("="*30)

from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler

# Scale features for regularized models
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

# Linear Regression
print("\nLinear Regression:")
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_val)
lr_results = evaluate_model(y_val, y_pred_lr, "Linear Regression")

# Ridge Regression
print("\nRidge Regression:")
ridge = Ridge(alpha=1.0, random_state=42)
ridge.fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_val_scaled)
ridge_results = evaluate_model(y_val, y_pred_ridge, "Ridge Regression")

# Lasso Regression
print("\nLasso Regression:")
lasso = Lasso(alpha=0.01, random_state=42, max_iter=10000)
lasso.fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_val_scaled)
lasso_results = evaluate_model(y_val, y_pred_lasso, "Lasso Regression")

# Check feature importance from Lasso
lasso_coef = pd.DataFrame({
    'feature': selected_features,
    'coefficient': lasso.coef_
}).sort_values('coefficient', key=abs, ascending=False)

print("\nTop 10 features by Lasso coefficient magnitude:")
print(lasso_coef.head(10))



6. LINEAR MODELS


NameError: name 'X_train' is not defined

In [12]:
# 7. TREE-BASED MODELS
print("\n7. TREE-BASED MODELS")
print("="*30)

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Decision Tree
print("\nDecision Tree:")
dt = DecisionTreeRegressor(max_depth=5, random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_val)
dt_results = evaluate_model(y_val, y_pred_dt, "Decision Tree")

# Random Forest
print("\nRandom Forest:")
rf = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_val)
rf_results = evaluate_model(y_val, y_pred_rf, "Random Forest")

# Gradient Boosting
print("\nGradient Boosting:")
gb = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, 
                              max_depth=3, random_state=42)
gb.fit(X_train, y_train)
y_pred_gb = gb.predict(X_val)
gb_results = evaluate_model(y_val, y_pred_gb, "Gradient Boosting")

# Feature importance from Random Forest
rf_importance = pd.DataFrame({
    'feature': selected_features,
    'importance': rf.feature_importances_
}).sort_values('importance', ascending=False)

print("\nTop 10 features by Random Forest importance:")
print(rf_importance.head(10))



7. TREE-BASED MODELS

Decision Tree:


NameError: name 'X_train' is not defined

In [14]:
# 8. ENSEMBLE METHODS
print("\n8. ENSEMBLE METHODS")
print("="*30)

from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.svm import SVR

# Voting Regressor
print("\nVoting Regressor:")
voting_reg = VotingRegressor([
    ('lr', LinearRegression()),
    ('ridge', Ridge(alpha=1.0)),
    ('rf', RandomForestRegressor(n_estimators=50, random_state=42))
])
voting_reg.fit(X_train, y_train)
y_pred_voting = voting_reg.predict(X_val)
voting_results = evaluate_model(y_val, y_pred_voting, "Voting Regressor")

# Stacking Regressor
print("\nStacking Regressor:")
stacking_reg = StackingRegressor(
    estimators=[
        ('ridge', Ridge(alpha=1.0)),
        ('rf', RandomForestRegressor(n_estimators=50, random_state=42)),
        ('gb', GradientBoostingRegressor(n_estimators=50, random_state=42))
    ],
    final_estimator=LinearRegression(),
    cv=5
)
stacking_reg.fit(X_train, y_train)
y_pred_stacking = stacking_reg.predict(X_val)
stacking_results = evaluate_model(y_val, y_pred_stacking, "Stacking Regressor")



8. ENSEMBLE METHODS

Voting Regressor:


NameError: name 'X_train' is not defined

In [15]:
# 9. NEURAL NETWORKS
print("\n9. NEURAL NETWORK MODELS")
print("="*30)

try:
    from sklearn.neural_network import MLPRegressor
    
    print("\nMulti-layer Perceptron:")
    mlp = MLPRegressor(
        hidden_layer_sizes=(64, 32),
        activation='relu',
        solver='adam',
        alpha=0.001,
        batch_size=32,
        learning_rate='adaptive',
        max_iter=500,
        random_state=42
    )
    mlp.fit(X_train_scaled, y_train)
    y_pred_mlp = mlp.predict(X_val_scaled)
    mlp_results = evaluate_model(y_val, y_pred_mlp, "MLP Regressor")
    
except ImportError:
    print("MLPRegressor not available. Skipping neural network models.")


9. NEURAL NETWORK MODELS

Multi-layer Perceptron:


NameError: name 'X_train_scaled' is not defined

In [16]:
# 10. CROSS-VALIDATION STRATEGIES
print("\n10. CROSS-VALIDATION RESULTS")
print("="*30)

from sklearn.model_selection import TimeSeriesSplit, cross_val_score

# Time Series Cross-Validation
tscv = TimeSeriesSplit(n_splits=5)

# Evaluate models with time series CV
models = {
    'Linear Regression': LinearRegression(),
    'Ridge Regression': Ridge(alpha=1.0),
    'Random Forest': RandomForestRegressor(n_estimators=50, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=50, random_state=42)
}

print("Time Series Cross-Validation Results (Negative RMSE):")
print("-" * 50)

cv_results = {}
for name, model in models.items():
    # Use scaled data for Ridge, original for others
    if name == 'Ridge Regression':
        X_cv = X_train_scaled
    else:
        X_cv = X_train
    
    scores = cross_val_score(model, X_cv, y_train, 
                           cv=tscv, scoring='neg_root_mean_squared_error',
                           n_jobs=-1)
    cv_results[name] = -scores.mean()
    print(f"{name:20s}: Mean RMSE = {-scores.mean():.4f} (±{scores.std():.4f})")
any


10. CROSS-VALIDATION RESULTS
Time Series Cross-Validation Results (Negative RMSE):
--------------------------------------------------


NameError: name 'X_train' is not defined

In [17]:
# 11. MODEL COMPARISON
print("\n11. MODEL PERFORMANCE COMPARISON")
print("="*30)

# Collect all results
all_results = {
    'Mean Predictor': baseline1_results,
    'Last Value': baseline2_results,
    'County Mean': baseline3_results,
    'Linear Regression': lr_results,
    'Ridge Regression': ridge_results,
    'Lasso Regression': lasso_results,
    'Decision Tree': dt_results,
    'Random Forest': rf_results,
    'Gradient Boosting': gb_results,
    'Voting Regressor': voting_results,
    'Stacking Regressor': stacking_results
}

# Add MLP results if available
if 'mlp_results' in locals():
    all_results['MLP Regressor'] = mlp_results

# Create comparison dataframe
results_df = pd.DataFrame(all_results).T
results_df = results_df.sort_values('RMSE')

print("\nModel Performance Ranking (by RMSE):")
print("-" * 50)
print(results_df[['RMSE', 'MAE', 'R2']].round(4))

# Visualize model comparison
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
models_sorted = results_df.index.tolist()
rmse_sorted = results_df['RMSE'].values

bars = plt.barh(range(len(models_sorted)), rmse_sorted, color='steelblue', alpha=0.7)
plt.yticks(range(len(models_sorted)), models_sorted)
plt.xlabel('RMSE (Lower is Better)')
plt.title('Model Performance Comparison (Validation Set)')
plt.gca().invert_yaxis()  # Highest RMSE at top

# Add value labels
for i, bar in enumerate(bars):
    width = bar.get_width()
    plt.text(width + 0.1, bar.get_y() + bar.get_height()/2, 
             f'{width:.3f}', ha='left', va='center')

plt.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.savefig('model_comparison.png', dpi=300, bbox_inches='tight')
plt.show()


11. MODEL PERFORMANCE COMPARISON


NameError: name 'baseline1_results' is not defined

In [18]:
# 12. SELECT BEST MODEL FOR FURTHER EVALUATION
print("\n12. SELECTING BEST MODEL")
print("="*30)

best_model_name = results_df.index[0]
print(f"Best model based on validation RMSE: {best_model_name}")
print(f"Validation RMSE: {results_df.loc[best_model_name, 'RMSE']:.4f}")
print(f"Validation R²: {results_df.loc[best_model_name, 'R2']:.4f}")

# Retrain best model on combined training + validation data
print(f"\nRetraining {best_model_name} on combined training + validation data...")

# Combine training and validation sets
X_train_val = pd.concat([X_train, X_val])
y_train_val = pd.concat([y_train, y_val])

# Retrain the best model
if best_model_name == 'Random Forest':
    best_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
elif best_model_name == 'Gradient Boosting':
    best_model = GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, 
                                          max_depth=3, random_state=42)
elif best_model_name == 'Ridge Regression':
    best_model = Ridge(alpha=1.0, random_state=42)
    # Scale data for Ridge
    scaler_full = StandardScaler()
    X_train_val_scaled = scaler_full.fit_transform(X_train_val)
    X_test_scaled = scaler_full.transform(X_test)
    best_model.fit(X_train_val_scaled, y_train_val)
    y_pred_test = best_model.predict(X_test_scaled)
else:
    # Default to Random Forest if model not explicitly handled
    best_model = RandomForestRegressor(n_estimators=100, max_depth=10, random_state=42, n_jobs=-1)
    best_model.fit(X_train_val, y_train_val)
    y_pred_test = best_model.predict(X_test)

if best_model_name != 'Ridge Regression':
    best_model.fit(X_train_val, y_train_val)
    y_pred_test = best_model.predict(X_test)

# Evaluate on test set
print(f"\nFinal Evaluation on Test Set ({test_years}):")
test_results = evaluate_model(y_test, y_pred_test, f"{best_model_name} (Test Set)")


12. SELECTING BEST MODEL


NameError: name 'results_df' is not defined

In [19]:
# 13. SAVE MODELS AND FEATURES
print("\n13. SAVING MODELS AND FEATURES")
print("="*30)

import joblib
import json

# Save best model
model_filename = f'best_model_{best_model_name.lower().replace(" ", "_")}.pkl'
joblib.dump(best_model, model_filename)
print(f"Saved best model: {model_filename}")

# Save scaler if used
if best_model_name == 'Ridge Regression':
    joblib.dump(scaler_full, 'scaler.pkl')
    print("Saved scaler: scaler.pkl")

# Save feature list
feature_info = {
    'selected_features': selected_features,
    'feature_importance': rf_importance.to_dict('records'),
    'data_split': {
        'train_years': train_years,
        'val_years': val_years,
        'test_years': test_years
    }
}

with open('feature_info.json', 'w') as f:
    json.dump(feature_info, f, indent=2)
print("Saved feature information: feature_info.json")

# Save predictions
predictions_df = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred_test,
    'County': features_df.loc[test_mask, 'County'],
    'Year': features_df.loc[test_mask, 'Year']
})
predictions_df.to_csv('test_predictions.csv', index=False)
print("Saved test predictions: test_predictions.csv")


13. SAVING MODELS AND FEATURES


NameError: name 'best_model_name' is not defined

In [20]:
# 14. MODEL SUMMARY
print("\n14. MODELING SUMMARY")
print("="*30)

print(f"Best Model: {best_model_name}")
print(f"Number of Features: {len(selected_features)}")
print(f"Training Size: {X_train.shape[0]} samples")
print(f"Validation Size: {X_val.shape[0]} samples")
print(f"Test Size: {X_test.shape[0]} samples")

print(f"\nPerformance Metrics:")
print(f"  Validation RMSE: {results_df.loc[best_model_name, 'RMSE']:.4f}")
print(f"  Test RMSE: {test_results['RMSE']:.4f}")
print(f"  Test R²: {test_results['R2']:.4f}")

print("\nKey Features (from Random Forest):")
for i, row in rf_importance.head(5).iterrows():
    print(f"  {row['feature']}: {row['importance']:.4f}")

print("\n" + "="*50)
print("MODEL TRAINING COMPLETE")
print("="*50)



14. MODELING SUMMARY


NameError: name 'best_model_name' is not defined