In [2]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Plot appearance: apply the matplotlib style sheet first, then the seaborn
# palette (the palette call must come second so it is not overwritten).
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Notebook banner: title followed by a 50-character rule
print("MODEL BUILDING AND EVALUATION", "=" * 50, sep="\n")

# 1. LOAD TRAINED MODEL AND DATA
# Section header followed by a 30-character rule
print("\n1. LOADING TRAINED MODEL AND DATA", "=" * 30, sep="\n")

import joblib
import json

# Load best model
# Candidate artifacts are tried in priority order; the first one that loads wins.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
for model_path, model_label in (
    ('best_model_random_forest.pkl', 'Random Forest'),
    ('best_model_gradient_boosting.pkl', 'Gradient Boosting'),
):
    try:
        best_model = joblib.load(model_path)
    except Exception as load_error:
        # Catch `Exception` instead of using a bare `except:` so kernel
        # interrupts (KeyboardInterrupt / SystemExit) are not swallowed.
        # The reason is printed rather than issued as a warning because
        # warnings are globally silenced in this notebook.
        print(f"Could not load {model_path}: {load_error!r}")
    else:
        print(f"Loaded: {model_label} model")
        break
else:
    # Reached only when no candidate could be loaded (the loop never hit `break`).
    # Best-effort fallback: this estimator is UNFITTED and must be trained
    # before .predict() can be called on it.
    print("Loading default model...")
    from sklearn.ensemble import RandomForestRegressor
    best_model = RandomForestRegressor()

# Load feature information
# JSON is UTF-8 by specification; pass the encoding explicitly so the read does
# not depend on the platform's default locale encoding.
try:
    with open('feature_info.json', 'r', encoding='utf-8') as f:
        feature_info = json.load(f)
except FileNotFoundError as missing_file:
    # Same exception type as before, but with an actionable message.
    raise FileNotFoundError(
        "feature_info.json was not found in the current working directory. "
        "Generate it (or fix the working directory) before running this notebook."
    ) from missing_file

# The JSON document is expected to carry the list of model input columns
# under the 'selected_features' key.
selected_features = feature_info['selected_features']
print(f"Number of features: {len(selected_features)}")

# Load predictions
predictions_df = pd.read_csv('test_predictions.csv')
print(f"Test predictions loaded: {predictions_df.shape[0]} samples")

# Load full dataset for analysis
df = pd.read_csv('cleaned_infectious_disease.csv')
# Keep only the rows whose 'Sex' value is 'Total'; .copy() detaches the subset
# so later column assignments do not touch `df`.
df_total = df[df['Sex'] == 'Total'].copy()


MODEL BUILDING AND EVALUATION

1. LOADING TRAINED MODEL AND DATA
Loading default model...


FileNotFoundError: [Errno 2] No such file or directory: 'feature_info.json'

In [None]:
# 2. HYPERPARAMETER OPTIMIZATION RESULTS
# Section header followed by a 30-character rule
print("\n2. HYPERPARAMETER OPTIMIZATION", "=" * 30, sep="\n")

from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.ensemble import RandomForestRegressor

# Prepare data for hyperparameter tuning
from sklearn.model_selection import train_test_split

# Use the same feature engineering as in training
# (Recreating features for demonstration)
features_df = df_total.copy()
features_df['Year_Since_2000'] = features_df['Year'] - 2000

# Previous-year rate per county. shift(1) operates on row order, so the rows are
# ordered by (County, Year) first; without that, the "lag" is simply whichever
# row happened to precede this one in the CSV. The shifted Series is assigned
# back by index label, so features_df keeps its original row order.
# NOTE(review): if the data holds several rows per (County, Year), e.g. one per
# disease, the grouping needs that extra key as well. Confirm against the schema.
features_df['Rate_Lag1'] = (
    features_df
    .sort_values(['County', 'Year'], kind='stable')
    .groupby('County')['Rate']
    .shift(1)
)
# The first observation of every county has no predecessor; drop those rows.
features_df = features_df.dropna(subset=['Rate_Lag1'])

# Create simplified feature set for demonstration
X = features_df[['Year_Since_2000', 'Rate_Lag1']]
y = features_df['Rate']

# Split data: 70% train, then the remaining 30% halved into validation and test.
# shuffle=False keeps the current row order (random_state has no effect then).
# NOTE(review): this is only a chronological hold-out if the rows are ordered
# by Year. TODO confirm the row order of the source CSV.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=False
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=42, shuffle=False
)

# Search space for the Random Forest hyperparameters
param_grid = dict(
    n_estimators=[50, 100, 200],
    max_depth=[5, 10, 15, None],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    max_features=['sqrt', 'log2'],
)

print("Performing Randomized Search for Random Forest...")
rf = RandomForestRegressor(n_jobs=-1, random_state=42)

# Randomized search evaluates a fixed number of sampled candidates (n_iter)
# with 3-fold cross-validation instead of walking the whole grid.
# fit() returns the search object itself, so construction and fitting are chained.
random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_grid,
    n_iter=20,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    random_state=42,
).fit(X_train, y_train)

from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Report the winning configuration, one hyperparameter per line
print("Best parameters found:")
for param, value in random_search.best_params_.items():
    print(f"  {param}: {value}")

# The scorer is the negated RMSE, hence the sign flip for display
print(f"Best cross-validation RMSE: {-random_search.best_score_:.4f}")

# Score the refitted best estimator on the held-out validation split
best_rf = random_search.best_estimator_
y_pred_val = best_rf.predict(X_val)

val_mse = mean_squared_error(y_val, y_pred_val)
val_rmse = np.sqrt(val_mse)
print(f"Validation RMSE with optimized parameters: {val_rmse:.4f}")
