<a href="https://colab.research.google.com/github/Dan-Blanchette/DS_SP2025_Team_CDA/blob/main/Data_Science_AQI_Asthma.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Science: AQI and Asthma Correlation

# Part 2.1 (2A)

# Part 2.2 (2B)

## Dan's AI/ML Model
Model 1: XGBoost + Random Forest manual stacking for AQI threshold predictions and for the county locations that are forecast to have the most hospitalizations (above 50%).

In [None]:
# Author: Dan Blanchette
# Credit: sklearn documentation, plotly documentation, US Census Bureau,
# and ChatGPT for help with geopandas heatmap.

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import geopandas as gpd
from xgboost import XGBRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import TimeSeriesSplit
from sklearn.base import clone
import shap
import joblib



# --- Load and clean data ---
# Read the preprocessed AQI / hospitalization table from the Colab filesystem.
df = pd.read_csv("/content/cleaned_aqi_hospitalizations.csv")

# --- Metrics ---
# NOTE: test_df, actual and preds are not created in this part of the cell;
# they must already exist in the session before this section runs.

# Sum of the 'Value' column over the test rows, reported as total
# hospitalizations.
total_hospitalizations = test_df['Value'].sum()
print(f"Total Actual Hospitalizations in Test Set: {total_hospitalizations:,.0f}")

# Work on a copy of the test rows with the flattened actual / predicted
# arrays attached, so the scores can be broken down by year.
performance_by_year = test_df.assign(
    Actual=actual.flatten(),
    Predicted=preds.flatten(),
)

# Per-year MAE and R², one printed line per distinct 'Year' value.
print("Performance by Year:")
for year, group in performance_by_year.groupby('Year'):
    year_mae, year_r2 = (
        score(group['Actual'], group['Predicted'])
        for score in (mean_absolute_error, r2_score)
    )
    print(f"Year {year}: MAE = {year_mae:.2f}, R² = {year_r2:.4f}")

# Overall MAE and R² across the whole test set.
mae, r2 = mean_absolute_error(actual, preds), r2_score(actual, preds)
print(f"\nMAE: {mae:.2f}")
print(f"R² Score: {r2:.4f}")

# --- County-level predictions ---
# Pair each test row's county identifiers with its prediction, rounded to a
# whole number of hospitalizations.
county_preds = test_df[['CountyFIPS', 'County']].assign(
    Predicted_Hospitalizations=preds.flatten().round().astype(int)
)

# Average the per-row predictions within each county, round back to integers,
# and order the counties from highest to lowest predicted value.
full_county_results = (
    county_preds
    .groupby(['CountyFIPS', 'County'])
    .mean()
    .round(0)
    .astype(int)
    .sort_values('Predicted_Hospitalizations', ascending=False)
)

# Show both ends of the ranking.
print("\nTop 10 counties by predicted hospitalizations:")
print(full_county_results.head(10))
print("\nBottom 10 counties by predicted hospitalizations:")
print(full_county_results.tail(10))

# --- Save to CSV ---
# Write the county ranking to the working directory for external use.
full_county_results.to_csv("county_predictions.csv", float_format='%.0f')
print("\nCounty-level predictions saved to 'county_predictions.csv'")

# --- Plot predictions vs actual ---
# Scatter of predicted against actual hospitalizations. Points on the dashed
# red y = x line are perfect predictions.
plt.figure(figsize=(8, 6))
plt.scatter(actual, preds, alpha=0.5)

# The reference line spans the observed range of the actual values.
identity_line = [actual.min(), actual.max()]
plt.plot(identity_line, identity_line, 'r--')

plt.title("Actual vs Predicted Hospitalizations")
plt.xlabel("Actual Hospitalizations")
plt.ylabel("Predicted Hospitalizations")
plt.grid(True)
plt.show()

# --- County map ---
# Draw the county-level predictions as a choropleth on county shapes read
# from a shapefile on the Colab filesystem.
shapefile_path = "/content/tl_2023_us_county.shp"
counties = gpd.read_file(shapefile_path)

# Build an integer CountyFIPS join key. Prefer STATEFP + COUNTYFP, fall back
# to GEOID, and fail loudly when neither is available.
has_split_fips = {'STATEFP', 'COUNTYFP'}.issubset(counties.columns)
if not has_split_fips and 'GEOID' not in counties.columns:
    raise KeyError(f"Shapefile must contain 'STATEFP' and 'COUNTYFP', or 'GEOID'. Found columns: {list(counties.columns)}")
fips_source = (
    counties['STATEFP'] + counties['COUNTYFP']
    if has_split_fips
    else counties['GEOID']
)
counties['CountyFIPS'] = fips_source.astype(int)

# Left merge keeps every county shape; counties without a prediction end up
# with a missing Predicted_Hospitalizations value.
map_df = counties.merge(
    full_county_results.reset_index(),
    how='left',
    on='CountyFIPS',
)

# Render the map, coloured by predicted hospitalizations.
fig, ax = plt.subplots(figsize=(12, 8))
map_df.plot(
    column='Predicted_Hospitalizations',
    cmap='OrRd',
    linewidth=0.8,
    edgecolor='0.8',
    legend=True,
    ax=ax,
)
ax.set_title("Predicted Hospitalizations by County", fontsize=16)
ax.axis('off')
plt.show()

# --- SHAP feature importance ---
# Use SHAP to explain the XGBoost model's feature contributions.
# NOTE(review): the explainer's background data is X_scaled while the rows
# being explained are X_test; confirm both are in the same (scaled) feature
# space, since neither is defined in this part of the cell.

explainer = shap.Explainer(xgb, X_scaled)
shap_values = explainer(X_test)

# summary_plot defaults to plot_size="auto", which resizes the current figure
# and silently discards the 10x8 size requested here. plot_size=None tells
# SHAP to leave the figure size unchanged.
plt.figure(figsize=(10, 8))
shap.summary_plot(
    shap_values,
    features=X_test,
    feature_names=aqi_features,
    plot_size=None,
)

# --- Save model and scalers ---
# Persist the stacked meta-model, both base models and the two scalers to the
# working directory with joblib, in a fixed order, for later reuse.
for artifact, filename in (
    (meta_model, "stacked_meta_model.pkl"),
    (xgb, "xgb_model.pkl"),
    (rf, "rf_model.pkl"),
    (scaler, "feature_scaler.pkl"),
    (y_scaler, "target_scaler.pkl"),
):
    joblib.dump(artifact, filename)
print("\nModels and scalers saved successfully.")

# Evaluation:

The model performs well overall, especially for low-to-mid-range values. It explains about 73% of the variance in hospitalizations (R² ≈ 0.73), with only moderate average error. There is room to improve accuracy at the higher end, but for the most part this is a solid, trustworthy model.

# Part 2.2 (2B) Jordan's Model

# Part 2.3 (2C) Model Output Analysis