# Air Quality Modeling - Modeling and Evaluation

## G. Modeling
We predict the numeric AQI (regression) rather than an AQI category because the numeric value preserves information about severity.
We use a time-based split to avoid leaking future information into training, and compare a baseline model with a tuned tree-based model.


In [None]:
from pathlib import Path
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Project root, resolved relative to the current working directory.
ROOT = Path('..').resolve()
# Make the modules under src/ importable. Skip the append when the entry is
# already present so re-running this cell does not pile up duplicates in
# sys.path.
if str(ROOT / 'src') not in sys.path:
    sys.path.append(str(ROOT / 'src'))

from features import build_feature_columns
from train import train_test_split_time, build_baseline_model, tune_tree_model
from evaluate import regression_metrics, summarize_metrics

# Load the processed feature table; 'timestamp' is parsed as a datetime
# because it is the time column used for the split below.
data_path = ROOT / 'data' / 'processed_features.csv'
df = pd.read_csv(data_path, parse_dates=['timestamp'])

# Rows without a target value cannot be used for training or evaluation.
df = df.dropna(subset=['aqi'])
pollutant_cols = ['pm25', 'pm10', 'no2', 'o3', 'co', 'so2']

# Keep only the expected features that exist in this dataset, but report the
# ones that are absent instead of discarding them silently.
expected_cols = build_feature_columns(pollutant_cols, include_lags=True)
feature_cols = [c for c in expected_cols if c in df.columns]
missing_cols = [c for c in expected_cols if c not in df.columns]
if missing_cols:
    print('Skipping features not present in the data:', missing_cols)
if not feature_cols:
    # Fail here with a clear message rather than deep inside model fitting.
    raise ValueError(
        'None of the expected feature columns were found in {}'.format(data_path)
    )

# Time-based split on 'timestamp' with test_size=0.2 (presumably the held-out
# fraction of rows — confirm in src/train.py).
train_df, test_df = train_test_split_time(df, time_col='timestamp', test_size=0.2)
X_train = train_df[feature_cols].values
y_train = train_df['aqi'].values
X_test = test_df[feature_cols].values
y_test = test_df['aqi'].values

# Baseline: fit on the training split, then score its predictions on the
# held-out test split.
baseline_model = build_baseline_model()
baseline_model.fit(X_train, y_train)
baseline_pred = baseline_model.predict(X_test)
baseline_metrics = regression_metrics(y_test, baseline_pred)
print('Baseline:', summarize_metrics(baseline_metrics))

# Tree model: tune_tree_model returns an estimator that is used for
# prediction without a further fit call, together with the parameters it
# selected (presumably via a hyperparameter search on the training data only
# — confirm in src/train.py). random_state is pinned, presumably for
# reproducibility.
best_model, best_params = tune_tree_model(X_train, y_train, random_state=42)
tree_pred = best_model.predict(X_test)
tree_metrics = regression_metrics(y_test, tree_pred)
print('Tree:', summarize_metrics(tree_metrics))
print('Best params:', best_params)

# Predicted-vs-actual scatter for the tree model. Points on the dashed red
# diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(y_test, tree_pred, alpha=0.4)
aqi_range = [y_test.min(), y_test.max()]
ax.plot(aqi_range, aqi_range, 'r--')
ax.set_xlabel('Actual AQI')
ax.set_ylabel('Predicted AQI')
ax.set_title('Predicted vs Actual AQI')
fig.tight_layout()
plt.show()

# Bar chart of feature importances, largest first, taken from the pipeline
# step named 'model'. Skipped when that step does not expose
# `feature_importances_`.
model = best_model.named_steps['model']
if hasattr(model, 'feature_importances_'):
    importances = model.feature_importances_
    ranked = np.argsort(importances)[::-1]
    positions = np.arange(len(feature_cols))

    fig, ax = plt.subplots(figsize=(8, 4))
    ax.bar(positions, importances[ranked])
    ax.set_xticks(positions)
    ax.set_xticklabels(np.asarray(feature_cols)[ranked], rotation=45, ha='right')
    ax.set_title('Feature Importance')
    fig.tight_layout()
    plt.show()


## H. Evaluation & Interpretation
We report MAE, RMSE, and R², examine feature importance, and analyze where errors are largest.


In [None]:
# Work on a copy of the test rows, adding the tree model's prediction, its
# absolute error, and the calendar month of each timestamp.
test_results = test_df.assign(
    pred=tree_pred,
    abs_error=lambda frame: (frame['aqi'] - frame['pred']).abs(),
    month=lambda frame: frame['timestamp'].dt.month,
)

# Mean absolute error per calendar month of the test set.
mae_by_month = test_results.groupby('month')['abs_error'].mean()
fig, ax = plt.subplots(figsize=(8, 4))
mae_by_month.plot.bar(ax=ax)
ax.set_title('MAE by Month (Test Set)')
ax.set_xlabel('Month')
ax.set_ylabel('MAE')
fig.tight_layout()
plt.show()

# Histogram of absolute errors on the test set (30 bins).
fig, ax = plt.subplots(figsize=(8, 4))
test_results['abs_error'].hist(bins=30, ax=ax)
ax.set_title('Absolute Error Distribution')
ax.set_xlabel('Absolute Error')
ax.set_ylabel('Count')
fig.tight_layout()
plt.show()


## I. Conclusions & Next Steps
- Data limitations: missing pollutants for many timestamps and locations.
- Improvements: add weather variables, refine aggregation windows, and apply sensor-level quality filters.
- Modeling: try gradient boosting and location-specific models.


In [None]:
# Populate a short report template
report_path = ROOT / 'reports' / 'summary.md'
rows_cleaned = len(df)
time_start = df['timestamp'].min().date()
time_end = df['timestamp'].max().date()

# `df` has already had rows with a missing target dropped, so measuring
# missingness on it would always report 0%. Re-read just the target column
# from the processed file to get the share that was missing before cleaning.
raw_aqi = pd.read_csv(data_path, usecols=['aqi'])['aqi']
pct_missing_aqi = 100 * raw_aqi.isna().mean()

# Name the model that was actually fitted instead of hard-coding it.
best_model_name = type(best_model.named_steps['model']).__name__

# Only claim an improvement over the baseline when the test metrics show one.
if tree_metrics['mae'] < baseline_metrics['mae']:
    notes = 'Tree model performs better than baseline; remaining errors likely due to missing context.'
else:
    notes = 'Tree model does not beat the baseline on MAE; remaining errors likely due to missing context.'

# (placeholder, replacement) pairs applied to the template in order.
placeholders = [
    ('{{rows_cleaned}}', str(rows_cleaned)),
    ('{{time_start}}', str(time_start)),
    ('{{time_end}}', str(time_end)),
    ('{{aggregation_window}}', 'Daily'),
    ('{{pct_missing_aqi}}', '{:.2f}%'.format(pct_missing_aqi)),
    ('{{best_model_name}}', best_model_name),
    ('{{mae}}', '{:.2f}'.format(tree_metrics['mae'])),
    ('{{rmse}}', '{:.2f}'.format(tree_metrics['rmse'])),
    ('{{r2}}', '{:.3f}'.format(tree_metrics['r2'])),
    ('{{notes}}', notes),
]

# Read and write as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
report = report_path.read_text(encoding='utf-8')

# NOTE(review): the filled-in report overwrites the template itself, so a
# second run finds no placeholders and leaves the old numbers in place.
# Consider keeping the template in a separate file. Until then, make the
# stale-report case visible.
if not any(token in report for token, _ in placeholders):
    print(
        'Warning: no template placeholders found in {}; '
        'the report was not updated with the current results.'.format(report_path)
    )

for token, value in placeholders:
    report = report.replace(token, value)
report_path.write_text(report, encoding='utf-8')

print(report)
