# Phase 4: Anomaly Detection & Health Impact

In this notebook we detect PM2.5 pollution spikes using a rolling z‑score (a centred window of 24 observations, flagging readings with |z| > 3) and estimate a health risk score and category for each observation.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from src.modeling.detect_anomalies import main as detect_anomalies_main
from src.modeling.health_risk import compute_risk_score

# Load the processed, merged dataset.
df = pd.read_csv('../data/processed/merged_dataset.csv')

# Flag PM2.5 spikes with a rolling z-score: each reading is compared with the
# mean and standard deviation of a centred window of up to 24 rows around it.
pm25_window = df['pm2_5'].rolling(window=24, min_periods=1, center=True)
rolling_mean = pm25_window.mean()
rolling_std = pm25_window.std()
z = df['pm2_5'].sub(rolling_mean).div(rolling_std)

# A reading is an anomaly when it lies more than 3 rolling standard deviations
# from the rolling mean; NaN z-scores compare False and so are never flagged.
df['is_anomaly'] = z.abs().gt(3)
print('Number of anomalies:', df['is_anomaly'].sum())

# Compute health risk
# compute_risk_score is called once per row and must return a
# (score, category) pair; the pairs are then split into two new columns.
risk_pairs = [compute_risk_score(row) for _, row in df.iterrows()]
scores = [score for score, _ in risk_pairs]
cats = [cat for _, cat in risk_pairs]
df['health_risk_score'] = scores
df['health_risk_category'] = cats

# Last expression of the cell: preview the first rows with the new columns.
df[['datetime', 'pm2_5', 'health_risk_score', 'health_risk_category']].head()

The output above shows the first few rows with their computed health risk scores and categories. As a next step, you can visualise the distribution of risk scores or correlate them with AQI categories.