In [30]:
import pandas as pd
import numpy as np
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.neighbors import KNeighborsRegressor

import warnings
warnings.filterwarnings('ignore')

## **Training and Evaluation**

In [31]:
# Read the merged, cleaned dataset, parse the day-first date strings, and
# order rows oldest -> newest so that a positional cut below produces a
# chronological train/test split.
df = (
    pd.read_csv('output/data_merged_cleaned.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date'], format='%d/%m/%Y'))
    .sort_values('date')
    .reset_index(drop=True)
)

# Target column vs. predictors (the date itself is not used as a feature)
target_col = 'aqi_pm2.5'
y = df[target_col]
X = df.drop(columns=[target_col, 'date'])

# Chronological hold-out: the first 80% of rows train, the last 20% test
split_idx = int(len(X) * 0.8)
X_train, y_train = X.iloc[:split_idx], y.iloc[:split_idx]
X_test, y_test = X.iloc[split_idx:], y.iloc[split_idx:]

# Summary of the split and of the period covered by the data
first_day, last_day = (df['date'].iloc[pos].strftime('%d/%m/%Y') for pos in (0, -1))
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print(f"Date range: {first_day} to {last_day}")

Training samples: 1172
Testing samples: 293
Date range: 01/06/2019 to 29/11/2023


In [32]:

# Standardise the features for the distance-based k-NN model.
# The scaler statistics are learned from the training split only and then
# applied unchanged to the test split.
knn_scaler = StandardScaler().fit(X_train)
X_train_knn = knn_scaler.transform(X_train)
X_test_knn = knn_scaler.transform(X_test)

# Baseline k-NN regressor with k = 5
knn_model = KNeighborsRegressor(n_neighbors=5).fit(X_train_knn, y_train)

# Hold-out predictions
y_pred_knn = knn_model.predict(X_test_knn)

# Hold-out metrics
knn_mse = mean_squared_error(y_test, y_pred_knn)
knn_rmse = np.sqrt(knn_mse)
knn_r2 = r2_score(y_test, y_pred_knn)

print(
    "K-NEAREST NEIGHBORS PERFORMANCE:",
    f"RMSE: {knn_rmse:.2f}",
    f"R² Score: {knn_r2:.4f}",
    sep="\n",
)

K-NEAREST NEIGHBORS PERFORMANCE:
RMSE: 41.73
R² Score: 0.6546


In [33]:
from sklearn.linear_model import Ridge

# Standardise the features for Ridge; statistics come from the training
# split only and are reused for the test split.
ridge_scaler = StandardScaler().fit(X_train)
X_train_ridge = ridge_scaler.transform(X_train)
X_test_ridge = ridge_scaler.transform(X_test)

# Baseline Ridge regression with the default regularisation strength
ridge_model = Ridge(alpha=1.0).fit(X_train_ridge, y_train)

# Hold-out predictions
y_pred_ridge = ridge_model.predict(X_test_ridge)

# Hold-out metrics
ridge_mse = mean_squared_error(y_test, y_pred_ridge)
ridge_rmse = np.sqrt(ridge_mse)
ridge_r2 = r2_score(y_test, y_pred_ridge)

print(
    "RIDGE REGRESSION PERFORMANCE:",
    f"RMSE: {ridge_rmse:.2f}",
    f"R² Score: {ridge_r2:.4f}",
    sep="\n",
)

RIDGE REGRESSION PERFORMANCE:
RMSE: 43.00
R² Score: 0.6333


## **Hyperparameter Tuning**

In [34]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

# Candidate k-NN hyperparameters. Keys carry the `knn__` prefix because the
# regressor is tuned as the `knn` step of a Pipeline (see below).
knn_param_grid = {
    'knn__n_neighbors': [3, 5, 7, 9, 11, 15, 20],
    'knn__weights': ['uniform', 'distance'],
    'knn__p': [1, 2],  # 1: Manhattan, 2: Euclidean
}

# Scaling is part of the tuned estimator so that, inside every
# TimeSeriesSplit fold, the StandardScaler is fitted on that fold's training
# rows only. Searching on a matrix that was standardised with the whole
# training set would let each validation fold leak into the scaler statistics.
knn_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('knn', KNeighborsRegressor()),
])

# Grid search with forward-chaining CV, selecting on (negated) RMSE
knn_grid = GridSearchCV(
    knn_pipeline,
    knn_param_grid,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

# Fit on the *unscaled* training features; the pipeline does the scaling
knn_grid.fit(X_train, y_train)

# After the search, GridSearchCV refits the best pipeline on all of X_train.
# Its scaler therefore holds the same statistics as `knn_scaler`, so the
# fitted k-NN step can be used directly on `X_test_knn`, exactly as
# `best_knn` was used before.
best_knn = knn_grid.best_estimator_.named_steps['knn']
y_pred_knn_tuned = best_knn.predict(X_test_knn)

# Hold-out metrics for the tuned model
knn_tuned_rmse = np.sqrt(mean_squared_error(y_test, y_pred_knn_tuned))
knn_tuned_r2 = r2_score(y_test, y_pred_knn_tuned)

# Strip the pipeline step prefix so parameters print under their plain names
best_knn_params = {
    name.split('__', 1)[1]: value
    for name, value in knn_grid.best_params_.items()
}

print("TUNED K-NEAREST NEIGHBORS:")
print(f"Best Parameters: {best_knn_params}")
print(f"RMSE: {knn_tuned_rmse:.2f}")
print(f"R² Score: {knn_tuned_r2:.4f}")

TUNED K-NEAREST NEIGHBORS:
Best Parameters: {'n_neighbors': 20, 'p': 1, 'weights': 'distance'}
RMSE: 39.33
R² Score: 0.6932


In [35]:
from sklearn.pipeline import Pipeline

# Candidate Ridge hyperparameters. Keys carry the `ridge__` prefix because
# the regressor is tuned as the `ridge` step of a Pipeline (see below).
ridge_param_grid = {
    'ridge__alpha': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
    'ridge__solver': ['auto', 'svd', 'cholesky', 'lsqr', 'saga'],
}

# Scaling is part of the tuned estimator so that each TimeSeriesSplit fold
# fits its StandardScaler on that fold's training rows only; searching on a
# matrix standardised with the whole training set leaks validation rows into
# the scaler statistics.
# random_state is fixed because the 'saga' solver is stochastic; without a
# seed the selected parameters can change from run to run.
ridge_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('ridge', Ridge(random_state=42)),
])

# Grid search with forward-chaining CV, selecting on (negated) RMSE
ridge_grid = GridSearchCV(
    ridge_pipeline,
    ridge_param_grid,
    cv=TimeSeriesSplit(n_splits=5),
    scoring='neg_root_mean_squared_error',
    n_jobs=-1,
)

# Fit on the *unscaled* training features; the pipeline does the scaling
ridge_grid.fit(X_train, y_train)

# GridSearchCV refits the best pipeline on all of X_train, so its scaler
# matches `ridge_scaler` and the fitted Ridge step accepts `X_test_ridge`.
best_ridge = ridge_grid.best_estimator_.named_steps['ridge']
y_pred_ridge_tuned = best_ridge.predict(X_test_ridge)

# Hold-out metrics for the tuned model
ridge_tuned_rmse = np.sqrt(mean_squared_error(y_test, y_pred_ridge_tuned))
ridge_tuned_r2 = r2_score(y_test, y_pred_ridge_tuned)

# Strip the pipeline step prefix so parameters print under their plain names
best_ridge_params = {
    name.split('__', 1)[1]: value
    for name, value in ridge_grid.best_params_.items()
}

print("TUNED RIDGE REGRESSION:")
print(f"Best Parameters: {best_ridge_params}")
print(f"RMSE: {ridge_tuned_rmse:.2f}")
print(f"R² Score: {ridge_tuned_r2:.4f}")

TUNED RIDGE REGRESSION:
Best Parameters: {'alpha': 10, 'solver': 'lsqr'}
RMSE: 43.33
R² Score: 0.6277


In [36]:
# Break the tuned k-NN test error down by AQI category
from sklearn.metrics import mean_squared_error

# AQI category edges and their display labels
aqi_bins = [0, 100, 150, 200, 300, 500]
bin_labels = ['Good (0-100)', 'Moderate (101-150)', 'Unhealthy (151-200)', 
              'Very Unhealthy (201-300)', 'Hazardous (301-500)']

# Assign every test target to a category. Bins are right-closed;
# include_lowest keeps an AQI of exactly 0 in the first bin instead of
# turning it into NaN.
y_test_binned = pd.cut(y_test, bins=aqi_bins, labels=bin_labels, include_lowest=True)

# Predictions of the tuned k-NN model on the scaled test features
y_pred_knn = best_knn.predict(X_test_knn)

# Positional view of the targets: predictions are a plain NumPy array, so
# both sides are indexed with the same positional boolean mask.
y_test_values = y_test.to_numpy()

print("K-NN PERFORMANCE BY AQI CATEGORY")
print(f"{'Category':<25} {'Samples':<10} {'RMSE':<12} {'Bias':<12}")
print("-" * 60)

category_results_knn = []
for category in bin_labels:
    mask = (y_test_binned == category).to_numpy()
    n_samples = int(mask.sum())
    if n_samples == 0:
        continue  # nothing to score for an empty category

    actual = y_test_values[mask]
    predicted = y_pred_knn[mask]

    category_rmse = np.sqrt(mean_squared_error(actual, predicted))
    # Bias is actual minus predicted: positive means the model under-predicts
    category_bias = (actual - predicted).mean()

    category_results_knn.append({
        'category': category,
        'samples': n_samples,
        'rmse': category_rmse,
        'bias': category_bias
    })

    print(f"{category:<25} {n_samples:<10} {category_rmse:<12.2f} {category_bias:<12.2f}")

# Targets outside the 0-500 range fall in no bin; report them rather than
# dropping them silently from the table.
n_uncategorized = int(y_test_binned.isna().sum())
if n_uncategorized:
    print(f"Uncategorized samples (outside 0-500): {n_uncategorized}")

K-NN PERFORMANCE BY AQI CATEGORY
Category                  Samples    RMSE         Bias        
------------------------------------------------------------
Good (0-100)              38         51.49        -41.72      
Moderate (101-150)        121        22.21        -10.52      
Unhealthy (151-200)       70         34.62        -6.74       
Very Unhealthy (201-300)  45         35.25        9.24        
Hazardous (301-500)       19         89.74        82.62       


## **Feature Engineering**

In [37]:

# Work on a copy so the loaded frame is left untouched
df_sorted = df.copy()

# Base features
X_base = df_sorted.drop(columns=['aqi_pm2.5', 'date'])
y = df_sorted['aqi_pm2.5']

# Create enhanced features
X_enhanced = X_base.copy()

# Lag features: previous row's value (rows are in chronological order)
X_enhanced['temp_avg_lag1'] = df_sorted['temp_avg_c'].shift(1)
X_enhanced['wind_avg_lag1'] = df_sorted['wind_speed_avg_mph'].shift(1)
X_enhanced['humidity_avg_lag1'] = df_sorted['humidity_avg_percent'].shift(1)

# Interaction features
# NOTE(review): the wind/temperature interaction uses temp_min_c while the
# lag and rolling features use temp_avg_c — confirm this is intentional.
X_enhanced['wind_temp_interaction'] = X_enhanced['wind_speed_avg_mph'] * X_enhanced['temp_min_c']
X_enhanced['wind_humidity_interaction'] = X_enhanced['wind_speed_avg_mph'] * X_enhanced['humidity_avg_percent']

# High-risk indicators (month-based flags)
X_enhanced['high_risk_month'] = X_enhanced['month'].isin([1, 2, 10, 11]).astype(int)
X_enhanced['early_winter'] = ((X_enhanced['month'] == 11) | (X_enhanced['month'] == 12)).astype(int)

# Rolling features: trailing 3-row mean including the current row
X_enhanced['wind_3day_avg'] = df_sorted['wind_speed_avg_mph'].rolling(3, min_periods=1).mean()
X_enhanced['temp_3day_avg'] = df_sorted['temp_avg_c'].rolling(3, min_periods=1).mean()

# Fill NaN values (e.g. the first row of each lag feature) with column means
# computed from the training period only, so that no test-period information
# leaks into the imputed values.
train_feature_means = X_enhanced.iloc[:split_idx].mean()
X_enhanced = X_enhanced.fillna(train_feature_means)

print(f"Enhanced features created: {X_enhanced.shape[1]} total features")


print(f"\nTraining set: {X_train.shape[0]} samples")
print(f"Test set: {X_test.shape[0]} samples")
print(f"Training dates: {df_sorted['date'].iloc[0].strftime('%d/%m/%Y')} to {df_sorted['date'].iloc[split_idx-1].strftime('%d/%m/%Y')}")
print(f"Testing dates: {df_sorted['date'].iloc[split_idx].strftime('%d/%m/%Y')} to {df_sorted['date'].iloc[-1].strftime('%d/%m/%Y')}")



Enhanced features created: 35 total features

Training set: 1172 samples
Test set: 293 samples
Training dates: 01/06/2019 to 06/01/2023
Testing dates: 07/01/2023 to 29/11/2023


## **Final Evaluation**

In [38]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import root_mean_squared_error, r2_score

# Build the scaled train/test matrices used by the final model.
# NOTE(review): X_train_scaled / X_test_scaled were previously read from
# kernel state that no cell in this notebook creates, so the cell failed on
# Restart & Run All. They are rebuilt here from the engineered features
# (X_enhanced) using the same chronological split index — this is the
# presumed intent; confirm that the saved metrics reproduce.
X_train_enhanced = X_enhanced.iloc[:split_idx]
X_test_enhanced = X_enhanced.iloc[split_idx:]

# Scaler statistics come from the training rows only
scaler_X = StandardScaler()
X_train_scaled = scaler_X.fit_transform(X_train_enhanced)
X_test_scaled = scaler_X.transform(X_test_enhanced)

# Final k-NN configuration.
# NOTE(review): these hand-set values differ from the grid-search optimum
# printed earlier (n_neighbors=20) — confirm where n_neighbors=7 comes from.
knn_model = KNeighborsRegressor(
    n_neighbors=7,
    weights='distance',   # closer neighbours get larger weight
    p=1,                  # Manhattan distance
    n_jobs=-1
)

# Train KNN
knn_model.fit(X_train_scaled, y_train)

# Make predictions
y_pred_knn = knn_model.predict(X_test_scaled)

# Evaluate KNN
rmse_knn = root_mean_squared_error(y_test, y_pred_knn)
r2_knn = r2_score(y_test, y_pred_knn)

print("\nKNN MODEL PERFORMANCE:")
print(f"RMSE: {rmse_knn:.2f}")
print(f"R²: {r2_knn:.4f}")

# Category performance analysis
print("\nKNN PERFORMANCE BY AQI CATEGORY:")

aqi_bins = [0, 100, 150, 200, 300, 500]
bin_labels = ['Good (0-100)', 'Moderate (101-150)', 'Unhealthy (151-200)', 
              'Very Unhealthy (201-300)', 'Hazardous (301-500)']

# Bins are right-closed; include_lowest keeps an AQI of exactly 0 in the
# first bin instead of turning it into NaN.
y_test_binned = pd.cut(y_test, bins=aqi_bins, labels=bin_labels, include_lowest=True)

# Predictions are a plain NumPy array, so targets and predictions are both
# indexed with the same positional boolean mask.
y_test_values = y_test.to_numpy()

print(f"{'Category':<25} {'Samples':<10} {'RMSE':<12} {'Bias':<12}")
print("-" * 60)

for category in bin_labels:
    mask = (y_test_binned == category).to_numpy()
    n_samples = int(mask.sum())
    if n_samples == 0:
        continue  # nothing to score for an empty category

    actual = y_test_values[mask]
    predicted = y_pred_knn[mask]

    category_rmse = root_mean_squared_error(actual, predicted)
    # Bias is actual minus predicted: positive means the model under-predicts
    category_bias = (actual - predicted).mean()
    print(f"{category:<25} {n_samples:<10} {category_rmse:<12.2f} {category_bias:<12.2f}")

# Targets outside the 0-500 range fall in no bin; report them rather than
# dropping them silently from the table.
n_uncategorized = int(y_test_binned.isna().sum())
if n_uncategorized:
    print(f"Uncategorized samples (outside 0-500): {n_uncategorized}")



KNN MODEL PERFORMANCE:
RMSE: 38.11
R²: 0.7119

KNN PERFORMANCE BY AQI CATEGORY:
Category                  Samples    RMSE         Bias        
------------------------------------------------------------
Good (0-100)              38         47.67        -37.58      
Moderate (101-150)        121        21.78        -7.52       
Unhealthy (151-200)       70         35.49        -3.48       
Very Unhealthy (201-300)  45         43.21        3.34        
Hazardous (301-500)       19         75.96        61.28       


In [39]:

# Reference XGBoost scores used for comparison. They are hard-coded because
# no XGBoost model is trained in the visible cells.
# NOTE(review): confirm these were measured on the same test split.
XGB_RMSE = 33.69
XGB_HAZARDOUS_RMSE = 57.83

# Compare with XGBoost
print("\nCOMPARISON WITH XGBOOST:")
print(f"KNN RMSE: {rmse_knn:.2f} vs XGBoost RMSE: {XGB_RMSE:.2f}")

# Hazardous days specific analysis (actual AQI above 300)
hazardous_mask = y_test > 300
if hazardous_mask.sum() > 0:
    # y_pred_knn is a NumPy array, so index it with a positional mask
    knn_hazardous_rmse = root_mean_squared_error(
        y_test[hazardous_mask], y_pred_knn[hazardous_mask.to_numpy()]
    )
    print(f"\nKNN Hazardous Days RMSE: {knn_hazardous_rmse:.2f}")
    print(f"XGBoost Hazardous RMSE: {XGB_HAZARDOUS_RMSE:.2f}")
    print(f"Difference: {knn_hazardous_rmse - XGB_HAZARDOUS_RMSE:.2f} RMSE points")



COMPARISON WITH XGBOOST:
KNN RMSE: 38.11 vs XGBoost RMSE: 33.69

KNN Hazardous Days RMSE: 75.96
XGBoost Hazardous RMSE: 57.83
Difference: 18.13 RMSE points


In [40]:
import joblib
from pathlib import Path

# Destination of the serialized bundle; create the folder if it is missing
# so that joblib.dump does not fail on a fresh checkout.
MODEL_PATH = Path('output/models/knn_model.pkl')
MODEL_PATH.parent.mkdir(parents=True, exist_ok=True)

# NOTE(review): `scaler_X` must be the scaler fitted on the same feature
# matrix that `knn_model` was trained on; it is not defined in the visible
# cells of this notebook — confirm where it comes from.
#
# Take the feature names from the scaler itself when it recorded them (it
# does when fitted on a DataFrame), so the saved names always describe the
# matrix the scaler and model actually saw. Fall back to the base training
# columns otherwise.
recorded_names = getattr(scaler_X, 'feature_names_in_', None)
if recorded_names is not None:
    feature_names = list(recorded_names)
else:
    feature_names = X_train.columns.tolist()

# Refuse to write an inconsistent bundle: the name list must match the
# number of features the model was fitted on.
if len(feature_names) != knn_model.n_features_in_:
    raise ValueError(
        f"feature_names has {len(feature_names)} entries but knn_model was "
        f"fitted on {knn_model.n_features_in_} features"
    )

# Save k-NN model and scaler
knn_model_data = {
    'model': knn_model,
    'scaler': scaler_X,
    'feature_names': feature_names
}

joblib.dump(knn_model_data, MODEL_PATH)
print(f"k-NN model saved to {MODEL_PATH.as_posix()}")

k-NN model saved to output/models/knn_model.pkl
