In [3]:
# -*- coding: utf-8 -*-
"""Model_training_AQI.ipynb (Random Forest Version)"""


import hopsworks
import pandas as pd
from dotenv import load_dotenv
import os
import plotly.graph_objects as go
from datetime import timedelta
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np
import joblib

# Load your .env with HOPSWORKS_API_KEY
load_dotenv()
api_key = os.getenv("HOPSWORKS_API_KEY")

project = hopsworks.login(api_key_value=api_key)
fs = project.get_feature_store()

# ✅ Fetch data
feature_group = fs.get_feature_group(name="weather_data_2", version=1)
df = feature_group.read()
df.head()




Connection closed.




To ensure compatibility please install the latest bug fix release matching the minor version of your backend (4.2) by running 'pip install hopsworks==4.2.*'



Logged in to project, explore it here https://c.app.hopsworks.ai:443/p/1257623
Finished: Reading data from Hopsworks, using Hopsworks Feature Query Service (2.96s) 


Unnamed: 0,datetime,temperature_2m,relative_humidity_2m,dew_point_2m,apparent_temperature,precipitation,rain,snowfall,surface_pressure,cloud_cover,...,winddirection_10m,pm10,pm2_5,carbon_monoxide,nitrogen_dioxide,sulphur_dioxide,ozone,aerosol_optical_depth,dust,uv_index
0,2025-04-22 08:00:00+00:00,26.3,72.0,20.9,30.3,0.0,0.0,0.0,1011.3,0.0,...,9.0,82.4,38.0,1075.0,27.2,13.9,67.0,0.28,81.0,1.5
1,2025-07-28 00:00:00+00:00,28.4,80.0,24.6,32.8,0.0,0.0,0.0,998.2,96.0,...,262.0,58.8,24.3,207.0,14.5,11.0,48.0,0.8,61.0,0.0
2,2025-06-18 21:00:00+00:00,29.7,84.0,26.7,34.6,0.1,0.1,0.0,997.6,34.0,...,248.0,45.1,22.3,219.0,10.5,8.9,56.0,0.48,36.0,0.0
3,2025-06-01 03:00:00+00:00,29.2,79.0,25.2,33.8,0.0,0.0,0.0,997.1,41.0,...,233.0,40.5,17.2,147.0,4.1,5.3,71.0,0.47,33.0,0.0
4,2024-11-06 01:00:00+00:00,24.0,89.0,22.1,27.8,0.0,0.0,0.0,1013.6,0.0,...,329.0,49.6,30.6,472.0,44.5,9.1,35.0,0.18,6.0,0.0


In [4]:
# --- AQI calculation ---
def calculate_aqi(pm25, pm10):
    return 0.5 * pm25 + 0.5 * pm10  # Simplified

df['aqi'] = calculate_aqi(df['pm2_5'], df['pm10'])
df['datetime'] = pd.to_datetime(df['datetime'])
df = df.sort_values('datetime')
df = df.dropna()

# --- Feature Engineering ---
df['hour'] = df['datetime'].dt.hour
df['day'] = df['datetime'].dt.day
df['month'] = df['datetime'].dt.month

# Lag features for past hours
def create_lagged_features(data, lag=24):  # 24 = past 24 hours
    for i in range(1, lag+1):
        data[f"aqi_lag_{i}"] = data['aqi'].shift(i)
    return data.dropna()

df = create_lagged_features(df)

# --- Prepare data ---
features = [
    'temperature_2m', 'relative_humidity_2m', 'windspeed_10m', 'ozone',
    'hour', 'day', 'month'
] + [f"aqi_lag_{i}" for i in range(1, 25)]

X = df[features]
y = df['aqi']

# --- Train/test split ---
split_idx = int(0.8 * len(X))
X_train, X_test = X.iloc[:split_idx], X.iloc[split_idx:]
y_train, y_test = y.iloc[:split_idx], y.iloc[split_idx:]

# --- Train Random Forest ---

rf_model = RandomForestRegressor(
    n_estimators=200,
    random_state=42,
    n_jobs=-1
)

rf_model.fit(X_train, y_train)
y_pred = rf_model.predict(X_test)



In [5]:
# --- Evaluation ---
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

def regression_accuracy(y_true, y_pred, tolerance=0.1):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    within_range = np.abs((y_true - y_pred) / y_true) <= tolerance
    return np.mean(within_range) * 100

accuracy = regression_accuracy(y_test, y_pred, tolerance=0.1)

print(f"MAE: {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.2f}")
print(f"Accuracy (±10%): {accuracy:.2f}%")



MAE: 2.16
RMSE: 3.61
R²: 0.96
Accuracy (±10%): 84.97%


In [7]:
# --- Save model locally ---
import joblib
model_dir = "aqi_rf_model"
os.makedirs(model_dir, exist_ok=True)
joblib.dump(rf_model, f"{model_dir}/rf_aqi_forecast.pkl")

# --- Register model in Hopsworks ---
mr = project.get_model_registry()

model_meta = mr.python.create_model(
    name="AQI_RF_Forecaster",
    metrics={"MAE": mae, "RMSE": rmse, "R2": r2, "Accuracy": accuracy},
    description="Random Forest model for 3-day AQI forecasting",
)
model_meta.save(model_dir)
print("✅ Model saved successfully in Hopsworks Model Registry!")

# --- Predict Next 3 Days (72 hours) ---
recent_data = df.tail(24).copy()  # last 24 hours
future_preds = []

for i in range(72):  # Predict next 72 hours
    row = recent_data.iloc[-1:].copy()

    # Build lag features for prediction
    for j in range(1, 25):
        row[f"aqi_lag_{j}"] = recent_data['aqi'].iloc[-j]

    X_future = row[features]
    pred = rf_model.predict(X_future)[0]
    future_preds.append(pred)

    # Append new prediction for next step
    new_row = {
        'datetime': row['datetime'].iloc[0] + pd.Timedelta(hours=1),
        'aqi': pred,
        'temperature_2m': row['temperature_2m'].iloc[0],
        'relative_humidity_2m': row['relative_humidity_2m'].iloc[0],
        'windspeed_10m': row['windspeed_10m'].iloc[0],
        'ozone': row['ozone'].iloc[0],
        'hour': (row['hour'].iloc[0] + 1) % 24,
        'day': row['day'].iloc[0],
        'month': row['month'].iloc[0],
    }
    recent_data = pd.concat([recent_data, pd.DataFrame([new_row])])

print("Predicted AQI for next 3 days:", future_preds[:10], "...")



  0%|          | 0/6 [00:00<?, ?it/s]

Uploading /content/aqi_rf_model/rf_aqi_forecast.pkl: 0.000%|          | 0/124663825 elapsed<00:00 remaining<?

Model created, explore it at https://c.app.hopsworks.ai:443/p/1257623/models/AQI_RF_Forecaster/1
✅ Model saved successfully in Hopsworks Model Registry!
Predicted AQI for next 3 days: [36.501, 36.973750000000024, 37.76050000000002, 38.5675, 38.98700000000001, 38.59399999999999, 38.01025000000002, 37.32875000000002, 37.001750000000044, 36.332000000000036] ...


In [8]:
# --- Visualization ---

df_recent = df.tail(72*3).copy()  # past 3 days
actual_dates = df_recent['datetime']
actual_aqi = df_recent['aqi']

last_timestamp = df_recent['datetime'].iloc[-1]
future_dates = [last_timestamp + timedelta(hours=i+1) for i in range(72)]

fig = go.Figure()
fig.add_trace(go.Scatter(
    x=actual_dates,
    y=actual_aqi,
    mode='lines+markers',
    name='Actual AQI (Past 3 Days)',
    line=dict(color='green', width=2)
))
fig.add_trace(go.Scatter(
    x=future_dates,
    y=future_preds,
    mode='lines+markers',
    name='Predicted AQI (Next 3 Days)',
    line=dict(color='orange', width=2, dash='dot')
))
fig.update_layout(
    title='🌤 AQI Forecast: Past vs Next 3 Days (Random Forest)',
    xaxis_title='Date & Time',
    yaxis_title='Air Quality Index (AQI)',
    legend=dict(x=0, y=1.1, orientation="h"),
    template='plotly_white',
    hovermode='x unified'
)
fig.show()
