In [3]:
pip install pymongo

Note: you may need to restart the kernel to use updated packages.




In [4]:
import pandas as pd
import numpy as np
import joblib
import certifi
from pymongo import MongoClient
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 1. Connect to MongoDB
MONGO_URI = "mongodb+srv://ali321hasnain_db_user:etRWe1e6ASFlpwEO@cluster0.1eklm6h.mongodb.net/?appName=Cluster0"
ca = certifi.where()
client = MongoClient(MONGO_URI, tlsCAFile=ca)
db = client["AQIPredictionSystem"]
collection = db["karachi_features"]

# 2. Load Data
print("ðŸ“¥ Fetching data from MongoDB...")
df = pd.DataFrame(list(collection.find({}, {'_id': 0})))

# Ensure data is sorted by time before we "shift" it
# Note: Replace 'timestamp' with the actual column name in your MongoDB
if 'timestamp' in df.columns:
    df = df.sort_values('timestamp')

# --- DATA TRANSFORMATION: CREATING THE 72 TARGETS ---
print("ðŸ”„ Creating 72-hour targets by shifting historical data...")
target_cols = []
for i in range(1, 73):
    col_name = f'aqi_{i}h'
    # Shifting moves future AQI values into the current row for training
    df[col_name] = df['aqi'].shift(-i)
    target_cols.append(col_name)

# Drop rows at the end where we don't have future values (the last 3 days)
df = df.dropna()

# Define Features (X) and Multi-Output Targets (y)
# Drop the target columns and non-numeric columns from X
cols_to_drop = target_cols + ['aqi']
if 'timestamp' in df.columns:
    cols_to_drop.append('timestamp')

X = df.drop(columns=cols_to_drop)
y = df[target_cols]

print(f"âœ… Data Preparation Complete. Features: {X.shape[1]}, Targets: {y.shape[1]}")
# -----------------------------------------------------

# 3. Split Data (80% Train, 20% Test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 4. Feature Scaling (Must save this to use later in prediction)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 5. Define the 4 Multi-Output Models
# MultiOutputRegressor lets standard models predict a vector (all 72 hours)
models = {
    "Ridge_Regression": MultiOutputRegressor(Ridge(alpha=1.0)),
    "Random_Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "Gradient_Boosting": MultiOutputRegressor(GradientBoostingRegressor(n_estimators=100, random_state=42)),
    "Neural_Network_MLP": MLPRegressor(hidden_layer_sizes=(100, 50), max_iter=1000, random_state=42)
}

# 6. Train, Evaluate, and Save
print("ðŸ“Š Training Multi-Output Models...")

for name, model in models.items():
    # Training
    model.fit(X_train_scaled, y_train)
    
    # Validation
    preds = model.predict(X_test_scaled)
    
    # Calculate Metrics (Average across all 72 hours)
    mae = mean_absolute_error(y_test, preds)
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    r2 = r2_score(y_test, preds)
    
    print(f"\n--- {name} ---")
    print(f"R2 Score: {r2:.4f}")
    print(f"MAE: {mae:.2f} (Avg error per hour predicted)")
    print(f"RMSE: {rmse:.2f}")
    
    # Save the trained model file
    joblib.dump(model, f"{name}_model.pkl")

# Save the scaler (Crucial for the dashboard script)
joblib.dump(scaler, "scaler.pkl")

print("\nâœ… All 4 models and the scaler have been saved as .pkl files!")

ðŸ“¥ Fetching data from MongoDB...
ðŸ”„ Creating 72-hour targets by shifting historical data...
âœ… Data Preparation Complete. Features: 7, Targets: 72
ðŸ“Š Training Multi-Output Models...

--- Ridge_Regression ---
R2 Score: 0.3618
MAE: 15.25 (Avg error per hour predicted)
RMSE: 19.90

--- Random_Forest ---
R2 Score: 0.8672
MAE: 5.79 (Avg error per hour predicted)
RMSE: 9.07

--- Gradient_Boosting ---
R2 Score: 0.7177
MAE: 9.71 (Avg error per hour predicted)
RMSE: 13.24

--- Neural_Network_MLP ---
R2 Score: 0.6516
MAE: 11.03 (Avg error per hour predicted)
RMSE: 14.70

âœ… All 4 models and the scaler have been saved as .pkl files!


