In [2]:
# ==========================================
# üåç FINAL CLEAN AQI MODEL TRAINING
# ==========================================

import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# ------------------------------------------
# 1Ô∏è‚É£ Load Dataset
# ------------------------------------------
df = pd.read_csv("data.csv" , encoding = 'latin1')
df = df.sample(150000, random_state = 42).reset_index(drop=True) 
# ------------------------------------------
# 2Ô∏è‚É£ Drop Unnecessary + High Cardinality Columns
# ------------------------------------------
columns_to_drop = [
    'pm2_5',
    'spm',
    'agency',
    'stn_code',
    'date',
    'sampling_date',
    'location',                     # DROP THIS
    'location_monitoring_station'   # DROP THIS
]

df = df.drop(columns=[col for col in columns_to_drop if col in df.columns])


# ------------------------------------------
# 4Ô∏è‚É£ Handle Missing Values
# ------------------------------------------
numeric_cols = ['so2', 'no2', 'rspm']

for col in numeric_cols:
    if col in df.columns:
        df[col] = df[col].fillna(df[col].median())

# Fill categorical
if 'state' in df.columns:
    df['state'] = df['state'].fillna("Unknown")

if 'type' in df.columns:
    df['type'] = df['type'].fillna("Unknown")

# ------------------------------------------
# 5Ô∏è‚É£ Define Features & Target
# ------------------------------------------
X = df.drop(columns=['rspm'])
y = df['rspm']

# ------------------------------------------
# 6Ô∏è‚É£ Encode Only Low-Cardinality Columns
# ------------------------------------------
X = pd.get_dummies(
    X,
    columns=[col for col in ['state', 'type'] if col in X.columns],
    drop_first=True
)

# Save feature list
feature_columns = X.columns.tolist()

# ------------------------------------------
# 7Ô∏è‚É£ Train-Test Split
# ------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

# ------------------------------------------
# 8Ô∏è‚É£ Train Models
# ------------------------------------------
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(
        n_estimators=50,
        max_depth=12,
        n_jobs=-1,
        random_state=42
    ),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42)
}

best_model = None
best_score = -1

print("\nüìä Model Performance Results:")

for name, model in models.items():

    model.fit(X_train, y_train)
    predictions = model.predict(X_test)

    r2 = r2_score(y_test, predictions)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    print(f"\nModel: {name}")
    print("R2 Score:", round(r2, 4))
    print("RMSE:", round(rmse, 4))

    if r2 > best_score:
        best_score = r2
        best_model = model
        best_model_name = name

# ------------------------------------------
# 9Ô∏è‚É£ Final Best Model
# ------------------------------------------
print("\nüèÜ Best Model Selected:", best_model_name)
print("Best R2 Score:", round(best_score, 4))

if best_score >= 0.8:
    print("Model Performance: Excellent üî•")
elif best_score >= 0.6:
    print("Model Performance: Good üëç")
elif best_score >= 0.4:
    print("Model Performance: Moderate ‚ö†Ô∏è")
else:
    print("Model Performance: Needs Improvement ‚ùå")

# ------------------------------------------
# üîü Save Model
# ------------------------------------------
joblib.dump(best_model, "aqi_model.pkl")
joblib.dump(feature_columns, "model_features.pkl")

print("\nüíæ Model and feature list saved successfully!")

  df = pd.read_csv("data.csv" , encoding = 'latin1')



üìä Model Performance Results:

Model: Linear Regression
R2 Score: 0.315
RMSE: 59.4456

Model: Random Forest
R2 Score: 0.3999
RMSE: 55.6426

Model: Gradient Boosting
R2 Score: 0.36
RMSE: 57.4591

üèÜ Best Model Selected: Random Forest
Best R2 Score: 0.3999
Model Performance: Needs Improvement ‚ùå

üíæ Model and feature list saved successfully!
