In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import randint
import joblib

# Load data
data = pd.read_csv("/content/city_day.csv")

# Drop 'City' if present
if 'City' in data.columns:
    data = data.drop('City', axis=1)

# Impute missing numerical values with median
numerical_cols = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
                  'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']
for col in numerical_cols:
    if col in data.columns:
        data[col] = data[col].fillna(data[col].median())

# Impute missing categorical values
if 'AQI_Bucket' in data.columns:
    data['AQI_Bucket'] = data['AQI_Bucket'].fillna(data['AQI_Bucket'].mode()[0])

# Drop AQI column as it's the target we want to predict indirectly
if 'AQI' in data.columns:
    data = data.drop('AQI', axis=1)

# Extract date features if 'Date' exists
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'])
    data['Year'] = data['Date'].dt.year
    data['Month'] = data['Date'].dt.month
    data['Day'] = data['Date'].dt.day
    data['DayOfWeek'] = data['Date'].dt.dayofweek
    data = data.drop('Date', axis=1)

# Drop duplicates
data = data.drop_duplicates()

target_variable = 'AQI_Bucket'

if target_variable in data.columns:
    X = data.drop(target_variable, axis=1)
    y = data[target_variable]

    # Split dataset with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Scale numerical features only
    scaler = StandardScaler()
    X_train_scaled_numeric = scaler.fit_transform(X_train[numerical_cols])
    X_test_scaled_numeric = scaler.transform(X_test[numerical_cols])

    # Combine scaled numeric features with date features
    date_cols = ['Year', 'Month', 'Day', 'DayOfWeek']
    X_train_final = np.concatenate([X_train_scaled_numeric, X_train[date_cols].values], axis=1)
    X_test_final = np.concatenate([X_test_scaled_numeric, X_test[date_cols].values], axis=1)

    # Define hyperparameter distribution for RandomizedSearchCV
    param_dist = {
        'n_estimators': randint(50, 300),
        'max_depth': randint(5, 30),
        'min_samples_split': randint(2, 10),
        'min_samples_leaf': randint(1, 5),
        'class_weight': ['balanced']
    }

    rf = RandomForestClassifier(random_state=42)

    random_search = RandomizedSearchCV(
        estimator=rf,
        param_distributions=param_dist,
        n_iter=30,              # Number of parameter settings sampled
        cv=3,                  # 3-fold cross-validation
        scoring='accuracy',
        n_jobs=-1,             # Use all CPU cores
        verbose=2,
        random_state=42
    )

    # Run hyperparameter tuning
    random_search.fit(X_train_final, y_train)

    print("Best hyperparameters:", random_search.best_params_)
    print("Best cross-validation accuracy:", random_search.best_score_)

    best_rf = random_search.best_estimator_

    # Evaluate on test set
    y_pred = best_rf.predict(X_test_final)
    test_acc = accuracy_score(y_test, y_pred)
    print(f"Test set accuracy: {test_acc:.4f}")
    print("\nClassification report:\n", classification_report(y_test, y_pred))

    # Save model and scaler
    joblib.dump(best_rf, "aqi_random_forest_best_model.pkl")
    joblib.dump(scaler, "aqi_scaler.pkl")

else:
    print(f"Target variable '{target_variable}' not found in the DataFrame.")


Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best hyperparameters: {'class_weight': 'balanced', 'max_depth': 26, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 138}
Best cross-validation accuracy: 0.8165856207895273
Test set accuracy: 0.8207

Classification report:
               precision    recall  f1-score   support

        Good       0.78      0.68      0.73       268
    Moderate       0.87      0.87      0.87      2606
        Poor       0.69      0.65      0.67       556
Satisfactory       0.81      0.85      0.83      1645
      Severe       0.82      0.78      0.80       268
   Very Poor       0.73      0.78      0.75       467

    accuracy                           0.82      5810
   macro avg       0.78      0.77      0.77      5810
weighted avg       0.82      0.82      0.82      5810



In [None]:
joblib.dump(best_rf, "aqi_random_forest_best_model.pkl")
joblib.dump(scaler, "aqi_scaler.pkl")

['aqi_scaler.pkl']

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from scipy.stats import randint
import joblib
import time

# Load data
data = pd.read_csv("/content/city_day.csv")

# Drop 'City' if present
if 'City' in data.columns:
    data = data.drop('City', axis=1)

# Impute missing numerical values with median
numerical_cols = ['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3',
                  'CO', 'SO2', 'O3', 'Benzene', 'Toluene', 'Xylene']
for col in numerical_cols:
    if col in data.columns:
        data[col] = data[col].fillna(data[col].median())

# Impute missing categorical values
if 'AQI_Bucket' in data.columns:
    data['AQI_Bucket'] = data['AQI_Bucket'].fillna(data['AQI_Bucket'].mode()[0])

# Drop AQI column as it's the target we want to predict indirectly
if 'AQI' in data.columns:
    data = data.drop('AQI', axis=1)

# Extract date features if 'Date' exists
if 'Date' in data.columns:
    data['Date'] = pd.to_datetime(data['Date'])
    data['Year'] = data['Date'].dt.year
    data['Month'] = data['Date'].dt.month
    data['Day'] = data['Date'].dt.day
    data['DayOfWeek'] = data['Date'].dt.dayofweek
    data = data.drop('Date', axis=1)

# Drop duplicates
data = data.drop_duplicates()

target_variable = 'AQI_Bucket'
if target_variable in data.columns:
    X = data.drop(target_variable, axis=1)
    y = data[target_variable]

    # Split dataset with stratification
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y)

    # Identify numerical and date columns for separate handling
    numerical_cols_present = [col for col in numerical_cols if col in X_train.columns]
    date_cols_present = ['Year', 'Month', 'Day', 'DayOfWeek']
    date_cols_present = [col for col in date_cols_present if col in X_train.columns]



In [2]:
# Scale numerical features only
scaler = StandardScaler()
X_train_scaled_numeric = scaler.fit_transform(X_train[numerical_cols_present])
X_test_scaled_numeric = scaler.transform(X_test[numerical_cols_present])

# Combine scaled numeric features with date features
X_train_final = np.concatenate([X_train_scaled_numeric, X_train[date_cols_present].values], axis=1)
X_test_final = np.concatenate([X_test_scaled_numeric, X_test[date_cols_present].values], axis=1)

# Define hyperparameter distribution for RandomizedSearchCV
param_dist = {
  'n_estimators': randint(50, 300),
  'max_depth': randint(5, 30),
  'min_samples_split': randint(2, 10),
  'min_samples_leaf': randint(1, 5),
  'class_weight': ['balanced']
}


In [3]:
start_time = time.time()

rf = RandomForestClassifier(random_state=42)

random_search = RandomizedSearchCV(
  estimator=rf,
  param_distributions=param_dist,
  n_iter=30,              # Number of parameter settings sampled
  cv=3,                  # 3-fold cross-validation
  scoring='accuracy',
  n_jobs=-1,             # Use all CPU cores
  verbose=2,
  random_state=42
)

# Run hyperparameter tuning
random_search.fit(X_train_final, y_train)

print("Best hyperparameters:", random_search.best_params_)
print("Best cross-validation accuracy:", random_search.best_score_)

best_rf = random_search.best_estimator_

# Evaluate on test set
y_pred = best_rf.predict(X_test_final)
test_acc = accuracy_score(y_test, y_pred)
print(f"Test set accuracy: {test_acc:.4f}")
print("\nClassification report:\n", classification_report(y_test, y_pred))

# Save model and scaler
joblib.dump(best_rf, "aqi_random_forest_best_model.pkl")
joblib.dump(scaler, "aqi_scaler.pkl")

end_time = time.time()

# Calculate the total training time
training_time = end_time - start_time
print(f"\nTotal training time: {training_time:.2f} seconds")
minutes = int(training_time // 60)
seconds = int(training_time % 60)
print(f"Total training time: {minutes} minutes and {seconds} seconds")

Fitting 3 folds for each of 30 candidates, totalling 90 fits
Best hyperparameters: {'class_weight': 'balanced', 'max_depth': 26, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 138}
Best cross-validation accuracy: 0.8165856207895273
Test set accuracy: 0.8207

Classification report:
               precision    recall  f1-score   support

        Good       0.78      0.68      0.73       268
    Moderate       0.87      0.87      0.87      2606
        Poor       0.69      0.65      0.67       556
Satisfactory       0.81      0.85      0.83      1645
      Severe       0.82      0.78      0.80       268
   Very Poor       0.73      0.78      0.75       467

    accuracy                           0.82      5810
   macro avg       0.78      0.77      0.77      5810
weighted avg       0.82      0.82      0.82      5810


Total training time: 553.02 seconds
Total training time: 9 minutes and 13 seconds
