In [1]:
import numpy as np 
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
df = pd.read_csv('city_day.csv')
df.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
0,Ahmedabad,2015-01-01,,,0.92,18.22,17.15,,0.92,27.64,133.36,0.0,0.02,0.0,,
1,Ahmedabad,2015-01-02,,,0.97,15.69,16.46,,0.97,24.55,34.06,3.68,5.5,3.77,,
2,Ahmedabad,2015-01-03,,,17.4,19.3,29.7,,17.4,29.07,30.7,6.8,16.4,2.25,,
3,Ahmedabad,2015-01-04,,,1.7,18.48,17.97,,1.7,18.59,36.08,4.43,10.14,1.0,,
4,Ahmedabad,2015-01-05,,,22.1,21.42,37.76,,22.1,39.33,39.31,7.01,18.89,2.78,,


In [3]:
df_cleaned = df.dropna(inplace=False)

In [4]:
df_cleaned.head()

Unnamed: 0,City,Date,PM2.5,PM10,NO,NO2,NOx,NH3,CO,SO2,O3,Benzene,Toluene,Xylene,AQI,AQI_Bucket
2123,Amaravati,2017-11-25,81.4,124.5,1.44,20.5,12.08,10.72,0.12,15.24,127.09,0.2,6.5,0.06,184.0,Moderate
2124,Amaravati,2017-11-26,78.32,129.06,1.26,26.0,14.85,10.28,0.14,26.96,117.44,0.22,7.95,0.08,197.0,Moderate
2125,Amaravati,2017-11-27,88.76,135.32,6.6,30.85,21.77,12.91,0.11,33.59,111.81,0.29,7.63,0.12,198.0,Moderate
2126,Amaravati,2017-11-28,64.18,104.09,2.56,28.07,17.01,11.42,0.09,19.0,138.18,0.17,5.02,0.07,188.0,Moderate
2127,Amaravati,2017-11-29,72.47,114.84,5.23,23.2,16.59,12.25,0.16,10.55,109.74,0.21,4.71,0.08,173.0,Moderate


In [5]:
df_cleaned['AQI_encoded'] = pd.factorize(df_cleaned['AQI_Bucket'])[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['AQI_encoded'] = pd.factorize(df_cleaned['AQI_Bucket'])[0]


In [6]:
X = df_cleaned[['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'CO', 'SO2', 'O3', 'NH3', 'AQI_encoded']]
y = df_cleaned['AQI']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [5, 10, 20],
    'min_samples_split': [2, 5, 10],
}

In [9]:
rf = RandomForestRegressor()

In [10]:
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=2, n_jobs=-1, verbose=1)
grid_search.fit(X_train, y_train)
print("Best Parameters from GridSearchCV:", grid_search.best_params_)

Fitting 2 folds for each of 27 candidates, totalling 54 fits
Best Parameters from GridSearchCV: {'max_depth': 20, 'min_samples_split': 10, 'n_estimators': 100}


In [11]:
random_search = RandomizedSearchCV(estimator=rf, param_distributions=param_grid, n_iter=10, cv=2, n_jobs=-1, verbose=1, random_state=42)
random_search.fit(X_train, y_train)
print("Best Parameters from RandomizedSearchCV:", random_search.best_params_)

Fitting 2 folds for each of 10 candidates, totalling 20 fits


Best Parameters from RandomizedSearchCV: {'n_estimators': 50, 'min_samples_split': 5, 'max_depth': 20}


In [12]:
print("Evaluation for GridSearchCV Best Model:")
y_pred = grid_search.best_estimator_.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse}, R^2 Score: {r2}")
test_score = grid_search.best_estimator_.score(X_test, y_test)
print(f"Test Score: {test_score}")

Evaluation for GridSearchCV Best Model:
MSE: 227.1839244520272, R^2 Score: 0.9726949305742961
Test Score: 0.9726949305742961


In [13]:
print("Evaluation for RandomizedSearchCV Best Model:")
y_pred = random_search.best_estimator_.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"MSE: {mse}, R^2 Score: {r2}")
test_score = random_search.best_estimator_.score(X_test, y_test)
print(f"Test Score: {test_score}")

Evaluation for RandomizedSearchCV Best Model:
MSE: 231.84086996167724, R^2 Score: 0.9721352157055639
Test Score: 0.9721352157055639
