In [None]:
import pandas as pd
import numpy as np

df = pd.read_csv('./stations_day_dataset.csv')
df.head()
df.isnull().sum().sort_values(ascending=False)

In [None]:
import numpy as np
import pandas as pd

def impute_strategy(df):
    result = []
    for station_id,group in df.groupby('StationId'):
        numeric_columns = group.select_dtypes(include='number').columns
        row_count = len(group)
        for col in numeric_columns:
            null_count = group[col].isnull().sum()
            null_pct = null_count / row_count
            if null_count == row_count:
                skew = np.nan
            else:
                skew = group[col].dropna().skew()
            result.append({
                'StationId': station_id,
                'Column': col,
                'Nulls': null_count,
                'TotalRows': row_count,
                'NullPct': null_pct,
                'Skew': skew
            })
    return pd.DataFrame(result)
summary_df = impute_strategy(df)

In [None]:
import pandas as pd
import numpy as np

def impute_values_station_wise(df):
    for station_id,group in df.groupby('StationId'):
        numeric_columns = group.select_dtypes(include='number').columns
        row_count = len(group)
        for col in numeric_columns:
            null_count_col = group[col].isnull().sum()
            null_pct =  null_count_col/row_count
            
            if null_pct <= 0.5:
                skew_value = group[col].dropna().skew()
                if skew_value > 0.5 or skew_value < -0.5:
                    df.loc[group.index,col] = group[col].fillna(group[col].median())
                else:
                    df.loc[group.index,col] = group[col].fillna(group[col].mean())
impute_values_station_wise(df)
df.isnull().sum().sort_values(ascending=False)

In [None]:
import pandas as pd
import numpy as np

def impute_values_city_wise(df):
    for City,group in df.groupby('City'):
        numeric_columns = group.select_dtypes(include='number').columns
        row_count = len(group)
        for col in numeric_columns:
            null_count_col = group[col].isnull().sum()
            null_pct =  null_count_col/row_count
            
            if null_pct <= 0.5:
                skew_value = group[col].dropna().skew()
                if skew_value > 0.5 or skew_value < -0.5:
                    df.loc[group.index,col] = group[col].fillna(group[col].median())
                else:
                    df.loc[group.index,col] = group[col].fillna(group[col].mean())
impute_values_city_wise(df)
df.isnull().sum().sort_values(ascending=False)

In [None]:
import pandas as pd
import numpy as np

def impute_values_state_wise(df):
    for state,group in df.groupby('State'):
        numeric_columns = group.select_dtypes(include='number').columns
        row_count = len(group)
        for col in numeric_columns:
            null_count_col = group[col].isnull().sum()
            null_pct =  null_count_col/row_count
            
            if null_pct <= 0.5:
                skew_value = group[col].dropna().skew()
                if skew_value > 0.5 or skew_value < -0.5:
                    df.loc[group.index,col] = group[col].fillna(group[col].median())
                else:
                    df.loc[group.index,col] = group[col].fillna(group[col].mean())
impute_values_state_wise(df)
df.isnull().sum().sort_values(ascending=False)

In [None]:
## Impute remanining values with rest of median or mean based on skew value

numeric_columns = df.select_dtypes(include='number').columns

for col in numeric_columns:
    skew = df[col].dropna().skew()
    if skew > 0.5 or skew < -0.5:
        df[col] = df[col].fillna(df[col].median())
    else:
        df[col] = df[col].fillna(df[col].mean())
df.isnull().sum().sort_values(ascending=False)
df.to_csv('./stations_cleaned.csv')
df.columns

In [None]:
df['Month'] = pd.to_datetime(df['Date']).dt.month
monthly_average_pm = df.groupby(['Month'])['PM2.5'].mean().reset_index()
monthly_average_pm.columns = ['Month', 'Avg_PM2.5']
monthly_average_pm.sort_values(by='Avg_PM2.5', ascending=False)
def get_season(month):
    if month in [10,11,12,1,2]:
        return "Winter"
    if month in [3,4,5]:
        return "Summer"
    if month in [6,7,8,9]:
        return "Rainy"
df['Season'] = df['Month'].apply(get_season)
df.columns

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(12,5))
sns.histplot(data=df,x='PM2.5',bins=50)
plt.show()

In [None]:
plt.figure(figsize=(12,5))
monthly_avg_pm = df.groupby('Month')['PM2.5'].mean().reset_index()
print(monthly_avg_pm)
sns.barplot(data=monthly_avg_pm,x='Month',y='PM2.5')
plt.show()

In [None]:
plt.figure(figsize=(12,5))
season_avg_pm = df.groupby('Season')['PM2.5'].mean().reset_index()
print(season_avg_pm)
sns.barplot(data=season_avg_pm,x='Season',y='PM2.5')
plt.show()

In [None]:
df['Day'] = pd.to_datetime(df['Date']).dt.dayofweek
daily_avg_pm = df.groupby('Day')['PM2.5'].mean().reset_index()
plt.figure(figsize=(12,6))
sns.barplot(data=daily_avg_pm,x='Day',y='PM2.5')

In [None]:
plt.figure(figsize=(12,6))
sns.scatterplot(data=df,x='temperature_2m',y='PM2.5')
plt.show()

In [None]:
from scipy.stats import zscore

df['PM2.5_zscore'] = zscore(df['PM2.5'])
df['PM2.5_outlier_flag'] = df['PM2.5_zscore'].abs() > 3
(df['PM2.5_outlier_flag'] == True).sum()

In [None]:
df.columns
plt.figure(figsize=(13,9))
correlation_data = df[['PM2.5', 'PM10', 'SO2', 'NO2',
                         'CO', 'O3', 'temperature_2m', 'relative_humidity_2m',
                         'windspeed_10m', 'NO', 'NOx','NH3','Benzene','Toluene']]
sns.heatmap(correlation_data.corr(),cmap=plt.cm.Reds,annot=True)
plt.title('Heatmap displaying the correlation matrix of the variables',fontsize=16)
plt.show()

In [None]:
df['PM2.5_outlier_flag'] = df['PM2.5_outlier_flag'].astype(int)

In [None]:
df.drop(columns=['AQI','AQI_Bucket','Unnamed: 0','FullAddress','StationName','FullAddress','StationId','location','longitude','latitude','Month','Date','State','PM2.5_flagged','Day','PM2.5_zscore'],inplace=True,errors='ignore')
df.columns

In [None]:
df_encoded = pd.get_dummies(df,columns=['City','Season'],drop_first=True)
df_encoded['PM2.5_outlier_flag'].head(1)

In [None]:
# df_encoded.to_csv('./stations_encoded.csv',index=False)
# df_encoded

# Model Training and Evaluation

In [71]:
df = df_encoded
df.columns

Index(['PM2.5', 'PM10', 'NO', 'NO2', 'NOx', 'NH3', 'CO', 'SO2', 'O3',
       'Benzene', 'Toluene', 'road_distance', 'river_distance',
       'industry_distance', 'temperature_2m', 'relative_humidity_2m',
       'windspeed_10m', 'PM2.5_outlier_flag', 'City_Aizawl', 'City_Amaravati',
       'City_Amritsar', 'City_Bengaluru', 'City_Bhopal', 'City_Brajrajnagar',
       'City_Chandigarh', 'City_Chennai', 'City_Coimbatore', 'City_Delhi',
       'City_Ernakulam', 'City_Gurugram', 'City_Guwahati', 'City_Hyderabad',
       'City_Jaipur', 'City_Jorapokhar', 'City_Kochi', 'City_Kolkata',
       'City_Lucknow', 'City_Mumbai', 'City_Patna', 'City_Shillong',
       'City_Talcher', 'City_Thiruvananthapuram', 'City_Visakhapatnam',
       'Season_Summer', 'Season_Winter'],
      dtype='object')

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

print(df.columns)  
numeric_df = df.select_dtypes(include='number')
corr_matrix = numeric_df.corr()

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix[['PM2.5']].sort_values(by='PM2.5', ascending=False), 
            annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation of Features with PM2.5')
plt.show()
# df.drop(columns=['road_distance','river_distance','industry_distance'],inplace=True)

In [72]:
X = df.drop(columns=['PM2.5'])
y = df['PM2.5']

In [74]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
numeric_columns = X.select_dtypes(include='number').columns
categorical_columns = X.drop(columns=numeric_columns)
scaled_array = scaler.fit_transform(X[numeric_columns])
scaled_numeric_X = pd.DataFrame(scaled_array, columns=numeric_columns, index=X.index)
df = pd.concat([categorical_columns,scaled_numeric_X],axis=1)


In [None]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model = LinearRegression()
model.fit(X_train,y_train)

y_pred = model.predict(X_test)
r2 = r2_score(y_test,y_pred)
mae = mean_absolute_error(y_test,y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"R² Score: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print("PM2.5 mean:", y.mean())
print("PM2.5 std deviation:", y.std())

In [None]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

dt = DecisionTreeRegressor(random_state=42)
dt.fit(X_train,y_train)
y_pred_dt = dt.predict(X_test)

print("Decision Tree:")
print("R² Score:", r2_score(y_test, y_pred_dt))
print("RMSE:", mean_squared_error(y_test, y_pred_dt))
print("MAE:", mean_absolute_error(y_test, y_pred_dt))

In [75]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(n_estimators=100, random_state=42,max_depth=30,n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest:")
print("R² Score:", r2_score(y_test, y_pred_rf))
print("RMSE:", mean_squared_error(y_test, y_pred_rf))
print("MAE:", mean_absolute_error(y_test, y_pred_rf))


Random Forest:
R² Score: 0.8950308241769108
RMSE: 595.9108734286533
MAE: 13.910725407183236


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


rf = RandomForestRegressor(random_state=42, n_jobs=-1)

param_grid = {
    'n_estimators': [200, 100],
    'max_depth': [30, 20],
    'min_samples_split': [2, 5],
    'max_features': ['log2']
}

grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    scoring='r2',
    verbose=2,
    n_jobs=-1
)

grid_search.fit(X_train, y_train)

best_rf = grid_search.best_estimator_

y_pred_rf_best = best_rf.predict(X_test)

print("Best Random Forest:")
print("Best Parameters:", grid_search.best_params_)
print("R² Score:", r2_score(y_test, y_pred_rf_best))
print("RMSE:", mean_squared_error(y_test, y_pred_rf_best))
print("MAE:", mean_absolute_error(y_test, y_pred_rf_best))


In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,root_mean_squared_error
from scipy.stats import randint

param_dist = {
    'n_estimators': randint(100, 200),
    'max_depth': randint(10, 30),
    'min_samples_split': randint(2, 10),
    'min_samples_leaf': randint(1, 5),
    'max_features': ['auto', 'sqrt', 'log2']
}

rf = RandomForestRegressor(random_state=42, n_jobs=-1)

random_search = RandomizedSearchCV(
    estimator=rf,
    param_distributions=param_dist,
    n_iter=12,          
    cv=2,               
    scoring='r2',
    random_state=42,
    verbose=2,
    n_jobs=-1
)

random_search.fit(X_train, y_train)

best_rf = random_search.best_estimator_

y_pred_rf = best_rf.predict(X_test)

print("Random Forest (RandomizedSearchCV):")
print("Best Params:", random_search.best_params_)
print("R² Score:", r2_score(y_test, y_pred_rf))
print("RMSE:", root_mean_squared_error(y_test, y_pred_rf, squared=False))
print("MAE:", mean_absolute_error(y_test, y_pred_rf))


In [77]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,root_mean_squared_error

rf = RandomForestRegressor(n_estimators=100, random_state=42,max_depth=30,n_jobs=-1)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

print("Random Forest:")
print("R² Score:", r2_score(y_test, y_pred_rf))
print("RMSE:", root_mean_squared_error(y_test, y_pred_rf))
print("MAE:", mean_absolute_error(y_test, y_pred_rf))


Random Forest:
R² Score: 0.8950308241769108
RMSE: 24.411285779914447
MAE: 13.910725407183238


In [78]:
import folium
from folium.plugins import HeatMap
import pandas as pd

# Combine actual and predicted into a DataFrame with coordinates
df_map = X_test.copy()
df_map['PM2.5_actual'] = y_test.values
df_map['PM2.5_predicted'] = y_pred_rf

# Create base map centered on Delhi
delhi_map = folium.Map(location=[28.6139, 77.2090], zoom_start=10)

# Add actual PM2.5 layer
for _, row in df_map.iterrows():
    folium.CircleMarker(
        location=[row['latitude'], row['longitude']],
        radius=5,
        popup=f"Actual: {row['PM2.5_actual']:.2f}\nPredicted: {row['PM2.5_predicted']:.2f}",
        color='blue',
        fill=True,
        fill_color='blue',
        fill_opacity=0.5
    ).add_to(delhi_map)

# Add predicted PM2.5 layer (with slight offset to visualize both if needed)
for _, row in df_map.iterrows():
    folium.CircleMarker(
        location=[row['latitude'] + 0.002, row['longitude'] + 0.002],  # Slight offset
        radius=5,
        popup=f"Predicted: {row['PM2.5_predicted']:.2f}",
        color='red',
        fill=True,
        fill_color='red',
        fill_opacity=0.5
    ).add_to(delhi_map)

# Show map
delhi_map.save("delhi_pm25_comparison_map.html")

KeyError: 'latitude'

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Scatter plot
plt.figure(figsize=(8, 6))
sns.scatterplot(x=y_test, y=y_pred_rf, alpha=0.5)

plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--')

plt.xlabel("Actual PM2.5")
plt.ylabel("Predicted PM2.5")
plt.title("Predicted vs Actual PM2.5 - Random Forest")
plt.grid(True)
plt.tight_layout()
plt.show()