In [13]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error,mean_absolute_error

In [14]:
#load processed data
# The parent of the current working directory is treated as the project root.
PROJECT_ROOT=os.path.abspath(os.path.join(os.getcwd(),os.pardir))
DATA_PROCESSED_DIR=os.path.join(PROJECT_ROOT,'data','processed')

# Read the processed CSV, parse the Timestamp column, then order the rows by
# city and time and renumber the index from 0.
df=(
    pd.read_csv(os.path.join(DATA_PROCESSED_DIR,'aqi_processed_data.csv'))
    .assign(Timestamp=lambda frame:pd.to_datetime(frame['Timestamp']))
    .sort_values(['City','Timestamp'])
    .reset_index(drop=True)
)

In [15]:
#feature engineering for ML models

# Number of rows ahead of the current row that the target AQI is taken from.
HORIZON=1

# Pollutant measurement columns and their companion "<name>_was_missing" flags.
POLLUT_COL=[
    'PM2.5',
    'PM10',
    'NO2',
    'NH3',
    'SO2',
    'CO',
    'O3',
]
MISSING_FLAG_COL=[name+"_was_missing" for name in POLLUT_COL]

# Look-back offsets (in rows) used when building lagged columns.
LAGS=[1,2,3,7,14]

In [16]:
def make_lag_features(city_df:pd.DataFrame)->pd.DataFrame:
    """Add the forecast target, lagged columns and calendar columns for one city.

    The input is copied and ordered by ``Timestamp``; the caller's frame is not
    modified. Every shift is row-based, so a shift of ``k`` equals ``k`` days
    only if the rows are consecutive daily records (assumed -- TODO confirm).

    Parameters
    ----------
    city_df : pd.DataFrame
        Rows to process together. Must contain ``Timestamp`` (datetime dtype),
        ``AQI`` and every column listed in ``POLLUT_COL``.

    Returns
    -------
    pd.DataFrame
        The sorted copy with these columns appended, in this order: ``y``
        (``AQI`` shifted ``HORIZON`` rows into the future), ``AQI_lag<k>`` for
        each ``k`` in ``LAGS``, ``<pollutant>_lag<k>`` for each pollutant and
        each ``k``, then ``dow`` (day of week) and ``month``.
    """
    ordered=city_df.sort_values('Timestamp').copy()

    #target: AQI taken HORIZON rows ahead
    ordered['y']=ordered['AQI'].shift(-HORIZON)

    #lagged copies: AQI first, then every pollutant, one column per lag
    for source_col in ['AQI',*POLLUT_COL]:
        for k in LAGS:
            ordered[f'{source_col}_lag{k}']=ordered[source_col].shift(k)

    #calendar features
    stamps=ordered['Timestamp']
    ordered['dow']=stamps.dt.dayofweek
    ordered['month']=stamps.dt.month

    return ordered



In [17]:
# Build the lag features one city at a time and stack the pieces.
# An explicit loop over the groups is used instead of
# `df.groupby('City').apply(make_lag_features)`: apply() operating on the
# grouping column is deprecated in pandas, and once grouping columns are
# excluded from what apply() passes to the function, the result would no longer
# carry the 'City' column that the following cells group on.
# Groups are visited in sorted key order, so the row order matches the
# (City, Timestamp)-sorted input.
df_feat=pd.concat(
    [make_lag_features(city_rows) for _,city_rows in df.groupby('City')]
)

#split flags

TRAIN_END_DATE='2022-12-31'
VAL_END_DATE='2023-03-31'

def add_split_flags(city_df:pd.DataFrame)->pd.DataFrame:
    """Label every row as ``'train'``, ``'val'`` or ``'test'`` by its timestamp.

    Rows with ``Timestamp <= TRAIN_END_DATE`` become ``'train'``; remaining rows
    with ``Timestamp <= VAL_END_DATE`` become ``'val'``; everything else is
    ``'test'``. The input is copied and ordered by ``Timestamp``; the caller's
    frame is not modified.

    Parameters
    ----------
    city_df : pd.DataFrame
        Frame with a datetime ``Timestamp`` column.

    Returns
    -------
    pd.DataFrame
        The sorted copy with a ``split`` column added (or overwritten).
    """
    ordered=city_df.sort_values('Timestamp').copy()
    stamps=ordered['Timestamp']

    in_train=stamps<=TRAIN_END_DATE
    # validation = up to the validation cut-off but not already in training
    in_val=(stamps<=VAL_END_DATE)&~in_train

    ordered['split']='test'
    ordered.loc[in_val,'split']='val'
    ordered.loc[in_train,'split']='train'
    return ordered

# Tag each city's rows with their split and stack the pieces.
# An explicit loop over the groups replaces `groupby('City').apply(...)`:
# apply() operating on the grouping column is deprecated in pandas, and with
# grouping columns excluded the 'City' column would be missing from the result.
df_feat=pd.concat(
    [add_split_flags(city_rows) for _,city_rows in df_feat.groupby('City')]
)
df_feat['split'].value_counts()

  df_feat=df.groupby('City', group_keys=False).apply(make_lag_features)
  df_feat=df_feat.groupby('City',group_keys=False).apply(add_split_flags)


split
train    3288
test     1923
val       270
Name: count, dtype: int64

In [18]:
#model inputs: lagged AQI, lagged pollutants, missing-value flags, calendar columns
feature_cols=[
    *(f'AQI_lag{lag}' for lag in LAGS),
    *(f'{col}_lag{lag}' for col in POLLUT_COL for lag in LAGS),
    *MISSING_FLAG_COL,
    'dow',
    'month',
]

#rows with a missing target or any missing feature are dropped
required_cols=['y',*feature_cols]
df_model=df_feat.dropna(subset=required_cols).copy()

df_model[['City','Timestamp','split','y']].head(),df_model.shape

(         City  Timestamp  split           y
 14  Bengaluru 2020-01-15  train   93.000000
 15  Bengaluru 2020-01-16  train  108.707383
 16  Bengaluru 2020-01-17  train   59.000000
 17  Bengaluru 2020-01-18  train   56.560000
 18  Bengaluru 2020-01-19  train   56.474483,
 (5263, 62))

In [19]:
#make sure Timestamp is datetime before comparing it with the cut-offs below
df_model['Timestamp']=pd.to_datetime(df_model['Timestamp'])

#NOTE(review): this overwrites the 'split' labels already present in df_model
#with different cut-off dates than TRAIN_END_DATE / VAL_END_DATE -- confirm
#which split is the intended one.
train_cutoff=pd.Timestamp("2023-12-31")
val_cutoff=pd.Timestamp("2024-06-30")
up_to_val=df_model['Timestamp']<=val_cutoff
up_to_train=df_model['Timestamp']<=train_cutoff

df_model['split']='test'
df_model.loc[up_to_val,'split']='val'
df_model.loc[up_to_train,'split']='train'

#per city and split: first date, last date and row count
df_model.groupby(['City','split'])['Timestamp'].agg(['min','max','count'])


Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,count
City,split,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Bengaluru,test,2024-07-01,2024-12-30,167
Bengaluru,train,2020-01-15,2023-12-31,1384
Bengaluru,val,2024-01-01,2024-06-30,147
Delhi,test,2024-07-01,2024-12-30,183
Delhi,train,2020-01-15,2023-12-31,1447
Delhi,val,2024-01-01,2024-06-30,182
Mumbai,test,2024-07-01,2024-12-30,183
Mumbai,train,2020-01-15,2023-12-31,1388
Mumbai,val,2024-01-01,2024-06-30,182


In [20]:
#Evaluation metrics used to score the models: sMAPE (in percent) and RMSE

def smape(y_true,y_pred,eps=1e-8):
    """Symmetric mean absolute percentage error, expressed in percent.

    Computes ``mean(|y_true - y_pred| / ((|y_true| + |y_pred| + eps) / 2)) * 100``.

    Parameters
    ----------
    y_true, y_pred : array-like
        Observed and predicted values; both are converted with ``np.asarray``.
    eps : float, default 1e-8
        Added to the denominator so that a pair of zeros does not divide by zero.

    Returns
    -------
    float
        The averaged percentage error.
    """
    actual=np.asarray(y_true)
    forecast=np.asarray(y_pred)

    abs_error=np.abs(actual-forecast)
    half_scale=(np.abs(actual)+np.abs(forecast)+eps)/2
    ratios=abs_error/half_scale
    return np.mean(ratios)*100

def rmse(y_true,y_pred):
    """Root mean squared error: the square root of ``mean_squared_error(y_true, y_pred)``."""
    mse=mean_squared_error(y_true,y_pred)
    return np.sqrt(mse)

In [21]:
#train and evaluate models per city on val and test sets
def fit_eval_city(city_df:pd.DataFrame,model,model_name:str):
    """Fit ``model`` on one city's 'train' rows and score it on 'val' and 'test'.

    Parameters
    ----------
    city_df : pd.DataFrame
        Rows for one city with a ``split`` column, the target ``y`` and every
        column listed in ``feature_cols``. The city name is read from the first
        row's ``City`` value.
    model
        Estimator exposing ``fit(X, y)`` and ``predict(X)``; it is fitted in place.
    model_name : str
        Label stored in the ``Model`` field of each result row.

    Returns
    -------
    list[dict]
        Two records, 'val' first and then 'test', each with the keys
        ``City``, ``Model``, ``Split``, ``MAE``, ``RMSE``, ``sMAPE`` and ``n``
        (number of rows scored).
    """
    # Slice features and target for every split up front, before fitting.
    xy_by_split={}
    for split_name in ('train','val','test'):
        part=city_df[city_df['split']==split_name]
        xy_by_split[split_name]=(part[feature_cols],part['y'])

    model.fit(*xy_by_split['train'])

    out_rows=[]
    for split_name in ('val','test'):
        features,target=xy_by_split[split_name]
        pred=model.predict(features)
        out_rows.append({
            'City':city_df['City'].iloc[0],
            'Model':model_name,
            'Split':split_name,
            'MAE':mean_absolute_error(target,pred),
            'RMSE':rmse(target,pred),
            'sMAPE':smape(target,pred),
            'n':len(target)
        })
    return out_rows
results=[]


In [22]:
# Fit and score both model types for every city.
# `results` is reset here, inside the cell that fills it, so re-running this
# cell rebuilds the table from scratch instead of appending a second copy of
# every row to a list left over from a previous run.
results=[]
for city,g in df_model.groupby('City'):
    #Linear Regression
    lr_model=LinearRegression()
    results.extend(fit_eval_city(g,lr_model,'LinearRegression'))

    #Random Forest (fixed random_state so the scores are reproducible)
    rf_model=RandomForestRegressor(n_estimators=300,random_state=42,n_jobs=-1)
    results.extend(fit_eval_city(g,rf_model,'RandomForest'))

# One row per (city, model, evaluated split).
ml_results_df=pd.DataFrame(results)
ml_results_df

Unnamed: 0,City,Model,Split,MAE,RMSE,sMAPE,n
0,Bengaluru,LinearRegression,val,31.723824,66.753122,26.449206,147
1,Bengaluru,LinearRegression,test,27.496355,35.414938,30.837405,167
2,Bengaluru,RandomForest,val,30.483979,64.512233,25.663071,147
3,Bengaluru,RandomForest,test,24.736173,31.175776,28.452387,167
4,Delhi,LinearRegression,val,46.425536,60.969254,21.70451,182
5,Delhi,LinearRegression,test,42.304313,55.66454,23.509907,183
6,Delhi,RandomForest,val,53.112431,65.816031,24.990765,182
7,Delhi,RandomForest,test,44.25395,59.193801,22.88481,183
8,Mumbai,LinearRegression,val,20.368843,25.736431,23.591896,182
9,Mumbai,LinearRegression,test,16.426282,20.453013,22.611299,183


In [23]:
#create <PROJECT_ROOT>/outputs if needed and write the metrics table there
OUTPUTS_DIR=os.path.join(PROJECT_ROOT,'outputs')
os.makedirs(OUTPUTS_DIR,exist_ok=True)

ml_results_path=os.path.join(OUTPUTS_DIR,'ml_model_results.csv')
ml_results_df.to_csv(ml_results_path,index=False)


In [24]:
from sklearn.base import clone

#unfitted estimator templates, keyed by the names used in ml_results_df['Model']
MODELS=dict(
    LinearRegression=LinearRegression(),
    RandomForest=RandomForestRegressor(n_estimators=300,random_state=42,n_jobs=-1),
)

def train_predict_best(city_df:pd.DataFrame):
    """Refit the city's best validation model on 'train' rows and predict its 'test' rows.

    The model is chosen as the ``ml_results_df`` row with the lowest ``MAE``
    among this city's ``'val'`` rows. A fresh clone of the matching ``MODELS``
    entry is fitted on the 'train' rows only.

    Parameters
    ----------
    city_df : pd.DataFrame
        Rows for one city with ``City``, ``Timestamp``, ``split``, ``y`` and
        every column listed in ``feature_cols``.

    Returns
    -------
    pd.DataFrame
        One row per 'test' row with the columns ``City``, ``Timestamp``,
        ``y_true``, ``y_pred`` and ``Model`` (the chosen model's name).
    """
    city_name=city_df['City'].iloc[0]
    is_city_val=(ml_results_df['City']==city_name) & (ml_results_df['Split']=='val')
    best_model_name=ml_results_df[is_city_val].sort_values('MAE')['Model'].iloc[0]

    # clone() yields an unfitted copy, so the shared template is never fitted
    model=clone(MODELS[best_model_name])

    train_rows=city_df[city_df['split']=='train']
    test_rows=city_df[city_df['split']=='test']

    model.fit(train_rows[feature_cols],train_rows['y'])
    predictions=model.predict(test_rows[feature_cols])

    out=test_rows[['City','Timestamp']].copy()
    out['y_true']=test_rows['y'].values
    out['y_pred']=predictions
    out['Model']=best_model_name
    return out

#test-set predictions from each city's best validation model, stacked into one table
pred_dfs=[train_predict_best(city_rows) for _,city_rows in df_model.groupby('City')]
ml_test_preds=pd.concat(pred_dfs,ignore_index=True)

test_preds_path=os.path.join(OUTPUTS_DIR,'ml_model_test_predictions.csv')
ml_test_preds.to_csv(test_preds_path,index=False)

ml_test_preds.head()

Unnamed: 0,City,Timestamp,y_true,y_pred,Model
0,Bengaluru,2024-07-01,81.44,81.518655,RandomForest
1,Bengaluru,2024-07-02,70.04,78.65196,RandomForest
2,Bengaluru,2024-07-03,101.292349,91.881793,RandomForest
3,Bengaluru,2024-07-04,67.4,78.372061,RandomForest
4,Bengaluru,2024-07-05,89.27,96.823224,RandomForest
