In [1]:
import numpy as np # linear algebrae
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression

import os

# from polire import IDW

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

from datetime import date, timedelta
import datetime

import json
import pickle as pkl
import itertools

import torch

## Config

In [2]:
class CFG:
    """Notebook-wide settings, read and updated as class attributes."""
    # Step between consecutive evaluation cut-offs; DataSplitter passes it to timedelta(days=...).
    evaluation_time_gap = 1
    # Not read anywhere in the visible part of this notebook.
    convert_numpy = False
    # Target columns; copied into the module-level `target_list` after loading.
    target_list = ['pm2_5', 'pm10']
    # Default feature columns; assigned after loading and extended by the feature-engineering helpers.
    features = None

## Basic Loading

In [3]:
# NOTE: absolute Kaggle input path; adjust when running outside Kaggle.
df = pd.read_csv('/kaggle/input/airdelhi-tabularengineering/tabular_data.csv')

df = df.drop(columns = 'Unnamed: 0')
df['date_value'] = pd.to_datetime(df['date_value'])

dates = pd.to_datetime([df['date_value'].min(), df['date_value'].max()])

# Chronological split point: 75% of the way through the covered period, floored to midnight.
max_train_date = dates.min() + (dates.max() - dates.min()) * 0.75
max_train_date = max_train_date.floor("D")
min_train_date = df['date_value'].min()
max_date = df['date_value'].max().floor("D")

target_list = CFG.target_list

features = ['date_value', 'timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']

CFG.base_features = ['timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']
# Copy rather than alias: CFG.features is extended in place by later cells, and
# sharing one list object would silently add those columns to CFG.base_features too.
CFG.features = list(CFG.base_features)

max_train_date, min_train_date

(Timestamp('2021-01-07 00:00:00'), Timestamp('2020-11-01 00:00:00'))

In [4]:
# Min-max scale the base feature columns in place.
# NOTE(review): the scaler is fitted on every row, including the dates later used
# for evaluation — confirm this is acceptable for the reported test metrics.
scaled_columns = ['timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']
scaler = MinMaxScaler()
df[scaled_columns] = scaler.fit_transform(df[scaled_columns])


## Basic Helper Functions

In [5]:
def clean_df(df, features=None):
    """Return only the feature columns of ``df``.

    Args:
        df: DataFrame containing at least the requested columns.
        features: Column names to keep; when ``None`` the current
            ``CFG.features`` list is used.

    Returns:
        The column subset ``df[columns]``.
    """
    columns = CFG.features if features is None else features
    return df[columns]

In [6]:
from sklearn.metrics import accuracy_score

class AQIClassifier:
    """Accuracy metric computed on AQI categories instead of raw concentrations.

    Concentrations are bucketed with pollutant-specific breakpoints
    (upper bound inclusive) and the metric is the fraction of samples whose
    predicted bucket equals the true bucket.

    Args:
        pollutant: ``'pm2_5'`` or ``'pm10'``.

    Raises:
        ValueError: for any other pollutant name.
    """

    def __init__(self, pollutant):
        if pollutant == 'pm2_5':
            self.thresholds_ = [0, 30, 60, 90, 120, 250, np.inf]
        elif pollutant == 'pm10':
            self.thresholds_ = [0, 50, 100, 250, 350, 430, np.inf]
        else:
            raise ValueError("pollutant must be 'pm2_5' or 'pm10'")
        self.pollutant = pollutant
        self.bin_labels_ = ['Good', 'Satisfactory', 'Moderately Polluted', 'Poor', 'Very Poor', 'Severe']

    def bin(self, values):
        """Map concentrations to integer category indices (0 = first label).

        Values at or below the first threshold (an exact 0, or a negative
        regression output) are placed in category 0; without the clamp
        ``np.digitize(..., right=True) - 1`` would yield -1 for them, a
        category that has no label.
        """
        values = np.asarray(values)
        raw_bins = np.digitize(values, self.thresholds_, right=True) - 1
        return np.maximum(raw_bins, 0)

    def __call__(self, y_true, y_pred, return_labels=False):
        """Return category accuracy; optionally also the two bin arrays.

        Args:
            y_true: True concentrations.
            y_pred: Predicted concentrations.
            return_labels: When True, return ``(acc, (true_bins, pred_bins))``.
        """
        y_true_bins = self.bin(y_true)
        y_pred_bins = self.bin(y_pred)

        acc = accuracy_score(y_true_bins, y_pred_bins)

        if return_labels:
            return acc, (y_true_bins, y_pred_bins)
        return acc


## Get Model Splits

In [7]:
class DataSplitter:
    """Builds rolling train/test index splits keyed by cut-off date.

    Args:
        dataset: DataFrame with a ``date_value`` column.
        min_date: Earliest date allowed in any training window.
        max_date: Base date; the first cut-off is one day after it.
        max_lookback: If set, training rows must be newer than
            ``cutoff - max_lookback`` days.
        min_lookback: If set, training rows newer than
            ``cutoff - min_lookback`` days are dropped, provided more than
            200 rows remain afterwards.
    """

    def __init__(self, dataset, min_date, max_date, max_lookback=None, min_lookback=None):
        self.df = dataset

        self.min_train_date = min_date
        self.max_train_date = max_date
        self.df_max_date = self.df['date_value'].max()

        self.max_lookback = max_lookback
        self.min_lookback = min_lookback

    def get_train_test_split(self):
        """Return ``{cutoff: {'train': index, 'test': index}}``.

        For each cut-off the training rows are those dated up to and
        including the cut-off (subject to the look-back limits) and the test
        rows are those dated exactly one day later. Cut-offs advance by
        ``CFG.evaluation_time_gap`` days while they are before the last date
        in the dataset. The indices are index labels of the dataset.
        """
        frame = self.df.copy()
        frame = frame[frame['date_value'] >= self.min_train_date]
        row_dates = frame['date_value']

        one_day = timedelta(days=1)
        cutoff = self.max_train_date + one_day
        splits = {}

        while cutoff < self.df_max_date:
            keep = row_dates <= cutoff
            if self.max_lookback is not None:
                keep = keep & (row_dates > cutoff - timedelta(days=self.max_lookback))
            train_rows = frame[keep]

            if self.min_lookback is not None:
                newest_allowed = cutoff - timedelta(days=self.min_lookback)
                trimmed = train_rows[train_rows['date_value'] <= newest_allowed]
                # Only honour the gap when enough rows survive it.
                if len(trimmed) > 200:
                    train_rows = trimmed

            # Key order ('train', then 'test') is relied on by callers that unpack .values().
            splits[cutoff] = {
                'train': train_rows.index,
                'test': frame[row_dates == cutoff + one_day].index,
            }

            cutoff = cutoff + timedelta(days=CFG.evaluation_time_gap)

        return splits

In [8]:
# Sanity check: splits without any look-back limit.
data_splitter = DataSplitter(df, min_train_date, max_train_date)
data_indices = data_splitter.get_train_test_split()

# Look at the first split only.
test_date = list(data_indices)[0]
first_split = data_indices[test_date]
train_indices, test_indices = first_split['train'], first_split['test']

train_df_1 = df.iloc[train_indices]
test_df_1 = df.iloc[test_indices]

# Latest training date, then the date(s) covered by the test rows.
print(test_date, sorted(train_df_1.date_value.unique())[-1])
print(test_date, sorted(test_df_1.date_value.unique()))

2021-01-08 00:00:00 2021-01-08 00:00:00
2021-01-08 00:00:00 [Timestamp('2021-01-09 00:00:00')]


In [9]:
# Sanity check: training window limited to the last 3 days.
data_splitter = DataSplitter(df, min_train_date, max_train_date, max_lookback=3)
data_indices = data_splitter.get_train_test_split()

# Look at the first split only.
test_date = list(data_indices)[0]
first_split = data_indices[test_date]
train_indices, test_indices = first_split['train'], first_split['test']

train_df_1 = df.iloc[train_indices]
test_df_1 = df.iloc[test_indices]

# Dates present in the training window, then in the test rows.
print(test_date, sorted(train_df_1.date_value.unique()))
print(test_date, sorted(test_df_1.date_value.unique()))

2021-01-08 00:00:00 [Timestamp('2021-01-06 00:00:00'), Timestamp('2021-01-07 00:00:00'), Timestamp('2021-01-08 00:00:00')]
2021-01-08 00:00:00 [Timestamp('2021-01-09 00:00:00')]


In [10]:
# Sanity check: start splitting at the very first date with a 2-day window.
data_splitter = DataSplitter(df, min_train_date, min_train_date, max_lookback=2)
data_indices = data_splitter.get_train_test_split()

# Look at the first split only.
test_date = list(data_indices)[0]
first_split = data_indices[test_date]
train_indices, test_indices = first_split['train'], first_split['test']

train_df_1 = df.iloc[train_indices]
test_df_1 = df.iloc[test_indices]

# Dates present in the training window, then in the test rows.
print(test_date, sorted(train_df_1.date_value.unique()))
print(test_date, sorted(test_df_1.date_value.unique()))

print(min_train_date, df.date_value.min())

2020-11-02 00:00:00 [Timestamp('2020-11-01 00:00:00'), Timestamp('2020-11-02 00:00:00')]
2020-11-02 00:00:00 [Timestamp('2020-11-03 00:00:00')]
2020-11-01 00:00:00 2020-11-01 00:00:00


## Model Holder

In [11]:
class Model:
    """Wrapper that trains one estimator per evaluation split.

    Args:
        model_class: Estimator class exposing ``fit(X, y)`` and ``predict(X)``.
        params: Keyword arguments used to build each estimator (default: none).
        target: Name of the target column.
        features: Feature column names; defaults to ``CFG.features`` as it is
            at construction time.
    """

    def __init__(
        self,
        model_class,
        params=None,
        target="pm2_5",
        features=None,
    ):
        self.model_class = model_class
        self.params = {} if params is None else params

        # Optional key selecting which per-split estimator predict() uses.
        self.max_train_date = None
        # Always present so that predict() can report "not fitted" clearly.
        self.split_models = {}
        self.target = target
        if features is None:
            features = CFG.features

        self.features = features

    def fit(self, X, y):
        """Build a fresh estimator, fit it on ``(X, y)`` and return it."""
        model = self.model_class(**self.params)
        model.fit(X, y)
        return model

    def predict(self, X):
        """Predict with one of the per-split estimators.

        Uses the estimator stored under ``self.max_train_date`` when that is
        set; otherwise falls back to the one with the largest split key.
        Previously the lookup always used the key ``None`` and therefore
        failed.

        Raises:
            RuntimeError: if no estimator has been fitted yet.
            KeyError: if ``max_train_date`` is set but has no estimator.
        """
        if not self.split_models:
            raise RuntimeError("Model.predict called before fit_on_splits")
        key = self.max_train_date
        if key is None:
            key = max(self.split_models)
        return self.split_models[key].predict(X)

    def get_df_from_split(self, df, split, split_type='train'):
        """Return ``(X, y)`` for the ``'train'`` or ``'test'`` part of a split."""
        # DataSplitter stores index *labels* (``frame.index``), so select with
        # .loc; .iloc only gave the same rows when the index was 0..n-1.
        subset = df.loc[split[split_type]].copy()
        y = np.array(subset[self.target])
        X = clean_df(subset, features=self.features)
        return X, y

    def fit_on_splits(self, df, splits):
        """Fit one estimator per split and store it under the split key."""
        self.split_models = {}

        for d, split in splits.items():
            X, y = self.get_df_from_split(df, split)
            self.split_models[d] = self.fit(X, y)

    def predict_on_splits(self, df, splits, train=False):
        """Predict every split with its own estimator.

        Args:
            df: Frame the split indices refer to.
            splits: Mapping produced by ``DataSplitter.get_train_test_split``.
            train: Predict the training rows when True, else the test rows.

        Returns:
            ``{split_key: {'pred': ..., 'true': ..., 'index': ...}}``.
        """
        split_type = 'train' if train else 'test'
        model_predictions = {}

        for d, split in splits.items():
            X, y = self.get_df_from_split(df, split, split_type=split_type)
            model_predictions[d] = {
                'pred': self.split_models[d].predict(X),
                'true': y,
                'index': X.index
            }

        return model_predictions

## Model Evaluator

In [12]:
class Evaluator:
    """Fits a ``Model`` on rolling splits of a dataset and scores it.

    Args:
        dataset: Frame with ``date_value``, feature and target columns.
        min_date, max_date, max_lookback: Forwarded to ``DataSplitter``.
        target: Target name, used only in the printed report.
        metrics_dict: ``{name: callable(y_true, y_pred)}``. ``print`` expects
            the names ``'r2 score'``, ``'MSE'``, ``'MAE'`` and ``'Accuracy'``.
    """

    def __init__(
        self,
        dataset,
        min_date,
        max_date,
        target,
        metrics_dict,
        max_lookback=None
    ):
        self.df_splitter = DataSplitter(dataset, min_date, max_date, max_lookback=max_lookback)
        self.metrics_dict = metrics_dict
        self.dataset = dataset
        self.target = target

        self.splits = self.df_splitter.get_train_test_split()

    def fit_predict(self, model):
        """Fit ``model`` on every split; return ``(train_preds, test_preds)``."""
        model.fit_on_splits(self.dataset, self.splits)
        train_preds = model.predict_on_splits(self.dataset, self.splits, train=True)
        test_preds = model.predict_on_splits(self.dataset, self.splits, train=False)
        return train_preds, test_preds

    def evaluate_metrics(self, y_true, y_pred):
        """Apply every configured metric; return ``{metric_name: value}``."""
        return {name: metric(y_true, y_pred) for name, metric in self.metrics_dict.items()}

    def merge_evaluations(self, evaluations):
        """Turn per-split metric dicts into ``{metric_name: [value, ...]}``.

        The ``'aggregated'`` entry is carried over unchanged.
        """
        merged_evaluations = {}
        for d, evaluation in evaluations.items():
            if d in ['aggregated', 'aggregated_train']:
                continue
            for k, v in evaluation.items():
                merged_evaluations.setdefault(k, []).append(v)

        merged_evaluations['aggregated'] = evaluations['aggregated']

        return merged_evaluations

    @staticmethod
    def _to_jsonable(obj):
        """Recursively convert ``obj`` into something ``json.dump`` accepts.

        Dict keys become strings (per-split keys are dates when
        ``daily=True``) and numpy scalars/arrays become plain Python values.
        """
        if isinstance(obj, dict):
            return {str(k): Evaluator._to_jsonable(v) for k, v in obj.items()}
        if isinstance(obj, (list, tuple)):
            return [Evaluator._to_jsonable(v) for v in obj]
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        if isinstance(obj, np.generic):
            return obj.item()
        return obj

    def save(self, model, evaluations, model_name):
        """Write ``<model_name>_evaluation.json`` and ``<model_name>_model.pkl``.

        When ``model_name`` is None the wrapped estimator's class name is used
        (or ``'model'``) instead of producing files literally named ``None_*``.
        """
        if model_name is None:
            model_name = getattr(getattr(model, 'model_class', None), '__name__', 'model')

        with open(f'{model_name}_evaluation.json', 'w') as f:
            json.dump(self._to_jsonable(evaluations), f)

        with open(f'{model_name}_model.pkl', 'wb') as f:
            pkl.dump(model, f)

    def print(self, train_evaluations, test_evaluations):
        """Print the aggregated train and test metrics."""
        print(f"""Train PM {self.target}: R2 Score {train_evaluations['aggregated']['r2 score']}, 
               RMSE {np.sqrt(train_evaluations['aggregated']['MSE'])}
               MAE  {train_evaluations['aggregated']['MAE']}
               Acc  {train_evaluations['aggregated']['Accuracy']}
        """)
        print(f"""Test PM {self.target}: R2 Score {test_evaluations['aggregated']['r2 score']}, 
              RMSE {np.sqrt(test_evaluations['aggregated']['MSE'])}
              MAE  {test_evaluations['aggregated']['MAE']}
              Acc  {test_evaluations['aggregated']['Accuracy']}
        """)

    def _score_predictions(self, preds):
        """Score each split, plus all splits pooled under ``'aggregated'``."""
        metrics = {}
        pooled_true, pooled_pred = [], []
        for d, v in preds.items():
            metrics[d] = self.evaluate_metrics(v['true'], v['pred'])
            pooled_true.append(v['true'])
            pooled_pred.append(v['pred'])

        metrics['aggregated'] = self.evaluate_metrics(
            np.concatenate(pooled_true), np.concatenate(pooled_pred)
        )
        return metrics

    def evaluate(self, model, daily=False, save=True, model_name=None, verbose=True):
        """Fit, predict and score ``model``.

        Args:
            model: ``Model`` wrapper to train on this evaluator's splits.
            daily: Keep one metrics dict per split when True; otherwise merge
                them into per-metric lists.
            save: Persist metrics and the fitted wrapper via ``save``.
            model_name: File-name prefix used when saving.
            verbose: Print the aggregated metrics.

        Returns:
            ``(train_metrics, test_metrics)``.

        Raises:
            ValueError: if the date range produced no splits.
        """
        if not self.splits:
            raise ValueError("no evaluation splits were generated for the given date range")

        train_preds, test_preds = self.fit_predict(model)

        train_metrics = self._score_predictions(train_preds)
        test_metrics = self._score_predictions(test_preds)

        if not daily:
            train_metrics, test_metrics = self.merge_evaluations(train_metrics), self.merge_evaluations(test_metrics)

        if save:
            self.save(model, {'train':train_metrics,'test':test_metrics}, model_name)

        if verbose:
            self.print(train_metrics, test_metrics)

        return train_metrics, test_metrics

## Additional Processing

In [13]:
from scipy.spatial import cKDTree

class IDW:
    """Inverse-distance-weighted k-nearest-neighbour regressor.

    Args:
        leafsize: Leaf size passed to ``cKDTree``.
        power: Exponent applied to the distances (weight = 1 / d**power).
        k: Number of neighbours to average over.
    """

    def __init__(self, leafsize, power = 3, k = 10):
        self.leafsize = leafsize
        self.power = power
        self.k = k

    def fit(self, X, y):
        """Index the training points.

        Args:
            X: Training features (DataFrame or 2-D array).
            y: Training targets (array-like, one value per row of ``X``).

        Returns:
            ``self``.
        """
        self.X = X
        # Positional array: a pandas Series cannot be fancy-indexed with the
        # 2-D neighbour index array used in predict().
        self.y = np.asarray(y)

        self.tree = cKDTree(np.asarray(X), leafsize=self.leafsize)
        return self

    def predict(self, test_df):
        """Return the weighted neighbour average for each row of ``test_df``."""
        X_test = np.asarray(test_df)

        # Asking for more neighbours than there are points makes cKDTree pad
        # the result with an out-of-range index, so cap k.
        k = min(self.k, self.tree.n)

        distances, indices = self.tree.query(X_test, k=k, workers=-1)
        # query() drops the neighbour axis when k == 1; restore (n_queries, k).
        distances = np.asarray(distances).reshape(X_test.shape[0], k)
        indices = np.asarray(indices).reshape(X_test.shape[0], k)

        # Avoid division by zero for exact matches.
        distances = np.maximum(distances, 1e-10)

        weights = 1 / (distances ** self.power)
        weights /= np.sum(weights, axis=1, keepdims=True)

        interpolated_values = np.sum(weights * self.y[indices], axis=1)

        return interpolated_values

## Lagged Features

In [14]:
def add_lag_features(df, lags = (1,)):
    """Add lagged pm2_5 / pm10 columns and fill missing values.

    For every ``l`` in ``lags`` the columns ``pm2_5_lag_{l}`` and
    ``pm10_lag_{l}`` are added, holding the value ``l`` rows earlier within
    the same ``(timeOfDay, lat, lon)`` group once rows are ordered by date.
    This is an ``l``-day lag only if each group has one row per day
    (NOTE(review): confirm for this dataset).

    Afterwards every column except ``date_value``, ``lat`` and ``lon`` is cast
    to float and its missing values are replaced by the mean of the earlier
    rows at the same ``(lat, lon)``, falling back to the column mean over the
    whole frame. NOTE(review): that fallback also uses rows from later dates.

    The new column names are appended to ``CFG.features``.

    Args:
        df: Frame with ``date_value``, ``timeOfDay``, ``lat``, ``lon``,
            ``pm2_5`` and ``pm10`` columns; it is not modified.
        lags: Row offsets to generate.

    Returns:
        A new frame sorted by ``date_value, timeOfDay, lat, lon`` with a fresh
        0..n-1 index.
    """
    location_keys = ["lat", "lon"]

    # Rows without coordinates belong to no location group; the previous
    # groupby-apply implementation dropped them implicitly.
    df = df.dropna(subset=location_keys)

    # Order by location and date *before* computing any lag so that even the
    # first lag is taken over chronologically ordered rows.
    df = df.sort_values(by=["lat", "lon", "date_value"])

    added_features = []
    for l in lags:
        for pollutant in ('pm2_5', 'pm10'):
            lag_column = f'{pollutant}_lag_{l}'
            df[lag_column] = df.groupby(['timeOfDay', 'lat', 'lon'])[pollutant].shift(l)
            added_features.append(lag_column)

    # Column-wise fill. This replaces DataFrameGroupBy.apply over a function
    # that touched the grouping columns, which pandas has deprecated.
    untouched = {"date_value", "lat", "lon"}
    for col in [c for c in df.columns if c not in untouched]:
        values = df[col].astype(float)  # Ensure numeric columns
        overall_mean = values.mean(skipna=True)
        # Mean of the strictly earlier rows at the same location.
        previous_mean = values.groupby([df["lat"], df["lon"]]).transform(
            lambda s: s.expanding().mean().shift()
        )
        df[col] = values.fillna(previous_mean).fillna(overall_mean)

    df = df.sort_values(by = ['date_value', 'timeOfDay', 'lat', 'lon'])
    df = df.reset_index(drop=True)

    # Rebind instead of ``+=``: an in-place extend would also modify any list
    # aliased to CFG.features, and re-running the cell would add duplicates.
    CFG.features = list(CFG.features) + [f for f in added_features if f not in CFG.features]

    return df


In [15]:
# Adds lag-1/2/3/7 columns for both pollutants and registers them in CFG.features.
# `df` is rebound here, so re-running this cell operates on the already-augmented frame.
df = add_lag_features(df, lags = [1, 2, 3, 7])

  df = grouped.apply(fill_na_with_previous_mean)


In [16]:
# IDW Interpolation

def idw_interpolation(df, idw, lags, coord_scale=50):
    """Add IDW-based lag features for the target of ``idw``.

    For each ``lag`` a ``DataSplitter`` is built starting one day after
    ``min_train_date`` with ``max_lookback=lag+1`` and ``min_lookback=lag-1``.
    The wrapped model is fitted per split and its test predictions are stored
    in a new column ``idw_lag_{lag}_{target}``. Rows without a prediction get
    the expanding mean of the column, and whatever is still missing is
    back-filled (``DataFrame.bfill`` over the whole frame).

    The new column names are appended to ``CFG.features``.

    Args:
        df: Engineered frame; it is not modified.
        idw: ``Model`` wrapper whose ``target`` names the column to predict.
        lags: Lags to generate features for.
        coord_scale: Factor applied to ``lat``/``lon`` while fitting so that
            location weighs more in the neighbour search.

    Returns:
        A copy of ``df`` with the new columns and the original coordinates.
    """
    df = df.copy()
    # Keep the exact coordinates: multiplying and dividing back is not
    # guaranteed to round-trip in floating point.
    original_lat = df['lat'].copy()
    original_lon = df['lon'].copy()
    df['lat'] = df['lat'] * coord_scale
    df['lon'] = df['lon'] * coord_scale

    target = idw.target
    added_features = []

    for lag in lags:
        column = f'idw_lag_{lag}_{target}'

        df_splitter = DataSplitter(
            df.copy(), min_train_date, min_train_date+timedelta(days=1),
            max_lookback=lag+1, min_lookback=lag-1
        )
        splits = df_splitter.get_train_test_split()

        idw.fit_on_splits(df, splits)
        preds = idw.predict_on_splits(df, splits)

        new_df = pd.DataFrame(
            {column: np.concatenate([v['pred'] for v in preds.values()])},
            index=np.concatenate([v['index'] for v in preds.values()]),
        )

        # On a re-run the column already exists; drop it so the merge does not
        # produce suffixed duplicates.
        df = df.drop(columns=[column], errors='ignore')
        df = df.merge(new_df, left_index=True, right_index=True, how='outer')
        df[column] = df[column].fillna(df[column].expanding().mean())

        df = df.bfill()

        added_features.append(column)

    # Restore exact coordinates where they were known; anything bfill() filled
    # in is scaled back instead.
    df['lat'] = original_lat.reindex(df.index).fillna(df['lat'] / coord_scale)
    df['lon'] = original_lon.reindex(df.index).fillna(df['lon'] / coord_scale)

    CFG.features = CFG.features + [f for f in added_features if f not in CFG.features]

    return df

In [17]:
# pm2_5: one IDW lag feature per lag, added one call at a time.

idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3}
)

for lag in (1, 2, 3, 7):
    df = idw_interpolation(df, idw_model, lags = [lag])

# print(np.sqrt(mean_squared_error(df['test_lag_1'], df['idw_lag_1_pm2_5'])))
# print(np.sqrt(mean_squared_error(df['test_lag_2'], df['idw_lag_2_pm2_5'])))
# print(np.sqrt(mean_squared_error(df['test_lag_3'], df['idw_lag_3_pm2_5'])))

In [18]:
# pm10: one IDW lag feature per lag, added one call at a time.

idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='pm10'
)

for lag in (1, 2, 3, 7):
    df = idw_interpolation(df, idw_model, lags = [lag])

## Evaluation

In [19]:
def filter_sparse_data(df, count):
    """Keep only rows whose (lat, lon) has at least ``count`` pm2_5 readings.

    Args:
        df: Frame with ``lat``, ``lon`` and ``pm2_5`` columns; not modified.
        count: Minimum number of non-null ``pm2_5`` values per location.

    Returns:
        A new frame (fresh 0..n-1 index) restricted to the qualifying locations.
    """
    keys = ['lat', 'lon']
    readings_per_location = df.groupby(by=keys)['pm2_5'].count()
    dense_locations = readings_per_location[readings_per_location >= count].reset_index()[keys]
    return pd.merge(df.copy(), dense_locations, on = keys, how = 'inner')

# Despite the name, this keeps the locations with at least 150 pm2_5 readings.
df_sparse = filter_sparse_data(df, 150)

In [20]:
# Regression metrics shared by both pollutants; 'Accuracy' is pollutant-specific.
_regression_metrics = {
    'MSE': mean_squared_error,
    'r2 score': r2_score,
    'MAE': mean_absolute_error,
}

pm25_metrics_dict = {**_regression_metrics, 'Accuracy': AQIClassifier('pm2_5')}
pm10_metrics_dict = {**_regression_metrics, 'Accuracy': AQIClassifier('pm10')}

# NOTE(review): the un-suffixed evaluators use different datasets (pm2_5 -> df,
# pm10 -> df_sparse) and the "2" evaluators use the opposite pairing — confirm
# this is intended.
pm25_evaluator = Evaluator(
    df, min_train_date, max_train_date, 'pm2_5', pm25_metrics_dict, max_lookback=None
)
pm10_evaluator = Evaluator(
    df_sparse, min_train_date, max_train_date, 'pm10', pm10_metrics_dict, max_lookback=None
)
pm25_evaluator2 = Evaluator(
    df_sparse, min_train_date, max_train_date, 'pm2_5', pm25_metrics_dict, max_lookback=None
)
pm10_evaluator2 = Evaluator(
    df, min_train_date, max_train_date, 'pm10', pm10_metrics_dict, max_lookback=None
)

In [21]:
# Persist the engineered frame. The index is written as an unnamed first column.
df.to_csv('engineered_df.csv')

In [22]:
# Preview the engineered frame, including the lag and IDW columns.
df.head()

Unnamed: 0,date_value,timeOfDay,lat,lon,pm2_5,pm10,day_of_week,distance,bus_count,pm2_5_lag_1,...,pm2_5_lag_7,pm10_lag_7,idw_lag_1_pm2_5,idw_lag_2_pm2_5,idw_lag_3_pm2_5,idw_lag_7_pm2_5,idw_lag_1_pm10,idw_lag_2_pm10,idw_lag_3_pm10,idw_lag_7_pm10
0,2020-11-01,0.0,0.269231,0.809524,481.37,522.53,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
1,2020-11-01,0.0,0.307692,0.761905,471.18,513.5,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
2,2020-11-01,0.0,0.346154,0.714286,462.44,503.81,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
3,2020-11-01,0.0,0.346154,0.761905,468.14,507.55,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
4,2020-11-01,0.0,0.384615,0.714286,462.68,505.21,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935


In [23]:
# Feature columns that models built from here on use by default.
CFG.features

['timeOfDay',
 'lat',
 'lon',
 'day_of_week',
 'distance',
 'bus_count',
 'pm2_5_lag_1',
 'pm10_lag_1',
 'pm2_5_lag_2',
 'pm10_lag_2',
 'pm2_5_lag_3',
 'pm10_lag_3',
 'pm2_5_lag_7',
 'pm10_lag_7',
 'idw_lag_1_pm2_5',
 'idw_lag_2_pm2_5',
 'idw_lag_3_pm2_5',
 'idw_lag_7_pm2_5',
 'idw_lag_1_pm10',
 'idw_lag_2_pm10',
 'idw_lag_3_pm10',
 'idw_lag_7_pm10']

In [24]:
model_list = {
    "XGB" : [XGBRegressor, {}],
    "Ridge" : [Ridge, {}],
    "IDW" : [IDW, {'leafsize': 50, 'k': 20, 'power': 0.25}],
    "LightGBM" : [LGBMRegressor, {'verbose':0}],
    "CatBoostRegressor": [CatBoostRegressor, {'verbose' : 0}],
    # "RandomForest" : [RandomForestRegressor, {'n_estimators' : 50}],
}

# Evaluators in the order they are run; the key becomes part of the saved file
# names so the four runs of one model no longer overwrite each other's
# `<name>_evaluation.json` / `<name>_model.pkl`.
evaluators = {
    'pm25_evaluator': pm25_evaluator,
    'pm10_evaluator': pm10_evaluator,
    'pm25_evaluator2': pm25_evaluator2,
    'pm10_evaluator2': pm10_evaluator2,
}

for model_name, (model, params) in model_list.items():
    print(f"Running Model {model_name}")

    for evaluator_name, evaluator in evaluators.items():
        # Train on the same target the evaluator's metrics were built for.
        model_wrapper = Model(model, params, target=evaluator.target)
        evaluator.evaluate(
            model_wrapper,
            daily=False,
            save=True,
            model_name=f"{model_name}_{evaluator_name}",
            verbose=True,
        )

    print("-------------------------------------------------------------------------------")

Running Model XGB
Train PM pm2_5: R2 Score 0.7859136433820467, 
               RMSE 48.02828176849066
               MAE  34.51806205645105
               Acc  0.6930574009895927
        
Test PM pm2_5: R2 Score 0.05303557564343031, 
              RMSE 98.11367177220335
              MAE  74.64977040809585
              Acc  0.5080732568157186
        
Train PM pm10: R2 Score 0.7867392181921339, 
               RMSE 52.008693122275695
               MAE  37.35776788663118
               Acc  0.7213188333025009
        
Test PM pm10: R2 Score 0.06647111930742344, 
              RMSE 106.18441564406139
              MAE  80.73961916176759
              Acc  0.5278156221616712
        
Train PM pm2_5: R2 Score 0.7881689111276394, 
               RMSE 47.84520881988315
               MAE  34.42976916585778
               Acc  0.6938018094206154
        
Test PM pm2_5: R2 Score 0.08101477313879735, 
              RMSE 97.0471048143982
              MAE  73.54987564545453
              Acc  