In [1]:
import numpy as np # linear algebrae
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression

import os

# from polire import IDW

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

from datetime import date, timedelta
import datetime

import json
import pickle as pkl
import itertools

Config

In [2]:
class CFG:
    evaluation_time_gap = 1
    convert_numpy = False
    target_list = ['pm2_5', 'pm10']
    features = None

## Basic Loading

In [3]:
df = pd.read_csv('/kaggle/input/airdelhi-tabularengineering/tabular_data.csv')

df = df.drop(columns = 'Unnamed: 0')
df['date_value'] = pd.to_datetime(df['date_value'])

dates = pd.to_datetime([df['date_value'].min(), df['date_value'].max()])

max_train_date = dates.min() + (dates.max() - dates.min()) * 0.75
max_train_date = max_train_date.floor("D")
min_train_date = df['date_value'].min()
max_date = df['date_value'].max().floor("D")

metrics_dict = {
    'MSE': mean_squared_error, 
    'r2 score': r2_score, 
    'MAE': mean_absolute_error,
}

target_list = CFG.target_list

features = ['date_value', 'timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']

CFG.base_features = ['timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']
CFG.features = CFG.base_features

max_train_date, min_train_date

(Timestamp('2021-01-07 00:00:00'), Timestamp('2020-11-01 00:00:00'))

In [4]:
scaler = MinMaxScaler()
df[['timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']] = scaler.fit_transform(
    df[['timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']]
)


## Basic Helper Functions

In [5]:
def clean_df(df, features=None):
    if features is None:
        return df[CFG.features]
    else:
        return df[features]

## Get Model Splits

In [6]:
class DataSplitter:

    def __init__(
        self, 
        dataset, 
        min_date, 
        max_date, 
        max_lookback=None,
        min_lookback=None,
    ):
        # self.X = dataset[CFG.features if features is None else features]
        
        # self.target = target
        # self.y = dataset[target]
        
        self.df = dataset
        
        self.min_train_date = min_date
        self.max_train_date = max_date
        self.df_max_date = self.df['date_value'].max()

        self.max_lookback = max_lookback
        self.min_lookback = min_lookback
    
    def get_train_test_split(self):
        df = self.df.copy()
        df = df[df['date_value'] >= self.min_train_date]
        d = self.max_train_date + timedelta(days = 1)

        splits = {}
        
        while d < self.df_max_date:
            train_temp = df[df['date_value'] <= d]
            if self.max_lookback is not None:
                train_temp = train_temp[train_temp['date_value'] > d - timedelta(
                    days = self.max_lookback)]

            if self.min_lookback is not None:
                train_temp1 = train_temp[train_temp['date_value'] <= d - timedelta(
                    days = self.min_lookback)]
                if len(train_temp1) > 200:
                    train_temp = train_temp1
            
            train_indices = train_temp.index
            test_indices = df[df['date_value'] == d + timedelta(days=1)].index

            splits[d] = {
                'train': train_indices,
                'test' : test_indices
            }
            
            d = d + timedelta(days = CFG.evaluation_time_gap)

        return splits

In [7]:
data_splitter = DataSplitter(df, min_train_date, max_train_date)
data_indices = data_splitter.get_train_test_split()

# Testing:
test_date = list(data_indices.keys())[0]
train_indices, test_indices = list(data_indices[test_date].values())
# test_date, list(data_indices.keys())
train_df_1 = df.iloc[train_indices]
print(test_date, sorted(train_df_1.date_value.unique())[-1])

test_df_1 = df.iloc[test_indices]
print(test_date, sorted(test_df_1.date_value.unique()))

2021-01-08 00:00:00 2021-01-08 00:00:00
2021-01-08 00:00:00 [Timestamp('2021-01-09 00:00:00')]


In [8]:
data_splitter = DataSplitter(df, min_train_date, max_train_date, max_lookback=3)
data_indices = data_splitter.get_train_test_split()

# Testing:
test_date = list(data_indices.keys())[0]
train_indices, test_indices = list(data_indices[test_date].values())
# test_date, list(data_indices.keys())
train_df_1 = df.iloc[train_indices]
print(test_date, sorted(train_df_1.date_value.unique()))

test_df_1 = df.iloc[test_indices]
print(test_date, sorted(test_df_1.date_value.unique()))

2021-01-08 00:00:00 [Timestamp('2021-01-06 00:00:00'), Timestamp('2021-01-07 00:00:00'), Timestamp('2021-01-08 00:00:00')]
2021-01-08 00:00:00 [Timestamp('2021-01-09 00:00:00')]


In [9]:
data_splitter = DataSplitter(df, min_train_date, min_train_date, max_lookback=2)
data_indices = data_splitter.get_train_test_split()

# Testing:
test_date = list(data_indices.keys())[0]
train_indices, test_indices = list(data_indices[test_date].values())
# test_date, list(data_indices.keys())
train_df_1 = df.iloc[train_indices]
print(test_date, sorted(train_df_1.date_value.unique()))

test_df_1 = df.iloc[test_indices]
print(test_date, sorted(test_df_1.date_value.unique()))

print(min_train_date, df.date_value.min())

2020-11-02 00:00:00 [Timestamp('2020-11-01 00:00:00'), Timestamp('2020-11-02 00:00:00')]
2020-11-02 00:00:00 [Timestamp('2020-11-03 00:00:00')]
2020-11-01 00:00:00 2020-11-01 00:00:00


## Model Holder

In [10]:
class Model:

    def __init__(
        self,
        model_class,
        params=None,
        target="pm2_5",
        features=None,
    ):
        self.model_class = model_class
        
        if params is None:
            self.params = {}
        else:
            self.params = params

        self.max_train_date = None
        self.target = target
        if features is None:
            features = CFG.features

        self.features = features

    def fit(self, X, y):
        model = self.model_class(**self.params)
        # print(X.columns, CFG.features)
        model.fit(X, y)
        return model

    def predict(self, X):
        model = self.split_models[self.max_train_date]
        return model.predict(X)

    def get_df_from_split(self, df, split, split_type='train'):
        X = df.iloc[split[split_type]].copy()
        y = np.array(X[self.target])
        X = clean_df(X, features=self.features)
        return X, y

    def fit_on_splits(self, df, splits):
        self.split_models = {}
        
        for d, split in splits.items():
            X, y = self.get_df_from_split(df, split)
            self.split_models[d] = self.fit(X, y)

    def predict_on_splits(self, df, splits, train=False):
        model_predictions = {}
        if train:
            train = 'train'
        else:
            train= 'test'
        
        for d, split in splits.items():
            X, y = self.get_df_from_split(df, split, split_type=train)
            model_predictions[d] = {
                'pred': self.split_models[d].predict(X),
                'true': y,
                'index': X.index
            }
        
        return model_predictions

## Model Evaluator

In [11]:
class Evaluator:

    def __init__(
        self,
        dataset,
        min_date,
        max_date,
        target,
        metrics_dict,
        max_lookback=None
    ):
        self.df_splitter = DataSplitter(dataset, min_date, max_date, max_lookback=max_lookback)
        self.metrics_dict = metrics_dict
        self.dataset = dataset
        self.target = target

        self.splits = self.df_splitter.get_train_test_split()

    def fit_predict(self, model):
        model.fit_on_splits(self.dataset, self.splits)
        train_preds = model.predict_on_splits(self.dataset, self.splits, train=True)
        test_preds = model.predict_on_splits(self.dataset, self.splits, train=False)
        return train_preds, test_preds

    def evaluate_metrics(self, y_true, y_pred):
        values = {}
        for d, metric in metrics_dict.items():
            values[d] = metric(y_true, y_pred)
        return values

    def merge_evaluations(self, evaluations):
        merged_evaluations = {}
        for d, evaluation in evaluations.items():
            if d in ['aggregated', 'aggregated_train']:
                continue
            if len(merged_evaluations) == 0:
                merged_evaluations = {
                    k : [v]
                    for k, v in evaluation.items()
                }
            else:
                for k, v in evaluation.items():
                    merged_evaluations[k].append(v)

        merged_evaluations['aggregated'] = evaluations['aggregated']
        # merged_evaluations['aggregated_train'] = evaluations['aggregated_train']
        
        return merged_evaluations

    def save(self, model, evaluations, model_name):
        with open(f'{model_name}_evaluation.json', 'w') as f:
            json.dump(evaluations, f)

        with open(f'{model_name}_model.pkl', 'wb') as f:
            pkl.dump(model, f)

    def print(self, train_evaluations, test_evaluations):
        print(f"""Train PM {self.target}: R2 Score {train_evaluations['aggregated']['r2 score']}, 
               RMSE {np.sqrt(train_evaluations['aggregated']['MSE'])}
        """)
        print(f"""Test PM {self.target}: R2 Score {test_evaluations['aggregated']['r2 score']}, 
              RMSE {np.sqrt(test_evaluations['aggregated']['MSE'])}
        """)
        

    def evaluate(self, model, daily=False, save=True, model_name=None, verbose=True):
        train_preds, test_preds = self.fit_predict(model)

        train_metrics = {}
        train_pred_full = []
        train_full = []
        for d, v in train_preds.items():
            train_metrics[d] = self.evaluate_metrics(v['true'], v['pred'])
            train_pred_full.append(v['pred'])
            train_full.append(v['true'])

        train_pred_full = np.concatenate(train_pred_full)
        train_full = np.concatenate(train_full)
        

        test_metrics = {}
        test_pred_full = []
        test_full = []
        index_full = []
        for d, v in test_preds.items():
            test_metrics[d] = self.evaluate_metrics(v['true'], v['pred'])
            test_pred_full.append(v['pred'])
            test_full.append(v['true'])
            index_full.append(v['index'])
        
        test_pred_full = np.concatenate(test_pred_full)
        test_full = np.concatenate(test_full)
        index_full = np.concatenate(index_full)

        train_metrics['aggregated'] = self.evaluate_metrics(train_full, train_pred_full)
        test_metrics['aggregated'] = self.evaluate_metrics(test_full, test_pred_full)
        
        if not daily:
            train_metrics, test_metrics = self.merge_evaluations(train_metrics), self.merge_evaluations(test_metrics)

        if save:
            self.save(model, {'train':train_metrics,'test':test_metrics}, model_name)

        if verbose:
            self.print(train_metrics, test_metrics)
        
        return train_metrics, test_metrics

## Additional Processing

In [12]:
from scipy.spatial import cKDTree

class IDW:

    def __init__(self, leafsize, power = 3, k = 10):
        self.leafsize = leafsize
        self.power = power
        self.k = k

    def fit(self, X, y):
        self.X = X
        # self.y = y.values
        self.y = y

        self.tree = cKDTree(self.X.values, leafsize=self.leafsize)

    def predict(self, test_df):
        X_test = test_df.values
        
        distances, indices = self.tree.query(X_test, k=self.k, workers=-1)
        distances = np.maximum(distances, 1e-10)
        
        weights = 1 / (distances ** self.power)
        weights /= np.sum(weights, axis=1, keepdims=True)
        
        interpolated_values = np.sum(weights * self.y[indices], axis=1)
        
        return interpolated_values

## Lagged Features

In [13]:
def add_lag_features(df, lags = [1]):

    df = df.copy()

    added_features = []
    for l in lags:
        df[f'pm2_5_lag_{l}'] = df.groupby(
            ['timeOfDay', 'lat', 'lon'])['pm2_5'].shift(l)
        df[f'pm10_lag_{l}'] = df.groupby(
            ['timeOfDay', 'lat', 'lon'])['pm10'].shift(l)

        added_features.append(f'pm2_5_lag_{l}')
        added_features.append(f'pm10_lag_{l}')

        # .reset_index(drop=False)
        
        # shifted_pm25 = df.groupby(['timeOfDay', 'lat', 'lon'])['pm2_5'].shift(1).reset_index(drop=False)
        # shifted_pm10 = df.groupby(['timeOfDay', 'lat', 'lon'])['pm10'].shift(1).reset_index(drop=False)

        # shifted_pm25 = shifted_pm25.rename(columns = {'pm2_5' : f'pm2_5_lag_{l}'})
        # shifted_pm10 = shifted_pm10.rename(columns = {'pm10' : f'pm10_lag_{l}'})

        # df = pd.merge(df, shifted_pm25, how = 'outer', on =  ['timeOfDay', 'lat', 'lon'])
        # df = pd.merge(df, shifted_pm10, how = 'outer', on =  ['timeOfDay', 'lat', 'lon'])


        df.sort_values(by=["lat", "lon", "date_value"], inplace=True)

    # Group by latitude and longitude
    grouped = df.groupby(["lat", "lon"])

    # Function to fill NaN values based on previous mean
    def fill_na_with_previous_mean(group):
        for col in group.columns:
            if col not in ["date_value", "lat", "lon"]:
                group[col] = group[col].astype(float)  # Ensure numeric columns
                group[col] = group[col].fillna(group[col].expanding().mean().shift())  # Previous days' mean
                
                # If still NaN (first row), replace with overall mean
                overall_mean = df[col].mean(skipna=True)
                group[col] = group[col].fillna(overall_mean)
        return group

    # Apply the function to each group
    df = grouped.apply(fill_na_with_previous_mean)

    df.reset_index(drop=True, inplace=True)

    df = df.sort_values(by = ['date_value', 'timeOfDay', 'lat', 'lon'])

    df.reset_index(drop=True, inplace=True)

    CFG.features += added_features
    
    return df


In [14]:
df = add_lag_features(df, lags = [1, 2, 3, 7])

  df = grouped.apply(fill_na_with_previous_mean)


In [15]:
# idw_model = Model(
#     IDW,
#     params={'leafsize': 25, 'power': 0.5, 'k': 3}
# )

# df_splitter = DataSplitter(
#             df, min_train_date, min_train_date, 
#             max_lookback=2, min_lookback=None
#         )
# splits = df_splitter.get_train_test_split()

# idw_model.fit_on_splits(df, splits)
# preds = idw_model.predict_on_splits(df, splits)

In [16]:
# IDW Interpolation

def idw_interpolation(df, idw, lags):
    df = df.copy()
    df['lat'] = df['lat'] * 50
    df['lon'] = df['lon'] * 50
    
    target = idw.target
    added_features = []
    
    for lag in lags:
        df_splitter = DataSplitter(
            df.copy(), min_train_date, min_train_date+timedelta(days=1), 
            max_lookback=lag+1, min_lookback=lag-1
        )
        splits = df_splitter.get_train_test_split()
    
        idw.fit_on_splits(df, splits)
        preds = idw.predict_on_splits(df, splits)

        test_pred_full = []
        test_full = []
        index_full = []
        
        for d, v in preds.items():
            test_pred_full.append(v['pred'])
            test_full.append(v['true'])
            index_full.append(v['index'])
        
        test_pred_full = np.concatenate(test_pred_full)
        test_full = np.concatenate(test_full)
        index_full = np.concatenate(index_full)

        new_df = pd.DataFrame({
            # f'test_lag_{lag}': test_full,
            f'idw_lag_{lag}_{target}': test_pred_full,
        }, index = index_full)
        
        df = df.merge(new_df, left_index=True, right_index=True, how='outer')
        # df[f'test_lag_{lag}'] = df[f'test_lag_{lag}'].fillna(df[f'test_lag_{lag}'].expanding().mean())
        df[f'idw_lag_{lag}_{target}'] = df[f'idw_lag_{lag}_{target}'].fillna(df[f'idw_lag_{lag}_{target}'].expanding().mean())
        
        df = df.bfill()

        added_features.append(f'idw_lag_{lag}_{target}')

    df['lat'] = df['lat'] / 50
    df['lon'] = df['lon'] / 50

    CFG.features = CFG.features + added_features
    
    return df

In [17]:
# pm2_5

idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3}
)

# idw_model = Model(
#     RandomForestRegressor,
# )

df = idw_interpolation(df, idw_model, lags = [1])
df = idw_interpolation(df, idw_model, lags = [2])
df = idw_interpolation(df, idw_model, lags = [3])
df = idw_interpolation(df, idw_model, lags = [7])

# print(np.sqrt(mean_squared_error(df['test_lag_1'], df['idw_lag_1_pm2_5'])))
# print(np.sqrt(mean_squared_error(df['test_lag_2'], df['idw_lag_2_pm2_5'])))
# print(np.sqrt(mean_squared_error(df['test_lag_3'], df['idw_lag_3_pm2_5'])))

In [18]:
# pm10

idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='pm10'
)

# idw_model = Model(
#     RandomForestRegressor,
#     target='pm10'
# )

df = idw_interpolation(df, idw_model, lags = [1])
df = idw_interpolation(df, idw_model, lags = [2])
df = idw_interpolation(df, idw_model, lags = [3])
df = idw_interpolation(df, idw_model, lags = [7])

## Feature Selection

## Evaluation

In [19]:
metrics_dict = {
    'MSE': mean_squared_error, 
    'r2 score': r2_score, 
    'MAE': mean_absolute_error,
}

pm25_evaluator = Evaluator(
    df,
    min_train_date,
    max_train_date,
    'pm2_5',
    metrics_dict,
    max_lookback=None
)

pm10_evaluator = Evaluator(
    df,
    min_train_date,
    max_train_date,
    'pm10',
    metrics_dict,
    max_lookback=None
)

In [20]:
model_list = {
    "XGB" : [XGBRegressor, {}],
    "LightGBM" : [LGBMRegressor, {'verbose':0}],
    "Ridge" : [Ridge, {}],
    # "RandomForest" : [RandomForestRegressor, {}],
    "CatBoostRegressor": [CatBoostRegressor, {'verbose' : 0}],
    # "IDW" : [IDW, {'leafsize': 50, 'k': 20, 'power': 0.25}]
}

In [21]:
for model_name, [model, params] in model_list.items():
    print(f"Running Model {model_name}")
    model_wrapper = Model(model, params, target='pm2_5')
    pm25_evaluator.evaluate(
        model_wrapper, daily=False, save=True, model_name=model_name, verbose=True
    )

    model_wrapper = Model(model, params, target='pm10')
    pm10_evaluator.evaluate(
        model_wrapper, daily=False, save=True, model_name=model_name, verbose=True
    )
    print("-------------------------------------------------------------------------------")

Running Model XGB
Train PM pm2_5: R2 Score 0.7859136433820467, 
               RMSE 48.02828176849066
        
Test PM pm2_5: R2 Score 0.05303557564343031, 
              RMSE 98.11367177220335
        
Train PM pm10: R2 Score 0.7850737879552527, 
               RMSE 52.13196593121911
        
Test PM pm10: R2 Score 0.11454106635125783, 
              RMSE 102.9935025786704
        
-------------------------------------------------------------------------------
Running Model LightGBM
Train PM pm2_5: R2 Score 0.6976477005298432, 
               RMSE 57.076744232220406
        
Test PM pm2_5: R2 Score 0.18978115457804012, 
              RMSE 90.75360119824882
        
Train PM pm10: R2 Score 0.6965043355375742, 
               RMSE 61.94919341632914
        
Test PM pm10: R2 Score 0.19059976650541566, 
              RMSE 98.47075741779098
        
-------------------------------------------------------------------------------
Running Model Ridge
Train PM pm2_5: R2 Score 0.288984404845021

In [22]:
df.to_csv('engineered_df.csv')

In [23]:
df.head()

Unnamed: 0,date_value,timeOfDay,lat,lon,pm2_5,pm10,day_of_week,distance,bus_count,pm2_5_lag_1,...,pm2_5_lag_7,pm10_lag_7,idw_lag_1_pm2_5,idw_lag_2_pm2_5,idw_lag_3_pm2_5,idw_lag_7_pm2_5,idw_lag_1_pm10,idw_lag_2_pm10,idw_lag_3_pm10,idw_lag_7_pm10
0,2020-11-01,0.0,0.269231,0.809524,481.37,522.53,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
1,2020-11-01,0.0,0.307692,0.761905,471.18,513.5,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
2,2020-11-01,0.0,0.346154,0.714286,462.44,503.81,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
3,2020-11-01,0.0,0.346154,0.761905,468.14,507.55,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
4,2020-11-01,0.0,0.384615,0.714286,462.68,505.21,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935


In [24]:
CFG.features

['timeOfDay',
 'lat',
 'lon',
 'day_of_week',
 'distance',
 'bus_count',
 'pm2_5_lag_1',
 'pm10_lag_1',
 'pm2_5_lag_2',
 'pm10_lag_2',
 'pm2_5_lag_3',
 'pm10_lag_3',
 'pm2_5_lag_7',
 'pm10_lag_7',
 'idw_lag_1_pm2_5',
 'idw_lag_2_pm2_5',
 'idw_lag_3_pm2_5',
 'idw_lag_7_pm2_5',
 'idw_lag_1_pm10',
 'idw_lag_2_pm10',
 'idw_lag_3_pm10',
 'idw_lag_7_pm10']

## Full Interpolation

In [25]:
lat_lon_pairs = df.groupby(by=['lat', 'lon'])

In [26]:
lat_lon_pairs['pm2_5'].count().reset_index()# [['lat', 'lon']]

Unnamed: 0,lat,lon,pm2_5
0,0.000000,0.904762,4
1,0.000000,0.952381,4
2,0.038462,0.904762,2618
3,0.038462,0.952381,1927
4,0.076923,0.809524,292
...,...,...,...
257,1.000000,0.285714,3
258,1.000000,0.333333,3
259,1.000000,0.380952,1
260,1.000000,0.523810,1


In [27]:
# lat_lon_pairs['pm2_5'].count().plot(kind='hist', bins=20)
# (lat_lon_pairs['pm2_5'].count() < 200).sum()
lat_lon_pairs['pm2_5'].count().describe()

count     262.000000
mean      470.095420
std       633.341018
min         1.000000
25%         3.250000
50%       175.500000
75%       762.000000
max      2618.000000
Name: pm2_5, dtype: float64

In [28]:
def filter_sparse_data(df, count):
    df = df.copy()
    lat_lon_pairs = df.groupby(by=['lat', 'lon'])['pm2_5'].count().reset_index()
    lat_lon_pairs = lat_lon_pairs[lat_lon_pairs['pm2_5'] >= count]

    df = pd.merge(df, lat_lon_pairs[['lat', 'lon']], on = ['lat', 'lon'], how = 'inner')
    return df, lat_lon_pairs[['lat', 'lon']]

def make_dataset(lat_lon_pairs, df):
    times_of_day = np.sort(df['timeOfDay'].unique())
    dates = df['date_value'].unique()
    
    lat_lon_pairs = np.array(lat_lon_pairs)
    lat_lon_list = list(map(tuple, lat_lon_pairs))
    
    # Generate all possible combinations
    combinations = list(itertools.product(lat_lon_list, times_of_day, dates))
    
    # Convert to DataFrame
    new_df = pd.DataFrame(combinations, columns=['lat_lon', 'timeOfDay', 'date_value'])
    
    # Split lat_lon tuple into separate columns
    new_df[['lat', 'lon']] = pd.DataFrame(new_df['lat_lon'].tolist(), index=new_df.index)
    
    # Drop the combined lat_lon column
    new_df.drop(columns=['lat_lon'], inplace=True)

    new_df = new_df.sort_values(by = ['date_value', 'timeOfDay']).reset_index(drop=True)
    
    return new_df

In [29]:
dense_df, lat_lon_pairs = filter_sparse_data(df, 150)
dense_df_empty = make_dataset(lat_lon_pairs, dense_df)

In [30]:
# dense_df_empty.head()
len(dense_df_empty)

423605

In [31]:
def idw_dense_interpolate(df, df_original, idw):
    df = df.copy()
    # df['lat'] = df['lat'] * 50
    # df['lon'] = df['lon'] * 50
    # df['timeOfDay'] = df['timeOfDay'] * 50

    df_original = df_original.copy()
    # df_original['lat'] = df_original['lat'] * 50
    # df_original['lon'] = df_original['lon'] * 50
    # df_original['timeOfDay'] = df['timeOfDay'] * 50

    target = idw.target

    df_splitter = DataSplitter(
        df_original.copy(), min_train_date, min_train_date+timedelta(days=1), 
        max_lookback=3, min_lookback=1
    )
    train_splits = df_splitter.get_train_test_split()
    
    test_df_splitter = DataSplitter(
        df.copy(), min_train_date, min_train_date+timedelta(days=1), 
        max_lookback=3, min_lookback=None
    )
    test_splits = df_splitter.get_train_test_split()

    df[target] = np.nan

    idw.fit_on_splits(df_original, train_splits)
    preds = idw.predict_on_splits(df, test_splits)

    test_pred_full = []
    test_full = []
    index_full = []
    for d, v in preds.items():
        test_pred_full.append(v['pred'])
        test_full.append(v['true'])
        index_full.append(v['index'])
    
    test_pred_full = np.concatenate(test_pred_full)
    test_full = np.concatenate(test_full)
    index_full = np.concatenate(index_full)

    new_df = pd.DataFrame({
        # f'test_lag_{lag}': test_full,
        f'filled_{target}': test_pred_full,
    }, index = index_full)

    print(df.shape, new_df.shape)
    
    df = pd.merge(df, new_df, left_index=True, right_index=True, how='outer')
    
    print(df.shape)
    
    # df[f'test_lag_{lag}'] = df[f'test_lag_{lag}'].fillna(df[f'test_lag_{lag}'].expanding().mean())
    df[f'filled_{target}'] = df[f'filled_{target}'].fillna(df[f'filled_{target}'].expanding().mean())

    df = df.ffill()
    df = df.bfill()
    
    df = df.drop(columns = [target])

    print(df.shape)

    df = pd.merge(df, df_original[['date_value', 'timeOfDay', 'lat', 'lon', target]], 
                  how = 'left', on = ['date_value', 'timeOfDay', 'lat', 'lon'])

    print(df.shape, df_original.shape)
    
    # return df, target
    df[f'filled_{target}'] = df.where(df[target].isna(), df[target], axis=1)[f'filled_{target}']
    df[f'missing_{target}'] = df[target].isna()

    df[target] = df[f'filled_{target}']
    df = df.drop(columns = [f'filled_{target}'])

    # df['lat'] = df['lat'] / 50
    # df['lon'] = df['lon'] / 50
    # df['timeOfDay'] = df['timeOfDay'] / 50

    df = df.sort_values(by=['lat', 'lon', 'date_value', 'timeOfDay'])

    # df[target] = df[f'filled_{target}']
    # df = df.drop(columns = [f'filled_{target}'])

    return df

In [32]:
df.head()

Unnamed: 0,date_value,timeOfDay,lat,lon,pm2_5,pm10,day_of_week,distance,bus_count,pm2_5_lag_1,...,pm2_5_lag_7,pm10_lag_7,idw_lag_1_pm2_5,idw_lag_2_pm2_5,idw_lag_3_pm2_5,idw_lag_7_pm2_5,idw_lag_1_pm10,idw_lag_2_pm10,idw_lag_3_pm10,idw_lag_7_pm10
0,2020-11-01,0.0,0.269231,0.809524,481.37,522.53,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
1,2020-11-01,0.0,0.307692,0.761905,471.18,513.5,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
2,2020-11-01,0.0,0.346154,0.714286,462.44,503.81,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
3,2020-11-01,0.0,0.346154,0.761905,468.14,507.55,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
4,2020-11-01,0.0,0.384615,0.714286,462.68,505.21,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935


In [33]:
traffic_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='distance',
    features = ['timeOfDay', 'lat', 'lon']
)

bus_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='bus_count',
    features = ['timeOfDay', 'lat', 'lon']
)

pm25_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='pm2_5',
    features = ['timeOfDay', 'lat', 'lon']
)

pm10_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='pm10',
    features = ['timeOfDay', 'lat', 'lon']
)

idw_models = [traffic_idw_model, bus_idw_model, pm25_idw_model, pm10_idw_model]

for idw_model in idw_models:
    dense_df_empty = idw_dense_interpolate(dense_df_empty, df, idw_model)

(423605, 5) (121036, 1)
(423605, 6)
(423605, 5)
(423605, 6) (123165, 25)
(423605, 7) (121036, 1)
(423605, 8)
(423605, 7)
(423605, 8) (123165, 25)
(423605, 9) (121036, 1)
(423605, 10)
(423605, 9)
(423605, 10) (123165, 25)
(423605, 11) (121036, 1)
(423605, 12)
(423605, 11)
(423605, 12) (123165, 25)


In [34]:
dense_df = dense_df_empty
len(dense_df)

423605

In [35]:
dense_df['missing'] = dense_df['missing_pm10']
dense_df.drop(columns = ['missing_distance', 'missing_bus_count', 'missing_pm10', 'missing_pm2_5'], 
              inplace=True)

In [36]:
dense_df['missing'].mean()

0.713374488025401

In [37]:
dense_df.to_csv('dense_df.csv')

In [38]:
# df['day_of_week'] = pd.to_datetime(df[date_column]).dt.day_name()

In [39]:
dense_df.head()

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,timeOfDay,date_value,lat,lon,distance,bus_count,pm2_5,pm10,missing
0,0.0,2020-11-01,0.038462,0.904762,0.000447,0.0,134.703256,149.744402,True
133,0.029412,2020-11-01,0.038462,0.904762,0.000447,0.039709,224.655876,249.598959,True
266,0.058824,2020-11-01,0.038462,0.904762,0.000447,0.042062,225.761553,,False
399,0.088235,2020-11-01,0.038462,0.904762,0.000447,0.055838,228.781863,251.861934,True
532,0.117647,2020-11-01,0.038462,0.904762,0.000447,0.061136,231.063874,253.796266,True


In [40]:
len(dense_df.timeOfDay.unique())

35

## Full Dense for Evaluation

In [41]:
dense_df, lat_lon_pairs = filter_sparse_data(df, 0)
dense_df_empty = make_dataset(lat_lon_pairs, dense_df)
print(len(dense_df_empty))

traffic_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='distance',
    features = ['timeOfDay', 'lat', 'lon']
)

bus_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='bus_count',
    features = ['timeOfDay', 'lat', 'lon']
)

pm25_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='pm2_5',
    features = ['timeOfDay', 'lat', 'lon']
)

pm10_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='pm10',
    features = ['timeOfDay', 'lat', 'lon']
)

idw_models = [traffic_idw_model, bus_idw_model, pm25_idw_model, pm10_idw_model]

for idw_model in idw_models:
    dense_df_empty = idw_dense_interpolate(dense_df_empty, df, idw_model)

dense_df = dense_df_empty
print(len(dense_df))

dense_df['missing'] = dense_df['missing_pm10']
dense_df.drop(columns = ['missing_distance', 'missing_bus_count', 'missing_pm10', 'missing_pm2_5'], 
              inplace=True)

print(dense_df['missing'].mean())

dense_df.to_csv('full_dense_df.csv')

834470
(834470, 5) (121036, 1)
(834470, 6)
(834470, 5)
(834470, 6) (123165, 25)
(834470, 7) (121036, 1)
(834470, 8)
(834470, 7)
(834470, 8) (123165, 25)
(834470, 9) (121036, 1)
(834470, 10)
(834470, 9)
(834470, 10) (123165, 25)
(834470, 11) (121036, 1)
(834470, 12)
(834470, 11)
(834470, 12) (123165, 25)
834470
0.8524033218689707


In [42]:
os.listdir()

['LightGBM_evaluation.json',
 'Ridge_evaluation.json',
 'LightGBM_model.pkl',
 'XGB_evaluation.json',
 'dense_df.csv',
 'full_dense_df.csv',
 '__notebook__.ipynb',
 'Ridge_model.pkl',
 'CatBoostRegressor_evaluation.json',
 'catboost_info',
 'engineered_df.csv',
 'CatBoostRegressor_model.pkl',
 'XGB_model.pkl']