In [1]:
import numpy as np # linear algebrae
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression

import os

# from polire import IDW

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

from datetime import date, timedelta
import datetime

import json
import pickle as pkl
import itertools

import torch

Config

In [2]:
class CFG:
    evaluation_time_gap = 1
    convert_numpy = False
    target_list = ['pm2_5', 'pm10']
    features = None

## Basic Loading

In [3]:
df = pd.read_csv('/kaggle/input/airdelhi-tabularengineering/tabular_data.csv')

df = df.drop(columns = 'Unnamed: 0')
df['date_value'] = pd.to_datetime(df['date_value'])

dates = pd.to_datetime([df['date_value'].min(), df['date_value'].max()])

max_train_date = dates.min() + (dates.max() - dates.min()) * 0.75
max_train_date = max_train_date.floor("D")
min_train_date = df['date_value'].min()
max_date = df['date_value'].max().floor("D")

metrics_dict = {
    'MSE': mean_squared_error, 
    'r2 score': r2_score, 
    'MAE': mean_absolute_error,
}

target_list = CFG.target_list

features = ['date_value', 'timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']

CFG.base_features = ['timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']
CFG.features = CFG.base_features

max_train_date, min_train_date

(Timestamp('2021-01-07 00:00:00'), Timestamp('2020-11-01 00:00:00'))

In [4]:
scaler = MinMaxScaler()
df[['timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']] = scaler.fit_transform(
    df[['timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']]
)


## Basic Helper Functions

In [5]:
def clean_df(df, features=None):
    if features is None:
        return df[CFG.features]
    else:
        return df[features]

## Get Model Splits

In [6]:
class DataSplitter:

    def __init__(
        self, 
        dataset, 
        min_date, 
        max_date, 
        max_lookback=None,
        min_lookback=None,
    ):
        # self.X = dataset[CFG.features if features is None else features]
        
        # self.target = target
        # self.y = dataset[target]
        
        self.df = dataset
        
        self.min_train_date = min_date
        self.max_train_date = max_date
        self.df_max_date = self.df['date_value'].max()

        self.max_lookback = max_lookback
        self.min_lookback = min_lookback
    
    def get_train_test_split(self):
        df = self.df.copy()
        df = df[df['date_value'] >= self.min_train_date]
        d = self.max_train_date + timedelta(days = 1)

        splits = {}
        
        while d < self.df_max_date:
            train_temp = df[df['date_value'] <= d]
            if self.max_lookback is not None:
                train_temp = train_temp[train_temp['date_value'] > d - timedelta(
                    days = self.max_lookback)]

            if self.min_lookback is not None:
                train_temp1 = train_temp[train_temp['date_value'] <= d - timedelta(
                    days = self.min_lookback)]
                if len(train_temp1) > 200:
                    train_temp = train_temp1
            
            train_indices = train_temp.index
            test_indices = df[df['date_value'] == d + timedelta(days=1)].index

            splits[d] = {
                'train': train_indices,
                'test' : test_indices
            }
            
            d = d + timedelta(days = CFG.evaluation_time_gap)

        return splits

In [7]:
data_splitter = DataSplitter(df, min_train_date, max_train_date)
data_indices = data_splitter.get_train_test_split()

# Testing:
test_date = list(data_indices.keys())[0]
train_indices, test_indices = list(data_indices[test_date].values())
# test_date, list(data_indices.keys())
train_df_1 = df.iloc[train_indices]
print(test_date, sorted(train_df_1.date_value.unique())[-1])

test_df_1 = df.iloc[test_indices]
print(test_date, sorted(test_df_1.date_value.unique()))

2021-01-08 00:00:00 2021-01-08 00:00:00
2021-01-08 00:00:00 [Timestamp('2021-01-09 00:00:00')]


In [8]:
data_splitter = DataSplitter(df, min_train_date, max_train_date, max_lookback=3)
data_indices = data_splitter.get_train_test_split()

# Testing:
test_date = list(data_indices.keys())[0]
train_indices, test_indices = list(data_indices[test_date].values())
# test_date, list(data_indices.keys())
train_df_1 = df.iloc[train_indices]
print(test_date, sorted(train_df_1.date_value.unique()))

test_df_1 = df.iloc[test_indices]
print(test_date, sorted(test_df_1.date_value.unique()))

2021-01-08 00:00:00 [Timestamp('2021-01-06 00:00:00'), Timestamp('2021-01-07 00:00:00'), Timestamp('2021-01-08 00:00:00')]
2021-01-08 00:00:00 [Timestamp('2021-01-09 00:00:00')]


In [9]:
data_splitter = DataSplitter(df, min_train_date, min_train_date, max_lookback=2)
data_indices = data_splitter.get_train_test_split()

# Testing:
test_date = list(data_indices.keys())[0]
train_indices, test_indices = list(data_indices[test_date].values())
# test_date, list(data_indices.keys())
train_df_1 = df.iloc[train_indices]
print(test_date, sorted(train_df_1.date_value.unique()))

test_df_1 = df.iloc[test_indices]
print(test_date, sorted(test_df_1.date_value.unique()))

print(min_train_date, df.date_value.min())

2020-11-02 00:00:00 [Timestamp('2020-11-01 00:00:00'), Timestamp('2020-11-02 00:00:00')]
2020-11-02 00:00:00 [Timestamp('2020-11-03 00:00:00')]
2020-11-01 00:00:00 2020-11-01 00:00:00


## Model Holder

In [10]:
class Model:

    def __init__(
        self,
        model_class,
        params=None,
        target="pm2_5",
        features=None,
    ):
        self.model_class = model_class
        
        if params is None:
            self.params = {}
        else:
            self.params = params

        self.max_train_date = None
        self.target = target
        if features is None:
            features = CFG.features

        self.features = features

    def fit(self, X, y):
        model = self.model_class(**self.params)
        # print(X.columns, CFG.features)
        model.fit(X, y)
        return model

    def predict(self, X):
        model = self.split_models[self.max_train_date]
        return model.predict(X)

    def get_df_from_split(self, df, split, split_type='train'):
        X = df.iloc[split[split_type]].copy()
        y = np.array(X[self.target])
        X = clean_df(X, features=self.features)
        return X, y

    def fit_on_splits(self, df, splits):
        self.split_models = {}
        
        for d, split in splits.items():
            X, y = self.get_df_from_split(df, split)
            self.split_models[d] = self.fit(X, y)

    def predict_on_splits(self, df, splits, train=False):
        model_predictions = {}
        if train:
            train = 'train'
        else:
            train= 'test'
        
        for d, split in splits.items():
            X, y = self.get_df_from_split(df, split, split_type=train)
            model_predictions[d] = {
                'pred': self.split_models[d].predict(X),
                'true': y,
                'index': X.index
            }
        
        return model_predictions

## Model Evaluator

In [11]:
class Evaluator:

    def __init__(
        self,
        dataset,
        min_date,
        max_date,
        target,
        metrics_dict,
        max_lookback=None
    ):
        self.df_splitter = DataSplitter(dataset, min_date, max_date, max_lookback=max_lookback)
        self.metrics_dict = metrics_dict
        self.dataset = dataset
        self.target = target

        self.splits = self.df_splitter.get_train_test_split()

    def fit_predict(self, model):
        model.fit_on_splits(self.dataset, self.splits)
        train_preds = model.predict_on_splits(self.dataset, self.splits, train=True)
        test_preds = model.predict_on_splits(self.dataset, self.splits, train=False)
        return train_preds, test_preds

    def evaluate_metrics(self, y_true, y_pred):
        values = {}
        for d, metric in metrics_dict.items():
            values[d] = metric(y_true, y_pred)
        return values

    def merge_evaluations(self, evaluations):
        merged_evaluations = {}
        for d, evaluation in evaluations.items():
            if d in ['aggregated', 'aggregated_train']:
                continue
            if len(merged_evaluations) == 0:
                merged_evaluations = {
                    k : [v]
                    for k, v in evaluation.items()
                }
            else:
                for k, v in evaluation.items():
                    merged_evaluations[k].append(v)

        merged_evaluations['aggregated'] = evaluations['aggregated']
        # merged_evaluations['aggregated_train'] = evaluations['aggregated_train']
        
        return merged_evaluations

    def save(self, model, evaluations, model_name):
        with open(f'{model_name}_evaluation.json', 'w') as f:
            json.dump(evaluations, f)

        with open(f'{model_name}_model.pkl', 'wb') as f:
            pkl.dump(model, f)

    def print(self, train_evaluations, test_evaluations):
        print(f"""Train PM {self.target}: R2 Score {train_evaluations['aggregated']['r2 score']}, 
               RMSE {np.sqrt(train_evaluations['aggregated']['MSE'])}
        """)
        print(f"""Test PM {self.target}: R2 Score {test_evaluations['aggregated']['r2 score']}, 
              RMSE {np.sqrt(test_evaluations['aggregated']['MSE'])}
        """)
        

    def evaluate(self, model, daily=False, save=True, model_name=None, verbose=True):
        train_preds, test_preds = self.fit_predict(model)

        train_metrics = {}
        train_pred_full = []
        train_full = []
        for d, v in train_preds.items():
            train_metrics[d] = self.evaluate_metrics(v['true'], v['pred'])
            train_pred_full.append(v['pred'])
            train_full.append(v['true'])

        train_pred_full = np.concatenate(train_pred_full)
        train_full = np.concatenate(train_full)
        

        test_metrics = {}
        test_pred_full = []
        test_full = []
        index_full = []
        for d, v in test_preds.items():
            test_metrics[d] = self.evaluate_metrics(v['true'], v['pred'])
            test_pred_full.append(v['pred'])
            test_full.append(v['true'])
            index_full.append(v['index'])
        
        test_pred_full = np.concatenate(test_pred_full)
        test_full = np.concatenate(test_full)
        index_full = np.concatenate(index_full)

        train_metrics['aggregated'] = self.evaluate_metrics(train_full, train_pred_full)
        test_metrics['aggregated'] = self.evaluate_metrics(test_full, test_pred_full)
        
        if not daily:
            train_metrics, test_metrics = self.merge_evaluations(train_metrics), self.merge_evaluations(test_metrics)

        if save:
            self.save(model, {'train':train_metrics,'test':test_metrics}, model_name)

        if verbose:
            self.print(train_metrics, test_metrics)
        
        return train_metrics, test_metrics

## Additional Processing

In [12]:
from scipy.spatial import cKDTree

class IDW:

    def __init__(self, leafsize, power = 3, k = 10):
        self.leafsize = leafsize
        self.power = power
        self.k = k

    def fit(self, X, y):
        self.X = X
        # self.y = y.values
        self.y = y

        self.tree = cKDTree(self.X.values, leafsize=self.leafsize)

    def predict(self, test_df):
        X_test = test_df.values
        
        distances, indices = self.tree.query(X_test, k=self.k, workers=-1)
        distances = np.maximum(distances, 1e-10)
        
        weights = 1 / (distances ** self.power)
        weights /= np.sum(weights, axis=1, keepdims=True)
        
        interpolated_values = np.sum(weights * self.y[indices], axis=1)
        
        return interpolated_values

## Lagged Features

In [13]:
def add_lag_features(df, lags = [1]):

    df = df.copy()

    added_features = []
    for l in lags:
        df[f'pm2_5_lag_{l}'] = df.groupby(
            ['timeOfDay', 'lat', 'lon'])['pm2_5'].shift(l)
        df[f'pm10_lag_{l}'] = df.groupby(
            ['timeOfDay', 'lat', 'lon'])['pm10'].shift(l)

        added_features.append(f'pm2_5_lag_{l}')
        added_features.append(f'pm10_lag_{l}')

        # .reset_index(drop=False)
        
        # shifted_pm25 = df.groupby(['timeOfDay', 'lat', 'lon'])['pm2_5'].shift(1).reset_index(drop=False)
        # shifted_pm10 = df.groupby(['timeOfDay', 'lat', 'lon'])['pm10'].shift(1).reset_index(drop=False)

        # shifted_pm25 = shifted_pm25.rename(columns = {'pm2_5' : f'pm2_5_lag_{l}'})
        # shifted_pm10 = shifted_pm10.rename(columns = {'pm10' : f'pm10_lag_{l}'})

        # df = pd.merge(df, shifted_pm25, how = 'outer', on =  ['timeOfDay', 'lat', 'lon'])
        # df = pd.merge(df, shifted_pm10, how = 'outer', on =  ['timeOfDay', 'lat', 'lon'])


        df.sort_values(by=["lat", "lon", "date_value"], inplace=True)

    # Group by latitude and longitude
    grouped = df.groupby(["lat", "lon"])

    # Function to fill NaN values based on previous mean
    def fill_na_with_previous_mean(group):
        for col in group.columns:
            if col not in ["date_value", "lat", "lon"]:
                group[col] = group[col].astype(float)  # Ensure numeric columns
                group[col] = group[col].fillna(group[col].expanding().mean().shift())  # Previous days' mean
                
                # If still NaN (first row), replace with overall mean
                overall_mean = df[col].mean(skipna=True)
                group[col] = group[col].fillna(overall_mean)
        return group

    # Apply the function to each group
    df = grouped.apply(fill_na_with_previous_mean)

    df.reset_index(drop=True, inplace=True)

    df = df.sort_values(by = ['date_value', 'timeOfDay', 'lat', 'lon'])

    df.reset_index(drop=True, inplace=True)

    CFG.features += added_features
    
    return df


In [14]:
df = add_lag_features(df, lags = [1, 2, 3, 7])

  df = grouped.apply(fill_na_with_previous_mean)


In [15]:
# IDW Interpolation

def idw_interpolation(df, idw, lags):
    df = df.copy()
    df['lat'] = df['lat'] * 50
    df['lon'] = df['lon'] * 50
    
    target = idw.target
    added_features = []
    
    for lag in lags:
        df_splitter = DataSplitter(
            df.copy(), min_train_date, min_train_date+timedelta(days=1), 
            max_lookback=lag+1, min_lookback=lag-1
        )
        splits = df_splitter.get_train_test_split()
    
        idw.fit_on_splits(df, splits)
        preds = idw.predict_on_splits(df, splits)

        test_pred_full = []
        test_full = []
        index_full = []
        
        for d, v in preds.items():
            test_pred_full.append(v['pred'])
            test_full.append(v['true'])
            index_full.append(v['index'])
        
        test_pred_full = np.concatenate(test_pred_full)
        test_full = np.concatenate(test_full)
        index_full = np.concatenate(index_full)

        new_df = pd.DataFrame({
            # f'test_lag_{lag}': test_full,
            f'idw_lag_{lag}_{target}': test_pred_full,
        }, index = index_full)
        
        df = df.merge(new_df, left_index=True, right_index=True, how='outer')
        # df[f'test_lag_{lag}'] = df[f'test_lag_{lag}'].fillna(df[f'test_lag_{lag}'].expanding().mean())
        df[f'idw_lag_{lag}_{target}'] = df[f'idw_lag_{lag}_{target}'].fillna(df[f'idw_lag_{lag}_{target}'].expanding().mean())
        
        df = df.bfill()

        added_features.append(f'idw_lag_{lag}_{target}')

    df['lat'] = df['lat'] / 50
    df['lon'] = df['lon'] / 50

    CFG.features = CFG.features + added_features
    
    return df

In [16]:
# pm2_5

idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3}
)

# idw_model = Model(
#     RandomForestRegressor,
# )

df = idw_interpolation(df, idw_model, lags = [1])
df = idw_interpolation(df, idw_model, lags = [2])
df = idw_interpolation(df, idw_model, lags = [3])
df = idw_interpolation(df, idw_model, lags = [7])

# print(np.sqrt(mean_squared_error(df['test_lag_1'], df['idw_lag_1_pm2_5'])))
# print(np.sqrt(mean_squared_error(df['test_lag_2'], df['idw_lag_2_pm2_5'])))
# print(np.sqrt(mean_squared_error(df['test_lag_3'], df['idw_lag_3_pm2_5'])))

In [17]:
# pm10

idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='pm10'
)

# idw_model = Model(
#     RandomForestRegressor,
#     target='pm10'
# )

df = idw_interpolation(df, idw_model, lags = [1])
df = idw_interpolation(df, idw_model, lags = [2])
df = idw_interpolation(df, idw_model, lags = [3])
df = idw_interpolation(df, idw_model, lags = [7])

## Evaluation

In [18]:
metrics_dict = {
    'MSE': mean_squared_error, 
    'r2 score': r2_score, 
    'MAE': mean_absolute_error,
}

# pm25_evaluator = Evaluator(
#     df,
#     min_train_date,
#     max_train_date,
#     'pm2_5',
#     metrics_dict,
#     max_lookback=None
# )

# pm10_evaluator = Evaluator(
#     df,
#     min_train_date,
#     max_train_date,
#     'pm10',
#     metrics_dict,
#     max_lookback=None
# )

In [19]:
df.to_csv('engineered_df.csv')

In [20]:
df.head()

Unnamed: 0,date_value,timeOfDay,lat,lon,pm2_5,pm10,day_of_week,distance,bus_count,pm2_5_lag_1,...,pm2_5_lag_7,pm10_lag_7,idw_lag_1_pm2_5,idw_lag_2_pm2_5,idw_lag_3_pm2_5,idw_lag_7_pm2_5,idw_lag_1_pm10,idw_lag_2_pm10,idw_lag_3_pm10,idw_lag_7_pm10
0,2020-11-01,0.0,0.269231,0.809524,481.37,522.53,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
1,2020-11-01,0.0,0.307692,0.761905,471.18,513.5,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
2,2020-11-01,0.0,0.346154,0.714286,462.44,503.81,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
3,2020-11-01,0.0,0.346154,0.761905,468.14,507.55,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935
4,2020-11-01,0.0,0.384615,0.714286,462.68,505.21,1.0,0.001765,0.0,197.000308,...,197.253747,214.864707,197.287118,359.376438,446.796673,197.287118,231.26935,389.966421,481.86334,231.26935


In [21]:
CFG.features

['timeOfDay',
 'lat',
 'lon',
 'day_of_week',
 'distance',
 'bus_count',
 'pm2_5_lag_1',
 'pm10_lag_1',
 'pm2_5_lag_2',
 'pm10_lag_2',
 'pm2_5_lag_3',
 'pm10_lag_3',
 'pm2_5_lag_7',
 'pm10_lag_7',
 'idw_lag_1_pm2_5',
 'idw_lag_2_pm2_5',
 'idw_lag_3_pm2_5',
 'idw_lag_7_pm2_5',
 'idw_lag_1_pm10',
 'idw_lag_2_pm10',
 'idw_lag_3_pm10',
 'idw_lag_7_pm10']

## Full Interpolation

In [22]:
lat_lon_pairs = df.groupby(by=['lat', 'lon'])

In [23]:
lat_lon_pairs['pm2_5'].count().reset_index()# [['lat', 'lon']]

Unnamed: 0,lat,lon,pm2_5
0,0.000000,0.904762,4
1,0.000000,0.952381,4
2,0.038462,0.904762,2618
3,0.038462,0.952381,1927
4,0.076923,0.809524,292
...,...,...,...
257,1.000000,0.285714,3
258,1.000000,0.333333,3
259,1.000000,0.380952,1
260,1.000000,0.523810,1


In [24]:
lat_lon_pairs['pm2_5'].count().describe()

count     262.000000
mean      470.095420
std       633.341018
min         1.000000
25%         3.250000
50%       175.500000
75%       762.000000
max      2618.000000
Name: pm2_5, dtype: float64

In [25]:
def filter_sparse_data(df, count):
    df = df.copy()
    lat_lon_pairs = df.groupby(by=['lat', 'lon'])['pm2_5'].count().reset_index()
    lat_lon_pairs = lat_lon_pairs[lat_lon_pairs['pm2_5'] >= count]

    df = pd.merge(df, lat_lon_pairs[['lat', 'lon']], on = ['lat', 'lon'], how = 'inner')
    return df, lat_lon_pairs[['lat', 'lon']]

def make_dataset(lat_lon_pairs, df):
    times_of_day = np.sort(df['timeOfDay'].unique())
    dates = df['date_value'].unique()
    
    lat_lon_pairs = np.array(lat_lon_pairs)
    lat_lon_list = list(map(tuple, lat_lon_pairs))
    
    # Generate all possible combinations
    combinations = list(itertools.product(lat_lon_list, times_of_day, dates))
    
    # Convert to DataFrame
    new_df = pd.DataFrame(combinations, columns=['lat_lon', 'timeOfDay', 'date_value'])
    
    # Split lat_lon tuple into separate columns
    new_df[['lat', 'lon']] = pd.DataFrame(new_df['lat_lon'].tolist(), index=new_df.index)
    
    # Drop the combined lat_lon column
    new_df.drop(columns=['lat_lon'], inplace=True)

    new_df = new_df.sort_values(by = ['date_value', 'timeOfDay']).reset_index(drop=True)
    
    return new_df

In [26]:
dense_df, lat_lon_pairs = filter_sparse_data(df, 150)
dense_df_empty = make_dataset(lat_lon_pairs, dense_df)

In [27]:
# dense_df_empty.head()
len(dense_df_empty)

423605

In [28]:
def idw_dense_interpolate(df, df_original, idw):
    df = df.copy()
    df['lat'] = df['lat'] * 50
    df['lon'] = df['lon'] * 50
    # df['timeOfDay'] = df['timeOfDay'] * 50

    df_original = df_original.copy()
    df_original['lat'] = df_original['lat'] * 50
    df_original['lon'] = df_original['lon'] * 50
    # df_original['timeOfDay'] = df['timeOfDay'] * 50

    target = idw.target

    df_splitter = DataSplitter(
        df_original.copy(), min_train_date, min_train_date+timedelta(days=1), 
        max_lookback=3, min_lookback=1
    )
    train_splits = df_splitter.get_train_test_split()
    
    test_df_splitter = DataSplitter(
        df.copy(), min_train_date, min_train_date+timedelta(days=1), 
        max_lookback=3, min_lookback=None
    )
    test_splits = df_splitter.get_train_test_split()

    df[target] = np.nan

    idw.fit_on_splits(df_original, train_splits)
    preds = idw.predict_on_splits(df, test_splits)

    test_pred_full = []
    test_full = []
    index_full = []
    for d, v in preds.items():
        test_pred_full.append(v['pred'])
        test_full.append(v['true'])
        index_full.append(v['index'])
    
    test_pred_full = np.concatenate(test_pred_full)
    test_full = np.concatenate(test_full)
    index_full = np.concatenate(index_full)

    new_df = pd.DataFrame({
        # f'test_lag_{lag}': test_full,
        f'filled_{target}': test_pred_full,
    }, index = index_full)

    print(df.shape, new_df.shape)
    
    df = pd.merge(df, new_df, left_index=True, right_index=True, how='outer')
    
    print(df.shape)
    
    # df[f'test_lag_{lag}'] = df[f'test_lag_{lag}'].fillna(df[f'test_lag_{lag}'].expanding().mean())
    df[f'filled_{target}'] = df[f'filled_{target}'].fillna(df[f'filled_{target}'].expanding().mean())

    df = df.ffill()
    df = df.bfill()
    
    df = df.drop(columns = [target])

    print(df.shape)

    df = pd.merge(df, df_original[['date_value', 'timeOfDay', 'lat', 'lon', target]], 
                  how = 'left', on = ['date_value', 'timeOfDay', 'lat', 'lon'])

    print(df.shape, df_original.shape)
    
    # return df, target
    # return df.where(df[target].notna(), other= df[target], axis=1)
    # df[f'filled_{target}'] = df.where(df[target].notna(), 
    #                                   other= df[target], axis=1)[f'filled_{target}']
    
    # df = df.where(df.notna(), df[target], axis=0)
    # df = df.apply(lambda row: row.fillna(row[target]), axis=1)
    df[f'filled_{target}'] = df[target].combine_first(df[f'filled_{target}'])
    
    df[f'missing_{target}'] = df[target].isna()

    df[target] = df[f'filled_{target}']
    df = df.drop(columns = [f'filled_{target}'])

    df['lat'] = df['lat'] / 50
    df['lon'] = df['lon'] / 50
    # df['timeOfDay'] = df['timeOfDay'] / 50

    df = df.sort_values(by=['lat', 'lon', 'date_value', 'timeOfDay'])

    # df[target] = df[f'filled_{target}']
    # df = df.drop(columns = [f'filled_{target}'])

    return df

In [29]:
traffic_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='distance',
    features = ['timeOfDay', 'lat', 'lon']
)

bus_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='bus_count',
    features = ['timeOfDay', 'lat', 'lon']
)

pm25_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='pm2_5',
    features = ['timeOfDay', 'lat', 'lon']
)

pm10_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='pm10',
    features = ['timeOfDay', 'lat', 'lon']
)

idw_models = [traffic_idw_model, bus_idw_model, pm25_idw_model, pm10_idw_model]

for idw_model in idw_models:
    dense_df_empty = idw_dense_interpolate(dense_df_empty, df, idw_model)

(423605, 5) (121036, 1)
(423605, 6)
(423605, 5)
(423605, 6) (123165, 25)
(423605, 7) (121036, 1)
(423605, 8)
(423605, 7)
(423605, 8) (123165, 25)
(423605, 9) (121036, 1)
(423605, 10)
(423605, 9)
(423605, 10) (123165, 25)
(423605, 11) (121036, 1)
(423605, 12)
(423605, 11)
(423605, 12) (123165, 25)


In [30]:
dense_df = dense_df_empty
len(dense_df)

423605

In [31]:
dense_df.tail()

Unnamed: 0,timeOfDay,date_value,lat,lon,distance,missing_distance,bus_count,missing_bus_count,pm2_5,missing_pm2_5,pm10,missing_pm10
423072,0.882353,2021-01-30,0.807692,0.857143,0.004846,True,0.057328,True,198.275431,True,217.936466,True
423205,0.911765,2021-01-30,0.807692,0.857143,0.004846,True,0.057322,True,198.282628,True,217.943753,True
423338,0.941176,2021-01-30,0.807692,0.857143,0.00114,False,0.0,False,314.11,False,338.78,False
423471,0.970588,2021-01-30,0.807692,0.857143,0.001308,False,0.0,False,429.87,False,462.33,False
423604,1.0,2021-01-30,0.807692,0.857143,0.004846,True,0.057306,True,198.308631,True,217.970851,True


In [32]:
dense_df = dense_df.sort_values(by = ['date_value', 'timeOfDay', 'lat', 'lon'])
df = df.sort_values(by = ['date_value', 'timeOfDay', 'lat', 'lon'])

combined_df = pd.merge(dense_df[['date_value', 'timeOfDay', 'lat', 'lon', 'pm2_5', 'pm10']], 
                       df[['date_value', 'timeOfDay', 'lat', 'lon', 'pm2_5', 'pm10']], 
                       on = ['date_value', 'timeOfDay', 'lat', 'lon'],
                      how = 'inner')

In [33]:
combined_df

Unnamed: 0,date_value,timeOfDay,lat,lon,pm2_5_x,pm10_x,pm2_5_y,pm10_y
0,2020-11-01,0.0,0.269231,0.809524,481.37,522.53,481.37,522.53
1,2020-11-01,0.0,0.307692,0.761905,471.18,513.50,471.18,513.50
2,2020-11-01,0.0,0.346154,0.714286,462.44,503.81,462.44,503.81
3,2020-11-01,0.0,0.346154,0.761905,468.14,507.55,468.14,507.55
4,2020-11-01,0.0,0.384615,0.714286,462.68,505.21,462.68,505.21
...,...,...,...,...,...,...,...,...
121411,2021-01-30,1.0,0.615385,0.714286,248.29,268.74,248.29,268.74
121412,2021-01-30,1.0,0.615385,0.761905,262.03,285.68,262.03,285.68
121413,2021-01-30,1.0,0.615385,0.809524,259.75,279.04,259.75,279.04
121414,2021-01-30,1.0,0.653846,0.809524,269.06,289.19,269.06,289.19


In [34]:
dense_df['missing'] = dense_df['missing_pm10']
dense_df.drop(columns = ['missing_distance', 'missing_bus_count', 'missing_pm10', 'missing_pm2_5'], 
              inplace=True)

In [35]:
dense_df['missing'].mean()

0.713374488025401

In [36]:
dense_df.to_csv('dense_df.csv')

In [37]:
# df['day_of_week'] = pd.to_datetime(df[date_column]).dt.day_name()

In [38]:
dense_df.head()

Unnamed: 0,timeOfDay,date_value,lat,lon,distance,bus_count,pm2_5,pm10,missing
0,0.0,2020-11-01,0.038462,0.904762,0.000447,0.0,133.123529,148.118824,True
1,0.0,2020-11-01,0.038462,0.952381,0.000447,0.0,133.123529,148.118824,True
2,0.0,2020-11-01,0.076923,0.809524,0.000447,0.047577,246.319707,273.117459,True
3,0.0,2020-11-01,0.076923,0.857143,0.000447,0.0,272.947254,296.924339,True
4,0.0,2020-11-01,0.076923,0.904762,0.000447,0.142857,333.51,367.575,True


In [39]:
len(dense_df.timeOfDay.unique())

35

## Full Dense for Evaluation

In [40]:
full_dense_df, lat_lon_pairs = filter_sparse_data(df, 0)
full_dense_df_empty = make_dataset(lat_lon_pairs, full_dense_df)
print(len(full_dense_df_empty))

traffic_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='distance',
    features = ['timeOfDay', 'lat', 'lon']
)

bus_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='bus_count',
    features = ['timeOfDay', 'lat', 'lon']
)

pm25_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='pm2_5',
    features = ['timeOfDay', 'lat', 'lon']
)

pm10_idw_model = Model(
    IDW,
    params={'leafsize': 50, 'power': 3, 'k': 3},
    target='pm10',
    features = ['timeOfDay', 'lat', 'lon']
)

idw_models = [traffic_idw_model, bus_idw_model, pm25_idw_model, pm10_idw_model]

for idw_model in idw_models:
    full_dense_df_empty = idw_dense_interpolate(full_dense_df_empty, df, idw_model)

full_dense_df = full_dense_df_empty
print(len(full_dense_df))

full_dense_df['missing'] = full_dense_df['missing_pm10']
full_dense_df.drop(columns = ['missing_distance', 'missing_bus_count', 'missing_pm10', 'missing_pm2_5'], 
              inplace=True)

print(full_dense_df['missing'].mean())

full_dense_df.to_csv('full_dense_df.csv')

834470
(834470, 5) (121036, 1)
(834470, 6)
(834470, 5)
(834470, 6) (123165, 25)
(834470, 7) (121036, 1)
(834470, 8)
(834470, 7)
(834470, 8) (123165, 25)
(834470, 9) (121036, 1)
(834470, 10)
(834470, 9)
(834470, 10) (123165, 25)
(834470, 11) (121036, 1)
(834470, 12)
(834470, 11)
(834470, 12) (123165, 25)
834470
0.8524033218689707


In [41]:
full_dense_df = full_dense_df.sort_values(by = ['date_value', 'timeOfDay', 'lat', 'lon'])
df = df.sort_values(by = ['date_value', 'timeOfDay', 'lat', 'lon'])

combined_df = pd.merge(full_dense_df[['date_value', 'timeOfDay', 'lat', 'lon', 'pm2_5', 'pm10']], 
                       df[['date_value', 'timeOfDay', 'lat', 'lon', 'pm2_5', 'pm10']], 
                       on = ['date_value', 'timeOfDay', 'lat', 'lon'],
                      how = 'inner')

In [42]:
combined_df.head()

Unnamed: 0,date_value,timeOfDay,lat,lon,pm2_5_x,pm10_x,pm2_5_y,pm10_y
0,2020-11-01,0.0,0.269231,0.809524,481.37,522.53,481.37,522.53
1,2020-11-01,0.0,0.307692,0.761905,471.18,513.5,471.18,513.5
2,2020-11-01,0.0,0.346154,0.714286,462.44,503.81,462.44,503.81
3,2020-11-01,0.0,0.346154,0.761905,468.14,507.55,468.14,507.55
4,2020-11-01,0.0,0.384615,0.714286,462.68,505.21,462.68,505.21


In [43]:
os.listdir()

['full_dense_df.csv',
 'dense_df.csv',
 '__notebook__.ipynb',
 'engineered_df.csv']

## Sanity Check

In [44]:
test_df1 = pd.read_csv('/kaggle/working/dense_df.csv')
test_df2 = pd.read_csv('/kaggle/working/full_dense_df.csv')

## Convert To Grid

In [45]:
lat = df['lat'].unique()
lon = df['lon'].unique()

lat = np.sort(lat)
lon = np.sort(lon)

min_dist_lat = 1000
for i in range(len(lat) - 1):
    min_dist_lat = min(min_dist_lat, lat[i+1] - lat[i])
print(min_dist_lat)

for i in range(len(lat)):
    assert (lat[i] / min_dist_lat - round(lat[i] / min_dist_lat)) < 1e-6

min_dist_lon = 1000
for i in range(len(lon) - 1):
    min_dist_lon = min(min_dist_lon, lon[i+1] - lon[i])
print(min_dist_lon)

for i in range(len(lon)):
    assert (lon[i] / min_dist_lon - round(lon[i] / min_dist_lon)) < 1e-6

0.038461538461518785
0.04761904761903679


In [46]:
grid_dimensions = round(lat[-1] / min_dist_lat), round(lon[-1] / min_dist_lon)
grid_dimensions

(26, 21)

In [47]:
def get_grid():
    return np.zeros((30, 30))

def rounded_lat_lon(df):
    df = df.copy()
    df['lat'] = np.round(df['lat'] / min_dist_lat) + 2
    df['lon'] = np.round(df['lon'] / min_dist_lon) + 4

    return df

dense_df1 = rounded_lat_lon(dense_df)
full_dense_df1 = rounded_lat_lon(full_dense_df)
print(len(dense_df1), len(full_dense_df1))

423605 834470


In [48]:
dense_df1.head()

Unnamed: 0,timeOfDay,date_value,lat,lon,distance,bus_count,pm2_5,pm10,missing
0,0.0,2020-11-01,3.0,23.0,0.000447,0.0,133.123529,148.118824,True
1,0.0,2020-11-01,3.0,24.0,0.000447,0.0,133.123529,148.118824,True
2,0.0,2020-11-01,4.0,21.0,0.000447,0.047577,246.319707,273.117459,True
3,0.0,2020-11-01,4.0,22.0,0.000447,0.0,272.947254,296.924339,True
4,0.0,2020-11-01,4.0,23.0,0.000447,0.142857,333.51,367.575,True


In [49]:
t = dense_df1[(dense_df1['date_value'] == '2020-11-01') & (dense_df1['timeOfDay'] == 0)]
t.pivot(index='lat', columns = 'lon', values='pm2_5') # [['lat', 'lon', 'pm2_5']]

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


lon,6.0,7.0,8.0,9.0,10.0,11.0,12.0,13.0,14.0,15.0,16.0,17.0,18.0,19.0,20.0,21.0,22.0,23.0,24.0,25.0
lat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
3.0,,,,,,,,,,,,,,,,,,133.123529,133.123529,
4.0,,,,,,,,,,,,,,,,246.319707,272.947254,333.51,,
5.0,,,,,,,,,,,,,,,,151.062105,,207.228235,,
6.0,,,,,,,,,,,,,,,,224.174988,97.211,199.0,,
7.0,,,,,,,,,,,,,,,286.132833,308.915792,288.566471,,,
8.0,,,,,,,,,,,,,,,179.345791,235.97,94.225,,,
9.0,,,,,,,169.818216,161.144278,216.367643,,,,,284.635294,411.6,481.37,,355.829045,161.809123,
10.0,,,,,182.968127,275.291739,,,142.747852,,,,,45.85,471.18,124.5,,171.081717,,
11.0,,,,,207.511392,,578.791176,209.465814,322.31,142.255,160.09,325.14,269.425,462.44,468.14,224.174187,224.174187,224.174187,,
12.0,,,,,224.174187,224.174187,224.174187,,224.174187,,224.174187,224.174187,,462.68,467.5,,,,,


In [50]:
# def get_grids(df):
#     grids = {}

#     grids = {}  # Dictionary to hold all generated grids
    
#     # Get all unique combinations of date and time
#     grouped = df.groupby([date_col, time_col])
    
#     for (date_val, time_val), group in grouped:
#         # Pivot to grid with lat as rows (Y), lon as columns (X)
#         pivot = group.pivot(index=lat_col, columns=lon_col, values=target_col)

#         # Ensure sorted index/columns
#         pivot = pivot.sort_index(axis=0).sort_index(axis=1)
        
#         # Create an empty 30x30 grid filled with 0 (or np.nan if you prefer)
#         grid = np.zeros((grid_size, grid_size))
        
#         # Determine start indices to center your data
#         lat_start = (grid_size - pivot.shape[0]) // 2
#         lon_start = (grid_size - pivot.shape[1]) // 2
        
#         # Insert data into the middle of the 30x30 grid
#         lat_end = lat_start + pivot.shape[0]
#         lon_end = lon_start + pivot.shape[1]
        
#         # Handle edge cases where data is bigger than 30x30 (clip)
#         if lat_end > grid_size or lon_end > grid_size:
#             raise ValueError(f"Data too big to fit in {grid_size}x{grid_size} grid")

#         grid[lat_start:lat_end, lon_start:lon_end] = pivot.values
        
#         # Store grid with key (date, time)
#         grids[(date_val, time_val)] = grid


#     for (date, time), group in df.groupby(['date_value', 'timeOfDay']):
#         pm2_5_grid = group.pivot(index='lat', columns='lon', values='pm2_5')
#         pm10_grid = group.pivot(index='lat', columns='lon', values='pm10')
#         missing_grid = group.pivot(index='lat', columns='lon', values='pm10')
#         missing_grid = missing_grid.fillna(True)
#         grids[(date, time)] = {
#             'pm2_5': pm2_5_grid,
#             'pm10': pm10_grid,
#             'missing': missing_grid,
#         }

#     keys = sorted(grids.keys())  # Sort to keep ordering consistent
#     tensors = []

#     for key in keys:
#         A = grids[key]['pm2_5'] # .fillna(fill_value).values
#         B = grids[key]['pm10'] # .fillna(fill_value).values
#         M = grids[key]['missing'] # .fillna(fill_value).values

#         # Stack the three channels: shape becomes (3, height, width)
#         stacked = np.stack([A, B, M])
#         tensors.append(stacked)

#     # Final tensor: shape (num_grids, 3, height, width)
#     return np.stack(tensors), keys

In [51]:
import pandas as pd
import numpy as np

def create_grids_for_cnn(df, i_col='lon', j_col='lat', date_col='date_value', time_col='timeOfDay',
                         value_cols=['pm2_5', 'pm10', 'timeOfDay', 'day_of_week', 'distance',
                                     'bus_count', 'missing'], grid_size=30):
    """
    Create 30x30 grids for each (date_value, timeOfDay) combination.
    
    Parameters:
    - df: Input DataFrame
    - i_col, j_col: columns for longitude (x) and latitude (y)
    - date_col, time_col: columns to group by
    - value_cols: list of value columns to create channels for (e.g., ['A', 'B', 'missing'])
    - grid_size: size of the output grid (default 30x30)

    Returns:
    - grids_array: numpy array of shape (num_samples, channels, grid_size, grid_size)
    - keys: list of (date_value, timeOfDay) tuples
    """

    grouped = df.groupby([date_col, time_col])
    keys = []
    grid_list = []

    for key, group in grouped:
        keys.append(key)
        channels = []
        
        for val_col in value_cols:
            pivot = group.pivot(index=j_col, columns=i_col, values=val_col)
            pivot = pivot.sort_index(axis=0).sort_index(axis=1)

            lat_vals = pivot.index.to_numpy()
            lon_vals = pivot.columns.to_numpy()
            
            h, w = len(lat_vals), len(lon_vals)
            
            # Calculate start positions to center the data
            y_start = (grid_size - h) // 2
            x_start = (grid_size - w) // 2
            
            # Initialize empty grid
            grid = np.full((grid_size, grid_size), np.nan if val_col != 'missing' else 1, dtype=np.float32)

            # # Create an empty grid
            # grid = np.ones((grid_size, grid_size), dtype=np.float32)

            # h, w = pivot.shape
            # y_start = (grid_size - h) // 2
            # x_start = (grid_size - w) // 2

            y_end = y_start + h
            x_end = x_start + w

            if y_end > grid_size or x_end > grid_size:
                raise ValueError(f"Data for {key} too large to fit in {grid_size}x{grid_size} grid")

            # if val_col != 'missing':
            #     grid[y_start:y_end, x_start:x_end] = pivot.values
            # else:
            #     # For 'missing': treat any present (i, j) as 1 (True), others remain 0
            #     grid[y_start:y_end, x_start:x_end] = ~pivot.isna().values.astype(np.uint8)

            for yi, lat in enumerate(lat_vals):
                for xi, lon in enumerate(lon_vals):
                    value = pivot.at[lat, lon]
                    # if pd.isna(value):
                    #     continue
                    
                    grid[int(lat), int(lon)] = value


            channels.append(grid)
        
        # Stack channels (shape: 3 x 30 x 30)
        stacked = np.stack(channels, axis=0)
        grid_list.append(stacked)

    # Final output: (num_samples, channels, height, width)
    grids_array = np.stack(grid_list, axis=0)
    grids_array[:, :2, :, :] = np.where(
        grids_array[:, :2, :, :] == 1,
        np.nan,
        grids_array[:, :2, :, :]
    )

    grids_array[:, 2, :, :] = np.where(
        np.isnan(grids_array[:, 2, :, :]),
        1,
        grids_array[:, 2, :, :]
    )
    
    return grids_array, keys


In [52]:
dense_df1['day_of_week'] = dense_df1['date_value'].dt.dayofweek
full_dense_df1['day_of_week'] = full_dense_df1['date_value'].dt.dayofweek

In [53]:
dense_data_1, dates1 = create_grids_for_cnn(dense_df1)

In [54]:
dense_data_1.shape

(3185, 7, 30, 30)

In [55]:
dense_df1[(dense_df1['date_value'] == '2020-11-01') & (dense_df1['timeOfDay'] == 0) & (
    dense_df1['lon'] == 23
)]

Unnamed: 0,timeOfDay,date_value,lat,lon,distance,bus_count,pm2_5,pm10,missing,day_of_week
0,0.0,2020-11-01,3.0,23.0,0.000447,0.0,133.123529,148.118824,True,6
4,0.0,2020-11-01,4.0,23.0,0.000447,0.142857,333.51,367.575,True,6
6,0.0,2020-11-01,5.0,23.0,0.000447,0.008403,207.228235,226.503529,True,6
9,0.0,2020-11-01,6.0,23.0,0.000447,0.0,199.0,219.41,True,6
22,0.0,2020-11-01,9.0,23.0,0.000447,0.0,355.829045,403.913058,True,6
30,0.0,2020-11-01,10.0,23.0,0.000447,0.0,171.081717,212.435964,True,6
43,0.0,2020-11-01,11.0,23.0,0.000447,0.033649,224.174187,249.394728,True,6
90,0.0,2020-11-01,17.0,23.0,0.000447,0.033649,224.174187,249.394728,True,6
106,0.0,2020-11-01,19.0,23.0,0.000447,0.033649,224.174187,249.394728,True,6
112,0.0,2020-11-01,20.0,23.0,0.000447,0.033649,224.174187,249.394728,True,6


In [56]:
dense_data_1[0, 0, :, 23]

array([      nan,       nan,       nan, 133.12354, 333.51   , 207.22824,
       199.     ,       nan,       nan, 355.82904, 171.08171, 224.1742 ,
             nan,       nan,       nan,       nan,       nan, 224.1742 ,
             nan, 224.1742 , 224.1742 ,       nan,       nan,       nan,
             nan,       nan,       nan,       nan,       nan,       nan],
      dtype=float32)

In [57]:
dense_data_1[0, 1, :, 23]

array([      nan,       nan,       nan, 148.11882, 367.575  , 226.50352,
       219.41   ,       nan,       nan, 403.91306, 212.43596, 249.39473,
             nan,       nan,       nan,       nan,       nan, 249.39473,
             nan, 249.39473, 249.39473,       nan,       nan,       nan,
             nan,       nan,       nan,       nan,       nan,       nan],
      dtype=float32)

In [58]:
dense_data_1[0, 2, :, 23]

array([1., 1., 1., 0., 0., 0., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 1.,
       0., 1., 0., 0., 1., 1., 1., 1., 1., 1., 1., 1., 1.], dtype=float32)

In [59]:
dates1[1000]

(Timestamp('2020-11-29 00:00:00'), 0.588235294117647)

In [60]:
dense_df1[(dense_df1['date_value'] == '2020-11-29') & (dense_df1['timeOfDay'] == 0.588235294117647) & (
    # dense_df1['missing'] == False
    dense_df1['lon'] == 15
)]

Unnamed: 0,timeOfDay,date_value,lat,lon,distance,bus_count,pm2_5,pm10,missing,day_of_week
133035,0.588235,2020-11-29,11.0,15.0,0.003087,0.142857,78.56,85.52,False,6
133055,0.588235,2020-11-29,13.0,15.0,0.002648,0.0,98.62,103.85,False,6
133063,0.588235,2020-11-29,14.0,15.0,0.002648,0.0,95.0,100.0,False,6
133079,0.588235,2020-11-29,16.0,15.0,0.004846,0.058219,197.255572,216.856721,True,6
133086,0.588235,2020-11-29,17.0,15.0,0.004846,0.058219,197.255572,216.856721,True,6
133092,0.588235,2020-11-29,18.0,15.0,0.002648,0.0,103.25,109.86,False,6
133101,0.588235,2020-11-29,19.0,15.0,0.004846,0.058219,197.255572,216.856721,True,6
133123,0.588235,2020-11-29,22.0,15.0,0.004846,0.058219,197.255572,216.856721,True,6


In [61]:
dense_data_1[1000, 1, :, 15]

array([      nan,       nan,       nan,       nan,       nan,       nan,
             nan,       nan,       nan,       nan,       nan,  85.52   ,
             nan, 103.85   , 100.     ,       nan, 216.85672, 216.85672,
       109.86   , 216.85672,       nan,       nan, 216.85672,       nan,
             nan,       nan,       nan,       nan,       nan,       nan],
      dtype=float32)

## Saving the grids

In [62]:
dense_data_1, dates1 = create_grids_for_cnn(dense_df1)

In [63]:
full_dense_data_1, full_dates1 = create_grids_for_cnn(full_dense_df1)

In [64]:
np.savez_compressed("dense_grid.npz", grids=dense_data_1, keys=np.array(dates1, dtype=object))
np.savez_compressed("full_dense_grid.npz", 
                    grids=full_dense_data_1, keys=np.array(full_dates1, dtype=object))

In [65]:
(full_dense_data_1 != dense_data_1).mean()

0.7571306969674316

## Imputing Grid with IDW

In [66]:
from scipy.spatial import cKDTree
import numpy as np

def fill_nans_with_idw(grid, mask, k=3):
    """
    Fill NaNs in a 2D grid using Inverse Distance Weighting based on k nearest neighbors.

    Parameters:
    - grid: 2D NumPy array with NaNs
    - mask: 2D boolean array where True indicates valid data locations (same for all grids)
    - k: number of neighbors to consider (default 3)

    Returns:
    - Filled 2D NumPy array
    """
    filled = grid.copy()
    valid_coords = np.argwhere(mask)
    # print(valid_coords)
    tree = cKDTree(valid_coords)

    nan_coords = np.argwhere(1 - mask)
    # print(nan_coords)

    # for y, x in nan_coords:
    #     # Find k nearest neighbors from valid_coords
    #     dists, idxs = tree.query([y, x], k=k)
    #     if np.any(dists == 0):  # Just in case of exact match
    #         filled[y, x] = grid[tuple(valid_coords[idxs[dists == 0][0]])]
    #     else:
    #         weights = 1 / dists
    #         values = np.array([grid[tuple(valid_coords[i])] for i in idxs])
    #         filled[y, x] = np.sum(weights * values) / np.sum(weights)

    # return filled

    for y, x in nan_coords:
        # Adjust k if there are fewer valid points
        k_actual = min(k, len(valid_coords))
        dists, idxs = tree.query([y, x], k=k_actual)

        # Ensure both dists and idxs are arrays
        if k_actual == 1:
            dists = np.array([dists])
            idxs = np.array([idxs])

        # Handle exact match (distance 0)
        if np.any(dists == 0):
            filled[y, x] = grid[:, :, valid_coords[idxs[dists == 0][0]][0], valid_coords[idxs[dists == 0][0]][1]]
        else:
            # print(dists)
            weights = 1 / np.reshape(dists, (1, len(dists), 1, 1))
            # print(weights.shape)
            values = np.array([grid[:, :, valid_coords[i][0], valid_coords[i][1]] for i in idxs])
            filled[:, :, y, x] = np.sum(weights * values, axis=(0, 1)) / np.sum(weights)

    return filled



In [67]:
dense_data_2 = dense_data_1.copy()
dense_data_2[:, :-1, :, :] = fill_nans_with_idw(dense_data_1[:, :-1, :, :], 1 - np.isnan(dense_data_1[0][0]))

In [68]:
r = 15
i = 1000
dense_data_1[i, 0, :, r], dense_data_2[i, 0, :, r]

(array([      nan,       nan,       nan,       nan,       nan,       nan,
              nan,       nan,       nan,       nan,       nan,  78.56   ,
              nan,  98.62   ,  95.     ,       nan, 197.25557, 197.25557,
        103.25   , 197.25557,       nan,       nan, 197.25557,       nan,
              nan,       nan,       nan,       nan,       nan,       nan],
       dtype=float32),
 array([113.19196 , 112.989876, 129.17752 , 130.79805 , 138.70482 ,
        139.60683 , 147.44933 , 148.72855 , 150.93924 , 138.45573 ,
         93.885445,  78.56    , 157.69038 ,  98.62    ,  95.      ,
        163.17038 , 197.25557 , 197.25557 , 103.25    , 197.25557 ,
        197.25557 , 197.25557 , 197.25557 , 197.25557 , 197.25557 ,
        197.25557 , 197.25557 , 197.25557 , 197.25557 , 197.25557 ],
       dtype=float32))

In [69]:
full_dense_data_2 = full_dense_data_1.copy()
full_dense_data_2[:, :-1, :, :] = fill_nans_with_idw(
    full_dense_data_1[:, :-1, :, :], 1 - np.isnan(full_dense_data_1[0][0])
)

In [70]:
dense_data_2 = np.nan_to_num(dense_data_2, nan = 1)
full_dense_data_2 = np.nan_to_num(full_dense_data_2, nan = 1)

In [71]:
for i in range(7):
    print(i, np.unique(dense_data_2[:, i, :, :])[:5])
for i in range(7):
    print(i, np.unique(full_dense_data_2[:, i, :, :])[:5])

0 [ 9.95     12.08     12.68     13.570306 13.93    ]
1 [10.06     13.07     13.76     13.946713 14.      ]
2 [0.         0.02941176 0.05882353 0.0882353  0.11764706]
3 [0. 1. 2. 3. 4.]
4 [0.0000000e+00 3.2889571e-07 3.9015936e-07 6.2648985e-07 6.5237157e-07]
5 [0.0000000e+00 1.3015766e-33 1.5994865e-33 1.6558399e-33 1.7867520e-33]
6 [0. 1.]
0 [ 9.95 12.08 12.68 13.93 14.11]
1 [10.06 13.07 13.76 14.   15.08]
2 [0.         0.02941176 0.05882353 0.0882353  0.11764706]
3 [0. 1. 2. 3. 4.]
4 [0.0000000e+00 3.2889571e-07 3.9015936e-07 6.2648985e-07 6.5237157e-07]
5 [0.0000000e+00 1.7546429e-28 3.5092858e-28 4.1591536e-28 6.3599749e-28]
6 [0. 1.]


In [72]:
np.savez_compressed("idw_dense_grid.npz", grids=dense_data_2, keys=np.array(dates1, dtype=object))
np.savez_compressed("idw_full_dense_grid.npz", 
                    grids=full_dense_data_2, keys=np.array(full_dates1, dtype=object))

## Calculations for Satellite Image

In [73]:
df1 = pd.read_csv('/kaggle/input/airdelhi-tabularengineering/tabular_data.csv')

In [74]:
lat = np.sort(df1['lat'].unique())
lon = np.sort(df1['lon'].unique())

In [75]:
lat1 = lat - np.roll(lat, 1)
lon1 = lon - np.roll(lon, 1)
np.min(lat1[1:]), np.min(lon1[1:])

(0.008999999999996788, 0.010199999999997544)

In [76]:
lat.min() - 0.009 * 2, lat.max() + 0.009

(28.462, 28.723)

In [77]:
lon.min() - 0.0102 * 4, lon.max() + 0.0102 * 4

(77.05919999999999, 77.355)