In [1]:
import numpy as np # linear algebrae
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge, LinearRegression

import os

# from polire import IDW

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.preprocessing import MinMaxScaler

from datetime import date, timedelta
import datetime

import json
import pickle as pkl

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import SGD, Adam
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torch.nn import MSELoss

from tqdm import tqdm

In [2]:
class CFG:
    evaluation_time_gap = 1
    convert_numpy = False
    target_list = ['pm2_5', 'pm10']
    features = ['timeOfDay', 'lat', 'lon', 'distance', 'bus_count', 'day_of_week', 'pm2_5', 'pm10']

    batch_size = 128
    num_epochs = 50
    day_len = 35

    # model_size = 'vl'
    model_dropout = 0.2

    hidden_dim = 64
    num_layers = 9
    
    lr = 3e-3
    patience=2
    factor=0.9

    model_kind="gru"
    bidirectional=False
    
    pm2_5_thresholds = [0, 30, 60, 90, 120, 250, 2000]
    pm10_thresholds = [0, 50, 100, 250, 350, 430, 2000]
    aqi_category = ["Good", "Satisfactory", "Moderate", "Poor", "Very Poor", "Severe"]
    
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

## Dataset

In [3]:
class SequenceDataset(Dataset):

    def __init__(self, df, features = None, sequence_length=70, forecast_horizon=35):
        
        features = CFG.features if features is None else features
        self.features = features
        
        self.sequence_length = sequence_length
        self.forecast_horizon = forecast_horizon

        self.split_df(df)

    def split_df(self, df):
        df = df.sort_values(['lat', 'lon', 'date_value', 'timeOfDay'])
        lat_lon_pairs = df.groupby(['lat', 'lon'])['pm2_5'].count().reset_index()[['lat', 'lon']].values

        self.sequences = []

        for i in lat_lon_pairs:
            lat_lon_values = df[(df['lat'] == i[0]) & (df['lon'] == i[1])]
            self.sequences.append(lat_lon_values)

        self.X_seq = []
        self.y_seq = []
        self.masked_seq = []

        self.total_len = 0
        
        for s in self.sequences:
            s = s.sort_values(['date_value', 'timeOfDay'])
            self.X_seq.append(np.array(s[self.features].values, dtype=np.float32))
            self.y_seq.append(np.array(s[CFG.target_list].values, dtype=np.float32))
            self.masked_seq.append(1 - np.array(s['missing'].values, dtype=np.int8))
            # self.masked_seq.append(np.array(s['missing'].values, dtype=np.int8))

            curr_len = len(self.y_seq[-1])
            curr_len -= self.sequence_length  + self.forecast_horizon - CFG.day_len
            self.total_len += curr_len // CFG.day_len
            self.per_len = curr_len // CFG.day_len
    
    def __len__(self):
        return self.total_len

    def get_index(self, i):
        lat_lon = i // self.per_len
        within_idx = i % self.per_len
        return lat_lon, within_idx
    
    def __getitem__(self, i):
        lat_lon, within_idx = self.get_index(i)

        within_idx *= CFG.day_len
        start = within_idx
        X_end = within_idx + self.sequence_length
        y_end = X_end + self.forecast_horizon
        
        
        X = torch.tensor(self.X_seq[lat_lon][start:X_end]).to(device)
        y = torch.tensor(self.y_seq[lat_lon][X_end:y_end]).to(device)
        masks = torch.tensor(self.masked_seq[lat_lon][X_end:y_end] # , dtype=torch.int32
                            ).to(device)
        masks = torch.reshape(masks, (-1, 1))
        return X, y, masks

In [4]:
df = pd.read_csv('/kaggle/input/airdelhi-baselines-deepengineering/dense_df.csv')

df = df.drop(columns = 'Unnamed: 0')
df['date_value'] = pd.to_datetime(df['date_value'])

dates = pd.to_datetime([df['date_value'].min(), df['date_value'].max()])

max_train_date = dates.min() + (dates.max() - dates.min()) * 0.75
max_train_date = max_train_date.floor("D")
min_train_date = df['date_value'].min()
max_date = df['date_value'].max().floor("D")

metrics_dict = {
    'MSE': mean_squared_error, 
    'r2 score': r2_score, 
    'MAE': mean_absolute_error,
}

target_list = CFG.target_list

features = ['date_value', 'timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']

CFG.base_features = ['timeOfDay', 'lat', 'lon', 'day_of_week', 'distance', 'bus_count']
CFG.features = CFG.base_features

min_train_date, max_train_date

(Timestamp('2020-11-01 00:00:00'), Timestamp('2021-01-07 00:00:00'))

In [5]:
def add_lag_features(df, lags = [1]):

    df = df.copy()
    
    df.sort_values(by=["lat", "lon", "timeOfDay", "date_value"], inplace=True)

    added_features = []
    for l in lags:
        df[f'pm2_5_lag_{l}'] = df.groupby(
            ['timeOfDay', 'lat', 'lon'])['pm2_5'].shift(l)
        df[f'pm10_lag_{l}'] = df.groupby(
            ['timeOfDay', 'lat', 'lon'])['pm10'].shift(l)

        added_features.append(f'pm2_5_lag_{l}')
        added_features.append(f'pm10_lag_{l}')

        df.sort_values(by=["lat", "lon", "date_value"], inplace=True)

    # Group by latitude and longitude
    grouped = df.groupby(["lat", "lon"])

    # Function to fill NaN values based on previous mean
    def fill_na_with_previous_mean(group):
        for col in group.columns:
            if col not in ["date_value", "lat", "lon"]:
                group[col] = group[col].astype(float)  # Ensure numeric columns
                group[col] = group[col].fillna(group[col].expanding().mean().shift())  # Previous days' mean
                
                # If still NaN (first row), replace with overall mean
                overall_mean = df[col].mean(skipna=True)
                group[col] = group[col].fillna(overall_mean)
        return group

    # Apply the function to each group
    df = grouped.apply(fill_na_with_previous_mean)
    df.reset_index(drop=True, inplace=True)
    df = df.sort_values(by = ['date_value', 'timeOfDay', 'lat', 'lon'])
    df.reset_index(drop=True, inplace=True)
    CFG.features += added_features
    return df
    
df = add_lag_features(df, lags = [7])

  df = grouped.apply(fill_na_with_previous_mean)


In [6]:
df = df.sort_values(by=['lat', 'lon', 'timeOfDay', 'date_value'])

In [7]:
# df['missing'].mean()

In [8]:
# df[df['missing'] == 0].tail(200).head(30)
# df.tail(180).head(30)

In [9]:
class CustomScaler:

    def __init__(self, df):
        self.pm2_5_max = float(df['pm2_5'].max())
        self.pm2_5_min = float(df['pm2_5'].min())
        
        self.pm10_max = float(df['pm10'].max())
        self.pm10_min = float(df['pm10'].min())

    def transform(self, df):
        df = df.copy()
        df['pm2_5'] = (df['pm2_5'] - self.pm2_5_min) / (self.pm2_5_max - self.pm2_5_min)
        df['pm10'] = (df['pm10'] - self.pm10_min) / (self.pm10_max - self.pm10_min)
        return df

    def inverse_transform(self, X):
        X[:, 0] = X[:, 0] * (self.pm2_5_max - self.pm2_5_min) + self.pm2_5_min
        X[:, 1] = X[:, 1] * (self.pm10_max - self.pm10_min) + self.pm10_min
        return X

target_scaler = CustomScaler(df)
df = target_scaler.transform(df)

CFG.features = ['timeOfDay', 'lat', 'lon', 'distance', 'bus_count', 
                'day_of_week', 'pm2_5', 'pm10', 'pm2_5_lag_7', 'pm10_lag_7']

df['day_of_week'] = df['date_value'].dt.dayofweek

scaler = MinMaxScaler()
df[CFG.features] = scaler.fit_transform(df[CFG.features])

In [10]:
train_ds = SequenceDataset(df[df['date_value'] <= max_train_date])
test_ds = SequenceDataset(df[df['date_value'] > max_train_date])

train_dl = DataLoader(train_ds, batch_size = CFG.batch_size, shuffle=True)
test_dl = DataLoader(test_ds, batch_size = CFG.batch_size, shuffle=True)

In [11]:
xt, yt, mt = train_ds[0]
xt.shape, yt.shape, mt.shape

(torch.Size([70, 10]), torch.Size([35, 2]), torch.Size([35, 1]))

## RNN

In [12]:
class DirectMultiStepPredictor(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, dropout=0.2,
                 output_size=2, forecast_horizon=35, model_kind = None):
        """
        RNN-based model for multi-step time-series forecasting.

        Args:
            input_size (int): Number of input features.
            hidden_size (int): Hidden state size in RNN.
            num_layers (int): Number of stacked RNN layers.
            output_size (int): Number of target variables.
            forecast_horizon (int): Number of future time-steps to predict.
        """
        super(DirectMultiStepPredictor, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.forecast_horizon = forecast_horizon

        if model_kind is None:
            model_kind = CFG.model_kind

        if model_kind == 'rnn':
            self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True, dropout=dropout)
        elif model_kind == 'lstm':
            self.rnn = nn.LSTM(input_size, hidden_size, num_layers, 
                               batch_first=True, dropout=dropout,
                               bidirectional = CFG.bidirectional)
        elif model_kind == 'gru':
            self.rnn = nn.GRU(input_size, hidden_size, num_layers, 
                              batch_first=True, dropout=dropout,
                              bidirectional = CFG.bidirectional)

        self.rnn = self.rnn.to(device)

        # Fully connected layer maps hidden state to output
        self.fc = nn.Linear(hidden_size, forecast_horizon * output_size).to(device)

        self._initialize_weights()

    def _initialize_weights(self):
        """Initialize weights using Xavier for RNN and linear layers."""
        for name, param in self.named_parameters():
            if 'weight_ih' in name:  # Input-hidden weights
                nn.init.xavier_uniform_(param)
            elif 'weight_hh' in name:  # Hidden-hidden weights
                nn.init.orthogonal_(param)  # Orthogonal init for stability
            elif 'bias' in name:
                nn.init.zeros_(param)  # Zero bias for stability
            elif 'fc' in name:  # Fully connected layer
                nn.init.kaiming_uniform_(param, nonlinearity='relu')

    def forward(self, x):
        batch_size = x.size(0)
        
        # hidden state
        h0 = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(x.device)
        out, _ = self.rnn(x, h0)  # out: (batch_size, sequence_length, hidden_size)
        last_hidden_state = out[:, -1, :]  # Shape: (batch_size, hidden_size)

        # Fully connected layer to map hidden state to output
        out = self.fc(last_hidden_state)  # Shape: (batch_size, forecast_horizon * output_size)

        # Reshape to (batch_size, forecast_horizon, output_size)
        out = out.view(batch_size, self.forecast_horizon, -1)
        return out

def get_optimizer(model, lr = 1e-4, weight_decay = 1e-5):
    optimizer = Adam(params = model.parameters(), 
                     lr = lr, weight_decay = weight_decay)
    return optimizer

def get_reduce_lr(optimizer, factor=0.1, patience=2):
    scheduler = ReduceLROnPlateau(optimizer, mode = 'min', factor = factor, patience = patience)
    return scheduler

In [13]:
def get_model(input_size = 10, hidden_size = 32, num_layers = 3, 
              forecast_horizon=35, model_kind=None):
    if input_size is None:
        input_size = len(CFG.features)
    model = DirectMultiStepPredictor(input_size, hidden_size, num_layers, 
                               output_size=2, forecast_horizon=forecast_horizon, 
                                     model_kind=model_kind,
                                    dropout=CFG.model_dropout)
    return model

## Train and Validation Functions

In [14]:
class MaskedMSELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super(MaskedMSELoss, self).__init__()
        self.eps = eps

    def forward(self, preds, targets, mask):
        # print(preds.shape, targets.shape, mask.shape)
        loss = (preds - targets) ** 2
        masked_loss = (loss * mask).sum() / (mask.sum() + self.eps)
        return masked_loss

class MaskedMAELoss(nn.Module):
    def __init__(self, eps=1e-6):
        super(MaskedMAELoss, self).__init__()
        self.eps = eps

    def forward(self, preds, targets, mask):
        loss = abs(preds - targets)
        masked_loss = (loss * mask).sum() / (mask.sum() + self.eps)
        return masked_loss

class MaskedR2(nn.Module):
    def __init__(self, eps=1e-6):
        super(MaskedR2, self).__init__()
        self.eps = eps

    def forward(self, preds, targets, mask):
        error = ((targets - preds) ** 2) * mask
        masked_mse = error.sum() / (mask.sum() + self.eps)
        mean_y = (targets * mask).sum() / (mask.sum() + self.eps)
        total_var = (((targets - mean_y) ** 2) * mask).sum() / (mask.sum() + self.eps)
        r2 = 1 - (masked_mse / (total_var + self.eps))
        return r2

masked_metrics_dict = {
    'MSE': MaskedMSELoss(),
    'MAE': MaskedMAELoss(),
    'r2 score': MaskedR2(),
}

In [15]:
class RunningLoss:
    def __init__(self, window = 10):
        self.loss = 0
        self.total = 0
        self.loss_last = []
        self.total_last = []

        self.window = window
    
    def update(self, loss, batch_size):
        total = 1
        self.loss += loss
        self.total += 1
        
        self.loss_last.append(loss)
        self.total_last.append(total)
        if len(self.loss_last) > self.window:
            self.loss_last.pop(0)
            self.total_last.pop(0)
    
    def reset(self):
        self.loss = 0
        self.total = 0
        self.loss_last = []
        self.total_last = []

    def print(self):
        print(f"Accuracy: {self.loss / self.total}")
    
    def get_curr_stats(self):
        return sum(self.loss_last) / sum(self.total_last)

    def total_stats(self):
        return self.loss / self.total


In [16]:
def validate(
    model,
    test_dl,
    criterion,
    verbose=True,
    get_predict=False,
):
    model.eval()
    running_loss = RunningLoss()

    predict_array = []
    true_array = []
    mask_array = []
    
    if verbose >= 1:
        pbar = tqdm(total = len(test_dl), ncols = 110, desc = "Validation Progress") # , dynamic_ncols=True, leave=False)
    else:
        pbar = None

    with torch.no_grad():
        for i, (x, y, mask) in enumerate(test_dl):
            x, y, mask = x.to(device), y.to(device), mask.to(device)
            pred = model(x)

            pred = target_scaler.inverse_transform(pred)
            y = target_scaler.inverse_transform(y)

            temp_loss = criterion(pred, y, mask)
            running_loss.update(temp_loss.item(), batch_size=x.shape[0])

            if get_predict:
                predict_array.append(pred)
                true_array.append(y)
                mask_array.append(mask)

            if verbose >= 1:
                total = len(test_dl)
                curr_loss = np.sqrt(running_loss.get_curr_stats())
                pbar.set_description(f"Validation Step {i} / {total}")
                pbar.set_postfix(Loss=f"{curr_loss:.4f}")
                pbar.update(1)
        if verbose >= 1:
            pbar.close()

    model.train()
    if get_predict:
        return predict_array, true_array, mask_array, np.sqrt(running_loss.total_stats())
    
    return np.sqrt(running_loss.total_stats())


In [17]:
def train(
        model,
        criterion,
        optimizer,
        num_epochs,
        train_dataloader,
        val_dataloader,
        scheduler=None,
        do_validate=False,
        validate_frequency=1,
        verbose=True,
        metrics_dict=None,
        model_name = "",
):
    if metrics_dict is None:
        metrics_dict = masked_metrics_dict
    
    loss_list = []
    val_loss_list = []
    all_loss_list = []

    best_val_loss = 100000000
    best_val_loss25 = 100000000
    best_val_loss10 = 100000000
    
    for epoch in tqdm(range(num_epochs), disable=verbose >= -1):
        running_loss = RunningLoss()
        if verbose >= 1:
            pbar = tqdm(total = len(train_dataloader), ncols = 110, desc = "Training Progress") # , dynamic_ncols=True, leave=False)
        else:
            pbar = None
        for i, (x, y, mask) in enumerate(train_dataloader):
            x, y, mask = x.to(device), y.to(device), mask.to(device)

            optimizer.zero_grad()
            
            pred = model(x)
            loss = criterion(pred, y, mask)
            loss.backward()

            optimizer.step()

            all_loss_list.append(loss.item())

            with torch.no_grad():
                pred = target_scaler.inverse_transform(pred)
                y = target_scaler.inverse_transform(y)
                temp_loss = torch.sqrt(criterion(pred, y, mask))
                running_loss.update(temp_loss.item(), batch_size=x.shape[0])

            if verbose >= 1:
                total = len(train_dataloader)
                curr_loss = running_loss.get_curr_stats()
                pbar.set_description(f"Epoch {epoch}: Step {i} / {total}")
                pbar.set_postfix(Loss=f"{curr_loss:.4f}")
                pbar.update(1)
        
        if verbose >= 1:
            print(f"Train Loss Total: {running_loss.total_stats()}")
            pbar.close()
        
        if do_validate and epoch % validate_frequency == validate_frequency - 1:
            pred1, y1, mask1, val_loss = validate(model, val_dataloader, criterion, 
                                           verbose=verbose, get_predict=True)
            pred1 = torch.concat(pred1)
            y1 = torch.concat(y1)
            mask1 = torch.concat(mask1)
            
            val_loss_list.append(val_loss)
            if scheduler is not None:
                scheduler.step(val_loss)
            
            if verbose >= 1:
                print(f"Val Loss Total: {val_loss}")

            if val_loss < best_val_loss:
                if verbose >= 0:
                    print(f"Better Val Loss: {val_loss} < {best_val_loss}")
                best_val_loss = val_loss
                torch.save(model.state_dict(), f"./{model_name}best.pth")
            
            for i, t in enumerate(CFG.target_list):
                for n, m in metrics_dict.items():
                    if n == 'MSE':
                        # print(y1[:, :, i].shape, pred1[:, :, i].shape, mask1.shape)
                        mse_loss = np.sqrt(m(pred1[:, :, i], y1[:, :, i], mask1[:, :, 0]).cpu().numpy())
                        if verbose >= 1:
                            print(f"{t}: {n}: {mse_loss}")
                        
                        if i == 0 and mse_loss < best_val_loss25:
                            if verbose >= 0:
                                print(f"----------Better MSE on pm2_5 {mse_loss} < {best_val_loss25}")
                            torch.save(model.state_dict(), f"./{model_name}2_5.pth")
                            best_val_loss25 = mse_loss
                        if i == 1 and mse_loss < best_val_loss10:
                            if verbose >= 0:
                                print(f"----------Better MSE on pm10 {mse_loss} < {best_val_loss10}")
                            torch.save(model.state_dict(), f"./{model_name}10.pth")
                            best_val_loss10 = mse_loss
                    
                    if verbose >= 1:
                        if n == 'MSE':
                            mse_loss = np.sqrt(m(pred1[:, :, i], y1[:, :, i], mask1[:, :, 0]).cpu().numpy())
                            print(f"{t}: {n}: {mse_loss}")
                        else:
                            print(f"{t}: {n}: {(m(pred1[:, :, i], y1[:, :, i], mask1[:, :, 0]))}")
        
        epoch_loss = running_loss.total_stats()
        loss_list.append(epoch_loss)
    
    return loss_list, val_loss_list, all_loss_list


In [18]:
# model = get_model(hidden_size = CFG.hidden_dim, num_layers = CFG.num_layers).to(device)
# criterion = MaskedMSELoss().to(device)
# optimizer = get_optimizer(model, lr = CFG.lr)
# scheduler = get_reduce_lr(optimizer, factor=CFG.factor, patience=CFG.patience)

In [19]:
def get_aqi_category_indices(preds, thresholds):
    indices = torch.bucketize(preds, torch.tensor(thresholds).to(device), right=False) - 1
    return torch.clamp(indices, min=0, max=5)

def compute_aqi_classification_metrics(pred, label, mask):
    """
    Compute per-class accuracy, precision, and recall separately for PM2.5 and PM10.

    Args:
        pred: Tensor of predicted AQI values of shape [N, S, 2].
        label: Tensor of ground truth AQI values of shape [N, S, 2].
        mask: Tensor of 0s and 1s indicating valid positions of shape [N, S, 1].

    Returns:
        Dictionary with separate per-class metrics for PM2.5 and PM10.
    """
    if mask.shape[-1] == 1:
        mask = mask.expand(-1, -1, 2)
    mask = mask.bool()

    metrics = {}
    for i, (name, thresholds) in enumerate(zip(CFG.target_list, [CFG.pm2_5_thresholds, CFG.pm10_thresholds])):
        pred_class = get_aqi_category_indices(pred[..., i], thresholds)
        label_class = get_aqi_category_indices(label[..., i], thresholds)

        pred_class = pred_class[mask[..., i]]
        label_class = label_class[mask[..., i]]

        class_metrics = {}
        for class_idx, class_name in enumerate(CFG.aqi_category):
            true_pos = ((pred_class == class_idx) & (label_class == class_idx)).sum().item()
            total_pred = (pred_class == class_idx).sum().item()
            total_true = (label_class == class_idx).sum().item()

            accuracy = true_pos / total_true if total_true else 0.0
            precision = true_pos / total_pred if total_pred else 0.0
            recall = true_pos / total_true if total_true else 0.0

            class_metrics[class_name] = {
                "true": true_pos,
                "total": total_true,
                "accuracy": accuracy,
                "precision": precision,
                "recall": recall
            }

        metrics[name] = class_metrics

    return metrics


In [20]:
def predict_eval(model, test_dl, criterion, verbose=True):
    o = validate(model, test_dl, criterion, verbose=verbose, get_predict=True)
    pred = torch.concat(o[0])
    y = torch.concat(o[1])
    mask = torch.concat(o[2])
    model_performance = {0 : {}, 1 : {}}
    for i, t in enumerate(CFG.target_list):
        for n, m in masked_metrics_dict.items():
            l = m(pred[:, :, i], y[:, :, i], mask[:, :, 0])
            if n == 'MSE':
                # print(y1[:, :, i].shape, pred1[:, :, i].shape, mask1.shape)
                l = np.sqrt(l.cpu().numpy())
                if verbose:
                    print(f"{t}: {n}: {l}")
            elif verbose:
                print(f"{t}: {n}: {l}")

            model_performance[i][n] = float(l)

    model_performance['classification'] = compute_aqi_classification_metrics(pred, y, mask)
    
    return model_performance

## 2nd DF

In [21]:
full_df = pd.read_csv('/kaggle/input/airdelhi-baselines-deepengineering/full_dense_df.csv')

full_df = full_df.drop(columns = 'Unnamed: 0')
full_df['date_value'] = pd.to_datetime(full_df['date_value'])

full_df = add_lag_features(full_df, lags = [7])
CFG.features = ['timeOfDay', 'lat', 'lon', 'distance', 'bus_count', 
                'day_of_week', 'pm2_5', 'pm10', 'pm2_5_lag_7', 'pm10_lag_7']
full_df = full_df.sort_values(by=['lat', 'lon', 'timeOfDay', 'date_value'])

full_df = target_scaler.transform(full_df)

full_df['day_of_week'] = full_df['date_value'].dt.dayofweek

scaler = MinMaxScaler()
full_df[CFG.features] = scaler.fit_transform(full_df[CFG.features])

full_train_ds= SequenceDataset(full_df[full_df['date_value'] <= max_train_date])
full_test_ds = SequenceDataset(full_df[full_df['date_value'] > max_train_date])

full_train_dl = DataLoader(full_train_ds, batch_size = CFG.batch_size, shuffle=True)
full_test_dl = DataLoader(full_test_ds, batch_size = CFG.batch_size, shuffle=True)

  df = grouped.apply(fill_na_with_previous_mean)


# Ablation Studies

In [22]:
import itertools
import json

def run_grid_search(
    param_grid: dict,
        df_dict,
        model_name,
    model_save_dir: str,
    result_json_path: str,
    default_params: dict = None
):
    from copy import deepcopy

    # Generate all parameter combinations
    keys, values = zip(*param_grid.items())
    
    param_combinations = [
        {**(default_params or {}), **dict(zip(keys, v))}
        for v in itertools.product(*values)
    ]
    
    model_dict = {}
    results_list = []
    
    for idx, param_set in enumerate(param_combinations):
        print(f"Training model {idx} with params: {param_set}")
        
        model = get_model(**param_set)
        criterion = MaskedMSELoss().to(device)
        optimizer = get_optimizer(model, lr = CFG.lr)
        scheduler = get_reduce_lr(optimizer, factor=CFG.factor, patience=CFG.patience)
        
        loss_list, val_loss_list, all_loss_list = train(
            model,
            criterion,
            optimizer,
            num_epochs=CFG.num_epochs,
            train_dataloader=df_dict['train_dl'],
            val_dataloader=df_dict['test_dl'],
            scheduler=scheduler,
            do_validate=True,
            validate_frequency=1,
            verbose=-2,
            model_name=model_save_dir + f"{model_name}_{idx}_"
        )
        
        model_path = model_save_dir + f"final_{model_name}_{idx}.pt"
        torch.save(model.state_dict(), model_path)
        model_dict[idx] = model

        result_metrics = {}
        paths = ['best', '2_5', '10']
        for p in paths:
            result_metrics[p] = {}
            model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")
            model.load_state_dict(model_state_dict)
            
            for df_name, df1 in df_dict.items():
                result_metrics[p][df_name] = predict_eval(model, df1, criterion, verbose=False)

        results_list.append({
            "model_index": idx,
            "model_name": model_path,
            "parameters": deepcopy(param_set),
            "results": result_metrics
        })

    with open(result_json_path, "w") as f:
        json.dump(results_list, f, indent=4)

    return model_dict


In [23]:
param_grid = {
    'hidden_size' : [16, 32, 64, 128], 
    'num_layers' : [1, 3, 6, 9],
    'model_kind': ['rnn', 'gru']
}

default_params = {
    'input_size' : 10, 
    'forecast_horizon' : 35
}

df_dict = {
    'train_dl' : train_dl,
    'test_dl' : test_dl,
    'full_train_dl' : full_train_dl,
    'full_test_dl' : full_test_dl,
}

model_name = "model_size_test"

model_save_dir = f"./{model_name}_dir/"
result_json_path = f"{model_save_dir}{model_name}_results.json"
os.makedirs(model_save_dir, exist_ok=True)

model_dict = run_grid_search(
    param_grid,
        df_dict,
        model_name,
    model_save_dir,
    result_json_path,
    default_params
)

Training model 0 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 16, 'num_layers': 1, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:06<00:00,  1.33s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")
  indices = torch.bucketize(preds, torch.tensor(thresholds).to(device), right=False) - 1


Training model 1 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 16, 'num_layers': 1, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:06<00:00,  1.33s/it]


Training model 2 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 16, 'num_layers': 3, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:07<00:00,  1.35s/it]


Training model 3 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 16, 'num_layers': 3, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:10<00:00,  1.41s/it]


Training model 4 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 16, 'num_layers': 6, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:10<00:00,  1.41s/it]


Training model 5 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 16, 'num_layers': 6, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:12<00:00,  1.44s/it]


Training model 6 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 16, 'num_layers': 9, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:11<00:00,  1.43s/it]


Training model 7 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 16, 'num_layers': 9, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:17<00:00,  1.56s/it]


Training model 8 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 32, 'num_layers': 1, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:07<00:00,  1.35s/it]


Training model 9 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 32, 'num_layers': 1, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:08<00:00,  1.37s/it]


Training model 10 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 32, 'num_layers': 3, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:10<00:00,  1.40s/it]


Training model 11 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 32, 'num_layers': 3, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:09<00:00,  1.39s/it]


Training model 12 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 32, 'num_layers': 6, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:10<00:00,  1.40s/it]


Training model 13 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 32, 'num_layers': 6, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:11<00:00,  1.43s/it]


Training model 14 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 32, 'num_layers': 9, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:11<00:00,  1.44s/it]


Training model 15 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 32, 'num_layers': 9, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:18<00:00,  1.57s/it]


Training model 16 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 64, 'num_layers': 1, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:02<00:00,  1.26s/it]


Training model 17 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 64, 'num_layers': 1, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:03<00:00,  1.26s/it]


Training model 18 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 64, 'num_layers': 3, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:03<00:00,  1.27s/it]


Training model 19 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 64, 'num_layers': 3, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:06<00:00,  1.33s/it]


Training model 20 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 64, 'num_layers': 6, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:10<00:00,  1.40s/it]


Training model 21 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 64, 'num_layers': 6, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:16<00:00,  1.53s/it]


Training model 22 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 64, 'num_layers': 9, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:13<00:00,  1.46s/it]


Training model 23 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 64, 'num_layers': 9, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:25<00:00,  1.71s/it]


Training model 24 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 128, 'num_layers': 1, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:02<00:00,  1.25s/it]


Training model 25 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 128, 'num_layers': 1, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:06<00:00,  1.33s/it]


Training model 26 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 128, 'num_layers': 3, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:09<00:00,  1.39s/it]


Training model 27 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 128, 'num_layers': 3, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:22<00:00,  1.65s/it]


Training model 28 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 128, 'num_layers': 6, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:18<00:00,  1.56s/it]


Training model 29 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 128, 'num_layers': 6, 'model_kind': 'gru'}


100%|██████████| 50/50 [01:43<00:00,  2.07s/it]


Training model 30 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 128, 'num_layers': 9, 'model_kind': 'rnn'}


100%|██████████| 50/50 [01:25<00:00,  1.71s/it]


Training model 31 with params: {'input_size': 10, 'forecast_horizon': 35, 'hidden_size': 128, 'num_layers': 9, 'model_kind': 'gru'}


100%|██████████| 50/50 [02:03<00:00,  2.46s/it]


In [24]:
# with open('/kaggle/working/model_size_test_dir/model_size_test_results.json', 'r') as f:
#     results_dict = json.load(f)

In [25]:
# def get_best_index(results_dict):
#     best_index_rnn = 0
#     best_index_gru = 1
#     for i, results in enumerate(results_dict):
#         if results['parameters']['model_kind'] == 'rnn':
#             if results['results']['test_dl']['0']['r2 score'] > results_dict[best_index_rnn][
#             'results']['test_dl']['0']['r2 score']:
#                 best_index_rnn = i
#         else:
#             if results['results']['test_dl']['0']['r2 score'] > results_dict[best_index_rnn][
#             'results']['test_dl']['1']['r2 score']:
#                 best_index_gru = i
#     return best_index_rnn, best_index_gru

In [26]:
# get_best_index(results_dict)

## Ablation 2: Input Sequence Length

In [27]:
def get_dl(df, sequence_length=35, forecast_horizon=35):
    ds = SequenceDataset(df, sequence_length = sequence_length, forecast_horizon = forecast_horizon)
    dl = DataLoader(ds, batch_size = CFG.batch_size, shuffle=True)
    return dl

In [28]:
def run_grid_search_df(
    param_grid: dict,
        df_dict,
        model_name,
    model_save_dir: str,
    result_json_path: str,
    default_params: dict = None,
    model_params: dict = None,
):
    from copy import deepcopy

    # Generate all parameter combinations
    keys, values = zip(*param_grid.items())
    
    param_combinations = [
        {**(default_params or {}), **dict(zip(keys, v))}
        for v in itertools.product(*values)
    ]
    
    model_dict = {}
    results_list = []
    
    for idx, param_set in enumerate(param_combinations):
        print(f"Training model on dataset {idx} with params: {param_set}")

        model = get_model(**model_params)

        ds_dict = {
            k : get_dl(df = v, **param_set)
            for k, v in df_dict.items()
        }
        
        criterion = MaskedMSELoss().to(device)
        optimizer = get_optimizer(model, lr = CFG.lr)
        scheduler = get_reduce_lr(optimizer, factor=CFG.factor, patience=CFG.patience)
        
        loss_list, val_loss_list, all_loss_list = train(
            model,
            criterion,
            optimizer,
            num_epochs=CFG.num_epochs,
            train_dataloader=ds_dict['train_dl'],
            val_dataloader=ds_dict['test_dl'],
            scheduler=scheduler,
            do_validate=True,
            validate_frequency=1,
            verbose=-2,
            model_name=model_save_dir + f"{model_name}_{idx}_"
        )
        
        model_path = model_save_dir + f"final_{model_name}_{idx}.pth"
        torch.save(model.state_dict(), model_path)
        model_dict[idx] = model
        
        result_metrics = {}
        paths = ['best', '2_5', '10']
        for p in paths:
            result_metrics[p] = {}
            model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")
            model.load_state_dict(model_state_dict)
            
            for df_name, df1 in ds_dict.items():
                result_metrics[p][df_name] = predict_eval(model, df1, criterion, verbose=False)

        results_list.append({
            "model_index": idx,
            "model_name": model_path,
            "parameters": deepcopy(param_set),
            "results": result_metrics
        })

    with open(result_json_path, "w") as f:
        json.dump(results_list, f, indent=4)

    return model_dict


In [29]:
param_grid = {
    'sequence_length' : [35, 70, 105, 140, 175], 
}

default_params = {
    'input_size' : 10, 
    'forecast_horizon' : 35
}

model_params = {
    'input_size': 10,
    'forecast_horizon': 35,
    'num_layers': 3,
    'hidden_size': 64,
    'model_kind': 'rnn'
}

df_dict = {
    'train_dl' : df[df['date_value'] <= max_train_date],
    'test_dl' : df[df['date_value'] > max_train_date],
    'full_train_dl' : full_df[full_df['date_value'] <= max_train_date],
    'full_test_dl' : full_df[full_df['date_value'] > max_train_date],
}

model_name = "rnn_sequence_length_test"

model_save_dir = f"./{model_name}_dir/"
result_json_path = f"{model_save_dir}{model_name}_results.json"
os.makedirs(model_save_dir, exist_ok=True)

model_dict1 = run_grid_search_df(
    param_grid,
        df_dict,
        model_name,
    model_save_dir,
    result_json_path,
    default_params=None,
    model_params = model_params,
)

Training model on dataset 0 with params: {'sequence_length': 35}


100%|██████████| 50/50 [01:07<00:00,  1.34s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 1 with params: {'sequence_length': 70}


100%|██████████| 50/50 [01:06<00:00,  1.33s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 2 with params: {'sequence_length': 105}


100%|██████████| 50/50 [01:04<00:00,  1.29s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 3 with params: {'sequence_length': 140}


100%|██████████| 50/50 [01:04<00:00,  1.28s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 4 with params: {'sequence_length': 175}


100%|██████████| 50/50 [01:04<00:00,  1.29s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


In [30]:
model_params = {
    'input_size': 10,
    'forecast_horizon': 35,
    'num_layers': 3,
    'hidden_size': 64,
    'model_kind': 'gru'
}

df_dict = {
    'train_dl' : df[df['date_value'] <= max_train_date],
    'test_dl' : df[df['date_value'] > max_train_date],
    'full_train_dl' : full_df[full_df['date_value'] <= max_train_date],
    'full_test_dl' : full_df[full_df['date_value'] > max_train_date],
}

model_name = "gru_sequence_length_test"

model_save_dir = f"./{model_name}_dir/"
result_json_path = f"{model_save_dir}{model_name}_results.json"
os.makedirs(model_save_dir, exist_ok=True)

model_dict1 = run_grid_search_df(
    param_grid,
        df_dict,
        model_name,
    model_save_dir,
    result_json_path,
    default_params=None,
    model_params = model_params,
)

Training model on dataset 0 with params: {'sequence_length': 35}


100%|██████████| 50/50 [01:08<00:00,  1.37s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 1 with params: {'sequence_length': 70}


100%|██████████| 50/50 [01:07<00:00,  1.35s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 2 with params: {'sequence_length': 105}


100%|██████████| 50/50 [01:09<00:00,  1.39s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 3 with params: {'sequence_length': 140}


100%|██████████| 50/50 [01:12<00:00,  1.44s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 4 with params: {'sequence_length': 175}


100%|██████████| 50/50 [01:15<00:00,  1.51s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


## Ablation 3: Forecast Horizon

In [31]:
def run_grid_search_horizon(
    horizon_values,
        df_dict,
        model_name,
    model_save_dir: str,
    result_json_path: str,
    df_params: dict = None,
    model_params: dict = None,
):    
    model_dict = {}
    results_list = []
    
    for idx, h in enumerate(horizon_values):
        print(f"Training model on dataset {idx} with horizon: {h}")

        model = get_model(forecast_horizon=h, **model_params)

        ds_dict = {
            k : get_dl(df = v, forecast_horizon = h, **df_params)
            for k, v in df_dict.items()
        }
        
        criterion = MaskedMSELoss().to(device)
        optimizer = get_optimizer(model, lr = CFG.lr)
        scheduler = get_reduce_lr(optimizer, factor=CFG.factor, patience=CFG.patience)
        
        loss_list, val_loss_list, all_loss_list = train(
            model,
            criterion,
            optimizer,
            num_epochs=CFG.num_epochs,
            train_dataloader=ds_dict['train_dl'],
            val_dataloader=ds_dict['test_dl'],
            scheduler=scheduler,
            do_validate=True,
            validate_frequency=1,
            verbose=-2,
            model_name=model_save_dir + f"{model_name}_{idx}_"
        )
        
        model_path = model_save_dir + f"final_{model_name}_{idx}.pt"
        torch.save(model.state_dict(), model_path)
        model_dict[idx] = model
        
        result_metrics = {}
        paths = ['best', '2_5', '10']
        for p in paths:
            result_metrics[p] = {}
            model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")
            model.load_state_dict(model_state_dict)
            
            for df_name, df1 in ds_dict.items():
                result_metrics[p][df_name] = predict_eval(model, df1, criterion, verbose=False)

        results_list.append({
            "model_index": idx,
            "model_name": model_path,
            "horizon": h,
            "params" : model_params,
            "results": result_metrics
        })

    with open(result_json_path, "w") as f:
        json.dump(results_list, f, indent=4)

    return model_dict


In [32]:
horizon_values = [35, 70, 105, 140, 175]

model_params = {
    'input_size': 10,
    'num_layers': 3,
    'hidden_size': 64,
    'model_kind': 'rnn'
}

df_dict = {
    'train_dl' : df[df['date_value'] <= max_train_date],
    'test_dl' : df[df['date_value'] > max_train_date],
    'full_train_dl' : full_df[full_df['date_value'] <= max_train_date],
    'full_test_dl' : full_df[full_df['date_value'] > max_train_date],
}

df_params = {
    'sequence_length' : 105,
}

model_name = "rnn_forecast_horizon_test"

model_save_dir = f"./{model_name}_dir/"
result_json_path = f"{model_save_dir}{model_name}_results.json"
os.makedirs(model_save_dir, exist_ok=True)

model_dict2 = run_grid_search_horizon(
    horizon_values,
        df_dict,
        model_name,
    model_save_dir,
    result_json_path,
    df_params=df_params,
    model_params = model_params,
)

Training model on dataset 0 with horizon: 35


100%|██████████| 50/50 [01:07<00:00,  1.34s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 1 with horizon: 70


100%|██████████| 50/50 [01:06<00:00,  1.33s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 2 with horizon: 105


100%|██████████| 50/50 [01:03<00:00,  1.26s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 3 with horizon: 140


100%|██████████| 50/50 [01:01<00:00,  1.23s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 4 with horizon: 175


100%|██████████| 50/50 [01:00<00:00,  1.21s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


In [33]:
model_params = {
    'input_size': 10,
    'num_layers': 3,
    'hidden_size': 64,
    'model_kind': 'gru'
}

df_dict = {
    'train_dl' : df[df['date_value'] <= max_train_date],
    'test_dl' : df[df['date_value'] > max_train_date],
    'full_train_dl' : full_df[full_df['date_value'] <= max_train_date],
    'full_test_dl' : full_df[full_df['date_value'] > max_train_date],
}

df_params = {
    'sequence_length' : 105,
}

model_name = "gru_forecast_horizon_test"

model_save_dir = f"./{model_name}_dir/"
result_json_path = f"{model_save_dir}{model_name}_results.json"
os.makedirs(model_save_dir, exist_ok=True)

model_dict2 = run_grid_search_horizon(
    horizon_values,
        df_dict,
        model_name,
    model_save_dir,
    result_json_path,
    df_params=df_params,
    model_params = model_params,
)

Training model on dataset 0 with horizon: 35


100%|██████████| 50/50 [01:09<00:00,  1.39s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 1 with horizon: 70


100%|██████████| 50/50 [01:10<00:00,  1.40s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 2 with horizon: 105


100%|██████████| 50/50 [01:07<00:00,  1.35s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 3 with horizon: 140


100%|██████████| 50/50 [01:06<00:00,  1.34s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")


Training model on dataset 4 with horizon: 175


100%|██████████| 50/50 [01:05<00:00,  1.31s/it]
  model_state_dict = torch.load(model_save_dir + f"{model_name}_{idx}_{p}.pth")
