In [1]:
from tqdm import tqdm
from utils import set_logger, prepare_data_for_CNN_LSTM
from dataset import FinedustCNNLSTMDataset
from model import FinedustCNNLSTM
from utils import prepare_data
from sklearn.model_selection import train_test_split
import os
import logging
from interpolate import simple_interpolate
import numpy as np
from preprocess import minmax_scaling
from train import train, train_cnn_lstm
from permutation import compute_cnn_lstm_permutation_importance

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader

In [2]:
# Hyperparameters shared by dataset construction, the model and the training loop.
# Only the keys whose use is visible in this notebook are annotated; the rest are
# consumed inside FinedustCNNLSTM / train_cnn_lstm (not visible here).
config = {
    "learning_rate": 1e-4,
    "epochs": 500,
    "batch_size": 32,      # batch size of the DataLoaders
    "num_layers": 2,
    "hidden_size": 128,    # the feature-importance loop uses 256 for Jeonju instead
    "window_size": 24,     # passed to the dataset as `window_size`
    "output_size": 1,      # passed to the dataset as `prediction_length`
    "dropout_prob": 0.2,
    "patience": 10,
    "out_channels": 16,
    "kernel_size": 3,
    "K": 1,
}

# Seed the NumPy and torch RNGs so a Restart & Run All is reproducible.
# The value matches the `random_state` used for the train/validation split.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# CNN-LSTM

In [5]:
# Train one CNN-LSTM per region and keep the validation predictions and losses.
regions = ["Andong", "Seoul", "Jeonju", "Daegu", "Gwangju"]

pred_list = []  # one (pred_y, val_y) tuple per region
loss_list = []  # the `losses` value returned by train_cnn_lstm, per region

for region in regions:
    # Jeonju drops the extra "IX" column; all other regions keep it.
    columns_to_remove = [
        "CA_TOT", "CA_MID", "STN", "IR", "PA",
        *(["IX"] if region == "Jeonju" else []),
        "PS", "지점", "위도", "경도",
    ]

    # Load -> linear interpolation -> min-max scaling.
    df = prepare_data_for_CNN_LSTM(region.lower(), columns_to_remove)
    df = simple_interpolate(df, method="linear")
    df, scaler = minmax_scaling(df)

    dataset = FinedustCNNLSTMDataset(
        df,
        window_size=config["window_size"],
        prediction_length=config["output_size"],
        time_window=3,
    )

    # NOTE(review): the split is random (no shuffle=False); if the dataset yields
    # overlapping windows, neighbouring windows can land on both sides — confirm
    # this is intended.
    train_dataset, val_dataset = train_test_split(
        dataset, test_size=0.2, random_state=42
    )

    # NOTE(review): these loaders are built but not passed to train_cnn_lstm,
    # which receives the datasets — confirm which one it expects.
    train_loader = DataLoader(
        train_dataset, batch_size=config['batch_size'], shuffle=True
    )
    val_loader = DataLoader(
        val_dataset, batch_size=config['batch_size'], shuffle=False
    )

    model = FinedustCNNLSTM(
        config,
        in_channels=len(dataset.pm_columns),
        input_size=len(dataset.feature_columns),
    )
    model = model.to(device)

    # NOTE(review): the model name passed here is "CNN_LSTM", while the
    # feature-importance cell reads checkpoints from models/CNN-LSTM/ — verify
    # that both refer to the same directory.
    total_preds, losses = train_cnn_lstm(
        model, train_dataset, val_dataset, region, "CNN_LSTM", config, device
    )

    # Flatten the returned predictions and the validation targets into arrays.
    pred_y = np.concatenate(total_preds, axis=0)
    val_y = np.concatenate([target for _, target in val_dataset])

    pred_list.append((pred_y, val_y))
    loss_list.append(losses)

  df[pm_columns] = df[pm_columns].applymap(lambda x: np.nan if x <= 0 else x)
  data = data.interpolate(method='linear')
  0%|          | 0/500 [00:03<?, ?it/s]


AttributeError: '_pickle.Pickler' object attribute 'persistent_id' is read-only

## Extract important features

In [38]:
# Display the region list that the feature-importance loop below iterates over.
# NOTE(review): the recorded output lists four regions, while the training cell
# defines five (adds "Gwangju") — re-run from a fresh kernel to confirm which
# list is intended.
regions

['Andong', 'Seoul', 'Jeonju', 'Daegu']

In [None]:
# Compute permutation feature importance for each region's saved CNN-LSTM
# checkpoint and write the ranking to importance/<model_name>/<region>.txt.
model_name = "CNN-LSTM"

output_dir = f"importance/{model_name}"
os.makedirs(output_dir, exist_ok=True)

for region in regions:
    # Jeonju drops the extra "IX" column; all other regions keep it.
    if region == "Jeonju":
        columns_to_remove = ["CA_TOT", "CA_MID", "STN", "IR", "PA", "IX", "PS", "지점", "위도", "경도"]
    else:
        columns_to_remove = ["CA_TOT", "CA_MID", "STN", "IR", "PA", "PS", "지점", "위도", "경도"]

    # Same preprocessing and split (random_state=42) as the training cell.
    df = prepare_data_for_CNN_LSTM(region.lower(), columns_to_remove)
    df = simple_interpolate(df, method="linear")
    df, scaler = minmax_scaling(df)

    dataset = FinedustCNNLSTMDataset(df,
                                     window_size=config["window_size"],
                                     prediction_length=config["output_size"],
                                     time_window=3)
    train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_dataset, batch_size=config['batch_size'], shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=config['batch_size'], shuffle=False)

    # The Jeonju model is built with hidden_size=256, every other region with 128.
    # Use a per-region copy rather than mutating the shared `config`, so an
    # interrupted run cannot leave hidden_size=256 behind for other cells.
    region_config = {**config, "hidden_size": 256 if region == "Jeonju" else 128}

    model = FinedustCNNLSTM(region_config,
                            in_channels=len(dataset.pm_columns),
                            input_size=len(dataset.feature_columns))
    save_path = f"models/{model_name}/{region}.pth"
    # map_location remaps the stored tensors onto `device`, so a checkpoint
    # saved on a GPU still loads when only the CPU is available.
    model.load_state_dict(torch.load(save_path, map_location=device))
    model.to(device)
    # Evaluation mode switches dropout off, so importance scores are not
    # perturbed by random unit masking.
    model.eval()

    feature_importances = compute_cnn_lstm_permutation_importance(
        model, val_loader, dataset.feature_columns, dataset.pm_columns, device
    )
    output_path = f"{output_dir}/{region}.txt"

    # Write features ordered from most to least important.
    with open(output_path, "w") as f:
        f.write("Feature Importances:\n")
        for feature, importance in sorted(feature_importances.items(), key=lambda x: x[1], reverse=True):
            f.write(f"{feature}: {importance:.4f}\n")
    print(f"Feature importances saved to {output_path}")

  data = data.interpolate(method='linear')


Baseline MSE Loss: 6.4699
Feature: WD, Permutation Importance: 3.6131
Feature: WS, Permutation Importance: 3.5572
Feature: TA, Permutation Importance: 4.3083
Feature: TD, Permutation Importance: 1.9318
Feature: HM, Permutation Importance: 7.6094
Feature: PV, Permutation Importance: 4.6644
Feature: VS, Permutation Importance: 12.2706
Feature: TS, Permutation Importance: 3.0322
Feature importances saved to importance/LSTM/Andong.txt


  data = data.interpolate(method='linear')


Baseline MSE Loss: 13.5851
Feature: WD, Permutation Importance: 6.2550
Feature: WS, Permutation Importance: 3.6059
Feature: TA, Permutation Importance: 1.1206
Feature: TD, Permutation Importance: 1.1049
Feature: HM, Permutation Importance: 5.1800
Feature: PV, Permutation Importance: 5.0815
Feature: VS, Permutation Importance: 10.2841
Feature: TS, Permutation Importance: 1.3686
Feature: TE_005, Permutation Importance: 0.7640
Feature: TE_01, Permutation Importance: 4.8736
Feature: TE_02, Permutation Importance: 5.2185
Feature: TE_03, Permutation Importance: 1.7073
Feature importances saved to importance/LSTM/Seoul.txt


  data = data.interpolate(method='linear')


RuntimeError: Error(s) in loading state_dict for FinedustLSTM:
	size mismatch for lstm.lstm.weight_ih_l0: copying a param with shape torch.Size([1024, 8]) from checkpoint, the shape in current model is torch.Size([512, 8]).
	size mismatch for lstm.lstm.weight_hh_l0: copying a param with shape torch.Size([1024, 256]) from checkpoint, the shape in current model is torch.Size([512, 128]).
	size mismatch for lstm.lstm.bias_ih_l0: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([512]).
	size mismatch for lstm.lstm.bias_hh_l0: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([512]).
	size mismatch for lstm.lstm.weight_ih_l1: copying a param with shape torch.Size([1024, 256]) from checkpoint, the shape in current model is torch.Size([512, 128]).
	size mismatch for lstm.lstm.weight_hh_l1: copying a param with shape torch.Size([1024, 256]) from checkpoint, the shape in current model is torch.Size([512, 128]).
	size mismatch for lstm.lstm.bias_ih_l1: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([512]).
	size mismatch for lstm.lstm.bias_hh_l1: copying a param with shape torch.Size([1024]) from checkpoint, the shape in current model is torch.Size([512]).
	size mismatch for lstm.fc1.weight: copying a param with shape torch.Size([768, 256]) from checkpoint, the shape in current model is torch.Size([384, 128]).
	size mismatch for lstm.fc1.bias: copying a param with shape torch.Size([768]) from checkpoint, the shape in current model is torch.Size([384]).
	size mismatch for lstm.fc2.weight: copying a param with shape torch.Size([256, 768]) from checkpoint, the shape in current model is torch.Size([128, 384]).
	size mismatch for lstm.fc2.bias: copying a param with shape torch.Size([256]) from checkpoint, the shape in current model is torch.Size([128]).
	size mismatch for fc1.weight: copying a param with shape torch.Size([256, 256]) from checkpoint, the shape in current model is torch.Size([128, 128]).
	size mismatch for fc1.bias: copying a param with shape torch.Size([256]) from checkpoint, the shape in current model is torch.Size([128]).
	size mismatch for fc2.weight: copying a param with shape torch.Size([1, 256]) from checkpoint, the shape in current model is torch.Size([1, 128]).