In [1]:
# Mount Google Drive into the Colab filesystem; the project directory used by
# the following cell lives under this mount point.
from google.colab import drive

mount_point = "/content/drive"
drive.mount(mount_point)

Mounted at /content/drive


In [2]:
# Make the project folder on Drive the working directory so that the local
# modules imported below (utils, dataset, model, train, ...) resolve, then list
# its contents as a quick sanity check.
%cd "/content/drive/MyDrive/Colab Notebooks/인지프/AI_Project/JH/LSTM"
!ls

/content/drive/MyDrive/Colab Notebooks/인지프/AI_Project/JH/LSTM
dataset.py	   inspect_null.ipynb  main.ipynb	      models	      __pycache__
filter_cols.ipynb  interpolate.py      main_with_china.ipynb  permutation.py  train.py
importance	   logs		       model.py		      preprocess.py   utils.py


In [13]:
from tqdm import tqdm
from utils import set_logger
from dataset import FinedustDataset
from model import FinedustLSTM
from utils import prepare_data_for_CNN_LSTM
from sklearn.model_selection import train_test_split
import os
import logging
from interpolate import simple_interpolate
import numpy as np
from preprocess import minmax_scaling
from train import train
from permutation import compute_permutation_importance

import torch
import torch.optim as optim
import torch.nn as nn
from torch.utils.data import DataLoader

In [4]:
# Hyperparameters shared by every cell below. The whole dict is handed to
# train(); individual entries are also read when building the dataset, the
# model and the data loaders.
config = dict(
    learning_rate=1e-4,
    epochs=500,
    batch_size=32,     # DataLoader batch size
    num_layers=2,      # FinedustLSTM num_layers
    hidden_size=128,   # FinedustLSTM hidden_size
    window_size=24,    # FinedustDataset window_size
    output_size=1,     # FinedustLSTM output_size and FinedustDataset prediction_length
    dropout=0.2,       # FinedustLSTM dropout_prob
    patience=10,       # presumably the early-stopping patience used inside train() -- TODO confirm
)

# Use the GPU when the runtime has one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [5]:
# Run the preprocessing pipeline once for a single region and display the
# resulting frame shape as a sanity check.
region = "Seoul"

# Every region drops the same columns; Jeonju additionally drops "IX".
# ("지점", "위도", "경도" are Korean for station, latitude, longitude.)
jeonju_only = ["IX"] if region == "Jeonju" else []
columns_to_remove = ["CA_TOT", "CA_MID", "STN", "IR", "PA", *jeonju_only, "PS", "지점", "위도", "경도"]

# load -> linear interpolation -> min-max scaling
df, scaler = minmax_scaling(
    simple_interpolate(
        prepare_data_for_CNN_LSTM(region.lower(), columns_to_remove),
        method="linear",
    )
)

df.shape

  df[pm_columns] = df[pm_columns].applymap(lambda x: np.nan if x <= 0 else x)
  data = data.interpolate(method='linear')


(10530, 21)

# LSTM
* China 데이터 추가

In [9]:
# Train one LSTM per region (the "China" variant, see log_suffix) and collect,
# for every region, the arrays returned by train() together with the
# validation targets and the loss history.
regions = ["Andong", "Seoul", "Jeonju", "Daegu", "Gwangju"]

pred_list, loss_list = [], []

for region in regions:
    # Every region drops the same columns; Jeonju additionally drops "IX".
    # ("지점", "위도", "경도" are Korean for station, latitude, longitude.)
    jeonju_only = ["IX"] if region == "Jeonju" else []
    columns_to_remove = ["CA_TOT", "CA_MID", "STN", "IR", "PA", *jeonju_only, "PS", "지점", "위도", "경도"]

    # load -> linear interpolation -> min-max scaling
    df = prepare_data_for_CNN_LSTM(region.lower(), columns_to_remove)
    df = simple_interpolate(df, method="linear")
    df, scaler = minmax_scaling(df)

    dataset = FinedustDataset(
        df,
        window_size=config["window_size"],
        prediction_length=config["output_size"],
        time_window=1,
    )

    # NOTE(review): this is a shuffled split. If FinedustDataset yields
    # overlapping sliding windows, neighbouring windows can land on both sides
    # of the split and inflate validation scores -- consider a chronological
    # split. Left unchanged here so the result stays comparable with the
    # already saved models.
    train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)

    model = FinedustLSTM(
        input_size=len(dataset.feature_columns),
        hidden_size=config["hidden_size"],
        num_layers=config["num_layers"],
        output_size=config["output_size"],
        dropout_prob=config["dropout"],
    ).to(device)

    # train() is given the datasets themselves, so no DataLoader is built in
    # this cell (the previously constructed loaders were never used).
    total_preds, losses = train(
        model,
        train_dataset,
        val_dataset,
        dataset.feature_columns,
        region, "LSTM", config,
        device,
        log_suffix="China",
    )

    # total_preds is presumably a list of per-batch prediction arrays for
    # val_dataset -- TODO confirm against train.py.
    pred_y = np.concatenate(total_preds, axis=0)
    val_y = np.concatenate([target for _inputs, target in val_dataset])

    pred_list.append((pred_y, val_y))
    loss_list.append(losses)

  df[pm_columns] = df[pm_columns].applymap(lambda x: np.nan if x <= 0 else x)
  data = data.interpolate(method='linear')
 25%|██▌       | 127/500 [02:14<06:34,  1.06s/it]
  df[pm_columns] = df[pm_columns].applymap(lambda x: np.nan if x <= 0 else x)
  data = data.interpolate(method='linear')
 24%|██▍       | 120/500 [02:08<06:47,  1.07s/it]
  df[pm_columns] = df[pm_columns].applymap(lambda x: np.nan if x <= 0 else x)
  data = data.interpolate(method='linear')
 20%|██        | 102/500 [01:47<06:59,  1.05s/it]
  df[pm_columns] = df[pm_columns].applymap(lambda x: np.nan if x <= 0 else x)
  data = data.interpolate(method='linear')
 23%|██▎       | 114/500 [02:00<06:49,  1.06s/it]


## Extract important features

In [11]:
# Regions processed by the feature-importance loop below.
regions = [
    "Andong",
    "Seoul",
    "Jeonju",
    "Daegu",
    "Gwangju",
]

In [14]:
# For every region: rebuild the validation split, load the saved "China" LSTM
# checkpoint, compute permutation feature importances on the validation loader
# and write them, sorted from most to least important, to a text file.
output_dir = "importance/LSTM"
os.makedirs(output_dir, exist_ok=True)

for region in regions:
    # Every region drops the same columns; Jeonju additionally drops "IX".
    # ("지점", "위도", "경도" are Korean for station, latitude, longitude.)
    jeonju_only = ["IX"] if region == "Jeonju" else []
    columns_to_remove = ["CA_TOT", "CA_MID", "STN", "IR", "PA", *jeonju_only, "PS", "지점", "위도", "경도"]

    # load -> linear interpolation -> min-max scaling
    df = prepare_data_for_CNN_LSTM(region.lower(), columns_to_remove)
    df = simple_interpolate(df, method="linear")
    df, scaler = minmax_scaling(df)

    dataset = FinedustDataset(
        df,
        window_size=config["window_size"],
        prediction_length=config["output_size"],
        time_window=1,
    )

    # Same test_size / random_state as the training cell, so the held-out
    # split is reproduced deterministically. Only the validation part is
    # needed here, hence no training DataLoader.
    train_dataset, val_dataset = train_test_split(dataset, test_size=0.2, random_state=42)
    val_loader = DataLoader(val_dataset, batch_size=config["batch_size"], shuffle=False)

    model = FinedustLSTM(
        input_size=len(dataset.feature_columns),
        hidden_size=config["hidden_size"],
        num_layers=config["num_layers"],
        output_size=config["output_size"],
        dropout_prob=config["dropout"],
    ).to(device)

    save_path = f"models/LSTM/{region}_China.pth"
    # map_location lets a checkpoint saved from a GPU load on a CPU-only
    # runtime; weights_only=True restricts unpickling to tensors/containers,
    # which is all a state_dict needs.
    state_dict = torch.load(save_path, map_location=device, weights_only=True)
    model.load_state_dict(state_dict)
    # Switch off dropout so the baseline and permuted losses are deterministic
    # (a no-op if compute_permutation_importance already calls model.eval()).
    model.eval()

    feature_importances = compute_permutation_importance(model, val_loader, dataset.feature_columns, device)
    output_path = f"{output_dir}/{region}_China.txt"

    ranked = sorted(feature_importances.items(), key=lambda item: item[1], reverse=True)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write("Feature Importances:\n")
        for feature, importance in ranked:
            f.write(f"{feature}: {importance:.4f}\n")
    print(f"Feature importances saved to {output_path}")

  df[pm_columns] = df[pm_columns].applymap(lambda x: np.nan if x <= 0 else x)
  data = data.interpolate(method='linear')
  model.load_state_dict(torch.load(save_path))


Baseline MSE Loss: 10.9375
Feature: WD, Permutation Importance: 1.4286
Feature: WS, Permutation Importance: 0.5834
Feature: TA, Permutation Importance: 1.7806
Feature: TD, Permutation Importance: 1.1613
Feature: HM, Permutation Importance: 0.8931
Feature: PV, Permutation Importance: 2.1929
Feature: VS, Permutation Importance: 2.8792
Feature: TS, Permutation Importance: 0.1309
Feature: PM10_yanan, Permutation Importance: -0.0349
Feature: PM10_qingdao, Permutation Importance: 0.0095
Feature: PM10_chifeng, Permutation Importance: 0.0500
Feature: PM10_dalian, Permutation Importance: 1.3900
Feature importances saved to importance/LSTM/Andong_China.txt


  df[pm_columns] = df[pm_columns].applymap(lambda x: np.nan if x <= 0 else x)
  data = data.interpolate(method='linear')
  model.load_state_dict(torch.load(save_path))


Baseline MSE Loss: 17.8765
Feature: WD, Permutation Importance: 1.5353
Feature: WS, Permutation Importance: 0.9353
Feature: TA, Permutation Importance: 0.7592
Feature: TD, Permutation Importance: 1.5867
Feature: HM, Permutation Importance: 1.1727
Feature: PV, Permutation Importance: 4.1511
Feature: VS, Permutation Importance: 2.4584
Feature: TS, Permutation Importance: 0.3745
Feature: TE_005, Permutation Importance: 0.6982
Feature: TE_01, Permutation Importance: 0.6823
Feature: TE_02, Permutation Importance: 1.2035
Feature: TE_03, Permutation Importance: 0.1311
Feature: PM10_yanan, Permutation Importance: 0.2010
Feature: PM10_qingdao, Permutation Importance: 0.0954
Feature: PM10_chifeng, Permutation Importance: 0.0328
Feature: PM10_dalian, Permutation Importance: 2.0401
Feature importances saved to importance/LSTM/Seoul_China.txt


  df[pm_columns] = df[pm_columns].applymap(lambda x: np.nan if x <= 0 else x)
  data = data.interpolate(method='linear')
  model.load_state_dict(torch.load(save_path))


Baseline MSE Loss: 12.4515
Feature: WD, Permutation Importance: 2.2325
Feature: WS, Permutation Importance: 0.5999
Feature: TA, Permutation Importance: 0.7042
Feature: TD, Permutation Importance: 0.7312
Feature: HM, Permutation Importance: 3.6935
Feature: PV, Permutation Importance: 0.5336
Feature: VS, Permutation Importance: 5.9418
Feature: TS, Permutation Importance: 0.8150
Feature: PM10_yanan, Permutation Importance: 0.0848
Feature: PM10_qingdao, Permutation Importance: 0.0139
Feature: PM10_chifeng, Permutation Importance: -0.0272
Feature: PM10_dalian, Permutation Importance: 1.6368
Feature importances saved to importance/LSTM/Jeonju_China.txt


  df[pm_columns] = df[pm_columns].applymap(lambda x: np.nan if x <= 0 else x)
  data = data.interpolate(method='linear')
  model.load_state_dict(torch.load(save_path))


Baseline MSE Loss: 10.7842
Feature: WD, Permutation Importance: 0.9565
Feature: WS, Permutation Importance: 0.9888
Feature: TA, Permutation Importance: 1.3001
Feature: TD, Permutation Importance: 2.0975
Feature: HM, Permutation Importance: 0.9603
Feature: PV, Permutation Importance: 2.3524
Feature: VS, Permutation Importance: 1.7820
Feature: TS, Permutation Importance: 1.0303
Feature: PM10_yanan, Permutation Importance: 0.0090
Feature: PM10_qingdao, Permutation Importance: -0.0060
Feature: PM10_chifeng, Permutation Importance: -0.0291
Feature: PM10_dalian, Permutation Importance: 1.2666
Feature importances saved to importance/LSTM/Daegu_China.txt


  df[pm_columns] = df[pm_columns].applymap(lambda x: np.nan if x <= 0 else x)
  data = data.interpolate(method='linear')
  model.load_state_dict(torch.load(save_path))


Baseline MSE Loss: 18.4833
Feature: WD, Permutation Importance: 0.7730
Feature: WS, Permutation Importance: 0.4563
Feature: TA, Permutation Importance: 0.7994
Feature: TD, Permutation Importance: 1.5038
Feature: HM, Permutation Importance: 1.8168
Feature: PV, Permutation Importance: 1.6616
Feature: VS, Permutation Importance: 1.4139
Feature: TS, Permutation Importance: 0.5689
Feature: PM10_yanan, Permutation Importance: 0.0159
Feature: PM10_qingdao, Permutation Importance: 0.0162
Feature: PM10_chifeng, Permutation Importance: 0.0544
Feature: PM10_dalian, Permutation Importance: 1.3554
Feature importances saved to importance/LSTM/Gwangju_China.txt
