In [10]:
#!/usr/bin/env python
# coding: utf-8
# Fix every random seed so that model training is reproducible.
seed_value = 743
print("Train with random seed", seed_value)

import os
# Export the seed as PYTHONHASHSEED (environment values must be strings).
# NOTE(review): setting this after the interpreter has started presumably does
# not change hashing in the current process -- confirm.
os.environ['PYTHONHASHSEED'] = str(seed_value)
import random
random.seed(seed_value)  # seed Python's built-in random module
import numpy as np
np.random.seed(seed_value)  # seed NumPy's global random state
import tensorflow as tf
# NOTE(review): tf.set_random_seed, tf.ConfigProto, tf.Session and
# tf.get_default_graph look like the TensorFlow 1.x API, and the output recorded
# under this cell ends in a TypeError about `config` -- confirm that this cell
# runs to completion on the installed TensorFlow/Keras versions.
tf.set_random_seed(seed_value)
from keras import backend as K
# Restrict TensorFlow to one intra-op and one inter-op thread (presumably to
# avoid thread-scheduling nondeterminism), then register the session with Keras.
session_conf = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(graph=tf.get_default_graph(), config=session_conf)
K.set_session(sess)

Train with random seed 743
Device mapping:
/job:localhost/replica:0/task:0/device:GPU:0 -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6
/job:localhost/replica:0/task:0/device:GPU:1 -> device: 1, name: NVIDIA RTX A6000, pci bus id: 0000:73:00.0, compute capability: 8.6
/job:localhost/replica:0/task:0/device:GPU:2 -> device: 2, name: NVIDIA RTX A6000, pci bus id: 0000:a6:00.0, compute capability: 8.6



2022-12-14 16:49:12.603694: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 46713 MB memory:  -> device: 0, name: NVIDIA RTX A6000, pci bus id: 0000:17:00.0, compute capability: 8.6
2022-12-14 16:49:12.604297: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 46181 MB memory:  -> device: 1, name: NVIDIA RTX A6000, pci bus id: 0000:73:00.0, compute capability: 8.6
2022-12-14 16:49:12.604849: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1525] Created device /job:localhost/replica:0/task:0/device:GPU:2 with 46713 MB memory:  -> device: 2, name: NVIDIA RTX A6000, pci bus id: 0000:a6:00.0, compute capability: 8.6


TypeError: Argument `config` must be a tf.ConfigProto, but got "Session"

In [None]:
import warnings
# Silence every warning from here on.
warnings.filterwarnings('ignore')

# Import matplotlib (for plotting) and the other utility libraries.
import pickle
import pandas as pd
from tqdm import trange
import matplotlib.pyplot as plt
import matplotlib
import matplotlib.font_manager as font_manager
# Register the font files found in `font_dirs` with matplotlib and select the
# font family used when drawing figures.
font_dirs = ['.']
font_files = font_manager.findSystemFonts(fontpaths=font_dirs)
if hasattr(font_manager.fontManager, 'addfont'):
    # `createFontList` was deprecated in Matplotlib 3.2 and removed in 3.3;
    # `FontManager.addfont` is the supported way to register a font file.
    for font_file in font_files:
        try:
            font_manager.fontManager.addfont(font_file)
        except Exception:
            # Skip unreadable font files instead of aborting the notebook.
            continue
else:
    # Older Matplotlib releases without `addfont`.
    font_list = font_manager.createFontList(font_files)
    font_manager.fontManager.ttflist.extend(font_list)
matplotlib.rcParams['font.family'] = 'Malgun Gothic'

In [None]:
from sklearn.preprocessing import MinMaxScaler  # 데이터 정규화에 사용할 MinMaxScaler import
from sklearn.metrics import mean_squared_error  # MSE 성능 지표를 계산하기 하기 위한 함수 import

# 모델을 구축하기 위한 keras 관련 함수 import
from keras.models import *
from keras.layers import Lambda, RepeatVector
from keras.layers import Input, multiply
from keras.layers import Dense, LSTM, Dropout, Flatten
from keras import regularizers

In [None]:
def mean_absolute_percentage_error(y_true, y_pred):
    """Return the mean absolute percentage error (MAPE), in percent.

    Args:
        y_true: actual values (array-like); used as the denominator.
        y_pred: predicted values (array-like).

    Returns:
        mean(|(y_true - y_pred) / y_true|) * 100.
    """
    actual = np.array(y_true)
    forecast = np.array(y_pred)
    relative_error = (actual - forecast) / actual
    absolute_relative_error = np.abs(relative_error)
    return np.mean(absolute_relative_error) * 100

In [None]:
# Define the function to return the SMAPE value
def calculate_smape(actual, predicted) -> float:
    """Return the symmetric mean absolute percentage error (SMAPE), in percent.

    Computes mean(|predicted - actual| / ((|predicted| + |actual|) / 2)) * 100
    and rounds the result to two decimal places.

    Args:
        actual: actual values (array-like or ndarray).
        predicted: predicted values (array-like or ndarray).

    Returns:
        The SMAPE rounded to 2 decimals. Positions where both values are 0
        divide 0 by 0 and therefore yield NaN.
    """
    # Convert both inputs to ndarrays. The previous conversion ended with a
    # stray comma, which built a 1-tuple and failed on unpacking whenever a
    # non-ndarray (e.g. a list) was passed in.
    actual = np.asarray(actual)
    predicted = np.asarray(predicted)

    return round(
        np.mean(
            np.abs(predicted - actual) /
            ((np.abs(predicted) + np.abs(actual)) / 2)
        ) * 100, 2
    )

In [None]:
# Path of the CSV file that holds the clustering result.
data_filename = '../data/PM10_clustering_1day_result_1014.csv'
detrended_df = pd.read_csv(data_filename)  # read the CSV file with pandas' read_csv

In [None]:
# Preview the first rows of the loaded DataFrame.
detrended_df.head()

In [None]:
detrended_df['Cluster'].value_counts()  # number of time-series samples in each cluster

In [None]:
specific_df = detrended_df[detrended_df['Cluster'] == 'Cluster 0']  # keep only the rows of one specific cluster

In [None]:
cluster_row_num = len(specific_df)  # number of time-series samples in the selected cluster

In [None]:
def get_split_row_index(total_row, train_split=0.6):
    """Compute the row boundaries for a train / validation / test split.

    The validation and test parts always get the same number of rows; when
    the rows left after the train part are odd, one extra row goes to train.

    Args:
        total_row: total number of rows to split.
        train_split: fraction of the rows assigned to the train part.

    Returns:
        A tuple (train_end, valid_end): rows [0, train_end) are train,
        [train_end, valid_end) are validation and [valid_end, total_row)
        are test.
    """
    train_data_up = int(total_row * train_split)
    # Derive the remainder from the argument, not from the notebook-level
    # `cluster_row_num`, so the helper works for any row count.
    remain_data_row = total_row - train_data_up

    # Make the remainder even so validation and test have equal sizes.
    if remain_data_row % 2 == 1:
        train_data_up += 1
        remain_data_row -= 1

    valid_data_up = remain_data_row // 2

    assert train_data_up + valid_data_up * 2 == total_row

    return train_data_up, train_data_up + valid_data_up

In [None]:
# Row boundaries used to slice the selected cluster into train / validation / test.
train_up_bound, valid_up_bound = get_split_row_index(cluster_row_num)

In [None]:
# Split the whole DataFrame into train, validation and test DataFrames.
train_df = specific_df[:train_up_bound]
valid_df = specific_df[train_up_bound:valid_up_bound]
test_df = specific_df[valid_up_bound:]

In [None]:
# Show the number of time-series samples in the train, validation and test DataFrames.
print("Train DataFrame row: ", len(train_df))
print("Valid DataFrame row: ", len(valid_df))
print("Test DataFrame row: ", len(test_df))

In [None]:
# Sanity check: the three parts together contain every row of the selected cluster.
assert len(train_df) + len(valid_df) + len(test_df) == cluster_row_num

In [None]:
# Preview the first rows of the train DataFrame.
train_df.head()

In [None]:
list(valid_df.index.values)  # index ids of the time-series samples used for validation

In [None]:
list(test_df.index.values)  # index ids of the time-series samples used for testing

In [None]:
# Drop the `Cluster` column from each DataFrame and renumber the rows from 0.
train_df = train_df.drop(['Cluster'], axis=1).reset_index(drop=True)
valid_df = valid_df.drop(['Cluster'], axis=1).reset_index(drop=True)
test_df = test_df.drop(['Cluster'], axis=1).reset_index(drop=True)

In [None]:
# Count how many cells equal 0 in each DataFrame.
print("Train Data - Number of 0: ", (train_df == 0).sum().sum())
print("Valid Data - Number of 0: ", (valid_df == 0).sum().sum())
print("Test Data - Number of 0: ", (test_df == 0).sum().sum())

In [None]:
def dataframe_to_list(data_df):
    """Convert a DataFrame into plain Python lists.

    Args:
        data_df: DataFrame to convert; each row is treated as one sample.

    Returns:
        A tuple (total_data, data_list): `data_list` holds one list per row,
        in row order, and `total_data` is the concatenation of all rows.
    """
    # Rows are read by position (`iloc`), so the result does not depend on the
    # index labels; label lookup with a 0..n-1 counter failed on any frame
    # whose index had not been reset.
    data_list = [data_df.iloc[i, :].tolist() for i in range(len(data_df))]
    total_data = [value for row in data_list for value in row]

    return total_data, data_list

In [None]:
# Convert each DataFrame into list form: a flat list of all values and a list of per-row lists.
train_data_total, train_data_clusters = dataframe_to_list(train_df)
valid_data_total, valid_data_clusters = dataframe_to_list(valid_df)
test_data_total, test_data_clusters = dataframe_to_list(test_df)

In [None]:
def scale_data_clusters(scaler, data_clusters):
    """Normalize every time-series sample with an already fitted scaler.

    Args:
        scaler: object exposing `transform`; it is called on a column
            vector of shape (n, 1) for each sample.
        data_clusters: iterable of samples, each a sequence of values.

    Returns:
        A list with one flat list of scaled values per input sample.
    """
    def _scale_one(series):
        # The scaler works on 2-D input, so go through a column vector and
        # flatten the result back into a plain list.
        column = np.array(series).reshape(-1, 1)
        scaled_column = scaler.transform(column)
        return scaled_column.reshape(-1).tolist()

    return [_scale_one(series) for series in data_clusters]

In [None]:
scaler = MinMaxScaler()  # create the MinMaxScaler normalization object
# Fit the scaler on the train values only, passed as a single column.
scaler.fit(np.array(train_data_total).reshape(-1, 1))
# Normalize every sample of each split with the fitted scaler.
std_train_data_clusters = scale_data_clusters(scaler, train_data_clusters)
std_valid_data_clusters = scale_data_clusters(scaler, valid_data_clusters)
std_test_data_clusters = scale_data_clusters(scaler, test_data_clusters)

In [None]:
# Number of normalized train samples.
print(len(std_train_data_clusters))

In [None]:
def wrap_cluster(data_cluster, window_size=6, look_ahead=6):
    """Turn one series (a list) into sliding-window model inputs and targets.

    For every start position s, the input is the `window_size` values
    beginning at s and the target is the value at index
    s + window_size + look_ahead.

    Args:
        data_cluster: the series, as a Python list.
        window_size: number of consecutive values in each input window.
        look_ahead: offset added after the window to locate the target.

    Returns:
        A tuple (X, y): X has shape (num_windows, window_size, 1) and y
        holds one target per window.
    """
    assert isinstance(data_cluster, list)
    series = np.array(data_cluster)
    # A non-positive count simply yields no windows.
    num_windows = len(series) - window_size - look_ahead
    windows = [series[start: start + window_size] for start in range(num_windows)]
    targets = [series[start + window_size + look_ahead] for start in range(num_windows)]
    return np.array(windows).reshape(-1, window_size, 1), np.array(targets)

In [None]:
# Take the first 15 values of the first normalized train sample as a small example.
sample_cluster = std_train_data_clusters[0][:15]
print(sample_cluster)

In [None]:
# Show the windows and targets that wrap_cluster builds from the example sample.
wrap_cluster(sample_cluster)

In [None]:
def create_dataset(data_clusters):
    """Build one dataset from many series using `wrap_cluster`.

    Args:
        data_clusters: iterable of series, each a Python list.

    Returns:
        A tuple (X, y) with the per-series windows and targets
        concatenated along the first axis.
    """
    wrapped = [wrap_cluster(series) for series in data_clusters]
    window_blocks = [pair[0] for pair in wrapped]
    target_blocks = [pair[1] for pair in wrapped]

    # Stack the per-series arrays into single arrays, e.g.
    # [(1, window_size, 1), (1, window_size, 1)] -> (2, window_size, 1)
    return np.concatenate(window_blocks, axis=0), np.concatenate(target_blocks, axis=0)

In [None]:
# Build the model datasets for the train, validation and test data
# with the create_dataset function.
train_X, train_y = create_dataset(std_train_data_clusters)
valid_X, valid_y = create_dataset(std_valid_data_clusters)
test_X, test_y = create_dataset(std_test_data_clusters)

In [None]:
# Show the input and target shapes of each split.
print(train_X.shape, train_y.shape)
print(valid_X.shape, valid_y.shape)
print(test_X.shape, test_y.shape)

In [None]:
def attention_3d_block(inputs, input_dim, single_attention_vector):
    """Feature attention block.

    Builds softmax attention weights from `inputs` with a Dense layer and
    multiplies them element-wise with `inputs`.

    Args:
        inputs: Keras tensor; the caller in this file passes an Input created
            with shape=(time_step, feature_num).
        input_dim: number of units of the attention Dense layer.
        single_attention_vector: when truthy, the weights are averaged over
            axis 1 and repeated `time_steps` times before being applied.

    Returns:
        The output of the 'attention_mul' multiply layer.
    """
    time_steps = int(inputs.shape[1])
    # Compute the attention weights.
    # NOTE(review): the previous comments gave the shapes here as
    # (batch_size, input_dim, time_step); with inputs built as
    # (batch, time_step, feature_num) the Dense output is presumably
    # (batch, time_step, input_dim), with the softmax taken over the last
    # axis. If so, input_dim == 1 would make every weight 1.0 -- confirm.
    a = Dense(input_dim, activation='softmax', name='attention_vec')(inputs)
    if single_attention_vector:
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)  # average the weights over axis 1
        a = RepeatVector(time_steps)(a)  # repeat the averaged vector time_steps times
    output_attention_mul = multiply([inputs, a], name='attention_mul')  # apply the attention weights
    return output_attention_mul

In [None]:
def model_attention_applied_before_lstm(batch_size, time_step, feature_num, single_attention_vector):
    """Build the (uncompiled) attention + LSTM regression model.

    The input passes through `attention_3d_block`, an LSTM with 6 units that
    returns the full sequence, dropout, a flatten step and two linear Dense
    layers ending in a single output value.

    Args:
        batch_size: not used when building the model; kept so existing
            callers keep working.
        time_step: number of time steps per input sample.
        feature_num: number of features per time step.
        single_attention_vector: forwarded to `attention_3d_block`.

    Returns:
        The Keras `Model` mapping the input tensor to the 1-unit output.
    """
    inputs = Input(shape=(time_step, feature_num))
    x = attention_3d_block(inputs, feature_num, single_attention_vector)
    x = LSTM(6, activation='tanh',
             stateful=False,
             return_sequences=True,
             kernel_initializer='he_normal')(x)
    x = Dropout(0.2)(x)
    x = Flatten()(x)
    x = Dense(10, activation='linear', kernel_regularizer=regularizers.l2(0.01),
              activity_regularizer=regularizers.l1(0.))(x)
    output = Dense(1, activation='linear', kernel_initializer='he_normal')(x)

    # Keras 2 names these keyword arguments `inputs` / `outputs`; the Keras 1
    # spellings `input` / `output` are not accepted by current releases.
    model = Model(inputs=[inputs], outputs=output)
    return model

In [None]:
# Hyperparameters.
batch_size = 4
# Number of time steps of the model input; train_X is built by wrap_cluster
# with its default window_size of 6, so the two values have to agree.
look_back = 6
feature_num = 1
SINGLE_ATTENTION_VECTOR = True

# Create the attention LSTM model and compile it with MSE loss and the Adam optimizer.
model = model_attention_applied_before_lstm(batch_size, look_back, feature_num, SINGLE_ATTENTION_VECTOR)
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
# Train the model.
# The loss values recorded during training are kept in the `history` variable.
history = model.fit(train_X, train_y,
                    validation_data=(valid_X, valid_y),
                    batch_size=batch_size, epochs=10)

In [None]:
# Pull the recorded train and validation loss values out of the training history.
loss = history.history['loss']
val_loss = history.history['val_loss']

In [None]:
# Visualize how the train loss and the validation loss change, using matplotlib.
plt.plot(loss, label='loss')
plt.plot(val_loss, label='val_loss')
plt.legend()

In [None]:
# Run predictions with the trained model (the second positional argument is the batch size).
train_predict = model.predict(train_X, batch_size)
valid_predict = model.predict(valid_X, batch_size)
test_predict = model.predict(test_X, batch_size)

In [None]:
# Undo the scaling on targets and predictions so the metrics are computed in the original units.
inv_train_y = scaler.inverse_transform(train_y.reshape(-1, 1))
inv_train_predict = scaler.inverse_transform(train_predict)

In [None]:
# Undo the scaling on the validation targets and predictions.
inv_valid_y = scaler.inverse_transform(valid_y.reshape(-1, 1))
inv_valid_predict = scaler.inverse_transform(valid_predict)

In [None]:
# Undo the scaling on the test targets and predictions.
inv_test_y = scaler.inverse_transform(test_y.reshape(-1, 1))
inv_test_predict = scaler.inverse_transform(test_predict)

In [None]:
# Compute the MAPE on the train, validation and test sets.
train_mape = mean_absolute_percentage_error(inv_train_y, inv_train_predict)
valid_mape = mean_absolute_percentage_error(inv_valid_y, inv_valid_predict)
test_mape = mean_absolute_percentage_error(inv_test_y, inv_test_predict)

In [None]:
# Report the MAPE of each split with two decimals.
print("Train MAPE: %.2f" % train_mape)
print("Valid MAPE: %.2f" % valid_mape)
print("Test MAPE: %.2f" % test_mape)

In [None]:
# Compute the SMAPE on the train, validation and test sets.
train_smape = calculate_smape(inv_train_y, inv_train_predict)
valid_smape = calculate_smape(inv_valid_y, inv_valid_predict)
test_smape = calculate_smape(inv_test_y, inv_test_predict)

In [None]:
# Report the SMAPE of each split with two decimals.
print("Train SMAPE: %.2f" % train_smape)
print("Valid SMAPE: %.2f" % valid_smape)
print("Test SMAPE: %.2f" % test_smape)

In [None]:
# Compute the MSE on the train, validation and test sets.
train_mse = mean_squared_error(inv_train_y, inv_train_predict)
valid_mse = mean_squared_error(inv_valid_y, inv_valid_predict)
test_mse = mean_squared_error(inv_test_y, inv_test_predict)

In [None]:
# Report the MSE of each split with two decimals.
print("Train MSE: %.2f" % train_mse)
print("Valid MSE: %.2f" % valid_mse)
print("Test MSE: %.2f" % test_mse)

In [None]:
# Plot actual versus predicted values for the train, validation and test sets with matplotlib.
# The three splits are laid out one after another on the x axis:
# [0, train_term) is train, [train_term, valid_term) is validation and
# [valid_term, total_sample) is test.
train_term = len(inv_train_y)
valid_term = len(inv_train_y) + len(inv_valid_y)
total_sample = len(inv_train_y) + len(inv_valid_y) + len(inv_test_y)

plt.figure(figsize=(16, 8))
# Actual values are drawn as solid red lines; predictions as dashed lines in a per-split colour.
plt.plot(np.arange(train_term), inv_train_y, color='red', ls='-', label='Real Train Data')
plt.plot(np.arange(train_term), inv_train_predict, color='blue', ls='--', label='Predict Train Data')
plt.plot(np.arange(train_term, valid_term), inv_valid_y, color='red', ls='-', label='Real Valid Data')
plt.plot(np.arange(train_term, valid_term), inv_valid_predict, color='green', ls='--', label='Predict Valid Data')
plt.plot(np.arange(valid_term, total_sample), inv_test_y, color='red', ls='-', label='Real Test Data')
plt.plot(np.arange(valid_term, total_sample), inv_test_predict, color='black', ls='--', label='Predict Test Data')
plt.title('Prediction')
plt.legend()