In [2]:
import torch
import torch.nn as nn
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from src.model.model import *
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

  from .autonotebook import tqdm as notebook_tqdm


### 0. 데이터 전처리

파일 로드

In [4]:
df = pd.read_csv("dataset/merged_df.csv") 
df = df[df["date_time"]>"2023-08-25 00:00:00"] # 기상데이터가 있는 기간부터

결측치 처리 (pm10, pm25)

In [5]:
pm10_lst = []
pm25_lst = []
for pos_num in tqdm(range(1,13)):
    temp_df = df[df["measure_position_id"]==pos_num] # position number에 해당하는 데이터프레임

    # 결측치는 앞뒤 비결측치 값의 평균으로 지정
    temp_df["pm10_value"] = temp_df['pm10_value'].fillna((temp_df["pm10_value"].ffill()+\
                                                        temp_df["pm10_value"].bfill())/2)
    temp_df["pm25_value"] = temp_df['pm25_value'].fillna((temp_df["pm25_value"].ffill()+\
                                                        temp_df["pm25_value"].bfill())/2)
    pm10 = list(temp_df["pm10_value"].values)
    pm25 = list(temp_df["pm25_value"].values)
    pm10_lst.append(pm10)
    pm25_lst.append(pm25)

# 길이에 맞게 자름
pm10_min_len = min(list(map(lambda x:len(x),pm10_lst)))
pm25_min_len = min(list(map(lambda x:len(x),pm25_lst)))
pm10_lst = list(map(lambda x:x[:pm10_min_len],pm10_lst))
pm25_lst = list(map(lambda x:x[:pm25_min_len],pm25_lst))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df["pm10_value"] = temp_df['pm10_value'].fillna((temp_df["pm10_value"].ffill()+\
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df["pm25_value"] = temp_df['pm25_value'].fillna((temp_df["pm25_value"].ffill()+\
100%|██████████| 12/12 [00:00<00:00, 312.92it/s]


기상데이터 전처리후 결합

In [4]:
temp_df = df[df["measure_position_id"]==1.0]
for strname in ["precipitation_one_hour","humidity","temperature","wind_speed"]:
    temp_df[strname]=temp_df[strname].fillna((temp_df[strname].ffill()+\
                                              temp_df[strname].bfill())/2)
    insert_lst = temp_df[strname]
    pm10_lst.append(insert_lst)
    pm25_lst.append(insert_lst)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  temp_df[strname]=temp_df[strname].fillna((temp_df[strname].ffill()+\


In [7]:
# 결측치확인
pm10_data = torch.Tensor(np.array(pm10_lst)).T
pm25_data = torch.Tensor(np.array(pm25_lst)).T
pm10_data = pm10_data[:-2]
pm25_data = pm25_data[:-2]
print(pm10_data.shape)
print(pm25_data.shape)
print(pm10_data.isnan().sum())
print(pm25_data.isnan().sum())

torch.Size([960, 12])
torch.Size([960, 12])
tensor(0)
tensor(0)


### 1. 데이터세트 생성

In [19]:
class TimeSeriesDataset(Dataset):
    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        return len(self.data) - self.seq_length

    def __getitem__(self, index):
        x = self.data[index:index+self.seq_length]
        y = self.data[index+self.seq_length,:12]
        return x, y

seq_length = 1

# pm10 데이터를 학습, 검증 및 테스트 세트로 나눕니다.
pm10_train, temp_data = train_test_split(pm10_data, test_size=0.25, shuffle=False)
pm10_valid, pm10_test = train_test_split(temp_data, test_size=0.4, shuffle=False)
train_data_loader_pm10 = DataLoader(TimeSeriesDataset(pm10_train, seq_length),
                                    batch_size=16, shuffle=True)
valid_data_loader_pm10 = DataLoader(TimeSeriesDataset(pm10_valid, seq_length), 
                                    batch_size=16, shuffle=True)
test_data_loader_pm10 = DataLoader(TimeSeriesDataset(pm10_test, seq_length), 
                                   batch_size=16, shuffle=True)

# pm25 데이터를 학습, 검증 및 테스트 세트로 나눕니다.
pm25_train, temp_data = train_test_split(pm25_data, test_size=0.25, shuffle=False)
pm25_valid, pm25_test = train_test_split(temp_data, test_size=0.4, shuffle=False)
train_data_loader_pm25 = DataLoader(TimeSeriesDataset(pm25_train, seq_length), 
                                    batch_size=16, shuffle=True)
valid_data_loader_pm25 = DataLoader(TimeSeriesDataset(pm25_valid, seq_length), 
                                    batch_size=16, shuffle=True)
test_data_loader_pm25 = DataLoader(TimeSeriesDataset(pm25_test, seq_length), 
                                   batch_size=16, shuffle=True)

### 2. 모델 지정

In [20]:
input_size = 16  
hidden_size = 64
num_layers = 4
output_size = 12 

model_pm10 = MultiInputOutputLSTM(input_size, hidden_size, num_layers, output_size)
model_pm25 = MultiInputOutputLSTM(input_size, hidden_size, num_layers, output_size)

# 손실 함수와 옵티마이저
criterion = nn.MSELoss()
optimizer_pm10 = torch.optim.Adam(model_pm10.parameters(), lr=0.001)
optimizer_pm25 = torch.optim.Adam(model_pm25.parameters(), lr=0.001)

### 3. 학습

#### PM10

In [9]:
# 모델 훈련
num_epochs = 1000
best_loss = float('inf')  # 초기 최고 손실 설정
checkpoint_path_pm10 = 'testweights/pm10/best_model_weights2.pth'

for epoch in range(num_epochs):
    for batch in train_data_loader_pm10:
        inputs, labels = batch
        optimizer_pm10.zero_grad()
        outputs = model_pm10(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_pm10.step()

    # 현재 epoch의 손실값 확인
    current_loss = loss.item()
    
     # 검증 손실 계산
    model_pm10.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for batch in valid_data_loader_pm10:
            inputs, labels = batch
            outputs = model_pm10(inputs)
            valid_loss += criterion(outputs, labels).item()
    
    # 검증 손실이 가장 낮을 때 모델 가중치 저장
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model_pm10.state_dict(), checkpoint_path_pm10)

    if epoch%10 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/1000], Loss: 524.5707
Epoch [11/1000], Loss: 63.9816
Epoch [21/1000], Loss: 96.4349
Epoch [31/1000], Loss: 133.0191
Epoch [41/1000], Loss: 161.3814
Epoch [51/1000], Loss: 73.7318
Epoch [61/1000], Loss: 130.2796
Epoch [71/1000], Loss: 110.2948
Epoch [81/1000], Loss: 63.9729
Epoch [91/1000], Loss: 78.9729
Epoch [101/1000], Loss: 34.8458
Epoch [111/1000], Loss: 44.9871
Epoch [121/1000], Loss: 62.3526
Epoch [131/1000], Loss: 38.0480
Epoch [141/1000], Loss: 55.1448
Epoch [151/1000], Loss: 78.5895
Epoch [161/1000], Loss: 59.0046
Epoch [171/1000], Loss: 39.7374
Epoch [181/1000], Loss: 42.4826
Epoch [191/1000], Loss: 47.6980
Epoch [201/1000], Loss: 64.8528
Epoch [211/1000], Loss: 56.5142
Epoch [221/1000], Loss: 38.6040
Epoch [231/1000], Loss: 38.2021
Epoch [241/1000], Loss: 42.4196
Epoch [251/1000], Loss: 55.9004
Epoch [261/1000], Loss: 51.1525
Epoch [271/1000], Loss: 39.3010
Epoch [281/1000], Loss: 47.9469
Epoch [291/1000], Loss: 40.8767
Epoch [301/1000], Loss: 57.1797
Epoch [311/100

In [10]:
# 테스트 루프
input_lst = []
label_lst = []
pred_lst = []
model_pm10.eval()
test_loss = 0.0
with torch.no_grad():
    for batch in test_data_loader_pm10:
        inputs, labels = batch
        outputs = model_pm10(inputs)
        test_loss += criterion(outputs, labels).item()
        input_lst.extend(inputs)
        label_lst.extend(labels)
        pred_lst.extend(outputs)

print(f'Test Loss: {test_loss / len(test_data_loader_pm10):.4f}')

Test Loss: 58.8057


#### PM25

In [13]:
# 모델 훈련
num_epochs = 100
best_loss = float('inf')  # 초기 최고 손실 설정
checkpoint_path_pm25 = 'testweights/pm25/best_model_weights.pth'

for epoch in range(num_epochs):
    for batch in train_data_loader_pm25:
        inputs, labels = batch
        optimizer_pm25.zero_grad()
        outputs = model_pm25(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer_pm25.step()

    # 현재 epoch의 손실값 확인
    current_loss = loss.item()
    
     # 검증 손실 계산
    model_pm25.eval()
    valid_loss = 0.0
    with torch.no_grad():
        for batch in valid_data_loader_pm25:
            inputs, labels = batch
            outputs = model_pm25(inputs)
            valid_loss += criterion(outputs, labels).item()
    
    # 검증 손실이 가장 낮을 때 모델 가중치 저장
    if valid_loss < best_loss:
        best_loss = valid_loss
        torch.save(model_pm25.state_dict(), checkpoint_path_pm25)

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [1/100], Loss: 155.2595
Epoch [2/100], Loss: 32.6452
Epoch [3/100], Loss: 120.4637
Epoch [4/100], Loss: 47.1863
Epoch [5/100], Loss: 23.4247
Epoch [6/100], Loss: 16.8392
Epoch [7/100], Loss: 72.9942
Epoch [8/100], Loss: 24.6682
Epoch [9/100], Loss: 17.6003
Epoch [10/100], Loss: 15.8360
Epoch [11/100], Loss: 12.1751
Epoch [12/100], Loss: 29.5226
Epoch [13/100], Loss: 22.5087
Epoch [14/100], Loss: 14.0162
Epoch [15/100], Loss: 37.6619
Epoch [16/100], Loss: 14.7038
Epoch [17/100], Loss: 27.0630
Epoch [18/100], Loss: 7.7056
Epoch [19/100], Loss: 12.7369
Epoch [20/100], Loss: 17.9581
Epoch [21/100], Loss: 9.6349
Epoch [22/100], Loss: 18.7227
Epoch [23/100], Loss: 17.2819
Epoch [24/100], Loss: 20.7117
Epoch [25/100], Loss: 22.0040
Epoch [26/100], Loss: 18.4339
Epoch [27/100], Loss: 31.3125
Epoch [28/100], Loss: 21.3373
Epoch [29/100], Loss: 22.1347
Epoch [30/100], Loss: 14.9164
Epoch [31/100], Loss: 22.6069
Epoch [32/100], Loss: 19.8822
Epoch [33/100], Loss: 30.6453
Epoch [34/100], Los

In [74]:
# 테스트 루프
model_pm25.eval()
test_loss = 0.0
with torch.no_grad():
    for batch in test_data_loader_pm25:
        inputs, labels = batch
        outputs = model_pm25(inputs)
        test_loss += criterion(outputs, labels).item()

print(f'Test Loss: {test_loss / len(test_data_loader_pm25):.4f}')

Test Loss: 82.9221
