In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

!apt-get -qq install fonts-nanum
import matplotlib.font_manager as fm
import matplotlib as mpl

fontpath = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
fm.fontManager.addfont(fontpath)
mpl.rc('font', family='NanumGothic')
mpl.rcParams['axes.unicode_minus'] = False
#from sklearn.model_selection import train_test_split

from google.colab import drive
drive.mount('/content/drive')

data = pd.read_csv('/content/drive/MyDrive/data_2024_3.csv', encoding='euc-kr')

# Shuffle the data
data = data.sample(frac=1).reset_index(drop=True)

print(data.head())


# Define split ratios
#train_ratio = 0.7
#validation_ratio = 0.15
#test_ratio = 0.15

# Split the data into training, validation, and testing sets
#train_data, temp_data = train_test_split(data, test_size=(1 - train_ratio))
#val_data, test_data = train_test_split(temp_data, test_size=test_ratio/(test_ratio + validation_ratio))

# Save the splits to CSV files if needed
#train_data.to_csv("train_data.csv", index=False)
#val_data.to_csv("val_data.csv", index=False)
#test_data.to_csv("test_data.csv", index=False)

data.dropna(subset=["관측시간", "관측지점"], inplace=True)

# 관측시간을 datetime 타입으로 변환
data["관측시간"] = pd.to_datetime(data["관측시간"])

data[["관측지점", "관측지점세부"]] = data["관측지점"].str.split('_', expand=True)

columns_to_fill = ["관측최대풍속"]
columns_to_fill = ["관측최대풍속", "(AVOC)관측최대풍속", "(BVOC)관측최대풍속"]

for column in columns_to_fill:
    data[column] = data.groupby(["관측시간", "관측지점"])[column].transform(
        lambda x: x.fillna(x.mean())
    )

    # 그래도 남아있는 결측치는 관측지점으로 채우기
    data[column] = data.groupby('관측지점')[column].transform(
        lambda x: x.fillna(x.mean())
    )

    # 그래도 남아있는 결측치는 관측시간으로 채우기
    data[column] = data.groupby('관측시간')[column].transform(
        lambda x: x.fillna(x.mean())
    )

    # 여전히 남아있는 결측치는 전체 평균으로 채우기
    overall_mean = data[column].mean()
    data[column].fillna(overall_mean, inplace=True)

    # 결측치 처리 전 결측치 확인
if data.isnull().values.any():
    print("데이터프레임에 결측치가 있습니다.")
else:
    print("데이터프레임에 결측치가 없습니다.")

# 각 열의 결측치 수 확인
missing_values = data.isnull().sum()
print("각 열의 결측치 수:")
print(missing_values)

data.sort_values(by="관측시간", inplace=True)

train_ratio = 0.9

train_size = int(len(data) * train_ratio)
train_data = data.iloc[:train_size]
test_data = data.iloc[train_size:]

print("훈련 데이터 크기:", len(train_data))
print("테스트 데이터 크기:", len(test_data))

# 아직 관측지점 및 관측지점세부 고려하지 않음
train_data = train_data.drop(columns=['관측지점', '관측지점세부'])
test_data = test_data.drop(columns=['관측지점', '관측지점세부'])

train_data.to_csv("train_data.csv", index=False)
test_data.to_csv("test_data.csv", index=False)

Selecting previously unselected package fonts-nanum.
(Reading database ... 121918 files and directories currently installed.)
Preparing to unpack .../fonts-nanum_20200506-1_all.deb ...
Unpacking fonts-nanum (20200506-1) ...
Setting up fonts-nanum (20200506-1) ...
Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...
Mounted at /content/drive
       관측지점              관측시간  관측온도  관측습도    관측기압  관측풍속   관측풍향  관측최대풍속  \
0     인천_해안  2024-03-30 14:40  13.2  34.0  1009.2   1.3  330.7     3.7   
1  양간지대_주거1  2024-03-08 11:50   5.2  34.6  1012.2   2.1  223.1     5.9   
2     대구_강가  2024-03-04 14:10  12.8  36.7  1018.1   1.1  201.7     2.6   
3     인천_완충  2024-03-12 21:30   7.0  56.4  1018.3   0.1   13.1     1.9   
4     종로_도심  2024-03-07 22:10   4.1  40.0  1014.4   1.2   32.3     4.0   

   관측미세먼지  관측초미세먼지  ...  (AVOC)배관관측온도  (BVOC)관측온도  (BVOC)관측습도  (BVOC)관측기압  \
0    66.2     27.8  ...         106.3        13.2        34.1      1009.2   
1    40.5     29.9  ...         231.0         5.0   

In [5]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
import eval_vis


# CPU 혹은 GPU 사용, GPU 우선적으로 사용 가능
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 데이터 불러오기 (전처리 완료된 데이터라고 가정)
train_data = pd.read_csv("/content/train_data.csv")
test_data = pd.read_csv("/content/test_data.csv")

# 특성과 타겟 변수 분리
X_train = train_data.drop(columns=["관측미세먼지"]).values
X_test = test_data.drop(columns=["관측미세먼지"]).values
y_train = train_data["관측미세먼지"].values
y_test = test_data["관측미세먼지"].values

# 각각 train과 test를 알맞게 데이터와 매치, 관측 지점과 관측시간은 숫자형이 아니라서 일단 배제, 추후 필요 없는 특성이면 drop, 아니라면 다른 전처리 필요
X_train = train_data.drop(columns=["관측미세먼지", "관측시간"]).apply(pd.to_numeric, errors='coerce').fillna(0).values
X_test = test_data.drop(columns=["관측미세먼지", "관측시간"]).apply(pd.to_numeric, errors='coerce').fillna(0).values
y_train = train_data["관측미세먼지"].apply(pd.to_numeric, errors='coerce').fillna(0).values
y_test = test_data["관측미세먼지"].apply(pd.to_numeric, errors='coerce').fillna(0).values

# 스탠다드 스케일링
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Tensor로 변환
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# DataLoader 정의
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# 모델 정의
class MLP(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim1=64, hidden_dim2=32, dropout=0.2):
        super(MLP, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim1)
        self.fc2 = nn.Linear(hidden_dim1, hidden_dim2)
        self.fc3 = nn.Linear(hidden_dim2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x

# 그리드 서치 함수 정의
def grid_search(model, train_data, train_label, dataloader, param_grid, input_dim, output_dim, num_epochs=15, batch_size=32):
    results = []

    param_list = list(ParameterGrid(param_grid))

    for params in param_list:
        model_1 = model(input_dim, output_dim, hidden_dim1=params['hidden_dim1'], hidden_dim2=params['hidden_dim2'], dropout=params['dropout']).to(device)
        criterion = nn.MSELoss()
        optimizer = getattr(optim, params['optimizer'])(model_1.parameters(), lr=params['lr'])

        print(params)
        train_losses = []
        test_losses = []

        for epoch in range(num_epochs):
            model_1.train()
            running_loss = 0.0

            for inputs, labels in dataloader:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                outputs = model_1(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item() * inputs.size(0)

            epoch_loss = running_loss / len(dataloader.dataset)
            train_losses.append(epoch_loss)

            model_1.eval()
            test_loss = 0.0
            with torch.no_grad():
                outputs = model_1(train_data.to(device))
                test_loss = criterion(outputs, train_label.to(device)).item()
                outputs = outputs.squeeze().cpu().numpy()
                labels = train_label.cpu().numpy()
                result = eval_vis.evaluate_model(labels, outputs)
            test_losses.append(test_loss)
            results.append({'params': params, 'rmse': result[0], 'r2': result[1]})

            print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Test Loss: {test_loss:.4f}")

    best_result_rmse = min(results, key=lambda x: x['rmse'])
    best_result_r2 = max(results, key=lambda x: x['r2'])

    print("Best Parameters for RMSE:", best_result_rmse['params'])
    print("Best RMSE:", best_result_rmse['rmse'])

    print("Best Parameters for R2:", best_result_r2['params'])
    print("Best R2:", best_result_r2['r2'])

    return best_result_rmse, best_result_r2

# 하이퍼파라미터 그리드 정의
param_grid = {
    'hidden_dim1': [64],
    'hidden_dim2': [32],
    'dropout': [0.2 , 0.3],
    'optimizer': ['Adam' , 'SGD'],
    'lr': [0.001]
}



# Cross Validation

In [6]:
#Cross validation Module

from sklearn.model_selection import KFold
n_splits = 5

kf = KFold(n_splits = n_splits , shuffle = True)

fold_results = []

for fold, (train_indices, val_indices) in enumerate(kf.split(X_train)):
  print(f'Fold {fold + 1}')
  X_fold_train, X_fold_val = X_train[train_indices], X_train[val_indices]
  y_fold_train, y_fold_val = y_train[train_indices], y_train[val_indices]
  fold_train_dataset = TensorDataset(X_fold_train, y_fold_train)
  fold_train_loader = DataLoader(fold_train_dataset, batch_size=32, shuffle=True)



  # 그리드 서치 수행
  best_result_rmse, best_result_r2 = grid_search(MLP, X_train, y_train, train_loader, param_grid, X_train.shape[1], 1, num_epochs=10, batch_size=32)

  # 최적의 하이퍼파라미터 조합으로 모델 학습
  best_params = best_result_rmse['params']
  model = MLP(X_fold_train.shape[1], 1, hidden_dim1=best_params['hidden_dim1'], hidden_dim2=best_params['hidden_dim2'], dropout=best_params['dropout']).to(device)
  criterion = nn.MSELoss()
  optimizer = getattr(optim, best_params['optimizer'])(model.parameters(), lr=best_params['lr'])


Fold 1
{'dropout': 0.2, 'hidden_dim1': 64, 'hidden_dim2': 32, 'lr': 0.001, 'optimizer': 'Adam'}
Epoch [1/10], Train Loss: 64.3299, Test Loss: 3.9963
Epoch [2/10], Train Loss: 30.6775, Test Loss: 3.8442
Epoch [3/10], Train Loss: 26.2433, Test Loss: 4.0523
Epoch [4/10], Train Loss: 23.9165, Test Loss: 4.0539
Epoch [5/10], Train Loss: 21.8711, Test Loss: 4.6455
Epoch [6/10], Train Loss: 20.9000, Test Loss: 4.3049


KeyboardInterrupt: 