In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

!apt-get -qq install fonts-nanum
import matplotlib.font_manager as fm
import matplotlib as mpl

fontpath = '/usr/share/fonts/truetype/nanum/NanumGothic.ttf'
fm.fontManager.addfont(fontpath)
mpl.rc('font', family='NanumGothic')
mpl.rcParams['axes.unicode_minus'] = False
#from sklearn.model_selection import train_test_split

from google.colab import drive
drive.mount('/content/drive')

Selecting previously unselected package fonts-nanum.
(Reading database ... 121918 files and directories currently installed.)
Preparing to unpack .../fonts-nanum_20200506-1_all.deb ...
Unpacking fonts-nanum (20200506-1) ...
Setting up fonts-nanum (20200506-1) ...
Processing triggers for fontconfig (2.13.1-4.2ubuntu5) ...
Mounted at /content/drive


In [5]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(actual_values, predicted_values):

    # RMSE 계산
    rmse = np.sqrt(mean_squared_error(actual_values, predicted_values))

    # R-squared 계산
    r2 = r2_score(actual_values, predicted_values)

    return rmse, r2

In [6]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import recall_score, confusion_matrix

# CPU 혹은 GPU 사용, GPU 우선적으로 사용 가능
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 데이터 불러오기 (전처리 완료된 데이터라고 가정)
train_data = pd.read_csv("/content/drive/MyDrive/train_seoul.csv")
test_data = pd.read_csv("/content/drive/MyDrive/test_seoul.csv")

# 특성과 타겟 변수 분리
X_train = train_data.drop(columns=["관측미세먼지"]).values
X_test = test_data.drop(columns=["관측미세먼지"]).values
y_train = train_data["관측미세먼지"].values
y_test = test_data["관측미세먼지"].values

# 각각 train과 test를 알맞게 데이터와 매치, 관측 지점과 관측시간은 숫자형이 아니라서 일단 배제, 추후 필요 없는 특성이면 drop, 아니라면 다른 전처리 필요
X_train = train_data.drop(columns=["관측미세먼지", "경과일","경과시간"]).apply(pd.to_numeric, errors='coerce').fillna(0).values
X_test = test_data.drop(columns=["관측미세먼지","경과일","경과시간"]).apply(pd.to_numeric, errors='coerce').fillna(0).values
y_train = train_data["관측미세먼지"].apply(pd.to_numeric, errors='coerce').fillna(0).values
y_test = test_data["관측미세먼지"].apply(pd.to_numeric, errors='coerce').fillna(0).values

# 스탠다드 스케일링
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Tensor로 변환
X_train = torch.tensor(X_train, dtype=torch.float32)
y_train = torch.tensor(y_train, dtype=torch.float32).view(-1, 1)
X_test = torch.tensor(X_test, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32).view(-1, 1)

# DataLoader 정의
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [7]:
class TransformerRegression(nn.Module):
    def __init__(self, input_dim, output_dim, num_encoder_layers=6, nhead=4, hidden_dim=512, dropout=0.1):
        super(TransformerRegression, self).__init__()
        self.input_proj = nn.Linear(input_dim, hidden_dim)
        encoder_layer = nn.TransformerEncoderLayer(d_model=hidden_dim, nhead=nhead, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_encoder_layers)
        self.fc_out = nn.Linear(hidden_dim, output_dim)
        self.hidden_dim = hidden_dim

    def forward(self, x):
        # x shape: (batch_size, seq_len, input_dim)
        x = self.input_proj(x)  # shape: (batch_size, seq_len, hidden_dim)
        x = x.permute(1, 0, 2)  # shape: (seq_len, batch_size, hidden_dim)
        x = self.transformer_encoder(x)  # shape: (seq_len, batch_size, hidden_dim)
        x = x.mean(dim=0)  # shape: (batch_size, hidden_dim)
        x = self.fc_out(x)  # shape: (batch_size, output_dim)
        return x

In [8]:
# 그리드 서치 함수 정의
def grid_search(model, train_data, train_label, dataloader, param_grid, input_dim, output_dim, num_epochs=15, batch_size=32):
    results = []
    param_list = list(ParameterGrid(param_grid))

    for params in param_list:
        model_1 = model(input_dim, output_dim, num_encoder_layers=params['num_encoder_layers'], hidden_dim=params['hidden_dim'], dropout=params['dropout']).to(device)
        criterion = nn.MSELoss()
        optimizer = getattr(optim, params['optimizer'])(model_1.parameters(), lr=params['lr'])

        print(params)
        train_losses = []
        test_losses = []

        for epoch in range(num_epochs):
            model_1.train()
            running_loss = 0.0

            for inputs, labels in dataloader:
                inputs, labels = inputs.to(device), labels.to(device)
                if inputs.dim() == 2:
                    inputs = inputs.unsqueeze(1)  # (batch_size, seq_len, input_dim)으로 맞추기 위해 차원 추가
                optimizer.zero_grad()
                outputs = model_1(inputs)
                loss = criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                running_loss += loss.item() * inputs.size(0)

            epoch_loss = running_loss / len(dataloader.dataset)
            train_losses.append(epoch_loss)

            model_1.eval()
            test_loss = 0.0
            with torch.no_grad():
                test_inputs = train_data.to(device)
                if test_inputs.dim() == 2:
                    test_inputs = test_inputs.unsqueeze(1)  # 테스트 데이터 차원도 확인
                outputs = model_1(test_inputs)
                test_loss = criterion(outputs, train_label.to(device)).item()
                outputs = outputs.squeeze().cpu().numpy()
                labels = train_label.cpu().numpy()
                result = evaluate_model(labels, outputs)
                sensitivity = recall_score((labels >= 81).astype(int), (outputs >= 81).astype(int))  # 민감도 계산 추가
            test_losses.append(test_loss)
            results.append({'params': params, 'rmse': result[0], 'r2': result[1], 'sensitivity': sensitivity})  # 민감도 결과 저장

            print(f"Epoch [{epoch + 1}/{num_epochs}], Train Loss: {epoch_loss:.4f}, Test Loss: {test_loss:.4f}, Sensitivity: {sensitivity:.4f}")  # 민감도 출력

    # 민감도가 가장 높은 모델의 파라미터 선택
    best_result_sensitivity = max(results, key=lambda x: x['sensitivity'])  # 민감도가 가장 높은 모델 선택
    best_result_rmse = min(results, key=lambda x: x['rmse'])
    best_result_r2 = max(results, key=lambda x: x['r2'])

    print("Best Parameters for Sensitivity:", best_result_sensitivity['params'])  # 민감도가 가장 높은 모델의 파라미터 출력
    print("Best Sensitivity:", best_result_sensitivity['sensitivity'])  # 민감도가 가장 높은 모델의 민감도 출력

    print("Best Parameters for RMSE:", best_result_rmse['params'])
    print("Best RMSE:", best_result_rmse['rmse'])

    print("Best Parameters for R2:", best_result_r2['params'])
    print("Best R2:", best_result_r2['r2'])

    return best_result_sensitivity, best_result_rmse, best_result_r2


In [15]:
# 하이퍼파라미터 그리드 정의
param_grid = {
    'num_encoder_layers': [4],
    'hidden_dim': [512],
    'dropout': [0.1],
    'lr': [0.01],
    'optimizer': ['Adam']
}

In [16]:
best_result_sensitivity, best_result_rmse, best_result_r2 = grid_search(TransformerRegression, X_train, y_train, train_loader, param_grid, input_dim=28, output_dim=1, num_epochs=2, batch_size=32)

# 결과 출력
print("Best result by Sensitivity:", best_result_sensitivity)
print("Best result by RMSE:", best_result_rmse)
print("Best result by R2:", best_result_r2)




{'dropout': 0.1, 'hidden_dim': 512, 'lr': 0.01, 'num_encoder_layers': 4, 'optimizer': 'Adam'}
Epoch [1/2], Train Loss: 838.7515, Test Loss: 831.1143, Sensitivity: 0.0000
Epoch [2/2], Train Loss: 834.1394, Test Loss: 829.5030, Sensitivity: 0.0000
Best Parameters for Sensitivity: {'dropout': 0.1, 'hidden_dim': 512, 'lr': 0.01, 'num_encoder_layers': 4, 'optimizer': 'Adam'}
Best Sensitivity: 0.0
Best Parameters for RMSE: {'dropout': 0.1, 'hidden_dim': 512, 'lr': 0.01, 'num_encoder_layers': 4, 'optimizer': 'Adam'}
Best RMSE: 28.801092
Best Parameters for R2: {'dropout': 0.1, 'hidden_dim': 512, 'lr': 0.01, 'num_encoder_layers': 4, 'optimizer': 'Adam'}
Best R2: -0.0030577166218106466
Best result by Sensitivity: {'params': {'dropout': 0.1, 'hidden_dim': 512, 'lr': 0.01, 'num_encoder_layers': 4, 'optimizer': 'Adam'}, 'rmse': 28.829052, 'r2': -0.0050060961534383885, 'sensitivity': 0.0}
Best result by RMSE: {'params': {'dropout': 0.1, 'hidden_dim': 512, 'lr': 0.01, 'num_encoder_layers': 4, 'optim

In [21]:
from sklearn.metrics import recall_score, confusion_matrix
import seaborn as sns

#민감도를 기준으로 최적의 파라미터 설정
best_params = best_result_sensitivity['params']
model = TransformerRegression(X_train.shape[1], 1, num_encoder_layers=best_params['num_encoder_layers'], nhead=4, hidden_dim=best_params['hidden_dim'], dropout=best_params['dropout']).to(device)
criterion = nn.MSELoss()
optimizer = getattr(optim, best_params['optimizer'])(model.parameters(), lr=best_params['lr'])

# 학습
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0

    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        if inputs.dim() == 2:
            inputs = inputs.unsqueeze(1)  # (batch_size, seq_len, input_dim)으로 맞추기 위해 차원 추가
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * inputs.size(0)

    epoch_loss = running_loss / len(train_loader.dataset)
    print(f"Epoch [{epoch + 1}/{num_epochs}], Loss: {epoch_loss:.4f}")

# 테스트 데이터에 대한 예측 및 민감도 계산
model.eval()
with torch.no_grad():
    y_pred = model(X_test.to(device)).cpu().numpy()
    y_pred = y_pred.squeeze()

# 예측값을 고농도와 저농도로 분류 (81을 기준으로 분류)
y_pred_classes = (y_pred >= 81).astype(int)
y_test_classes = (y_test.squeeze().numpy() >= 81).astype(int)

# 민감도 계산
sensitivity_high = recall_score(y_test_classes, y_pred_classes, pos_label=1)
sensitivity_low = recall_score(y_test_classes, y_pred_classes, pos_label=0)

# 혼동 행렬을 통해 세부 통계 계산
conf_matrix = confusion_matrix(y_test_classes, y_pred_classes)
tn, fp, fn, tp = conf_matrix.ravel()

# 출력
print(f'Total number of high concentration samples: {tp + fn}')
print(f'Number of correctly identified high concentration samples: {tp}')
print(f'Number of high concentration samples missed: {fn}')
print(f'Sensitivity (Recall) for high concentration data: {sensitivity_high:.4f}')

print(f'Total number of low concentration samples: {tn + fp}')
print(f'Number of correctly identified low concentration samples: {tn}')
print(f'Number of low concentration samples missed: {fp}')
print(f'Sensitivity (Recall) for low concentration data: {sensitivity_low:.4f}')

#loss
plt.figure(figsize=(10, 6))
plt.plot(y_test, label='Actual', color='blue')
plt.plot(y_pred, label='Predicted', color='red', linestyle='--')
plt.title('Actual vs Predicted')
plt.xlabel('Sample')
plt.ylabel('Value')
plt.legend()
plt.show()

#시각화
plt.figure(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues", xticklabels=["Low Concentration", "High Concentration"], yticklabels=["Low Concentration", "High Concentration"])
plt.xlabel("Predicted")
plt.ylabel("True")
plt.title("Confusion Matrix")
plt.show()


Epoch [1/10], Loss: 838.8777
Epoch [2/10], Loss: 833.8051
Epoch [3/10], Loss: 834.4033
Epoch [4/10], Loss: 833.4862
Epoch [5/10], Loss: 832.6149
Epoch [6/10], Loss: 832.8015
Epoch [7/10], Loss: 832.7676
Epoch [8/10], Loss: 831.9919
Epoch [9/10], Loss: 832.5319
Epoch [10/10], Loss: 832.0037


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 2 is not equal to len(dims) = 3