### [PyTorch 기반 회귀 모델 구현] <hr>
- Layer => Fully-Connected Layer, Linear
- 손실함수 => MSELoss, L1Loss(MAE), ...

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler 

import torch
import torch.nn as nn
from torch.utils.data import random_split
from torchmetrics.functional import mean_squared_error, r2_score
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import ReduceLROnPlateau

[1] 데이터 준비

In [2]:
# Relative path of the housing CSV that the next cell loads.
data_file = '../DATA/BostonHousing.csv'

In [3]:
# Load the CSV into a DataFrame and preview its first rows.
HousingDF = pd.read_csv(data_file)
HousingDF.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [4]:
# Summarize the columns: names, non-null counts and dtypes.
HousingDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  b        506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [5]:
# (rows, columns) of the loaded frame.
HousingDF.shape

(506, 14)

In [6]:
# Count how often each distinct value of the 'tax' column occurs.
HousingDF['tax'].value_counts()

tax
666    132
307     40
403     30
437     15
304     14
      ... 
285      1
198      1
256      1
244      1
313      1
Name: count, Length: 66, dtype: int64

In [7]:
# Target: the last column of the frame.
targetSR = HousingDF.iloc[:, -1]

# Features: every other column except 'chas' and 'rad'.
# Index.difference() returns the remaining labels sorted, so the feature
# columns end up in alphabetical order rather than file order.
featureDF = HousingDF.iloc[:, :-1]
featureDF = featureDF.loc[:, featureDF.columns.difference(['chas', 'rad'])]

In [8]:
# Scratch check: max() along dim 1 returns (values, indices); the indices
# field holds the column position of each row's maximum.
a = torch.tensor([[1, 2, 3],
                  [-1, -2, 5]])
a.max(dim=1, keepdim=True).indices

tensor([[2],
        [2]])

[2] 사용자 데이터셋 클래스 정의

In [9]:
# Custom dataset class definition
class BostonDS(Dataset):
    """Map-style dataset pairing a feature matrix with a target vector.

    Features and targets are stored as float32 tensors exactly as given.
    NOTE: no normalization is performed here; apply any feature scaling
    before constructing the dataset.

    Args:
        x_data: Features, one row per sample. A pandas DataFrame/Series or
            anything ``numpy.asarray`` can convert.
        y_data: Targets, one entry per sample. A pandas Series/DataFrame or
            anything ``numpy.asarray`` can convert.

    Raises:
        ValueError: If ``x_data`` and ``y_data`` hold a different number of
            samples.
    """

    def __init__(self, x_data, y_data):
        super().__init__()
        # pandas containers => ndarray, so row labels never affect indexing
        if isinstance(x_data, (pd.DataFrame, pd.Series)):
            x_data = x_data.to_numpy()
        if isinstance(y_data, (pd.DataFrame, pd.Series)):
            y_data = y_data.to_numpy()

        # torch.tensor always copies, so later changes to the caller's
        # arrays cannot leak into the dataset.
        self.feature = torch.tensor(np.asarray(x_data), dtype=torch.float32)
        self.target = torch.tensor(np.asarray(y_data), dtype=torch.float32)

        # Fail early instead of silently pairing misaligned rows.
        if self.feature.shape[0] != self.target.shape[0]:
            raise ValueError(
                f'feature/target sample counts differ: '
                f'{self.feature.shape[0]} != {self.target.shape[0]}'
            )
        self.length = self.feature.shape[0]

    def __len__(self):
        """Return the number of samples."""
        return self.length

    def __getitem__(self, index):
        """Return the ``(feature, target)`` pair stored at ``index``."""
        return self.feature[index], self.target[index]

In [10]:
# First split: hold out 20% of the rows as the test set (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    featureDF, targetSR, test_size=0.2, random_state=42
)

In [11]:
# Second split: carve 10% of the remaining training rows off as validation data.
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.1, random_state=42
)

In [12]:
# Fit the scaler on the training split only, then apply the same transform
# to all three splits.
MY_SCALER = StandardScaler()
MY_SCALER.fit(X_train)
scaled_X_train, scaled_X_val, scaled_X_test = (
    MY_SCALER.transform(split) for split in (X_train, X_val, X_test)
)

In [13]:
# Wrap each scaled split together with its targets in a BostonDS.
trainDS, validDS, testDS = (
    BostonDS(features, targets)
    for features, targets in (
        (scaled_X_train, y_train),
        (scaled_X_val, y_val),
        (scaled_X_test, y_test),
    )
)

In [14]:
# Create the DataLoaders
# drop_last : what to do with the samples left over after splitting the
#             dataset into batches [default : False]. With True the trailing
#             partial batch is discarded.
BATCH_SIZE = 10

# Reshuffle the training samples every epoch so the batches differ between
# epochs; without it every epoch replays identical batches in identical order.
trainDL = DataLoader(trainDS, batch_size = BATCH_SIZE, shuffle = True, drop_last = True)

# NOTE(review): the evaluation loaders also drop the trailing partial batch, so
# those samples are never scored. Kept as-is because r2_score is computed per
# batch and needs at least two samples in every batch.
validDL = DataLoader(validDS, batch_size = BATCH_SIZE, drop_last = True)
testDL = DataLoader(testDS, batch_size = BATCH_SIZE, drop_last = True)

In [15]:
# Iteration units per epoch: batch size, samples per dataset, batches per loader
print(f'batch_size = {BATCH_SIZE}')
print(f'trainDS => {len(trainDS)}개, validDS => {len(validDS)}개, testDS => {len(testDS)}개')
print(f'trainDL => {len(trainDL)}개, validDL => {len(validDL)}개, testDL => {len(testDL)}개')

batch_size = 10
trainDS => 363개, validDS => 41개, testDS => 102개
trainDL => 36개, validDL => 4개, testDL => 10개


In [104]:
class LinearModel(nn.Module):
    """Small fully-connected regressor: in_dim -> 4 -> 2 -> 1.

    Each hidden Linear layer is followed by BatchNorm1d, ReLU and Dropout
    (default probability). The final Linear layer emits one value per sample.

    Args:
        in_dim: Number of input features per sample.
    """

    def __init__(self, in_dim):
        super().__init__()
        stack = [
            nn.Linear(in_dim, 4),
            nn.BatchNorm1d(num_features=4),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(4, 2),
            nn.BatchNorm1d(num_features=2),
            nn.ReLU(),
            nn.Dropout(),
            nn.Linear(2, 1),
        ]
        self.layer = nn.Sequential(*stack)
        self.init_weights()

    def init_weights(self):
        """Re-initialize every Linear layer: Kaiming-normal weights, zero biases."""
        linear_layers = (m for m in self.modules() if isinstance(m, nn.Linear))
        for linear in linear_layers:
            nn.init.kaiming_normal_(linear.weight.data)
            nn.init.zeros_(linear.bias.data)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the result."""
        return self.layer(x)

In [105]:
# Prefer the GPU when CUDA is usable; otherwise run on the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

print('Using PyTorch version:', torch.__version__, ' Device:', DEVICE)

Using PyTorch version: 2.2.2  Device: cpu


In [106]:
### ===> Model, optimizer, objective function and LR scheduler setup
# Derive the input width from the training features instead of hard-coding it,
# so a change in the selected columns cannot desynchronize model and data.
MY_MODEL = LinearModel(trainDS.feature.shape[1]).to(DEVICE)

# Adam with its default hyper-parameters; mean squared error as the objective.
OPTIMIZER = torch.optim.Adam(MY_MODEL.parameters())
LOSS_FN = nn.MSELoss()
# Lowers the learning rate once the monitored value (to be minimized) has
# stopped improving for more than `patience` epochs.
SCHEDULER = ReduceLROnPlateau(OPTIMIZER, mode = 'min', patience = 5)
print(MY_MODEL)

LinearModel(
  (layer): Sequential(
    (0): Linear(in_features=11, out_features=4, bias=True)
    (1): BatchNorm1d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4, out_features=2, bias=True)
    (5): BatchNorm1d(2, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0.5, inplace=False)
    (8): Linear(in_features=2, out_features=1, bias=True)
  )
)


In [107]:
# List only the Linear sub-modules of the model.
for linear in filter(lambda module: isinstance(module, nn.Linear), MY_MODEL.modules()):
    print(linear)

Linear(in_features=11, out_features=4, bias=True)
Linear(in_features=4, out_features=2, bias=True)
Linear(in_features=2, out_features=1, bias=True)


In [111]:
### ===> Training function: one pass over the training loader
def training():
    """Train MY_MODEL for one epoch on trainDL.

    Returns:
        tuple: ``(train_loss, train_accuracy)`` — the mean of the per-batch
        LOSS_FN values and the mean of the per-batch R2 scores.
    """
    # Training mode => BatchNorm uses batch statistics and Dropout is active
    MY_MODEL.train()

    batch_losses, batch_scores = [], []
    for feature, target in trainDL:
        # Move the batch to the compute device; targets become a column
        # vector to match the shape the model produces.
        feature = feature.to(DEVICE)
        target = target.to(DEVICE).unsqueeze(1)

        # Forward pass
        prediction = MY_MODEL(feature)

        # Loss for this batch
        loss = LOSS_FN(prediction, target)
        batch_losses.append(loss.item())

        # R2 score for this batch
        batch_scores.append(r2_score(prediction, target).item())

        # Update weights and biases
        OPTIMIZER.zero_grad()
        loss.backward()
        OPTIMIZER.step()

    # Epoch-level summary: average over all batches
    return np.mean(batch_losses), np.mean(batch_scores)

In [112]:
### ===> Validation / test evaluation function
def testing(data_loader=None):
    """Evaluate MY_MODEL on a loader without updating its parameters.

    Args:
        data_loader: Loader to evaluate. Defaults to ``validDL`` when omitted,
            so existing ``testing()`` calls keep scoring the validation set;
            pass another loader (e.g. the test loader) to score it instead.

    Returns:
        tuple: ``(val_loss, val_accuracy)`` — the mean of the per-batch
        LOSS_FN values and the mean of the per-batch R2 scores.
    """
    if data_loader is None:
        data_loader = validDL

    # Evaluation mode => Dropout is disabled and BatchNorm uses running statistics
    MY_MODEL.eval()

    batch_losses, batch_scores = [], []
    # No gradients are needed for evaluation
    with torch.no_grad():
        for feature, target in data_loader:
            # Move the batch to the compute device
            feature, target = feature.to(DEVICE), target.to(DEVICE)

            # Prediction; targets become a column vector to match its shape
            pre_target = MY_MODEL(feature)
            target = target.unsqueeze(1)

            # Store plain Python floats: np.mean over a list of tensors
            # fails when the tensors live on a CUDA device.
            loss = LOSS_FN(pre_target, target)
            batch_losses.append(loss.item())

            # R2 score for this batch
            acc = r2_score(pre_target, target)
            batch_scores.append(acc.item())

    # Summary: average over all batches
    val_loss = np.mean(batch_losses)
    val_accuracy = np.mean(batch_scores)

    return val_loss, val_accuracy

In [113]:
EPOCHS = 500
# Early stopping keeps its own counter and a patience larger than the
# scheduler's. ReduceLROnPlateau lowers the learning rate only after MORE than
# `patience` epochs without improvement and then resets its counter, so
# stopping on `SCHEDULER.num_bad_epochs >= SCHEDULER.patience` always ended
# training one epoch before the scheduler could act.
EARLY_STOP_PATIENCE = 15

# History per epoch: index 0 = training, index 1 = validation
loss_list = [[], []]
accuracy_list = [[], []]

best_val_loss = float('inf')
stale_epochs = 0

for epoch in range(1, EPOCHS + 1):

    train_loss, train_accuracy = training()
    val_loss, val_accuracy = testing()

    print(f"\n[EPOCH: {epoch}], \tMSE Loss: {train_loss:.4f}, \tTrain Accuracy: {train_accuracy:.2f}\n")
    print(f"\n[EPOCH: {epoch}], \tMSE Loss: {val_loss:.4f}, \tVal Accuracy: {val_accuracy:.2f}\n")

    # Record the epoch before any early exit so the final epoch is kept too
    loss_list[0].append(train_loss)
    loss_list[1].append(val_loss)
    accuracy_list[0].append(train_accuracy)
    accuracy_list[1].append(val_accuracy)

    # Let the scheduler react to the validation loss
    SCHEDULER.step(val_loss)

    # Early stopping => end training once val_loss has not improved for
    # EARLY_STOP_PATIENCE consecutive epochs
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        stale_epochs = 0
    else:
        stale_epochs += 1

    if stale_epochs >= EARLY_STOP_PATIENCE:
        print(f'Early stopping at epoch {epoch}')
        break


[EPOCH: 1], 	MSE Loss: 611.4171, 	Train Accuracy: -8.50


[EPOCH: 1], 	MSE Loss: 561.0781, 	Val Accuracy: -9.88


[EPOCH: 2], 	MSE Loss: 604.8598, 	Train Accuracy: -8.42


[EPOCH: 2], 	MSE Loss: 556.7396, 	Val Accuracy: -9.79


[EPOCH: 3], 	MSE Loss: 600.1387, 	Train Accuracy: -8.34


[EPOCH: 3], 	MSE Loss: 554.3311, 	Val Accuracy: -9.74


[EPOCH: 4], 	MSE Loss: 596.5798, 	Train Accuracy: -8.24


[EPOCH: 4], 	MSE Loss: 550.9536, 	Val Accuracy: -9.66


[EPOCH: 5], 	MSE Loss: 589.8144, 	Train Accuracy: -8.15


[EPOCH: 5], 	MSE Loss: 547.7042, 	Val Accuracy: -9.59


[EPOCH: 6], 	MSE Loss: 588.3133, 	Train Accuracy: -8.09


[EPOCH: 6], 	MSE Loss: 544.7639, 	Val Accuracy: -9.52


[EPOCH: 7], 	MSE Loss: 582.2570, 	Train Accuracy: -7.99


[EPOCH: 7], 	MSE Loss: 542.4207, 	Val Accuracy: -9.47


[EPOCH: 8], 	MSE Loss: 575.6915, 	Train Accuracy: -7.99


[EPOCH: 8], 	MSE Loss: 535.6500, 	Val Accuracy: -9.33


[EPOCH: 9], 	MSE Loss: 569.6431, 	Train Accuracy: -7.90


[EPOCH: 9], 	MSE Loss: 529.96