In [451]:
import numpy as np
import pandas as pd
import os

import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import KFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler

from torchmetrics.regression import MeanSquaredError, R2Score

In [452]:
DATA_PATH = "./Data/"

In [453]:
train_df = pd.read_csv(DATA_PATH + 'train.csv')
train_df

Unnamed: 0,ID,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년),가격(백만원)
0,TRAIN_0000,P사,TayGTS,Nearly New,86.077,AWD,13642,0,No,2,159.66
1,TRAIN_0001,K사,Niro,Nearly New,56.000,FWD,10199,6,No,0,28.01
2,TRAIN_0002,A사,eT,Brand New,91.200,AWD,2361,7,No,0,66.27
3,TRAIN_0003,A사,RSeTGT,Nearly New,,AWD,21683,3,No,0,99.16
4,TRAIN_0004,B사,i5,Pre-Owned,61.018,AWD,178205,1,No,0,62.02
...,...,...,...,...,...,...,...,...,...,...,...
7492,TRAIN_7492,H사,ION5,Brand New,,AWD,3773,10,No,0,35.95
7493,TRAIN_7493,B사,i3,Pre-Owned,46.000,RWD,135411,2,No,0,23.40
7494,TRAIN_7494,P사,TayCT,Brand New,,AWD,1363,2,No,0,120.00
7495,TRAIN_7495,B사,i3,Nearly New,56.000,RWD,39445,6,No,2,24.00


In [454]:
test_df = pd.read_csv(DATA_PATH + 'test.csv')
test_df.head()

Unnamed: 0,ID,제조사,모델,차량상태,배터리용량,구동방식,주행거리(km),보증기간(년),사고이력,연식(년)
0,TEST_000,P사,TayCT,Nearly New,76.093,AWD,14057,2,No,0
1,TEST_001,B사,iX,Brand New,90.0,AWD,7547,8,No,0
2,TEST_002,B사,i5,Brand New,,RWD,7197,7,Yes,0
3,TEST_003,H사,ION5,Nearly New,68.479,AWD,10357,7,No,1
4,TEST_004,K사,EV6,Brand New,,FWD,7597,10,No,0


In [455]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   ID        7497 non-null   object 
 1   제조사       7497 non-null   object 
 2   모델        7497 non-null   object 
 3   차량상태      7497 non-null   object 
 4   배터리용량     4786 non-null   float64
 5   구동방식      7497 non-null   object 
 6   주행거리(km)  7497 non-null   int64  
 7   보증기간(년)   7497 non-null   int64  
 8   사고이력      7497 non-null   object 
 9   연식(년)     7497 non-null   int64  
 10  가격(백만원)   7497 non-null   float64
dtypes: float64(2), int64(3), object(6)
memory usage: 644.4+ KB


In [456]:
train_df['제조사'].value_counts()

제조사
H사    1237
B사    1169
K사    1164
A사    1142
T사    1109
P사    1071
V사     605
Name: count, dtype: int64

In [457]:
train_df['보증기간'] = train_df['보증기간(년)'].apply(lambda x: 'high' if x >= 7 else 'low')
test_df['보증기간'] = test_df['보증기간(년)'].apply(lambda x: 'high' if x >= 7 else 'low')

In [458]:
def fill_battery(cond_df, df):
    for i in range(len(cond_df)):
        model = cond_df.iloc[i]['모델']
        status = cond_df.iloc[i]['차량상태']
        warranty = cond_df.iloc[i]['보증기간(년)']
        
        cond = (train_df['모델'] == model) & (train_df['차량상태'] == status) & (train_df['보증기간(년)'] == warranty)       
        df.loc[cond_df.iloc[i].name, '배터리용량'] = train_df[cond]['배터리용량'].mean()

In [459]:
fill_battery(train_df[train_df['배터리용량'].isna()], train_df)
fill_battery(test_df[test_df['배터리용량'].isna()], test_df)

In [472]:
train_df = train_df.fillna(train_df[train_df['제조사'] == 5]['배터리용량'].mean())


In [473]:
test_df = test_df.fillna(train_df[train_df['제조사'] == 5]['배터리용량'].mean())

In [474]:
train_df.drop('ID', axis = 1, inplace = True)
test_df.drop('ID', axis = 1, inplace = True)

KeyError: "['ID'] not found in axis"

In [475]:
encoder = LabelEncoder()

object_list = ['제조사', '모델', '차량상태', '구동방식', '보증기간', '사고이력']

for col in object_list:
    train_df[col] = encoder.fit_transform(train_df[col])
    test_df[col] = encoder.fit_transform(test_df[col])

In [476]:
target = train_df[['가격(백만원)']]
feature = train_df.drop('가격(백만원)', axis = 1)

In [477]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 846 entries, 0 to 845
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   제조사       846 non-null    int64  
 1   모델        846 non-null    int64  
 2   차량상태      846 non-null    int64  
 3   배터리용량     846 non-null    float64
 4   구동방식      846 non-null    int64  
 5   주행거리(km)  846 non-null    int64  
 6   보증기간(년)   846 non-null    int64  
 7   사고이력      846 non-null    int64  
 8   연식(년)     846 non-null    int64  
 9   보증기간      846 non-null    int64  
dtypes: float64(1), int64(9)
memory usage: 66.2 KB


In [478]:
class CustomDataset(Dataset):
    def __init__(self, featureDF, targetDF):
        self.featureDF = featureDF
        self.targetDF = targetDF
        self.n_rows = self.featureDF.shape[0]
        self.n_cols = self.featureDF.shape[1]
    
    def __len__(self):
        return self.n_rows

    def __getitem__(self, index):
        featureTS = torch.FloatTensor(self.featureDF.iloc[index])
        targetTS = torch.FloatTensor(self.targetDF.iloc[index])

        return featureTS, targetTS

In [479]:
class MLPModel(nn.Module):
    def __init__(self, input_size, hidden_dim, output_dim):
        super().__init__()

        self.input_layer = nn.Linear(input_size, hidden_dim)
        self.hidden_layer = nn.Linear(hidden_dim, hidden_dim)
        self.output_layer = nn.Linear(hidden_dim, output_dim)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0)

    def forward(self, x):
        inputs = self.relu(self.input_layer(x))

        hidden = self.relu(self.hidden_layer(inputs))
        hidden = self.dropout(hidden)
        
        output = self.output_layer(hidden)

        return output

In [480]:
def testing(featureDF, targetDF, model):
    featureTS = torch.FloatTensor(featureDF.values).to(DEVICE)
    targetTS = torch.FloatTensor(targetDF.values).to(DEVICE)

    model.eval()

    with torch.no_grad():
        pre_val = model(featureTS)
        mse_loss_val = MSEloss(pre_val, targetTS)
        score_val = R2score(pre_val, targetTS)

    return mse_loss_val, score_val, pre_val

In [481]:
def training(model, featureDF, targetDF, optimizer, EPOCH, scheduler, DEVICE, k_folds = 5):
    
    SAVE_PATH = './saved_models/'
    os.makedirs(SAVE_PATH, exist_ok = True)    

    BREAK_CNT_LOSS = 0
    BREAK_CNT_SCORE = 0
    LIMIT_VALUE = 10
    
    MSE_LOSS_HISTORY, SCORE_HISTORY = [[], []], [[], []]

    kfold = KFold(n_splits = k_folds, shuffle = True, random_state = 7)

    fold = 1
    i = 1
    
    for train_idx, val_idx in kfold.split(featureDF):
        print(f'Fold {fold}/{k_folds}')

        X_train, X_val = featureDF.iloc[train_idx], featureDF.iloc[val_idx]
        y_train, y_val = targetDF.iloc[train_idx], targetDF.iloc[val_idx]

        train_dataset = CustomDataset(X_train, y_train)
        val_dataset = CustomDataset(X_val, y_val)

        trainDL = DataLoader(train_dataset, batch_size = 32, shuffle = True)
        valDL = DataLoader(val_dataset, batch_size = 32)

        for epoch in range(1, EPOCH + 1):
            model.train()
            SAVE_WEIGHT = os.path.join(SAVE_PATH, f'model_weights_{i}.pth')

            mse_loss_total, score_total = 0, 0

            for featureTS, targetTS in trainDL:
                featureTS = featureTS.to(DEVICE)
                targetTS = targetTS.to(DEVICE)

                pre_y = model(featureTS)
                mse_loss = MSEloss(pre_y, targetTS)

                mse_loss_total += mse_loss.item()

                score = R2score(pre_y, targetTS)
                score_total += score.item()

                optimizer.zero_grad()
                mse_loss.backward()
                optimizer.step()
            
            test_mse_loss, test_score, pre_val = testing(X_val, y_val, model)

            MSE_LOSS_HISTORY[1].append(test_mse_loss)
            SCORE_HISTORY[1].append(test_score)

            MSE_LOSS_HISTORY[0].append(mse_loss_total / len(trainDL))
            SCORE_HISTORY[0].append(score_total / len(trainDL))
            
            train_mse_loss = (mse_loss_total / len(trainDL))
            print(f'[{epoch} / {EPOCH}]\n - TRAIN LOSS : {MSE_LOSS_HISTORY[0][-1]}\n - TRAIN SCORE : {SCORE_HISTORY[0][-1]}')
            print(f'[{epoch} / {EPOCH}]\n - TEST LOSS : {MSE_LOSS_HISTORY[1][-1]}\n - TEST SCORE : {SCORE_HISTORY[1][-1]}')
            
            scheduler.step(train_mse_loss)

            if len(MSE_LOSS_HISTORY) >= 2:
                if MSE_LOSS_HISTORY[-1] >= MSE_LOSS_HISTORY[-2]: BREAK_CNT_LOSS += 1

            if len(MSE_LOSS_HISTORY) == 1:
                torch.save(model.state_dict(), SAVE_WEIGHT)

            else:
                if MSE_LOSS_HISTORY[-1] < min(MSE_LOSS_HISTORY[:-1]):
                    torch.save(model.state_dict(), SAVE_WEIGHT)


            if BREAK_CNT_LOSS > LIMIT_VALUE:
                print(f"{epoch} EPOCH에 학습 중단")
                break
            
            i += 1
        fold += 1

    return MSE_LOSS_HISTORY, SCORE_HISTORY

In [482]:
EPOCH = 100
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
LR = 0.001

input_size = 10
hidden_dim = 128
output_dim = 1

mlp_model = MLPModel(input_size, hidden_dim, output_dim).to(DEVICE)

MSEloss = MeanSquaredError().to(DEVICE)
R2score = R2Score().to(DEVICE)

optimizer = optim.Adam(mlp_model.parameters(), lr = LR)

scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, mode = 'min', patience = 10, verbose = True)



In [484]:
mse_loss, r2_score = training(mlp_model, feature, target, optimizer, EPOCH, scheduler, DEVICE)

Fold 1/5
[1 / 100]
 - TRAIN LOSS : 753.6688833033785
 - TRAIN SCORE : 0.41349952810622276
[1 / 100]
 - TEST LOSS : 781.351806640625
 - TEST SCORE : 0.43280547857284546
[2 / 100]
 - TRAIN LOSS : 732.7574643074198
 - TRAIN SCORE : 0.4249525720134695
[2 / 100]
 - TEST LOSS : 742.1983032226562
 - TEST SCORE : 0.4612275958061218
[3 / 100]
 - TRAIN LOSS : 774.0954708342856
 - TRAIN SCORE : 0.39846387822577295
[3 / 100]
 - TEST LOSS : 716.7151489257812
 - TEST SCORE : 0.4797261953353882
[4 / 100]
 - TRAIN LOSS : 719.9424871891102
 - TRAIN SCORE : 0.43748000866555153
[4 / 100]
 - TEST LOSS : 818.0748291015625
 - TEST SCORE : 0.40614771842956543
[5 / 100]
 - TRAIN LOSS : 718.3665039387155
 - TRAIN SCORE : 0.4422815112357444
[5 / 100]
 - TEST LOSS : 731.1386108398438
 - TEST SCORE : 0.46925604343414307
[6 / 100]
 - TRAIN LOSS : 708.2142784443307
 - TRAIN SCORE : 0.4404970093610439
[6 / 100]
 - TEST LOSS : 714.8839111328125
 - TEST SCORE : 0.48105549812316895
[7 / 100]
 - TRAIN LOSS : 704.1797741

In [188]:
def predict(featureDF, model, DEVICE):
    # DataFrame -> Tensor 변환
    featureTS = torch.FloatTensor(featureDF.to_numpy().astype(np.float32)).to(DEVICE)
    
    model.eval()  # 평가 모드로 전환 (dropout 등 비활성화)
    
    with torch.no_grad():  # 그래디언트 계산 비활성화
        predictions = model(featureTS)  # 예측 수행
    
    return predictions.cpu().numpy()  # 결과를 numpy로 반환


In [189]:
test_pred = predict(test_feature, mlp_model, DEVICE)

In [190]:
submission = pd.read_csv(DATA_PATH + 'sample_submission.csv')
submission['가격(백만원)'] = test_pred

submission

Unnamed: 0,ID,가격(백만원)
0,TEST_000,131.109787
1,TEST_001,80.255913
2,TEST_002,64.103508
3,TEST_003,34.632587
4,TEST_004,47.696903
...,...,...
841,TEST_841,150.898682
842,TEST_842,38.766518
843,TEST_843,38.746567
844,TEST_844,58.793934


In [191]:
submission.to_csv(DATA_PATH + 'sample_submission.csv', index = False)