In [1]:
import numpy as np
from torch.utils.data import Dataset, DataLoader
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load the training data from disk.
df = pd.read_csv('scaled_train.csv')

# Features that are label-encoded later and must NOT be standardized.
categorical_columns = ['Occupation', 'Type_of_Loan', 'Credit_Mix', 'Payment_of_Min_Amount', 'Payment_Behaviour']

# Columns that must never be treated as standardized numeric features:
#   * the categorical columns above,
#   * 'kmeans', which is appended to the feature matrix on its own, unscaled
#     (if it is stored as int/float, select_dtypes would also return it and
#     the cluster id would end up in the features twice),
#   * 'Credit_Score', the prediction target (if it is stored numerically it
#     would otherwise leak into the features).
_excluded_from_numerical = set(categorical_columns) | {'kmeans', 'Credit_Score'}

# int64/float64 columns in their original order, minus the exclusions above.
# Kept as a pandas Index, the same type `.columns` yields.
numerical_columns = pd.Index([
    col
    for col in df.select_dtypes(include=['int64', 'float64']).columns
    if col not in _excluded_from_numerical
])

In [2]:
# NOTE: this cell overwrites columns of `df` in place, so re-run the loading
# cell before re-running this one to start again from the raw labels.

# Label-encode the categorical features only. Every fitted encoder is kept in
# `label_encoders` so integer codes can be mapped back to the original labels.
label_encoders = {}
for col in categorical_columns:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Encode the target variable.
target_encoder = LabelEncoder()
df['Credit_Score'] = target_encoder.fit_transform(df['Credit_Score'])

# Keep the three feature groups apart: only the numerical group is standardized.
X_categorical = df[categorical_columns].values  # categorical codes (not standardized)
X_numerical = df[numerical_columns].values  # numerical features (standardized below)
X_kmeans = df[['kmeans']].values  # clustering result (not standardized)

# Target vector.
y = df['Credit_Score'].values

# Split row indices BEFORE fitting the scaler so that the mean/variance used
# for standardization come from the training rows only (no test-set leakage).
# The stratify / test_size / random_state settings are unchanged.
row_indices = np.arange(len(df))
train_idx, test_idx = train_test_split(
    row_indices, stratify=y, test_size=0.2, random_state=42
)

# Fit on the training rows, then apply the same transform to every row.
scaler = StandardScaler()
scaler.fit(X_numerical[train_idx])
X_numerical_scaled = scaler.transform(X_numerical)

# Column layout: categorical | scaled numerical | kmeans (last column).
X = np.hstack((X_categorical, X_numerical_scaled, X_kmeans))

# Train / test partitions.
X_train, X_test = X[train_idx], X[test_idx]
y_train, y_test = y[train_idx], y[test_idx]

### SMOTENC 적용

In [3]:
# The feature matrix starts with the categorical columns and ends with the
# k-means column, so the categorical positions are the leading block plus the
# very last column.
num_categorical_columns = len(categorical_columns)
kmeans_index = X.shape[1] - 1  # k-means is the last column

# Leading categorical positions followed by the k-means position.
categorical_indices = [*range(num_categorical_columns), kmeans_index]


In [4]:
from collections import Counter

# Class distribution of the training split before oversampling.
class_counts_before = Counter(y_train)
print(class_counts_before)

from imblearn.over_sampling import SMOTENC

# SMOTE-NC oversampling: `categorical_features` lists the column positions
# that hold categorical values. Only the training split is resampled.
smote_nc = SMOTENC(categorical_features=categorical_indices, random_state=42)
resampled_pair = smote_nc.fit_resample(X_train, y_train)
X_train_resampled, y_train_resampled = resampled_pair

# Class distribution after SMOTE-NC.
class_counts_after = Counter(y_train_resampled)
print(class_counts_after)


Counter({2: 42539, 1: 23199, 0: 14262})
Counter({2: 42539, 1: 42539, 0: 42539})


In [6]:
import torch
import torch.nn as nn
import torch.optim as optim

class CreditScoreDataset(Dataset):
    """Map-style dataset pairing a feature matrix with its class labels.

    Attributes:
        X: features converted to a ``torch.float32`` tensor.
        y: labels converted to a ``torch.long`` tensor.
    """

    def __init__(self, X, y):
        """Convert the array-likes ``X`` and ``y`` to tensors once, up front."""
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        """Return the number of samples, i.e. the length of the label tensor."""
        n_samples = len(self.y)
        return n_samples

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at position ``idx``."""
        features = self.X[idx]
        label = self.y[idx]
        return features, label


# Wrap the oversampled training split and the untouched test split.
train_dataset = CreditScoreDataset(X=X_train_resampled, y=y_train_resampled)
test_dataset = CreditScoreDataset(X=X_test, y=y_test)

# Training batches are reshuffled each epoch and the final incomplete batch is
# dropped; evaluation batches keep their order and include every sample.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=512,
    shuffle=True,
    drop_last=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=512,
    shuffle=False,
)

In [7]:
import torch
import torch.nn as nn
import torch.optim as optim

class MLP(nn.Module):
    """Fully connected classifier: six hidden Linear+BatchNorm blocks and a linear head.

    Hidden widths are 256 -> 512 -> 1024 -> 512 -> 256 -> 128. Blocks 2-4
    additionally apply dropout. The output layer returns raw logits (no
    softmax), which is what ``nn.CrossEntropyLoss`` expects.

    Args:
        input_size: number of input features per sample.
        num_classes: number of output logits. Defaults to 3.
        dropout: drop probability used by the three dropout layers.
            Defaults to 0.2.
    """

    def __init__(self, input_size, num_classes=3, dropout=0.2):
        super().__init__()
        # Seven Linear layers in total (six hidden + output). Attribute names
        # and creation order are kept stable so existing state_dicts still load.
        self.layer1 = nn.Linear(input_size, 256)
        self.bn1 = nn.BatchNorm1d(256)

        self.layer2 = nn.Linear(256, 512)
        self.bn2 = nn.BatchNorm1d(512)
        self.dropout1 = nn.Dropout(dropout)

        self.layer3 = nn.Linear(512, 1024)
        self.bn3 = nn.BatchNorm1d(1024)
        self.dropout2 = nn.Dropout(dropout)

        self.layer4 = nn.Linear(1024, 512)
        self.bn4 = nn.BatchNorm1d(512)
        self.dropout3 = nn.Dropout(dropout)

        self.layer5 = nn.Linear(512, 256)
        self.bn5 = nn.BatchNorm1d(256)

        self.layer6 = nn.Linear(256, 128)
        self.bn6 = nn.BatchNorm1d(128)

        self.output_layer = nn.Linear(128, num_classes)  # output layer (logits)

        self.relu = nn.ReLU()

        # He (Kaiming) normal initialization of every Linear weight matrix;
        # biases keep their default initialization.
        for layer in [self.layer1, self.layer2, self.layer3, self.layer4,
                      self.layer5, self.layer6, self.output_layer]:
            nn.init.kaiming_normal_(layer.weight)

    def forward(self, x):
        """Return class logits for a batch ``x`` whose last dimension is ``input_size``."""
        x = self.relu(self.bn1(self.layer1(x)))
        x = self.relu(self.dropout1(self.bn2(self.layer2(x))))
        x = self.relu(self.dropout2(self.bn3(self.layer3(x))))
        x = self.relu(self.dropout3(self.bn4(self.layer4(x))))
        x = self.relu(self.bn5(self.layer5(x)))
        x = self.relu(self.bn6(self.layer6(x)))
        x = self.output_layer(x)
        return x

# Model initialization and training setup.

# Seed torch's global RNG so weight initialization (and anything else drawn
# from that RNG) is repeatable between runs.
SEED = 42
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
input_size = X_train_resampled.shape[1]
model = MLP(input_size).to(device)

criterion = nn.CrossEntropyLoss()

# AdamW optimizer.
optimizer = optim.AdamW(model.parameters(), lr=0.003, weight_decay=0.002)

# Learning-rate scheduler: multiply the learning rate by 0.8 every 15 epochs.
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=15, gamma=0.8)

# Early-stopping bookkeeping. Everything reported after the loop is
# initialized here so the summary cannot hit an undefined name.
best_val_accuracy = 0
best_train_accuracy = 0
best_epoch = 0
best_model_state = None  # copy of the weights from the best epoch so far
early_stop_counter = 0
patience = 60  # stop after this many consecutive epochs without improvement

epochs = 900

# NOTE(review): early stopping / model selection below is driven by accuracy
# on `test_loader`, so the reported "best" accuracy is chosen on the same data
# it is measured on. A separate validation split would give an unbiased
# estimate.

# Training loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0

    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        # Training accuracy is accumulated from the train-mode forward pass.
        _, predicted = torch.max(outputs, 1)
        total_train += targets.size(0)
        correct_train += (predicted == targets).sum().item()

    # Advance the learning-rate schedule once per epoch.
    scheduler.step()

    # Evaluation mode
    model.eval()
    correct_val = 0
    total_val = 0
    with torch.no_grad():
        for inputs, targets in test_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total_val += targets.size(0)
            correct_val += (predicted == targets).sum().item()

    train_accuracy = 100 * correct_train / total_train
    val_accuracy = 100 * correct_val / total_val
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {running_loss/len(train_loader):.4f}, '
          f'학습 정확도: {train_accuracy:.2f}%, 평가 정확도: {val_accuracy:.2f}%')

    # Early-stopping check
    if val_accuracy > best_val_accuracy:
        best_val_accuracy = val_accuracy
        best_train_accuracy = train_accuracy
        best_epoch = epoch + 1  # remember the best-performing epoch
        early_stop_counter = 0  # improvement: reset the patience counter
        # Snapshot the weights; clone so later optimizer steps cannot modify
        # the saved copy.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
    else:
        early_stop_counter += 1  # no improvement: count towards patience

    if early_stop_counter >= patience:
        print("Early stopping triggered!")
        break  # stop training

# Restore the weights of the best epoch so `model` matches the numbers
# reported below instead of whatever the last epoch produced.
if best_model_state is not None:
    model.load_state_dict(best_model_state)

# Summary of the best epoch. The prefix used to read "Early Stopping triggered
# at epoch", but the number printed is the best epoch, and the line is printed
# even when early stopping never fired.
print(f'Best epoch: {best_epoch}, '
      f'Best 평가 정확도: {best_val_accuracy:.2f}%, '
      f'Best 학습 정확도: {best_train_accuracy:.2f}%')


Epoch [1/900], Loss: 0.8616, 학습 정확도: 61.74%, 평가 정확도: 54.53%
Epoch [2/900], Loss: 0.7818, 학습 정확도: 67.72%, 평가 정확도: 59.40%
Epoch [3/900], Loss: 0.7655, 학습 정확도: 68.85%, 평가 정확도: 39.28%
Epoch [4/900], Loss: 0.7603, 학습 정확도: 69.19%, 평가 정확도: 62.96%
Epoch [5/900], Loss: 0.7376, 학습 정확도: 70.67%, 평가 정확도: 65.14%
Epoch [6/900], Loss: 0.7303, 학습 정확도: 70.99%, 평가 정확도: 48.74%
Epoch [7/900], Loss: 0.7123, 학습 정확도: 71.46%, 평가 정확도: 60.40%
Epoch [8/900], Loss: 0.6965, 학습 정확도: 71.52%, 평가 정확도: 57.93%
Epoch [9/900], Loss: 0.6834, 학습 정확도: 72.03%, 평가 정확도: 67.08%
Epoch [10/900], Loss: 0.6790, 학습 정확도: 72.21%, 평가 정확도: 64.78%
Epoch [11/900], Loss: 0.6756, 학습 정확도: 72.26%, 평가 정확도: 62.90%
Epoch [12/900], Loss: 0.6720, 학습 정확도: 72.45%, 평가 정확도: 64.53%
Epoch [13/900], Loss: 0.6695, 학습 정확도: 72.44%, 평가 정확도: 50.98%
Epoch [14/900], Loss: 0.6661, 학습 정확도: 72.54%, 평가 정확도: 60.91%
Epoch [15/900], Loss: 0.6645, 학습 정확도: 72.63%, 평가 정확도: 57.41%
Epoch [16/900], Loss: 0.6610, 학습 정확도: 72.72%, 평가 정확도: 49.83%
Epoch [17/900], Loss: 0.6593, 학습 