# 대출 승인 여부 이진 분류 모델링
- Tab Transformer 모델 사용

## 패키지 Import

In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from imblearn.over_sampling import SMOTE, SMOTENC
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, precision_score, recall_score

## Data Load

In [219]:
df = pd.read_csv('data/preprocessed_data.csv')
df.head()

Unnamed: 0,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,Male,Yes,1,Graduate,No,4583,1508,128,360,1,Rural,N
1,Male,Yes,0,Graduate,Yes,3000,0,66,360,1,Urban,Y
2,Male,Yes,0,Not Graduate,No,2583,2358,120,360,1,Urban,Y
3,Male,No,0,Graduate,No,6000,0,141,360,1,Urban,Y
4,Male,Yes,2,Graduate,Yes,5417,4196,267,360,1,Urban,Y


## Y 값 확인

In [222]:
df['Loan_Status'].value_counts()

Loan_Status
Y    411
N    181
Name: count, dtype: int64

- 클래스 간 불균형 존재
- Resampling 필요

## 모델링

### 데이터 처리

In [227]:
# 라벨 인코딩

for column in df.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column].astype(str))

In [229]:
X = df.drop('Loan_Status', axis=1).values
y = df['Loan_Status'].values

In [231]:
# 데이터 셋 분리
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [233]:
# 클래스 불균형 해결 -> SMOTE 적용
# Upsampling
smote = SMOTE()#random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

In [235]:
pd.Series(y_train_resampled).value_counts()

0    328
1    328
Name: count, dtype: int64

- 클래스 불균형 문제 해결

In [238]:
# 데이터 스케일링
scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [240]:
# 데이터 타입 Tensor로 변경
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

### 모델 정의
- Tab Transformer 사용
- https://github.com/lucidrains/tab-transformer-pytorch 

In [46]:
!pip install tab-transformer-pytorch

Collecting tab-transformer-pytorch
  Downloading tab_transformer_pytorch-0.3.0-py3-none-any.whl.metadata (690 bytes)
Collecting einops>=0.3 (from tab-transformer-pytorch)
  Downloading einops-0.8.0-py3-none-any.whl.metadata (12 kB)
Downloading tab_transformer_pytorch-0.3.0-py3-none-any.whl (6.9 kB)
Downloading einops-0.8.0-py3-none-any.whl (43 kB)
   ---------------------------------------- 0.0/43.2 kB ? eta -:--:--
   ---------------------------------------- 43.2/43.2 kB 2.1 MB/s eta 0:00:00
Installing collected packages: einops, tab-transformer-pytorch
Successfully installed einops-0.8.0 tab-transformer-pytorch-0.3.0


In [50]:
from tab_transformer_pytorch import TabTransformer

In [284]:
model = TabTransformer(
    categories=[],
    num_continuous=X_train_tensor.shape[1],  # 연속형 변수의 개수
    dim=64,  # 모델 차원
    dim_out=1,  # 이진 분류 출력
    depth=10,  # 모델 깊이
    heads=16,  # 멀티헤드 어텐션 헤드 수
    attn_dropout=0.1,  # 어텐션 드롭아웃
    ff_dropout=0.1,  # 피드포워드 드롭아웃
    mlp_hidden_mults=(4, 8, 16, 4, 2),  # MLP의 히든 레이어 크기 비율
    mlp_act=nn.ReLU(),  # MLP의 활성화 함수
)

In [286]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [288]:
num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # 예측 및 손실 계산
    # model(범주형, 수치형) 각 type별로 따로 넣어야함
    y_pred = model(torch.empty((X_train_tensor.shape[0], 0), dtype=torch.int64), X_train_tensor)  # 범주형 변수가 없으므로 비어있는 텐서를 줘야함.
    loss = criterion(y_pred, y_train_tensor)

    # 역전파 및 최적화
    loss.backward()
    optimizer.step()

    if (epoch+1) % 100 == 0 :
      print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')

# 모델 평가
model.eval()
with torch.no_grad():
    y_pred = model(torch.empty((X_train_tensor.shape[0], 0), dtype=torch.int64), X_test_tensor)
    y_pred = torch.sigmoid(y_pred)
    y_pred_class = (y_pred > 0.5).float()

    accuracy = (y_pred_class == y_test_tensor).float().mean()
    print(f'Valid Accuracy: {accuracy:.4f}')

Epoch 100/500, Loss: 0.6504
Epoch 200/500, Loss: 0.5873
Epoch 300/500, Loss: 0.5176
Epoch 400/500, Loss: 0.4513
Epoch 500/500, Loss: 0.3703
Valid Accuracy: 0.7731


In [290]:
model = TabTransformer(
    categories=[],
    num_continuous=X_train_tensor.shape[1],  # 연속형 변수의 개수
    dim=64,  # 모델 차원
    dim_out=1,  # 이진 분류 출력
    depth=10,  # 모델 깊이
    heads=20,  # 멀티헤드 어텐션 헤드 수
    attn_dropout=0.1,  # 어텐션 드롭아웃
    ff_dropout=0.1,  # 피드포워드 드롭아웃
    mlp_hidden_mults=(4, 8, 16, 2),  # MLP의 히든 레이어 크기 비율
    mlp_act=nn.ReLU(),  # MLP의 활성화 함수
)

In [292]:
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [294]:
num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # 예측 및 손실 계산
    # model(범주형, 수치형) 각 type별로 따로 넣어야함
    y_pred = model(torch.empty((X_train_tensor.shape[0], 0), dtype=torch.int64), X_train_tensor)  # 범주형 변수가 없으므로 비어있는 텐서를 줘야함.
    loss = criterion(y_pred, y_train_tensor)

    # 역전파 및 최적화
    loss.backward()
    optimizer.step()

    if (epoch+1) % 100 == 0 :
      print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')

# 모델 평가
model.eval()
with torch.no_grad():
    y_pred = model(torch.empty((X_train_tensor.shape[0], 0), dtype=torch.int64), X_test_tensor)
    y_pred = torch.sigmoid(y_pred)
    y_pred_class = (y_pred > 0.5).float()

    accuracy = (y_pred_class == y_test_tensor).float().mean()
    print(f'Valid Accuracy: {accuracy:.4f}')

Epoch 100/500, Loss: 0.6116
Epoch 200/500, Loss: 0.5636
Epoch 300/500, Loss: 0.4913
Epoch 400/500, Loss: 0.4164
Epoch 500/500, Loss: 0.3509
Valid Accuracy: 0.7815


In [310]:
model = TabTransformer(
    categories=[],
    num_continuous=X_train_tensor.shape[1],  # 연속형 변수의 개수
    dim=64,  # 모델 차원
    dim_out=1,  # 이진 분류 출력
    depth=16,  # 모델 깊이
    heads=32,  # 멀티헤드 어텐션 헤드 수
    attn_dropout=0.1,  # 어텐션 드롭아웃
    ff_dropout=0.1,  # 피드포워드 드롭아웃
    mlp_hidden_mults=(4, 8, 16, 2),  # MLP의 히든 레이어 크기 비율
    mlp_act=nn.ReLU(),  # MLP의 활성화 함수
)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001)

num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # 예측 및 손실 계산
    # model(범주형, 수치형) 각 type별로 따로 넣어야함
    y_pred = model(torch.empty((X_train_tensor.shape[0], 0), dtype=torch.int64), X_train_tensor)  # 범주형 변수가 없으므로 비어있는 텐서를 줘야함.
    loss = criterion(y_pred, y_train_tensor)

    # 역전파 및 최적화
    loss.backward()
    optimizer.step()

    if (epoch+1) % 10 == 0 :
      print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')

# 모델 평가
model.eval()
with torch.no_grad():
    y_pred = model(torch.empty((X_train_tensor.shape[0], 0), dtype=torch.int64), X_test_tensor)
    y_pred = torch.sigmoid(y_pred)
    y_pred_class = (y_pred > 0.5).float()

    accuracy = (y_pred_class == y_test_tensor).float().mean()
    print(f'Valid Accuracy: {accuracy:.4f}')

Epoch 10/500, Loss: 0.6798
Epoch 20/500, Loss: 0.6756
Epoch 30/500, Loss: 0.6704
Epoch 40/500, Loss: 0.6633
Epoch 50/500, Loss: 0.6544
Epoch 60/500, Loss: 0.6438
Epoch 70/500, Loss: 0.6325
Epoch 80/500, Loss: 0.6212
Epoch 90/500, Loss: 0.6115
Epoch 100/500, Loss: 0.6043
Epoch 110/500, Loss: 0.5995
Epoch 120/500, Loss: 0.5956
Epoch 130/500, Loss: 0.5919
Epoch 140/500, Loss: 0.5880
Epoch 150/500, Loss: 0.5840
Epoch 160/500, Loss: 0.5798
Epoch 170/500, Loss: 0.5755
Epoch 180/500, Loss: 0.5710
Epoch 190/500, Loss: 0.5662
Epoch 200/500, Loss: 0.5612
Epoch 210/500, Loss: 0.5559
Epoch 220/500, Loss: 0.5504
Epoch 230/500, Loss: 0.5447
Epoch 240/500, Loss: 0.5389
Epoch 250/500, Loss: 0.5332
Epoch 260/500, Loss: 0.5277
Epoch 270/500, Loss: 0.5224
Epoch 280/500, Loss: 0.5173
Epoch 290/500, Loss: 0.5122
Epoch 300/500, Loss: 0.5071
Epoch 310/500, Loss: 0.5018
Epoch 320/500, Loss: 0.4964
Epoch 330/500, Loss: 0.4909
Epoch 340/500, Loss: 0.4852
Epoch 350/500, Loss: 0.4792
Epoch 360/500, Loss: 0.4729
E

In [338]:
model = TabTransformer(
    categories=[],
    num_continuous=X_train_tensor.shape[1],  # 연속형 변수의 개수
    dim=64,  # 모델 차원
    dim_out=1,  # 이진 분류 출력
    depth=16,  # 모델 깊이
    heads=64,  # 멀티헤드 어텐션 헤드 수
    attn_dropout=0.1,  # 어텐션 드롭아웃
    ff_dropout=0.1,  # 피드포워드 드롭아웃
    mlp_hidden_mults=(8, 4,2),  # MLP의 히든 레이어 크기 비율
    mlp_act=nn.ReLU(),  # MLP의 활성화 함수
)

criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0001, weight_decay = 0.001)

num_epochs = 500
for epoch in range(num_epochs):
    model.train()
    optimizer.zero_grad()

    # 예측 및 손실 계산
    # model(범주형, 수치형) 각 type별로 따로 넣어야함
    y_pred = model(torch.empty((X_train_tensor.shape[0], 0), dtype=torch.int64), X_train_tensor)  # 범주형 변수가 없으므로 비어있는 텐서를 줘야함.
    loss = criterion(y_pred, y_train_tensor)

    # 역전파 및 최적화
    loss.backward()
    optimizer.step()

    if (epoch+1) % 100 == 0 :
      print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item():.4f}')

# 모델 평가
model.eval()
with torch.no_grad():
    y_pred = model(torch.empty((X_train_tensor.shape[0], 0), dtype=torch.int64), X_test_tensor)
    y_pred = torch.sigmoid(y_pred)
    y_pred_class = (y_pred > 0.5).float()

    accuracy = (y_pred_class == y_test_tensor).float().mean()
    print(f'Valid Accuracy: {accuracy:.4f}')

Epoch 100/500, Loss: 0.6203
Epoch 200/500, Loss: 0.5740
Epoch 300/500, Loss: 0.5309
Epoch 400/500, Loss: 0.4914
Epoch 500/500, Loss: 0.4610
Valid Accuracy: 0.8067
