In [1]:
import pandas as pd
import numpy as np
import torch
from torch import nn
from sklearn.model_selection import train_test_split

## Explore data

In [2]:
# Read the loan dataset from the CSV in the working directory and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="loan_data.csv")
df.head(n=5)

Unnamed: 0,person_age,person_gender,person_education,person_income,person_emp_exp,person_home_ownership,loan_amnt,loan_intent,loan_int_rate,loan_percent_income,cb_person_cred_hist_length,credit_score,previous_loan_defaults_on_file,loan_status
0,22.0,female,Master,71948.0,0,RENT,35000.0,PERSONAL,16.02,0.49,3.0,561,No,1
1,21.0,female,High School,12282.0,0,OWN,1000.0,EDUCATION,11.14,0.08,2.0,504,Yes,0
2,25.0,female,High School,12438.0,3,MORTGAGE,5500.0,MEDICAL,12.87,0.44,3.0,635,No,1
3,23.0,female,Bachelor,79753.0,0,RENT,35000.0,MEDICAL,15.23,0.44,2.0,675,No,1
4,24.0,male,Master,66135.0,1,RENT,35000.0,MEDICAL,14.27,0.53,4.0,586,No,1


In [3]:
# Restrict the frame to the label (loan_status) and the four raw columns
# that are used as model inputs further down.
columns = [
    "loan_status",
    "person_income",
    "loan_intent",
    "loan_percent_income",
    "credit_score",
]
df = df.loc[:, columns]
df.head(n=5)

Unnamed: 0,loan_status,person_income,loan_intent,loan_percent_income,credit_score
0,1,71948.0,PERSONAL,0.49,561
1,0,12282.0,EDUCATION,0.08,504
2,1,12438.0,MEDICAL,0.44,635
3,1,79753.0,MEDICAL,0.44,675
4,1,66135.0,MEDICAL,0.53,586


In [4]:
# Mean / min / max of each numeric feature, computed separately for every
# loan_status value, to see how the classes differ.
(
    df.groupby("loan_status")[["person_income", "credit_score", "loan_percent_income"]]
    .agg(["mean", "min", "max"])
)

Unnamed: 0_level_0,person_income,person_income,person_income,credit_score,credit_score,credit_score,loan_percent_income,loan_percent_income,loan_percent_income
Unnamed: 0_level_1,mean,min,max,mean,min,max,mean,min,max
loan_status,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2
0,86157.040743,9595.0,7200766.0,632.814914,390,850,0.121783,0.0,0.66
1,59886.0969,8000.0,845636.0,631.8872,431,767,0.202521,0.0,0.62


In [5]:
# Average loan_status per loan purpose; with a 0/1 label this is the share
# of rows labelled 1 within each loan_intent category.
df.groupby("loan_intent")[["loan_status"]].agg(["mean"])

Unnamed: 0_level_0,loan_status
Unnamed: 0_level_1,mean
loan_intent,Unnamed: 1_level_2
DEBTCONSOLIDATION,0.302729
EDUCATION,0.169562
HOMEIMPROVEMENT,0.263015
MEDICAL,0.278194
PERSONAL,0.201404
VENTURE,0.144264


## Preprocess data

In [6]:
# One-hot encode the categorical loan_intent column: it is replaced by one
# indicator column per category (loan_intent_<CATEGORY>).
df = pd.get_dummies(data=df, columns=["loan_intent"])
df.head(n=5)

Unnamed: 0,loan_status,person_income,loan_percent_income,credit_score,loan_intent_DEBTCONSOLIDATION,loan_intent_EDUCATION,loan_intent_HOMEIMPROVEMENT,loan_intent_MEDICAL,loan_intent_PERSONAL,loan_intent_VENTURE
0,1,71948.0,0.49,561,False,False,False,False,True,False
1,0,12282.0,0.08,504,False,True,False,False,False,False
2,1,12438.0,0.44,635,False,False,False,True,False,False
3,1,79753.0,0.44,675,False,False,False,True,False,False
4,1,66135.0,0.53,586,False,False,False,True,False,False


In [7]:
# Separate the inputs from the label and cast both to float32, the dtype
# used for the tensors built from them later.
X_data = df.drop("loan_status", axis=1).astype("float32")
y_data = df.loc[:, "loan_status"].astype("float32")

In [8]:
# Hold out 20% of the rows as a test set; the fixed random_state keeps the
# split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    test_size=0.2,
    random_state=42,
)

In [9]:
# Copy the feature frames into float32 tensors for PyTorch.
X_train_tensor = torch.tensor(X_train.to_numpy(), dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.to_numpy(), dtype=torch.float32)

In [10]:
# Standardize every feature column using statistics of the training split.
X_train_mean = X_train_tensor.mean(dim=0)
X_train_std = X_train_tensor.std(dim=0)
# A column with no spread in the training split (e.g. a one-hot category
# that never occurs there) has std == 0 and would turn the division below
# into NaN/inf. Fall back to a scale of 1.0 for such columns so they are
# only centered. NaN std values also fail the comparison and get 1.0.
X_train_std = torch.where(
    X_train_std > 1e-8, X_train_std, torch.ones_like(X_train_std)
)
X_train_normalized = (X_train_tensor - X_train_mean) / X_train_std

In [11]:
# Scale the test features with the mean/std of the training split, so no
# statistic is computed from the test data.
X_test_normalized = torch.div(X_test_tensor - X_train_mean, X_train_std)

In [12]:
# Labels as float32 column vectors of shape (N, 1), matching the single
# output unit of the model's last layer.
y_train_tensor = torch.tensor(y_train.to_numpy(), dtype=torch.float32).unsqueeze(1)
y_test_tensor = torch.tensor(y_test.to_numpy(), dtype=torch.float32).unsqueeze(1)

## Develop a model

In [13]:
# Seed PyTorch's RNG right before the layers are created so the initial
# weights (and therefore the training run) are the same after every
# Restart & Run All.
torch.manual_seed(42)

# Small fully connected binary classifier: 9 input features -> 32 -> 16 -> 1.
# The last layer emits a raw logit; the sigmoid is applied inside the loss
# (BCEWithLogitsLoss) during training and explicitly at prediction time.
model = nn.Sequential(
    nn.Linear(9, 32),
    nn.ReLU(),
    nn.Linear(32, 16),
    nn.ReLU(),
    nn.Linear(16, 1),
)

loss_fn = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

## Train and evaluate the model

In [14]:
# Mini-batch training over the normalized training tensors, followed by an
# accuracy check on the held-out test tensors.
num_entries = X_train.shape[0]  # number of training rows
batch_size = 32
num_epochs = 100
log_every = 10  # print the summed epoch loss once every `log_every` epochs

model.train()  # ensure training mode, e.g. when this cell is re-run after model.eval()
for epoch in range(num_epochs):
    loss_sum = 0
    for start in range(0, num_entries, batch_size):
        end = min(num_entries, start + batch_size)
        # Batch-local names on purpose: rebinding X_data / y_data here would
        # overwrite the full feature frame and label series that the
        # train/test split cell reads, breaking any re-run of that cell.
        X_batch = X_train_normalized[start:end]
        y_batch = y_train_tensor[start:end]

        optimizer.zero_grad()  # reset the gradients from the previous step
        outputs = model(X_batch)
        loss = loss_fn(outputs, y_batch)
        loss.backward()  # backpropagation
        loss_sum += loss.item()  # accumulate the per-batch loss for this epoch
        optimizer.step()  # update the weights

    if epoch % log_every == 0:
        # Sum (not mean) of the per-batch losses over the epoch.
        print(loss_sum)

model.eval()  # evaluation mode
with torch.no_grad():
    outputs = model(X_test_normalized)
    y_pred = torch.sigmoid(outputs) > 0.5  # binary prediction (0 or 1)
    accuracy = (y_pred == y_test_tensor).type(torch.float32).mean()
    print(accuracy)

611.0402579903603
478.54807211458683
469.6584235727787
464.48901541531086
461.842178016901
459.7415138185024
458.12276643514633
456.8161113113165
455.61233788728714
454.56736290454865
tensor(0.8348)
