# 고객의 채무 불이행 여부 분류
---
#### 데이터 양
- train : 100,000 개
- test : 35,815 개
---
#### input과 output
- input : 고객 재무 상태에 대한 75개 feature
- output : 채무 불이행 여부
    - 0 = 이행
    - 1 = 불이행 / 부도
---
#### features
- **int_rate** : 대출자에 부여된 이자율 (Interest rate of the loan the applicant received)
- **annual_inc** : 연 소득 (annual income)
- **dti** : 소득 대비 부채 비율 (Debt-to-income ratio)
- **delinq_2yrs** : 지난 2년 간 체납 발생 횟수 (Delinquencies on lines of credit in the last 2 years)
- **inq_last_6mths** : 지난 6개월 간 신용 조회 수 (Inquiries into the applicant's credit during the last 6 months)
- **pub_rec** : 파산 횟수 (Number of bankruptcies listed in the public record)
- **revol_bal** : 리볼빙 잔액 (Total credit revolving balance)
- **total_acc** : 지금까지 소유했던 신용카드 개수 (num_total_cc_accounts : Total number of credit card accounts in the applicant's history)
- **collections_12_mths_ex_med** : 의료부문을 제외한 지난 12개월 간 추심 발생 횟수 (num_collections_last_12m : Number of collections in the last 12 months. This excludes medical collections)
- **acc_now_delinq** : 대출자가 현재 체납 상태에 있는 계좌의 수 (The number of accounts on which the borrower is now delinquent)
- **tot_coll_amt** : 대출자에 대한 현재까지의 총 추심액 (total_collection_amount_ever : The total amount that the applicant has had against them in collections)
- **tot_cur_bal** : 전 계좌의 현재 통합 잔고 (Total current balance of all accounts)
- **chargeoff_within_12_mths** : 대출 부 신청인의 대출 신청 직전 12개월 간 대손상각 횟수 (Number of charge-offs within last 12 months at time of application for the secondary applicant)
- **delinq_amnt** : 체납 금액 (delinquency amount)
- **tax_liens** : 세금 저당권의 수 (Number of tax liens)
- **emp_length1** ~ 12 : 고용 연수 (Number of years in the job)
- **home_ownership1** ~ 6 : 대출 신청자의 주거 소유 형태 (The ownership status of the applicant's residence)
- **verification_status1** ~ 3 : 공동 소득 발생 여부 및 형태 (verification_income_joint : Type of verification of the joint income)
- **purpose1** ~ 14 : 대출 목적 (The purpose of the loan)
- **initial_list_status1** ~ 2 : 최초 대출 상태 (Initial listing status of the loan)
- **mths_since_last_delinq1** ~ 11 : 마지막 체납이 지금으로부터 몇개월 전에 있었는지를 나타내는 변수 (Months since the last delinquency)
- **funded_amnt** : 대출액 (Funded amount)
- **funded_amnt_inv** : 투자자가 조달한 대출액 (Funded amount by investors)
- **total_rec_late_fee** : 총 연체료 중 납부액 (Late fees received to date)
- **term1** : 상환 기간 (The number of payments on the loan. Values are in months and can be either 36 or 60)
- **open_acc** : 개설 계좌 수 (The number of open credit lines in the borrower's credit file)
- **installment** : 대출 발생 시 월 상환액 (The monthly payment owed by the borrower if the loan originates)
- **revol_util** : 리볼빙 한도 대비 리볼빙 사용 비율 (Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit)
- **out_prncp** : 대출액 중 원리금 잔액 (Remaining outstanding principal for total amount funded)
- **out_prncp_inv** : 투자자가 조달한 대출액 중 원리금 잔액 (Remaining outstanding principal for total amount funded by investors)
- **total_rec_int** : 이자 상환액 (Interest received to date)
- **fico_range_low** : FICO(일종의 신용점수) 최저값 (The lower boundary range the borrowerʼs FICO at loan origination belongs to)
- **fico_range_high** : FICO(일종의 신용점수) 최고값 (The upper boundary range the borrowerʼs FICO at loan origination belongs to)
- **depvar** : 고객의 부도 여부 (dependent variable)

---
# 필요 데이터 로드
---

In [2]:
# Libraries for data handling
import numpy as np
import pandas as pd

# Libraries for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Libraries for machine learning
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score

import sklearn.metrics as metrics
from xgboost import XGBClassifier

In [3]:
# Read the training split (it carries the `depvar` label) and the test split from disk.
train_csv_path, test_csv_path = "../data/train.csv", "../data/test.csv"
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [4]:
# Sanity check: print the (rows, columns) size of the training frame.
print(train.shape)
# Further inspection helpers, left disabled:
# print(train.columns.values)
# train.describe().T

(100000, 76)


# processing

In [5]:
# Prefixes of the one-hot groups that are collapsed back into one integer column.
all_to_one = ['emp_length', 'mths_since_last_delinq']


def merge_one_hot_columns(df, prefix):
    """Collapse the one-hot columns ``<prefix><number>`` into one integer Series.

    Only columns whose name is ``prefix`` followed purely by decimal digits are
    considered. Columns that merely share the prefix (or an already merged
    column named exactly ``prefix``) are ignored, so they can neither break the
    integer parse nor win the ``idxmax`` comparison.

    Args:
        df: DataFrame holding the one-hot encoded columns.
        prefix: Common name prefix of the one-hot group, e.g. ``'emp_length'``.

    Returns:
        Series aligned with ``df.index`` that holds, per row, the numeric suffix
        of the indicator column with the largest value. When several columns
        tie for the maximum, the first one in ``df`` column order is used
        (``idxmax`` semantics).

    Raises:
        ValueError: If ``df`` has no ``<prefix><number>`` column.
    """
    # Select the indicator columns that belong to this group.
    one_hot_columns = [
        col for col in df.columns
        if col.startswith(prefix) and col[len(prefix):].isdecimal()
    ]
    if not one_hot_columns:
        raise ValueError(f"no one-hot columns found for prefix {prefix!r}")

    # Name of the active indicator column for every row.
    active_column = df[one_hot_columns].idxmax(axis=1)

    # Strip the prefix; the remaining digits are the category code.
    return active_column.apply(lambda name: int(name[len(prefix):]))

def _collapse_one_hot_groups(df, prefixes):
    """Replace every ``<prefix><number>`` one-hot group in ``df`` by one merged column.

    Args:
        df: Frame to preprocess; the merged columns are appended to it.
        prefixes: Iterable of one-hot group prefixes to collapse.

    Returns:
        A frame in which each group's indicator columns are dropped and a
        column named after the prefix holds the merged integer code.
    """
    for prefix in prefixes:
        # Resolve the indicator columns by name *before* the merged column is
        # added, so the drop does not depend on where that column ends up.
        one_hot_columns = [
            col for col in df.columns
            if col.startswith(prefix) and col[len(prefix):].isdecimal()
        ]
        if not one_hot_columns:
            # Group already collapsed (e.g. the cell was re-run): nothing to do.
            continue
        df[prefix] = merge_one_hot_columns(df, prefix)
        df = df.drop(columns=one_hot_columns)
    return df


# Run the same preprocessing on both splits so their feature columns match.
train = _collapse_one_hot_groups(train, all_to_one)
# Decode the 0/1 term flag into the loan term in months.
train['term1'] = train['term1'].replace({0: 36, 1: 60})

test = _collapse_one_hot_groups(test, all_to_one)
test['term1'] = test['term1'].replace({0: 36, 1: 60})

In [6]:
# Show the test columns that remain after the one-hot groups were collapsed.
print(test.columns)

Index(['ID', 'int_rate', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
       'pub_rec', 'revol_bal', 'total_acc', 'collections_12_mths_ex_med',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
       'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens',
       'home_ownership1', 'home_ownership2', 'home_ownership3',
       'home_ownership4', 'home_ownership5', 'home_ownership6',
       'verification_status1', 'verification_status2', 'verification_status3',
       'purpose1', 'purpose2', 'purpose3', 'purpose4', 'purpose5', 'purpose6',
       'purpose7', 'purpose8', 'purpose9', 'purpose10', 'purpose11',
       'purpose12', 'purpose13', 'purpose14', 'initial_list_status1',
       'initial_list_status2', 'funded_amnt', 'funded_amnt_inv',
       'total_rec_late_fee', 'term1', 'open_acc', 'installment', 'revol_util',
       'out_prncp', 'out_prncp_inv', 'total_rec_int', 'fico_range_low',
       'fico_range_high', 'emp_length', 'mths_since_last_delinq'],
      dtype='object')

---

# modeling

---
## train

In [7]:
# Model scoring
def get_clf_eval(y_answer, y_pred):
    """Print a classification report for one set of predictions and return macro F1.

    Args:
        y_answer: Ground-truth labels.
        y_pred: Predicted labels; every metric, including AUC, is computed
            from these values exactly as passed in.

    Returns:
        The macro-averaged F1 score.
    """
    confusion = metrics.confusion_matrix(y_answer, y_pred)
    macro_f1 = metrics.f1_score(y_answer, y_pred, average="macro")

    # (template, value) pairs, printed in this order between the separators.
    report_rows = (
        ("정확도 : {:.6f}", metrics.accuracy_score(y_answer, y_pred)),
        ("정밀도 : {:.6f}", metrics.precision_score(y_answer, y_pred)),
        ("재현율 : {:.6f}", metrics.recall_score(y_answer, y_pred)),
        ("AUC : {:.6f}", metrics.roc_auc_score(y_answer, y_pred)),
    )

    separator = "========================"
    print(separator)
    for template, value in report_rows:
        print(template.format(value))

    print(" ** F1 : {:.6f} **".format(macro_f1))

    print("====confusion_matrix====\n{}".format(confusion))
    print(separator)

    return macro_f1

In [8]:
# Split the training frame into features and the default label.
X = train.drop('depvar', axis=1)
y = train['depvar']

# XGBoost options shared by every fold model.
xgb_params = {'use_label_encoder': False, 'eval_metric': 'logloss'}

# Stratified K-Fold setup.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Score and fitted model of every fold, in fold order.
fold_scores = []
models = []

# Run the K-Fold cross-validation.
for train_index, val_index in skf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Fit a fresh estimator per fold. Re-fitting one shared instance would
    # leave every entry of `models` pointing at the same, last-fitted object,
    # so the "best" model would always be the final fold's model.
    model = XGBClassifier(**xgb_params)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    score = get_clf_eval(y_val, y_pred)
    fold_scores.append(score)
    models.append(model)  # keep the model trained on this fold

# Pick the model of the fold with the highest score.
best_fold_index = fold_scores.index(max(fold_scores))
best_model = models[best_fold_index]

print(f"Best fold: {best_fold_index}")
print(f"Best fold score: {fold_scores[best_fold_index]}")
# get_clf_eval returns the macro F1, so the mean is a mean F1, not an accuracy.
print(f"Mean F1: {sum(fold_scores) / len(fold_scores)}")

# Select the test features by training column name rather than by position,
# so column order or extra columns (such as the ID) cannot misalign the input.
X_test = test[X.columns]
test_pred = best_model.predict(X_test)


정확도 : 0.750450
정밀도 : 0.657818
재현율 : 0.487026
AUC : 0.682343
 ** F1 : 0.692784 **
====confusion_matrix====
[[11837  1650]
 [ 3341  3172]]
정확도 : 0.749850
정밀도 : 0.655934
재현율 : 0.487872
AUC : 0.682131
 ** F1 : 0.692438 **
====confusion_matrix====
[[11819  1667]
 [ 3336  3178]]
정확도 : 0.754400
정밀도 : 0.662343
재현율 : 0.501689
AUC : 0.689077
 ** F1 : 0.699446 **
====confusion_matrix====
[[11820  1666]
 [ 3246  3268]]
정확도 : 0.751250
정밀도 : 0.654301
재현율 : 0.500921
AUC : 0.686542
 ** F1 : 0.696431 **
====confusion_matrix====
[[11762  1724]
 [ 3251  3263]]
정확도 : 0.756000
정밀도 : 0.667900
재현율 : 0.498925
AUC : 0.689549
 ** F1 : 0.700334 **
====confusion_matrix====
[[11870  1616]
 [ 3264  3250]]
Best fold: 4
Best fold score: 0.7003336858096947
Mean accuracy: 0.6962865466457048


In [9]:
# Attach the test predictions as the `answer` column of the sample submission
# and write the result out without the index column.
submission = pd.read_csv('../data/sample_submission.csv').assign(answer=test_pred)
submission.to_csv('submission.csv', index=False)