# 고객의 채무 불이행 여부 분류
---
#### 데이터 양
- train : 100,000 개
- test : 35,816 개
---
#### input과 output
- input : 고객 재무 상태에 대한 75개 feature
- output : 채무 불이행 여부
    - 0 = 이행
    - 1 = 불이행 / 부도
---
#### features
- **int_rate** : 대출자에 부여된 이자율 (Interest rate of the loan the applicant received)
- **annual_inc** : 연 소득 (annual income)
- **dti** : 소득 대비 부채 비율 (Debt-to-income ratio)
- **delinq_2yrs** : 지난 2년 간 체납 발생 횟수 (Delinquencies on lines of credit in the last 2 years)
- **inq_last_6mths** : 지난 6개월 간 신용 조회 수 (Inquiries into the applicant's credit during the last 6 months)
- **pub_rec** : 파산 횟수 (Number of bankruptcies listed in the public record)
- **revol_bal** : 리볼빙 잔액 (Total credit revolving balance)
- **total_acc** : 지금까지 소유했던 신용카드 개수 (num_total_cc_accounts : Total number of credit card accounts in the applicant's history)
- **collections_12_mths_ex_med** : 의료부문을 제외한 지난 12개월 간 추심 발생 횟수 (num_collections_last_12m : Number of collections in the last 12 months. This excludes medical collections)
- **acc_now_delinq** : 대출자가 현재 체납 상태에 있는 계좌의 수 (The number of accounts on which the borrower is now delinquent)
- **tot_coll_amt** : 대출자에 대한 현재까지의 총 추심액 (total_collection_amount_ever : The total amount that the applicant has had against them in collections)
- **tot_cur_bal** : 전 계좌의 현재 통합 잔고 (Total current balance of all accounts)
- **chargeoff_within_12_mths** : 대출 부 신청인의 대출 신청 직전 12개월 간 대손상각(charge-off) 횟수 (Number of charge-offs within last 12 months at time of application for the secondary applicant)
- **delinq_amnt** : 체납 금액 (delinquency amount)
- **tax_liens** : 세금 저당권의 수 (Number of tax liens)
- **emp_length1** ~ 12 : 고용 연수 (Number of years in the job)
- **home_ownership1** ~ 6 : 대출 신청자의 주거 소유 형태 (The ownership status of the applicant's residence)
- **verification_status1** ~ 3 : 공동 소득 발생 여부 및 형태 (verification_income_joint : Type of verification of the joint income)
- **purpose1** ~ 14 : 대출 목적 (The purpose of the loan)
- **initial_list_status1** ~ 2 : 최초 대출 상태 (Initial listing status of the loan)
- **mths_since_last_delinq1** ~ 11 : 마지막 체납이 지금으로부터 몇개월 전에 있었는지를 나타내는 변수 (Months since the last delinquency)
- **funded_amnt** : 대출액 (Funded amount)
- **funded_amnt_inv** : 사채 대출액 (Funded amount by investors)
- **total_rec_late_fee** : 총 연체료 중 납부액 (Late fees received to date)
- **term1** : 상환 기간 (The number of payments on the loan. Values are in months and can be either 36 or 60)
- **open_acc** : 개설 계좌 수 (The number of open credit lines in the borrower's credit file)
- **installment** : 대출 발생 시 월 상환액 (The monthly payment owed by the borrower if the loan originates)
- **revol_util** : 리볼빙 한도 대비 리볼빙 사용 비율 (Revolving line utilization rate, or the amount of credit the borrower is using relative to all available revolving credit)
- **out_prncp** : 대출액 중 원금 잔액 (Remaining outstanding principal for total amount funded)
- **out_prncp_inv** : 사채 대출액 중 원금 잔액 (Remaining outstanding principal for total amount funded by investors)
- **total_rec_int** : 이자 상환액 (Interest received to date)
- **fico_range_low** : FICO(일종의 신용점수) 최저값 (The lower boundary range the borrowerʼs FICO at loan origination belongs to)
- **fico_range_high** : FICO(일종의 신용점수) 최고값 (The upper boundary range the borrowerʼs FICO at loan origination belongs to)
- **depvar** : 고객의 부도 여부 (dependent variable)

---
# 필요 데이터 로드
---

In [1]:
# Libraries for data handling
import numpy as np
import pandas as pd

# Libraries for plotting
import matplotlib.pyplot as plt
import seaborn as sns

# Libraries for machine learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import sklearn.metrics as metrics
from sklearn.linear_model import LogisticRegression

In [2]:
# Read both CSVs from the local data directory; `train` includes the `depvar`
# label column that later cells read, `test` is the set to be scored.
train, test = pd.read_csv("data/train.csv"), pd.read_csv("data/test.csv")

In [3]:
# Display every column name except the last one (the columns used as features below).
train.columns[:-1]

Index(['int_rate', 'annual_inc', 'dti', 'delinq_2yrs', 'inq_last_6mths',
       'pub_rec', 'revol_bal', 'total_acc', 'collections_12_mths_ex_med',
       'acc_now_delinq', 'tot_coll_amt', 'tot_cur_bal',
       'chargeoff_within_12_mths', 'delinq_amnt', 'tax_liens', 'emp_length1',
       'emp_length2', 'emp_length3', 'emp_length4', 'emp_length5',
       'emp_length6', 'emp_length7', 'emp_length8', 'emp_length9',
       'emp_length10', 'emp_length11', 'emp_length12', 'home_ownership1',
       'home_ownership2', 'home_ownership3', 'home_ownership4',
       'home_ownership5', 'home_ownership6', 'verification_status1',
       'verification_status2', 'verification_status3', 'purpose1', 'purpose2',
       'purpose3', 'purpose4', 'purpose5', 'purpose6', 'purpose7', 'purpose8',
       'purpose9', 'purpose10', 'purpose11', 'purpose12', 'purpose13',
       'purpose14', 'initial_list_status1', 'initial_list_status2',
       'mths_since_last_delinq1', 'mths_since_last_delinq2',
       'mths_since_l

In [7]:
# StandardScaler
from sklearn.preprocessing import StandardScaler

# All columns except the last are treated as features
# (presumably the last column is the `depvar` label — confirm against the CSV).
scale_data = train.iloc[:, :-1]

# Fit once on the training features; the same fitted `scaler` is reused for the test set later.
scaler = StandardScaler().fit(scale_data)

# Wrap the standardized array back into a DataFrame that keeps the original column names.
scaler_data_scaled = pd.DataFrame(
    scaler.transform(scale_data),
    columns=scale_data.columns,
)

# NOTE(review): rounding to 0 decimals collapses the standardized values onto whole
# numbers; the test-set cell applies the same rounding — confirm this is intentional.
scaler_data_scaled = scaler_data_scaled.round()
print(scaler_data_scaled)

       int_rate  annual_inc  dti  delinq_2yrs  inq_last_6mths  pub_rec  \
0          -1.0        -1.0  1.0         -0.0             0.0     -0.0   
1          -0.0         0.0 -2.0         -0.0             0.0      1.0   
2          -0.0        -0.0  1.0         -0.0             2.0     -0.0   
3           0.0         0.0 -0.0          4.0             1.0     -0.0   
4          -0.0        -1.0  1.0         -0.0             0.0      3.0   
...         ...         ...  ...          ...             ...      ...   
99995       1.0        -0.0 -0.0         -0.0             2.0      1.0   
99996      -1.0        -0.0 -2.0         -0.0            -1.0     -0.0   
99997       0.0        -0.0  2.0         -0.0             0.0     -0.0   
99998       2.0        -1.0 -2.0         -0.0             0.0     -0.0   
99999       1.0         1.0  2.0         -0.0            -1.0     -0.0   

       revol_bal  total_acc  collections_12_mths_ex_med  acc_now_delinq  ...  \
0           -1.0        0.0    

In [9]:
# IsolationForest
from sklearn.ensemble import IsolationForest
import collections

# contamination=0.1 asks the forest to flag 10% of the rows as outliers.
# NOTE(review): `max_features=1` is an int, which scikit-learn interprets as
# "draw exactly one feature per tree"; the library default is the float 1.0
# (all features). Confirm that a single feature per tree is intended.
clf = IsolationForest(
    n_estimators=100,
    max_samples="auto",
    contamination=0.1,
    max_features=1,
    bootstrap=False,
    n_jobs=1,
    random_state=42,
    verbose=0,
)

# Fit the isolation forest on the scaled training features.
clf.fit(scaler_data_scaled)

# predict() labels every row: -1 marks an outlier, 1 marks an inlier.
y_pred_outliers = clf.predict(scaler_data_scaled)

# Print the class counts explicitly: a bare expression in the middle of a cell
# is evaluated and then thrown away, so the counts were never shown.
print(collections.Counter(y_pred_outliers))

# Store the flag next to the features; later cells filter on this `out` column.
scaler_data_scaled['out'] = y_pred_outliers
outliers = scaler_data_scaled.loc[scaler_data_scaled['out'] == -1]
# Index labels of the flagged rows; the plotting function uses them as row positions.
outlier_index = list(outliers.index)

In [10]:
# 3. PCA (n_components = 3); the code is the same regardless of which scaler was used

# NOTE(review): Axes3D is not referenced by name; presumably imported so that the
# '3d' projection is registered on older matplotlib versions — confirm before removing.
from mpl_toolkits.mplot3d import Axes3D
from sklearn.decomposition import PCA
def scaled_pca(X_scaled):
    """Plot 3-component and 2-component PCA projections with flagged outliers highlighted.

    Two figures are shown one after the other: a 3D scatter of the first three
    principal components, then a 2D scatter of the first two. In both, the rows
    listed in the module-level ``outlier_index`` are drawn a second time with a
    distinct marker/edge so they stand out.

    Args:
        X_scaled: Feature matrix passed directly to ``PCA``. ``outlier_index``
            is used as positional row indices into its projection, so the row
            order must match the data the outliers were computed on.

    Returns:
        None. The function only draws and shows the figures.
    """
    pca = PCA(n_components=3)
    X_scaled_reduce = pca.fit_transform(X_scaled)
    fig = plt.figure()
    ax = fig.add_subplot(111, projection='3d')
    ax.set_zlabel("x_composite_3")
    # Plot the compressed data points (all rows, including the flagged ones)
    ax.scatter(X_scaled_reduce[:, 0], X_scaled_reduce[:, 1], zs=X_scaled_reduce[:, 2], s=4, lw=4, label="inliers",c="green")
    # Overlay x markers on the rows flagged as outliers
    ax.scatter(X_scaled_reduce[outlier_index,0],X_scaled_reduce[outlier_index,1], 
            X_scaled_reduce[outlier_index,2],
            lw=2, s=60, marker="x", c="red", label="outliers")
    ax.legend()
    plt.show()

    # 4. PCA (n_components = 2); the code is the same regardless of which scaler was used

    pca = PCA(2)
    pca.fit(X_scaled)
    res=pd.DataFrame(pca.transform(X_scaled))
    plt.title("IsolationForest")
    # All points first, then the flagged rows again with a red edge on top.
    plt.scatter(res[0], res[1], c='green',
                    s=20,label="normal points")
    plt.scatter(res.iloc[outlier_index,0],res.iloc[outlier_index,1], c='green',s=20,  edgecolor="red",label="predicted outliers")
    plt.legend(loc="upper right")
    plt.show()


In [16]:
# Re-attach the label column, then keep only the rows whose `out` flag is 1 (inliers).
scaler_data_scaled['depvar'] = train['depvar']
inlier_mask = scaler_data_scaled['out'] == 1
scale_data = scaler_data_scaled[inlier_mask]
scale_data.shape

(90000, 77)

In [18]:
# Label vector, and the feature matrix with the label and the outlier flag removed.
y = scale_data['depvar']
X = scale_data.drop(columns=['depvar', 'out'])
print(X.shape, y.shape)

(90000, 75) (90000,)


In [23]:
# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# print(f'train shape : {X_train.shape}, {y_train.shape}')
# print(f'train shape : {X_val.shape}, {y_val.shape}')

In [25]:
# y_val.sum()/len(y_val)

In [27]:
# Model scoring
def get_clf_eval(y_answer, y_pred):
    """Print a classification report and return the macro-averaged F1 score.

    Accuracy, precision, recall, ROC AUC, macro F1 and the confusion matrix are
    all computed from the same pair of arrays and printed between separator lines.

    NOTE(review): ROC AUC is computed from ``y_pred`` as given; the visible
    caller passes hard class predictions, not probabilities — confirm that is
    the intended AUC.

    Args:
        y_answer: Ground-truth labels.
        y_pred: Predicted labels for the same rows.

    Returns:
        The macro-averaged F1 score.
    """
    # Templates are paired with their scorer so the report is built in one pass,
    # in the same order the scores are printed.
    scorers = (
        ("정확도 : {:.6f}", metrics.accuracy_score),
        ("정밀도 : {:.6f}", metrics.precision_score),
        ("재현율 : {:.6f}", metrics.recall_score),
        ("AUC : {:.6f}", metrics.roc_auc_score),
    )
    report_lines = [template.format(scorer(y_answer, y_pred)) for template, scorer in scorers]
    macro_f1 = metrics.f1_score(y_answer, y_pred, average="macro")
    matrix = metrics.confusion_matrix(y_answer, y_pred)

    separator = "========================"
    print(separator)
    for line in report_lines:
        print(line)

    print(" ** F1 : {:.6f} **".format(macro_f1))

    print("====confusion_matrix====\n{}".format(matrix))
    print(separator)

    return macro_f1

In [1]:
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold

# 5-fold stratified CV: each fold keeps the class ratio of `y`.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

fold_scores = []
models = []

for train_index, val_index in skf.split(X, y):
    X_train, X_val = X.iloc[train_index], X.iloc[val_index]
    y_train, y_val = y.iloc[train_index], y.iloc[val_index]

    # Create a fresh estimator for every fold. Re-fitting one shared instance
    # would leave `models` holding five references to the same object (the
    # last fold's fit), so `best_model` could never be the best fold's model.
    model = XGBClassifier()
    model.fit(X_train, y_train)

    y_pred = model.predict(X_val)
    # get_clf_eval prints the report and returns the macro F1 for this fold.
    score = get_clf_eval(y_val, y_pred)
    fold_scores.append(score)
    models.append(model)

# Keep the model from the fold with the highest validation F1.
best_fold_index = fold_scores.index(max(fold_scores))
best_model = models[best_fold_index]

print(f"Best fold: {best_fold_index}")
print(f"Best fold score: {fold_scores[best_fold_index]}")
# The collected scores are macro F1 values, so label the mean accordingly.
print(f"Mean F1: {sum(fold_scores) / len(fold_scores)}")

NameError: name 'X' is not defined

In [33]:
# Drop the first column of the test file
# (presumably a row identifier — confirm against the CSV).
X_test = test.iloc[:, 1:]

# Apply the scaler fitted on the training features, restore the column names,
# and round exactly as the training features were rounded.
transformed_X_test = pd.DataFrame(
    scaler.transform(X_test),
    columns=X_test.columns,
).round()
print(transformed_X_test.shape)

(35816, 75)


In [35]:
# Predict labels for the scaled test features with the selected fold model.
test_pred = best_model.predict(transformed_X_test)

# Put the predictions into the sample submission's `answer` column and
# write the file without the DataFrame index.
submission = pd.read_csv('data/sample_submission.csv').assign(answer=test_pred)
submission.to_csv('submission.csv', index=False)