In [1]:
import pandas as pd
import numpy as np
import os
import random
from copy import deepcopy
import math
from itertools import product
import pickle
import joblib

import matplotlib.pyplot as plt
import seaborn as sb

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay

In [35]:
# Load the user-spec and loan-result CSV files from ./data.
# ./data/log_data.csv is not loaded in this notebook (its read was left disabled).
user_df, loan_df = (
    pd.read_csv(csv_path)
    for csv_path in ('./data/user_spec.csv', './data/loan_result.csv')
)

In [36]:
# Display (rows, columns) for both raw tables as a pair of tuples.
tuple(frame.shape for frame in (user_df, loan_df))

((1394216, 17), (13527363, 7))

In [37]:
# Keep only loan rows whose target `is_applied` is present.
# dropna(subset=...) filters positionally, so it stays correct even when the
# index holds duplicate labels and avoids a label lookup over the whole frame.
loan_df = loan_df.dropna(subset=['is_applied'])

In [69]:
# Inner-join user attributes onto loan rows via the shared application_id key.
merged_df = user_df.merge(loan_df, on='application_id', how='inner')

In [70]:
# Display (rows, columns) of the merged table.
merged_df.shape

(10270011, 23)

In [71]:
# Display the column labels available after the merge.
merged_df.columns

Index(['application_id', 'user_id', 'birth_year', 'gender', 'insert_time',
       'credit_score', 'yearly_income', 'income_type', 'company_enter_month',
       'employment_type', 'houseown_type', 'desired_amount', 'purpose',
       'personal_rehabilitation_yn', 'personal_rehabilitation_complete_yn',
       'existing_loan_cnt', 'existing_loan_amt', 'loanapply_insert_time',
       'bank_id', 'product_id', 'loan_limit', 'loan_rate', 'is_applied'],
      dtype='object')

In [72]:
# Count missing values per column of the merged table.
merged_df.isna().sum()

application_id                               0
user_id                                      0
birth_year                               91626
gender                                   91626
insert_time                                  0
credit_score                           1243812
yearly_income                                0
income_type                                  0
company_enter_month                     303568
employment_type                              0
houseown_type                                0
desired_amount                               0
purpose                                      0
personal_rehabilitation_yn             5873229
personal_rehabilitation_complete_yn    9232232
existing_loan_cnt                      2143811
existing_loan_amt                      3044140
loanapply_insert_time                        0
bank_id                                      0
product_id                                   0
loan_limit                                5625
loan_rate    

In [73]:
# Derive `age` from `birth_year` with 2022 as the fixed reference year, then
# remove the identifier, timestamp and personal-rehabilitation columns in place.
merged_df['age'] = merged_df['birth_year'].rsub(2022)
merged_df.drop(
    columns=[
        'application_id',
        'user_id',
        'birth_year',
        'insert_time',
        'loanapply_insert_time',
        'bank_id',
        'product_id',
        'personal_rehabilitation_yn',
        'personal_rehabilitation_complete_yn',
    ],
    inplace=True,
)

In [74]:
# Categorical columns to one-hot encode. The two personal_rehabilitation_*
# flags are not listed because they are dropped from merged_df beforehand.
cate_list = ['income_type', 'employment_type', 'houseown_type', 'purpose']

In [75]:
# One-hot encode every column in cate_list in a single call. With `columns`
# given, get_dummies prefixes each indicator with its source column name and
# appends the indicator groups in list order, matching a per-column loop.
merged_df = pd.get_dummies(merged_df, columns=cate_list)

# drop

In [84]:
# Work on an independent deep copy so merged_df stays available unchanged.
data_set = merged_df.copy(deep=True)

In [85]:
# Remove, in place, every row that has at least one missing value.
data_set.dropna(axis=0, how='any', inplace=True)

In [86]:
# Display (rows, columns) remaining after rows with missing values were dropped.
data_set.shape

(6718484, 41)

In [87]:
# Build a class-balanced subset: 100,000 rows with is_applied == 0 followed by
# 100,000 rows with is_applied == 1, each drawn with a fixed seed.
data_set = pd.concat([
    data_set[data_set['is_applied'] == label].sample(100_000, random_state=42)
    for label in (0, 1)
])

In [88]:
# Separate the target column from the features, then hold out 20% for testing.
y = data_set['is_applied']
X = data_set.drop(columns='is_applied')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [89]:
# Fit a default random forest with a fixed seed; fit() returns the estimator
# itself, and the trailing expression displays it as the cell output.
model = RandomForestClassifier(random_state=42).fit(X_train, y_train)
model

RandomForestClassifier(random_state=42)

In [90]:
# Display the classifier's mean accuracy on the held-out split.
model.score(X=X_test, y=y_test)

0.787425

In [95]:
# Print one (importance, feature) pair per line, largest importance first.
print('\n'.join(
    str(pair)
    for pair in sorted(zip(model.feature_importances_, X_test.columns), reverse=True)
))

(0.17717951451829048, 'credit_score')
(0.1536787724427905, 'loan_rate')
(0.09940527040932981, 'loan_limit')
(0.09150185835829833, 'desired_amount')
(0.08173739386260723, 'existing_loan_amt')
(0.07890960552409164, 'company_enter_month')
(0.07288739553121247, 'yearly_income')
(0.07248563232373308, 'age')
(0.05698642301275427, 'existing_loan_cnt')
(0.016614355908840542, 'income_type_EARNEDINCOME')
(0.011803254927539617, 'gender')
(0.010866493381125261, 'purpose_생활비')
(0.008828559093822418, 'houseown_type_전월세')
(0.008432704437602802, 'houseown_type_기타가족소유')
(0.007702812564903346, 'houseown_type_자가')
(0.007003873207271149, 'purpose_대환대출')
(0.006642881257334032, 'employment_type_정규직')
(0.0056735095298231985, 'employment_type_기타')
(0.004990439672428415, 'income_type_PRIVATEBUSINESS')
(0.004801543741054417, 'employment_type_계약직')
(0.0038961058688042477, 'income_type_EARNEDINCOME2')
(0.002470783405348363, 'purpose_사업자금')
(0.0023778252501610564, 'purpose_주택구입')
(0.002243420745427083, 'income_typ

# 돌려보기

편의를 위해서 gender와 age가 null인 행은 drop

In [108]:
# Discard rows with a missing `gender`. Only `gender` is checked here.
# dropna(subset=...) filters positionally, so it stays correct even when the
# index holds duplicate labels and avoids a label lookup over the whole frame.
merged_df = merged_df.dropna(subset=['gender'])

In [109]:
# Show only the columns that still contain missing values; the per-column
# counts are computed once and then filtered.
merged_df.isnull().sum().loc[lambda counts: counts != 0]

credit_score           1229941
company_enter_month     299731
existing_loan_cnt      2101116
existing_loan_amt      2994214
loan_limit                5584
loan_rate                 5584
dtype: int64

In [110]:
# Columns of merged_df that still contain at least one missing value.
null_feature = merged_df.isnull().sum().loc[lambda counts: counts != 0].index

# Candidate fill strategies: the constant 0, or the name of a Series
# aggregation method whose result is used as the fill value.
options = [0, 'min', 'max', 'mean', 'median']

# One seed per strategy combination. Choosing one option for each column gives
# len(options) ** len(null_feature) combinations (not their product), so the
# list must be that long for random_list[i] to exist on every iteration.
# A dedicated, seeded generator makes the seed list identical on every run
# without touching the global `random` state.
seed_rng = random.Random(42)
random_list = [
    seed_rng.randint(1, 1000)
    for _ in range(len(options) ** len(null_feature))
]

In [112]:
# Grid search over missing-value fill strategies: for every combination of one
# option per column in null_feature, draw a balanced 100k/100k sample, impute,
# train a random forest, and record accuracy, F1 and the confusion matrix.
# NOTE(review): every fitted model is kept in model_list, so memory use grows
# with the number of combinations.
model_list = []
acc_list = []
f1_list = []
confusion_list = []

# open() and joblib.dump() do not create missing parent directories.
os.makedirs('./mydata', exist_ok=True)
os.makedirs('./model', exist_ok=True)

for i, option_group in enumerate(product(options, repeat=len(null_feature))):
    # Wrap around so that a seed list shorter than the number of combinations
    # reuses seeds instead of raising IndexError.
    seed = random_list[i % len(random_list)]

    # Boolean indexing, .sample() and pd.concat() all return new objects, so
    # merged_df is left untouched without copying it first.
    data_set = pd.concat([
        merged_df[merged_df['is_applied'] == 0].sample(100_000, random_state=seed),
        merged_df[merged_df['is_applied'] == 1].sample(100_000, random_state=seed),
    ])

    # Apply THIS iteration's strategy to each column (option_group pairs up
    # with null_feature position by position).
    # NOTE(review): fill statistics are computed on the sampled set before the
    # train/test split, so held-out rows contribute to the fill values.
    for column_name, option in zip(null_feature, option_group):
        column = data_set[column_name]
        # 0 means "fill with the constant 0"; any other option names a Series
        # aggregation method (min / max / mean / median).
        fill_value = 0 if option == 0 else getattr(column, option)()
        # Assign back instead of fillna(inplace=True) on a column selection,
        # which does not reliably update the parent frame.
        data_set[column_name] = column.fillna(fill_value)

    X, y = data_set.loc[:, data_set.columns != 'is_applied'], data_set['is_applied']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )
    model = RandomForestClassifier(random_state=seed)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    model_list.append(model)
    acc_list.append(model.score(X_test, y_test))
    f1_list.append(f1_score(y_test, y_pred))
    confusion_list.append(confusion_matrix(y_test, y_pred))

    # Record which rows were sampled so the run can be reconstructed later.
    with open(f'./mydata/m_{option_group}.txt', 'w') as f:
        f.write('\n'.join(map(str, data_set.index)))
    # Persist the fitted model for this strategy combination.
    joblib.dump(model, f'./model/m_{option_group}.pkl')
    print(i, option_group, 'done')

KeyboardInterrupt: 