In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for full_path in (os.path.join(root, name) for name in names):
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# load data

In [36]:
!pwd
# Load the loan dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('../input/loan-data-set/loan_data_set.csv')
# Show the first rows as a quick sanity check of the loaded frame.
df.head()

# Data Preprocessing
전체 데이터의 수는 614개

In [37]:
# Summarize column dtypes and non-null counts.
df.info()

Gender, Married, Dependents, Self_Employed, LoanAmount, Loan_Amount_Term, Credit_History에서 null 값이 있는 것으로 파악됨

In [38]:
# Count missing values per column.
df.isnull().sum()

결측치의 분포를 히트맵으로 시각화하여 확인함

In [39]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Draw the boolean null mask as a heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(15, 10))
sns.heatmap(df.isnull(), cbar=True, ax=ax)

## Column descriptions
- Loan_ID : 대출 ID
- Gender : 성별
- Married : 결혼 여부
- Dependents : 부양가족 수 
- Education : 교육
- Self_Employed : 자영업
- ApplicantIncome : 소득
- CoapplicantIncome : 공동 신청인의 소득
- LoanAmount : 대출 금액
- Loan_Amount_Term : 대출 상환 기간 (개월)
- Credit_History : 신용정보
- Property_Area : 사는 지역
- Loan_Status : 대출 승인 여부 (target)

## Remove null values

In [51]:
# Print the distinct values of every column except Loan_ID and the
# income / loan-amount columns.
skipped_cols = ('Loan_ID', 'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount')
for col in df.columns:
    if col in skipped_cols:
        continue
    print(col, df[col].unique())

'ApplicantIncome','CoapplicantIncome', 'LoanAmount' 데이터의 결측값을 평균값으로 채움

In [43]:
def fiina_maen(df, col_list):
    """Return a copy of ``df`` with NaNs in the given columns replaced by the column mean.

    Args:
        df: Input DataFrame. It is not modified.
        col_list: Names of the numeric columns whose missing values are filled.

    Returns:
        A new DataFrame in which every column in ``col_list`` has its missing
        values replaced by that column's mean; other columns are unchanged.
    """
    # Work on a copy so that re-running the calling cell never alters the
    # frame the caller passed in.
    filled = df.copy()
    for col in col_list:
        filled[col] = filled[col].fillna(filled[col].mean())
    return filled

In [42]:
# Mean-impute the income and loan-amount columns.
df = fiina_maen(df, ['ApplicantIncome','CoapplicantIncome', 'LoanAmount'])

In [49]:
# Drop every row that still contains at least one missing value.
df = df.dropna(axis=0, how='any')

In [50]:
# Show the per-column missing-value counts for the current frame.
df.isnull().sum()

In [54]:
from sklearn import preprocessing

def drop_features(df, col_list):
    """Return a new DataFrame without the columns named in ``col_list``.

    Args:
        df: Input DataFrame. It is not modified.
        col_list: Column labels to remove.

    Returns:
        A new DataFrame containing every column of ``df`` except those in
        ``col_list``.

    Raises:
        KeyError: If a label in ``col_list`` is not a column of ``df``.
    """
    # Avoid inplace=True: it mutated the caller's frame (possibly a frame
    # derived from another one) and made the calling cell non-idempotent.
    return df.drop(columns=col_list)

def encode_feature(df, col_list):
    """Return a copy of ``df`` with the given columns label-encoded.

    Each column in ``col_list`` is encoded independently with its own
    ``LabelEncoder``, so the integer codes are only meaningful within a column.

    Args:
        df: Input DataFrame. It is not modified.
        col_list: Names of the columns to replace by their label codes.

    Returns:
        A new DataFrame in which every column in ``col_list`` holds the codes
        produced by ``LabelEncoder``; other columns are unchanged.
    """
    # Copy first: assigning columns on the caller's frame mutated it and could
    # trigger SettingWithCopyWarning when that frame was derived from another.
    encoded = df.copy()
    for feature in col_list:
        encoded[feature] = preprocessing.LabelEncoder().fit_transform(encoded[feature])
    return encoded

In [55]:
# Remove the Loan_ID identifier column.
df = drop_features(df, ['Loan_ID'])
# Label-encode the listed columns, including the Loan_Status target, as integer codes.
df = encode_feature(df, ['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed', 'Loan_Amount_Term', 'Credit_History', 'Property_Area', 'Loan_Status'])

In [64]:
# Cast ApplicantIncome to float64.
df = df.astype({'ApplicantIncome':'float64'})

In [65]:
# Show the last rows of the frame.
df.tail()

In [76]:
# Summarize column dtypes and non-null counts of the current frame.
df.info()

In [78]:
# Display the frame (pandas truncates the rendering of large frames).
df

In [82]:
# Separate the Loan_Status target from the remaining feature columns.
y_df = df['Loan_Status']
X_df = df.drop(columns=['Loan_Status'])

In [84]:
# Preview the feature frame.
X_df.head()

In [85]:
# Display the target series.
y_df

# EDA

In [71]:
# Draw the pairwise column correlations as a heatmap on an explicitly created axes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(df.corr(), cmap='Blues', ax=ax)

In [87]:
# Mean Loan_Status per Gender code (seaborn's barplot aggregates with the mean by default).
sns.barplot(x='Gender', y='Loan_Status', data=df)

In [88]:
# Mean Loan_Status per Married code (seaborn's barplot aggregates with the mean by default).
sns.barplot(x='Married', y='Loan_Status', data=df)

# model train & predict

In [91]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, test_size=0.2, random_state=11)

In [92]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [98]:
# Seed the tree-based models so their scores are reproducible across
# Restart & Run All; the seed matches the one used for train_test_split.
dt_clf = DecisionTreeClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
lr_clf = LogisticRegression(solver='liblinear')

# Fit each model on the training split and report accuracy on the held-out
# split. accuracy_score takes (y_true, y_pred) in that order.
dt_clf.fit(X_train, y_train)
dt_pred = dt_clf.predict(X_test)
print('DecisionTreeClassifier 정확도 : {0:.4f}'.format(accuracy_score(y_test, dt_pred)))

rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print('RandomForestClassifier 정확도 : {0:.4f}'.format(accuracy_score(y_test, rf_pred)))

lr_clf.fit(X_train, y_train)
lr_pred = lr_clf.predict(X_test)
print('LogisticRegression 정확도 : {0:.4f}'.format(accuracy_score(y_test, lr_pred)))

In [102]:
# Show the target as a plain NumPy array.
y_df.values

In [111]:
from sklearn.model_selection import KFold 

def exec_kfold(X_df, y_df, clf, folds=5):
    """Run K-fold cross-validation and print per-fold and mean accuracy.

    Args:
        X_df: Feature DataFrame; rows are selected positionally via ``.values``.
        y_df: Target Series aligned row-for-row with ``X_df``.
        clf: Estimator exposing ``fit`` and ``predict``; it is refit on every fold.
        folds: Number of splits passed to ``KFold``.

    Returns:
        None. The fold accuracies and their mean are only printed.
    """
    # Convert once up front instead of on every fold.
    features = X_df.values
    labels = y_df.values
    scores = []

    for iter_num, (fit_rows, eval_rows) in enumerate(KFold(n_splits=folds).split(X_df)):
        clf.fit(features[fit_rows], labels[fit_rows])
        accuracy = accuracy_score(labels[eval_rows], clf.predict(features[eval_rows]))
        scores.append(accuracy)
        print(f'교차 검증 {iter_num} acc : {accuracy:.2f}')

    mean_score = np.mean(scores)
    print(f'평균 정확도 {mean_score:.2f}')

In [115]:
# Cross-validate the decision tree with the default 5 folds.
exec_kfold(X_df, y_df, dt_clf)

In [116]:
# Cross-validate the random forest with the default 5 folds.
exec_kfold(X_df, y_df, rf_clf)

In [117]:
# Cross-validate the logistic regression with the default 5 folds.
exec_kfold(X_df, y_df, lr_clf)

sklearn.model_selection의 cross_val_score api를 사용

In [118]:
from sklearn.model_selection import cross_val_score

# Let scikit-learn run the 5-fold evaluation of the decision tree in one call.
scores = cross_val_score(estimator=dt_clf, X=X_df, y=y_df, cv=5)

# Report each fold's accuracy, then the mean over all folds.
fold_lines = [f'교차 검증 {iter_num} acc : {accuracy:.2f}' for iter_num, accuracy in enumerate(scores)]
print('\n'.join(fold_lines))

print(f'평균 정확도 {np.mean(scores):.2f}')