# Loan Prediction Based on Customer Behavior

# 라이브러리 호출

In [8]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, r2_score, accuracy_score, f1_score

from sklearn.preprocessing import MinMaxScaler
from category_encoders import OneHotEncoder, OrdinalEncoder

from sklearn.linear_model import LinearRegression, RidgeCV, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from xgboost import XGBRegressor, XGBClassifier
from sklearn.base import BaseEstimator

# 데이터 불러오기

In [9]:
train = pd.read_csv('/Users/hanhyeongu/Desktop/codestates/project/Section2/Loan prediction/Training Data.csv')
test = pd.read_csv('/Users/hanhyeongu/Desktop/codestates/project/Section2/Loan prediction/Test Data.csv')
sample_prediction = pd.read_csv('/Users/hanhyeongu/Desktop/codestates/project/Section2/Loan prediction/Sample Prediction Dataset.csv')

In [10]:
print('train 데이터 형태: ', train.shape)
print('test 데이터 형태: ', test.shape)

train 데이터 형태:  (252000, 13)
test 데이터 형태:  (28000, 12)


In [11]:
train.head()

Unnamed: 0,Id,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag
0,1,1303834,23,3,single,rented,no,Mechanical_engineer,Rewa,Madhya_Pradesh,3,13,0
1,2,7574516,40,10,single,rented,no,Software_Developer,Parbhani,Maharashtra,9,13,0
2,3,3991815,66,4,married,rented,no,Technical_writer,Alappuzha,Kerala,4,10,0
3,4,6256451,41,2,single,rented,yes,Software_Developer,Bhubaneswar,Odisha,2,12,1
4,5,5768871,47,11,single,rented,no,Civil_servant,Tiruchirappalli[10],Tamil_Nadu,3,14,1


In [12]:
test.head()

Unnamed: 0,ID,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS
0,1,7393090,59,19,single,rented,no,Geologist,Malda,West Bengal,4,13
1,2,1215004,25,5,single,rented,no,Firefighter,Jalna,Maharashtra,5,10
2,3,8901342,50,12,single,rented,no,Lawyer,Thane,Maharashtra,9,14
3,4,1944421,49,9,married,rented,yes,Analyst,Latur,Maharashtra,3,12
4,5,13429,25,18,single,rented,yes,Comedian,Berhampore,West Bengal,13,11


In [13]:
# train데이터와 test데이터의 ID 컬럼 동일시 하기 위해 컬럼명 변경
train.rename(columns={'Id':'ID'}, inplace=True)

# Data Proprocessing

## 중복 ID 검사

In [14]:
def analysis_on_features(df):
    '''
    데이터 프레임에 중복된 Id가 있는지 파악
    '''
    duplicated_rows = 0
    dupli = df.duplicated(subset=['ID'])
    for i in dupli.index:
        if dupli[i] == True:
            duplicated_rows += 1
            print(i)
    
    return duplicated_rows

In [15]:
train_duplicated_rows = analysis_on_features(train)
test_duplicated_rows = analysis_on_features(test)

print('train 데이터의 중복열 개수는 {}개 이다.' .format(train_duplicated_rows))
print('test 데이터의 중복열 개수는 {}개 이다.' .format(test_duplicated_rows))

train 데이터의 중복열 개수는 0개 이다.
test 데이터의 중복열 개수는 0개 이다.


## Feature Engineering 및 Preprocessing

### 직업 그룹화

In [16]:
train['Profession'].unique()

array(['Mechanical_engineer', 'Software_Developer', 'Technical_writer',
       'Civil_servant', 'Librarian', 'Economist', 'Flight_attendant',
       'Architect', 'Designer', 'Physician', 'Financial_Analyst',
       'Air_traffic_controller', 'Politician', 'Police_officer', 'Artist',
       'Surveyor', 'Design_Engineer', 'Chemical_engineer',
       'Hotel_Manager', 'Dentist', 'Comedian', 'Biomedical_Engineer',
       'Graphic_Designer', 'Computer_hardware_engineer',
       'Petroleum_Engineer', 'Secretary', 'Computer_operator',
       'Chartered_Accountant', 'Technician', 'Microbiologist',
       'Fashion_Designer', 'Aviator', 'Psychologist', 'Magistrate',
       'Lawyer', 'Firefighter', 'Engineer', 'Official', 'Analyst',
       'Geologist', 'Drafter', 'Statistician', 'Web_designer',
       'Consultant', 'Chef', 'Army_officer', 'Surgeon', 'Scientist',
       'Civil_engineer', 'Industrial_Engineer', 'Technology_specialist'],
      dtype=object)

- 의사(medical) -> 'Physician', 'Surgeon', 'Dentist'<br><br>

- 공직자(public) -> 'Civil_servant', 'Librarian', 'Politician', 'Police_officer', 'Magistrate', 'Firefighter', 'Official', 'Army_officer', 'Chartered_Accountant'  <br><br>

- 기계 & 건축 & 엔지니어(eng) -> 'Mechanical_engineer', 'Architect', 'Surveyor', 'Design_Engineer', 'Chemical_engineer', 'Biomedical_Engineer', Petroleum_Engineer’, 'Technician', 'Engineer'<br><br>

- IT(IT) - > 'Software_Developer', 'Graphic_Designer', 'Computer_hardware_engineer', 'Computer_operator', 'Web_designer', 'Technical_writer'<br><br>

- 예술(artfical) -> 'Designer', 'Artist', 'Comedian', 'Fashion_Designer'<br><br>  

- 전문직(pro) - > 'Economist', 'Flight_attendant', 'Financial_Analyst', 'Air_traffic_controller', 'Hotel_Manager', 'Secretary', 'Aviator', 'Lawyer', 'Analyst', 'Drafter', 'Consultant', 'Chef', 'Civil_engineer', 'Industrial_Engineer', 'Technology_specialist'<br><br>

- 학계(scholar) -> 'Microbiologist', 'Psychologist', 'Geologist', 'Statistician', 'Scientist'

In [17]:
pro_dic = {
    'Medical': 0,
    'Public': 0,
    'Eng': 0,
    'IT': 0,
    'Art': 0,
    'Pro': 0,
    'Scholar': 0
}

pro_dic['Medical'] = ['Physician', 'Surgeon', 'Dentist']
pro_dic['Public'] = ['Civil_servant', 'Librarian', 'Politician', 'Police_officer', 'Magistrate', 
            'Firefighter', 'Official', 'Army_officer', 'Chartered_Accountant']
pro_dic['Eng'] = ['Mechanical_engineer', 'Architect', 'Surveyor', 'Design_Engineer', 'Chemical_engineer', 
        'Biomedical_Engineer', 'Petroleum_Engineer', 'Technician' ,'Engineer']
pro_dic['IT'] = ['Software_Developer', 'Graphic_Designer', 'Computer_hardware_engineer', 
        'Computer_operator', 'Web_designer','Technical_writer']
pro_dic['Art'] = ['Designer', 'Artist', 'Comedian', 'Fashion_Designer']
pro_dic['Pro'] = ['Economist', 'Flight_attendant', 'Financial_Analyst', 'Air_traffic_controller', 'Hotel_Manager', 
        'Secretary', 'Aviator', 'Lawyer', 'Analyst', 'Drafter', 'Consultant', 'Chef', 'Civil_engineer', 'Industrial_Engineer', 
        'Technology_specialist']
pro_dic['Scholar'] = ['Microbiologist', 'Psychologist', 'Geologist', 'Statistician', 'Scientist']

### 주(STATE) 그룹화

In [18]:
train['STATE'].unique()

array(['Madhya_Pradesh', 'Maharashtra', 'Kerala', 'Odisha', 'Tamil_Nadu',
       'Gujarat', 'Rajasthan', 'Telangana', 'Bihar', 'Andhra_Pradesh',
       'West_Bengal', 'Haryana', 'Puducherry', 'Karnataka',
       'Uttar_Pradesh', 'Himachal_Pradesh', 'Punjab', 'Tripura',
       'Uttarakhand', 'Jharkhand', 'Mizoram', 'Assam',
       'Jammu_and_Kashmir', 'Delhi', 'Chhattisgarh', 'Chandigarh',
       'Uttar_Pradesh[5]', 'Manipur', 'Sikkim'], dtype=object)

- 북인도(NI) -> 'Madhya_Pradesh', 'Rajasthan', 'Haryana', 'Uttar_Pradesh', 'Himachal_Pradesh', 'Punjab', 'Uttarakhand', 'Jammu_and_Kashmir', 'Delhi', 'Chhattisgarh', 'Chandigarh', 'Uttar_Pradesh' 

- 서인도(WI) -> 'Maharashtra', 'Gujarat'

- 남인도(SI) -> 'Kerala', 'Tamil_Nadu', 'Telangana', 'Andhra_Pradesh', 'Puducherry', 'Karnataka' 

- 동인도(EI) -> 'Odisha', 'Bihar', 'West_Bengal', 'Jharkhand'

- 북동인도(NEI) -> 'Tripura', 'Mizoram', 'Assam', 'Manipur', 'Sikkim'  

In [19]:
state_dic = {
    'NI': 0,
    'WI': 0,
    'SI': 0,
    'EI': 0,
    'NEI': 0
}

state_dic['NI'] = ['Madhya_Pradesh', 'Rajasthan', 'Haryana', 'Uttar_Pradesh', 
    'Himachal_Pradesh', 'Punjab', 'Uttarakhand', 'Jammu_and_Kashmir', 'Delhi', 'Chhattisgarh', 
    'Chandigarh', 'Uttar_Pradesh']
state_dic['WI'] = ['Maharashtra', 'Gujarat']
state_dic['SI'] = ['Kerala', 'Tamil_Nadu', 'Telangana', 'Andhra_Pradesh', 'Puducherry', 'Karnataka']
state_dic['EI'] = ['Odisha', 'Bihar', 'West_Bengal', 'Jharkhand']
state_dic['NEI'] = ['Tripura', 'Mizoram', 'Assam', 'Manipur', 'Sikkim']

In [20]:
# profession 바꾸는 함수
def change_pro(data):
    for key, value in pro_dic.items():
        for i in value:
            if data == i:
                data = key
    return data


# state 바꾸는 함수
def change_state(data):
    for key, value in state_dic.items():
        for j in value:
            if data == j:
                data = key
    
    return data


# 전처리 함수
def eda_preprocess(df):
    # 분석에 사용하지 않을 'ID' 컬럼 삭제
    df = df.drop(['ID'], axis=1)

    # 'CITY'컬럼 삭제 -> 너무 높은 카디널리티
    df = df.drop(['CITY'], axis=1)

    # 'Profession' 컬럼 값 그룹화하기
    df['Profession'] = df['Profession'].apply(change_pro)
        
    # 'STATE'컬럼 값 그룹화하기
    df['STATE'] = df['STATE'].apply(change_state)

    # 'OutofWork_time' -> 직장이 없던 기간
    df['OutofWork_time'] = df['Experience'] - df['CURRENT_JOB_YRS']
    
    return df



In [21]:
train_processed = eda_preprocess(train.copy())
test_processed = eda_preprocess(test.copy())

In [22]:
train_processed

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,Risk_Flag,OutofWork_time
0,1303834,23,3,single,rented,no,Eng,NI,3,13,0,0
1,7574516,40,10,single,rented,no,IT,WI,9,13,0,1
2,3991815,66,4,married,rented,no,IT,SI,4,10,0,0
3,6256451,41,2,single,rented,yes,IT,EI,2,12,1,0
4,5768871,47,11,single,rented,no,Public,SI,3,14,1,8
...,...,...,...,...,...,...,...,...,...,...,...,...
251995,8154883,43,13,single,rented,no,Medical,EI,6,11,0,7
251996,2843572,26,10,single,rented,no,Public,NI,6,11,0,4
251997,4522448,46,7,single,rented,no,Eng,WI,7,12,0,0
251998,6507128,45,0,single,rented,no,IT,SI,0,10,0,0


# Modeling - Income 예측

## 변수 및 모델 정의

In [23]:
train1 = train_processed.copy()
test1 = test_processed.copy()

train1 = train1.drop(['Risk_Flag'], axis=1)
train1.head()

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,OutofWork_time
0,1303834,23,3,single,rented,no,Eng,NI,3,13,0
1,7574516,40,10,single,rented,no,IT,WI,9,13,1
2,3991815,66,4,married,rented,no,IT,SI,4,10,0
3,6256451,41,2,single,rented,yes,IT,EI,2,12,0
4,5768871,47,11,single,rented,no,Public,SI,3,14,8


In [24]:
# train, vl 데이터 나누는 함수
def split(df, target):
    X = df.drop([target], axis=1)
    y = df[target]

    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_val, y_train, y_val

In [25]:
X_train, X_val, y_train, y_val = split(train1, 'Income')

In [26]:
print('X_train의 형태: ', X_train.shape)
print('X_val의 형태: ', X_val.shape)


X_train의 형태:  (201600, 10)
X_val의 형태:  (50400, 10)


In [27]:
# 조정된 결정계수 r2 구하는 함수 
def adj_r2(r2, n, p):
    return 1 - (1-r2)*(n-1)/(n-p-1)

#### 기준모델

In [28]:
# 기준 모델 
# 평가지표는 MAE
mae_r2_score = {}
baseline_predict = [train1['Income'].mean()]*len(X_val)

baseline_mae = mean_absolute_error(y_val, baseline_predict)
baseline_r2 = r2_score(y_val, baseline_predict)
baseline_adj_r2 = adj_r2(baseline_r2, X_val.shape[0], X_val.shape[1])

mae_r2_score['baseline'] = [baseline_mae, baseline_r2, baseline_adj_r2]

#### LinearRegression

In [29]:
# LinearRegression
onehot = OneHotEncoder(use_cat_names=True)
X_train_encoded = onehot.fit_transform(X_train)
X_val_encoded = onehot.transform(X_val)

pipe_LinearR = Pipeline([
    ('mn_scale', MinMaxScaler()),
    ('LinearRegreesion', LinearRegression(n_jobs=-1))
])
pipe_LinearR.fit(X_train_encoded, y_train)
LinearRegression_pred = pipe_LinearR.predict(X_val_encoded)

LinearRegression_mae = mean_absolute_error(y_val, LinearRegression_pred)
LinearRegression_r2 = r2_score(y_val, LinearRegression_pred)
LinearRegression_adj_r2 = adj_r2(LinearRegression_r2, X_val_encoded.shape[0], X_val_encoded.shape[1])


mae_r2_score['LinearRegression'] = [LinearRegression_mae, LinearRegression_r2, LinearRegression_adj_r2]


#### RidgeCV

In [30]:
# RidgeCV
pipe_Ridge = Pipeline([
    ('mn_scale', MinMaxScaler()),
    ('ridge', RidgeCV())
])
pipe_Ridge.fit(X_train_encoded, y_train)
RidgeCV_pred = pipe_Ridge.predict(X_val_encoded)

RidgeCV_mae = mean_absolute_error(y_val, RidgeCV_pred)
RidgeCV_r2 = r2_score(y_val, RidgeCV_pred)
RidgeCV_adj_r2 = adj_r2(RidgeCV_r2, X_val_encoded.shape[0], X_val_encoded.shape[1])

mae_r2_score['RidgeCV'] = [RidgeCV_mae, RidgeCV_r2, RidgeCV_adj_r2]

#### RandomForestRegressor

In [31]:
# RnadomforestRegressor
pipe_RFR = Pipeline([
    ('ordinal', OrdinalEncoder()),
    ('RFregressor', RandomForestRegressor(n_jobs=-1, oob_score=True, random_state=42))
])
dists_RFr = {
    'RFregressor__max_depth': [10, 15, 20],
    'RFregressor__min_samples_split': [2, 3, 4],
    'RFregressor__min_samples_leaf': [1, 2, 3]
}

clf_RFr = RandomizedSearchCV(
    pipe_RFR,
    param_distributions=dists_RFr,
    n_iter=10,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42
)

clf_RFr.fit(X_train, y_train)
print('최적 하퍼라미터: ', clf_RFr.best_params_, '\n')
print('MAE: ', -clf_RFr.best_score_)


최적 하퍼라미터:  {'RFregressor__min_samples_split': 2, 'RFregressor__min_samples_leaf': 2, 'RFregressor__max_depth': 20} 

MAE:  708719.693347021


In [32]:
pipe_RFR = clf_RFr.best_estimator_
RFregressor_pred = pipe_RFR.predict(X_val)

RFregressor_mae = mean_absolute_error(y_val, RFregressor_pred)
RFregressor_r2 = r2_score(y_val, RFregressor_pred)
RFregressor_adj_r2 = adj_r2(RFregressor_r2, X_val.shape[0], X_val.shape[1])

mae_r2_score['RandomForestRegressor'] = [RFregressor_mae, RFregressor_r2, RFregressor_adj_r2]


#### XGBRegressor

In [33]:
class XGBoostWithEarlyStopLinear(BaseEstimator):
    def __init__(self, early_stopping_rounds=7, test_size=0.2, 
                 eval_metric='mae',n_jobs=-1, random_state=42, **estimator_params):
        self.early_stopping_rounds = early_stopping_rounds
        self.test_size = test_size
        self.eval_metric=eval_metric='mae'      
        self.n_jobs=n_jobs
        self.random_state=random_state  
        if self.estimator is not None:
            self.set_params(**estimator_params)

    def set_params(self, **params):
        return self.estimator.set_params(**params)

    def get_params(self, **params):
        return self.estimator.get_params()

    def fit(self, X, y):
        x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=self.test_size, random_state=42)
        self.estimator.fit(x_train, y_train, 
                           early_stopping_rounds=self.early_stopping_rounds, 
                           eval_metric=self.eval_metric, eval_set=[(x_val, y_val)])
        return self

    def predict(self, X):
        return self.estimator.predict(X)

class XGBoostRegressorWithEarlyStop(XGBoostWithEarlyStopLinear):
    def __init__(self, *args, **kwargs):
        self.estimator = XGBRegressor()
        super(XGBoostRegressorWithEarlyStop, self).__init__(*args, **kwargs)

In [34]:
# XGBRegressor
pipe_XGBR = Pipeline([
    ('ordinal', OrdinalEncoder()),
    ('XGBR', XGBoostRegressorWithEarlyStop())
])

dists_XGBr = {
    'XGBR__max_depth': [10, 15, 20],
    'XGBR__max_leaves': [1, 2, 3,],
    'XGBR__learning_rate': [0.1, 0.2]
}

clf_XGBr= RandomizedSearchCV(
    pipe_XGBR,
    param_distributions=dists_XGBr,
    n_iter=5,
    cv=5,
    scoring='neg_mean_absolute_error',
    random_state=42
)

clf_XGBr.fit(X_train, y_train)
print('최적 하퍼라미터: ', clf_XGBr.best_params_, '\n')
print('MAE: ', -clf_XGBr.best_score_)


[0]	validation_0-mae:4528219.30017
[1]	validation_0-mae:4132442.77233
[2]	validation_0-mae:3802103.82914
[3]	validation_0-mae:3526627.30904
[4]	validation_0-mae:3301147.57715
[5]	validation_0-mae:3107853.82083
[6]	validation_0-mae:2954361.29799
[7]	validation_0-mae:2811922.21838
[8]	validation_0-mae:2695970.10215
[9]	validation_0-mae:2591257.37012
[10]	validation_0-mae:2508982.11886
[11]	validation_0-mae:2432412.46703
[12]	validation_0-mae:2373119.51729
[13]	validation_0-mae:2311459.40489
[14]	validation_0-mae:2256957.17743
[15]	validation_0-mae:2211933.33015
[16]	validation_0-mae:2177868.57955
[17]	validation_0-mae:2134251.00762
[18]	validation_0-mae:2097473.48156
[19]	validation_0-mae:2063173.33180
[20]	validation_0-mae:2029159.41138
[21]	validation_0-mae:2007414.59066
[22]	validation_0-mae:1973214.56767
[23]	validation_0-mae:1946119.03380
[24]	validation_0-mae:1923102.58639
[25]	validation_0-mae:1895526.44088
[26]	validation_0-mae:1881053.51736
[27]	validation_0-mae:1856370.00750
[2

In [35]:
pipe_XGBR = clf_XGBr.best_estimator_
pipe_XGBR.fit(X_train, y_train)
XGBR_pred = pipe_XGBR.predict(X_val)

XGBR_mae = mean_absolute_error(y_val, XGBR_pred)
XGBR_r2 = r2_score(y_val, XGBR_pred)
XGBR_adj_r2 = adj_r2(XGBR_r2, X_val.shape[0], X_val.shape[1])

mae_r2_score['XGBRegressor'] = [XGBR_mae, XGBR_r2, XGBR_adj_r2]

[0]	validation_0-mae:4531730.53429
[1]	validation_0-mae:4105830.35997
[2]	validation_0-mae:3727584.75266
[3]	validation_0-mae:3386584.33965
[4]	validation_0-mae:3083681.49949
[5]	validation_0-mae:2814034.31371
[6]	validation_0-mae:2571893.73844
[7]	validation_0-mae:2355308.41482
[8]	validation_0-mae:2161939.72064
[9]	validation_0-mae:1986174.96362
[10]	validation_0-mae:1830126.53490
[11]	validation_0-mae:1690922.47617
[12]	validation_0-mae:1564547.25204
[13]	validation_0-mae:1452435.77404
[14]	validation_0-mae:1352306.03585
[15]	validation_0-mae:1262596.93690
[16]	validation_0-mae:1182756.53785
[17]	validation_0-mae:1109371.97406
[18]	validation_0-mae:1041182.24078
[19]	validation_0-mae:981723.79226
[20]	validation_0-mae:926759.42514
[21]	validation_0-mae:882045.63509
[22]	validation_0-mae:840777.54086
[23]	validation_0-mae:803767.06469
[24]	validation_0-mae:770287.84754
[25]	validation_0-mae:737168.61171
[26]	validation_0-mae:708298.36249
[27]	validation_0-mae:682282.79379
[28]	valida

In [36]:
# 모델들의 mae와 r2 비교 
mae_r2_score

{'baseline': [2484726.918723612,
  -2.553989657316791e-05,
  -0.0002240009773439855],
 'LinearRegression': [2482880.4607936507,
  0.0011113405417264577,
  0.0006156043189438076],
 'RidgeCV': [2482850.850584956, 0.0010954854360935018, 0.0005997413446158228],
 'RandomForestRegressor': [700732.3901609607,
  0.8099960289386787,
  0.809958321508275],
 'XGBRegressor': [415441.6735608104, 0.8030044937457606, 0.8029653988031631]}

## 모델 선택
결정 계수 R2와 조정된 결정계수 adj_R2만 본다면 'RandomForestRegressor'모델이 가장 성능이 좋아 보인다.  
그런데 'XGBRegressor'와 0.007정도의 차이만 있으며, XGBRegressor의 MAE가 RandomForestRegressor보다 약 285291정도의 차이만큼 작다.  
따라서 ***XGBRegressor모델***을 선택하는 것이 더 바람직하다고 생각한다.  

### 가장 좋은 모델의 성능 확인

In [37]:
# test 데이터로 모델 성능 확인
Income_test = test1['Income']
test1_1 = test1.drop(['Income'], axis=1)
test1_1.head()

Unnamed: 0,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,OutofWork_time
0,59,19,single,rented,no,Scholar,West Bengal,4,13,15
1,25,5,single,rented,no,Public,WI,5,10,0
2,50,12,single,rented,no,Pro,WI,9,14,3
3,49,9,married,rented,yes,Pro,WI,3,12,6
4,25,18,single,rented,yes,Art,West Bengal,13,11,5


In [38]:
blm_pred = pipe_XGBR.predict(test1_1)

blm_mae = mean_absolute_error(Income_test, blm_pred)
blm_r2 = r2_score(Income_test, blm_pred)
blm_adj_r2 = adj_r2(blm_r2, test1.shape[0], test1.shape[1])

print(f' XGBRegressor의 예측 성능-> MAE: {blm_mae}, r2: {blm_r2}, adj_r2: {blm_adj_r2}')

 XGBRegressor의 예측 성능-> MAE: 1551588.1840294364, r2: 0.3250925963998421, adj_r2: 0.32482734052448126


# Modeling - Risk_Flag 예측

## 변수 및 모델 정의

In [39]:
train2 = train_processed.copy()
test2 = test_processed.copy()

# test2의 'Income'값을 위에서 에측한 값으로 변경
test2['Income'] = blm_pred

In [40]:
X_train, X_val, y_train, y_val = split(train2, 'Risk_Flag')

In [41]:
print('X_train의 형태: ', X_train.shape)
print('X_val의 형태: ', X_val.shape)


X_train의 형태:  (201600, 11)
X_val의 형태:  (50400, 11)


#### 기준 모델

In [57]:
# 기준 모델
# 평가지표는 accuracy, f1 score
acc_f1_score = {}

baseline_predict = [train2['Risk_Flag'].mode()[0]]*len(X_val)

baseline_acc = accuracy_score(y_val, baseline_predict)
baseline_f1 = f1_score(y_val, baseline_predict)

acc_f1_score['baseline'] = [baseline_acc, baseline_f1]


#### LogisticRegression

In [58]:
# LogisticRegression
onehot = OneHotEncoder(use_cat_names=True)
X_train_encoded2 = onehot.fit_transform(X_train)
X_val_encoded2 = onehot.transform(X_val)


pipe_LogisticR = Pipeline([
    ('logistic', LogisticRegression(class_weight='balanced', n_jobs=-1, random_state=42))
])

dists_LR = {
    'logistic__C': [1.0, 2.0, 3.0],
    'logistic__solver': ['lbfgs', 'liblinear', 'saga']
}

clf_LR = RandomizedSearchCV(
    pipe_LogisticR,
    param_distributions=dists_LR,
    n_iter=20,
    cv=5,
    scoring='accuracy',
    random_state=42
)

clf_LR.fit(X_train_encoded2, y_train)
print('최적 하퍼라미터: ', clf_LR.best_params_, '\n')
print('Accuracy ', clf_LR.best_score_)




최적 하퍼라미터:  {'logistic__solver': 'saga', 'logistic__C': 1.0} 

Accuracy  0.7267311507936508


In [59]:
pipe_LogisticR = clf_LR.best_estimator_
LR_pred = pipe_LogisticR.predict(X_val_encoded2)

LR_acc = accuracy_score(y_val, LR_pred)
LR_f1 = f1_score(y_val, LR_pred)

acc_f1_score['LogisticRegression'] = [LR_acc, LR_f1]

#### RandomForestClassifier

In [60]:
# RandomForestClassifier
pipe_RFC = Pipeline([
    ('ordinal', OrdinalEncoder()),
    ('RFclassifier', RandomForestClassifier(class_weight='balanced', n_jobs=-1, oob_score=True, random_state=42))
])

dists_RFC = {
    'RFclassifier__criterion': ['gini', 'entropy', 'log_loss'],
    'RFclassifier__max_depth': [10, 15, 20],
    'RFclassifier__min_samples_split': [2, 3, 4],
    'RFclassifier__min_samples_leaf': [1, 2, 3],
    'RFclassifier__max_features': ['sqrt', 'log2']
}

clf_RFC = RandomizedSearchCV(
    pipe_RFC,
    param_distributions=dists_RFC,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42
)

clf_RFC.fit(X_train, y_train)
print('최적 하퍼라미터: ', clf_RFC.best_params_, '\n')
print('Accuracy ', clf_RFC.best_score_)


최적 하퍼라미터:  {'RFclassifier__min_samples_split': 4, 'RFclassifier__min_samples_leaf': 1, 'RFclassifier__max_features': 'log2', 'RFclassifier__max_depth': 20, 'RFclassifier__criterion': 'entropy'} 

Accuracy  0.8951587301587303


In [61]:
pipe_RFC = clf_RFC.best_estimator_
RFC_pred = pipe_RFC.predict(X_val)

RFC_acc = accuracy_score(y_val, RFC_pred)
RFC_f1 = f1_score(y_val, RFC_pred)

acc_f1_score['RandomForestClassifier'] = [RFC_acc, RFC_f1]

#### XGBoostClassifier

In [62]:
class XGBoostWithEarlyStopClassifier(BaseEstimator):
    def __init__(self, early_stopping_rounds=7, test_size=0.2, 
                 eval_metric='error', scale_pos_weight=7, n_jobs=-1, random_state=42, **estimator_params):
        self.early_stopping_rounds = early_stopping_rounds
        self.test_size = test_size
        self.eval_metric=eval_metric='error' 
        self.scale_pos_weight = scale_pos_weight
        self.n_jobs = n_jobs       
        self.random_state = random_state
        if self.estimator is not None:
            self.set_params(**estimator_params)

    def set_params(self, **params):
        return self.estimator.set_params(**params)

    def get_params(self, **params):
        return self.estimator.get_params()

    def fit(self, X, y):
        x_train, x_val, y_train, y_val = train_test_split(X, y, test_size=self.test_size, random_state=42)
        self.estimator.fit(x_train, y_train, 
                           early_stopping_rounds=self.early_stopping_rounds, 
                           eval_metric=self.eval_metric, eval_set=[(x_val, y_val)])
        return self

    def predict(self, X):
        return self.estimator.predict(X)

class XGBoostClassifierWithEarlyStop(XGBoostWithEarlyStopClassifier):
    def __init__(self, *args, **kwargs):
        self.estimator = XGBClassifier()
        super(XGBoostClassifierWithEarlyStop, self).__init__(*args, **kwargs)

In [63]:
# XGBClassifier
pipe_XGBC = Pipeline([
    ('ordinal', OrdinalEncoder()),
    ('XGBC', XGBoostClassifierWithEarlyStop())
])

dists_XGBc = {
    'XGBC__max_depth': [10, 15, 20],
    'XGBC__max_leaves': [1, 2, 3,],
    'XGBC__learning_rate': [0.1, 0.2]
}

clf_XGBc= RandomizedSearchCV(
    pipe_XGBC,
    param_distributions=dists_XGBc,
    n_iter=5,
    cv=5,
    scoring='accuracy',
    random_state=42
)

clf_XGBc.fit(X_train, y_train)
print('최적 하퍼라미터: ', clf_XGBc.best_params_, '\n')
print('Accuracy: ', clf_XGBc.best_score_)


[0]	validation_0-error:0.12202
[1]	validation_0-error:0.12165
[2]	validation_0-error:0.12137
[3]	validation_0-error:0.12156
[4]	validation_0-error:0.12106
[5]	validation_0-error:0.12131
[6]	validation_0-error:0.12119
[7]	validation_0-error:0.12150
[8]	validation_0-error:0.12143
[9]	validation_0-error:0.12171
[10]	validation_0-error:0.12193
[11]	validation_0-error:0.12171
[12]	validation_0-error:0.12190
[13]	validation_0-error:0.12205
[14]	validation_0-error:0.12215
[15]	validation_0-error:0.12218
[16]	validation_0-error:0.12212
[17]	validation_0-error:0.12212
[18]	validation_0-error:0.12236
[19]	validation_0-error:0.12243
[20]	validation_0-error:0.12246
[21]	validation_0-error:0.12255
[22]	validation_0-error:0.12255
[23]	validation_0-error:0.12258
[24]	validation_0-error:0.12258
[25]	validation_0-error:0.12236
[26]	validation_0-error:0.12236
[27]	validation_0-error:0.12230
[28]	validation_0-error:0.12252
[29]	validation_0-error:0.12209
[30]	validation_0-error:0.12212
[31]	validation_0-

In [64]:
pipe_XGBC = clf_XGBc.best_estimator_
pipe_XGBC.fit(X_train, y_train)
XGBC_pred = pipe_XGBC.predict(X_val)

XGBC_acc = accuracy_score(y_val, XGBC_pred),
XGBC_f1 = f1_score(y_val, XGBC_pred)

acc_f1_score['XGBClassifier'] = [XGBC_acc, XGBC_f1]

[0]	validation_0-error:0.10990
[1]	validation_0-error:0.10600
[2]	validation_0-error:0.10565
[3]	validation_0-error:0.10476
[4]	validation_0-error:0.10521
[5]	validation_0-error:0.10513
[6]	validation_0-error:0.10553
[7]	validation_0-error:0.10513
[8]	validation_0-error:0.10456
[9]	validation_0-error:0.10496
[10]	validation_0-error:0.10429
[11]	validation_0-error:0.10399
[12]	validation_0-error:0.10375
[13]	validation_0-error:0.10414
[14]	validation_0-error:0.10444
[15]	validation_0-error:0.10424
[16]	validation_0-error:0.10491
[17]	validation_0-error:0.10481
[18]	validation_0-error:0.10506
[19]	validation_0-error:0.10568
[20]	validation_0-error:0.10613
[21]	validation_0-error:0.10561
[22]	validation_0-error:0.10588
[23]	validation_0-error:0.10583
[24]	validation_0-error:0.10593
[25]	validation_0-error:0.10570
[26]	validation_0-error:0.10595
[27]	validation_0-error:0.10600
[28]	validation_0-error:0.10541
[29]	validation_0-error:0.10503
[30]	validation_0-error:0.10496
[31]	validation_0-

In [65]:
# 모델들의 accuracy와 f1 비교 
acc_f1_score

{'baseline': [0.8759325396825397, 0.0],
 'LogisticRegression': [0.8759325396825397, 0.0],
 'RandomForestClassifier': [0.8936507936507937, 0.6417112299465241],
 'XGBClassifier': [(0.898154761904762,), 0.5510364733665704]}

## 모델 선택
accuracy상으로는 XGBClassifier가 가장 좋아 보이지만 f1을 비교할 때는 RandomForestClassifier가 더 좋아보인다.  
글너데 accuracysms RandomForestClassifier와 XGBClassifier는 미묘한 차이를 보인다.  
따라서 ***RandomForestClassifier모델***을 선택하는 것이 바람직해 보인다.  

### 가장 좋은 모델의 성능 확인