Library Import

In [1]:
import warnings
warnings.filterwarnings('ignore')

import sklearn
import xgboost as xgb
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OrdinalEncoder, OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from xgboost import XGBClassifier

Data Load

In [2]:
# Read the source table, then separate model inputs from the label.
dataset = pd.read_csv('./Mapping.csv')

# Model inputs for this run, including the cCON / cEXT / cNEU columns.
feature_columns = [
    'Income', 'Age', 'Experience', 'Married/Single', 'House_Ownership',
    'Car_Ownership', 'Profession', 'CITY', 'STATE',
    'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS',
    'cCON', 'cEXT', 'cNEU',
]
data = dataset[feature_columns]
# Double brackets keep the label as a single-column DataFrame.
target = dataset[['Risk_Flag']]

Train & Test Split

In [3]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# One shape per line: training features first, then test features.
print(x_train.shape, x_test.shape, sep='\n')

(31351, 14)
(7838, 14)


Data Encoding Function

In [4]:
def setDataset(data):
    """Return a new DataFrame with the yes/no style text values turned into 0/1.

    Replacements, applied in this order:
      * 'Married/Single': 'single' -> 0, 'married' -> 1
      * 'Car_Ownership':  'no' -> 0, 'yes' -> 1
      * every column:     'y' -> 1, 'n' -> 0

    The frame passed in is left untouched; each step produces a new frame.
    """
    replacement_steps = (
        ({'Married/Single': 'single'}, 0),
        ({'Married/Single': 'married'}, 1),
        ({'Car_Ownership': 'no'}, 0),
        ({'Car_Ownership': 'yes'}, 1),
        # The last two steps are not column-scoped: they apply to the whole frame.
        ('y', 1),
        ('n', 0),
    )
    encoded = data
    for to_replace, value in replacement_steps:
        encoded = encoded.replace(to_replace, value)
    return encoded

In [5]:
# Run both splits through the same value-replacement helper.
x_train, x_test = setDataset(x_train), setDataset(x_test)

In [6]:
# Display the training features after setDataset as a visual check of the replacements.
x_train

Unnamed: 0,Income,Age,Experience,Married/Single,House_Ownership,Car_Ownership,Profession,CITY,STATE,CURRENT_JOB_YRS,CURRENT_HOUSE_YRS,cCON,cEXT,cNEU
27683,1086347,26,10,0,rented,1,Civil_engineer,Eluru[25],Andhra_Pradesh,10,10,1,0,0
17258,8440968,22,12,0,rented,0,Design_Engineer,Saharsa[29],Bihar,5,12,1,1,1
15083,6325118,21,13,0,rented,0,Librarian,Burhanpur,Madhya_Pradesh,11,12,1,0,0
19698,3497108,27,20,0,rented,0,Financial_Analyst,Tiruppur,Tamil_Nadu,10,13,1,0,0
23309,7219160,24,7,0,rented,1,Microbiologist,Pune,Maharashtra,5,14,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,4545844,25,14,0,rented,0,Physician,Pondicherry,Puducherry,9,11,1,0,0
11284,5197016,28,6,0,rented,1,Aviator,Tiruppur,Tamil_Nadu,6,11,1,0,1
38158,7330378,22,13,0,rented,0,Consultant,Rewa,Madhya_Pradesh,7,12,1,0,0
860,4334503,22,6,0,owned,0,Designer,Ratlam,Madhya_Pradesh,6,14,1,0,0


Encoding & Scaling

In [7]:
# Encoding & Scale
# Column groups, one per preprocessing strategy.
standard_cols = ['Income', 'Age', 'Married/Single', 'Car_Ownership', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS']
ordinal_cols = ['House_Ownership']
onehot_cols = ['Profession', 'CITY', 'STATE']

# Stand-alone ordinal -> one-hot pipeline; column_trans below does not reference it.
onehot = Pipeline(steps=[
    ('oridinal', OrdinalEncoder()),
    ('onehot', OneHotEncoder()),
])

# Per-group preprocessing:
#   * standard_cols -> StandardScaler
#   * ordinal_cols  -> integer codes following the explicit category order given here
#   * onehot_cols   -> one-hot columns; handle_unknown='ignore' avoids errors on
#                      categories that were not present at fit time
# Columns that belong to no group are passed through unchanged.
column_trans = ColumnTransformer(
    transformers=[
        ('stadardard', StandardScaler(), standard_cols),
        (
            'ordinal',
            OrdinalEncoder(categories=[['norent_noown', 'rented', 'owned']]),
            ordinal_cols,
        ),
        ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_cols),
    ],
    remainder='passthrough',
)

In [8]:
# Fit the preprocessing on the training split only, then apply the already-fitted
# transformer to the test split (transform, not fit_transform).
scaled_X_train = column_trans.fit_transform(x_train)
scaled_X_test = column_trans.transform(x_test)

XGBClassifier Train

In [9]:
# Train an XGBoost classifier (no hyperparameters overridden) on the preprocessed training split.
clf = XGBClassifier()
clf.fit(scaled_X_train, y_train)

Predict

In [10]:
# Predict labels for the preprocessed test split and display the resulting array.
pred = clf.predict(scaled_X_test)
pred

array([0, 0, 0, ..., 0, 0, 0])

Accuracy & F1 Score

In [11]:
# Score the test-split predictions of the model trained with cCON / cEXT / cNEU.
acc = accuracy_score(y_true=y_test, y_pred=pred)
f1 = f1_score(y_true=y_test, y_pred=pred)

# The leading space in ' f1:' lines its colon up with 'acc:'.
print('acc:', acc)
print(' f1:', f1)

acc: 0.9016330696606277
 f1: 0.6092245311708059


Data without Personality Mapping

In [12]:
# Same inputs as the first run, minus the cCON / cEXT / cNEU columns.
feature_columns = [
    'Income', 'Age', 'Experience', 'Married/Single', 'House_Ownership',
    'Car_Ownership', 'Profession', 'CITY', 'STATE',
    'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS',
]
data = dataset[feature_columns]
# Double brackets keep the label as a single-column DataFrame.
target = dataset[['Risk_Flag']]

In [13]:
# Same split settings as the first run: 20% held out, random_state=42.
x_train, x_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# One shape per line: training features first, then test features.
print(x_train.shape, x_test.shape, sep='\n')

(31351, 11)
(7838, 11)


In [14]:
# Run both splits through the same value-replacement helper.
x_train, x_test = setDataset(x_train), setDataset(x_test)

In [15]:
# Encoding & Scale
# Column groups, one per preprocessing strategy.
standard_cols = ['Income', 'Age', 'Married/Single', 'Car_Ownership', 'CURRENT_JOB_YRS', 'CURRENT_HOUSE_YRS']
ordinal_cols = ['House_Ownership']
onehot_cols = ['Profession', 'CITY', 'STATE']

# Stand-alone ordinal -> one-hot pipeline; column_trans below does not reference it.
onehot = Pipeline(steps=[
    ('oridinal', OrdinalEncoder()),
    ('onehot', OneHotEncoder()),
])

# Per-group preprocessing:
#   * standard_cols -> StandardScaler
#   * ordinal_cols  -> integer codes following the explicit category order given here
#   * onehot_cols   -> one-hot columns; handle_unknown='ignore' avoids errors on
#                      categories that were not present at fit time
# Columns that belong to no group are passed through unchanged.
column_trans = ColumnTransformer(
    transformers=[
        ('stadardard', StandardScaler(), standard_cols),
        (
            'ordinal',
            OrdinalEncoder(categories=[['norent_noown', 'rented', 'owned']]),
            ordinal_cols,
        ),
        ('onehot', OneHotEncoder(handle_unknown='ignore'), onehot_cols),
    ],
    remainder='passthrough',
)

In [16]:
# Fit the preprocessing on the training split only, then apply the already-fitted
# transformer to the test split (transform, not fit_transform).
scaled_X_train = column_trans.fit_transform(x_train)
scaled_X_test = column_trans.transform(x_test)

In [17]:
# Train a fresh XGBoost classifier on the current training split.
# NOTE: this rebinds `clf`, so any model previously stored under that name is replaced.
clf = XGBClassifier()
clf.fit(scaled_X_train, y_train)

In [18]:
# Predict labels for the preprocessed test split; stored under a separate name (pred2).
pred2 = clf.predict(scaled_X_test)

In [19]:
# Score the pred2 predictions against the test labels.
acc2 = accuracy_score(y_true=y_test, y_pred=pred2)
f12 = f1_score(y_true=y_test, y_pred=pred2)

# The leading space in ' f1:' lines its colon up with 'acc:'.
print('acc:', acc2)
print(' f1:', f12)

acc: 0.9058433273794335
 f1: 0.6335650446871897


Conclusion

In [20]:
# Side-by-side summary of the two scored runs.
# The labels are Korean: '매핑 전 20대' ~ "before mapping, 20s" and
# '매핑 후 20대' ~ "after mapping, 20s".
# "Before mapping" reports acc2 / f12; "after mapping" reports acc / f1.
print('매핑 전 20대\nacc -', acc2, '/ f1 -', f12)
print('\n매핑 후 20대\nacc -', acc, '/ f1 -', f1)

매핑 전 20대
acc - 0.9058433273794335 / f1 - 0.6335650446871897

매핑 후 20대
acc - 0.9016330696606277 / f1 - 0.6092245311708059
