# Library Load

In [None]:
import numpy as np
import pandas as pd

## Make results easier to inspect
from IPython.core.interactiveshell import InteractiveShell
# "all": ask IPython to display every top-level expression in a cell,
# not only the last one.
InteractiveShell.ast_node_interactivity = "all"

## preprocessing
from sklearn.model_selection import train_test_split

## RF
from sklearn.ensemble import RandomForestClassifier

## scoring
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

## rf 의사결정 나무 그래프
from sklearn.tree import export_graphviz
from subprocess import call
from IPython.display import Image


## vis
import matplotlib.pyplot as plt
import seaborn as sns

# Matplotlib rc overrides: hide the right and top axis spines.
custom_params = {
    "axes.spines.right": False,
    "axes.spines.top": False,
}

# Global seaborn theme for every plot drawn afterwards.
sns.set_theme(**{
    'context': 'notebook',     # target medium (alternatives: paper, talk, poster)
    'style': 'darkgrid',       # one of the built-in seaborn styles
    'palette': 'deep',         # colour palette
    'font': 'Malgun Gothic',   # font family
    'font_scale': 1,           # font size multiplier
    'rc': custom_params,       # fine-grained rc overrides defined above
})

## Fix the random seeds
import random
user_seed = 42
random.seed(user_seed)     # Python's built-in RNG
# scikit-learn falls back to NumPy's global RNG whenever a splitter or
# estimator is called without an explicit random_state, and random.seed()
# does not touch that generator, so it is seeded here as well.
np.random.seed(user_seed)

# Data Load

In [None]:
# Load the Excel workbook 'DATA.xlsx' (path relative to the working directory)
# into the DataFrame used by every later cell.
data = pd.read_excel('DATA.xlsx')

# 변수 지정

In [None]:
# Target columns, selected by name.
tar1 = data['TARGET']
tar2 = data['TARGET2A']
tar4 = data['ATM']

# Predictor matrix: everything except the first column and the last four.
# (presumably an id column and the target columns - confirm against DATA.xlsx)
features = data.iloc[:, 1:-4]

## tar1

In [None]:
## Stratified 80/20 train/test split on TARGET.
# random_state pins the shuffle; without it every run produced a different
# split, so the accuracy recorded below could not be reproduced.
train, test = train_test_split(
    data,
    test_size=0.2,
    stratify=data['TARGET'],   # keep the class proportions equal in both parts
    random_state=user_seed,
)
# NOTE (original author): splitting this way was flagged as technically
# being data leakage.

## Separate predictors (X) from the label (Y)
X_train, Y_train = train.iloc[:, 1:-4], train['TARGET']
X_test, Y_test = test.iloc[:, 1:-4], test['TARGET']

model = RandomForestClassifier(n_estimators=300, random_state=user_seed)

## Author's note: with random_state fixed to 42 and the id column removed,
# n_estimators=300 gave 57.3%, the best result so far.
# (That figure was recorded before the split itself was seeded - re-check it.)

model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

## Scoring helper
def get_clf_eval(Y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and the accuracy of a set of predictions.

    Args:
        Y_test: true labels.
        pred: predicted labels. When omitted, the notebook-level ``Y_pred``
            is used so that existing ``get_clf_eval(Y_test)`` calls keep
            working (raises NameError if ``Y_pred`` is not defined).
        pred_proba: accepted for signature compatibility; currently unused.
    """
    if pred is None:
        # Legacy behaviour: score the predictions stored in the global Y_pred.
        pred = Y_pred
    print('오차행렬 \n', confusion_matrix(Y_test, pred))
    print('정확도 :', accuracy_score(Y_test, pred))
    # Precision, recall, F1 and ROC-AUC were disabled by the original author
    # (noted as not working with the stratified split). Presumably they need
    # an explicit multi-class setting such as `average=` for this target -
    # TODO confirm before re-enabling.


# Score the hold-out predictions, then print true and predicted labels.
get_clf_eval(Y_test, Y_pred)

# (The Y_train dump stays disabled.)
y_true_arr = np.array(Y_test)
print(f'Y_test : {y_true_arr}')
print(f'Y_pred : {Y_pred}')

### FI scatter

In [None]:
## Rank the features by importance, highest first
dic = dict(zip(X_train.columns, model.feature_importances_))
item_li = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)

item_keys = [name for name, _ in item_li]
item_values = [score for _, score in item_li]

# x / y must be passed by keyword: seaborn deprecated positional x/y in 0.11
# and rejects them from 0.12 on.
sns.scatterplot(x=item_values, y=item_keys)
plt.savefig('tar1_FI')

### tar1 all vis
# Refit on the full data set (no hold-out) in order to draw one of the trees.
# (A single decision tree could be used here instead of a forest.)
model = RandomForestClassifier(n_estimators=450, random_state=user_seed)

# Train
model.fit(features, tar1)
# Extract a single fitted tree (index 5) from the forest
estimator = model.estimators_[5]

# Labels handed to export_graphviz, which expects them in ascending order of
# the class values - TODO confirm this order matches TARGET's encoding.
tar1_class = ['Unbanked', 'Under_Banked', 'Fully_Banked']

# Export the tree as Graphviz DOT source
export_graphviz(estimator, out_file='tree.dot',
                feature_names=list(features.columns),
                class_names=tar1_class,
                rounded=True, proportion=False,
                precision=2, filled=True)

# Convert DOT -> PNG with the Graphviz `dot` executable (must be on PATH).
# call() only returns the exit status, so check it; otherwise a failed render
# shows up later as a missing or stale image.
tree_png = 'all_vis_tree_tar1.png'
dot_status = call(['dot', '-Tpng', 'tree.dot', '-o', tree_png, '-Gdpi=600'])
if dot_status != 0:
    raise RuntimeError(
        f'Graphviz dot exited with status {dot_status} while rendering {tree_png}'
    )

# Display in the notebook
Image(filename=tree_png, height=300, width=450)

## tar3
- missing data는 listwise하게 없애기 <br>
(listwise란? 그 행의 데이터를 다 없애는 방식. `dropna`로 해결 됨.) <br>

In [None]:
# Listwise deletion: keep only the rows that have no missing value at all.
data_dropna = data.dropna(axis=0, how='any')
tar3 = data_dropna['TARGET2B']
# Same column window as the full-data feature matrix (skip first, drop last four).
features_dropna = data_dropna.iloc[:, 1:-4]

In [None]:
# Stratified 80/20 train/test split on TARGET2B (rows with missing values removed).
# random_state pins the shuffle; without it every run produced a different
# split, so the accuracy recorded below could not be reproduced.
train, test = train_test_split(
    data_dropna,
    test_size=0.2,
    stratify=data_dropna['TARGET2B'],   # keep the class proportions equal in both parts
    random_state=user_seed,
)
# NOTE (original author): splitting this way was flagged as technically
# being data leakage.

# Separate predictors (X) from the label (Y)
X_train, Y_train = train.iloc[:, 1:-4], train['TARGET2B']
X_test, Y_test = test.iloc[:, 1:-4], test['TARGET2B']

model = RandomForestClassifier(n_estimators=300, random_state=user_seed)

## Author's note: with random_state fixed to 42 and the id column removed,
# n_estimators=300 gave 64%.
# (That figure was recorded before the split itself was seeded - re-check it.)

model.fit(X_train, Y_train)
Y_pred = model.predict(X_test)

## Scoring helper
def get_clf_eval(Y_test, pred=None, pred_proba=None):
    """Print the confusion matrix and the accuracy of a set of predictions.

    Args:
        Y_test: true labels.
        pred: predicted labels. When omitted, the notebook-level ``Y_pred``
            is used so that existing ``get_clf_eval(Y_test)`` calls keep
            working (raises NameError if ``Y_pred`` is not defined).
        pred_proba: accepted for signature compatibility; currently unused.
    """
    if pred is None:
        # Legacy behaviour: score the predictions stored in the global Y_pred.
        pred = Y_pred
    print('오차행렬 \n', confusion_matrix(Y_test, pred))
    print('정확도 :', accuracy_score(Y_test, pred))


# Score the hold-out predictions, then print true and predicted labels.
get_clf_eval(Y_test, Y_pred)

# (The Y_train dump stays disabled.)
y_true_arr = np.array(Y_test)
print(f'Y_test : {y_true_arr}')
print(f'Y_pred : {Y_pred}')

### FI scatter

In [None]:
## Rank the features by importance, highest first
dic = dict(zip(X_train.columns, model.feature_importances_))
item_li = sorted(dic.items(), key=lambda pair: pair[1], reverse=True)

item_keys = [name for name, _ in item_li]
item_values = [score for _, score in item_li]

# x / y must be passed by keyword: seaborn deprecated positional x/y in 0.11
# and rejects them from 0.12 on.
sns.scatterplot(x=item_values, y=item_keys)
plt.savefig('tar3_FI')

### tar3 all vis

In [None]:
# Refit on every complete row (no hold-out) in order to draw one of the trees.
# (A single decision tree could be used here instead of a forest.)
model = RandomForestClassifier(n_estimators=450, random_state=user_seed)

# Train
model.fit(features_dropna, tar3)
# Extract a single fitted tree (index 5) from the forest
estimator = model.estimators_[5]

# Labels handed to export_graphviz, which expects them in ascending order of
# the class values - TODO confirm this order matches TARGET2B's encoding.
tar3_class = ['Under_Banked_No', 'Under_Banked_Yes']

# Export the tree as Graphviz DOT source.
# Feature names come from the frame the model was fitted on, so this cell
# does not depend on a train/test split made elsewhere.
export_graphviz(estimator, out_file='tree.dot',
                feature_names=list(features_dropna.columns),
                class_names=tar3_class,
                rounded=True, proportion=False,
                precision=2, filled=True)

# Convert DOT -> PNG with the Graphviz `dot` executable (must be on PATH).
# call() only returns the exit status, so check it; otherwise a failed render
# shows up later as a missing or stale image.
tree_png = 'all_vis_tree_tar3.png'
dot_status = call(['dot', '-Tpng', 'tree.dot', '-o', tree_png, '-Gdpi=600'])
if dot_status != 0:
    raise RuntimeError(
        f'Graphviz dot exited with status {dot_status} while rendering {tree_png}'
    )

# Display in the notebook
Image(filename=tree_png, height=300, width=450)


# Visualization
## tar1
# Small, shallow forest (10 trees, depth <= 3, 4 candidate features per split)
# so that the exported tree stays readable.
# (A single decision tree could be used here instead of a forest.)
model = RandomForestClassifier(n_estimators=10, max_depth=3, max_features=4, random_state=user_seed)

# Train
model.fit(features, tar1)
# Extract a single fitted tree (index 5) from the forest
estimator = model.estimators_[5]

# Labels handed to export_graphviz, which expects them in ascending order of
# the class values - TODO confirm this order matches TARGET's encoding.
tar1_class = ['Unbanked', 'Under_Banked', 'Fully_Banked']

# Export the tree as Graphviz DOT source
export_graphviz(estimator, out_file='tree.dot',
                feature_names=list(features.columns),
                class_names=tar1_class,
                rounded=True, proportion=False,
                precision=2, filled=True)

# Convert DOT -> PNG with the Graphviz `dot` executable (must be on PATH).
# call() only returns the exit status, so check it; otherwise a failed render
# shows up later as a missing or stale image.
tree_png = 'tree_tar1(3,4,seed42).png'
dot_status = call(['dot', '-Tpng', 'tree.dot', '-o', tree_png, '-Gdpi=600'])
if dot_status != 0:
    raise RuntimeError(
        f'Graphviz dot exited with status {dot_status} while rendering {tree_png}'
    )

# Display in the notebook
Image(filename=tree_png, height=300, width=450)

## tar3

In [None]:
# Listwise deletion again: keep only the rows that have no missing value,
# then pick the TARGET2B label and the predictor columns from what is left.
data_dropna = data.dropna(axis=0, how='any')
tar3 = data_dropna['TARGET2B']
features_dropna = data_dropna.iloc[:, 1:-4]


# Small, shallow forest (10 trees, depth <= 3, 4 candidate features per split)
# so that the exported tree stays readable.
# (A single decision tree could be used here instead of a forest.)
model = RandomForestClassifier(n_estimators=10, max_depth=3, max_features=4, random_state=user_seed)

# Train
model.fit(features_dropna, tar3)
# Extract a single fitted tree (index 5) from the forest
estimator = model.estimators_[5]

# Labels handed to export_graphviz, which expects them in ascending order of
# the class values - TODO confirm this order matches TARGET2B's encoding.
tar3_class = ['Under_Banked_No', 'Under_Banked_Yes']

# Export the tree as Graphviz DOT source
export_graphviz(estimator, out_file='tree.dot',
                feature_names=list(features_dropna.columns),
                class_names=tar3_class,
                rounded=True, proportion=False,
                precision=2, filled=True)

# Convert DOT -> PNG with the Graphviz `dot` executable (must be on PATH).
# call() only returns the exit status, so check it; otherwise a failed render
# shows up later as a missing or stale image.
tree_png = 'tree_tar3(3,4,seed42).png'
dot_status = call(['dot', '-Tpng', 'tree.dot', '-o', tree_png, '-Gdpi=600'])
if dot_status != 0:
    raise RuntimeError(
        f'Graphviz dot exited with status {dot_status} while rendering {tree_png}'
    )

# Display in the notebook
Image(filename=tree_png, height=300, width=450)