In [None]:
# 기본
import numpy as np
import matplotlib.pyplot as plt # Graph

# 데이터 가져오기
import pandas as pd
from sklearn import datasets

# 데이터 전처리
from sklearn.preprocessing import StandardScaler    # 연속 변수 표준화
from sklearn import preprocessing                   # 범주형 변수 수치화
from sklearn.preprocessing import LabelEncoder      # 범주형 변수 수치화

# 훈련/검증용 데이터 분리
from sklearn.model_selection import train_test_split    # 훈련과 테스트를 위한 데이터 분리

# 분류 모델
# from sklearn.tree import DecisionTreeClassifier       # 의사결정나무
# from sklearn.naive_bayes import GaussianNB            # 나이브 베이즈 분류
# from sklearn.neighbors import KNeighborsClassifier    # K-최근접 이웃
# from sklearn.ensemble import RandomForestClassifier   # 랜덤 포레스트
# from sklearn.linear_model import LogisticRegression   # 로지스틱 회귀분석
# from sklearn.svm import SVC                           # SVM(서포트벡터머신)
# from sklearn.neural_network import MLPClassifier      # 다층 인공신경망
# from sklearn.ensemble import VotingClassifier         # 과반수 투표(Majority Voting) 
# from sklearn.ensemble import BaggingClassifier        # 배깅(Bagging) 
# from sklearn.ensemble import AdaBoostClassifier       # 부스팅(Boosting) 
from lightgbm import LGBMClassifier                     # lightGBM

# 모델 검정
from sklearn.metrics import confusion_matrix, classification_report # 정오분류표
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer  # 정확도, 민감도 등
from sklearn.metrics import roc_curve, roc_auc_score, auc   # ROC 곡선

# 최적화
from sklearn.model_selection import cross_validate, cross_val_score  # 교차 타당도
from sklearn.pipeline import make_pipeline  # 파이프라인 구축
from sklearn.model_selection import learning_curve, validation_curve # 학습곡선, 검증곡선
from sklearn.model_selection import GridSearchCV    # 하이퍼파라미터 튜닝

In [None]:
# Load the training set, then print each column's dtype and non-null count
# so that columns containing missing values can be spotted.
train_df = pd.read_csv(filepath_or_buffer='../Data/train.csv')
train_df.info()

In [None]:
# Convert the categorical columns to numeric indicator (one-hot) columns.
train_df = pd.get_dummies(data=train_df)

In [None]:
# Preview the first five rows of the frame.
train_df.head(n=5)

In [None]:
# Replace missing `previous_year_rating` values with the column mean (train).
# The result is assigned back to the column instead of calling
# `fillna(..., inplace=True)` on the selected column: that chained in-place
# form raises a FutureWarning in pandas 2.x and does not update `train_df`
# when Copy-on-Write is enabled, which would leave the nulls in place.
# NOTE(review): the mean is computed over all rows, i.e. before the
# train/test split below — consider imputing from the training split only.
rating_mean = train_df['previous_year_rating'].mean()
train_df['previous_year_rating'] = train_df['previous_year_rating'].fillna(rating_mean)

# Verify the imputation by counting the nulls left in the whole frame.
print('데이터 셋의 Null 값 개수:', train_df.isnull().sum().sum())

In [None]:
# Show the column labels of the frame.
train_df.columns

In [None]:
# Features used for prediction: every column except the target
# (`is_promoted`) and `employee_id`.
X = train_df.drop(columns=['is_promoted', 'employee_id'])
X.head()

In [None]:
# Target to predict.
y = train_df['is_promoted']

# Number of samples per class value.
np.bincount(y)

In [None]:
# Hold out 30% of the rows as the test set. The split uses a fixed random
# seed and is stratified on y, so both parts keep the label proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
    stratify=y,
)

In [None]:
# Baseline LightGBM classifier with n_estimators set to 400.
lgbm = LGBMClassifier(n_estimators=400)

In [None]:
# Train the baseline model on the training split.
lgbm.fit(X=X_train, y=y_train)

In [None]:
# Predict labels for the held-out test split with the baseline model.
y_pred = lgbm.predict(X=X_test)

In [None]:
# Confusion matrix as a labelled table: rows are the true labels,
# columns are the predicted labels.
confmat = pd.DataFrame(
    data=confusion_matrix(y_true=y_test, y_pred=y_pred),
    index=['True[0]', 'True[1]'],
    columns=['Predict[0]', 'Predict[1]'],
)
confmat

In [None]:
# Test-set metrics for the current predictions: number of misclassified
# samples, accuracy, precision, recall and F1.
baseline_errors = (y_test != y_pred).sum()
baseline_accuracy = accuracy_score(y_test, y_pred)
baseline_precision = precision_score(y_true=y_test, y_pred=y_pred)
baseline_recall = recall_score(y_true=y_test, y_pred=y_pred)
baseline_f1 = f1_score(y_true=y_test, y_pred=y_pred)

print('잘못 분류된 샘플 개수: %d' % baseline_errors)
print('정확도: %.3f' % baseline_accuracy)
print('정밀도: %.3f' % baseline_precision)
print('재현율: %.3f' % baseline_recall)
print('F1: %.3f' % baseline_f1)

In [None]:
# Hyper-parameter grid for the LightGBM search
# (5 * 3 * 4 * 3 * 3 = 540 combinations).
param_dict = {
    # maximum number of leaves per tree
    'num_leaves': [20, 40, 60, 80, 100],
    # minimum number of samples required in a leaf
    'min_child_samples': [5, 10, 15],
    # maximum tree depth; -1 means no limit in LightGBM
    'max_depth': [-1, 5, 10, 20],
    # boosting learning rate
    'learning_rate': [0.05, 0.1, 0.2],
    # L1 regularization strength
    'reg_alpha': [0, 0.01, 0.03],
}

In [None]:
# Exhaustive cross-validated search over `param_dict`, starting from `lgbm`.
# Candidates are ranked by F1 instead of the default classifier score
# (accuracy): the notebook judges models by precision/recall/F1, and accuracy
# can hide poor positive-class performance when the classes are imbalanced.
# As a consequence `best_score_` below is the mean cross-validated F1.
grid = GridSearchCV(
    lgbm,
    param_grid=param_dict,
    scoring='f1',
    n_jobs=-1,  # run the candidate fits in parallel on all available cores
)

# fit() returns the search object itself, so no reassignment is needed.
grid.fit(X_train, y_train)

print(f'Best Score : {grid.best_score_}')
print(f'Best Param : {grid.best_params_}')

In [None]:
# The search above is created with GridSearchCV's default refit=True, so
# `best_estimator_` has already been refit on the whole of X_train with the
# best parameter combination. Calling fit() on it again would only repeat
# that training run, so the estimator is used as-is.
best_LGBM = grid.best_estimator_
best_LGBM

In [None]:
# Predict labels for the held-out test split with the tuned model.
y_pred = best_LGBM.predict(X=X_test)

In [None]:
# Confusion matrix as a labelled table: rows are the true labels,
# columns are the predicted labels.
confmat = pd.DataFrame(
    data=confusion_matrix(y_true=y_test, y_pred=y_pred),
    index=['True[0]', 'True[1]'],
    columns=['Predict[0]', 'Predict[1]'],
)
confmat

In [None]:
# Accuracy, precision, recall and F1 score (plus the misclassified count)
# for the current test-set predictions.
tuned_errors = (y_test != y_pred).sum()
tuned_accuracy = accuracy_score(y_test, y_pred)
tuned_precision = precision_score(y_true=y_test, y_pred=y_pred)
tuned_recall = recall_score(y_true=y_test, y_pred=y_pred)
tuned_f1 = f1_score(y_true=y_test, y_pred=y_pred)

print(f'잘못 분류된 샘플 개수: {tuned_errors}')
print(f'정확도: {tuned_accuracy:.3f}')
print(f'정밀도: {tuned_precision:.3f}')
print(f'재현율: {tuned_recall:.3f}')
print(f'F1: {tuned_f1:.3f}')