In [39]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [40]:
pip install optuna

Note: you may need to restart the kernel to use updated packages.


In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

import warnings
warnings.filterwarnings('ignore')

In [42]:
# Read the preprocessed no-show dataset from the working directory.
df = pd.read_csv('./medical_noshow_processed_data.csv')

# Target: the 'No-show' column, kept as a one-column DataFrame.
# Features: every remaining column.
y = df[['No-show']]
x = df.drop(columns='No-show')

# 80/20 train/test split; shuffling with a fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    train_size=0.8,
    shuffle=True,
    random_state=42,
)


In [43]:
# Settings shared by the cells below
random_state = 42
n_splits = 11

# Shuffled K-fold splitter that later cells pass as `cv`
kfold = KFold(n_splits=n_splits, random_state=random_state, shuffle=True)

# Scaler
# The scaler is fitted on the training split only; the test split is then
# transformed with those same fitted statistics.
scaler = RobustScaler()
x_train = scaler.fit_transform(x_train)   # train: fit and transform
x_test = scaler.transform(x_test)         # test: transform only

## GridSearchCV

In [None]:
### GridSearchCV
# NOTE: everything between the triple quotes below is a plain string literal,
# i.e. a disabled experiment; none of the grid-search code inside it runs.

'''
#parameters_01
param = {
    'learning_rate': [0.1, 0.5, 1], # controls the learning rate
    'depth': [3, 4, 5], # controls the maximum depth of the tree
    'l2_leaf_reg': [2, 3, 4], # controls the L2 regularization term on weights
    'colsample_bylevel': [0.1, 0.2, 0.3], # specifies the fraction of columns to be randomly sampled for each level
    'n_estimators': [100, 200], # specifies the number of trees to be built
    'subsample': [0.1, 0.2, 0.3], # specifies the fraction of observations to be randomly sampled for each tree
    'border_count': [32, 64, 128],# specifies the number of splits for numerical features
    'bootstrap_type': ['Bernoulli', 'MVS']
} 



#parameters_02
param = {
    'learning_rate': [0.1, 0.01, 0.001],
    'depth': [3, 5, 6, 9],
    'l2_leaf_reg': [1, 3, 5, 7]
} 

cat = CatBoostClassifier()
model = GridSearchCV(cat, param,  cv = kfold, 
                   refit = True, verbose = 1, n_jobs = -1  )

#3. 훈련
import time
start_time = time.time()
model.fit(x_train, y_train)
end_time = time.time() - start_time

print('최적의 파라미터 : ', model.best_params_ )
print('최적의 매개변수 : ', model.best_estimator_)
print('best_score : ', model.best_score_)
print('model_score : ', model.score(x_test, y_test))
print('걸린 시간 : ', end_time, '초')

'''


# Commented-out model configuration and its recorded score
# (presumably the outcome of the search above -- TODO confirm).
#model = CatBoostClassifier( depth = 7, l2_leaf_reg = 7, learning_rate =  0.05, random_state=72) 
# model_score :  0.9208288092652913

## optuna

In [None]:
def objectiveCAT(trial: Trial, x_train, y_train, x_test, y_eval=None):
    """Optuna objective: fit a CatBoost classifier and return its hold-out accuracy.

    Parameters
    ----------
    trial : optuna.Trial
        Trial object used to sample the hyperparameters listed in ``param``.
    x_train, y_train
        Features and labels the model is fitted on.
    x_test
        Features the fitted model is scored on.
    y_eval : optional
        True labels matching ``x_test``. When omitted, the notebook-level
        ``y_test`` is used, which is what the four-argument call relies on.

    Returns
    -------
    float
        Accuracy of the model's predictions on ``x_test`` (higher is better).
    """
    if y_eval is None:
        # Backward-compatible fallback to the hold-out labels defined by the
        # notebook's train/test split.
        y_eval = y_test

    param = {
        'iterations': trial.suggest_int('iterations', 500, 4000),
        'depth' : trial.suggest_int('depth', 1, 16),
        'learning_rate' : trial.suggest_float('learning_rate', 0.005, 1),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10, log=True),
        'l2_leaf_reg' : trial.suggest_float('l2_leaf_reg', 0, 10),
        # NOTE(review): the seed is sampled like a hyperparameter, so the best
        # trial may partly reflect a favourable seed -- confirm this is intended.
        'random_state' :trial.suggest_int('random_state', 1, 2000)
    }
    # Build the model from the sampled parameters
    model = CatBoostClassifier(**param)
    # verbose=0: with up to 4000 iterations per trial, per-iteration logging
    # would flood the notebook output.
    CAT_model = model.fit(x_train, y_train, verbose=0)
    # Score on the hold-out features; accuracy_score expects (y_true, y_pred).
    score = accuracy_score(y_eval, CAT_model.predict(x_test))
    return score


# Search in the direction that maximizes the objective (hold-out accuracy).
# TPESampler : sampler using the TPE (Tree-structured Parzen Estimator) algorithm.
# The sampler is seeded with the notebook's random_state so the sequence of
# suggested parameters is repeatable.
study = optuna.create_study(direction='maximize',
                            sampler=TPESampler(seed=random_state))

# n_trials bounds the search; without n_trials (or a timeout) optimize() keeps running.
# Fit on the scaled training split only: passing the full (x, y) would train on
# the very rows used for scoring and on unscaled features, while x_test is scaled.
# NOTE(review): hyperparameters are still selected on x_test here; consider a
# separate validation split if x_test must stay untouched.
study.optimize(lambda trial: objectiveCAT(trial, x_train, y_train, x_test),
               n_trials=5)
print('Best trial : score {}, \nparams {}'.format(study.best_trial.value,
                                                  study.best_trial.params))

# optuna.visualization returns Plotly figures, so they are rendered with
# .show() (print() only dumps the figure's repr and plt.show() has no effect).
# Importance of each hyperparameter
optuna.visualization.plot_param_importances(study).show()
# Progress of the optimization across trials
optuna.visualization.plot_optimization_history(study).show()


## 모델 훈련 ( acc  0.9211 )

In [None]:
# 2. Model

# Earlier configuration, kept for reference:
# model = CatBoostClassifier( depth = 7, l2_leaf_reg = 7, learning_rate =  0.05, random_state=72)
# # result acc :  0.9208288092652913

# Current configuration. verbose=0 silences per-iteration logging, which would
# otherwise be printed for this fit and for every cross-validation fit below.
model = CatBoostClassifier(
    depth=7,
    l2_leaf_reg=7,
    learning_rate=0.05,
    random_state=72,
    iterations=1983,
    random_strength=0.508584392429003,
    verbose=0,
)
# result acc :  0.9211454940282302

# Alternative configuration, kept for reference:
# model = CatBoostClassifier(iterations= 1759, depth = 7, l2_leaf_reg = 7,random_strength =  9.992204520122275e-05,
#                            learning_rate =  0.8948182120285428, random_state=72 )
#                           #  task_type='GPU')

# 3. Train on the training split
model.fit(x_train, y_train)

# 4. Evaluate / predict
# Accuracy of the fitted model on the hold-out split
result = model.score(x_test, y_test)

# Per-fold accuracy from K-fold cross-validation on the training split
score = cross_val_score(model,
                        x_train, y_train,
                        cv=kfold)  # cv = cross validation

# NOTE(review): this cross-validates *within* the test split, i.e. clones of the
# model are refitted on test folds; it does not evaluate the model fitted above.
# Confirm this is the intended metric.
y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv=kfold)

acc = accuracy_score(y_test, y_predict)

print('결과 acc : ', result)
# The cross_val_score result was previously computed but never reported.
print('cv acc : ', score, '\ncv acc mean : ', score.mean())
print('cv pred acc : ', acc)


# result acc :  0.9640449692363373

## 스케일링

In [None]:
# Scaler comparison: fit the same CatBoost configuration on the data produced
# by each scaler and report the hold-out accuracy.

# x_train / x_test were already overwritten with RobustScaler output in an
# earlier cell, so rebuild the unscaled split here. The arguments and seed are
# the same as in the original split, so the rows line up with y_train / y_test.
x_train_raw, x_test_raw, _, _ = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)

sts = StandardScaler()
mms = MinMaxScaler()
mas = MaxAbsScaler()
rbs = RobustScaler()
qtf = QuantileTransformer()                     # maps the data onto the given quantiles;
                                                # default is 1,000 quantiles, changeable via n_quantiles
ptf1 = PowerTransformer(method='yeo-johnson')   # 'yeo-johnson' works with positive and negative values
ptf2 = PowerTransformer(method='box-cox')       # 'box-cox' works with strictly positive values only

scalers = [sts, mms, mas, rbs, qtf, ptf1, ptf2]
# The loop variable is deliberately not named `scaler`, so the fitted
# notebook-level RobustScaler is not clobbered.
for candidate in scalers:
    scale_name = candidate.__class__.__name__
    # Both PowerTransformer entries share a class name; add the method so the
    # two result lines can be told apart.
    method = getattr(candidate, 'method', None)
    if method is not None:
        scale_name = '{0}({1})'.format(scale_name, method)

    try:
        # Always start from the raw split and write to local names: reassigning
        # x_train / x_test here would stack every scaler on top of the previous ones.
        x_train_scaled = candidate.fit_transform(x_train_raw)
        x_test_scaled = candidate.transform(x_test_raw)
    except ValueError as err:
        # e.g. Box-Cox rejects data that is not strictly positive; report it
        # and continue instead of aborting the whole comparison.
        print('{0} skipped : {1}'.format(scale_name, err))
        continue

    model = CatBoostClassifier(depth=7, l2_leaf_reg=7, learning_rate=0.05,
                               random_state=72, verbose=0)

#     model = CatBoostClassifier( iterations= 1759, depth = 7, l2_leaf_reg = 7,random_strength =  9.992204520122275e-05,
#                             learning_rate =  0.8948182120285428, random_state=72,
#                              verbose = 0)
                          #  task_type='GPU')
    model.fit(x_train_scaled, y_train)
    y_predict = model.predict(x_test_scaled)
    result = accuracy_score(y_test, y_predict)
    print('{0} 결과 : {1:.4f}'.format(scale_name, result))


# Results recorded from an earlier run (presumably made while the scalers were
# still being applied cumulatively -- re-run to refresh):
# StandardScaler result : 0.9202
# MinMaxScaler result : 0.9202
# MaxAbsScaler result : 0.9202
# RobustScaler result : 0.9208
# QuantileTransformer result : 0.9208
# PowerTransformer result : 0.9202