In [2]:
pip install catboost

Note: you may need to restart the kernel to use updated packages.


In [19]:
pip install optuna

Collecting optuna
  Downloading optuna-3.1.1-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 21.5 MB/s eta 0:00:01
Collecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting cmaes>=0.9.1
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.11.1-py3-none-any.whl (224 kB)
[K     |████████████████████████████████| 224 kB 72.0 MB/s eta 0:00:01
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.whl (78 kB)
[K     |████████████████████████████████| 78 kB 12.1 MB/s eta 0:00:01
[?25hCollecting typing-extensions>=4
  Downloading typing_extensions-4.5.0-py3-none-any.whl (27 kB)
Collecting importlib-resources
  Downloading importlib_resources-5.12.0-py3-none-any.whl (36 kB)
Installing collected packages: typing-extensions, Mako, importlib-resources, colorlog, cmaes, alembic, optuna
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 3.7.4.3
    U

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import MinMaxScaler, StandardScaler, MaxAbsScaler, RobustScaler
from sklearn.preprocessing import QuantileTransformer, PowerTransformer
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

import warnings
warnings.filterwarnings('ignore')

In [12]:
df = pd.read_csv('./medical_noshow3.csv')

x = df.loc[:, df.columns != 'No-show']
y = df[['No-show']]   

x_train, x_test, y_train,y_test = train_test_split(
    x, y, train_size=0.8, shuffle=True, random_state=42
)


In [13]:
# 변수 설정

n_splits = 11    
random_state = 42

scaler = MinMaxScaler()


kfold = KFold(n_splits=n_splits, shuffle=True, 
              random_state=random_state)

# Scaler
scaler.fit(x_train)                 
x_train = scaler.transform(x_train)   # train 은 fit, transform 모두 해줘야 함
x_test = scaler.transform(x_test) 

## GridSearchCV

In [None]:
### GridSearchCV

'''
#parameters_01
param = {
    'learning_rate': [0.1, 0.5, 1], # controls the learning rate
    'depth': [3, 4, 5], # controls the maximum depth of the tree
    'l2_leaf_reg': [2, 3, 4], # controls the L2 regularization term on weights
    'colsample_bylevel': [0.1, 0.2, 0.3], # specifies the fraction of columns to be randomly sampled for each level
    'n_estimators': [100, 200], # specifies the number of trees to be built
    'subsample': [0.1, 0.2, 0.3], # specifies the fraction of observations to be randomly sampled for each tree
    'border_count': [32, 64, 128],# specifies the number of splits for numerical features
    'bootstrap_type': ['Bernoulli', 'MVS']
} 



#parameters_02
param = {
    'learning_rate': [0.1, 0.01, 0.001],
    'depth': [3, 5, 6, 9],
    'l2_leaf_reg': [1, 3, 5, 7]
} 

cat = CatBoostClassifier()
model = GridSearchCV(cat, param,  cv = kfold, 
                   refit = True, verbose = 1, n_jobs = -1  )

#3. 훈련
import time
start_time = time.time()
model.fit(x_train, y_train)
end_time = time.time() - start_time

print('최적의 파라미터 : ', model.best_params_ )
print('최적의 매개변수 : ', model.best_estimator_)
print('best_score : ', model.best_score_)
print('model_score : ', model.score(x_test, y_test))
print('걸린 시간 : ', end_time, '초')

'''


#model = CatBoostClassifier( depth = 7, l2_leaf_reg = 7, learning_rate =  0.05, random_state=72) 
# model_score :  0.9208288092652913

## optuna

In [None]:
def objectiveCAT(trial: Trial, x_train, y_train, x_test):
    param = {
        'iterations': trial.suggest_int('iterations', 500, 4000),
        'depth' : trial.suggest_int('depth', 1, 16),
        'learning_rate' : trial.suggest_float('learning_rate', 0.005, 1),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10, log=True),
        'l2_leaf_reg' : trial.suggest_float('l2_leaf_reg', 0, 10),
        'random_state' :trial.suggest_int('random_state', 1, 2000)
    }
    # 학습 모델 생성
    model = CatBoostClassifier(**param)
    CAT_model = model.fit(x_train, y_train, verbose=True) # 학습 진행
    # 모델 성능 확인
    score = accuracy_score(CAT_model.predict(x_test), y_test)
    return score


# MAE가 최소가 되는 방향으로 학습을 진행
# TPESampler : Sampler using TPE (Tree-structured Parzen Estimator) algorithm.
study = optuna.create_study(direction='maximize', sampler=TPESampler())

# n_trials 지정해주지 않으면, 무한 반복
study.optimize(lambda trial : objectiveCAT(trial, x, y, x_test), n_trials = 5)
print('Best trial : score {}, /nparams {}'.format(study.best_trial.value, 
                                                  study.best_trial.params))

# 하이퍼파라미터별 중요도를 확인할 수 있는 그래프
print(optuna.visualization.plot_param_importances(study))
# 하이퍼파라미터 최적화 과정을 확인
optuna.visualization.plot_optimization_history(study)
plt.show()


## 모델 훈련 ( acc  0.9640 )

In [16]:
# 2. 모델
model = CatBoostClassifier(iterations= 1759, depth = 7, l2_leaf_reg = 7,random_strength =  9.992204520122275e-05,
                           learning_rate =  0.8948182120285428, random_state=72 )
                          #  task_type='GPU') 

# 3. 훈련
model.fit(x_train, y_train)

#4. 평가, 예측
result = model.score(x_test,y_test)

score = cross_val_score( model, 
                        x_train, y_train,
                        cv = kfold )  #cv = cross validation_

y_predict = cross_val_predict(model,
                              x_test, y_test,
                              cv = kfold)

acc = accuracy_score(y_test, y_predict)

print('결과 acc : ', result)
print('cv pred acc : ', acc )


결과 acc :  0.9640449692363373

0:	learn: 0.1699411	total: 79.1ms	remaining: 2m 19s
1:	learn: 0.1683875	total: 101ms	remaining: 1m 28s
2:	learn: 0.1679655	total: 120ms	remaining: 1m 10s
3:	learn: 0.1670036	total: 140ms	remaining: 1m 1s
4:	learn: 0.1664487	total: 160ms	remaining: 56.3s
5:	learn: 0.1661251	total: 179ms	remaining: 52.3s
6:	learn: 0.1657172	total: 198ms	remaining: 49.5s
7:	learn: 0.1653518	total: 216ms	remaining: 47.2s
8:	learn: 0.1649862	total: 235ms	remaining: 45.7s
9:	learn: 0.1645509	total: 254ms	remaining: 44.4s
10:	learn: 0.1643047	total: 273ms	remaining: 43.5s
11:	learn: 0.1639951	total: 294ms	remaining: 42.8s
12:	learn: 0.1637709	total: 316ms	remaining: 42.4s
13:	learn: 0.1634227	total: 336ms	remaining: 41.8s
14:	learn: 0.1631741	total: 359ms	remaining: 41.7s
15:	learn: 0.1628743	total: 379ms	remaining: 41.3s
16:	learn: 0.1626051	total: 398ms	remaining: 40.8s
17:	learn: 0.1621998	total: 417ms	remaining: 40.3s
18:	learn: 0.1619216	total: 440ms	remaining: 40.3s
19:	learn: 0.1616676	total: 459ms	rem

## 스케일링

In [None]:
# 스케일링
sts = StandardScaler() 
mms = MinMaxScaler()
mas = MaxAbsScaler()
rbs = RobustScaler()
qtf = QuantileTransformer()                     # QuantileTransformer 는 지정된 분위수에 맞게 데이터를 변환함. 
                                                # 기본 분위수는 1,000개이며, n_quantiles 매개변수에서 변경할 수 있음
ptf1 = PowerTransformer(method='yeo-johnson')   # 'yeo-johnson', 양수 및 음수 값으로 작동
ptf2 = PowerTransformer(method='box-cox')       # 'box-cox', 양수 값에서만 작동

scalers = [sts, mms, mas, rbs, qtf, ptf1, ptf2]
for scaler in scalers:
    x_train = scaler.fit_transform(x_train)
    x_test = scaler.transform(x_test)
    model = CatBoostClassifier( iterations= 1759, depth = 7, l2_leaf_reg = 7,random_strength =  9.992204520122275e-05,
                            learning_rate =  0.8948182120285428, random_state=72, 
                             verbose = 0) 
                          #  task_type='GPU')
    model.fit(x_train, y_train)
    y_predict = model.predict(x_test)
    result = accuracy_score(y_test, y_predict)
    scale_name = scaler.__class__.__name__
    print('{0} 결과 : {1:.4f}'.format(scale_name, result), )

    
# StandardScaler 결과 : 0.9640
# MinMaxScaler 결과 : 0.9640
# MaxAbsScaler 결과 : 0.9640
# RobustScaler 결과 : 0.9640
# QuantileTransformer 결과 : 0.9637
# PowerTransformer 결과 : 0.9637