### 모델 라이브러리

In [57]:
# 모델 라이브러리
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier

# 보팅
from sklearn.ensemble import VotingClassifier

# 스태킹
from sklearn.ensemble import StackingClassifier

### 모델 성능확인

In [58]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and summary metrics for binary predictions.

    Two tables are rendered with the notebook's ``display``: the confusion
    matrix (ordered True, then False) and a one-row table holding accuracy,
    precision, recall, binary F1 and weighted F1.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same rows. The ``None`` default is
            kept only for signature compatibility; a value is required.

    Returns:
        None. Results are printed/displayed only.

    Raises:
        ValueError: If ``y_pred`` is not supplied.
    """
    if y_pred is None:
        # Fail early with a clear message instead of an opaque error raised
        # from inside scikit-learn's input validation.
        raise ValueError("y_pred is required: pass the predicted labels for y_test")

    # labels=[True, False] puts the positive class in the first row/column.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # Precision, recall and F1 all use the default average='binary', where the
    # `labels` argument has no effect, so it is omitted on all three for
    # consistency.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    weighted_F1 = f1_score(y_test, y_pred, average='weighted')

    metrics = pd.DataFrame({
        '정확도': [accuracy],
        '정밀도': [precision],
        '재현율': [recall],
        'F1 Score': [F1],
        'Weighted F1': [weighted_F1]
    })

    confusion_df = pd.DataFrame(confusion, index=['True', 'False'], columns=['True', 'False'])

    print("\n오차행렬:")
    display(confusion_df)
    print("평가 지표:")
    display(metrics)

### 모델 정의

Optuna를 이용하여 파라미터를 구함

In [59]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def objectiveLGBM_dart(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective: fit a DART LightGBM classifier and score it.

    Args:
        trial: Optuna trial used to draw the hyperparameter suggestions.
        x_tr: Training features.
        y_tr: Training labels.
        x_val: Validation features.
        y_val: Validation labels.

    Returns:
        Binary F1 score of the fitted model's predictions on the validation
        set.
    """
    # Tuned hyperparameters. The suggestion order is kept as-is because the
    # sampler draws values sequentially.
    searched = {
        'n_estimators': trial.suggest_int('n_estimators', 200, 800),
        'num_leaves': trial.suggest_int('num_leaves', 200, 800),
        'max_depth': trial.suggest_int('max_depth', 3, 30),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'min_child_samples': trial.suggest_int('min_child_samples', 2, 20),
    }
    # Settings held constant across trials.
    fixed = {
        'boosting': 'dart',  # use DART boosting
        'random_state': 0,
        'verbose': -1,
    }

    classifier = LGBMClassifier(**searched, **fixed)
    classifier.fit(x_tr, y_tr)
    val_pred = classifier.predict(x_val)

    return f1_score(y_val, val_pred, average="binary")

# Hyperparameter tuning: maximise the objective's score with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)


def _lgbm_dart_objective(trial):
    """Bind the train/validation splits so Optuna only has to pass the trial."""
    return objectiveLGBM_dart(trial, x_train, y_train, x_val, y_val)


study.optimize(_lgbm_dart_objective, n_trials=500)

best_trial = study.best_trial
print(f'Best trial: score {best_trial.value}, \nparams {best_trial.params}')

[I 2024-02-24 12:41:19,563] A new study created in memory with name: no-name-dfe7fe09-b50b-4199-8547-2ec6974f12f7


[I 2024-02-24 12:42:31,449] Trial 0 finished with value: 0.8297338021094927 and parameters: {'n_estimators': 529, 'num_leaves': 629, 'max_depth': 19, 'learning_rate': 0.05494343511669279, 'min_child_samples': 10}. Best is trial 0 with value: 0.8297338021094927.
[I 2024-02-24 12:44:02,742] Trial 1 finished with value: 0.8409785932721713 and parameters: {'n_estimators': 588, 'num_leaves': 462, 'max_depth': 27, 'learning_rate': 0.09640261328960191, 'min_child_samples': 9}. Best is trial 1 with value: 0.8409785932721713.
[I 2024-02-24 12:45:42,347] Trial 2 finished with value: 0.8336713995943205 and parameters: {'n_estimators': 675, 'num_leaves': 517, 'max_depth': 18, 'learning_rate': 0.09263406719097345, 'min_child_samples': 3}. Best is trial 1 with value: 0.8409785932721713.
[I 2024-02-24 12:45:53,478] Trial 3 finished with value: 0.8326693227091633 and parameters: {'n_estimators': 252, 'num_leaves': 212, 'max_depth': 26, 'learning_rate': 0.0780375183440352, 'min_child_samples': 18}. Bes

: 

In [59]:
# LightGBM (DART) with fixed hyperparameters
# NOTE(review): presumably the best values found by the Optuna study - confirm.
lgbm_dart_params = {
    'n_estimators': 716,
    'num_leaves': 677,
    'max_depth': 24,
    'learning_rate': 0.09716914697888186,
    'min_child_samples': 12,
    'verbose': -1,
    'boosting': 'dart',  # use DART boosting
    'random_state': 0,
}
lgbm_dart_model = LGBMClassifier(**lgbm_dart_params)
lgbm_dart_model.fit(x_train, y_train)

# Evaluate on the validation split.
pred = lgbm_dart_model.predict(x_val)
get_clf_eval(y_val, pred)

# Features for the test-set prediction: drop the target and id columns.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = lgbm_dart_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,834,160
False,140,10726


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.974705,0.856263,0.839034,0.847561,0.974588


776.0

이전의 하이퍼 파라미터

In [60]:
# LightGBM (DART) with the previous hyperparameter set, kept for comparison
lgbm_dart_params = {
    'n_estimators': 1029,
    'num_leaves': 167,
    'max_depth': 30,
    'learning_rate': 0.05767571715999541,
    'min_child_samples': 25,
    'verbose': -1,
    'boosting': 'dart',  # use DART boosting
    'random_state': 0,
}
lgbm_dart_model = LGBMClassifier(**lgbm_dart_params)
lgbm_dart_model.fit(x_train, y_train)

# Evaluate on the validation split.
pred = lgbm_dart_model.predict(x_val)
get_clf_eval(y_val, pred)

# Features for the test-set prediction: drop the target and id columns.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = lgbm_dart_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,830,164
False,154,10712


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.973187,0.843496,0.83501,0.839232,0.973126


799.0

.