### 모델 라이브러리

In [57]:
# 모델 라이브러리
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier

# 보팅
from sklearn.ensemble import VotingClassifier

# 스태킹
from sklearn.ensemble import StackingClassifier

### 모델 성능확인

In [58]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred=None):
    """Show a confusion matrix and summary classification metrics.

    Computes accuracy, precision, recall, F1 and weighted F1 for the given
    predictions, then prints a heading and displays each result table.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same samples. The default is
            ``None``, but the value is forwarded unchanged to the
            scikit-learn metric functions, so callers are expected to pass
            real predictions.

    Returns:
        None. Output is produced through ``print`` and ``display``.
    """
    class_order = [True, False]
    axis_names = ['True', 'False']

    # The True class is listed first. Per scikit-learn's confusion_matrix
    # convention, rows are actual classes and columns are predicted classes.
    confusion_df = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=class_order),
        index=axis_names,
        columns=axis_names,
    )

    # Column order of the metrics table follows the insertion order here.
    scores = {
        '정확도': accuracy_score(y_test, y_pred),
        '정밀도': precision_score(y_test, y_pred, labels=class_order),
        '재현율': recall_score(y_test, y_pred),
        'F1 Score': f1_score(y_test, y_pred, labels=class_order),
        'Weighted F1': f1_score(y_test, y_pred, average='weighted'),
    }
    # One-row frame: each score becomes a single-element column.
    metrics = pd.DataFrame({name: [value] for name, value in scores.items()})

    print("\n오차행렬:")
    display(confusion_df)
    print("평가 지표:")
    display(metrics)

### 모델 정의

Optuna를 이용하여 ExtraTrees 모델의 하이퍼파라미터를 탐색함 (검증 데이터의 F1 점수를 최대화)

In [59]:
import optuna
from sklearn.metrics import f1_score

def objectiveExtraTrees(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective for an ExtraTreesClassifier.

    Samples one hyperparameter configuration from ``trial``, fits an
    ``ExtraTreesClassifier`` on the training split and scores it on the
    validation split.

    Args:
        trial: Trial object used to sample hyperparameters via
            ``suggest_int`` / ``suggest_categorical``.
        x_tr: Training features.
        y_tr: Training labels.
        x_val: Validation features.
        y_val: Validation labels.

    Returns:
        The binary-average F1 score of the validation predictions.
    """
    # Keep the suggest_* calls in this order: a seeded sampler draws values
    # sequentially, so reordering could change which values are proposed.
    hyperparams = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 20, 60),
        'min_samples_split': trial.suggest_int('min_samples_split', 2, 5),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 5),
        'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
        # Fixed seed so every trial's forest is built reproducibly.
        'random_state': 0,
    }

    clf = ExtraTreesClassifier(**hyperparams)
    clf.fit(x_tr, y_tr)
    val_pred = clf.predict(x_val)

    return f1_score(y_val, val_pred, average="binary")

# Hyperparameter tuning: maximise the objective over 500 trials using a
# TPE sampler with a fixed seed.
study = optuna.create_study(
    sampler=optuna.samplers.TPESampler(seed=0),
    direction='maximize',
)
study.optimize(
    lambda trial: objectiveExtraTrees(trial, x_train, y_train, x_val, y_val),
    n_trials=500,
)

# Report the best score found and the parameters that produced it.
print(f'Best trial: score {study.best_trial.value}, \nparams {study.best_trial.params}')

[I 2024-02-24 12:38:59,464] A new study created in memory with name: no-name-7eb99c3a-7549-4ca2-a15e-282b7a1c52bf


[I 2024-02-24 12:39:33,169] Trial 0 finished with value: 0.7916666666666666 and parameters: {'n_estimators': 594, 'max_depth': 49, 'min_samples_split': 4, 'min_samples_leaf': 3, 'criterion': 'entropy', 'bootstrap': False}. Best is trial 0 with value: 0.7916666666666666.
[I 2024-02-24 12:42:38,608] Trial 1 finished with value: 0.7907817442385903 and parameters: {'n_estimators': 968, 'max_depth': 35, 'min_samples_split': 5, 'min_samples_leaf': 3, 'criterion': 'entropy', 'bootstrap': False}. Best is trial 0 with value: 0.7916666666666666.
[I 2024-02-24 12:43:00,750] Trial 2 finished with value: 0.7727272727272727 and parameters: {'n_estimators': 118, 'max_depth': 54, 'min_samples_split': 5, 'min_samples_leaf': 5, 'criterion': 'gini', 'bootstrap': False}. Best is trial 0 with value: 0.7916666666666666.
[I 2024-02-24 12:43:45,121] Trial 3 finished with value: 0.77191452245966 and parameters: {'n_estimators': 206, 'max_depth': 46, 'min_samples_split': 2, 'min_samples_leaf': 5, 'criterion': '

: 

In [59]:
# ExtraTrees with a fixed hyperparameter set
et_model = ExtraTreesClassifier(
    n_estimators=747,
    max_depth=50,
    min_samples_split=3,
    min_samples_leaf=1,
    criterion='entropy',
    random_state=0,
)
et_model.fit(x_train, y_train)

# Report metrics on the validation split
pred = et_model.predict(x_val)
get_clf_eval(y_val, pred)

# Build the features used for prediction by dropping the
# "is_converted" and "id" columns from the encoded test frame
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = et_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,831,163
False,171,10695


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.971838,0.829341,0.836016,0.832665,0.971889


813.0

In [60]:
# ExtraTrees with a fixed hyperparameter set
et_model = ExtraTreesClassifier(
    n_estimators=947,
    max_depth=54,
    min_samples_split=3,
    min_samples_leaf=1,
    criterion='entropy',
    random_state=0,
)
et_model.fit(x_train, y_train)

# Report metrics on the validation split
pred = et_model.predict(x_val)
get_clf_eval(y_val, pred)

# Build the features used for prediction by dropping the
# "is_converted" and "id" columns from the encoded test frame
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = et_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,829,165
False,166,10700


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.972091,0.833166,0.834004,0.833585,0.972097


814.0

In [67]:
# ExtraTrees with a fixed hyperparameter set
et_model = ExtraTreesClassifier(
    n_estimators=729,
    max_depth=53,
    min_samples_split=3,
    min_samples_leaf=1,
    criterion='entropy',
    random_state=0,
)
et_model.fit(x_train, y_train)

# Report metrics on the validation split
pred = et_model.predict(x_val)
get_clf_eval(y_val, pred)

# Build the features used for prediction by dropping the
# "is_converted" and "id" columns from the encoded test frame
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = et_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,832,162
False,163,10703


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.972597,0.836181,0.837022,0.836601,0.972603


811.0

아래는 이전에 쓰던 하이퍼 파라미터 수치

In [69]:
# ExtraTrees with a fixed hyperparameter set
et_model = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=48,
    min_samples_split=3,
    min_samples_leaf=1,
    criterion='gini',
    random_state=0,
)
et_model.fit(x_train, y_train)

# Report metrics on the validation split
pred = et_model.predict(x_val)
get_clf_eval(y_val, pred)

# Build the features used for prediction by dropping the
# "is_converted" and "id" columns from the encoded test frame
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = et_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,827,167
False,165,10701


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.972007,0.833669,0.831992,0.83283,0.971994


791.0

.