### 모델 라이브러리

In [57]:
# 모델 라이브러리
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier

# 보팅
from sklearn.ensemble import VotingClassifier

# 스태킹
from sklearn.ensemble import StackingClassifier

### 모델 성능확인

In [58]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and summary metrics for a binary classifier.

    Computes accuracy, precision, recall, F1 and weighted F1 from the given
    labels, then shows the confusion matrix and a one-row metrics table via
    ``display``. Nothing is returned.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same samples. The ``None`` default is
            kept only for signature compatibility; a value is required.

    Raises:
        ValueError: If ``y_pred`` is not supplied.

    Note:
        ``pd`` and ``display`` are not imported in the visible cells; they are
        presumably provided earlier in the notebook session.
    """
    if y_pred is None:
        # Fail early with a clear message instead of an error raised from
        # inside scikit-learn's input validation.
        raise ValueError("y_pred is required: pass the predicted labels for y_test.")

    # labels=[True, False] fixes the row/column order to positive class first.
    # Rows are the actual classes, columns are the predicted classes.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)

    # Precision, recall and F1 all use scikit-learn's default binary averaging
    # (positive class = 1, which True compares equal to). The `labels`
    # argument is not consulted in that mode, so it is left out of all three
    # calls to keep them consistent with one another.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    weighted_F1 = f1_score(y_test, y_pred, average='weighted')

    metrics = pd.DataFrame({
        '정확도': [accuracy],
        '정밀도': [precision],
        '재현율': [recall],
        'F1 Score': [F1],
        'Weighted F1': [weighted_F1]
    })

    confusion_df = pd.DataFrame(confusion, index=['True', 'False'], columns=['True', 'False'])

    print("\n오차행렬:")
    display(confusion_df)
    print("평가 지표:")
    display(metrics)

### 모델 정의

아래 모델들의 하이퍼파라미터는 Optuna 탐색으로 구한 값을 사용함

In [59]:
## Single (base) models
# Every estimator below is built with hard-coded hyperparameters and
# random_state=0.

# LightGBM
lgbm_model = LGBMClassifier(
    n_estimators=722,
    num_leaves=361,
    max_depth=7,
    learning_rate=0.06729510233730235,
    min_child_samples=32,
    verbose=-1,
    random_state=0,
)

# LightGBM with the dart boosting mode
lgbm_dart_model = LGBMClassifier(
    n_estimators=301,
    num_leaves=383,
    max_depth=15,
    learning_rate=0.050832163879494394,
    min_child_samples=4,
    boosting='dart',  # use dart
    random_state=0,
    verbose=-1,
)

# CatBoost
cat_model = CatBoostClassifier(
    iterations=800,
    depth=13,
    learning_rate=0.038658456604712066,
    l2_leaf_reg=1.5880876184963115,
    border_count=32,
    verbose=False,
    random_state=0,
)

# XGBoost
# NOTE(review): both `alpha` and `reg_alpha` are passed; they appear to name
# the same L1 regularisation term in XGBoost -- confirm which value is applied.
xgb_model = XGBClassifier(
    n_estimators=414,
    learning_rate=0.20046808426888615,
    max_depth=11,
    alpha=0.004365542651458743,
    gamma=0.00025712949731685885,
    reg_alpha=0.17168922089033928,
    reg_lambda=0.03881395024846057,
    colsample_bytree=0.32031741412326675,
    subsample=0.6269215430592496,
    objective='binary:logistic',  # binary classification
    tree_method="exact",
    random_state=0,
)

# Decision tree
dt_model = DecisionTreeClassifier(
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    splitter='best',
    criterion='gini',
    random_state=0,
)

# Extra-trees
et_model = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=48,
    min_samples_split=3,
    min_samples_leaf=1,
    criterion='gini',
    random_state=0,
)

In [60]:
from sklearn.ensemble import StackingClassifier

# Stacking ensemble over LightGBM, CatBoost and XGBoost.
# The dart LightGBM, decision-tree and extra-trees models are not used here.
stacking_model = StackingClassifier(
    estimators=[
        ("lgbm", lgbm_model),
        ("cat", cat_model),
        ("xgb", xgb_model),
    ],
    # lgbm_model doubles as the final (meta) estimator.
    final_estimator=lgbm_model,
)

In [61]:
# Fit the ensemble on x_train / y_train (both defined in an earlier cell).
stacking_model.fit(X=x_train, y=y_train)

In [62]:
# Predict on x_val and report the metrics against y_val.
pred = stacking_model.predict(X=x_val)
get_clf_eval(y_test=y_val, y_pred=pred)


오차행렬:


Unnamed: 0,True,False
True,791,203
False,138,10728


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.971248,0.851453,0.795775,0.822673,0.970805


In [63]:
# Build the test feature matrix by dropping the target and id columns.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = stacking_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True

676.0

In [64]:
from sklearn.ensemble import StackingClassifier

# Stacking ensemble over dart LightGBM, CatBoost and XGBoost.
# The plain LightGBM, decision-tree and extra-trees models are not used as
# base learners here.
stacking_model = StackingClassifier(
    estimators=[
        ("lgbm_dart", lgbm_dart_model),
        ("cat", cat_model),
        ("xgb", xgb_model),
    ],
    # The plain LightGBM model serves as the final (meta) estimator.
    final_estimator=lgbm_model,
)

In [65]:
# Fit the ensemble on x_train / y_train (both defined in an earlier cell).
stacking_model.fit(X=x_train, y=y_train)

In [66]:
# Predict on x_val and report the metrics against y_val.
pred = stacking_model.predict(X=x_val)
get_clf_eval(y_test=y_val, y_pred=pred)


오차행렬:


Unnamed: 0,True,False
True,795,199
False,127,10739


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.972513,0.862256,0.799799,0.829854,0.972042


In [67]:
# Build the test feature matrix by dropping the target and id columns.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = stacking_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True

674.0

In [68]:
from sklearn.ensemble import StackingClassifier

# Stacking ensemble over LightGBM, CatBoost, XGBoost and extra-trees.
# The dart LightGBM and decision-tree models are not used here.
stacking_model = StackingClassifier(
    estimators=[
        ("lgbm", lgbm_model),
        ("cat", cat_model),
        ("xgb", xgb_model),
        ("et", et_model),
    ],
    # lgbm_model doubles as the final (meta) estimator.
    final_estimator=lgbm_model,
)

In [69]:
# Fit the ensemble on x_train / y_train (both defined in an earlier cell).
stacking_model.fit(X=x_train, y=y_train)

In [70]:
# Predict on x_val and report the metrics against y_val.
pred = stacking_model.predict(X=x_val)
get_clf_eval(y_test=y_val, y_pred=pred)


오차행렬:


Unnamed: 0,True,False
True,775,219
False,104,10762


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.972766,0.881684,0.779678,0.827549,0.972001


In [71]:
# Build the test feature matrix by dropping the target and id columns.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = stacking_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True

549.0

In [72]:
from sklearn.ensemble import StackingClassifier

# Stacking ensemble over dart LightGBM, CatBoost, XGBoost and extra-trees.
# The plain LightGBM and decision-tree models are not used as base learners.
stacking_model = StackingClassifier(
    estimators=[
        ("lgbm_dart", lgbm_dart_model),
        ("cat", cat_model),
        ("xgb", xgb_model),
        ("et", et_model),
    ],
    # The plain LightGBM model serves as the final (meta) estimator.
    final_estimator=lgbm_model,
)

In [73]:
# Fit the ensemble on x_train / y_train (both defined in an earlier cell).
stacking_model.fit(X=x_train, y=y_train)

In [74]:
# Predict on x_val and report the metrics against y_val.
pred = stacking_model.predict(X=x_val)
get_clf_eval(y_test=y_val, y_pred=pred)


오차행렬:


Unnamed: 0,True,False
True,782,212
False,87,10779


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.974789,0.899885,0.78672,0.839506,0.974016


In [75]:
# Build the test feature matrix by dropping the target and id columns.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = stacking_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True

539.0

In [76]:
from sklearn.ensemble import StackingClassifier

# Stacking ensemble over dart LightGBM, CatBoost, XGBoost, decision tree and
# extra-trees. The plain LightGBM model is not used as a base learner.
stacking_model = StackingClassifier(
    estimators=[
        ("lgbm_dart", lgbm_dart_model),
        ("cat", cat_model),
        ("xgb", xgb_model),
        ("dt", dt_model),
        ("et", et_model),
    ],
    # The plain LightGBM model serves as the final (meta) estimator.
    final_estimator=lgbm_model,
)

In [77]:
# Fit the ensemble on x_train / y_train (both defined in an earlier cell).
stacking_model.fit(X=x_train, y=y_train)

In [78]:
# Predict on x_val and report the metrics against y_val.
pred = stacking_model.predict(X=x_val)
get_clf_eval(y_test=y_val, y_pred=pred)


오차행렬:


Unnamed: 0,True,False
True,768,226
False,77,10789


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.974452,0.908876,0.772636,0.835237,0.973504


In [79]:
# Build the test feature matrix by dropping the target and id columns.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = stacking_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True

512.0

In [80]:
from sklearn.ensemble import StackingClassifier

# Stacking ensemble over LightGBM, CatBoost, XGBoost, decision tree and
# extra-trees. The dart LightGBM model is not used here.
stacking_model = StackingClassifier(
    estimators=[
        ("lgbm", lgbm_model),
        ("cat", cat_model),
        ("xgb", xgb_model),
        ("dt", dt_model),
        ("et", et_model),
    ],
    # lgbm_model doubles as the final (meta) estimator.
    final_estimator=lgbm_model,
)

In [81]:
# Fit the ensemble on x_train / y_train (both defined in an earlier cell).
stacking_model.fit(X=x_train, y=y_train)

In [82]:
# Predict on x_val and report the metrics against y_val.
pred = stacking_model.predict(X=x_val)
get_clf_eval(y_test=y_val, y_pred=pred)


오차행렬:


Unnamed: 0,True,False
True,766,228
False,90,10776


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.973187,0.89486,0.770624,0.828108,0.972272


In [83]:
# Build the test feature matrix by dropping the target and id columns.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = stacking_model.predict(x_test)
sum(test_pred)  # number of rows predicted as True

496.0

### 제출파일 생성

In [84]:
# Load the submission template and fill in the predictions.
# (Original note: df_test holds the preprocessed data.)
# test_pred is whatever the most recently executed prediction cell produced.
df_sub = pd.read_csv("./data/submission.csv").assign(is_converted=test_pred)

# Write the submission file without the index column.
df_sub.to_csv("submission_stacking_lgbm.csv", index=False)

.