### 모델 라이브러리

In [57]:
# 모델 라이브러리
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier

# 보팅
from sklearn.ensemble import VotingClassifier

# 스태킹
from sklearn.ensemble import StackingClassifier

### 모델 성능확인

In [58]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred=None):
    """Show the confusion matrix and headline classification metrics.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels. The default is ``None``, but the value is
            forwarded unchanged to the scikit-learn metric functions, so
            callers are expected to pass real predictions.

    Returns:
        None. The confusion matrix and the metric table are rendered with
        ``print`` / ``display``.
    """
    # Row/column order of the confusion matrix: the True class first.
    label_order = [True, False]
    confusion_df = pd.DataFrame(
        confusion_matrix(y_test, y_pred, labels=label_order),
        index=['True', 'False'],
        columns=['True', 'False'],
    )

    # NOTE(review): precision/F1 receive ``labels`` while recall does not;
    # with the default binary averaging this argument presumably has no
    # effect on the score -- confirm against the scikit-learn docs.
    scores = {
        '정확도': [accuracy_score(y_test, y_pred)],
        '정밀도': [precision_score(y_test, y_pred, labels=label_order)],
        '재현율': [recall_score(y_test, y_pred)],
        'F1 Score': [f1_score(y_test, y_pred, labels=label_order)],
        'Weighted F1': [f1_score(y_test, y_pred, average='weighted')],
    }
    metrics = pd.DataFrame(scores)

    # Render the confusion matrix first, then the one-row metric table.
    for title, table in (("\n오차행렬:", confusion_df), ("평가 지표:", metrics)):
        print(title)
        display(table)

### 모델 정의

Optuna를 이용하여 파라미터를 구함

In [59]:
# LightGBM classifier with DART boosting; hyperparameters are fixed values
# (the notebook text states they were obtained with Optuna).
lgbm_dart_model = LGBMClassifier(
    n_estimators=1029,
    num_leaves=167,
    max_depth=30,
    learning_rate=0.05767571715999541,
    min_child_samples=25,
    verbose=-1,
    boosting='dart',  # use DART boosting
    random_state=0,
)
lgbm_dart_model.fit(x_train, y_train)

# Evaluate on the validation split.
pred = lgbm_dart_model.predict(x_val)
get_clf_eval(y_val, pred)

# Test features: everything except the target and the id column.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = lgbm_dart_model.predict(x_test)
sum(test_pred)  # number of test rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,830,164
False,154,10712


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.973187,0.843496,0.83501,0.839232,0.973126


799.0

Public : 0.54688731

------

In [60]:
# XGBoost classifier with fixed hyperparameters.
# NOTE(review): XGBoost documents ``alpha`` as an alias of ``reg_alpha``;
# both are passed here with different values, so which one takes effect is
# unclear -- confirm the intended L1 strength and drop the other.
xgb_model = XGBClassifier(
    n_estimators=414,
    learning_rate=0.20046808426888615,
    max_depth=11,
    alpha=0.004365542651458743,
    gamma=0.00025712949731685885,
    reg_alpha=0.17168922089033928,
    reg_lambda=0.03881395024846057,
    colsample_bytree=0.32031741412326675,
    subsample=0.6269215430592496,
    objective='binary:logistic',  # binary classification
    tree_method="exact",
    random_state=0,
)
xgb_model.fit(x_train, y_train)

# Evaluate on the validation split.
pred = xgb_model.predict(x_val)
get_clf_eval(y_val, pred)

# Test features: everything except the target and the id column.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = xgb_model.predict(x_test)
sum(test_pred)  # number of test rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,817,177
False,126,10740


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.974452,0.866384,0.821932,0.843573,0.974146


872

Public : 0.58030303

------

In [61]:
# CatBoost classifier with fixed hyperparameters.
cat_model = CatBoostClassifier(
    iterations=308,
    depth=12,
    learning_rate=0.14214840217472086,
    l2_leaf_reg=3.4914313211595593,
    border_count=95,
    verbose=False,
    random_state=0,
)
cat_model.fit(x_train, y_train)

# Evaluate on the validation split.
pred = cat_model.predict(x_val)
get_clf_eval(y_val, pred)

# Test features: everything except the target and the id column.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = cat_model.predict(x_test)
sum(test_pred)  # number of test rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,816,178
False,168,10698


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.970826,0.829268,0.820926,0.825076,0.970759


767.0

---

In [62]:
# Decision tree with entropy splitting and balanced class weights.
dt_model = DecisionTreeClassifier(
    max_depth=26,
    min_samples_split=10,
    min_samples_leaf=4,
    criterion='entropy',
    class_weight='balanced',
    random_state=0,
)
dt_model.fit(x_train, y_train)

# Evaluate on the validation split.
pred = dt_model.predict(x_val)
get_clf_eval(y_val, pred)

# Test features: everything except the target and the id column.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = dt_model.predict(x_test)
sum(test_pred)  # number of test rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,823,171
False,250,10616


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.964503,0.767008,0.827968,0.796323,0.965116


1022.0

-----

In [63]:
# Extra-trees ensemble with fixed hyperparameters.
et_model = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=48,
    min_samples_split=3,
    min_samples_leaf=1,
    criterion='gini',
    random_state=0,
)
et_model.fit(x_train, y_train)

# Evaluate on the validation split.
pred = et_model.predict(x_val)
get_clf_eval(y_val, pred)

# Test features: everything except the target and the id column.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

test_pred = et_model.predict(x_test)
sum(test_pred)  # number of test rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,827,167
False,165,10701


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.972007,0.833669,0.831992,0.83283,0.971994


791.0

----

In [64]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the LightGBM (DART), XGBoost and ExtraTrees
# estimators defined in the earlier cells.
voting_clf = VotingClassifier(
    voting='soft',
    estimators=[
        ('lgbm_dart', lgbm_dart_model),
        ('xgb', xgb_model),
        ('et', et_model),
    ],
)

# Fit the ensemble on the training split.
voting_clf.fit(x_train, y_train)

In [65]:
# Column 1 of predict_proba on the validation split (second entry of the
# ensemble's classes_; presumably the True / converted class).
soft_voting_probs = voting_clf.predict_proba(x_val)[:, 1]

# Label a row 1 only when its probability is strictly above the 0.4 cutoff.
soft_voting_preds = [int(prob > 0.4) for prob in soft_voting_probs]

# Evaluate the thresholded predictions.
get_clf_eval(y_val, soft_voting_preds)


오차행렬:


Unnamed: 0,True,False
True,843,151
False,170,10696


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.972934,0.832182,0.848089,0.84006,0.973051


In [66]:
# Test features: everything except the target and the id column.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

# Column 1 of predict_proba on the test set.
test_pred_probs = voting_clf.predict_proba(x_test)[:, 1]

# Label a row 1 only when its probability is strictly above the 0.4 cutoff.
test_pred_cutoff = [int(prob > 0.4) for prob in test_pred_probs]

sum(test_pred_cutoff)  # number of test rows predicted as True


967

---

In [67]:
# Column 1 of predict_proba on the validation split (second entry of the
# ensemble's classes_; presumably the True / converted class).
soft_voting_probs = voting_clf.predict_proba(x_val)[:, 1]

# Label a row 1 only when its probability is strictly above the 0.2 cutoff.
soft_voting_preds = [int(prob > 0.2) for prob in soft_voting_probs]

# Evaluate the thresholded predictions.
get_clf_eval(y_val, soft_voting_preds)


오차행렬:


Unnamed: 0,True,False
True,900,94
False,301,10565


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.966695,0.749376,0.905433,0.820046,0.968105


In [68]:
# Test features: everything except the target and the id column.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

# Column 1 of predict_proba on the test set.
test_pred_probs = voting_clf.predict_proba(x_test)[:, 1]

# Label a row 1 only when its probability is strictly above the 0.2 cutoff.
test_pred_cutoff = [int(prob > 0.2) for prob in test_pred_probs]

sum(test_pred_cutoff)  # number of test rows predicted as True


1408

----

### 모델 학습

In [69]:
from sklearn.ensemble import VotingClassifier

# LightGBM classifier with DART boosting.
lgbm_dart_model = LGBMClassifier(
    n_estimators=1029,
    num_leaves=167,
    max_depth=30,
    learning_rate=0.05767571715999541,
    min_child_samples=25,
    verbose=-1,
    boosting='dart',  # use DART boosting
    random_state=0,
)

# XGBoost classifier.
# NOTE(review): XGBoost documents ``alpha`` as an alias of ``reg_alpha``;
# both are passed here with different values -- confirm which is intended.
xgb_model = XGBClassifier(
    n_estimators=414,
    learning_rate=0.20046808426888615,
    max_depth=11,
    alpha=0.004365542651458743,
    gamma=0.00025712949731685885,
    reg_alpha=0.17168922089033928,
    reg_lambda=0.03881395024846057,
    colsample_bytree=0.32031741412326675,
    subsample=0.6269215430592496,
    objective='binary:logistic',  # binary classification
    tree_method="exact",
    random_state=0,
)

# Extra-trees ensemble.
et_model = ExtraTreesClassifier(
    n_estimators=100,
    max_depth=48,
    min_samples_split=3,
    min_samples_leaf=1,
    criterion='gini',
    random_state=0,
)

# Voting ensemble over the three estimators above.
voting_model = VotingClassifier(
    voting='soft',  # 'hard' = majority vote, 'soft' = average of probabilities
    estimators=[
        ('lgb_dart', lgbm_dart_model),
        ('xgb', xgb_model),
        ('et', et_model),
    ],
)

In [70]:
# Fit the soft-voting ensemble on the training split.
voting_model.fit(x_train, y_train)

In [71]:
# Predicted probabilities on the validation split; keeps column 1 of
# predict_proba (presumably the True / converted class -- confirm via classes_).
soft_voting_probs = voting_model.predict_proba(x_val)[:, 1]

### cutoff-value 조정

In [72]:
# Apply the 0.4 cutoff to the validation probabilities: a row is labelled 1
# only when its probability is strictly greater than 0.4.
soft_voting_preds_04 = [1 if prob > 0.4 else 0 for prob in soft_voting_probs]

# Evaluate the thresholded validation predictions.
get_clf_eval(y_val, soft_voting_preds_04)

# Test features: everything except the target and the id column.
x_test = df_test_encoded.drop(["is_converted", "id"], axis=1)

# Score the test set with ``voting_model`` -- the ensemble defined and fitted
# in this section and used for the validation probabilities. The cell
# previously called ``voting_clf``, a separate object from an earlier
# experiment cell, so the submission did not come from the model evaluated
# here and the cell failed if that earlier cell had not been run.
test_pred_probs = voting_model.predict_proba(x_test)[:, 1]

# Apply the same 0.4 cutoff to the test probabilities.
preds_04 = [1 if prob > 0.4 else 0 for prob in test_pred_probs]

sum(preds_04)  # number of test rows predicted as True


오차행렬:


Unnamed: 0,True,False
True,843,151
False,170,10696


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.972934,0.832182,0.848089,0.84006,0.973051


967

In [73]:
# Apply the 0.2 cutoff to the validation probabilities: a row is labelled 1
# only when its probability is strictly greater than 0.2.
soft_voting_preds_02 = [1 if prob > 0.2 else 0 for prob in soft_voting_probs]

# Evaluate the thresholded validation predictions.
get_clf_eval(y_val, soft_voting_preds_02)

# Test features: everything except the target and the id column.
x_test = df_test_encoded.drop(["is_converted", "id"], axis=1)

# Score the test set with ``voting_model`` -- the ensemble defined and fitted
# in this section and used for the validation probabilities. The cell
# previously called ``voting_clf``, a separate object from an earlier
# experiment cell, so the submission did not come from the model evaluated
# here and the cell failed if that earlier cell had not been run.
test_pred_probs = voting_model.predict_proba(x_test)[:, 1]

# Apply the same 0.2 cutoff to the test probabilities.
preds_02 = [1 if prob > 0.2 else 0 for prob in test_pred_probs]

sum(preds_02)  # number of test rows predicted as True



오차행렬:


Unnamed: 0,True,False
True,900,94
False,301,10565


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.966695,0.749376,0.905433,0.820046,0.968105


1408

### 제출파일 생성

In [84]:
# Load the submission template and set its ``is_converted`` column to the
# 0.2-cutoff predictions. (Original author's note: df_test holds the
# preprocessed data.)
df_sub = pd.read_csv("./data/submission.csv").assign(is_converted=preds_02)

# Write the submission file without the index column.
df_sub.to_csv("submission_cutoff_02.csv", index=False)

In [85]:
# Load the submission template and set its ``is_converted`` column to the
# 0.4-cutoff predictions. (Original author's note: df_test holds the
# preprocessed data.)
df_sub = pd.read_csv("./data/submission.csv").assign(is_converted=preds_04)

# Write the submission file without the index column.
df_sub.to_csv("submission_cutoff_04.csv", index=False)

.