### 모델 라이브러리

In [58]:
# 모델 라이브러리
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier

# 보팅
from sklearn.ensemble import VotingClassifier

# 스태킹
from sklearn.ensemble import StackingClassifier

In [60]:
import optuna
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def objectiveDecisionTree(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective: train one decision tree and score it on a held-out split.

    Samples a hyperparameter configuration from ``trial``, fits a
    ``DecisionTreeClassifier`` on ``(x_tr, y_tr)`` and returns the F1 score
    of its predictions on ``(x_val, y_val)``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        x_tr: Training features.
        y_tr: Training labels.
        x_val: Validation features.
        y_val: Validation labels.

    Returns:
        The F1 score (``average="binary"``) on the validation data.
    """
    # Search space: tree size limits, feature subsampling, split strategy
    # and impurity criterion.
    depth = trial.suggest_int('max_depth', 2, 50)
    split_min = trial.suggest_int('min_samples_split', 2, 30)
    leaf_min = trial.suggest_int('min_samples_leaf', 1, 10)
    feature_cap = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    split_strategy = trial.suggest_categorical('splitter', ['best', 'random'])
    impurity = trial.suggest_categorical('criterion', ['gini', 'entropy'])

    tree = DecisionTreeClassifier(
        max_depth=depth,
        min_samples_split=split_min,
        min_samples_leaf=leaf_min,
        max_features=feature_cap,
        splitter=split_strategy,
        criterion=impurity,
        random_state=0,  # same classifier seed for every trial
    )
    tree.fit(x_tr, y_tr)

    return f1_score(y_val, tree.predict(x_val), average="binary")

# Split the dataset: hold out 20% of the encoded training data for validation.
features = df_train_encoded.drop("is_converted", axis=1)
target = df_train_encoded["is_converted"].astype(int)
x_train, x_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=400,
)


def _tree_objective(trial):
    """Bind the train/validation split to the decision-tree objective."""
    return objectiveDecisionTree(trial, x_train, y_train, x_val, y_val)


# Hyperparameter tuning: maximize the objective's score over 3000 trials.
sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(_tree_objective, n_trials=3000)

best = study.best_trial
print('Best trial: score {}, \nparams {}'.format(best.value, best.params))


[I 2024-02-22 17:37:10,403] A new study created in memory with name: no-name-a1d86249-949b-4c48-82f7-b64855a056d1


[I 2024-02-22 17:37:10,804] Trial 0 finished with value: 0.7857553130384836 and parameters: {'max_depth': 28, 'min_samples_split': 22, 'min_samples_leaf': 7, 'max_features': None, 'splitter': 'random', 'criterion': 'gini'}. Best is trial 0 with value: 0.7857553130384836.
[I 2024-02-22 17:37:10,910] Trial 1 finished with value: 0.7593582887700535 and parameters: {'max_depth': 40, 'min_samples_split': 17, 'min_samples_leaf': 6, 'max_features': 'sqrt', 'splitter': 'random', 'criterion': 'entropy'}. Best is trial 0 with value: 0.7857553130384836.
[I 2024-02-22 17:37:11,014] Trial 2 finished with value: 0.7649122807017544 and parameters: {'max_depth': 49, 'min_samples_split': 25, 'min_samples_leaf': 5, 'max_features': 'sqrt', 'splitter': 'random', 'criterion': 'gini'}. Best is trial 0 with value: 0.7857553130384836.
[I 2024-02-22 17:37:11,290] Trial 3 finished with value: 0.7872463768115942 and parameters: {'max_depth': 14, 'min_samples_split': 24, 'min_samples_leaf': 5, 'max_features': Non

Best trial: score 0.8090539756239118, 
params {'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'splitter': 'best', 'criterion': 'gini'}


### 모델 성능확인

In [64]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred=None):
    """Display the confusion matrix and summary metrics for binary predictions.

    Computes accuracy, precision, recall, F1 and weighted F1 of ``y_pred``
    against ``y_test`` and renders them, together with the confusion matrix,
    through ``display``. Nothing is returned.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels. Required; the ``None`` default is kept only
            so the signature stays unchanged.

    Raises:
        TypeError: If ``y_pred`` is omitted.
    """
    if y_pred is None:
        # Fail early with a clear message instead of an opaque error raised
        # from inside the metric functions.
        raise TypeError("get_clf_eval() requires y_pred (predicted labels)")

    # labels=[True, False] orders the matrix positive class first, matching
    # the 'True'/'False' index and column names used below.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # Precision, recall and F1 all use the default binary averaging. The
    # `labels` argument is not used in that mode, so it is no longer passed,
    # which also makes the three calls consistent.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    weighted_F1 = f1_score(y_test, y_pred, average='weighted')

    metrics = pd.DataFrame({
        '정확도': [accuracy],
        '정밀도': [precision],
        '재현율': [recall],
        'F1 Score': [F1],
        'Weighted F1': [weighted_F1],
    })

    confusion_df = pd.DataFrame(confusion, index=['True', 'False'], columns=['True', 'False'])

    print("\n오차행렬:")
    display(confusion_df)
    print("평가 지표:")
    display(metrics)

### 모델 정의

Optuna를 이용하여 파라미터를 구함

In [75]:
# Decision tree built with the best hyperparameters reported by the Optuna study.
model = DecisionTreeClassifier(
    criterion='gini',
    splitter='best',
    max_depth=10,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features=None,
    random_state=0,
)

### 모델 학습

In [76]:
# Fit the tuned decision tree on the training split.
model.fit(x_train, y_train)

In [77]:
# Predict on the validation split, then display the confusion matrix and metrics.
pred = model.predict(x_val)
get_clf_eval(y_val, pred)


오차행렬:


Unnamed: 0,True,False
True,697,250
False,79,10834


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.97226,0.898196,0.736008,0.809054,0.970991


In [78]:
# Separate the data needed for prediction: drop the target and id columns.
x_test = df_test_encoded.drop(["is_converted", "id"], axis=1)

# Predict the test set with the fitted model.
test_pred = model.predict(x_test)
sum(test_pred) # number of rows predicted as True

658

### 제출파일 생성

In [79]:
# Load the submission data (df_test holds the preprocessed data)
df_sub = pd.read_csv("./data/submission.csv")
# NOTE(review): the training labels were cast to int before fitting, so
# test_pred is presumably 0/1 rather than True/False — confirm that this is
# the format the submission expects.
df_sub["is_converted"] = test_pred

# Save the submission file
df_sub.to_csv("submission_dt_optuna.csv", index=False)

.