## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore') # silence every warning message for the rest of the notebook

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
# Load the raw CSV files from the data/ directory.
df_train_origin = pd.read_csv("data/train.csv") # training data
df_test_origin = pd.read_csv("data/submission.csv") # test data (the rows of the submission file)

In [3]:
# Column names, non-null counts and dtypes of the raw training frame.
df_train_origin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  float64
 1   customer_country         58317 non-null  object 
 2   business_unit            59299 non-null  object 
 3   com_reg_ver_win_rate     14568 non-null  float64
 4   customer_idx             59299 non-null  int64  
 5   customer_type            15338 non-null  object 
 6   enterprise               59299 non-null  object 
 7   historical_existing_cnt  13756 non-null  float64
 8   id_strategic_ver         3444 non-null   float64
 9   it_strategic_ver         1121 non-null   float64
 10  idit_strategic_ver       4565 non-null   float64
 11  customer_job             40566 non-null  object 
 12  lead_desc_length         59299 non-null  int64  
 13  inquiry_type             58358 non-null  object 
 14  product_category      

## 2. 데이터 전처리

### 각 변수별 확인

In [4]:
# Load the train/test frames stored under data/Ch2.
# NOTE(review): presumably these are the saved results of the preprocessing step — confirm.
df_train_process = pd.read_csv('data/Ch2/df_train.csv')
df_test_process = pd.read_csv('data/Ch2/df_test.csv')

In [5]:
# Column names, non-null counts and dtypes of the data/Ch2 training frame.
df_train_process.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  float64
 1   customer_country         59299 non-null  object 
 2   business_unit            59299 non-null  object 
 3   com_reg_ver_win_rate     14568 non-null  float64
 4   customer_idx             59299 non-null  int64  
 5   customer_type            59299 non-null  object 
 6   enterprise               59299 non-null  object 
 7   historical_existing_cnt  59299 non-null  float64
 8   id_strategic_ver         59299 non-null  float64
 9   it_strategic_ver         59299 non-null  float64
 10  idit_strategic_ver       59299 non-null  float64
 11  customer_job             59299 non-null  object 
 12  lead_desc_length         59299 non-null  int64  
 13  inquiry_type             59299 non-null  object 
 14  product_category      

## 3. 피처엔지니어링

### 레이블 인코딩

In [6]:
# Load the train/test frames stored under data/Ch3; these are the frames used
# for model fitting and prediction further down.
# NOTE(review): presumably these are the saved results of the label-encoding step — confirm.
df_train_encoded = pd.read_csv('data/Ch3/df_train.csv')
df_test_encoded = pd.read_csv('data/Ch3/df_test.csv')

In [7]:
# Column names, non-null counts and dtypes of the data/Ch3 training frame.
df_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 24 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   bant_submit               59299 non-null  float64
 1   customer_country          59299 non-null  int64  
 2   business_unit             59299 non-null  int64  
 3   com_reg_ver_win_rate      59299 non-null  float64
 4   customer_idx              59299 non-null  int64  
 5   customer_type             59299 non-null  int64  
 6   enterprise                59299 non-null  int64  
 7   historical_existing_cnt   59299 non-null  float64
 8   customer_job              59299 non-null  int64  
 9   lead_desc_length          59299 non-null  int64  
 10  customer_country.1        59299 non-null  int64  
 11  customer_position         59299 non-null  int64  
 12  response_corporate        59299 non-null  int64  
 13  expected_timeline         59299 non-null  int64  
 14  lead_o

## 4. 모델 학습

### 데이터 분할

학습, 검증 데이터 분리

In [8]:
from sklearn.model_selection import train_test_split

# Separate the label column from the features, then hold out 20% of the rows
# as a validation set (shuffled, fixed seed so the split is repeatable).
target_column = "is_converted"
train_features = df_train_encoded.drop(columns=target_column)
train_target = df_train_encoded[target_column]

x_train, x_val, y_train, y_val = train_test_split(
    train_features,
    train_target,
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

### 모델 라이브러리

#### 단일 모델 기준으로 사용할 수 있는 모델들의 라이브러리

In [9]:
# from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

# 보팅
from sklearn.ensemble import VotingClassifier
# 스테킹
from sklearn.ensemble import StackingClassifier

In [None]:
# Intentional stop for "Run All": raising here halts execution at this cell.
# NOTE(review): presumably this keeps the slow Optuna searches below from
# running automatically — run the cells below by hand when needed.
# (This used to be a bare `break`, which only stopped execution by being a
# SyntaxError outside of a loop.)
raise RuntimeError("Stopped on purpose: run the cells below this one manually.")

### 최적 하이퍼 파라미터 찾기

#### optuna를 통한 최적의 파라미터 찾기

Random Forest

In [10]:
import optuna
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

def objectiveRF(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective for a RandomForestClassifier.

    Draws one hyper-parameter set from ``trial``, fits the forest on
    ``(x_tr, y_tr)`` and returns the weighted F1 score of its predictions
    for ``x_val`` measured against ``y_val``.
    """
    # Dict entries are evaluated top to bottom, so the suggest_* calls run in
    # the order they are written here.
    search_space = {
        'n_estimators': trial.suggest_int('n_estimators', 400, 1000),
        'max_depth': trial.suggest_int('max_depth', 10, 35),
        'min_samples_split': trial.suggest_int('min_samples_split', 10, 20),
        'min_samples_leaf': trial.suggest_int('min_samples_leaf', 8, 20),
        'criterion': trial.suggest_categorical("criterion", ["gini", "entropy"]),
    }

    forest = RandomForestClassifier(random_state=0, **search_space)
    forest.fit(x_tr, y_tr)

    val_pred = forest.predict(x_val)
    return f1_score(y_val, val_pred, average="weighted")


# Hyper-parameter search: maximise the objective with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)


def _run_rf_trial(trial):
    """Bind the train/validation split to objectiveRF for Optuna."""
    return objectiveRF(trial, x_train, y_train, x_val, y_val)


study.optimize(_run_rf_trial, n_trials=50)

best_trial = study.best_trial
print('Best trial: score {}, \nparams {}'.format(best_trial.value, best_trial.params))

[I 2024-02-17 20:55:05,044] A new study created in memory with name: no-name-ba012bf8-a0a6-4cb2-840d-7020c281ceb5
[W 2024-02-17 20:55:09,497] Trial 0 failed with parameters: {'n_estimators': 729, 'max_depth': 28, 'min_samples_split': 16, 'min_samples_leaf': 15, 'criterion': 'entropy'} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\HamIG\AppData\Local\Programs\Python\Python312\Lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\HamIG\AppData\Local\Temp\ipykernel_50360\4050503408.py", line 32, in <lambda>
    study.optimize(lambda trial: objectiveRF(trial, x_train, y_train, x_val, y_val), n_trials=50)
                                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\HamIG\AppData\Local\Temp\ipykernel_50360\4050503408.py", line 23, in objectiveRF
    model.fit(x_tr, y_tr)
  File "c:\Users\HamIG\A

KeyboardInterrupt: 

LGBM

In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def objectiveLGBM(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective for an LGBMClassifier.

    Draws one hyper-parameter set from ``trial``, fits the classifier on
    ``(x_tr, y_tr)`` and returns the weighted F1 score of its predictions
    for ``x_val`` measured against ``y_val``.
    """
    # Sampling order is kept fixed: leaves, depth, estimators, rate, child samples.
    num_leaves = trial.suggest_int('num_leaves', 2, 300)
    max_depth = trial.suggest_int('max_depth', 2, 25)
    n_estimators = trial.suggest_int('n_estimators', 100, 1000)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1)
    min_child_samples = trial.suggest_int('min_child_samples', 5, 50)

    booster = LGBMClassifier(
        num_leaves=num_leaves,
        max_depth=max_depth,
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        min_child_samples=min_child_samples,
        verbose=-1,
        random_state=0,
    )
    booster.fit(x_tr, y_tr)

    val_pred = booster.predict(x_val)
    return f1_score(y_val, val_pred, average="weighted")


# Hyper-parameter search: maximise the objective with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)


def _run_lgbm_trial(trial):
    """Bind the train/validation split to objectiveLGBM for Optuna."""
    return objectiveLGBM(trial, x_train, y_train, x_val, y_val)


study.optimize(_run_lgbm_trial, n_trials=1)

best_trial = study.best_trial
print('Best trial: score {}, \nparams {}'.format(best_trial.value, best_trial.params))

[I 2024-02-17 17:24:10,277] A new study created in memory with name: no-name-5e1786de-1c89-42be-88aa-05fa97c761ec
[I 2024-02-17 17:24:12,914] Trial 0 finished with value: 0.9753345480842605 and parameters: {'num_leaves': 166, 'max_depth': 19, 'n_estimators': 643, 'learning_rate': 0.05903948646972072, 'min_child_samples': 24}. Best is trial 0 with value: 0.9753345480842605.


Best trial: score 0.9753345480842605, 
params {'num_leaves': 166, 'max_depth': 19, 'n_estimators': 643, 'learning_rate': 0.05903948646972072, 'min_child_samples': 24}


LGBM_dart

In [None]:
import optuna
from lightgbm import LGBMClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def objectiveLGBM(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective for an LGBMClassifier using DART boosting.

    Draws one hyper-parameter set from ``trial``, fits the classifier on
    ``(x_tr, y_tr)`` and returns the weighted F1 score of its predictions
    for ``x_val`` measured against ``y_val``.

    NOTE: this definition reuses the name ``objectiveLGBM`` from the plain
    LGBM cell above and therefore replaces it once this cell has run; re-run
    the plain LGBM cell before tuning the non-DART model again.
    """
    param = {
        'num_leaves': trial.suggest_int('num_leaves', 2, 300),
        'max_depth': trial.suggest_int('max_depth', 2, 25),
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.1),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 50),
        'verbose': -1,
        # Use the estimator's own `boosting_type` argument instead of the
        # `boosting` alias, so the value does not compete with the
        # constructor's default and matches how the final DART model is built.
        'boosting_type': 'dart',
        'random_state': 0
    }

    model = LGBMClassifier(**param)
    model.fit(x_tr, y_tr)
    pred = model.predict(x_val)
    score = f1_score(y_val, pred, average="weighted")

    return score


# Hyper-parameter search: maximise the objective with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)


def _run_lgbm_dart_trial(trial):
    """Bind the train/validation split to objectiveLGBM for Optuna."""
    return objectiveLGBM(trial, x_train, y_train, x_val, y_val)


study.optimize(_run_lgbm_dart_trial, n_trials=100)

best_trial = study.best_trial
print('Best trial: score {}, \nparams {}'.format(best_trial.value, best_trial.params))

[I 2024-02-17 23:42:14,548] A new study created in memory with name: no-name-afd4f68e-35a9-470c-933b-2eafb64b810d
[I 2024-02-17 23:42:34,065] Trial 0 finished with value: 0.9757931658418674 and parameters: {'num_leaves': 166, 'max_depth': 19, 'n_estimators': 643, 'learning_rate': 0.05903948646972072, 'min_child_samples': 24}. Best is trial 0 with value: 0.9757931658418674.
[I 2024-02-17 23:43:07,845] Trial 1 finished with value: 0.9765732288484092 and parameters: {'num_leaves': 195, 'max_depth': 12, 'n_estimators': 903, 'learning_rate': 0.09672964844509264, 'min_child_samples': 22}. Best is trial 1 with value: 0.9765732288484092.
[I 2024-02-17 23:43:26,090] Trial 2 finished with value: 0.9768418373577431 and parameters: {'num_leaves': 238, 'max_depth': 14, 'n_estimators': 611, 'learning_rate': 0.0933036974463395, 'min_child_samples': 8}. Best is trial 2 with value: 0.9768418373577431.
[I 2024-02-17 23:43:31,667] Trial 3 finished with value: 0.9579651760987544 and parameters: {'num_leav

Best trial: score 0.977302984142506, 
params {'num_leaves': 271, 'max_depth': 25, 'n_estimators': 842, 'learning_rate': 0.049672512714030864, 'min_child_samples': 18}


XGBoost

In [None]:
import optuna
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def objectiveXGB(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective for an XGBClassifier (binary logistic objective).

    Draws one hyper-parameter set from ``trial``, fits the classifier on
    ``(x_tr, y_tr)`` and returns the weighted F1 score of its predictions
    for ``x_val`` measured against ``y_val``.
    """
    n_estimators = trial.suggest_int('n_estimators', 300, 1500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1)
    max_depth = trial.suggest_int('max_depth', 3, 25)
    # NOTE(review): fit() below receives no eval_set, so this choice may not
    # influence the fitted model — confirm against the XGBoost docs.
    eval_metric = trial.suggest_categorical("eval_metric", ["logloss", "auc", "error"])

    booster = XGBClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        objective='binary:logistic',  # binary classification
        eval_metric=eval_metric,
        random_state=0,
    )
    booster.fit(x_tr, y_tr)

    val_pred = booster.predict(x_val)
    return f1_score(y_val, val_pred, average="weighted")


# Hyper-parameter search: maximise the objective with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)


def _run_xgb_trial(trial):
    """Bind the train/validation split to objectiveXGB for Optuna."""
    return objectiveXGB(trial, x_train, y_train, x_val, y_val)


study.optimize(_run_xgb_trial, n_trials=1)

best_trial = study.best_trial
print('Best trial: score {}, \nparams {}'.format(best_trial.value, best_trial.params))

[I 2024-02-17 17:24:19,281] A new study created in memory with name: no-name-cf715e54-a711-4222-a128-5a64ca99952b
[I 2024-02-17 17:24:22,336] Trial 0 finished with value: 0.9742965682985676 and parameters: {'n_estimators': 959, 'learning_rate': 0.07436704297351776, 'max_depth': 16, 'eval_metric': 'error'}. Best is trial 0 with value: 0.9742965682985676.


Best trial: score 0.9742965682985676, 
params {'n_estimators': 959, 'learning_rate': 0.07436704297351776, 'max_depth': 16, 'eval_metric': 'error'}


Decision Tree

In [13]:
import optuna
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def objectiveDT(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective for a DecisionTreeClassifier.

    Draws one hyper-parameter set from ``trial``, fits the tree on
    ``(x_tr, y_tr)`` and returns the weighted F1 score of its predictions
    for ``x_val`` measured against ``y_val``.
    """
    max_depth = trial.suggest_int('max_depth', 20, 50)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 7)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 7)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])

    tree = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        criterion=criterion,
        random_state=0,
    )
    tree.fit(x_tr, y_tr)

    val_pred = tree.predict(x_val)
    return f1_score(y_val, val_pred, average="weighted")


# Hyper-parameter search: maximise the objective with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)


def _run_dt_trial(trial):
    """Bind the train/validation split to objectiveDT for Optuna."""
    return objectiveDT(trial, x_train, y_train, x_val, y_val)


study.optimize(_run_dt_trial, n_trials=500)

best_trial = study.best_trial
print('Best trial: score {}, \nparams {}'.format(best_trial.value, best_trial.params))

[I 2024-02-18 12:42:44,242] A new study created in memory with name: no-name-6dc74e67-c3f1-49e7-9adb-d702117e3ff7
[I 2024-02-18 12:42:44,559] Trial 0 finished with value: 0.964042669616188 and parameters: {'max_depth': 37, 'min_samples_split': 6, 'min_samples_leaf': 5, 'criterion': 'gini'}. Best is trial 0 with value: 0.964042669616188.
[I 2024-02-18 12:42:44,861] Trial 1 finished with value: 0.9623658815600732 and parameters: {'max_depth': 40, 'min_samples_split': 4, 'min_samples_leaf': 7, 'criterion': 'gini'}. Best is trial 0 with value: 0.964042669616188.
[I 2024-02-18 12:42:45,179] Trial 2 finished with value: 0.9645864092098764 and parameters: {'max_depth': 44, 'min_samples_split': 5, 'min_samples_leaf': 4, 'criterion': 'gini'}. Best is trial 2 with value: 0.9645864092098764.
[I 2024-02-18 12:42:45,438] Trial 3 finished with value: 0.963085494025695 and parameters: {'max_depth': 22, 'min_samples_split': 2, 'min_samples_leaf': 6, 'criterion': 'entropy'}. Best is trial 2 with value:

Best trial: score 0.9675279320930867, 
params {'max_depth': 40, 'min_samples_split': 3, 'min_samples_leaf': 1, 'criterion': 'entropy'}


ExtraTrees

In [17]:
import optuna
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def objectiveET(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective for an ExtraTreesClassifier.

    Draws one hyper-parameter set from ``trial``, fits the ensemble on
    ``(x_tr, y_tr)`` and returns the weighted F1 score of its predictions
    for ``x_val`` measured against ``y_val``.
    """
    n_estimators = trial.suggest_int('n_estimators', 200, 1000)
    max_depth = trial.suggest_int('max_depth', 20, 40)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 7)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 6)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])

    extra_trees = ExtraTreesClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        criterion=criterion,
        random_state=0,
    )
    extra_trees.fit(x_tr, y_tr)

    val_pred = extra_trees.predict(x_val)
    return f1_score(y_val, val_pred, average="weighted")


# Hyper-parameter search: maximise the objective with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)


def _run_et_trial(trial):
    """Bind the train/validation split to objectiveET for Optuna."""
    return objectiveET(trial, x_train, y_train, x_val, y_val)


study.optimize(_run_et_trial, n_trials=80)

best_trial = study.best_trial
print('Best trial: score {}, \nparams {}'.format(best_trial.value, best_trial.params))

[I 2024-02-18 14:00:33,867] A new study created in memory with name: no-name-47da6310-a2b5-4d69-a77c-c4f449716151
[I 2024-02-18 14:00:46,071] Trial 0 finished with value: 0.9325533137114816 and parameters: {'n_estimators': 639, 'max_depth': 35, 'min_samples_split': 5, 'min_samples_leaf': 4, 'criterion': 'entropy'}. Best is trial 0 with value: 0.9325533137114816.
[I 2024-02-18 14:00:57,016] Trial 1 finished with value: 0.936472769700811 and parameters: {'n_estimators': 550, 'max_depth': 38, 'min_samples_split': 7, 'min_samples_leaf': 3, 'criterion': 'gini'}. Best is trial 1 with value: 0.936472769700811.
[I 2024-02-18 14:01:15,135] Trial 2 finished with value: 0.9644846466558648 and parameters: {'n_estimators': 655, 'max_depth': 39, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}. Best is trial 2 with value: 0.9644846466558648.
[I 2024-02-18 14:01:35,077] Trial 3 finished with value: 0.9271183175454357 and parameters: {'n_estimators': 823, 'max_depth': 38, 'min_sa

Best trial: score 0.9647921027615778, 
params {'n_estimators': 795, 'max_depth': 40, 'min_samples_split': 2, 'min_samples_leaf': 1, 'criterion': 'entropy'}


GradientBoost

In [None]:
import optuna
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def objectiveGB(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective for a GradientBoostingClassifier.

    Draws one hyper-parameter set from ``trial``, fits the classifier on
    ``(x_tr, y_tr)`` and returns the weighted F1 score of its predictions
    for ``x_val`` measured against ``y_val``.
    """
    n_estimators = trial.suggest_int('n_estimators', 700, 1500)
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.1)
    max_depth = trial.suggest_int('max_depth', 2, 15)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 1, 20)

    booster = GradientBoostingClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        max_depth=max_depth,
        min_samples_leaf=min_samples_leaf,
        random_state=0,
    )
    booster.fit(x_tr, y_tr)

    val_pred = booster.predict(x_val)
    return f1_score(y_val, val_pred, average="weighted")


# Hyper-parameter search: maximise the objective with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)


def _run_gb_trial(trial):
    """Bind the train/validation split to objectiveGB for Optuna."""
    return objectiveGB(trial, x_train, y_train, x_val, y_val)


study.optimize(_run_gb_trial, n_trials=1)

best_trial = study.best_trial
print('Best trial: score {}, \nparams {}'.format(best_trial.value, best_trial.params))


[I 2024-02-17 17:26:44,855] A new study created in memory with name: no-name-a1d289c3-67d6-4220-b852-49ba548f1867
[I 2024-02-17 17:30:06,455] Trial 0 finished with value: 0.9773593142990754 and parameters: {'n_estimators': 1139, 'learning_rate': 0.07436704297351776, 'max_depth': 10, 'min_samples_leaf': 11}. Best is trial 0 with value: 0.9773593142990754.


Best trial: score 0.9773593142990754, 
params {'n_estimators': 1139, 'learning_rate': 0.07436704297351776, 'max_depth': 10, 'min_samples_leaf': 11}


AdaBoost

In [None]:
import optuna
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

def objectiveAdaBoost(trial, x_tr, y_tr, x_val, y_val):
    """Optuna objective for an AdaBoostClassifier.

    Draws one hyper-parameter set from ``trial``, fits the classifier on
    ``(x_tr, y_tr)`` and returns the weighted F1 score of its predictions
    for ``x_val`` measured against ``y_val``.
    """
    n_estimators = trial.suggest_int('n_estimators', 1000, 2000)
    learning_rate = trial.suggest_float('learning_rate', 0.1, 1.0)
    algorithm = trial.suggest_categorical('algorithm', ['SAMME', 'SAMME.R'])

    booster = AdaBoostClassifier(
        n_estimators=n_estimators,
        learning_rate=learning_rate,
        algorithm=algorithm,
        random_state=0,
    )
    booster.fit(x_tr, y_tr)

    val_pred = booster.predict(x_val)
    return f1_score(y_val, val_pred, average="weighted")


# Hyper-parameter search: maximise the objective with a seeded TPE sampler.
tpe_sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(direction='maximize', sampler=tpe_sampler)


def _run_adaboost_trial(trial):
    """Bind the train/validation split to objectiveAdaBoost for Optuna."""
    return objectiveAdaBoost(trial, x_train, y_train, x_val, y_val)


study.optimize(_run_adaboost_trial, n_trials=1)

best_trial = study.best_trial
print('Best trial: score {}, \nparams {}'.format(best_trial.value, best_trial.params))

[I 2024-02-17 17:30:13,485] A new study created in memory with name: no-name-13940382-acb4-4b58-981e-660f38516d26
[I 2024-02-17 17:30:55,339] Trial 0 finished with value: 0.9561994360135005 and parameters: {'n_estimators': 1549, 'learning_rate': 0.7436704297351775, 'algorithm': 'SAMME'}. Best is trial 0 with value: 0.9561994360135005.


Best trial: score 0.9561994360135005, 
params {'n_estimators': 1549, 'learning_rate': 0.7436704297351775, 'algorithm': 'SAMME'}


### 모델 정의

In [18]:
from sklearn.ensemble import RandomForestClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import numpy as np

# 보팅
from sklearn.ensemble import VotingClassifier
# 스테킹
from sklearn.ensemble import StackingClassifier

In [32]:
# Seed shared by every estimator so that fitting the stacked model gives the
# same result after a kernel restart. 0 is the seed the Optuna objectives in
# this notebook pass as random_state.
RANDOM_STATE = 0

# RandomForest
rf_model = RandomForestClassifier(
    n_estimators=1056,
    max_depth=30,
    min_samples_split=3,
    min_samples_leaf=1,
    bootstrap=True,
    criterion='entropy',
    random_state=RANDOM_STATE,
)

# LightGBM
lgb_model = LGBMClassifier(
    num_leaves=162,
    max_depth=10,
    n_estimators=487,
    learning_rate=0.07324658507873466,
    min_child_samples=31,
    verbose=-1,
    random_state=RANDOM_STATE,
)

# LightGBM with DART boosting
lgb_dart_model = LGBMClassifier(
    num_leaves=170,
    max_depth=13,
    n_estimators=692,
    learning_rate=0.0916736042020453,
    min_child_samples=10,
    verbose=-1,
    boosting_type="dart",
    random_state=RANDOM_STATE,
)

# XGBoost
xgb_model = XGBClassifier(
    n_estimators=1427,
    learning_rate=0.08645845446703926,
    max_depth=7,
    objective='binary:logistic',
    eval_metric='error',
    random_state=RANDOM_STATE,
)

# GradientBoosting
gb_model = GradientBoostingClassifier(
    n_estimators=1425,
    learning_rate=0.09883679411048218,
    max_depth=6,
    min_samples_leaf=13,
    random_state=RANDOM_STATE,
)

# DecisionTree
dt_model = DecisionTreeClassifier(
    max_depth=24,
    min_samples_split=2,  # minimum number of samples required to split a node
    min_samples_leaf=1,  # minimum number of samples required at a leaf node
    criterion='entropy',
    random_state=RANDOM_STATE,
)

# ExtraTrees
# NOTE: et_model is defined here but is not part of the stacking estimators below.
et_model = ExtraTreesClassifier(
    n_estimators=486,
    min_samples_split=3,  # minimum number of samples required to split a node
    min_samples_leaf=1,  # minimum number of samples required at a leaf node
    max_depth=26,
    criterion='entropy',
    random_state=RANDOM_STATE,
)

# AdaBoost
ada_model = AdaBoostClassifier(
    n_estimators=1399,
    learning_rate=0.9987147599335517,
    algorithm='SAMME.R',
    random_state=RANDOM_STATE,
)

# CatBoost (disabled; the CatBoost import is commented out as well)
#cat_model = CatBoostClassifier(
#    iterations=1045
#   , learning_rate=0.21147352826666405
#    , depth=9
#    , verbose=False
#)


### Build the stacking classifier ###
model = StackingClassifier(
    estimators=[
        ('rf', rf_model),
        ('lgb', lgb_model),
        ('xgb', xgb_model),
        ('gb', gb_model),
        ('ada', ada_model),
        ('dt', dt_model),
        ('lgb_dart', lgb_dart_model)
        #('cat', cat_model)
    ],
    # Final meta-model. The same lgb_model object is also listed as a base
    # estimator above.
    # NOTE(review): this relies on StackingClassifier fitting clones of the
    # estimators it is given — confirm against the scikit-learn docs.
    final_estimator=lgb_model
)



### 모델 학습

In [33]:
# Fit the stacking classifier on the training split.
model.fit(x_train, y_train)

### 모델 성능 보기

In [34]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred=None):
    """Display the confusion matrix and summary metrics for binary predictions.

    Parameters
    ----------
    y_test : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels. The ``None`` default is kept only so the signature
        stays unchanged; a value must always be supplied.

    Raises
    ------
    ValueError
        If ``y_pred`` is omitted.

    Notes
    -----
    Nothing is returned; both tables are rendered with ``display``.
    """
    if y_pred is None:
        raise ValueError("y_pred is required: pass the predicted labels to evaluate.")

    # labels=[True, False] puts the True class in row/column 0.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # Precision, recall and F1 use scikit-learn's default binary averaging.
    # The former `labels=` arguments were dropped: that parameter is only
    # consulted for non-binary averages, so it had no effect here.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)
    weighted_F1 = f1_score(y_test, y_pred, average='weighted')

    metrics = pd.DataFrame({
        '정확도': [accuracy],
        '정밀도': [precision],
        '재현율': [recall],
        'F1 Score': [F1],
        'Weighted F1': [weighted_F1]
    })

    # Rows are the actual classes and columns the predicted ones; name both
    # axes so the table can be read without knowing that convention.
    confusion_df = pd.DataFrame(
        confusion, index=['True', 'False'], columns=['True', 'False']
    ).rename_axis(index='실제', columns='예측')

    print("\n오차행렬:")
    display(confusion_df)
    print("평가 지표:")
    display(metrics)


In [35]:
# Predict on the validation split and display the confusion matrix and metrics.
pred = model.predict(x_val)
get_clf_eval(y_val, pred)


오차행렬:


Unnamed: 0,True,False
True,763,184
False,92,10821


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.976728,0.892398,0.805702,0.846837,0.976183


## 5. 제출하기

### 테스트 데이터 예측

In [36]:
# 예측에 필요한 데이터 분리
# Drop the columns that are not model inputs: the (to-be-predicted) label and the row id.
x_test = df_test_encoded.drop(["is_converted", "id"], axis=1)

In [37]:
# Predict the test rows with the fitted stacking model.
test_pred = model.predict(x_test)
sum(test_pred) # number of rows predicted as True

704

### 제출 파일 작성

In [None]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
# Read the submission template (df_test holds the preprocessed data)
# NOTE(review): test_pred is assigned by position, so this assumes the rows of
# submission.csv are in the same order as df_test_encoded — confirm.
df_sub = pd.read_csv("./data/submission.csv")
df_sub["is_converted"] = test_pred

# Save the submission file
df_sub.to_csv("submission_model_12.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**

.