## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore') # 경고 메세지 무시

In [2]:
# Raw competition files: the labelled training set, and the submission file,
# whose rows serve as the test data to predict on.
raw_train_path = "./data/train.csv"
raw_test_path = "./data/submission.csv"

df_train_origin = pd.read_csv(raw_train_path)
df_test_origin = pd.read_csv(raw_test_path)

In [3]:
# Summarise the raw training frame: column names, non-null counts and dtypes.
df_train_origin.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  float64
 1   customer_country         58317 non-null  object 
 2   business_unit            59299 non-null  object 
 3   com_reg_ver_win_rate     14568 non-null  float64
 4   customer_idx             59299 non-null  int64  
 5   customer_type            15338 non-null  object 
 6   enterprise               59299 non-null  object 
 7   historical_existing_cnt  13756 non-null  float64
 8   id_strategic_ver         3444 non-null   float64
 9   it_strategic_ver         1121 non-null   float64
 10  idit_strategic_ver       4565 non-null   float64
 11  customer_job             40566 non-null  object 
 12  lead_desc_length         59299 non-null  int64  
 13  inquiry_type             58358 non-null  object 
 14  product_category      

## 2. 데이터 전처리

### 각 변수별 확인

In [4]:
# Train/test frames saved under ./data/Ch2
# (presumably the output of the preprocessing chapter — confirm).
processed_train_path = './data/Ch2/df_train.csv'
processed_test_path = './data/Ch2/df_test.csv'

df_train_process = pd.read_csv(processed_train_path)
df_test_process = pd.read_csv(processed_test_path)

In [5]:
# Summarise the preprocessed training frame: column names, non-null counts and dtypes.
df_train_process.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 36 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  float64
 1   customer_country         59299 non-null  object 
 2   business_unit            59299 non-null  object 
 3   com_reg_ver_win_rate     14568 non-null  float64
 4   customer_idx             59299 non-null  int64  
 5   customer_type            59299 non-null  object 
 6   enterprise               59299 non-null  object 
 7   historical_existing_cnt  59299 non-null  float64
 8   id_strategic_ver         59299 non-null  float64
 9   it_strategic_ver         59299 non-null  float64
 10  idit_strategic_ver       59299 non-null  float64
 11  customer_job             59299 non-null  object 
 12  lead_desc_length         59299 non-null  float64
 13  inquiry_type             59299 non-null  object 
 14  product_category      

## 3. 피처엔지니어링

### 레이블 인코딩

In [6]:
# Train/test frames saved under ./data/Ch3 with the "_encoded" suffix
# (presumably the label-encoded output of the feature-engineering chapter — confirm).
encoded_train_path = './data/Ch3/df_train_encoded.csv'
encoded_test_path = './data/Ch3/df_test_encoded.csv'

df_train_encoded = pd.read_csv(encoded_train_path)
df_test_encoded = pd.read_csv(encoded_test_path)

In [7]:
# Summarise the encoded training frame: column names, non-null counts and dtypes.
df_train_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 32 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  float64
 1   business_unit            59299 non-null  float64
 2   com_reg_ver_win_rate     59299 non-null  float64
 3   customer_type            59299 non-null  float64
 4   enterprise               59299 non-null  float64
 5   historical_existing_cnt  59299 non-null  float64
 6   id_strategic_ver         59299 non-null  float64
 7   it_strategic_ver         59299 non-null  float64
 8   idit_strategic_ver       59299 non-null  float64
 9   customer_job             59299 non-null  float64
 10  lead_desc_length         59299 non-null  float64
 11  inquiry_type             59299 non-null  float64
 12  product_category         59299 non-null  float64
 13  product_subcategory      59299 non-null  float64
 14  customer_country.1    

## 4. 모델 학습

### 데이터 분할

학습, 검증 데이터 분리

In [8]:
# !pip install imblearn

In [9]:
from sklearn.model_selection import train_test_split

# Encode the target as 1.0 / 0.0 with a single cast. Casting directly avoids
# first writing ints into the column through .loc, which was redundant and
# forces an intermediate dtype upcast when the column is boolean.
df_train_encoded['is_converted'] = df_train_encoded['is_converted'].astype(float)

# Split into features (every column except the target) and the target.
X = df_train_encoded.drop(columns='is_converted')
Y = df_train_encoded['is_converted']

# Hold out 20% of the rows as a validation set (shuffled, fixed seed).
x_train, x_val, y_train, y_val = train_test_split(X, Y, test_size=0.2, random_state=0, shuffle=True)

# Features of the data to predict on.
# NOTE(review): this x_test is rebuilt in the submission section, where the
# "id" column is dropped as well; this version is overwritten before it is used.
x_test = df_test_encoded.drop(["is_converted"], axis=1)

In [10]:
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Class re-balancing: SMOTE over-sampling followed by random under-sampling,
# chained in an imblearn Pipeline, both seeded with 0.
# NOTE(review): with default sampling strategies SMOTE presumably already
# balances the classes, leaving nothing for the under-sampler to remove — confirm.
resampling_steps = [
    ('SMOTE', SMOTE(random_state=0)),
    ('RandomUnderSampler', RandomUnderSampler(random_state=0)),
]
resample = Pipeline(resampling_steps)

# Resample the training split only; x_val / y_val are not touched.
# x_train / y_train are rebound to the resampled data.
x_train, y_train = resample.fit_resample(x_train, y_train)


### 모델 라이브러리

단일 모델 기준으로 사용할 수 있는 모델들의 라이브러리를 불러오는 코드를 작성

In [11]:
# 모델 라이브러리
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier, AdaBoostClassifier

# 보팅
from sklearn.ensemble import VotingClassifier

# 스태킹
from sklearn.ensemble import StackingClassifier

### 최적 하이퍼 파라미터 찾기

optuna를 통한 최적의 파라미터 찾기

In [12]:
## 파라미터 찾는 과정.. ##

Optuna를 통해 최적의 파라미터를 다시 한번 찾는 과정

### 모델 정의

#### 단일 모델

In [13]:
# Single-model definitions. Hyper-parameters come from Optuna
# (final EDA Lite ver). Every estimator is seeded with random_state=0.

# Random forest
rf_model = RandomForestClassifier(
    n_estimators=1260,
    max_depth=41,
    min_samples_split=2,
    min_samples_leaf=1,
    bootstrap=True,
    criterion='gini',
    random_state=0,
)

# LightGBM (default boosting)
lgbm_model = LGBMClassifier(
    n_estimators=834,
    max_depth=40,
    num_leaves=85,
    learning_rate=0.12303612733589961,
    min_child_samples=30,
    verbose=-1,
    random_state=0,
)

# LightGBM with DART boosting
lgbm_dart_model = LGBMClassifier(
    n_estimators=1029,
    num_leaves=167,
    max_depth=30,
    learning_rate=0.05767571715999541,
    min_child_samples=25,
    verbose=-1,
    boosting='dart',  # use DART
    random_state=0,
)

# CatBoost
cat_model = CatBoostClassifier(
    iterations=3455,
    learning_rate=0.32379048176316866,
    depth=9,
    verbose=False,
    random_state=0,
)

# XGBoost
xgb_model = XGBClassifier(
    n_estimators=2069,
    learning_rate=0.07125304368155233,
    max_depth=6,
    objective='binary:logistic',
    eval_metric="auc",
    random_state=0,
)

# Gradient boosting (scikit-learn)
gb_model = GradientBoostingClassifier(
    n_estimators=851,
    learning_rate=0.0752545884620923,
    max_depth=11,
    min_samples_leaf=11,
    random_state=0,
)

# Extra trees
et_model = ExtraTreesClassifier(
    n_estimators=1009,
    max_depth=36,
    min_samples_split=2,
    min_samples_leaf=1,
    criterion='entropy',
    random_state=0,
)

# Decision tree
dt_model = DecisionTreeClassifier(
    max_depth=21,
    min_samples_split=29,
    min_samples_leaf=8,
    criterion='entropy',
    random_state=0,
)

# AdaBoost
# NOTE(review): 'SAMME.R' is deprecated in recent scikit-learn releases —
# confirm the installed version still accepts it.
ada_model = AdaBoostClassifier(
    n_estimators=2998,
    learning_rate=0.8670594137755283,
    algorithm='SAMME.R',
    random_state=0,
)

#### 보팅

In [14]:
### Build the voting classifier ###
# Active members: cat, lgb_dart, xgb, gb, ada.
# Disabled in this run: rf, lgb, dt, et.
# NOTE: the stacking cell that follows rebinds `model`, so on a top-to-bottom
# run this voting ensemble is replaced before training.
voting_members = [
    ('cat', cat_model),
    ('lgb_dart', lgbm_dart_model),
    ('xgb', xgb_model),
    ('gb', gb_model),
    ('ada', ada_model),
]

# 'hard' is a majority vote; 'soft' averages the predicted probabilities.
model = VotingClassifier(estimators=voting_members, voting='soft')

#### 스태킹

In [15]:
### Build the stacking classifier ###
# Active base learners: rf, cat, lgb_dart, xgb, et.
# Disabled in this run: lgb, gb, ada, dt.
stacking_members = [
    ('rf', rf_model),
    ('cat', cat_model),
    ('lgb_dart', lgbm_dart_model),
    ('xgb', xgb_model),
    ('et', et_model),
]

# The default-boosting LightGBM model is the final (meta) estimator.
model = StackingClassifier(estimators=stacking_members, final_estimator=lgbm_model)

In [16]:
# # LightGBM_dart
# model = LGBMClassifier(
#     n_estimators=1029
#     , num_leaves=167
#     , max_depth=30
#     , learning_rate=0.05767571715999541
#     , min_child_samples=25
#     , verbose=-1
#     , boosting='dart'  # dart 사용
#     , random_state=0
# )

### 모델 학습

In [17]:
# Fit whichever estimator `model` is currently bound to on the resampled training split.
model.fit(x_train, y_train)

### 모델 성능 보기

In [18]:
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

def get_clf_eval(y_test, y_pred=None):
    """Display the confusion matrix and summary metrics of a binary classifier.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same rows as ``y_test``. Required; the
            ``None`` default is kept only so the signature stays unchanged.

    Returns:
        None. A confusion-matrix table and a one-row metrics table are rendered
        with ``display``.

    Raises:
        ValueError: If ``y_pred`` is not supplied.
    """
    if y_pred is None:
        raise ValueError("y_pred is required")

    # Positive class first; in scikit-learn rows are actual labels and columns
    # are predicted labels.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)

    # With the default average='binary', scikit-learn scores only the class
    # named by pos_label and ignores `labels`, so the positive class is named
    # explicitly and identically for all three metrics.
    precision = precision_score(y_test, y_pred, pos_label=1)
    recall = recall_score(y_test, y_pred, pos_label=1)
    F1 = f1_score(y_test, y_pred, pos_label=1)
    weighted_F1 = f1_score(y_test, y_pred, average='weighted')

    metrics = pd.DataFrame({
        '정확도': [accuracy],
        '정밀도': [precision],
        '재현율': [recall],
        'F1 Score': [F1],
        'Weighted F1': [weighted_F1]
    })

    confusion_df = pd.DataFrame(confusion, index=['True', 'False'], columns=['True', 'False'])

    print("\n오차행렬:")
    display(confusion_df)
    print("평가 지표:")
    display(metrics)


In [19]:
# Predict on the held-out validation split and display the evaluation tables.
pred = model.predict(x_val)
get_clf_eval(y_val, pred)


오차행렬:


Unnamed: 0,True,False
True,779,215
False,157,10709


평가 지표:


Unnamed: 0,정확도,정밀도,재현율,F1 Score,Weighted F1
0,0.968634,0.832265,0.783702,0.807254,0.968205


## 5. 제출하기

### 테스트 데이터 예측

In [20]:
# Test features for prediction: the encoded test frame without the target and "id" columns.
x_test = df_test_encoded.drop(columns=["is_converted", "id"])

In [21]:
test_pred = model.predict(x_test)
test_pred.sum()  # number of rows predicted as True (converted)

713.0

### 제출 파일 작성

In [22]:
# Re-read the raw submission file and set its target column to the predictions.
# NOTE(review): the predictions are floats (1.0 / 0.0) because the target was
# cast to float before training — confirm the grader accepts them in place of
# True / False.
df_sub = pd.read_csv("./data/submission.csv").assign(is_converted=test_pred)

# Save the submission file.
df_sub.to_csv("submission_ss2.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**

.