# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV

### 데이터 셋 읽어오기

In [2]:
# Load the training and submission CSVs; both are decoded as ISO-8859-1.
df_train = pd.read_csv(
    "./data/encoded_train.csv",
    encoding="ISO-8859-1",
)  # training data
df_test = pd.read_csv(
    "./data/encoded_submission.csv",
    encoding="ISO-8859-1",
)  # test data (the rows of the submission file)

In [3]:
df_train.head(n=5)  # preview the first five rows of the training data

Unnamed: 0,bant_submit,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,...,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted,enterprise_Enterprise,enterprise_SMB
0,1.0,0.060187,0.066667,32160,0.116088,0,0,0,0,0.143617,...,1,0,0.003079,0.026846,0.064119,0.123188,0,1,True,False
1,1.0,0.060187,0.066667,23122,0.116088,12,0,0,0,0.076423,...,1,0,0.003079,0.026846,0.064119,0.084211,1,1,True,False
2,1.0,0.060283,0.088889,1755,0.112788,144,0,0,0,0.093429,...,1,0,0.003079,0.026846,0.067235,0.118421,2,1,True,False
3,1.0,0.059962,0.088889,4919,0.109474,0,0,0,0,0.08891,...,1,0,0.003079,0.026846,0.064915,0.132075,3,1,True,False
4,1.0,0.060107,0.088889,17126,0.078316,0,0,0,0,0.098375,...,0,0,0.003079,0.026846,0.064232,0.082825,4,1,True,False


## 2. 데이터 전처리

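### 2-1. 레이블 인코딩

데이터는 이미 인코딩된 파일(`encoded_train.csv`, `encoded_submission.csv`)에서 읽어오므로 이 노트북에서는 별도의 인코딩을 수행하지 않습니다. 학습 데이터(`df_train`)와 제출 데이터(`df_test`)는 처음부터 분리된 상태로 사용합니다.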

### 2-2. 학습, 검증 데이터 분리

In [4]:
# Hold out 20% of the training rows for validation; the fixed random_state
# makes the split reproducible.
# NOTE(review): only "is_converted" is removed from the features, so the
# "Unnamed: 0" column shown by df_train.head() stays in as a feature -
# confirm that this is intended.
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop(columns=["is_converted"]),
    df_train["is_converted"],
    random_state=42,
    test_size=0.2,
)

In [5]:
# Remove the "id" column from df_test in place, then separate what is left
# into the "is_converted" label and the feature columns.
# NOTE: the drop mutates df_test, so running this cell a second time raises
# KeyError because "id" is already gone.
df_test.drop(columns=["id"], inplace=True)
y_submission = df_test["is_converted"]
x_submission = df_test.drop(columns=["is_converted"])

## 3. 모델 학습

### 모델 정의 

In [6]:
# Base XGBoost classifier; it is handed to the grid search as the estimator.
# NOTE(review): `use_label_encoder` is reported as deprecated by recent
# XGBoost releases - confirm against the installed version.
model = XGBClassifier(
    random_state=42,
    eval_metric="logloss",
    use_label_encoder=False,
)

In [7]:
# Hyperparameter search space: three candidate values for each of five
# parameters, i.e. 3**5 = 243 combinations in total.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 4, 5],
    subsample=[0.8, 0.9, 1.0],
    colsample_bytree=[0.8, 0.9, 1.0],
)

In [8]:
# Exhaustive search over param_grid, scored by accuracy with 5-fold
# cross-validation; verbose=1 prints the fit count.
grid_search = GridSearchCV(
    model,
    param_grid,
    cv=5,
    scoring="accuracy",  # can be adjusted depending on the objective
    verbose=1,
)

In [9]:
# Run the full grid search on the training split.
grid_search.fit(X=x_train, y=y_train)

Fitting 5 folds for each of 243 candidates, totalling 1215 fits


### 모델 학습 결과 (최적 하이퍼파라미터)

In [10]:
# Keep a handle on the estimator refit with the winning combination, then
# report that combination and its cross-validated score.
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

Best Parameters: {'colsample_bytree': 0.9, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 300, 'subsample': 0.9}
Best Score: 0.9686267353198564


### 모델 성능 보기

In [11]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and binary classification metrics.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same rows as ``y_test``. The
            ``None`` default is forwarded to the metric functions unchanged,
            so callers are expected to pass real predictions.

    Returns:
        None. The confusion matrix, accuracy, precision, recall and F1 are
        written to standard output.
    """
    # True is listed first so that row/column 0 of the confusion matrix
    # corresponds to the True label.
    label_order = [True, False]

    cm = confusion_matrix(y_test, y_pred, labels=label_order)
    acc = accuracy_score(y_test, y_pred)
    prec = precision_score(y_test, y_pred, labels=label_order)
    rec = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, labels=label_order)

    print("오차행렬:\n", cm)

    # Each scalar metric is printed with four decimal places.
    report = (
        ("\n정확도: {:.4f}", acc),
        ("정밀도: {:.4f}", prec),
        ("재현율: {:.4f}", rec),
        ("F1: {:.4f}", f1),
    )
    for template, value in report:
        print(template.format(value))

In [12]:
# Predict on the submission features (missing values replaced with 0) and
# print the metrics against the "is_converted" column of the submission data.
# NOTE(review): x_val / y_val from the train/validation split are not used by
# any cell shown here - confirm that scoring against the submission labels
# rather than the held-out validation split is intended.
pred = grid_search.predict(x_submission.fillna(value=0))
get_clf_eval(y_test=y_submission, y_pred=pred)

오차행렬:
 [[ 928  253]
 [1449 2641]]

정확도: 0.6771
정밀도: 0.3904
재현율: 0.7858
F1: 0.5216


## 4. 제출하기

### 테스트 데이터 예측

In [13]:
# Predict on the submission features, filling missing values with 0.
test_pred = grid_search.predict(x_submission.fillna(value=0))
sum(test_pred)  # number of rows predicted as True

2377

### 제출 파일 작성

In [14]:
# # Read the submission data (df_test holds the preprocessed data)
# df_sub = pd.read_csv("./data/encoded_submission.csv")
# df_sub["is_converted"] = test_pred

# # Save the submission file
# df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**