# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [11]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV, KFold

from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

### 데이터 셋 읽어오기

In [3]:
df_train = pd.read_csv("data/encoded_train_two.csv",  encoding='ISO-8859-1') # 학습용 데이터
df_test = pd.read_csv("data/encoded_submission_two.csv",  encoding='ISO-8859-1') # 테스트 데이터(제출파일의 데이터)

In [4]:
df_train.head() # 학습용 데이터 살펴보기

Unnamed: 0,bant_submit,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,...,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted,enterprise_Enterprise,enterprise_SMB
0,1.0,0.060187,0.066667,32160,0.116088,0.0,0.0,0.0,0.0,0.143617,...,1,0,0.003079,0.026846,0.064119,0.123188,0,1,True,False
1,1.0,0.060187,0.066667,23122,0.116088,12.0,0.0,0.0,0.0,0.076423,...,1,0,0.003079,0.026846,0.064119,0.084211,1,1,True,False
2,1.0,0.060283,0.088889,1755,0.112788,144.0,0.0,0.0,0.0,0.093429,...,1,0,0.003079,0.026846,0.067235,0.118421,2,1,True,False
3,1.0,0.059962,0.088889,4919,0.109474,0.0,0.0,0.0,0.0,0.08891,...,1,0,0.003079,0.026846,0.064915,0.132075,3,1,True,False
4,1.0,0.060107,0.088889,17126,0.078316,0.0,0.0,0.0,0.0,0.098375,...,0,0,0.003079,0.026846,0.064232,0.082825,4,1,True,False


In [5]:
# Assuming 'df_train' is your DataFrame
class_counts = df_train['is_converted'].value_counts()
print(class_counts)

# Calculate the proportion of the minority class
minority_proportion = class_counts.min() / class_counts.sum()
print(f"Minority class proportion: {minority_proportion:.2f}")


is_converted
0    51160
1     4620
Name: count, dtype: int64
Minority class proportion: 0.08


In [7]:
for col in df_train.columns:
    if df_train[col].dtype != 'int64' or df_train[col].dtype != 'float64':
        print(col)

bant_submit
business_unit
com_reg_ver_win_rate
customer_idx
customer_type
historical_existing_cnt
id_strategic_ver
it_strategic_ver
idit_strategic_ver
customer_job
lead_desc_length
product_category
product_subcategory
product_modelname
customer_country.1
customer_position
response_corporate
expected_timeline
ver_cus
ver_pro
ver_win_rate_x
ver_win_ratio_per_bu
business_area
business_subarea
lead_owner
is_converted
enterprise_Enterprise
enterprise_SMB


In [9]:
for index, row in df_train.iterrows():
    # Check if the value is not int64 and not float64
    if not isinstance(row['bant_submit'], (int, float)):
        # Print the row index and the data type of the value
        print(f"Row {index} has value '{row['bant_submit']}' with type {type(row['bant_submit']).__name__}")

In [10]:
for index, row in df_train.iterrows():
    # Check if the value is not int64 and not float64
    if not isinstance(row['business_unit'], (int, float)):
        # Print the row index and the data type of the value
        print(f"Row {index} has value '{row['bant_submit']}' with type {type(row['bant_submit']).__name__}")

## 2. 데이터 전처리

##### is_converted imbalance 때문에 scaling 작업 진행

In [111]:
# Calculate scale_pos_weight for class imbalance
num_neg_instances = class_counts.max()
num_pos_instances = class_counts.min()
scale_pos_weight = num_neg_instances / num_pos_instances

다시 학습 데이터와 제출 데이터를 분리합니다.

### 2-2. 학습, 검증 데이터 분리

In [112]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400
)

## 3. 모델 학습

### 모델 정의 

In [113]:
param_grid = {
    'max_depth': [3, 4, 5],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0],
    'learning_rate': [0.01, 0.1, 0.2]
}

In [114]:
model = XGBClassifier(use_label_encoder=False, eval_metric='logloss',
                      random_state=42, scale_pos_weight=scale_pos_weight)

In [115]:
# Setup GridSearchCV
kf = KFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(model, param_grid, scoring='f1', n_jobs=-1, cv=kf, verbose=3)


In [116]:
grid_search.fit(x_train, y_train)
# Best model's parameters
print("Best parameters:", grid_search.best_params_)

Fitting 5 folds for each of 108 candidates, totalling 540 fits


Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.2, 'max_depth': 5, 'min_child_weight': 5, 'subsample': 0.8}


### 모델 학습

### 모델 성능 보기

In [117]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [118]:
pred = grid_search.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

오차행렬:
 [[ 852   76]
 [ 645 9583]]

정확도: 0.9354
정밀도: 0.5691
재현율: 0.9181
F1: 0.7027


## 4. 제출하기

### 테스트 데이터 예측

In [124]:
# Assuming 'id' is preserved in df_test
y_test = df_test["is_converted"]
x_test = df_test.drop(["is_converted"], axis=1)  # Keep 'id' for alignment
test_pred = grid_search.predict(x_test.drop('id', axis=1).fillna(0))

In [125]:
get_clf_eval(y_test, test_pred)

오차행렬:
 [[ 289  881]
 [ 373 3728]]

정확도: 0.7621
정밀도: 0.4366
재현율: 0.2470
F1: 0.3155


In [123]:
# Create a new dataframe for submission that includes 'id' and the predictions
df_sub = pd.DataFrame({
    "id": x_test["id"],
    "is_converted": test_pred
})

In [None]:
df_sub.to_csv("submission.csv", index=False)

### 제출 파일 작성

In [None]:
# # 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
# df_sub = pd.read_csv("encoded_submission.csv")
# df_sub["is_converted"] = test_pred

# # 제출 파일 저장
# df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**