# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [73]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

### 데이터 셋 읽어오기

In [74]:
df_train = pd.read_csv("train.csv") # Training data
df_test = pd.read_csv("submission.csv") # Test data (the rows of the submission file)

In [75]:
df_train.head() # Preview the training data

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


In [76]:
# Preview the first rows of the test (submission) data.
df_test.head()

Unnamed: 0,id,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,19844,0.0,/ / Brazil,ID,0.073248,47466,End Customer,Enterprise,53.0,,...,LGESP,,1,0,0.001183,0.04984,retail,Electronics & Telco,278,True
1,9738,0.25,400 N State Of Franklin Rd Cloud IT / Johnson...,IT,,5405,End Customer,SMB,,,...,LGEUS,,0,0,1.3e-05,,transportation,Others,437,True
2,8491,1.0,/ / U.A.E,ID,,13597,Specifier/ Influencer,SMB,,,...,LGEGF,less than 3 months,0,0,6e-05,0.131148,hospital & health care,General Hospital,874,False
3,19895,0.5,/ Madison / United States,ID,0.118644,17204,,Enterprise,,,...,LGEUS,more than a year,0,0,0.001183,0.04984,retail,,194,False
4,10465,1.0,/ Sao Paulo / Brazil,ID,0.074949,2329,End Customer,Enterprise,2.0,1.0,...,LGESP,less than 3 months,1,1,0.003079,0.064566,corporate / office,Engineering,167,True


In [77]:
# Column dtypes and non-null counts of the training data.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  float64
 1   customer_country         58317 non-null  object 
 2   business_unit            59299 non-null  object 
 3   com_reg_ver_win_rate     14568 non-null  float64
 4   customer_idx             59299 non-null  int64  
 5   customer_type            15338 non-null  object 
 6   enterprise               59299 non-null  object 
 7   historical_existing_cnt  13756 non-null  float64
 8   id_strategic_ver         3444 non-null   float64
 9   it_strategic_ver         1121 non-null   float64
 10  idit_strategic_ver       4565 non-null   float64
 11  customer_job             40566 non-null  object 
 12  lead_desc_length         59299 non-null  int64  
 13  inquiry_type             58358 non-null  object 
 14  product_category      

In [78]:
# Column dtypes and non-null counts of the test data.
df_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5271 entries, 0 to 5270
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       5271 non-null   int64  
 1   bant_submit              5271 non-null   float64
 2   customer_country         5271 non-null   object 
 3   business_unit            5271 non-null   object 
 4   com_reg_ver_win_rate     1788 non-null   float64
 5   customer_idx             5271 non-null   int64  
 6   customer_type            3814 non-null   object 
 7   enterprise               5271 non-null   object 
 8   historical_existing_cnt  1275 non-null   float64
 9   id_strategic_ver         593 non-null    float64
 10  it_strategic_ver         53 non-null     float64
 11  idit_strategic_ver       646 non-null    float64
 12  customer_job             3832 non-null   object 
 13  lead_desc_length         5271 non-null   int64  
 14  inquiry_type            

## 2. 데이터 전처리

In [None]:
# Business areas treated as "major". A row gets flag 1 when its
# `business_area` is one of these values and 0 otherwise (missing values
# never match, so they get 0).
major_business_area = ['retail', 'corporate / office', 'education', 'hotel & accommodation']

# Vectorized membership test instead of a per-row Python loop: same 0/1
# result, but it does not depend on the frame having a default 0..n-1 index
# and avoids chained `df[col][i]` lookups.
df_train['major_business_area_present'] = (
    df_train['business_area'].isin(major_business_area).astype('int64')
)

In [None]:
# Flag test rows whose `business_area` is one of the major business areas
# (1 = present, 0 = absent or missing). `major_business_area` is the list
# defined in the cell that builds the same flag for the training data.
# Vectorized membership test instead of a per-row Python loop: same 0/1
# result without relying on a default 0..n-1 index.
df_test['major_business_area_present'] = (
    df_test['business_area'].isin(major_business_area).astype('int64')
)

In [None]:
# Product categories treated as "major". A row gets flag 1 when its
# `product_category` is one of these values and 0 otherwise (missing values
# never match, so they get 0).
major_product_category = ['led signage', 'video wall signage', 'high brightness signage',
                          'standard signage', 'oled signage', 'interactive signage',
                          'hotel tv', 'special signage']

# Vectorized membership test instead of a per-row Python loop: same 0/1
# result without relying on a default 0..n-1 index.
df_train['major_product_category'] = (
    df_train['product_category'].isin(major_product_category).astype('int64')
)

In [None]:
# Product categories treated as "major" (repeated here so this cell can run
# on its own). A row gets flag 1 when its `product_category` is one of these
# values and 0 otherwise (missing values never match, so they get 0).
major_product_category = ['led signage', 'video wall signage', 'high brightness signage',
                          'standard signage', 'oled signage', 'interactive signage',
                          'hotel tv', 'special signage']

# Vectorized membership test instead of a per-row Python loop: same 0/1
# result without relying on a default 0..n-1 index.
df_test['major_product_category'] = (
    df_test['product_category'].isin(major_product_category).astype('int64')
)

In [None]:
# How many training rows carry each value of the major-business-area flag.
df_train['major_business_area_present'].value_counts()

In [None]:
# Preview the test data again after the cells above.
df_test.head()

### 레이블 인코딩

In [79]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Encode a categorical series as integer codes.

    Every element is first cast to ``str``. The distinct strings are then
    sorted and numbered from 0 in that order, and each element is replaced by
    the number of its string. The input's index is preserved.

    Args:
        series: Values to encode.

    Returns:
        A series of integer codes aligned with ``series``.
    """
    as_text = series.astype(str)
    # Sorting makes the code assignment independent of the order of appearance.
    code_of = {label: code for code, label in enumerate(sorted(as_text.unique()))}
    return as_text.map(code_of)

In [80]:
# 레이블 인코딩할 칼럼들
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])

다시 학습 데이터와 제출 데이터를 분리합니다.

In [81]:
# The first len(df_train) rows of df_all are the training rows; the rest are
# the test rows. The assignments below align on the row index.
n_train_rows = len(df_train)
encoded_train = df_all.iloc[:n_train_rows]
encoded_test = df_all.iloc[n_train_rows:]

for col in label_columns:
    df_train[col] = encoded_train[col]
    df_test[col] = encoded_test[col]

### 2-2. 학습, 검증 데이터 분리

In [82]:
# ! pip install imbalanced-learn

In [83]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the classes with random undersampling.
# NOTE(review): resampling happens before the train/validation split below,
# so the validation rows are drawn from the resampled data as well — confirm
# that this is the intended evaluation setup.
train_features = df_train.drop(columns="is_converted")
train_target = df_train["is_converted"]

undersampler = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = undersampler.fit_resample(train_features, train_target)

In [84]:
# Hold out 20% of the resampled rows for validation (shuffled, fixed seed).
x_train, x_val, y_train, y_val = train_test_split(
    X_resampled, y_resampled, test_size=0.2, shuffle=True, random_state=400
)

## 3. 모델 학습

### 모델 정의 

In [89]:
# Notes from earlier experiments (the number pairs are kept exactly as the
# author recorded them):
# model = DecisionTreeClassifier() # undersampling-> 0.53 : 0.85
# model = RandomForestClassifier(random_state = 42)

from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier # 0.63 : 0.91
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

# Base learners for the stacking ensemble. XGBoost uses explicit
# hyper-parameters; LightGBM and CatBoost only fix the random seed.
xgb = XGBClassifier(
    random_state=42,
    colsample_bytree=0.6,
    learning_rate=0.2,
    max_depth=7,
    min_child_weight=1,
    n_estimators=300,
    subsample=1.0,
)  # tree_method = 'exact' : 0.7188373392956959
lgbm = LGBMClassifier(random_state=42)
cat = CatBoostClassifier(random_state=42)

### 모델 학습

In [90]:
# Stack the three boosted models; a logistic regression combines their outputs.
estimators = [
    ('xgb', xgb),
    ('lgbm', lgbm),
    ('cat', cat),
]
stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
    verbose=1,
)
# Missing values are replaced with 0 before fitting.
stack.fit(x_train.fillna(0), y_train)

[LightGBM] [Info] Number of positive: 3884, number of negative: 3876
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001367 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2188
[LightGBM] [Info] Number of data points in the train set: 7760, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500515 -> initscore=0.002062
[LightGBM] [Info] Start training from score 0.002062
Learning rate set to 0.024712
0:	learn: 0.6681880	total: 53.6ms	remaining: 53.6s
1:	learn: 0.6419213	total: 60.2ms	remaining: 30s
2:	learn: 0.6246215	total: 66.2ms	remaining: 22s
3:	learn: 0.6012939	total: 71.8ms	remaining: 17.9s
4:	learn: 0.5836778	total: 76.8ms	remaining: 15.3s
5:	learn: 0.5638482	total: 81.9ms	remaining: 13.6s
6:	learn: 0.5475049	total: 86.6ms	remaining: 12.3s
7:	learn: 0.5288562	total: 91ms	remaining: 11.3s
8:	learn: 0.515

In [None]:
#model.fit(x_train.fillna(0), y_train)

### 모델 성능 보기

In [91]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and headline metrics of a binary classifier.

    ``True`` is treated as the positive class for precision, recall and F1.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same samples. The ``None`` default is
            kept for signature compatibility only; the value is handed to the
            scikit-learn metric functions unchanged, so callers should always
            pass real predictions.

    Returns:
        None. All results are written to stdout.
    """
    # Rows and columns are ordered [True, False].
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # With the default average="binary" the positive class is chosen through
    # `pos_label`; `labels` only applies to the other averaging modes, so it is
    # not passed. All three metrics now use the same explicit positive class.
    precision = precision_score(y_test, y_pred, pos_label=True)
    recall = recall_score(y_test, y_pred, pos_label=True)
    F1 = f1_score(y_test, y_pred, pos_label=True)

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [92]:
# Score the stacked model on the hold-out rows, with missing values replaced by 0.
x_val_filled = x_val.fillna(0)
pred = stack.predict(x_val_filled)
get_clf_eval(y_val, pred)

오차행렬:
 [[911  55]
 [ 73 901]]

정확도: 0.9340
정밀도: 0.9258
재현율: 0.9431
F1: 0.9344


## feature importance 확인

In [None]:
# Column names of the training frame.
df_train.columns

In [None]:
# Preview the training frame at this point of the notebook.
df_train.head()

In [None]:
from xgboost import plot_importance

# `model` is never assigned in this notebook (it only appears in commented-out
# lines), so read the importances from the XGBoost member that was fitted as
# part of the stacking ensemble instead.
fitted_xgb = stack.named_estimators_["xgb"]
fscore = fitted_xgb.get_booster().get_fscore()
fscore

In [None]:
# `model` is never assigned in this notebook; plot the XGBoost member that was
# fitted as part of the stacking ensemble.
plot_importance(stack.named_estimators_["xgb"])

## 4. 제출하기

### 테스트 데이터 예측

In [93]:
# 예측에 필요한 데이터 분리
# Keep only the feature columns needed for prediction: the target column and
# the row id are removed.
x_test = df_test.drop(columns=["is_converted", "id"])

In [94]:
# Predict on the test rows, with missing values replaced by 0.
x_test_filled = x_test.fillna(0)
test_pred = stack.predict(x_test_filled)
sum(test_pred)  # Number of rows predicted as True

1828

### 제출 파일 작성

In [95]:
# 제출 데이터 읽어오기 (df_test는 전처리된 데이터가 저장됨)
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# 제출 파일 저장
df_sub.to_csv("submission.csv", index=False)

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**

In [96]:
# Distribution of the predicted labels in the submission (missing values counted too).
df_sub["is_converted"].value_counts(dropna=False)

is_converted
False    3443
True     1828
Name: count, dtype: int64

from sklearn.base import clone
from sklearn.model_selection import StratifiedKFold

# Stratified K-Fold splitter over the resampled data.
k_folds = 5  # number of folds
stratified_kfold = StratifiedKFold(n_splits=k_folds, shuffle=True, random_state=42)

for fold, (train_indices, val_indices) in enumerate(stratified_kfold.split(X_resampled, y_resampled)):
    print(f"Fold {fold+1} Training Samples: {len(train_indices)}, Validation Samples: {len(val_indices)}")

    # Fold-specific names, so the earlier hold-out split
    # (x_train / x_val / y_train / y_val) is not overwritten.
    X_fold_train, X_fold_val = X_resampled.iloc[train_indices], X_resampled.iloc[val_indices]
    y_fold_train, y_fold_val = y_resampled.iloc[train_indices], y_resampled.iloc[val_indices]

    # `model` is never assigned in this notebook; evaluate a fresh, unfitted
    # copy of the stacking ensemble on every fold so the already fitted
    # `stack` is left untouched. Missing values are replaced with 0, as in the
    # other fit/predict calls of this notebook.
    fold_model = clone(stack)
    fold_model.fit(X_fold_train.fillna(0), y_fold_train)

    # Predict on this fold's own validation rows (not the earlier hold-out set).
    fold_pred = fold_model.predict(X_fold_val.fillna(0))

    # get_clf_eval prints its report and returns None, so it is called directly
    # rather than wrapped in print().
    print(f"Fold {fold+1} Validation Score:")
    get_clf_eval(y_fold_val, fold_pred)