# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


### 데이터 셋 읽어오기

In [2]:
df_train = pd.read_csv("train.csv")  # training data
df_test = pd.read_csv("submission.csv")  # test data (the rows of the submission file)

In [3]:
df_train.head()  # preview the first rows of the training data

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


## 2. 데이터 전처리

### 레이블 인코딩

In [4]:
def label_encoding(series: pd.Series) -> pd.Series:
    """Convert a categorical series into integer codes.

    Every element is first cast to ``str``; the distinct strings are then
    sorted and numbered from 0 in that order, and each element is replaced
    by the number assigned to its string form.
    """
    as_text = series.astype(str)

    # Sorted order makes the code assigned to each category deterministic.
    code_of = {
        category: code
        for code, category in enumerate(sorted(as_text.unique()))
    }

    return as_text.map(code_of)

In [5]:
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Embedding, Input, Flatten
from tensorflow.keras.models import Model

# Columns to label-encode.
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

# Stack train and test rows so every category seen in either set gets a code.
# The row indexes are intentionally not reset: the cell that splits df_all
# back assigns these columns to df_train / df_test, and pandas aligns that
# assignment on the index.
df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

# Label-encode each categorical column (values are cast to str first, so
# missing values become their string form and get a code of their own).
# The fitted encoders are kept so codes can be mapped back to categories.
label_encoders = {}
for col in label_columns:
    le = LabelEncoder()
    df_all[col] = le.fit_transform(df_all[col].astype(str))
    label_encoders[col] = le

# Build the layers of an entity-embedding network: one input and one
# embedding per categorical column.
# NOTE(review): within this notebook these layers are never wrapped in a
# Model, compiled, or trained; the integer codes stored in df_all are what
# the later cells copy back into df_train / df_test.
input_layers = []
embedding_layers = []

for col in label_columns:
    input_layer = Input(shape=(1,), name=col)
    # Number of distinct codes in this column.
    unique_vals = df_all[col].nunique()
    # Rule of thumb used here: embedding width ~ sqrt(number of categories).
    embed_dim = int(unique_vals ** 0.5) + 1

    # `input_length` is not passed: it is deprecated in Keras 3 and the
    # sequence length is already fixed by Input(shape=(1,)).
    embedding_layer = Embedding(input_dim=unique_vals, output_dim=embed_dim)(input_layer)
    embedding_layer = Flatten()(embedding_layer)

    input_layers.append(input_layer)
    embedding_layers.append(embedding_layer)

# Concatenate all embeddings and add a small head.
x = tf.keras.layers.Concatenate()(embedding_layers)
x = tf.keras.layers.Dense(128, activation='relu')(x)
output_layer = tf.keras.layers.Dense(1, activation='sigmoid')(x)  # output layer for illustration purposes



2024-02-11 14:25:56.228375: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-11 14:25:56.490380: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-11 14:25:56.490807: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-11 14:25:56.540491: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-02-11 14:25:56.632536: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2024-02-11 14:25:56.633972: I tensorflow/core/platform/cpu_feature_guard.cc:1

다시 학습 데이터와 제출 데이터를 분리합니다.

In [6]:
# The first len(df_train) rows of df_all came from df_train, the rest from
# df_test; copy each encoded column back to the frame it came from.
n_train = len(df_train)
for col in label_columns:
    encoded = df_all[col]
    df_train[col] = encoded.iloc[:n_train]
    df_test[col] = encoded.iloc[n_train:]

### 학습, 검증 데이터 분리

In [7]:

# NOTE(review): the next cell rebinds x_train / x_val / y_train / y_val from
# the full df_train, so the balanced split produced here is not the one the
# model is trained on. Confirm which split is intended.

# Row masks for the two classes.
converted_mask = df_train["is_converted"].eq(True)
not_converted_mask = df_train["is_converted"].eq(False)

# Class sizes; the smaller one is the per-class sample size.
true_count = df_train["is_converted"].sum()
false_count = len(df_train) - true_count
min_count = min(true_count, false_count)

# Under-sample both classes to the same size.
true_data = df_train.loc[converted_mask].sample(n=min_count, random_state=400)
false_data = df_train.loc[not_converted_mask].sample(n=min_count, random_state=400)

# Combine the two samples into one balanced frame.
df_balanced = pd.concat([true_data, false_data])

# Hold out 20% of the balanced data for validation.
x_train, x_val, y_train, y_val = train_test_split(
    df_balanced.drop(columns="is_converted"),
    df_balanced["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

# Show the class counts of each split.
for title, split_labels in (("Train set:", y_train), ("Validation set:", y_val)):
    print(title)
    print(split_labels.value_counts())

Train set:
is_converted
False    3884
True     3876
Name: count, dtype: int64
Validation set:
is_converted
True     974
False    966
Name: count, dtype: int64


In [8]:
# Split the full (unbalanced) training frame 80/20 into train and validation.
# This rebinds x_train / x_val / y_train / y_val, replacing any values an
# earlier cell assigned to them.
# NOTE(review): the split is not stratified on the target; consider whether
# `stratify=` is wanted for this imbalanced label.
features = df_train.drop(columns="is_converted")
target = df_train["is_converted"]

x_train, x_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

## 3. 모델 학습

### 모델 정의 

In [9]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

# Base learners: a single-hidden-layer MLP and a shallow gradient-boosted
# tree ensemble, both with fixed seeds.
mlp_clf = MLPClassifier(hidden_layer_sizes=(300,), max_iter=1000, random_state=42)
xgb_clf = XGBClassifier(n_estimators=100, max_depth=3, random_state=42)
base_models = [
    ('mlp', mlp_clf),
    ('xgb', xgb_clf),
]

# Meta learner: an SVM that combines the base learners' outputs.
meta_model = SVC(probability=True)

# Stacking ensemble using 5-fold cross-validation; `model` is the name the
# later cells fit and predict with.
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=meta_model,
    cv=5,
)
model = stacking_model


### 모델 학습

In [10]:
# Fit the stacking model; missing feature values are replaced with 0 first.
model.fit(x_train.fillna(0), y_train)

### 모델 성능 보기

In [11]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    ``True`` is treated as the positive class for precision, recall and F1.
    The confusion matrix is laid out with rows/columns ordered
    ``[True, False]``.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels. The ``None`` default is kept only so the
            signature stays unchanged; predictions must be supplied.

    Raises:
        TypeError: If ``y_pred`` is not supplied.
    """
    if y_pred is None:
        # Fail early with a clear message instead of deep inside scikit-learn.
        raise TypeError("get_clf_eval() requires y_pred (predicted labels)")

    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # With the default binary averaging the positive class is chosen by
    # `pos_label` (the `labels` argument is not used), so all three scores
    # name the positive class the same, explicit way.
    precision = precision_score(y_test, y_pred, pos_label=True)
    recall = recall_score(y_test, y_pred, pos_label=True)
    f1 = f1_score(y_test, y_pred, pos_label=True)

    print("오차행렬:\n", confusion)
    print(f"\n정확도: {accuracy:.4f}")
    print(f"정밀도: {precision:.4f}")
    print(f"재현율: {recall:.4f}")
    print(f"F1: {f1:.4f}")

In [12]:
# Predict on the validation features (missing values replaced with 0),
# print the evaluation metrics, and show which model class produced them.
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)
print(type(model))


오차행렬:
 [[  598   349]
 [   94 10819]]

정확도: 0.9626
정밀도: 0.8642
재현율: 0.6315
F1: 0.7297
<class 'sklearn.ensemble._stacking.StackingClassifier'>


## 4. 제출하기

### 테스트 데이터 예측

In [13]:
# Build the prediction inputs: drop the target column and the "id" column
# from the test data.
x_test = df_test.drop(["is_converted", "id"], axis=1)

In [14]:
# Predict on the test features with missing values replaced by 0.
test_features = x_test.fillna(0)
test_pred = model.predict(test_features)
test_pred.sum()  # number of rows predicted True

347

### 제출 파일 작성

In [15]:
# Re-read the submission file (df_test now holds the preprocessed data).
df_sub = pd.read_csv("submission.csv")
df_sub["is_converted"] = test_pred

# Save the submission file (overwrites the file that was just read).
df_sub.to_csv("submission.csv", index=False)

#in vscode gitignore issue

**우측 상단의 제출 버튼을 클릭해 결과를 확인하세요**