# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, mean_squared_error, classification_report
import xgboost as xgb

### 데이터 셋 읽어오기

In [2]:
# Read the pre-encoded training and submission tables from disk.
train_data = pd.read_csv('encoded_train_two.csv')
submission_data = pd.read_csv('encoded_submission_two.csv')

# Remove the 'id' column from the submission table in place.
submission_data.drop(columns=['id'], inplace=True)

# Training table: 'is_converted' is the target, every other column is a feature.
y_train = train_data['is_converted']
X_train = train_data.drop(columns=['is_converted'])

# Submission table, separated into features / target the same way.
# NOTE(review): later cells score predictions against y_submission — confirm
# that the submission file's 'is_converted' column holds real labels and not
# placeholder values.
y_submission = submission_data['is_converted']
X_submission = submission_data.drop(columns=['is_converted'])

In [3]:
# Display the column labels of the training feature matrix.
X_train.columns

Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'enterprise_Enterprise',
       'enterprise_SMB'],
      dtype='object')

In [4]:

# Display the column labels of the submission feature matrix.
X_submission.columns

Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'enterprise_Enterprise',
       'enterprise_SMB'],
      dtype='object')

In [5]:
# Sanity check: print the number of feature columns in each matrix so the
# two counts can be compared by eye.
print(len(X_train.columns))
print(len(X_submission.columns))

# Column labels present in the training features but absent from the
# submission features.
columns_in_train_not_in_submission = set(X_train.columns) - set(X_submission.columns)

# Print the set itself; a no-argument .difference() call only returns a
# shallow copy of the same set, so it added nothing.
print(columns_in_train_not_in_submission)

27
27
set()


## 2. 데이터 전처리

### 2-2. 학습, 검증 데이터 분리

In [6]:
# Turn +/-inf into NaN (in place) so that one missing-value scan covers both.
train_data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

# Report whether any cell of train_data is NaN after the replacement.
# Nothing is imputed or dropped here; this cell only reports.
# NOTE(review): X_train was produced by train_data.drop(...) before this cell
# runs, so presumably it is not affected by this in-place replacement — confirm.
print(
    "DataFrame contains NaN or infinite values."
    if train_data.isna().any().any()
    else "DataFrame does not contain any NaN or infinite values."
)

DataFrame does not contain any NaN or infinite values.


In [7]:
# Turn +/-inf into NaN (in place) so that one missing-value scan covers both.
submission_data.replace(to_replace=[np.inf, -np.inf], value=np.nan, inplace=True)

# Report whether any cell of submission_data is NaN after the replacement.
# Nothing is imputed or dropped here; this cell only reports.
# NOTE(review): X_submission was produced by submission_data.drop(...) before
# this cell runs, so presumably it is not affected by this in-place
# replacement — confirm.
print(
    "DataFrame contains NaN or infinite values."
    if submission_data.isna().any().any()
    else "DataFrame does not contain any NaN or infinite values."
)

DataFrame does not contain any NaN or infinite values.


In [8]:
# Hold out 20% of the training rows as a validation set. random_state is
# fixed so the same partition is produced on every run.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)



## 3. 모델 학습

In [9]:
# LightGBM booster parameters (commented out and unused; the model defined below is an XGBoost classifier)
# param = {
#     'num_leaves':31, 
#     'objective':'binary',
#     'metric':'logloss',
#     'learning_rate':0.05,
#     'verbosity': 3
#     }


### 모델 정의 

In [10]:


# Build the classifier: XGBoost's scikit-learn style wrapper (not LightGBM),
# configured with log-loss as the evaluation metric and a fixed random_state.
model = xgb.XGBClassifier(
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False,
)

### 모델 학습

In [11]:
# Train the XGBoost classifier on the training portion of the split.
# The labels must be y_train_split, which covers the same rows as
# X_train_split; the full y_train has more rows than X_train_split and makes
# fit() fail on the feature/label length mismatch.
# eval_set makes XGBoost evaluate the configured eval_metric on the held-out
# split after every boosting round and log it as "validation_0-<metric>".
model.fit(
    X_train_split,
    y_train_split,
    eval_set=[(X_val, y_val)],
    verbose=True,
)

[0]	validation_0-logloss:0.22102
[1]	validation_0-logloss:0.18566
[2]	validation_0-logloss:0.16206
[3]	validation_0-logloss:0.14627




[4]	validation_0-logloss:0.13325
[5]	validation_0-logloss:0.12255
[6]	validation_0-logloss:0.11603
[7]	validation_0-logloss:0.11095
[8]	validation_0-logloss:0.10611
[9]	validation_0-logloss:0.10336
[10]	validation_0-logloss:0.10100
[11]	validation_0-logloss:0.09837
[12]	validation_0-logloss:0.09695
[13]	validation_0-logloss:0.09619
[14]	validation_0-logloss:0.09591


[15]	validation_0-logloss:0.09414
[16]	validation_0-logloss:0.09346
[17]	validation_0-logloss:0.09341
[18]	validation_0-logloss:0.09250
[19]	validation_0-logloss:0.09224
[20]	validation_0-logloss:0.09139
[21]	validation_0-logloss:0.09115
[22]	validation_0-logloss:0.09105
[23]	validation_0-logloss:0.09100
[24]	validation_0-logloss:0.09068
[25]	validation_0-logloss:0.08952
[26]	validation_0-logloss:0.08933
[27]	validation_0-logloss:0.08939
[28]	validation_0-logloss:0.08913
[29]	validation_0-logloss:0.08879
[30]	validation_0-logloss:0.08838
[31]	validation_0-logloss:0.08814
[32]	validation_0-logloss:0.08771
[33]	validation_0-logloss:0.08734
[34]	validation_0-logloss:0.08693
[35]	validation_0-logloss:0.08714
[36]	validation_0-logloss:0.08688
[37]	validation_0-logloss:0.08660
[38]	validation_0-logloss:0.08617
[39]	validation_0-logloss:0.08617
[40]	validation_0-logloss:0.08635
[41]	validation_0-logloss:0.08587
[42]	validation_0-logloss:0.08595
[43]	validation_0-logloss:0.08555
[44]	validatio

### 모델 성능 보기

In [12]:
# Predict a class label for every row of the submission feature matrix.
y_pred = model.predict(X_submission)

# Per-class precision / recall / F1 of those predictions, measured against
# the 'is_converted' column that came with the submission file.
# NOTE(review): this scores against the submission file's own labels, not the
# held-out X_val / y_val split — confirm those labels are meaningful.
print(classification_report(y_true=y_submission, y_pred=y_pred))


              precision    recall  f1-score   support

           0       0.84      0.70      0.76      4090
           1       0.34      0.55      0.42      1181

    accuracy                           0.66      5271
   macro avg       0.59      0.62      0.59      5271
weighted avg       0.73      0.66      0.69      5271



In [13]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and headline binary-classification metrics.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels aligned with ``y_test``. The ``None`` default
            is kept only for signature compatibility; a value is required.

    Raises:
        ValueError: If ``y_pred`` is not supplied.
    """
    if y_pred is None:
        raise ValueError("y_pred must be provided")

    # labels=[True, False] puts the positive class in the first row/column.
    # True == 1 and False == 0 in Python, so the same ordering applies to
    # 0/1 integer labels.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)

    # Precision, recall and F1 all use scikit-learn's default binary averaging
    # (positive label 1). The `labels` argument only applies when
    # average != 'binary', so it is not passed; all three calls now agree.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    # Printed labels (Korean): confusion matrix, accuracy, precision, recall, F1.
    print("오차행렬:\n", confusion)
    print(f"\n정확도: {accuracy:.4f}")
    print(f"정밀도: {precision:.4f}")
    print(f"재현율: {recall:.4f}")
    print(f"F1: {f1:.4f}")

In [14]:
get_clf_eval(y_submission, y_pred)

오차행렬:
 [[ 649  532]
 [1234 2856]]

정확도: 0.6650
정밀도: 0.3447
재현율: 0.5495
F1: 0.4236


## 4. 제출하기

### 테스트 데이터 예측

In [15]:
# Load the submission table and store the model's predictions in its
# 'is_converted' column.
# NOTE(review): y_pred was computed from 'encoded_submission_two.csv', while
# this cell reads 'data/encoded_submission.csv' — confirm both files list the
# same rows in the same order, otherwise predictions land on the wrong rows.
submission = pd.read_csv('data/encoded_submission.csv').assign(is_converted=y_pred)

In [16]:
# Display the submission table, which now carries the predicted 'is_converted' values.
submission

Unnamed: 0,id,bant_submit,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,...,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted,enterprise_Enterprise,enterprise_SMB
0,19844,0.00,0.224035,0.073248,47466,0.251652,53,0,0,0,...,1,0,0.001183,0.049840,0.244980,0.192308,278,1,True,False
1,9738,0.25,0.492788,0.102336,5405,0.253414,0,0,0,0,...,0,0,0.000013,0.058388,0.532544,0.439163,437,0,False,True
2,8491,1.00,0.226788,0.102336,13597,0.313694,0,0,0,0,...,0,0,0.000060,0.131148,0.311321,0.100000,874,0,False,True
3,19895,0.50,0.225875,0.118644,17204,0.054889,0,0,0,0,...,0,0,0.001183,0.049840,0.265306,0.224056,194,0,True,False
4,10465,1.00,0.225875,0.074949,2329,0.253823,2,1,0,1,...,1,1,0.003079,0.064566,0.318966,0.241379,167,1,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,13855,0.50,0.163740,0.102336,40292,0.057216,10,0,0,0,...,0,0,0.001093,0.058388,0.224056,0.224056,97,0,True,False
5267,7979,0.25,0.495215,0.102336,47466,0.051390,0,0,0,0,...,0,0,0.001093,0.058388,0.224056,0.224056,438,0,True,False
5268,12887,0.75,0.163117,0.102336,46227,0.312000,0,0,0,0,...,0,0,0.001093,0.058388,0.224056,0.224056,97,0,True,False
5269,17530,0.00,0.493827,0.102336,45667,0.256660,0,0,0,0,...,0,0,0.001093,0.058388,0.224056,0.224056,429,0,False,True


### 제출 파일 작성

In [17]:
# submission.to_csv('submission.csv', index=False)