# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score

### 데이터 셋 읽어오기

In [4]:
# Read the pre-encoded training and submission tables from disk.
train_data = pd.read_csv('data/encoded_train.csv')
submission_data = pd.read_csv('data/encoded_submission.csv')

# Remove the 'id' column from the submission table in place.
submission_data.drop(columns=['id'], inplace=True)

# Training table: every column except 'is_converted' is a feature,
# and 'is_converted' itself is the target.
X_train = train_data.drop(columns=['is_converted'])
y_train = train_data['is_converted']

# Submission table: the same feature/target separation.
# NOTE(review): whether 'is_converted' in the submission file holds real
# labels is not visible here - confirm before evaluating against y_submission.
X_submission = submission_data.drop(columns=['is_converted'])
y_submission = submission_data['is_converted']

# X_train / y_train feed model training; X_submission is what the fitted
# model predicts on further below.

In [5]:
# Display the column labels of the training table.
train_data.columns

Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

In [6]:

# Display the column labels of the submission table.
submission_data.columns

Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

In [7]:
# Print the number of columns of each table: submission first, then train.
print(submission_data.shape[1], train_data.shape[1], sep="\n")

# Column labels that exist in the training table but not in the submission
# table (one direction only).
columns_in_train_not_in_submission = set(train_data.columns).difference(
    submission_data.columns
)

columns_in_train_not_in_submission

28
28


set()

## 2. 데이터 전처리

### 2-2. 학습, 검증 데이터 분리

In [8]:
# Turn +/-inf into NaN so that a single isna() scan detects both kinds of
# unusable value.
train_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# X_train was created earlier with train_data.drop(...), which returns a new
# object, so the replacement above does not reach it. Apply the same
# replacement to the feature frame that is actually passed to the model.
X_train.replace([np.inf, -np.inf], np.nan, inplace=True)

# Report whether any NaN (including values that were infinite) is present.
if train_data.isna().any().any():
    print("DataFrame contains NaN or infinite values.")
    # Nothing is imputed here. One option would be filling with a constant:
    # train_data.fillna(0, inplace=True)
else:
    print("DataFrame does not contain any NaN or infinite values.")

DataFrame does not contain any NaN or infinite values.


In [9]:
# Turn +/-inf into NaN so that a single isna() scan detects both kinds of
# unusable value.
submission_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# X_submission was created earlier with submission_data.drop(...), which
# returns a new object, so the replacement above does not reach it. Apply the
# same replacement to the feature frame that is actually used for prediction.
X_submission.replace([np.inf, -np.inf], np.nan, inplace=True)

# Report whether any NaN (including values that were infinite) is present.
if submission_data.isna().any().any():
    print("DataFrame contains NaN or infinite values.")
    # Nothing is imputed here. One option would be filling with a constant:
    # submission_data.fillna(0, inplace=True)
else:
    print("DataFrame does not contain any NaN or infinite values.")

DataFrame does not contain any NaN or infinite values.


In [10]:
# Hold out 20% of the training rows as a validation set. The fixed
# random_state makes the split reproducible across runs.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)



## 3. 모델 학습

### 모델 정의 및 학습

In [11]:
# Build a decision tree with a fixed seed so repeated runs produce the same tree.
dt_classifier = DecisionTreeClassifier(random_state=42)

# Fit on the training portion of the split only; the validation rows are
# kept aside for the evaluation cells below.
dt_classifier.fit(X=X_train_split, y=y_train_split)



### 검증 데이터 예측 및 정확도 확인

In [12]:
# Predict on the validation set
y_pred = dt_classifier.predict(X_val)

# Evaluate the model
print("Accuracy on validation set:", accuracy_score(y_val, y_pred))


Accuracy on validation set: 0.957422015059161


### 모델 성능 보기

In [13]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix and binary classification metrics.

    Args:
        y_test: True labels.
        y_pred: Predicted labels for the same samples. The ``None`` default
            is kept for signature compatibility, but a value is required.

    Raises:
        ValueError: If ``y_pred`` is not supplied.
    """
    # Fail early with a clear message instead of an opaque error from sklearn.
    if y_pred is None:
        raise ValueError("y_pred is required: pass the predicted labels.")

    # labels=[True, False] orders rows/columns with the positive class first.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # All three scores rely on sklearn's default average='binary', which
    # reports the pos_label class only; a `labels` argument adds nothing in
    # that mode, so the three calls are written the same way.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    print("오차행렬:\n", confusion)
    print(f"\n정확도: {accuracy:.4f}")
    print(f"정밀도: {precision:.4f}")
    print(f"재현율: {recall:.4f}")
    print(f"F1: {f1:.4f}")

In [14]:
# Print the confusion matrix and metrics for the validation predictions.
get_clf_eval(y_val, y_pred)

오차행렬:
 [[ 696  230]
 [ 245 9985]]

정확도: 0.9574
정밀도: 0.7396
재현율: 0.7516
F1: 0.7456


## 4. 제출하기

### 테스트 데이터 예측

In [15]:
# Correct prediction on the submission/validation dataset
submission_predictions = dt_classifier.predict(X_submission)  # Use X_submission instead of submission_data

# Optionally, if you want to add predictions to the submission dataframe to check performance:
# Note: This is useful if you're examining the model's performance and not for final submission where you wouldn't have 'is_converted'
submission_data['predicted_is_converted'] = submission_predictions  # Use a new column for predictions

In [16]:
# Display the submission table, including the added prediction column.
submission_data

Unnamed: 0,bant_submit,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,historical_existing_cnt,id_strategic_ver,it_strategic_ver,idit_strategic_ver,customer_job,...,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted,enterprise_Enterprise,enterprise_SMB,predicted_is_converted
0,0.00,0.224035,0.073248,47466,0.251652,53,0,0,0,0.255034,...,0,0.001183,0.049840,0.244980,0.192308,278,1,True,False,0
1,0.25,0.492788,0.102336,5405,0.253414,0,0,0,0,0.224056,...,0,0.000013,0.058388,0.532544,0.439163,437,1,False,True,1
2,1.00,0.226788,0.102336,13597,0.313694,0,0,0,0,0.266667,...,0,0.000060,0.131148,0.311321,0.100000,874,0,False,True,1
3,0.50,0.225875,0.118644,17204,0.054889,0,0,0,0,0.277027,...,0,0.001183,0.049840,0.265306,0.224056,194,0,True,False,0
4,1.00,0.225875,0.074949,2329,0.253823,2,1,0,1,0.220183,...,1,0.003079,0.064566,0.318966,0.241379,167,0,True,False,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,0.50,0.163740,0.102336,40292,0.057216,10,0,0,0,0.296552,...,0,0.001093,0.058388,0.224056,0.224056,97,0,True,False,1
5267,0.25,0.495215,0.102336,47466,0.051390,0,0,0,0,0.224056,...,0,0.001093,0.058388,0.224056,0.224056,438,0,True,False,1
5268,0.75,0.163117,0.102336,46227,0.312000,0,0,0,0,0.304762,...,0,0.001093,0.058388,0.224056,0.224056,97,1,True,False,1
5269,0.00,0.493827,0.102336,45667,0.256660,0,0,0,0,0.224056,...,0,0.001093,0.058388,0.224056,0.224056,429,0,False,True,0


### 제출 파일 작성

In [17]:
# Write the submission table (with the prediction column) to CSV; index=False
# leaves the row index out of the file.
submission_data.to_csv('predicted_submission.csv', index=False)