# 영업 성공 여부 분류 경진대회

## 1. 데이터 확인

### 필수 라이브러리

In [29]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_classification
from sklearn.metrics import accuracy_score, confusion_matrix, recall_score, precision_score, f1_score, mean_squared_error, classification_report
import lightgbm as lgb

### 데이터 셋 읽어오기

In [2]:
# Read the pre-encoded training and submission tables from disk.
train_data = pd.read_csv('data/encoded_train.csv')
submission_data = pd.read_csv('data/encoded_submission.csv')

# Remove the 'id' column from the submission table in place.
submission_data.drop(columns=['id'], inplace=True)

# Training table: every column except the target is a feature.
X_train = train_data.drop(columns='is_converted')
y_train = train_data['is_converted']

# Submission table: split the same way as the training table.
X_submission = submission_data.drop(columns='is_converted')
y_submission = submission_data['is_converted']

# X_train / y_train feed model training.
# X_submission / y_submission are used further down for prediction and scoring.

In [25]:
# Show the concrete type of the loaded training table.
type(train_data)

pandas.core.frame.DataFrame

In [4]:

# Display the column labels of the submission table.
submission_data.columns

Index(['bant_submit', 'business_unit', 'com_reg_ver_win_rate', 'customer_idx',
       'customer_type', 'historical_existing_cnt', 'id_strategic_ver',
       'it_strategic_ver', 'idit_strategic_ver', 'customer_job',
       'lead_desc_length', 'product_category', 'product_subcategory',
       'product_modelname', 'customer_country.1', 'customer_position',
       'response_corporate', 'expected_timeline', 'ver_cus', 'ver_pro',
       'ver_win_rate_x', 'ver_win_ratio_per_bu', 'business_area',
       'business_subarea', 'lead_owner', 'is_converted',
       'enterprise_Enterprise', 'enterprise_SMB'],
      dtype='object')

In [5]:
# Column counts: submission table first, then training table.
print(submission_data.shape[1])
print(train_data.shape[1])

# Columns present in the training table but absent from the submission table.
columns_in_train_not_in_submission = set(train_data.columns).difference(submission_data.columns)

columns_in_train_not_in_submission

28
28


set()

## 2. 데이터 전처리

### 2-2. 학습, 검증 데이터 분리

In [6]:
# Convert +/-inf to NaN in place so a single missing-value check covers both cases.
# NOTE(review): X_train / y_train were built from train_data in an earlier cell via
# drop(), which returns a new object - this in-place replacement presumably does
# not reach them; confirm.
train_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Report whether any cell of the table is NaN (which now includes former infinities).
print(
    "DataFrame contains NaN or infinite values."
    if train_data.isna().to_numpy().any()
    else "DataFrame does not contain any NaN or infinite values."
)

DataFrame does not contain any NaN or infinite values.


In [7]:
# Convert +/-inf to NaN in place so a single missing-value check covers both cases.
# NOTE(review): X_submission / y_submission were built from submission_data in an
# earlier cell via drop(), which returns a new object - this in-place replacement
# presumably does not reach them; confirm.
submission_data.replace([np.inf, -np.inf], np.nan, inplace=True)

# Report whether any cell of the table is NaN (which now includes former infinities).
print(
    "DataFrame contains NaN or infinite values."
    if submission_data.isna().to_numpy().any()
    else "DataFrame does not contain any NaN or infinite values."
)

DataFrame does not contain any NaN or infinite values.


In [8]:
# Hold out 20% of the training rows for validation; the fixed seed makes the split repeatable.
X_train_split, X_val, y_train_split, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)



## 3. 모델 학습

In [26]:
# LightGBM booster parameters for a binary classifier.
param = {
    'num_leaves': 31,
    'objective': 'binary',
    # LightGBM's documented name for binary log loss is 'binary_logloss';
    # plain 'logloss' is not listed among its metric names or aliases.
    'metric': 'binary_logloss',
    'learning_rate': 0.05,
    # Values above 1 enable debug-level logging.
    'verbosity': 3,
}


### 모델 정의 

In [27]:


# Build the LightGBM classifier, passing every entry of `param` as a keyword argument.
gbm = lgb.LGBMClassifier(**param)

### 모델 학습

In [28]:
# Fit on the training split while scoring log loss on the held-out split;
# the early-stopping callback is configured with 5 rounds.
gbm.fit(
    X_train_split,
    y_train_split,
    eval_set=[(X_val, y_val)],
    eval_metric='logloss',
    callbacks=[lgb.early_stopping(5)],
)

[LightGBM] [Info] Number of positive: 3694, number of negative: 40930
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.874301
[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.373896
[LightGBM] [Debug] init for col-wise cost 0.011579 seconds, init for row-wise cost 0.011053 seconds
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.014121 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Debug] Using Sparse Multi-Val Bin
[LightGBM] [Info] Total Bins 2206
[LightGBM] [Info] Number of data points in the train set: 44624, number of used features: 27
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.082781 -> initscore=-2.405153
[LightGBM] [Info] Start training from score -2.405153
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
Training until validation scores don't improve for 5 rounds
[LightGBM] [De

### 모델 성능 보기

In [30]:
# Predict labels for the submission features using the best early-stopping iteration.
# NOTE: this scores X_submission / y_submission, not the X_val / y_val hold-out split.
y_pred = gbm.predict(X_submission, num_iteration=gbm.best_iteration_)

# Per-class precision, recall and F1 against the labels stored in the submission table.
report = classification_report(y_submission, y_pred)
print(report)


              precision    recall  f1-score   support

           0       0.86      0.89      0.87      4090
           1       0.57      0.48      0.52      1181

    accuracy                           0.80      5271
   macro avg       0.71      0.69      0.70      5271
weighted avg       0.79      0.80      0.79      5271



In [31]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Args:
        y_test: Ground-truth labels.
        y_pred: Predicted labels for the same samples as ``y_test``. The
            ``None`` default is kept only for backward compatibility of the
            signature; predictions are required.

    Raises:
        ValueError: If ``y_pred`` is not provided.
    """
    # Fail early with a clear message instead of passing None into the metric functions.
    if y_pred is None:
        raise ValueError("y_pred is required: pass the predicted labels for y_test.")

    # labels=[True, False] orders the matrix with the positive class first.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [32]:
# Print the metric summary for the submission predictions.
# NOTE(review): y_submission and y_pred must have the same length; the error recorded
# below this cell suggests they came from different datasets when it ran - confirm.
get_clf_eval(y_submission, y_pred)

ValueError: Found input variables with inconsistent numbers of samples: [11156, 5271]

## 4. 제출하기

### 테스트 데이터 예측

In [145]:
# Predict labels for the submission features with the fitted LightGBM model.
# (`dt_classifier` is never defined in this notebook; `gbm` is the model trained earlier.)
submission_predictions = gbm.predict(X_submission, num_iteration=gbm.best_iteration_)

# Store the predictions in their own column so the existing 'is_converted' column is left untouched.
submission_data['predicted_is_converted'] = submission_predictions

### 제출 파일 작성

In [146]:
# Write the submission table to CSV, omitting the DataFrame index.
submission_data.to_csv('predicted_submission.csv', index=False)