### XGBClassifier(Version2) for Sales Success Prediction

In [49]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error

Dataset

In [None]:
df_train = pd.read_csv("train.csv") 
df_test = pd.read_csv("submission.csv") 

Data Processing

In [None]:
def label_encoding(series: pd.Series) -> pd.Series:

    my_dict = {}

    series = series.astype(str)

    for idx, value in enumerate(sorted(series.unique())):
        my_dict[value] = idx
    series = series.map(my_dict)

    return series

In [None]:
label_columns = [
    "customer_country",
    "business_subarea",
    "business_area",
    "business_unit",
    "customer_type",
    "enterprise",
    "customer_job",
    "inquiry_type",
    "product_category",
    "product_subcategory",
    "product_modelname",
    "customer_country.1",
    "customer_position",
    "response_corporate",
    "expected_timeline",
]

df_all = pd.concat([df_train[label_columns], df_test[label_columns]])

for col in label_columns:
    df_all[col] = label_encoding(df_all[col])
    
df_all

Unnamed: 0,customer_country,business_subarea,business_area,business_unit,customer_type,enterprise,customer_job,inquiry_type,product_category,product_subcategory,product_modelname,customer_country.1,customer_position,response_corporate,expected_timeline
0,9070,28,0,0,10,0,420,32,180,349,699,9070,47,33,246
1,8406,0,0,0,10,0,303,32,180,349,699,8406,21,33,246
2,6535,17,0,0,10,0,160,27,238,349,699,6535,80,21,246
3,3388,44,0,0,10,0,166,32,318,349,699,3388,21,21,246
4,5799,86,0,0,29,0,84,32,180,349,699,5799,80,21,246
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5266,10650,86,6,0,33,0,468,30,183,349,699,10650,9,43,271
5267,16640,86,6,3,33,0,323,30,183,349,699,16640,74,50,271
5268,1079,86,6,0,29,0,166,58,180,349,699,1079,64,43,246
5269,24,86,6,3,9,1,323,58,52,349,699,24,74,12,271


In [54]:
for col in label_columns:  
    df_train[col] = df_all.iloc[: len(df_train)][col]
    df_test[col] = df_all.iloc[len(df_train) :][col]

In [55]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59299 entries, 0 to 59298
Data columns (total 29 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   bant_submit              59299 non-null  float64
 1   customer_country         59299 non-null  int64  
 2   business_unit            59299 non-null  int64  
 3   com_reg_ver_win_rate     14568 non-null  float64
 4   customer_idx             59299 non-null  int64  
 5   customer_type            59299 non-null  int64  
 6   enterprise               59299 non-null  int64  
 7   historical_existing_cnt  13756 non-null  float64
 8   id_strategic_ver         3444 non-null   float64
 9   it_strategic_ver         1121 non-null   float64
 10  idit_strategic_ver       4565 non-null   float64
 11  customer_job             59299 non-null  int64  
 12  lead_desc_length         59299 non-null  int64  
 13  inquiry_type             59299 non-null  int64  
 14  product_category      

In [None]:
# Proceed with undersampling due to the large difference in T/F distribution of the is_converted column

# Separate rows where is_converted is T/F
df_train1 = df_train[df_train['is_converted'] == 1]
df_train2 = df_train[df_train['is_converted'] == 0]

# Select the first 25000 rows where is_converted is False (more rows)
df_train2 = df_train2.iloc[:25000]
#df_train2.info()

# Combine approximately 4800 True rows and 25000 False rows
df_train = pd.concat([df_train1, df_train2])

# Remove columns
df_train = df_train.drop(['customer_country.1', 'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver', 'lead_desc_length',
                         'customer_type', 'historical_existing_cnt', 'product_subcategory', 
                          'product_modelname', 'business_subarea'], axis=1)

# Handle missing values in the dataset with approximately 30000 rows (using median or 0)
df_train.loc[np.isnan(df_train.com_reg_ver_win_rate)==True,'com_reg_ver_win_rate'] = df_train.com_reg_ver_win_rate.median()
df_train.loc[np.isnan(df_train.ver_win_rate_x)==True,'ver_win_rate_x'] = df_train.ver_win_rate_x.median()
df_train.loc[np.isnan(df_train.ver_win_ratio_per_bu)==True,'ver_win_ratio_per_bu'] = df_train.ver_win_ratio_per_bu.median()

df_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29850 entries, 0 to 27624
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   bant_submit           29850 non-null  float64
 1   customer_country      29850 non-null  int64  
 2   business_unit         29850 non-null  int64  
 3   com_reg_ver_win_rate  29850 non-null  float64
 4   customer_idx          29850 non-null  int64  
 5   enterprise            29850 non-null  int64  
 6   customer_job          29850 non-null  int64  
 7   inquiry_type          29850 non-null  int64  
 8   product_category      29850 non-null  int64  
 9   customer_position     29850 non-null  int64  
 10  response_corporate    29850 non-null  int64  
 11  expected_timeline     29850 non-null  int64  
 12  ver_cus               29850 non-null  int64  
 13  ver_pro               29850 non-null  int64  
 14  ver_win_rate_x        29850 non-null  float64
 15  ver_win_ratio_per_bu  29

In [57]:
x_train, x_val, y_train, y_val = train_test_split(
    df_train.drop("is_converted", axis=1),
    df_train["is_converted"],
    test_size=0.2,
    shuffle=True,
    random_state=400,
)

Model Training

In [58]:
#model = DecisionTreeClassifier()
model = xgb.XGBClassifier(n_estimators=100)

In [59]:
#model.fit(x_train.fillna(0), y_train)
model.fit(x_train.fillna(0), y_train)

In [60]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [61]:
pred = model.predict(x_val.fillna(0))
get_clf_eval(y_val, pred)

오차행렬:
 [[ 806  164]
 [  74 4926]]

정확도: 0.9601
정밀도: 0.9159
재현율: 0.8309
F1: 0.8714
