### Ensemble Model for Sales Success Prediction

Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
#from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder

Dataset

In [2]:
# Load the two CSV files used by the notebook from the working directory.
df_train = pd.read_csv("train.csv") # training data
df_test = pd.read_csv("submission.csv") # test data (rows from the submission file)

In [3]:
df_train.head() # preview the first rows of the training data

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


Data Processing

In [None]:
# Handling Imbalanced Data
# Keep every converted row, cap the non-converted rows at the first 25,000
# (in their current row order), then stack the two subsets back together.
# NOTE(review): the cap is positional, not a random sample - confirm the row
# order of train.csv carries no signal.
df_train1 = df_train.loc[df_train['is_converted'] == 1]
df_train2 = df_train.loc[df_train['is_converted'] == 0].head(25000)
df_train = pd.concat([df_train1, df_train2], axis=0)

In [None]:
# Handling Missing Values
# Impute the numeric columns listed below with the median of the training
# data, and reuse that same training median for df_test so both frames are
# filled with identical values.
numerical_cols_with_nan = ['com_reg_ver_win_rate', 'historical_existing_cnt', 'ver_win_rate_x', 'ver_win_ratio_per_bu']
for col in numerical_cols_with_nan:
    train_median = df_train[col].median()
    # Assign the filled column back rather than calling
    # df_train[col].fillna(..., inplace=True): the chained in-place form works
    # on an intermediate object and leaves the frame untouched under pandas
    # Copy-on-Write.
    df_train[col] = df_train[col].fillna(train_median)
    # Skip columns the test frame does not have instead of raising KeyError.
    if col in df_test.columns:
        df_test[col] = df_test[col].fillna(train_median)

In [6]:
# Columns removed from the training frame before the features are built.
drop_columns = [
    'id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver',
    'lead_desc_length', 'historical_existing_cnt',
    'product_subcategory', 'product_modelname',
    'expected_timeline', 'business_subarea',
]
df_train = df_train.drop(columns=drop_columns)

In [7]:
# Split each frame into its label column (cast to float) and its feature columns.
# NOTE(review): y_test is read from submission.csv - confirm that file carries
# real labels before trusting any metric computed against it.
TARGET_COL = 'is_converted'

X_train = df_train.drop(columns=[TARGET_COL])
y_train = df_train[TARGET_COL].astype(float)

X_test = df_test.drop(columns=[TARGET_COL])
y_test = df_test[TARGET_COL].astype(float)

In [8]:
# Identify and encode categorical columns
# Every object-dtype column of X_train is replaced by integer codes in both
# frames. The encoder is refit per column on the train and test values
# together, so a value that occurs in only one frame still receives a code.
categorical_cols = X_train.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    # Missing entries become their own 'missing' category before encoding.
    train_labels = X_train[col].fillna('missing').astype(str)
    test_labels = X_test[col].fillna('missing').astype(str)
    le.fit(pd.concat([train_labels, test_labels]))
    X_train[col] = le.transform(train_labels)
    X_test[col] = le.transform(test_labels)


In [10]:
# Ensure that both datasets have the same columns after encoding
# (the inner join keeps only the columns present in both frames).
X_train, X_test = X_train.align(X_test, join='inner', axis=1)

# Fill missing values in numerical columns
# The fill values are computed once from the training frame and applied to
# both frames, so the test rows are imputed with the same statistics the
# model sees during training instead of statistics of the test set itself.
train_means = X_train.mean()
X_train = X_train.fillna(train_means)
X_test = X_test.fillna(train_means)

Modeling

In [11]:
# Define the ensemble model
# One shared seed for every base learner so a fresh run reproduces the same fit.
SEED = 42
clf1 = LogisticRegression(random_state=SEED, max_iter=1000)
clf2 = RandomForestClassifier(random_state=SEED)
clf3 = GradientBoostingClassifier(random_state=SEED)
clf4 = xgb.XGBClassifier(n_estimators=100, random_state=SEED)

# Soft voting combines the predicted class probabilities of the four models
# using the weights below (same order as the estimators). XGBoost carries
# 1.2 of the 1.5 total weight; the other three models contribute 0.1 each.
ensemble_clf = VotingClassifier(
    estimators=[
        ('lr', clf1),
        ('rf', clf2),
        ('gbc', clf3),
        ('xgb', clf4),
    ],
    voting='soft',
    weights=[0.1, 0.1, 0.1, 1.2],
)

In [12]:
# Train the ensemble model
# Fits the voting ensemble on the prepared training features and labels.
ensemble_clf.fit(X_train, y_train)

In [13]:
def get_clf_eval(y_test, y_pred=None):
    """Print the confusion matrix, accuracy, precision, recall and F1 score.

    Parameters
    ----------
    y_test : array-like
        Reference binary labels.
    y_pred : array-like
        Predicted labels. The ``None`` default is kept only so the signature
        stays backward-compatible; a value must always be supplied.

    Returns
    -------
    None
        The metrics are printed, not returned.

    Raises
    ------
    TypeError
        If ``y_pred`` is not supplied.
    """
    if y_pred is None:
        # Fail here with a clear message instead of deep inside scikit-learn.
        raise TypeError("get_clf_eval() requires y_pred (the predicted labels)")

    # labels=[True, False] fixes the matrix layout: the positive class comes
    # first in both rows and columns.
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    # Precision, recall and F1 use scikit-learn's default binary averaging
    # (positive label 1). `labels` has no effect in that mode, so it is left
    # out of all three calls to keep them consistent.
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred)

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [15]:
# Predict and evaluate the model
# NOTE(review): y_test is the is_converted column of submission.csv. If that
# column is a placeholder or earlier predictions rather than ground truth,
# the metrics printed below do not measure generalization - confirm, or
# evaluate on a hold-out split of train.csv instead.
y_pred = ensemble_clf.predict(X_test)
get_clf_eval(y_test, y_pred)

오차행렬:
 [[1468    0]
 [   0 3803]]

정확도: 1.0000
정밀도: 1.0000
재현율: 1.0000
F1: 1.0000
