### Ensemble Model(+Optuna) for Sales Success Prediction

Library

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)
#from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error

from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import optuna
import plotly

  from .autonotebook import tqdm as notebook_tqdm


Dataset

In [None]:
df_train = pd.read_csv("train.csv") 
df_test = pd.read_csv("submission.csv")

In [None]:
df_train.head()

Unnamed: 0,bant_submit,customer_country,business_unit,com_reg_ver_win_rate,customer_idx,customer_type,enterprise,historical_existing_cnt,id_strategic_ver,it_strategic_ver,...,response_corporate,expected_timeline,ver_cus,ver_pro,ver_win_rate_x,ver_win_ratio_per_bu,business_area,business_subarea,lead_owner,is_converted
0,1.0,/Quezon City/Philippines,AS,0.066667,32160,End-Customer,Enterprise,,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Engineering,0,True
1,1.0,/PH-00/Philippines,AS,0.066667,23122,End-Customer,Enterprise,12.0,,,...,LGEPH,less than 3 months,1,0,0.003079,0.026846,corporate / office,Advertising,1,True
2,1.0,/Kolkata /India,AS,0.088889,1755,End-Customer,Enterprise,144.0,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,Construction,2,True
3,1.0,/Bhubaneswar/India,AS,0.088889,4919,End-Customer,Enterprise,,,,...,LGEIL,less than 3 months,1,0,0.003079,0.026846,corporate / office,IT/Software,3,True
4,1.0,/Hyderabad/India,AS,0.088889,17126,Specifier/ Influencer,Enterprise,,,,...,LGEIL,less than 3 months,0,0,0.003079,0.026846,corporate / office,,4,True


Data Processing

In [4]:
df_train1 = df_train[df_train['is_converted'] == 1]
df_train2 = df_train[df_train['is_converted'] == 0].iloc[:25000]
df_train = pd.concat([df_train1, df_train2])

In [5]:
numerical_cols_with_nan = ['com_reg_ver_win_rate', 'historical_existing_cnt', 'ver_win_rate_x', 'ver_win_ratio_per_bu']
for col in numerical_cols_with_nan:
    df_train[col].fillna(df_train[col].median(), inplace=True)

In [6]:
drop_columns = ['id_strategic_ver', 'it_strategic_ver', 'idit_strategic_ver', 'lead_desc_length', 'historical_existing_cnt', 'product_subcategory',
                'product_modelname', 'expected_timeline', 'business_subarea']
df_train.drop(drop_columns, axis=1, inplace=True)

In [7]:
# Prepare target and features for training
y_train = df_train['is_converted'].astype(float)
X_train = df_train.drop('is_converted', axis=1)
y_test = df_test['is_converted'].astype(float)
X_test = df_test.drop('is_converted', axis=1)

In [8]:
# Identify and encode categorical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns
le = LabelEncoder()
for col in categorical_cols:
    X_train[col] = X_train[col].fillna('missing')
    X_test[col] = X_test[col].fillna('missing')
    combined_data = pd.concat([X_train[col], X_test[col]]).astype(str)
    le.fit(combined_data)
    X_train[col] = le.transform(X_train[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))


In [10]:
# Ensure that both datasets have the same columns after encoding
X_train, X_test = X_train.align(X_test, join='inner', axis=1)

# Fill missing values in numerical columns
X_train.fillna(X_train.mean(), inplace=True)
X_test.fillna(X_test.mean(), inplace=True)

Model Training

In [12]:
#optuna

def objective(trial):
    # Hyperparameter search spaces
    lr_C = trial.suggest_float('lr_C', 1e-10, 1e10, log=True)
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 10, 1000)
    rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32, log=True)
    rf_max_features = trial.suggest_categorical('rf_max_features', ['sqrt', 'log2'])
    gbc_n_estimators = trial.suggest_int('gbc_n_estimators', 10, 1000)
    gbc_learning_rate = trial.suggest_float('gbc_learning_rate', 0.01, 1, log=True)
    gbc_subsample = trial.suggest_float('gbc_subsample', 0.1, 1.0)
    
    xgb_n_estimators = trial.suggest_int('xgb_n_estimators', 10, 1000)
    xgb_max_depth = trial.suggest_int('xgb_max_depth', 2, 10)
    xgb_learning_rate = trial.suggest_float('xgb_learning_rate', 0.01, 1, log=True)
    xgb_min_child_weight = trial.suggest_int('xgb_min_child_weight', 1, 6)
    xgb_subsample = trial.suggest_float('xgb_subsample', 0.1, 1.0)
    xgb_colsample_bytree = trial.suggest_float('xgb_colsample_bytree', 0.1, 1.0)
    
    
    # Define the ensemble model
    clf1 = LogisticRegression(C=lr_C, random_state=42, max_iter=2000)
    clf2 = RandomForestClassifier(n_estimators=rf_n_estimators, max_depth=rf_max_depth, max_features=rf_max_features, random_state=42)
    clf3 = GradientBoostingClassifier(n_estimators=gbc_n_estimators, learning_rate=gbc_learning_rate, subsample=gbc_subsample, random_state=42)
    clf4 = xgb.XGBClassifier(n_estimators=xgb_n_estimators, max_depth=xgb_max_depth, learning_rate=xgb_learning_rate, min_child_weight=xgb_min_child_weight, subsample=xgb_subsample, colsample_bytree=xgb_colsample_bytree, use_label_encoder=False, eval_metric='logloss')
    
    # Assign weights based on hypothetical performance (e.g., 1 for LR, 2 for RF, 3 for GBC)
    ensemble_clf = VotingClassifier(estimators=[
    ('lr', clf1),
    ('rf', clf2),
    ('gbc', clf3),
    ('xgb', clf4)],
    voting='soft', weights=[0.1, 0.1, 0.1, 1.2])
    
    # Train the ensemble model
    ensemble_clf.fit(X_train, y_train)
    
     # Predict and evaluate the model
    y_pred = ensemble_clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    return accuracy

In [13]:
# Optuna optimization
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000)  # Adjust the number of trials as necessary

print("Number of finished trials: ", len(study.trials))
print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

[I 2024-02-17 22:21:55,765] A new study created in memory with name: no-name-fe1fa8cb-6858-40d7-b73f-ec8f65f413bb
[I 2024-02-17 22:22:08,389] Trial 0 finished with value: 0.882564978182508 and parameters: {'lr_C': 0.0001341000099645994, 'rf_n_estimators': 496, 'rf_max_depth': 3, 'rf_max_features': 'sqrt', 'gbc_n_estimators': 313, 'gbc_learning_rate': 0.3652415465086137, 'gbc_subsample': 0.5461171247771311, 'xgb_n_estimators': 177, 'xgb_max_depth': 5, 'xgb_learning_rate': 0.3331741746957007, 'xgb_min_child_weight': 6, 'xgb_subsample': 0.26878354849401676, 'xgb_colsample_bytree': 0.6428992369769699}. Best is trial 0 with value: 0.882564978182508.
[I 2024-02-17 22:22:58,919] Trial 1 finished with value: 0.8381711250237147 and parameters: {'lr_C': 0.0006490053937062986, 'rf_n_estimators': 920, 'rf_max_depth': 25, 'rf_max_features': 'log2', 'gbc_n_estimators': 723, 'gbc_learning_rate': 0.010797828326514254, 'gbc_subsample': 0.959649321568127, 'xgb_n_estimators': 985, 'xgb_max_depth': 5, 'xg

Number of finished trials:  1000
Best trial:
  Value:  0.9797002466325175
  Params: 
    lr_C: 2.3259399574003422e-07
    rf_n_estimators: 372
    rf_max_depth: 9
    rf_max_features: sqrt
    gbc_n_estimators: 22
    gbc_learning_rate: 0.05823216213040342
    gbc_subsample: 0.29165456985604404
    xgb_n_estimators: 953
    xgb_max_depth: 6
    xgb_learning_rate: 0.013929046450122077
    xgb_min_child_weight: 5
    xgb_subsample: 0.9800635968827005
    xgb_colsample_bytree: 0.9553463955764299


In [None]:
# Hyperparameter significance
fig = optuna.visualization.plot_param_importances(study)
fig.show()

optuna.visualization.plot_optimization_history(study)



In [15]:
# Define the classifiers with the best hyperparameters
best_params = study.best_trial.params
clf1 = LogisticRegression(C=best_params['lr_C'], random_state=42, max_iter=1000)
clf2 = RandomForestClassifier(n_estimators=best_params['rf_n_estimators'], max_depth=best_params['rf_max_depth'], random_state=42)
clf3 = GradientBoostingClassifier(n_estimators=best_params['gbc_n_estimators'], learning_rate=best_params['gbc_learning_rate'], random_state=42)
clf4 = xgb.XGBClassifier(n_estimators=best_params['xgb_n_estimators'], max_depth=best_params['xgb_max_depth'], learning_rate=best_params['xgb_learning_rate'], min_child_weight=best_params['xgb_min_child_weight'], use_label_encoder=False, eval_metric='logloss')

# Assign weights based on hypothetical performance (e.g., 1 for LR, 2 for RF, 3 for GBC)
ensemble_clf = VotingClassifier(estimators=[
    ('lr', clf1),
    ('rf', clf2),
    ('gbc', clf3),
    ('xgb', clf4)],
    voting='soft', weights=[0.1, 0.1, 0.1, 1.2])


Modeling

In [16]:
# Train the ensemble model
ensemble_clf.fit(X_train, y_train)

Performance

In [17]:
def get_clf_eval(y_test, y_pred=None):
    confusion = confusion_matrix(y_test, y_pred, labels=[True, False])
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, labels=[True, False])
    recall = recall_score(y_test, y_pred)
    F1 = f1_score(y_test, y_pred, labels=[True, False])

    print("오차행렬:\n", confusion)
    print("\n정확도: {:.4f}".format(accuracy))
    print("정밀도: {:.4f}".format(precision))
    print("재현율: {:.4f}".format(recall))
    print("F1: {:.4f}".format(F1))

In [18]:
# Predict and evaluate the model
y_pred = ensemble_clf.predict(X_test)
get_clf_eval(y_test, y_pred)

오차행렬:
 [[1180   58]
 [  37 3996]]

정확도: 0.9820
정밀도: 0.9696
재현율: 0.9532
F1: 0.9613
