In [82]:
# The star import stays first so that every explicit import below wins on a name clash.
from utils.ETC import *
from sklearn.utils import resample
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from imblearn.combine import SMOTETomek
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE, ADASYN
from sklearn.model_selection import train_test_split

import ast
import os
import warnings

import optuna
import pandas as pd

# Process-wide runtime settings, applied once before any data or model work.
# NOTE(review): KMP_DUPLICATE_LIB_OK presumably lets several OpenMP runtimes coexist
# in one process - confirm it is still required for this environment.
os.environ.update({'KMP_DUPLICATE_LIB_OK': 'True'})

# Silence every FutureWarning, whatever module raises it.
warnings.simplefilter("ignore", category=FutureWarning)

# Load Data

In [115]:
# Read the train/test CSVs; the first column of each file becomes the index.
# Alternative inputs kept for reference:
#   '../Database/train_optimal.csv' and '../Database/test_optimal.csv'
train = pd.read_csv("../Database/test/train_k.csv", index_col=0)
test = pd.read_csv("../Database/test/test_k.csv", index_col=0)

# The test frame loses its 'is_converted' column, and 'idit_strategic_ver'
# is removed from both frames.
test = test.drop(columns=['is_converted', 'idit_strategic_ver'])
train = train.drop(columns=['idit_strategic_ver'])

# Sampling

In [45]:
# Separate the 'is_converted' target from the feature columns.
y = train['is_converted']
X = train.drop(columns=['is_converted'])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Row positions (not index labels) of the positive class in the training split,
# recorded BEFORE any resampling takes place.
minority_indexes = np.where(y_train == 1)[0]

# Rebalance the training split; the Tomek-link step targets the majority class.
# NOTE(review): X_train / y_train are overwritten here, so minority_indexes refers to
# row positions of the pre-resampling split - confirm later cells account for that.
smoteto = SMOTETomek(
    tomek=TomekLinks(sampling_strategy='majority'),
    random_state=42,
)
X_train, y_train = smoteto.fit_resample(X_train, y_train)

In [98]:
# Three independent resamplers, each seeded with 42.
smote = SMOTE(random_state=42)
adasyn = ADASYN(random_state=42)
smoteto = SMOTETomek(
    random_state=42,
    tomek=TomekLinks(sampling_strategy='majority'),
)

# Every resampler is fitted on the same (X_train, y_train) pair.
# NOTE(review): if the SMOTETomek cell has already run, X_train / y_train are the
# resampled versions at this point - confirm that is the intended input.
X_smote, y_smote = smote.fit_resample(X_train, y_train)
X_adasyn, y_adasyn = adasyn.fit_resample(X_train, y_train)
X_smoteto, y_smoteto = smoteto.fit_resample(X_train, y_train)

In [103]:
def _select_new_positives(features, labels, excluded_index):
    """Return the positive rows of a resampled set whose index is not excluded.

    Parameters
    ----------
    features : pandas.DataFrame
        Resampled feature columns.
    labels : pandas.Series
        Resampled 'is_converted' target, aligned with ``features``.
    excluded_index : array-like
        Index values to drop from the result (the recorded minority rows).

    Returns
    -------
    pandas.DataFrame
        Features and target side by side, restricted to rows where
        'is_converted' is True and the index is not in ``excluded_index``.
    """
    combined = pd.concat([features, labels], axis=1)
    is_positive = combined['is_converted'] == True  # noqa: E712 - element-wise comparison
    is_excluded = combined.index.isin(excluded_index)
    # Each condition is evaluated on its own before combining. Written inline as
    # `a == True & ~b`, Python parses it as `a == (True & ~b)` because `&` binds
    # tighter than `==`, which also lets negative rows at excluded indexes through.
    return combined[is_positive & ~is_excluded]


# NOTE(review): assumes the index of the resampled frames lines up with the positions
# stored in minority_indexes - TODO confirm against the resampler's output index.
smote_data = _select_new_positives(X_smote, y_smote, minority_indexes)
adasyn_data = _select_new_positives(X_adasyn, y_adasyn, minority_indexes)

# The SMOTETomek output is kept whole; no row filter is applied to it.
smoteto_data = pd.concat([X_smoteto, y_smoteto], axis=1)

In [218]:
# Stack the SMOTE and ADASYN selections row-wise, then draw 40,000 rows from the
# pool with replacement (seeded bootstrap).
synthetic_data = resample(
    pd.concat([smote_data, adasyn_data]),
    replace=True,
    n_samples=40000,
    random_state=123,
)

In [219]:
# Split the bootstrapped synthetic rows back into target and features.
y_syn = synthetic_data['is_converted']
X_syn = synthetic_data.drop(columns=['is_converted'])

# Append the synthetic rows beneath the current training data.
# NOTE(review): X_train / y_train are both read and rebound here, so running this
# cell again appends the synthetic rows a second time.
X_train = pd.concat([X_train, X_syn])
y_train = pd.concat([y_train, y_syn])

In [220]:
# class_counts = y_train.value_counts()
# set(class_counts.index)
# plt.pie(class_counts, labels=class_counts.index, startangle=140, autopct='%1.1f%%')
# plt.axis('equal')
# plt.title('Class Distribution')
# plt.show()

In [80]:
# Rows of the full training frame whose 'is_converted' flag is True.
true_data = train[train['is_converted'].eq(True)]

# Seeded bootstrap: 20,000 draws with replacement from those positive rows.
bootstrap_sample = resample(
    true_data,
    replace=True,
    n_samples=20000,
    random_state=123,
)

# Model Test

In [114]:
# LightGBM configuration: fixed options first, then the numeric hyper-parameters.
# NOTE(review): 'eval_metric' is passed to the constructor and no evaluation set is
# given to fit() - confirm LightGBM actually consumes it here.
# NOTE(review): 'subsample' may have no effect unless a bagging frequency is also
# set - check the LightGBM parameter documentation.
params = dict(
    device='cpu',
    boosting_type='gbrt',
    objective='binary',
    eval_metric='F1',
    learning_rate=0.1148568499042726,
    max_depth=19,
    num_leaves=178,
    min_data_in_leaf=13,
    subsample=0.5393283096351735,
    colsample_bytree=0.950543867059109,
)

# Train on the (resampled) training data.
model_lgbm = LGBMClassifier(**params)
model_lgbm.fit(X_train, y_train)

# Score the hold-out split with the shared evaluation helper.
y_pred_lgbm = model_lgbm.predict(X_test)
get_clf_eval(y_test, y_pred_lgbm)

# Predict on the separate test frame and print the sum of the predicted labels.
test_pred = model_lgbm.predict(test)
print(sum(test_pred))

[LightGBM] [Info] Number of positive: 43865, number of negative: 43574
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.044903 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6576
[LightGBM] [Info] Number of data points in the train set: 87439, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.501664 -> initscore=0.006656
[LightGBM] [Info] Start training from score 0.006656
오차행렬:
 [[  817   168]
 [  160 10715]]

정확도: 0.9723
정밀도: 0.8362
재현율: 0.8294
F1: 0.8328
1845


In [111]:
# XGBoost configuration: fixed options first, then the numeric hyper-parameters.
# 'device' is set to 'cuda', so this cell expects a CUDA-capable setup.
params = dict(
    device='cuda',
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='auc',
    learning_rate=0.07256481766178394,
    max_depth=7,
    gamma=0.00029770223602020355,
    alpha=2.0170328774959357e-05,
)

# Train on the (resampled) training data.
model_xgb = XGBClassifier(**params)
model_xgb.fit(X_train, y_train)

# Score the hold-out split with the shared evaluation helper.
y_pred_xgb = model_xgb.predict(X_test)
get_clf_eval(y_test, y_pred_xgb)

# Predict on the separate test frame and print the sum of the predicted labels.
test_pred = model_xgb.predict(test)
print(sum(test_pred))

오차행렬:
 [[  804   181]
 [  175 10700]]

정확도: 0.9700
정밀도: 0.8212
재현율: 0.8162
F1: 0.8187
1849


In [11]:
# CatBoost configuration: fixed options first, then the numeric hyper-parameters.
# 'task_type' is set to 'GPU', so this cell expects a GPU-capable setup.
params = {
    'task_type': 'GPU',
    'boosting_type': 'Plain',
    'loss_function': 'Logloss',
    'eval_metric': 'F1',
    'grow_policy': 'Lossguide',
    'bootstrap_type': 'Bayesian',

    'learning_rate': 0.48519406235345247,
    'depth': 17,
    'l2_leaf_reg': 4,
    'num_leaves': 266,
    'border_count': 207,
}

model_catboost = CatBoostClassifier(**params)

model_catboost.fit(X_train, y_train)

# The predicted labels are parsed from their textual form before scoring
# (presumably the strings 'True' / 'False' - confirm). ast.literal_eval only
# accepts Python literals, so unlike eval() it cannot execute arbitrary expressions.
y_pred_catboost = [ast.literal_eval(pred) for pred in model_catboost.predict(X_test)]
get_clf_eval(y_test, y_pred_catboost)

# Predict on the separate test frame and print the sum of the parsed labels.
test_pred = [ast.literal_eval(pred) for pred in model_catboost.predict(test)]
print(sum(test_pred))

오차행렬:
 [[  817   168]
 [  320 10555]]

정확도: 0.9589
정밀도: 0.7186
재현율: 0.8294
F1: 0.7700
1759


In [13]:
# Named base estimators for the ensemble, paired in a fixed order.
classifiers = list(zip(
    ('lgbm', 'xgb', 'catboost'),
    (model_lgbm, model_xgb, model_catboost),
))

# Soft-voting weights, in the same order as the estimators: lgbm=1, xgb=2, catboost=2.
weights = [1, 2, 2]
ensemble_model = VotingClassifier(
    estimators=classifiers,
    voting='soft',
    weights=weights,
)

# Fit the ensemble on the (resampled) training data.
ensemble_model.fit(X_train, y_train)

# Score the hold-out split with the shared evaluation helper.
y_pred_ensemble = ensemble_model.predict(X_test)
get_clf_eval(y_test, y_pred_ensemble)

# Predict on the separate test frame and print the sum of the predicted labels.
test_pred = ensemble_model.predict(test)
print(sum(test_pred))

[LightGBM] [Info] Number of positive: 43574, number of negative: 43574
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007154 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 6527
[LightGBM] [Info] Number of data points in the train set: 87148, number of used features: 32
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
오차행렬:
 [[  818   167]
 [  186 10689]]

정확도: 0.9702
정밀도: 0.8147
재현율: 0.8305
F1: 0.8225
2248
