In [1]:
import pandas as pd
import numpy as np

df = pd.read_csv(r'churn_prediction_data\churn_prediction_data\train.csv')

In [2]:
df

Unnamed: 0,RowNumber,CustomerId,Surname,CreditScore,Geography,Gender,Age,Tenure,Balance,NumOfProducts,HasCrCard,IsActiveMember,EstimatedSalary,Exited
0,551,15806307,Trevisano,720,Spain,Male,38,5,114051.97,2,0,1,107577.29,0
1,6897,15709621,Martin,682,France,Female,54,4,62397.41,1,1,0,113088.60,1
2,4588,15619340,Palmer,672,France,Female,31,5,119903.67,1,1,1,132925.17,0
3,291,15620746,Napolitani,592,Spain,Female,40,4,104257.86,1,1,0,110857.33,0
4,1673,15646372,Yao,753,Spain,Male,42,5,120387.73,1,0,1,126378.57,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7995,5345,15584532,Yu,568,France,Female,35,6,121079.60,2,1,1,124890.50,1
7996,5837,15606641,Liao,602,Germany,Female,45,7,145846.07,1,1,0,99276.02,0
7997,7335,15739692,Ferri,679,Spain,Female,43,5,132810.01,1,1,0,130780.85,1
7998,9552,15791373,Worsnop,715,France,Male,38,4,118729.45,1,0,0,95484.52,0


In [2]:
from sklearn.metrics import accuracy_score, precision_score, f1_score

def evaluate(y_true, y_pred, show=False):

    acc = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)

    if show:
        print('accuracy:', acc)
        print('precision:', precision)
        print('f1 score:', f1)

    return acc, precision, f1

In [18]:
from sklearn.preprocessing import LabelEncoder

class PreproPipe():

    def __init__(self, target=False, normal=False):

        # set target encoding
        self.target = target
        if not target:
            self.GenderEncoder = LabelEncoder()
            self.GeographyEncoder = LabelEncoder()
        else:
            self.GenderEncoder = dict()
            self.GeographyEncoder = dict()

        self.normal = normal
        if normal:
            self.CreditParam = None
            self.AgeParam = None
            self.BalanceParam = None
            self.EstimatedSalaryParam = None

    def fit_transform(self, df):
        
        self.fit(df)
        return(self.transform(df))

    def fit(self, df):
        
        if not self.target:
            self.GenderEncoder.fit(df.Gender)
            self.GeographyEncoder.fit(df.Geography)
        else:
            for gender in df.Gender.unique():
                self.GenderEncoder[gender] = df.Exited[df.Gender == gender].mean()
            for geograph in df.Geography.unique():
                self.GeographyEncoder[geograph] = df.Exited[df.Geography == geograph].mean()

        if self.normal:
            self.CreditParam = (df.CreditScore.mean(), df.CreditScore.std())
            self.AgeParam = (df.Age.mean(), df.Age.std())
            self.BalanceParam = (df.Balance.mean(), df.Balance.std())
            self.EstimatedSalaryParam = (df.EstimatedSalary.mean(), df.EstimatedSalary.std())

    def transform(self, df):
        
        outdf = df.copy()

        # encoding
        if not self.target:
            outdf.Gender = self.GenderEncoder.transform(df.Gender)
            outdf.Geography = self.GeographyEncoder.transform(df.Geography)
        else:
            outdf.Gender = df.Gender.apply(lambda x: self.GenderEncoder[x])
            outdf.Geography = df.Geography.apply(lambda x: self.GeographyEncoder[x])

        # normalize
        if self.normal:
            outdf.CreditScore = df.CreditScore.apply(lambda x: (x-self.CreditParam[0])/self.CreditParam[1])
            outdf.Age = df.Age.apply(lambda x: (x-self.AgeParam[0])/self.AgeParam[1])
            outdf.Balance = df.Balance.apply(lambda x: (x-self.BalanceParam[0])/self.BalanceParam[1])
            outdf.EstimatedSalary = df.Age.apply(lambda x: (x-self.EstimatedSalaryParam[0])/self.EstimatedSalaryParam[1])

        # drop columns
        drop_feature_names = ['CustomerId', 'RowNumber', 'Surname']
        outdf = outdf.drop(drop_feature_names, axis=1)

        return outdf


In [4]:
from sklearn.model_selection import train_test_split

def StratifiedSplit(df, test_size=0.1, random_state = 42, over = 0):

    np.random.seed(random_state)

    X, y = np.array(df.iloc[:, :-1]), np.array(df.iloc[:, -1])

    # stratified split on lable
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state, stratify=y
    )

    # sample more from minor class 
    over_idxs = np.random.choice(np.where(y_train==1)[0], over, replace=True)
    X_train = np.r_[X_train, X_train[over_idxs]]
    y_train = np.r_[y_train, y_train[over_idxs]]

    return X_train, X_test, y_train, y_test

In [5]:
from sklearn.ensemble import IsolationForest

def OutlierClipping(X, y):

    iforest = IsolationForest(bootstrap=True,
                            contamination=0.04, 
                            max_features=10, 
                            max_samples=10, 
                            n_estimators=1000, 
                            n_jobs=-1,
                            random_state=1)
    inlier_idx = np.where(iforest.fit_predict(X) == 1)[0]
    print(X.shape[0] - len(inlier_idx), 'outliers clipped')
    return X[inlier_idx], y[inlier_idx]

In [6]:
def StratifiedTest(clfs, df, transformer, niters = 5, outlier = False, over=0):

    seeds = np.random.randint(1000, size=niters)
    
    # Preprocessing
    df = transformer.transform(df)

    final_scores = []

    # foreach classifier do n trails
    for clf in clfs:

        scores = []
        for seed in seeds:

            X_train, X_test, y_train, y_test = StratifiedSplit(df, random_state=seed, over=over)
            # outlier
            if outlier:
                X_train, y_train = OutlierClipping(X_train, y_train)

            clf.fit(X_train, y_train)

            y_pred = clf.predict(X_test)
            scores.append(list(evaluate(y_test, y_pred)))
        final_scores.append(np.mean(np.array(scores), axis=0))

    return pd.DataFrame(final_scores, columns=['accuracy', 'precision', 'f1_score'], index=[str(clf) for clf in clfs])

In [19]:
SimplePrepro = PreproPipe()
SimplePrepro.fit(df)

targetPrepro = PreproPipe(target=True)
targetPrepro.fit(df)

normalPrepro = PreproPipe(normal=True)
normalPrepro.fit(df)

In [27]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

from sklearn.ensemble import VotingClassifier

In [31]:
clfs = [
    KNeighborsClassifier(),
    LogisticRegressionCV(),
    RandomForestClassifier(),
    MLPClassifier(),
    LGBMClassifier(),
    XGBClassifier(),
    # RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    # LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro)
table

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




















Unnamed: 0,accuracy,precision,f1_score
KNeighborsClassifier(),0.761,0.258486,0.137422
LogisticRegressionCV(),0.787,0.268606,0.038021
RandomForestClassifier(),0.86025,0.770761,0.566215
MLPClassifier(),0.73025,0.094893,0.111987
LGBMClassifier(),0.86725,0.765935,0.606589
"XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n importance_type='gain', interaction_constraints='',\n learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n min_child_weight=1, missing=nan, monotone_constraints='()',\n n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,\n reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n tree_method='exact', validate_parameters=1, verbosity=None)",0.86275,0.737852,0.600092


In [32]:
clfs = [
    KNeighborsClassifier(),
    LogisticRegressionCV(),
    RandomForestClassifier(),
    MLPClassifier(),
    LGBMClassifier(),
    XGBClassifier(),
    # RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    # LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}),
    ]

table = StratifiedTest(clfs, df, transformer=targetPrepro)
table

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))




















Unnamed: 0,accuracy,precision,f1_score
KNeighborsClassifier(),0.76275,0.265713,0.139216
LogisticRegressionCV(),0.7915,0.375881,0.072629
RandomForestClassifier(),0.8575,0.729571,0.577412
MLPClassifier(),0.66,0.110325,0.136964
LGBMClassifier(),0.85825,0.722194,0.587028
"XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,\n colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,\n importance_type='gain', interaction_constraints='',\n learning_rate=0.300000012, max_delta_step=0, max_depth=6,\n min_child_weight=1, missing=nan, monotone_constraints='()',\n n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,\n reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,\n tree_method='exact', validate_parameters=1, verbosity=None)",0.851,0.693203,0.568177


In [28]:
np.random.seed(100)
clfs = [
    LogisticRegressionCV(),
    SVC(),
    RandomForestClassifier(),
    LGBMClassifier(),
    LGBMClassifier(class_weight={0:0.8, 1:0.2}),
    ]

table = StratifiedTest(clfs, df, transformer=normalPrepro)
table

Unnamed: 0,accuracy,precision,f1_score
LogisticRegressionCV(),0.80775,0.593384,0.273105
SVC(),0.80675,0.93,0.104108
RandomForestClassifier(),0.863,0.76172,0.585639
LGBMClassifier(),0.871,0.783096,0.615289
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2})",0.857,0.882179,0.494161


In [29]:
np.random.seed(100)
clfs = [
    LogisticRegressionCV(),
    SVC(),
    RandomForestClassifier(),
    LGBMClassifier(),
    LGBMClassifier(class_weight={0:0.8, 1:0.2}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro)
table

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,accuracy,precision,f1_score
LogisticRegressionCV(),0.79325,0.186667,0.016363
SVC(),0.79625,0.0,0.0
RandomForestClassifier(),0.86375,0.785467,0.575974
LGBMClassifier(),0.87175,0.787484,0.617043
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2})",0.8585,0.884894,0.501699


In [13]:
clf1 = XGBClassifier(use_label_encoder=False)
clf2 = RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1)
clf3 = LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15})

ensemble_clf = VotingClassifier(
    estimators=[('xgb', clf1), ('rfc', clf2), ('lgbm', clf3)],
    voting='soft',
    weights=[1,1,1]
)

clfs = [
    ensemble_clf,
    RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro)
table



Unnamed: 0,accuracy,precision,f1_score
"VotingClassifier(estimators=[('xgb',\n XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, gamma=None,\n gpu_id=None, importance_type='gain',\n interaction_constraints=None,\n learning_rate=None,\n max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan,\n monotone_constraints=None,\n n_estimators=10...\n reg_lambda=None,\n scale_pos_weight=None,\n subsample=None, tree_method=None,\n use_label_encoder=False,\n validate_parameters=None,\n verbosity=None)),\n ('rfc',\n RandomForestClassifier(class_weight={0: 85,\n 1: 15},\n n_estimators=350,\n n_jobs=-1)),\n ('lgbm',\n LGBMClassifier(class_weight={0: 0.85, 1: 0.15},\n learning_rate=0.018, max_bin=370,\n num_leaves=50))],\n voting='soft', weights=[1, 1, 1])",0.86125,0.829067,0.54121
"RandomForestClassifier(class_weight={0: 85, 1: 15}, n_estimators=350, n_jobs=-1)",0.85925,0.731451,0.585514
"LGBMClassifier(class_weight={0: 0.85, 1: 0.15}, learning_rate=0.018,\n max_bin=370, num_leaves=50)",0.837,0.908802,0.357545


In [43]:
clf1 = XGBClassifier(use_label_encoder=False)
clf2 = RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1)
clf3 = LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15})

ensemble_clf = VotingClassifier(
    estimators=[('xgb', clf1), ('rfc', clf2), ('lgbm', clf3)],
    voting='soft',
    weights=[1,1,1]
)

clfs = [
    ensemble_clf,
    RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}),
    ]

table = StratifiedTest(clfs, df, transformer=targetPrepro)
table



Unnamed: 0,accuracy,precision,f1_score
"VotingClassifier(estimators=[('xgb',\n XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, gamma=None,\n gpu_id=None, importance_type='gain',\n interaction_constraints=None,\n learning_rate=None,\n max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan,\n monotone_constraints=None,\n n_estimators=10...\n reg_lambda=None,\n scale_pos_weight=None,\n subsample=None, tree_method=None,\n use_label_encoder=False,\n validate_parameters=None,\n verbosity=None)),\n ('rfc',\n RandomForestClassifier(class_weight={0: 85,\n 1: 15},\n n_estimators=350,\n n_jobs=-1)),\n ('lgbm',\n LGBMClassifier(class_weight={0: 0.85, 1: 0.15},\n learning_rate=0.018, max_bin=370,\n num_leaves=50))],\n voting='soft', weights=[1, 1, 1])",0.85425,0.794183,0.517441
"RandomForestClassifier(class_weight={0: 85, 1: 15}, n_estimators=350, n_jobs=-1)",0.859,0.725477,0.588771
"LGBMClassifier(class_weight={0: 0.85, 1: 0.15}, learning_rate=0.018,\n max_bin=370, num_leaves=50)",0.8375,0.880784,0.367635


In [54]:
###########################
# Currently best          #
# Clip Outliers 0.04(288) #
###########################

clf1 = XGBClassifier(use_label_encoder=False, learning_rate = 0.025, n_estimators=200, eval_metric = 'logloss')
clf2 = RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1)
clf3 = LGBMClassifier(max_bin=370, num_leaves=60, learning_rate=0.015, class_weight={0:0.75, 1:0.25}, n_estimators=200)
clf4 = LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.015, class_weight={0:0.80, 1:0.2}, n_estimators=200)
clf5 = LGBMClassifier(max_bin=370, num_leaves=40, learning_rate=0.015, class_weight={0:0.77, 1:0.23}, n_estimators=200)
clf6 = LGBMClassifier(max_bin=370, num_leaves=70, learning_rate=0.015)

ensemble_clf = VotingClassifier(
    estimators=[('xgb', clf1), ('rfc', clf2), ('lgbm1', clf3), ('lgbm2', clf4), ('lgbm3', clf5), ('lgbm4', clf6)],
    voting='hard',
    weights=[1, 1, 1.1, 1.1, 1, 1]
)

clfs = [
    ensemble_clf,
    RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    LGBMClassifier(),
    LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=True)
table

288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped


Unnamed: 0,accuracy,precision,f1_score
"VotingClassifier(estimators=[('xgb',\n XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None,\n eval_metric='logloss', gamma=None,\n gpu_id=None, importance_type='gain',\n interaction_constraints=None,\n learning_rate=0.025,\n max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan,\n monotone_constrain...\n n_estimators=200, num_leaves=60)),\n ('lgbm2',\n LGBMClassifier(class_weight={0: 0.8, 1: 0.2},\n learning_rate=0.015, max_bin=370,\n n_estimators=200, num_leaves=50)),\n ('lgbm3',\n LGBMClassifier(class_weight={0: 0.77, 1: 0.23},\n learning_rate=0.015, max_bin=370,\n n_estimators=200, num_leaves=40)),\n ('lgbm4',\n LGBMClassifier(learning_rate=0.015, max_bin=370,\n num_leaves=70))],\n weights=[1, 1, 1.1, 1.1, 1, 1])",0.84775,0.845589,0.450845
"RandomForestClassifier(class_weight={0: 85, 1: 15}, n_estimators=350, n_jobs=-1)",0.8535,0.717535,0.563928
LGBMClassifier(),0.8535,0.713593,0.567352
"LGBMClassifier(class_weight={0: 0.85, 1: 0.15}, learning_rate=0.018,\n max_bin=370, num_leaves=50)",0.834,0.924645,0.32917


In [56]:
clf1 = XGBClassifier(use_label_encoder=False, eval_metric = 'logloss')
clf2 = RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1)
clf3 = LGBMClassifier(max_bin=370, num_leaves=60, learning_rate=0.015, class_weight={0:0.85, 1:0.15})
clf4 = LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.015, class_weight={0:0.80, 1:0.2})
clf5 = LGBMClassifier(max_bin=370, num_leaves=40, learning_rate=0.015, class_weight={0:0.9, 1:0.1})
clf6 = LGBMClassifier(max_bin=370, num_leaves=70, learning_rate=0.015)

ensemble_clf = VotingClassifier(
    estimators=[('xgb', clf1), ('rfc', clf2), ('lgbm1', clf3), ('lgbm2', clf4), ('lgbm3', clf5), ('lgbm4', clf6)],
    voting='soft',
    weights=[1,1,1,1,1,1.2]
)

clfs = [
    ensemble_clf,
    RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    LGBMClassifier(),
    LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, over=2000)
table

Unnamed: 0,accuracy,precision,f1_score
"VotingClassifier(estimators=[('xgb',\n XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None,\n eval_metric='logloss', gamma=None,\n gpu_id=None, importance_type='gain',\n interaction_constraints=None,\n learning_rate=None,\n max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan,\n monotone_constraint...\n learning_rate=0.015, max_bin=370,\n num_leaves=60)),\n ('lgbm2',\n LGBMClassifier(class_weight={0: 0.8, 1: 0.2},\n learning_rate=0.015, max_bin=370,\n num_leaves=50)),\n ('lgbm3',\n LGBMClassifier(class_weight={0: 0.9, 1: 0.1},\n learning_rate=0.015, max_bin=370,\n num_leaves=40)),\n ('lgbm4',\n LGBMClassifier(learning_rate=0.015, max_bin=370,\n num_leaves=70))],\n voting='soft', weights=[1, 1, 1, 1, 1, 1.2])",0.86625,0.834656,0.566359
"RandomForestClassifier(class_weight={0: 85, 1: 15}, n_estimators=350, n_jobs=-1)",0.85575,0.685539,0.604528
LGBMClassifier(),0.8495,0.631572,0.632404
"LGBMClassifier(class_weight={0: 0.85, 1: 0.15}, learning_rate=0.018,\n max_bin=370, num_leaves=50)",0.8505,0.877956,0.456925


In [45]:
clf1 = XGBClassifier(use_label_encoder=False)
clf2 = RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1)
clf3 = LGBMClassifier(max_bin=370, num_leaves=60, learning_rate=0.015, class_weight={0:0.85, 1:0.15})
clf4 = LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.015, class_weight={0:0.80, 1:0.2})
clf5 = LGBMClassifier(max_bin=370, num_leaves=40, learning_rate=0.015, class_weight={0:0.9, 1:0.1})
clf6 = LGBMClassifier(max_bin=370, num_leaves=70, learning_rate=0.015)

ensemble_clf = VotingClassifier(
    estimators=[('xgb', clf1), ('rfc', clf2), ('lgbm1', clf3), ('lgbm2', clf4), ('lgbm3', clf5), ('lgbm4', clf6)],
    voting='soft',
    weights=[1,1,1,1,1,1.2]
)

clfs = [
    ensemble_clf,
    RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=True)
table

288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped


Unnamed: 0,accuracy,precision,f1_score
"VotingClassifier(estimators=[('xgb',\n XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, gamma=None,\n gpu_id=None, importance_type='gain',\n interaction_constraints=None,\n learning_rate=None,\n max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan,\n monotone_constraints=None,\n n_estimators=10...\n learning_rate=0.015, max_bin=370,\n num_leaves=60)),\n ('lgbm2',\n LGBMClassifier(class_weight={0: 0.8, 1: 0.2},\n learning_rate=0.015, max_bin=370,\n num_leaves=50)),\n ('lgbm3',\n LGBMClassifier(class_weight={0: 0.9, 1: 0.1},\n learning_rate=0.015, max_bin=370,\n num_leaves=40)),\n ('lgbm4',\n LGBMClassifier(learning_rate=0.015, max_bin=370,\n num_leaves=70))],\n voting='soft', weights=[1, 1, 1, 1, 1, 1.2])",0.84675,0.855409,0.440244
"RandomForestClassifier(class_weight={0: 85, 1: 15}, n_estimators=350, n_jobs=-1)",0.85275,0.713267,0.560849
"LGBMClassifier(class_weight={0: 0.85, 1: 0.15}, learning_rate=0.018,\n max_bin=370, num_leaves=50)",0.834,0.880764,0.343046


In [47]:
clf1 = XGBClassifier(use_label_encoder=False, learning_rate = 0.015, n_estimators=200)
clf2 = RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1)
clf3 = LGBMClassifier(max_bin=370, num_leaves=60, learning_rate=0.015, class_weight={0:0.85, 1:0.15}, n_estimators=200)
clf4 = LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.015, class_weight={0:0.80, 1:0.2}, n_estimators=200)
clf5 = LGBMClassifier(max_bin=370, num_leaves=40, learning_rate=0.015, class_weight={0:0.9, 1:0.1}, n_estimators=200)
clf6 = LGBMClassifier(max_bin=370, num_leaves=70, learning_rate=0.015)

ensemble_clf = VotingClassifier(
    estimators=[('xgb', clf1), ('rfc', clf2), ('lgbm1', clf3), ('lgbm2', clf4), ('lgbm3', clf5), ('lgbm4', clf6)],
    voting='hard',
    weights=[1,1,1,1,1,1.2]
)

clfs = [
    ensemble_clf,
    RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=False)
table



Unnamed: 0,accuracy,precision,f1_score
"VotingClassifier(estimators=[('xgb',\n XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, gamma=None,\n gpu_id=None, importance_type='gain',\n interaction_constraints=None,\n learning_rate=0.015,\n max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan,\n monotone_constraints=None,\n n_estimators=2...\n n_estimators=200, num_leaves=60)),\n ('lgbm2',\n LGBMClassifier(class_weight={0: 0.8, 1: 0.2},\n learning_rate=0.015, max_bin=370,\n n_estimators=200, num_leaves=50)),\n ('lgbm3',\n LGBMClassifier(class_weight={0: 0.9, 1: 0.1},\n learning_rate=0.015, max_bin=370,\n n_estimators=200, num_leaves=40)),\n ('lgbm4',\n LGBMClassifier(learning_rate=0.015, max_bin=370,\n num_leaves=70))],\n weights=[1, 1, 1, 1, 1, 1.2])",0.8565,0.82545,0.515819
"RandomForestClassifier(class_weight={0: 85, 1: 15}, n_estimators=350, n_jobs=-1)",0.8565,0.737479,0.566328
"LGBMClassifier(class_weight={0: 0.85, 1: 0.15}, learning_rate=0.018,\n max_bin=370, num_leaves=50)",0.83675,0.894458,0.360488


In [52]:
###########################
# This is good            #
# hard voter              #
###########################
clf1 = XGBClassifier(use_label_encoder=False, learning_rate = 0.025, n_estimators=200)
clf2 = RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1)
clf3 = LGBMClassifier(max_bin=370, num_leaves=60, learning_rate=0.015, class_weight={0:0.75, 1:0.25}, n_estimators=200)
clf4 = LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.015, class_weight={0:0.80, 1:0.2}, n_estimators=200)
clf5 = LGBMClassifier(max_bin=370, num_leaves=40, learning_rate=0.015, class_weight={0:0.77, 1:0.23}, n_estimators=200)
clf6 = LGBMClassifier(max_bin=370, num_leaves=70, learning_rate=0.015)

ensemble_clf = VotingClassifier(
    estimators=[('xgb', clf1), ('rfc', clf2), ('lgbm1', clf3), ('lgbm2', clf4), ('lgbm3', clf5), ('lgbm4', clf6)],
    voting='hard',
    weights=[1, 1, 1.1, 1.1, 1, 1]
)

clfs = [
    ensemble_clf,
    RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    LGBMClassifier(),
    LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=True)
table

288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped
288 outliers clipped


Unnamed: 0,accuracy,precision,f1_score
"VotingClassifier(estimators=[('xgb',\n XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, gamma=None,\n gpu_id=None, importance_type='gain',\n interaction_constraints=None,\n learning_rate=0.025,\n max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan,\n monotone_constraints=None,\n n_estimators=2...\n n_estimators=200, num_leaves=60)),\n ('lgbm2',\n LGBMClassifier(class_weight={0: 0.8, 1: 0.2},\n learning_rate=0.015, max_bin=370,\n n_estimators=200, num_leaves=50)),\n ('lgbm3',\n LGBMClassifier(class_weight={0: 0.77, 1: 0.23},\n learning_rate=0.015, max_bin=370,\n n_estimators=200, num_leaves=40)),\n ('lgbm4',\n LGBMClassifier(learning_rate=0.015, max_bin=370,\n num_leaves=70))],\n weights=[1, 1, 1.1, 1.1, 1, 1])",0.84425,0.834024,0.434052
"RandomForestClassifier(class_weight={0: 85, 1: 15}, n_estimators=350, n_jobs=-1)",0.85325,0.727954,0.554144
LGBMClassifier(),0.8575,0.735035,0.573017
"LGBMClassifier(class_weight={0: 0.85, 1: 0.15}, learning_rate=0.018,\n max_bin=370, num_leaves=50)",0.831,0.895658,0.314161


In [53]:
###########################
# This is good            #
# soft voter              #
###########################
clf1 = XGBClassifier(use_label_encoder=False, learning_rate = 0.025, n_estimators=200)
clf2 = RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1)
clf3 = LGBMClassifier(max_bin=370, num_leaves=60, learning_rate=0.015, class_weight={0:0.75, 1:0.25}, n_estimators=200)
clf4 = LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.015, class_weight={0:0.80, 1:0.2}, n_estimators=200)
clf5 = LGBMClassifier(max_bin=370, num_leaves=40, learning_rate=0.015, class_weight={0:0.77, 1:0.23}, n_estimators=200)
clf6 = LGBMClassifier(max_bin=370, num_leaves=70, learning_rate=0.015)

ensemble_clf = VotingClassifier(
    estimators=[('xgb', clf1), ('rfc', clf2), ('lgbm1', clf3), ('lgbm2', clf4), ('lgbm3', clf5), ('lgbm4', clf6)],
    voting='soft',
    weights=[1, 1, 1.1, 1.1, 1, 1]
)

clfs = [
    ensemble_clf,
    RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    LGBMClassifier(),
    LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=False)
table



Unnamed: 0,accuracy,precision,f1_score
"VotingClassifier(estimators=[('xgb',\n XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None, gamma=None,\n gpu_id=None, importance_type='gain',\n interaction_constraints=None,\n learning_rate=0.025,\n max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan,\n monotone_constraints=None,\n n_estimators=2...\n n_estimators=200, num_leaves=60)),\n ('lgbm2',\n LGBMClassifier(class_weight={0: 0.8, 1: 0.2},\n learning_rate=0.015, max_bin=370,\n n_estimators=200, num_leaves=50)),\n ('lgbm3',\n LGBMClassifier(class_weight={0: 0.77, 1: 0.23},\n learning_rate=0.015, max_bin=370,\n n_estimators=200, num_leaves=40)),\n ('lgbm4',\n LGBMClassifier(learning_rate=0.015, max_bin=370,\n num_leaves=70))],\n voting='soft', weights=[1, 1, 1.1, 1.1, 1, 1])",0.8545,0.835609,0.49757
"RandomForestClassifier(class_weight={0: 85, 1: 15}, n_estimators=350, n_jobs=-1)",0.8645,0.752903,0.59833
LGBMClassifier(),0.8655,0.761216,0.599571
"LGBMClassifier(class_weight={0: 0.85, 1: 0.15}, learning_rate=0.018,\n max_bin=370, num_leaves=50)",0.83475,0.892955,0.343044


In [35]:
clfs = [
    LGBMClassifier(num_leaves=31, learning_rate=0.1),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.60, 1:0.40}),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.90, 1:0.10}),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.95, 1:0.05}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=False, over=0)
table

Unnamed: 0,accuracy,precision,f1_score
LGBMClassifier(),0.8635,0.739714,0.603077
"LGBMClassifier(class_weight={0: 0.6, 1: 0.4})",0.86075,0.778549,0.564288
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2})",0.85075,0.820512,0.482687
"LGBMClassifier(class_weight={0: 0.9, 1: 0.1})",0.84475,0.830842,0.438672
"LGBMClassifier(class_weight={0: 0.95, 1: 0.05})",0.8355,0.810106,0.381823


In [36]:
clfs = [
    LGBMClassifier(num_leaves=31, learning_rate=0.2, class_weight={0:0.80, 1:0.20}),
    LGBMClassifier(num_leaves=31, learning_rate=0.15, class_weight={0:0.80, 1:0.20}),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}),
    LGBMClassifier(num_leaves=31, learning_rate=0.05, class_weight={0:0.80, 1:0.20}),
    LGBMClassifier(num_leaves=31, learning_rate=0.03, class_weight={0:0.80, 1:0.20}),
    LGBMClassifier(num_leaves=31, learning_rate=0.02, class_weight={0:0.80, 1:0.20}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=False, over=0)
table

Unnamed: 0,accuracy,precision,f1_score
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, learning_rate=0.2)",0.8535,0.819657,0.500311
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, learning_rate=0.15)",0.8535,0.840864,0.489841
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2})",0.85475,0.868908,0.486335
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, learning_rate=0.05)",0.85425,0.902523,0.471225
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, learning_rate=0.03)",0.848,0.911966,0.42951
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, learning_rate=0.02)",0.84425,0.92033,0.402476


In [48]:
clfs = [
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}, min_gain_to_split=15),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}, min_gain_to_split=10),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}, min_split_gain=5),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}, min_split_gain=3),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}, min_split_gain=1),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=False, over=0)
table



Unnamed: 0,accuracy,precision,f1_score
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, min_gain_to_split=15)",0.8155,1.0,0.172182
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, min_gain_to_split=10)",0.83,0.947203,0.295733
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, min_split_gain=5)",0.842,0.923384,0.385094
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, min_split_gain=3)",0.843,0.92486,0.391043
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, min_split_gain=1)",0.84825,0.912011,0.430469
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2})",0.8595,0.867708,0.513887


In [51]:
clfs = [
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}, n_estimators=50),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}, n_estimators=100),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}, n_estimators=200),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}, n_estimators=500),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}, n_estimators=1000),
    LGBMClassifier(num_leaves=31, learning_rate=0.1, class_weight={0:0.80, 1:0.20}, n_estimators=2000),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=False, over=0)
table

Unnamed: 0,accuracy,precision,f1_score
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, n_estimators=50)",0.852,0.892937,0.460142
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2})",0.85725,0.882098,0.495168
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, n_estimators=200)",0.86,0.858095,0.520844
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, n_estimators=500)",0.865,0.824267,0.562926
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, n_estimators=1000)",0.86225,0.778662,0.57119
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, n_estimators=2000)",0.85925,0.744302,0.576024


In [45]:
clfs = [
    LGBMClassifier(learning_rate=0.1),
    LGBMClassifier(max_bin=50, learning_rate=0.1),
    LGBMClassifier(max_bin=150, learning_rate=0.1),
    LGBMClassifier(max_bin=250, learning_rate=0.1),
    LGBMClassifier(max_bin=500, learning_rate=0.1),
    LGBMClassifier(max_bin=1000, learning_rate=0.1),
    LGBMClassifier(max_bin=1500, learning_rate=0.1),
    LGBMClassifier(max_bin=2000, learning_rate=0.1),
    LGBMClassifier(max_bin=3000, learning_rate=0.1),
    LGBMClassifier(max_bin=5000, learning_rate=0.1),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=False, over=0)
table

Unnamed: 0,accuracy,precision,f1_score
LGBMClassifier(),0.86025,0.721612,0.598304
LGBMClassifier(max_bin=50),0.8655,0.744397,0.611167
LGBMClassifier(max_bin=150),0.8615,0.735985,0.595491
LGBMClassifier(max_bin=250),0.863,0.733584,0.60517
LGBMClassifier(max_bin=500),0.86575,0.7471,0.609768
LGBMClassifier(max_bin=1000),0.86475,0.741612,0.608759
LGBMClassifier(max_bin=1500),0.8625,0.728709,0.605229
LGBMClassifier(max_bin=2000),0.862,0.732752,0.599723
LGBMClassifier(max_bin=3000),0.8645,0.741339,0.607429
LGBMClassifier(max_bin=5000),0.86125,0.731995,0.596801


In [66]:
clfs = [
    LGBMClassifier(),
    LGBMClassifier(num_leaves=31, learning_rate=0.03, n_estimators=200, max_bin=1000),
    LGBMClassifier(num_leaves=31, learning_rate=0.015, n_estimators=200, max_bin=1000),
    LGBMClassifier(class_weight={0:0.80, 1:0.20}),
    LGBMClassifier(num_leaves=31, learning_rate=0.02, class_weight={0:0.80, 1:0.20}, n_estimators=200, max_bin=1000),
    LGBMClassifier(num_leaves=31, learning_rate=0.017, class_weight={0:0.80, 1:0.20}, n_estimators=200, max_bin=1000),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=False, over=0)
table

Unnamed: 0,accuracy,precision,f1_score
LGBMClassifier(),0.85825,0.737541,0.578256
"LGBMClassifier(learning_rate=0.03, max_bin=1000, n_estimators=200)",0.86225,0.761745,0.583789
"LGBMClassifier(learning_rate=0.015, max_bin=1000, n_estimators=200)",0.8615,0.785092,0.565203
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2})",0.85475,0.857556,0.491662
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, learning_rate=0.02, max_bin=1000,\n n_estimators=200)",0.84875,0.893394,0.441162
"LGBMClassifier(class_weight={0: 0.8, 1: 0.2}, learning_rate=0.017, max_bin=1000,\n n_estimators=200)",0.847,0.898836,0.427971


In [57]:
clfs = [
    # RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    LGBMClassifier(),
    LGBMClassifier(max_bin = 2000, num_leaves=63, learning_rate=0.05, min_gain_to_split=10),
    LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.02, class_weight={0:0.90, 1:0.10}, min_gain_to_split=10),
    # LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.02, class_weight={0:0.85, 1:0.15}),
    # LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.02, class_weight={0:0.80, 1:0.20}),
    # LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.02, class_weight={0:0.75, 1:0.25}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=False, over=0)
table



Unnamed: 0,accuracy,precision,f1_score
LGBMClassifier(),0.8655,0.761216,0.599571
"LGBMClassifier(learning_rate=0.05, max_bin=2000, min_gain_to_split=10,\n num_leaves=63)",0.86075,0.782778,0.561937
"LGBMClassifier(class_weight={0: 0.9, 1: 0.1}, learning_rate=0.02, max_bin=370,\n min_gain_to_split=10, num_leaves=50)",0.8045,1.0,0.077355


In [15]:
clf1 = XGBClassifier(use_label_encoder=False, learning_rate = 0.2, n_estimators=200, eval_metric = 'logloss')
# clf2 = RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1)
clf2 = XGBClassifier(use_label_encoder=False, learning_rate = 0.2, n_estimators=350, eval_metric = 'logloss')
clf3 = LGBMClassifier(max_bin=1000, learning_rate=0.05, class_weight={0:0.91, 1:0.09}, n_estimators=200, min_gain_to_split=5)
clf4 = LGBMClassifier(max_bin=1000, learning_rate=0.05, class_weight={0:0.9, 1:0.1}, n_estimators=200, min_gain_to_split=4)
clf5 = LGBMClassifier(max_bin=1000, learning_rate=0.02, class_weight={0:0.87, 1:0.13}, n_estimators=200, min_gain_to_split=4)
clf6 = LGBMClassifier(max_bin=2000, learning_rate=0.05,  boosting_type='dart')

ensemble_clf = VotingClassifier(
    # estimators=[('xgb', clf1), ('rfc', clf2), ('lgbm1', clf3), ('lgbm2', clf4), ('lgbm3', clf5), ('lgbm4', clf6)],
    estimators=[('rfc', clf2),('lgbm1', clf3), ('lgbm2', clf4), ('lgbm3', clf5), ('lgbm4', clf6)],
    voting='soft',
    weights=[1.2 ,1, 1, 1.1, 1.4]
)



clfs = [
    ensemble_clf,
    RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1),
    LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.018, class_weight={0:0.85, 1:0.15}),
    ]

table = StratifiedTest(clfs, df, transformer=SimplePrepro, outlier=False, over=1000)
table



Unnamed: 0,accuracy,precision,f1_score
"VotingClassifier(estimators=[('rfc',\n XGBClassifier(base_score=None, booster=None,\n colsample_bylevel=None,\n colsample_bynode=None,\n colsample_bytree=None,\n eval_metric='logloss', gamma=None,\n gpu_id=None, importance_type='gain',\n interaction_constraints=None,\n learning_rate=0.2,\n max_delta_step=None, max_depth=None,\n min_child_weight=None, missing=nan,\n monotone_constraints...\n LGBMClassifier(class_weight={0: 0.9, 1: 0.1},\n learning_rate=0.05, max_bin=1000,\n min_gain_to_split=4,\n n_estimators=200)),\n ('lgbm3',\n LGBMClassifier(class_weight={0: 0.87, 1: 0.13},\n learning_rate=0.02, max_bin=1000,\n min_gain_to_split=4,\n n_estimators=200)),\n ('lgbm4',\n LGBMClassifier(boosting_type='dart',\n learning_rate=0.05,\n max_bin=2000))],\n voting='soft', weights=[1.2, 1, 1, 1.1, 1.4])",0.8555,0.897269,0.480644
"RandomForestClassifier(class_weight={0: 85, 1: 15}, n_estimators=350, n_jobs=-1)",0.8565,0.702307,0.594605
"LGBMClassifier(class_weight={0: 0.85, 1: 0.15}, learning_rate=0.018,\n max_bin=370, num_leaves=50)",0.8445,0.894781,0.412602


In [17]:
def ToUpload(clf, df, transformer, outlier=False, refit=True, over=0):

    if refit:
        df = transformer.transform(df)
        X,y = np.array(df.iloc[:, :-1]), np.array(df.iloc[:,-1])

        # sample more from minor class 
        if over:
            over_idxs = np.random.choice(np.where(y==1)[0], over, replace=True)
            X = np.r_[X, X[over_idxs]]
            y = np.r_[y, y[over_idxs]]


        if outlier:
            X, y = OutlierClipping(X, y)
        print(X.shape)
        clf.fit(X, y)

    # upload part
    df_upload = pd.read_csv(r'churn_prediction_data\churn_prediction_data\test.csv')

    rownum = df_upload['RowNumber']
    upload_pred = clf.predict(transformer.transform(df_upload))
    pd.concat([rownum, pd.Series(upload_pred, name='Exited')], axis=1).to_csv(r'output.csv')

ToUpload(ensemble_clf, df, SimplePrepro, outlier=False, refit=True, over=1000)
print(ensemble_clf)

(9000, 10)
VotingClassifier(estimators=[('rfc',
                              XGBClassifier(base_score=None, booster=None,
                                            colsample_bylevel=None,
                                            colsample_bynode=None,
                                            colsample_bytree=None,
                                            eval_metric='logloss', gamma=None,
                                            gpu_id=None, importance_type='gain',
                                            interaction_constraints=None,
                                            learning_rate=0.2,
                                            max_delta_step=None, max_depth=None,
                                            min_child_weight=None, missing=nan,
                                            monotone_constraints...
                              LGBMClassifier(class_weight={0: 0.9, 1: 0.1},
                                             learning_rate=0.05, max_bin=

In [61]:
# clf1 = XGBClassifier(use_label_encoder=False, learning_rate = 0.2, n_estimators=200, eval_metric = 'logloss')
# # clf2 = RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1)
# clf2 = XGBClassifier(use_label_encoder=False, learning_rate = 0.2, n_estimators=350, eval_metric = 'logloss')
# clf3 = LGBMClassifier(max_bin=1000, learning_rate=0.05, class_weight={0:0.91, 1:0.09}, n_estimators=200, min_gain_to_split=5)
# clf4 = LGBMClassifier(max_bin=1000, learning_rate=0.05, class_weight={0:0.9, 1:0.1}, n_estimators=200, min_gain_to_split=5)
# clf5 = LGBMClassifier(max_bin=1000, learning_rate=0.02, n_estimators=200, min_gain_to_split=4)
# clf6 = LGBMClassifier(max_bin=2000, learning_rate=0.05,  boosting_type='dart')

# ensemble_clf = VotingClassifier(
#     # estimators=[('xgb', clf1), ('rfc', clf2), ('lgbm1', clf3), ('lgbm2', clf4), ('lgbm3', clf5), ('lgbm4', clf6)],
#     estimators=[('rfc', clf2),('lgbm1', clf3), ('lgbm2', clf4), ('lgbm3', clf5), ('lgbm4', clf6)],
#     voting='soft',
#     weights=[1.2 ,1, 1, 1.1, 1.2]
# )

clf1 = XGBClassifier(use_label_encoder=False, learning_rate = 0.025, n_estimators=200)
clf2 = RandomForestClassifier(n_estimators = 350, class_weight={0:85, 1:15}, n_jobs=-1)
clf3 = LGBMClassifier(max_bin=370, num_leaves=60, learning_rate=0.015, class_weight={0:0.75, 1:0.25}, n_estimators=200)
clf4 = LGBMClassifier(max_bin=370, num_leaves=50, learning_rate=0.015, class_weight={0:0.80, 1:0.2}, n_estimators=200)
clf5 = LGBMClassifier(max_bin=370, num_leaves=40, learning_rate=0.015, class_weight={0:0.77, 1:0.23}, n_estimators=200)
clf6 = LGBMClassifier(max_bin=370, num_leaves=70, learning_rate=0.015)

ensemble_clf = VotingClassifier(
    estimators=[('xgb', clf1), ('rfc', clf2), ('lgbm1', clf3), ('lgbm2', clf4), ('lgbm3', clf5), ('lgbm4', clf6)],
    voting='soft',
    weights=[1, 1, 1.1, 1.1, 1, 1]
)


In [62]:
from sklearn.model_selection import StratifiedKFold
from collections import Counter

def SKFUpload(clf, df, transformer,kf=3, outlier=False, over=0, weight_='unif'):

    seeds = np.random.randint(1000)

    df = transformer.transform(df)
    X, y = np.array(df.iloc[:, :-1]), np.array(df.iloc[:, -1])

    # load upload file
    df_upload = pd.read_csv(r'churn_prediction_data\churn_prediction_data\test.csv')
    rownum = df_upload['RowNumber']
    
    results = []
    upload_preds = []

    skf = StratifiedKFold(n_splits=kf)
    for idx, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]

        # sample more from minor class 
        if idx % 2 == 0:
            over_idxs = np.random.choice(np.where(y_train==1)[0], over, replace=True)
            X_train = np.r_[X_train, X_train[over_idxs]]
            y_train = np.r_[y_train, y_train[over_idxs]]

        # fit and show result
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        results.append(evaluate(y_test, y_pred))

        # upload part
        upload_preds.append(clf.predict_proba(transformer.transform(df_upload)))


    upload_prob = np.zeros((len(rownum), 2))
    if weight_ == 'unif':
        weights = [1 for result in results]
    elif weight_ == 'f1':
        weights = [result[2] for result in results]
    elif weight_ == 'precision':
        weights = [result[1] for result in results]

    for upload_pred, weight in zip(upload_preds, weights):
        upload_prob += weight * upload_pred


    upload_result = np.argmax(upload_prob, axis=1)
    print(Counter(upload_result))
    pd.concat([rownum, pd.Series(upload_result, name='Exited')], axis=1).to_csv(r'output.csv')

    return pd.DataFrame(results, columns=['accuracy', 'precision', 'f1_score'], index=np.arange(kf))


table = SKFUpload(ensemble_clf, df, SimplePrepro, over=1000, kf=5, weight_='precision')
table

















Counter({0: 1789, 1: 211})




Unnamed: 0,accuracy,precision,f1_score
0,0.86625,0.807692,0.57874
1,0.8625,0.884058,0.525862
2,0.8525,0.741935,0.539062
3,0.845625,0.773973,0.477801
4,0.858125,0.787356,0.546906
