In [3]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier
from sklearn.model_selection import train_test_split # Import train_test_split function
from sklearn import metrics #Import scikit-learn metrics module for accuracy calculation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV

np.random.seed(123)

data_train = pd.read_csv('adulttrain.data', sep=",",header = None, names = ["age","workclass","fnlwgt"
                                                                     ,"education","education_num","martial_status","occupation","relationship","race","sex",
                                                                     "capital_gain","capital_loss","hours_per_week","native_country","earning"])

data_test = pd.read_csv('adulttest.test', sep=",",header = None, names = ["age","workclass","fnlwgt"
                                                                     ,"education","education_num","martial_status","occupation","relationship","race","sex",
                                                                     "capital_gain","capital_loss","hours_per_week","native_country","earning"])


data = pd.concat([data_train,data_test])

In [4]:
data.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,martial_status,occupation,relationship,race,sex,capital_gain,capital_loss,hours_per_week,native_country,earning
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
data["earning"].unique()

array([' <=50K', ' >50K', ' <=50K.', ' >50K.'], dtype=object)

In [6]:
data["earning"].replace({' <=50K.' :" <=50K"}, inplace=True)
data["earning"].replace({' >50K.' :" >50K"}, inplace=True)

In [7]:
data["earning"].unique()

array([' <=50K', ' >50K'], dtype=object)

In [8]:
for label in data.columns:
    if data[label].dtypes != "int64":
        data[label].replace({' ?' :list(dict(data[label].value_counts()).keys())[0]}, inplace=True)

In [9]:
data.isnull().any()

age               False
workclass         False
fnlwgt            False
education         False
education_num     False
martial_status    False
occupation        False
relationship      False
race              False
sex               False
capital_gain      False
capital_loss      False
hours_per_week    False
native_country    False
earning           False
dtype: bool

In [10]:
data = data.drop('education', axis = 1)

In [11]:
data_1 = pd.get_dummies(data[["workclass","martial_status","occupation","relationship","race","sex","native_country","earning"]], drop_first=True)

In [12]:
data_1.head()

Unnamed: 0,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,martial_status_ Married-AF-spouse,martial_status_ Married-civ-spouse,martial_status_ Married-spouse-absent,...,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia,earning_ >50K
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
1,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
3,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,1,0,0,0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
result = data_1.iloc[:,0:76].copy()
labels = data_1.iloc[:,76:77].copy()

In [14]:
result.head()

Unnamed: 0,workclass_ Local-gov,workclass_ Never-worked,workclass_ Private,workclass_ Self-emp-inc,workclass_ Self-emp-not-inc,workclass_ State-gov,workclass_ Without-pay,martial_status_ Married-AF-spouse,martial_status_ Married-civ-spouse,martial_status_ Married-spouse-absent,...,native_country_ Portugal,native_country_ Puerto-Rico,native_country_ Scotland,native_country_ South,native_country_ Taiwan,native_country_ Thailand,native_country_ Trinadad&Tobago,native_country_ United-States,native_country_ Vietnam,native_country_ Yugoslavia
0,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,0,0,0,0,1,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,0,0,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
labels.head()

Unnamed: 0,earning_ >50K
0,0
1,0
2,0
3,0
4,0


In [16]:
train_x, test_x, train_y, test_y = train_test_split(result, labels, test_size = 0.2, 
                                                    random_state = 42)

print ("Train_x Shape: ",train_x.shape)
print ("Train_y Shape: ", train_y.shape)
print ("Test_x Shape: ", test_x.shape)
print ("Test_y Shape: ", test_y.shape)

Train_x Shape:  (39073, 76)
Train_y Shape:  (39073, 1)
Test_x Shape:  (9769, 76)
Test_y Shape:  (9769, 1)


# 1-Decision Tree Classifier

In [204]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

clf = DecisionTreeClassifier(random_state = seed_1)

X = result
Y = labels
import sklearn
from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return (sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred))+ 
            roc_auc_score(np.array(y_true), np.array(y_pred)))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(clf, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Acuracy of kfold cross validation results:%0.4f"%results.mean())

import time
start_time = time.time()

# Train Decision Tree Classifer
clf = clf.fit(train_x,train_y)

#Predict the response for test dataset
y_pred_DTC = clf.predict(test_x)
print("--- %s seconds ---" % (time.time() - start_time))

print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_DTC))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_DTC))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_DTC))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_DTC))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_DTC))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_DTC))

feature_importances_DTC = pd.DataFrame(clf.feature_importances_,
                                   index = train_x.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

print(feature_importances_DTC.head(10))

Mean of Roc_Auc + Acuracy of kfold cross validation results:1.5320
--- 0.3430819511413574 seconds ---
Confusion Matrix:
[[6669  745]
 [1122 1233]]
Accuracy: 0.8089
AUC Score:0.7115
Precision:0.6234
Recall:0.5236
F1 Score:0.5691
                                    importance
martial_status_ Married-civ-spouse    0.536099
occupation_ Exec-managerial           0.080551
occupation_ Prof-specialty            0.071898
occupation_ Sales                     0.026130
workclass_ Private                    0.021236
workclass_ Self-emp-not-inc           0.020797
sex_ Male                             0.015739
occupation_ Tech-support              0.015732
race_ White                           0.012008
native_country_ United-States         0.011638


In [89]:
#I will take features that has importance greater than 0.01
print(feature_importances_DTC[feature_importances_DTC["importance"]>0.01])
print("total importance generated by these variables: ",feature_importances_DTC[feature_importances_DTC["importance"]>0.01].sum())

                                    importance
martial_status_ Married-civ-spouse    0.536099
occupation_ Exec-managerial           0.080551
occupation_ Prof-specialty            0.071898
occupation_ Sales                     0.026130
workclass_ Private                    0.021236
workclass_ Self-emp-not-inc           0.020797
sex_ Male                             0.015739
occupation_ Tech-support              0.015732
race_ White                           0.012008
native_country_ United-States         0.011638
relationship_ Own-child               0.011600
native_country_ Mexico                0.010982
workclass_ Self-emp-inc               0.010682
total importance generated by these variables:  importance    0.845092
dtype: float64


In [90]:
new_train_x = train_x[feature_importances_DTC[feature_importances_DTC["importance"]>0.01].index]
new_test_x = test_x[feature_importances_DTC[feature_importances_DTC["importance"]>0.01].index]

In [91]:
new_train_x.head()

Unnamed: 0,martial_status_ Married-civ-spouse,occupation_ Exec-managerial,occupation_ Prof-specialty,occupation_ Sales,workclass_ Private,workclass_ Self-emp-not-inc,sex_ Male,occupation_ Tech-support,race_ White,native_country_ United-States,relationship_ Own-child,native_country_ Mexico,workclass_ Self-emp-inc
4632,1,0,0,0,1,0,1,0,1,1,0,0,0
31093,1,0,1,0,0,0,1,0,1,1,0,0,0
1253,1,0,0,0,0,0,1,0,1,1,0,0,0
14500,0,0,0,0,1,0,1,0,1,1,0,0,0
23399,0,0,0,1,0,1,1,0,1,1,0,0,0


In [92]:
new_test_x.head()

Unnamed: 0,martial_status_ Married-civ-spouse,occupation_ Exec-managerial,occupation_ Prof-specialty,occupation_ Sales,workclass_ Private,workclass_ Self-emp-not-inc,sex_ Male,occupation_ Tech-support,race_ White,native_country_ United-States,relationship_ Own-child,native_country_ Mexico,workclass_ Self-emp-inc
7762,0,0,0,0,1,0,1,0,1,1,0,0,0
23881,0,0,0,1,1,0,0,0,1,1,1,0,0
30507,0,0,0,0,0,0,1,0,0,1,0,0,0
28911,0,0,0,1,1,0,0,0,1,1,1,0,0
19484,0,0,0,0,1,0,1,0,0,0,0,0,0


In [93]:
from sklearn import model_selection 
seed_1 = 20
seed = 42
# Create Decision Tree classifer object with max_depth = 2 and criterion = gini
clf = DecisionTreeClassifier(random_state = seed_1)

X = result[feature_importances_DTC[feature_importances_DTC["importance"]>0.01].index]
Y = labels
import sklearn
from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(clf, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Acuracy of kfold cross validation results:%0.4f"%results.mean())

import time
start_time = time.time()
# Train Decision Tree Classifer
clf = clf.fit(new_train_x,train_y)

#Predict the response for test dataset
y_pred_DTC = clf.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_DTC))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_DTC))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_DTC))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_DTC))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_DTC))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_DTC))


Mean of Roc_Auc + Acuracy of kfold cross validation results:1.5248
--- 0.030916929244995117 seconds ---
Confusion Matrix:
[[6730  684]
 [1180 1175]]
Accuracy: 0.8092
AUC Score:0.7033
Precision:0.6321
Recall:0.4989
F1 Score:0.5577


In [205]:
from sklearn.feature_selection import RFE
estimator = clf
selector = RFE(estimator, n_features_to_select=None, step=1)
selector = selector.fit(train_x,np.ravel(train_y))
selector.ranking_

array([ 1, 38,  1,  1,  1,  1, 15, 12,  1,  3,  1,  1,  1, 36,  1,  1,  1,
        1,  1,  1, 22,  1,  2,  1,  1,  1,  1,  1,  1,  7,  1,  1,  1,  1,
        1,  1,  1,  8, 17,  1, 18, 33, 13,  1, 21,  1, 10, 29, 30, 39, 37,
       34, 14,  1,  9, 11,  1,  5,  4, 32,  1, 25, 26, 27,  1, 16, 20,  1,
       35,  6, 19, 31, 28,  1, 24, 23])

In [95]:
data = pd.DataFrame(data = train_x.columns, columns =["variable"] )
data["ranking"] = selector.ranking_

In [100]:
data[data["ranking"]==1]["variable"]

new_train_x = train_x[data[data["ranking"]==1]["variable"]]
new_test_x = test_x[data[data["ranking"]==1]["variable"]]

In [104]:
from sklearn import model_selection 
seed_1 = 20
seed = 42
# Create Decision Tree classifer object with max_depth = 2 and criterion = gini
clf = DecisionTreeClassifier(random_state = seed_1)

X = result[data[data["ranking"]==1]["variable"]]
Y = labels
import sklearn
from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(clf, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Acuracy of kfold cross validation results:%0.4f"%results.mean())

import time
start_time = time.time()
# Train Decision Tree Classifer
clf = clf.fit(new_train_x,train_y)

#Predict the response for test dataset
y_pred_DTC = clf.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_DTC))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_DTC))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_DTC))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_DTC))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_DTC))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_DTC))


Mean of Roc_Auc + Acuracy of kfold cross validation results:1.5348
--- 0.12864947319030762 seconds ---
Confusion Matrix:
[[6661  753]
 [1120 1235]]
Accuracy: 0.8083
AUC Score:0.7114
Precision:0.6212
Recall:0.5244
F1 Score:0.5687


# 2-Bagging Classifier

In [206]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

clf2 = DecisionTreeClassifier(random_state = seed_1)
bgc = BaggingClassifier(base_estimator=clf2, random_state=seed_1)

X = result
Y = np.ravel(labels)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)

from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
results = model_selection.cross_val_score(bgc, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())

import time
start_time = time.time()

# Train Decision Tree Classifer
bgc = bgc.fit(train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_bgc = bgc.predict(test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_bgc))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_bgc))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_bgc))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_bgc))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_bgc))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_bgc))

featimp = np.mean([
    tree.feature_importances_ for tree in bgc.estimators_
], axis=0)

feature_importances_BC = pd.DataFrame(featimp,
                                   index = train_x.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)

print(feature_importances_BC.head(10))


Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5334
--- 1.9560153484344482 seconds ---
Confusion Matrix:
[[6654  760]
 [1117 1238]]
Accuracy: 0.8079
AUC Score:0.7116
Precision:0.6196
Recall:0.5257
F1 Score:0.5688
                                    importance
martial_status_ Married-civ-spouse    0.512888
occupation_ Exec-managerial           0.077402
occupation_ Prof-specialty            0.066454
occupation_ Sales                     0.026649
workclass_ Private                    0.023205
workclass_ Self-emp-not-inc           0.023140
sex_ Male                             0.017419
occupation_ Tech-support              0.016326
race_ White                           0.013761
native_country_ United-States         0.013077


In [110]:
#I will take features that has importance greater than 0.01
print(feature_importances_BC[feature_importances_BC["importance"]>0.01])
print("total importance generated by these variables: ",
      feature_importances_BC[feature_importances_BC["importance"]>0.01].sum())

                                    importance
martial_status_ Married-civ-spouse    0.512888
occupation_ Exec-managerial           0.077402
occupation_ Prof-specialty            0.066454
occupation_ Sales                     0.026649
workclass_ Private                    0.023205
workclass_ Self-emp-not-inc           0.023140
sex_ Male                             0.017419
occupation_ Tech-support              0.016326
race_ White                           0.013761
native_country_ United-States         0.013077
workclass_ Self-emp-inc               0.011064
race_ Black                           0.010856
relationship_ Own-child               0.010407
total importance generated by these variables:  importance    0.822647
dtype: float64


In [111]:
new_train_x = train_x[feature_importances_BC[feature_importances_BC["importance"]>0.01].index]
new_test_x = test_x[feature_importances_BC[feature_importances_BC["importance"]>0.01].index]

In [207]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

clf2 = DecisionTreeClassifier(random_state = seed_1)
bgc = BaggingClassifier(base_estimator=clf2, random_state=seed_1)

X = result[feature_importances_BC[feature_importances_BC["importance"]>0.01].index]
Y = np.ravel(labels)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)

from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
results = model_selection.cross_val_score(bgc, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())

import time
start_time = time.time()

# Train Decision Tree Classifer
bgc = bgc.fit(new_train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_bgc = bgc.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_bgc))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_bgc))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_bgc))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_bgc))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_bgc))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_bgc))


Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5239
--- 0.9937758445739746 seconds ---
Confusion Matrix:
[[6655  759]
 [1110 1245]]
Accuracy: 0.8087
AUC Score:0.7131
Precision:0.6213
Recall:0.5287
F1 Score:0.5712


In [208]:
from sklearn.feature_selection import RFE
estimator = bgc
featimp = np.mean([
    RFE(tree, n_features_to_select=None, step=1).fit(train_x,np.ravel(train_y)).ranking_ for tree in bgc.estimators_
], axis=0)


In [121]:
featimp

array([ 1. , 38. ,  1. ,  1. ,  1. ,  1. , 14.6, 13.7,  1. ,  4.2,  1. ,
        1. ,  1. , 36. ,  1. ,  1. ,  1. ,  1. ,  1. ,  1. , 23.1,  1. ,
        1.6,  1. ,  1. ,  1. ,  1. ,  1. ,  1. ,  6.7,  1. ,  1. ,  1. ,
        1. ,  1. ,  1. ,  1. ,  8.5, 18.2,  1. , 16.5, 34.8, 14. ,  1. ,
       20.7,  1. , 10.5, 28.8, 26.4, 39. , 36.7, 33.3, 12.7,  1. ,  7.9,
       10.5,  1.8,  3.8,  5.3, 31.3,  1. , 24.5, 26. , 26.3,  1. , 15.7,
       20.2,  1. , 34. ,  5.2, 19.2, 30.6, 29.5,  1. , 27.3, 22.9])

In [122]:
data = pd.DataFrame(data = train_x.columns, columns =["variable"] )
data["ranking"] = featimp

In [124]:
data[data["ranking"]==1]["variable"]

new_train_x = train_x[data[data["ranking"]==1]["variable"]]
new_test_x = test_x[data[data["ranking"]==1]["variable"]]

In [209]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

clf2 = DecisionTreeClassifier(random_state = seed_1)
bgc = BaggingClassifier(base_estimator=clf2, random_state=seed_1)

X = result[data[data["ranking"]==1]["variable"]]
Y = np.ravel(labels)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)

from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
results = model_selection.cross_val_score(bgc, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())

import time
start_time = time.time()

# Train Decision Tree Classifer
bgc = bgc.fit(new_train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_bgc = bgc.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_bgc))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_bgc))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_bgc))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_bgc))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_bgc))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_bgc))


Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5362
--- 0.8956453800201416 seconds ---
Confusion Matrix:
[[6655  759]
 [1110 1245]]
Accuracy: 0.8087
AUC Score:0.7131
Precision:0.6213
Recall:0.5287
F1 Score:0.5712


# 3-Random Forest Classifier

In [144]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

X = result
Y = np.ravel(labels)

rfc = RandomForestClassifier(random_state=seed_1)
from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)

kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(rfc, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())

import time
start_time = time.time()
# Train Bagging Classifer
rfc = rfc.fit(train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_RFC = np.ravel(rfc.predict(test_x))
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_RFC))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_RFC))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_RFC))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_RFC))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_RFC))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_RFC))
feature_importances_RFC = pd.DataFrame(rfc.feature_importances_,
                                   index = train_x.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances_RFC.head(10))

Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5343
--- 4.854711055755615 seconds ---
Confusion Matrix:
[[6664  750]
 [1115 1240]]
Accuracy: 0.8091
AUC Score:0.7127
Precision:0.6231
Recall:0.5265
F1 Score:0.5708
                                    importance
martial_status_ Married-civ-spouse    0.213985
martial_status_ Never-married         0.115981
occupation_ Exec-managerial           0.077143
relationship_ Own-child               0.058481
sex_ Male                             0.052683
relationship_ Not-in-family           0.047955
occupation_ Prof-specialty            0.043266
occupation_ Other-service             0.029195
relationship_ Unmarried               0.023091
workclass_ Private                    0.021721


In [145]:
new_train_x = train_x[feature_importances_RFC[feature_importances_RFC["importance"]>0.01].index]
new_test_x = test_x[feature_importances_RFC[feature_importances_RFC["importance"]>0.01].index]

In [146]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

X = result[feature_importances_RFC[feature_importances_RFC["importance"]>0.01].index]
Y = np.ravel(labels)

rfc = RandomForestClassifier(random_state=seed_1)
from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)

kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(rfc, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())
import time
start_time = time.time()
# Train Bagging Classifer
rfc = rfc.fit(new_train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_RFC = np.ravel(rfc.predict(new_test_x))
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_RFC))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_RFC))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_RFC))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_RFC))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_RFC))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_RFC))


Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5359
--- 1.9313933849334717 seconds ---
Confusion Matrix:
[[6649  765]
 [1100 1255]]
Accuracy: 0.8091
AUC Score:0.7149
Precision:0.6213
Recall:0.5329
F1 Score:0.5737


In [147]:
from sklearn.feature_selection import RFE
estimator = rfc
selector = RFE(estimator, n_features_to_select=None, step=1)
selector = selector.fit(train_x,np.ravel(train_y))
selector.support_
selector.ranking_

array([ 1, 38,  1,  1,  1,  1, 20, 21,  1,  1,  1,  1,  1, 36,  1,  1,  1,
        1,  1,  1, 23,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  9, 12,  4, 15, 27, 18,  3, 22,  1,  8, 28, 26, 39, 37,
       31, 19,  1,  6, 10,  2, 14,  5, 32,  1, 24, 33, 30,  1, 11, 16,  1,
       34,  7, 13, 29, 35,  1, 17, 25])

In [148]:
data = pd.DataFrame(data = train_x.columns, columns =["variable"] )
data["ranking"] = selector.ranking_

In [149]:
new_train_x = train_x[data[data["ranking"]==1]["variable"]]
new_test_x = test_x[data[data["ranking"]==1]["variable"]]

In [150]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

X = result[data[data["ranking"]==1]["variable"]]
Y = np.ravel(labels)

rfc = RandomForestClassifier(random_state=seed_1)
from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)

kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(rfc, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())
import time
start_time = time.time()
# Train Bagging Classifer
rfc = rfc.fit(new_train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_RFC = np.ravel(rfc.predict(new_test_x))
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_RFC))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_RFC))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_RFC))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_RFC))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_RFC))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_RFC))


Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5366
--- 3.015632152557373 seconds ---
Confusion Matrix:
[[6662  752]
 [1112 1243]]
Accuracy: 0.8092
AUC Score:0.7132
Precision:0.6231
Recall:0.5278
F1 Score:0.5715


# 4-AdaBoost

In [152]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

X = result
Y = np.ravel(labels)

clf1 = DecisionTreeClassifier(random_state= 5, max_depth = 1)
ada = AdaBoostClassifier(random_state = seed_1)

from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(ada, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())

import time
start_time = time.time()
# Train Bagging Classifer
ada = ada.fit(train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_ada = ada.predict(test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_ada))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_ada))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_ada))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_ada))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_ada))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_ada))

feature_importances_ada = pd.DataFrame(ada.feature_importances_,
                                   index = train_x.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances_ada.head(10))

Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5355
--- 2.323394536972046 seconds ---
Confusion Matrix:
[[6579  835]
 [1044 1311]]
Accuracy: 0.8077
AUC Score:0.7220
Precision:0.6109
Recall:0.5567
F1 Score:0.5825
                                    importance
martial_status_ Married-civ-spouse        0.08
martial_status_ Never-married             0.08
sex_ Male                                 0.06
workclass_ Local-gov                      0.04
occupation_ Exec-managerial               0.04
workclass_ Private                        0.04
workclass_ Self-emp-inc                   0.04
workclass_ Self-emp-not-inc               0.04
relationship_ Own-child                   0.04
occupation_ Other-service                 0.04


In [153]:
new_train_x = train_x[feature_importances_ada[feature_importances_ada["importance"]>0.01].index]
new_test_x = test_x[feature_importances_ada[feature_importances_ada["importance"]>0.01].index]

In [154]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

X = result[feature_importances_ada[feature_importances_ada["importance"]>0.01].index]
Y = np.ravel(labels)

clf1 = DecisionTreeClassifier(random_state= 5, max_depth = 1)
ada = AdaBoostClassifier(random_state = seed_1)

from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(ada, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())

import time
start_time = time.time()
# Train Bagging Classifer
ada = ada.fit(new_train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_ada = ada.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_ada))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_ada))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_ada))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_ada))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_ada))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_ada))


Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5218
--- 1.4093797206878662 seconds ---
Confusion Matrix:
[[6579  835]
 [1044 1311]]
Accuracy: 0.8077
AUC Score:0.7220
Precision:0.6109
Recall:0.5567
F1 Score:0.5825


In [155]:
from sklearn.feature_selection import RFE
estimator = ada
selector = RFE(estimator, n_features_to_select=None, step=1)
selector = selector.fit(train_x,np.ravel(train_y))
selector.support_
selector.ranking_

array([ 1,  8,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 14, 16, 19,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, 23,  1, 27,  1,  1,
       33,  1, 37, 39,  1,  1,  1,  3,  1,  1,  7,  9, 10, 11, 13, 18, 20,
       21, 22,  1, 17,  2, 15, 24, 12,  4,  1,  5,  6,  1,  1, 25, 26, 28,
       29, 30, 31, 32, 34, 35, 36, 38])

In [156]:
data = pd.DataFrame(data = train_x.columns, columns =["variable"] )
data["ranking"] = selector.ranking_

In [157]:
new_train_x = train_x[data[data["ranking"]==1]["variable"]]
new_test_x = test_x[data[data["ranking"]==1]["variable"]]

In [158]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

X = result[data[data["ranking"]==1]["variable"]]
Y = np.ravel(labels)

clf1 = DecisionTreeClassifier(random_state= 5, max_depth = 1)
ada = AdaBoostClassifier(random_state = seed_1)

from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(ada, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())

import time
start_time = time.time()
# Train Bagging Classifer
ada = ada.fit(new_train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_ada = ada.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_ada))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_ada))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_ada))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_ada))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_ada))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_ada))


Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5158
--- 1.4217414855957031 seconds ---
Confusion Matrix:
[[6579  835]
 [1044 1311]]
Accuracy: 0.8077
AUC Score:0.7220
Precision:0.6109
Recall:0.5567
F1 Score:0.5825


# 5-Gradient Boosting Classifier

In [163]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

X = result
Y = np.ravel(labels)

gb = GradientBoostingClassifier(random_state = seed_1)
from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)

kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(gb, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())
import time
start_time = time.time()
# Train Bagging Classifer
gb = gb.fit(train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_gb = gb.predict(test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_gb))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_gb))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_gb))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_gb))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_gb))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_gb))

feature_importances_ada = pd.DataFrame(gb.feature_importances_,
                                   index = train_x.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances_ada.head(10))

Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5300
--- 6.043719291687012 seconds ---
Confusion Matrix:
[[6708  706]
 [1149 1206]]
Accuracy: 0.8101
AUC Score:0.7084
Precision:0.6308
Recall:0.5121
F1 Score:0.5653
                                    importance
martial_status_ Married-civ-spouse    0.653065
occupation_ Exec-managerial           0.093314
occupation_ Prof-specialty            0.084049
occupation_ Other-service             0.020049
workclass_ Self-emp-inc               0.014440
occupation_ Sales                     0.013688
workclass_ Self-emp-not-inc           0.013581
sex_ Male                             0.011452
occupation_ Farming-fishing           0.010841
native_country_ Mexico                0.010135


In [164]:
new_train_x = train_x[feature_importances_ada[feature_importances_ada["importance"]>0.01].index]
new_test_x = test_x[feature_importances_ada[feature_importances_ada["importance"]>0.01].index]

In [165]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

X = result[feature_importances_ada[feature_importances_ada["importance"]>0.01].index]
Y = np.ravel(labels)

gb = GradientBoostingClassifier(random_state = seed_1)
from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)

kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(gb, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())
import time
start_time = time.time()
# Train Bagging Classifer
gb = gb.fit(new_train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_gb = gb.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_gb))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_gb))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_gb))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_gb))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_gb))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_gb))


Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5130
--- 1.3532588481903076 seconds ---
Confusion Matrix:
[[6749  665]
 [1230 1125]]
Accuracy: 0.8060
AUC Score:0.6940
Precision:0.6285
Recall:0.4777
F1 Score:0.5428


In [166]:
from sklearn.feature_selection import RFE
estimator = gb
selector = RFE(estimator, n_features_to_select=None, step=1)
selector = selector.fit(train_x,np.ravel(train_y))
selector.support_
selector.ranking_

array([ 1, 25,  1,  1,  1,  1,  1,  1,  1,  8,  1, 11,  1, 23,  1,  1,  1,
        1,  1,  1, 18,  1,  1,  1,  1,  1,  1,  1,  1, 30,  1,  1, 27,  1,
        1,  1,  1, 19,  1, 39,  4, 21, 31,  1, 38,  1, 17,  7, 14, 37, 36,
       32, 15,  1,  2, 12, 16, 33, 34, 35,  1, 10, 29, 26,  1, 28,  6,  5,
       20,  1, 22, 24,  9,  1, 13,  3])

In [167]:
data = pd.DataFrame(data = train_x.columns, columns =["variable"] )
data["ranking"] = selector.ranking_

In [169]:
new_train_x = train_x[data[data["ranking"]==1]["variable"]]
new_test_x = test_x[data[data["ranking"]==1]["variable"]]

In [170]:
from sklearn import model_selection 
seed_1 = 20
seed = 42

X = result[data[data["ranking"]==1]["variable"]]
Y = np.ravel(labels)

gb = GradientBoostingClassifier(random_state = seed_1)
from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)

kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(gb, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())
import time
start_time = time.time()
# Train Bagging Classifer
gb = gb.fit(new_train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_gb = gb.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_gb))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_gb))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_gb))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_gb))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_gb))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_gb))


Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5262
--- 3.499044418334961 seconds ---
Confusion Matrix:
[[6727  687]
 [1160 1195]]
Accuracy: 0.8109
AUC Score:0.7074
Precision:0.6350
Recall:0.5074
F1 Score:0.5641


# 6-Light GBM

In [21]:
import lightgbm as lgb
from sklearn import model_selection 
import sklearn
seed_1 = 20
seed = 42

X = result
Y = np.ravel(labels)

lgbm = lgb.LGBMClassifier(random_state = seed_1)

from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(lgbm, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())
import time
start_time = time.time()
# Train Bagging Classifer
lgbm = lgbm.fit(train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_lgbm = lgbm.predict(test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_lgbm))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_lgbm))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_lgbm))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_lgbm))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_lgbm))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_lgbm))

feature_importances_lgbm = pd.DataFrame(lgbm.feature_importances_,
                                   index = train_x.columns,
                                    columns=['importance']).sort_values('importance', ascending=False)
print(feature_importances_lgbm.head(10))

Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5409
--- 0.6359169483184814 seconds ---
Confusion Matrix:
[[6709  705]
 [1114 1241]]
Accuracy: 0.8138
AUC Score:0.7159
Precision:0.6377
Recall:0.5270
F1 Score:0.5771
                               importance
sex_ Male                             187
martial_status_ Never-married         136
occupation_ Prof-specialty            132
workclass_ Private                    123
workclass_ Self-emp-not-inc           116
occupation_ Exec-managerial           113
relationship_ Not-in-family           110
race_ White                            98
race_ Black                            94
occupation_ Other-service              94


In [23]:
new_train_x = train_x[feature_importances_lgbm[feature_importances_lgbm["importance"]>0.01].index]
new_test_x = test_x[feature_importances_lgbm[feature_importances_lgbm["importance"]>0.01].index]

In [24]:
seed_1 = 20
seed = 42

X = result[feature_importances_lgbm[feature_importances_lgbm["importance"]>0.01].index]
Y = np.ravel(labels)

lgbm = lgb.LGBMClassifier(random_state = seed_1)

from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(lgbm, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())
import time
start_time = time.time()
# Train Bagging Classifer
lgbm = lgbm.fit(new_train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_lgbm = lgbm.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_lgbm))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_lgbm))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_lgbm))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_lgbm))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_lgbm))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_lgbm))


Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5418
--- 0.4529280662536621 seconds ---
Confusion Matrix:
[[6709  705]
 [1114 1241]]
Accuracy: 0.8138
AUC Score:0.7159
Precision:0.6377
Recall:0.5270
F1 Score:0.5771


In [25]:
from sklearn.feature_selection import RFE
estimator = lgbm
selector = RFE(estimator, n_features_to_select=None, step=1)
selector = selector.fit(train_x,np.ravel(train_y))
selector.support_
selector.ranking_

array([ 1, 38,  1,  1,  1,  1, 30,  1,  1, 11,  1,  1,  1, 31,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  2,
        1,  1, 13, 24,  1, 15,  1, 18,  6,  4, 29, 17, 25,  1, 23, 32, 33,
       35, 36,  3, 20, 12, 19, 16, 27, 39,  1,  7,  8,  9,  1, 34, 22,  1,
       37, 14, 26, 28,  5,  1, 10, 21])

In [26]:
data = pd.DataFrame(data = train_x.columns, columns =["variable"] )
data["ranking"] = selector.ranking_

In [27]:
new_train_x = train_x[data[data["ranking"]==1]["variable"]]
new_test_x = test_x[data[data["ranking"]==1]["variable"]]

In [29]:
seed_1 = 20
seed = 42

X = result[data[data["ranking"]==1]["variable"]]
Y = np.ravel(labels)

lgbm = lgb.LGBMClassifier(random_state = seed_1)

from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(lgbm, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())
import time
start_time = time.time()
# Train Bagging Classifer
lgbm = lgbm.fit(new_train_x,np.ravel(train_y))

#Predict the response for test dataset
y_pred_lgbm = lgbm.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_lgbm))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_lgbm))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_lgbm))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_lgbm))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_lgbm))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_lgbm))


Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5418
--- 0.4358479976654053 seconds ---
Confusion Matrix:
[[6706  708]
 [1113 1242]]
Accuracy: 0.8136
AUC Score:0.7159
Precision:0.6369
Recall:0.5274
F1 Score:0.5770


# Hyper Parameter Tuning

In [72]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import fbeta_score, make_scorer
import sklearn


def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
#first I will do random search over a greater area of parameters
params = {
        'max_bin': [200,255,300,],  
          "tree_learner" : ["serial","voting","feature","data"], 
        "num_boosting_round" : [80,100,150,200], 
          "learning_rate" : [0.01,0.1,0.2,0.3],
        "max_depth" : [-1,2,4,8]
          
        }

model = RandomizedSearchCV(estimator = lgbm, param_distributions = params, scoring=score, n_iter = 300,
                               cv = 3, verbose= 5, n_jobs = -1,random_state = 12)
new_train_x = train_x[data[data["ranking"]==1]["variable"]]
new_test_x = test_x[data[data["ranking"]==1]["variable"]]
model.fit(new_train_x,np.ravel(train_y))
print(model.best_params_)



Fitting 3 folds for each of 300 candidates, totalling 900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    3.7s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:   11.2s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   45.6s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.2min
[Parallel(n_jobs=-1)]: Done 442 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 874 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 900 out of 900 | elapsed:  3.8min finished


{'tree_learner': 'feature', 'num_boosting_round': 200, 'max_depth': 4, 'max_bin': 300, 'learning_rate': 0.3}


In [73]:
best_model = model.best_estimator_


X = result[data[data["ranking"]==1]["variable"]]
Y = np.ravel(labels)

from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))
score = make_scorer(my_custom_loss_func, greater_is_better = True)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(best_model, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())
import time
start_time = time.time()
best_model = best_model.fit(new_train_x,np.ravel(train_y))
y_pred_best_gb = best_model.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_best_gb))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_best_gb))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_best_gb))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_best_gb))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_best_gb))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_best_gb))

Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5410
--- 0.3670046329498291 seconds ---
Confusion Matrix:
[[6703  711]
 [1116 1239]]
Accuracy: 0.8130
AUC Score:0.7151
Precision:0.6354
Recall:0.5261
F1 Score:0.5756


After finding  maximum of sum auc and accuracy I wil further search around this point 
{'tree_learner': 'feature', 'num_boosting_round': 200, 'max_depth': 4, 'max_bin': 300, 'learning_rate': 0.3}

In [74]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
import sklearn


def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))

score = make_scorer(my_custom_loss_func, greater_is_better = True)
params = {
        'max_bin': [298,300,302],  
          "tree_learner" : ["feature"], 
        "num_boosting_round" : [198,200,202], 
          "learning_rate" : [0.25,0.3,0.35],
        "max_depth" : [3,4,5] }
model = GridSearchCV(estimator = lgbm, param_grid = params, scoring=score,
                               cv = 5, verbose= 5, n_jobs = -1)
model.fit(new_train_x,np.ravel(train_y))
print(model.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 tasks      | elapsed:    1.3s
[Parallel(n_jobs=-1)]: Done  64 tasks      | elapsed:    8.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   41.4s
[Parallel(n_jobs=-1)]: Done 280 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 405 out of 405 | elapsed:  1.4min finished


{'learning_rate': 0.35, 'max_bin': 298, 'max_depth': 4, 'num_boosting_round': 198, 'tree_learner': 'feature'}


In [75]:
best_model = model.best_estimator_


X = result[data[data["ranking"]==1]["variable"]]
Y = np.ravel(labels)

from sklearn.metrics import fbeta_score, make_scorer
def my_custom_loss_func(y_true, y_pred):
    return sklearn.metrics.accuracy_score(np.array(y_true), np.array(y_pred)) +  roc_auc_score(np.array(y_true), np.array(y_pred))
score = make_scorer(my_custom_loss_func, greater_is_better = True)
kfold = model_selection.KFold(n_splits = 10, random_state = seed,shuffle = True)
results = model_selection.cross_val_score(best_model, X, Y, cv = kfold, scoring = score) 
print("Mean of Roc_Auc + Accuracy of kfold cross validation results:%0.4f"%results.mean())
import time
start_time = time.time()
best_model = best_model.fit(new_train_x,np.ravel(train_y))
y_pred_best_gb = best_model.predict(new_test_x)
print("--- %s seconds ---" % (time.time() - start_time))
print("Confusion Matrix:")
print(confusion_matrix(test_y, y_pred_best_gb))
print("Accuracy: %0.4f"%metrics.accuracy_score(test_y, y_pred_best_gb))
print ("AUC Score:%0.4f"%roc_auc_score(test_y, y_pred_best_gb))
print ("Precision:%0.4f"%precision_score(test_y, y_pred_best_gb))
print ("Recall:%0.4f"%recall_score(test_y, y_pred_best_gb))
print ("F1 Score:%0.4f"%f1_score(test_y, y_pred_best_gb))

Mean of Roc_Auc + Accuracy of kfold cross validation results:1.5420
--- 0.30100345611572266 seconds ---
Confusion Matrix:
[[6695  719]
 [1105 1250]]
Accuracy: 0.8133
AUC Score:0.7169
Precision:0.6348
Recall:0.5308
F1 Score:0.5782
