In [1]:
%matplotlib inline
import pandas as pd
# Load the adult-census CSV export directly from OpenML (requires network access).
df = pd.read_csv(
    filepath_or_buffer="https://www.openml.org/data/get_csv/1595261/adult-census.csv",
)

In [2]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25,Private,226802,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,89814,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,336951,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,160323,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,18,?,103497,Some-college,10,Never-married,?,Own-child,White,Female,0,0,30,United-States,<=50K


In [3]:
from IPython.display import IFrame
# Embed the OpenML dataset page inline for reference.
IFrame(
    src='https://www.openml.org/d/1590',
    width=1000,
    height=600,
)

In [4]:
# Show the column labels of the loaded frame.
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'class'],
      dtype='object')

In [5]:
# Class balance of the label column.
df.loc[:, 'class'].value_counts()

 <=50K    37155
 >50K     11687
Name: class, dtype: int64

In [6]:
# Column roles used throughout the notebook: numeric features, categorical
# features, and the label column.
numerical = [
    'age',
    'education-num',
    'capital-gain',
    'capital-loss',
    'hours-per-week',
]
categorical = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
]
target = ['class']

# Features first (numeric, then categorical), label last.
all_columns = [*numerical, *categorical, *target]
# Keep only the columns listed in all_columns; anything else (e.g. 'fnlwgt') is dropped.
df = df.loc[:, all_columns]

In [7]:
from pandas_profiling import ProfileReport

# Build the profiling report through the imported class instead of the
# DataFrame accessor; the report is the cell's displayed value.
ProfileReport(df)



In [8]:
from sklearn.preprocessing import LabelEncoder

# Feature matrix: numeric columns followed by the categorical ones.
x = df.loc[:, numerical + categorical]

# Integer-encode the string labels of the 'class' column.
y = LabelEncoder().fit_transform(df['class'])

In [9]:
# Sanity check: features and labels must have the same number of rows.
(x.shape, y.shape)

((48842, 13), (48842,))

In [10]:
# Class balance of the label column (same check as the earlier value_counts cell).
df['class'].value_counts()

 <=50K    37155
 >50K     11687
Name: class, dtype: int64

In [11]:
# One-hot encode the categorical feature columns; numeric columns pass through unchanged.
x = pd.get_dummies(data=x)

In [12]:
# Feature-matrix dimensions after one-hot encoding.
x.shape

(48842, 107)

In [13]:
#from sklearn.preprocessing import StandardScaler
#scaler = StandardScaler()
#x_scaled = scaler.fit_transform(x)

In [14]:
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import RandomForestClassifier

# Rank the one-hot encoded features with a random forest and keep those whose
# importance clears SelectFromModel's default threshold.
#
# random_state pins the forest: without it the importances, and therefore the
# set of selected columns and every downstream result, change from run to run.
#
# NOTE(review): the selector is fitted on every row, before the train/test
# split and before cross-validation, so held-out rows influence which features
# are kept (data leakage). Fitting the selector on training data only, e.g.
# inside a Pipeline, would remove that optimism.
clf = RandomForestClassifier(n_estimators=100, max_features='sqrt', random_state=20)
clf = clf.fit(x, y)

# prefit=True reuses the already-fitted forest instead of refitting it.
model = SelectFromModel(clf, prefit=True)
x_reduced = model.transform(x)
x_reduced.shape

(48842, 16)

In [15]:
from sklearn.model_selection import train_test_split

# 70/30 hold-out split, stratified on the label so both parts keep the class
# ratio. random_state fixes the shuffle so the split is identical on every
# Restart & Run All.
x_train, x_test, y_train, y_test = train_test_split(
    x_reduced,
    y,
    test_size=0.3,
    stratify=y,
    random_state=20,
)

# xgboost

In [16]:
import xgboost as xgb

# Booster parameters shared by xgb.cv and xgb.train.
#
# 'stratified' was removed from this dict: it is not a booster parameter but a
# keyword argument of xgb.cv, so placing it here never stratified the folds.
# NOTE(review): 'silent' is deprecated in newer XGBoost releases in favour of
# 'verbosity' -- confirm against the installed version before switching.
params = {
    'booster': 'gbtree',
    'objective': 'binary:logistic',
    'eval_metric': 'auc',
    'gamma': 0,
    'max_depth': 10,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'min_child_weight': 15,
    'learning_rate': 0.1,
    'seed': 20,
    'silent': 1,
}

In [17]:
# Wrap the split arrays in XGBoost's DMatrix container.
# The training matrix carries labels; the test matrix is features only.
dtrain = xgb.DMatrix(data=x_train, label=y_train)
dtest = xgb.DMatrix(data=x_test)

In [18]:
# 5-fold cross-validation with early stopping to choose the number of boosting
# rounds. Stratification is requested here, as an xgb.cv argument: a
# 'stratified' entry inside the params dict has no effect on fold construction.
res = xgb.cv(
    params,
    dtrain,
    num_boost_round=2000,
    nfold=5,
    stratified=True,
    early_stopping_rounds=50,
)

In [19]:
# Retrain on the full training matrix for as many rounds as the CV history has
# rows. Only the training set is monitored, so the logged AUC is train-AUC.
watchlist = [(dtrain, 'train')]
model = xgb.train(
    params,
    dtrain,
    num_boost_round=len(res),
    evals=watchlist,
)

[0]	train-auc:0.897134
[1]	train-auc:0.907865
[2]	train-auc:0.910179
[3]	train-auc:0.908157
[4]	train-auc:0.912497
[5]	train-auc:0.914602
[6]	train-auc:0.91513
[7]	train-auc:0.914506
[8]	train-auc:0.916171
[9]	train-auc:0.916447
[10]	train-auc:0.917375
[11]	train-auc:0.918065
[12]	train-auc:0.918717
[13]	train-auc:0.918824
[14]	train-auc:0.918884
[15]	train-auc:0.919194
[16]	train-auc:0.919561
[17]	train-auc:0.919367
[18]	train-auc:0.920757
[19]	train-auc:0.920734
[20]	train-auc:0.921069
[21]	train-auc:0.921157
[22]	train-auc:0.92139
[23]	train-auc:0.922291
[24]	train-auc:0.922535
[25]	train-auc:0.922755
[26]	train-auc:0.922891
[27]	train-auc:0.923509
[28]	train-auc:0.923831
[29]	train-auc:0.92415
[30]	train-auc:0.924326
[31]	train-auc:0.924601
[32]	train-auc:0.924781
[33]	train-auc:0.924889
[34]	train-auc:0.925163
[35]	train-auc:0.925335
[36]	train-auc:0.925539
[37]	train-auc:0.925679
[38]	train-auc:0.925849
[39]	train-auc:0.925963
[40]	train-auc:0.926193
[41]	train-auc:0.92632
[42]	t

## model stacking

In [20]:
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
# Public import path; sklearn.ensemble.gradient_boosting is a private module
# that was deprecated and later removed.
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.svm import SVC
from mlxtend.classifier import StackingClassifier
import numpy as np

# Seed for the stochastic estimators so the CV scores are reproducible.
stacking_seed = 20

clf1 = KNeighborsClassifier()
clf2 = RandomForestClassifier(random_state=stacking_seed)
clf3 = GradientBoostingClassifier(random_state=stacking_seed)
clf4 = AdaBoostClassifier(random_state=stacking_seed)
# Hyperparameters are passed as keyword arguments. The previous
# XGBClassifier(params={...}) call did not configure the model, because the
# estimator has no `params` argument and the nested dict was never unpacked.
# 'stratified' (an xgb.cv option) and 'silent' are not carried over, and
# 'seed' is expressed as random_state.
clf5 = XGBClassifier(
    booster='gbtree',
    objective='binary:logistic',
    eval_metric='auc',
    gamma=0,
    max_depth=10,
    subsample=0.7,
    colsample_bytree=0.7,
    min_child_weight=15,
    learning_rate=0.1,
    random_state=stacking_seed,
)
clf6 = SVC()
lr = LogisticRegression()

# Stack the six base learners under a logistic-regression meta-classifier.
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3, clf4, clf5, clf6],
                          meta_classifier=lr)

# Each estimator paired with the label printed next to its score.
labelled_estimators = [
    ('KNN', clf1),
    ('Random Forest', clf2),
    ('Gradient Boosting', clf3),
    ('AdaBoost', clf4),
    ('xgboost', clf5),
    ('SVC', clf6),
    ('StackingClassifier', sclf),
]

print('5-fold cross validation:\n')

# NOTE(review): x_reduced was produced by a selector fitted on all rows, so
# these CV scores are optimistic; KNN and SVC are also run on unscaled features.
# The loop variable is named `estimator` so it does not overwrite the fitted
# random forest bound to `clf` by the feature-selection cell.
for label, estimator in labelled_estimators:
    scores = model_selection.cross_val_score(estimator, x_reduced, y,
                                             cv=5, scoring='accuracy')

    print('Accuracy: %0.2f (+/- %0.2f) [%s]'
          % (scores.mean(), scores.std(), label))

5-fold cross validation:

Accuracy: 0.85 (+/- 0.00) [KNN]
Accuracy: 0.84 (+/- 0.00) [Random Forest]
Accuracy: 0.86 (+/- 0.00) [Gradient Boosting]
Accuracy: 0.86 (+/- 0.00) [AdaBoost]
Accuracy: 0.86 (+/- 0.00) [xgboost]
Accuracy: 0.86 (+/- 0.00) [SVC]
Accuracy: 0.84 (+/- 0.00) [StackingClassifier]
