In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings('ignore', category=DeprecationWarning)
from matplotlib.pylab import rcParams
rcParams['figure.figsize'] = 12, 4
from sklearn.model_selection import cross_validate, GridSearchCV
from sklearn import metrics



In [2]:
def feature_engr(train_set, test_set):
    '''
    Taking the raw training and test sets, and cleaning them
    up so that they can be used in training a model
    
    Parameters
    ----------
    train_set: pandas dataframe
        training set read in from pandas
    test_set: pandas dataframe
        test set read in from pandas
    
    Returns
    -------
    train_scaled, test_scaled, y_train, y_test
        feature engineered and cleaned up sets   
    '''
    train_set =train_set.dropna()
    test_set = test_set.dropna()
   
    
    combined = pd.concat([train_set, test_set], axis=0).reset_index(drop=True)
    
    combined.loc[combined.target == ' >50K', 'target']=1
    combined.loc[combined.target==' <=50K', 'target']=0
    combined.loc[combined.target==' <=50K.', 'target']=0
    combined.loc[combined.target == ' >50K.', 'target']=1
    
       
    dummies = pd.get_dummies(combined)
    
    ts = len(train_set)
    ts2 = len(test_set)
    
    train_set = dummies[0:ts]
    test_set = dummies[ts:]
    
    y_train = train_set['target']
    y_test = test_set['target']
    
    train_set = train_set.drop(["target"], axis=1)
    test_set = test_set.drop(["target"], axis=1)
    
    std_scaler = StandardScaler()
    std_scaler.fit(train_set)
    train_scaled = std_scaler.transform(train_set)
    test_scaled = std_scaler.transform(test_set)
    

    return train_scaled, test_scaled, y_train, y_test

In [3]:
columns=['age', 'workclass','fnlwgt','education','education-num','marital-status','occupation',
         'relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country', 'target']
test_set = pd.read_csv("../input/adult.test.txt", index_col=False, names=columns, na_values=[' ?'])

columns=['age', 'workclass','fnlwgt','education','education-num','marital-status','occupation',
         'relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country', 'target']
train_set = pd.read_csv("../input/adult.data.txt", index_col=False, names=columns,na_values=[' ?'])

In [4]:
train_scaled, test_scaled, y_train, y_test = feature_engr(train_set, test_set)

  return self.partial_fit(X, y)


In [5]:
#First, a plain vanilla gradient boosting classifier
model = GradientBoostingClassifier()
model.fit(train_scaled, y_train)


GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features=None, max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_samples_split=2,
              min_weight_fraction_leaf=0.0, n_estimators=100,
              n_iter_no_change=None, presort='auto', random_state=None,
              subsample=1.0, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False)

In [6]:
result = model.score(test_scaled, y_test)
result

0.8606241699867198

In [7]:
params= {'n_estimators':range(20,81,10)}
gsv = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, 
        min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',
        subsample=0.8,random_state=10, verbose=1), 
param_grid = params, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsv.fit(train_scaled,y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.0776           0.0473            3.56s
         2           1.0266           0.0467            3.92s
         3           0.9822           0.0436            4.14s
         4           0.9550           0.0309            4.06s
         5           0.9246           0.0325            4.06s
         6           0.8948           0.0235            4.03s
         7           0.8770           0.0263            3.84s
         8           0.8512           0.0199            3.82s
         9           0.8334           0.0184            3.81s
        10           0.8152           0.0163            3.70s
        20           0.7146           0.0045            3.03s
        30           0.6706           0.0035            2.45s
        40           0.6423           0.0014            1.91s
        50           0.6271           0.0020            1.38s
        60           0.6139           0.0006            0.90s
       

GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=8,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=50, min_sa...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=1, warm_start=False),
       fit_params=None, iid=False, n_jobs=4,
       param_grid={'n_estimators': range(20, 81, 10)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [8]:
score = gsv.score(test_scaled, y_test)
score

0.9100234226303767

In [9]:
gsv.best_params_

{'n_estimators': 80}

In [10]:
params2 = {'max_depth':range(5,16,2), 'min_samples_split':range(200,1001,200)}
gsv2 = GridSearchCV(estimator = GradientBoostingClassifier(learning_rate=0.1, 
       n_estimators=80, max_features='sqrt', subsample=0.8, random_state=10), 
param_grid = params2, scoring='roc_auc',n_jobs=4,iid=False, cv=5)
gsv2.fit(train_scaled,y_train)


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.1, loss='deviance', max_depth=3,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=1, min_sam...      subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=0, warm_start=False),
       fit_params=None, iid=False, n_jobs=4,
       param_grid={'max_depth': range(5, 16, 2), 'min_samples_split': range(200, 1001, 200)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='roc_auc', verbose=0)

In [11]:
gsv2.score(test_scaled, y_test)

0.9124766725352113

In [22]:

estimator = GradientBoostingClassifier(learning_rate=0.01, 
        min_samples_split=500,min_samples_leaf=50,max_depth=8,max_features='sqrt',
        subsample=0.8,random_state=10, verbose=1, n_estimators=950)



In [23]:
estimator.fit(train_scaled,y_train)

      Iter       Train Loss      OOB Improve   Remaining Time 
         1           1.1184           0.0049           43.38s
         2           1.1101           0.0056           49.25s
         3           1.1029           0.0061           53.49s
         4           1.1011           0.0046           52.89s
         5           1.0980           0.0055           52.97s
         6           1.0856           0.0060           53.27s
         7           1.0879           0.0052           51.19s
         8           1.0787           0.0049           51.18s
         9           1.0722           0.0043           51.58s
        10           1.0689           0.0037           50.56s
        20           1.0302           0.0035           48.55s
        30           0.9900           0.0040           48.15s
        40           0.9543           0.0035           47.84s
        50           0.9271           0.0025           47.61s
        60           0.8983           0.0021           46.91s
       

GradientBoostingClassifier(criterion='friedman_mse', init=None,
              learning_rate=0.01, loss='deviance', max_depth=8,
              max_features='sqrt', max_leaf_nodes=None,
              min_impurity_decrease=0.0, min_impurity_split=None,
              min_samples_leaf=50, min_samples_split=500,
              min_weight_fraction_leaf=0.0, n_estimators=950,
              n_iter_no_change=None, presort='auto', random_state=10,
              subsample=0.8, tol=0.0001, validation_fraction=0.1,
              verbose=1, warm_start=False)

In [24]:
estimator.score(test_scaled, y_test)

0.8591633466135458