# Setup

In [50]:
# imports
import pandas as pd
import numpy as np
import data_describe as dd
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

## Prepare Data

In [82]:
# prepare data
def preprocess(df):
    # clean prediction feature, make binary
    df['income'] = df['income'].str.replace('.', '')
    assert set(df['income']) == set([">50K", "<=50K"])
    
    df['income_gt_50k'] = (df['income']==">50K")
    df = df.drop(['income'], axis='columns')

    # onehot encode remaining categorical variables
    cat_cols = list(set(df.columns) - set(df._get_numeric_data().columns))
    df = pd.get_dummies(df, columns=cat_cols, dtype="int64")
    df['income_gt_50k'] = df['income_gt_50k'].astype(int)

    return df


train_data = pd.read_csv('../data/census_income_census_income_data_adult.data', skipinitialspace=True, comment="|")
test_data = pd.read_csv('../data/census_income_census_income_data_adult.test', skipinitialspace=True, comment="|")

train_data = preprocess(train_data)
test_data = preprocess(test_data)

cols = train_data.columns
train_features = list(set(cols) - set(["income_gt_50k"]))
X_train, y_train = train_data.loc[:, train_data.columns.isin(train_features)], np.array(train_data["income_gt_50k"])
X_test, y_test = test_data.loc[:, test_data.columns.isin(train_features)], np.array(test_data["income_gt_50k"])

for dummy_col in list(set(train_features) - set(X_test.columns)):
    X_test[dummy_col] = 0


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.


The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.



# EDA

In [75]:
list(X_train.columns)

['age',
 'fnlwgt',
 'education-num',
 'capital-gain',
 'capital-loss',
 'hours-per-week',
 'sex_Female',
 'sex_Male',
 'native-country_?',
 'native-country_Cambodia',
 'native-country_Canada',
 'native-country_China',
 'native-country_Columbia',
 'native-country_Cuba',
 'native-country_Dominican-Republic',
 'native-country_Ecuador',
 'native-country_El-Salvador',
 'native-country_England',
 'native-country_France',
 'native-country_Germany',
 'native-country_Greece',
 'native-country_Guatemala',
 'native-country_Haiti',
 'native-country_Holand-Netherlands',
 'native-country_Honduras',
 'native-country_Hong',
 'native-country_Hungary',
 'native-country_India',
 'native-country_Iran',
 'native-country_Ireland',
 'native-country_Italy',
 'native-country_Jamaica',
 'native-country_Japan',
 'native-country_Laos',
 'native-country_Mexico',
 'native-country_Nicaragua',
 'native-country_Outlying-US(Guam-USVI-etc)',
 'native-country_Peru',
 'native-country_Philippines',
 'native-country_Poland'

In [45]:
print(y_train.value_counts())
print(y_test.value_counts())
print(X_train)

income_gt_50k
0                24720
1                 7841
dtype: int64
income_gt_50k
0                12435
1                 3846
dtype: int64
       age  fnlwgt  education-num  capital-gain  capital-loss  hours-per-week  \
0       39   77516             13          2174             0              40   
1       50   83311             13             0             0              13   
2       38  215646              9             0             0              40   
3       53  234721              7             0             0              40   
4       28  338409             13             0             0              40   
...    ...     ...            ...           ...           ...             ...   
32556   27  257302             12             0             0              38   
32557   40  154374              9             0             0              40   
32558   58  151910              9             0             0              40   
32559   22  201490              9           

# Modeling

In [83]:
model = XGBClassifier()
model.fit(X_train, y_train)
# TODO do cross-validation




pandas.Int64Index is deprecated and will be removed from pandas in a future version. Use pandas.Index with the appropriate dtype instead.





XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

# Evaluation

In [84]:
model.score(X_test, y_test)

0.8260549106320251

# Serializing