In [82]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('seaborn-white')

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, Ridge, Lasso, SGDRegressor, ElasticNet
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, StratifiedKFold

from xgboost import XGBClassifier

In [52]:
train = pd.read_csv('./Data/소득예측경진대회/train.csv')
test = pd.read_csv('./Data/소득예측경진대회/test.csv')
submission = pd.read_csv('./Data/소득예측경진대회/sample_submission.csv')

train = train.drop(['id'], axis = 1)

train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,32,Private,309513,Assoc-acdm,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
1,33,Private,205469,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,46,Private,149949,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,23,Private,193090,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,0
4,55,Private,60193,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,0


In [53]:
print(len(train))
print(len(test))

17480
15081


In [54]:
train.isnull().sum()

age                  0
workclass         1836
fnlwgt               0
education            0
education.num        0
marital.status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital.gain         0
capital.loss         0
hours.per.week       0
native.country     583
target               0
dtype: int64

In [70]:
test.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
dtype: int64

In [55]:
train.value_counts('workclass')

workclass
Private             11568
Self-emp-not-inc     1272
Local-gov            1053
State-gov             659
Self-emp-inc          594
Federal-gov           485
Never-worked            7
Without-pay             6
dtype: int64

In [56]:
train['workclass'] = train['workclass'].fillna('Private')
train['occupation'] = train['occupation'].fillna('Exec-managerial')
train['native.country'] = train['native.country'].fillna('United-States')

In [57]:
train.isnull().sum()

age               0
workclass         0
fnlwgt            0
education         0
education.num     0
marital.status    0
occupation        0
relationship      0
race              0
sex               0
capital.gain      0
capital.loss      0
hours.per.week    0
native.country    0
target            0
dtype: int64

In [58]:
train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country,target
0,32,Private,309513,Assoc-acdm,12,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
1,33,Private,205469,Some-college,10,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,1
2,46,Private,149949,Some-college,10,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,0
3,23,Private,193090,Bachelors,13,Never-married,Adm-clerical,Own-child,White,Female,0,0,30,United-States,0
4,55,Private,60193,HS-grad,9,Divorced,Adm-clerical,Not-in-family,White,Female,0,0,40,United-States,0


In [59]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17480 entries, 0 to 17479
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             17480 non-null  int64 
 1   workclass       17480 non-null  object
 2   fnlwgt          17480 non-null  int64 
 3   education       17480 non-null  object
 4   education.num   17480 non-null  int64 
 5   marital.status  17480 non-null  object
 6   occupation      17480 non-null  object
 7   relationship    17480 non-null  object
 8   race            17480 non-null  object
 9   sex             17480 non-null  object
 10  capital.gain    17480 non-null  int64 
 11  capital.loss    17480 non-null  int64 
 12  hours.per.week  17480 non-null  int64 
 13  native.country  17480 non-null  object
 14  target          17480 non-null  int64 
dtypes: int64(7), object(8)
memory usage: 2.0+ MB


In [60]:
X = train.drop(['target'], axis = 1)
y = train['target']

In [61]:
columns = ('workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country')

for cols in columns:
    le = LabelEncoder()
    le.fit(list(X[cols]))
    X[cols] = le.transform(list(X[cols]))

In [62]:
X.head()

Unnamed: 0,age,workclass,fnlwgt,education,education.num,marital.status,occupation,relationship,race,sex,capital.gain,capital.loss,hours.per.week,native.country
0,32,3,309513,7,12,2,2,0,4,1,0,0,40,38
1,33,3,205469,15,10,2,3,0,4,1,0,0,40,38
2,46,3,149949,15,10,2,2,0,4,1,0,0,40,38
3,23,3,193090,9,13,4,0,3,4,0,0,0,30,38
4,55,3,60193,11,9,0,0,1,4,0,0,0,40,38


In [63]:
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 123)

print(x_train.shape, x_test.shape)
print(y_train.shape, y_test.shape)

(13984, 14) (3496, 14)
(13984,) (3496,)


In [83]:
params = {
    'min_child_weight' : [1, 5, 10],
    'gamma' : [0.5, 1, 1.5, 2, 5],
    'subsample' : [0.6, 0.8, 1.0],
    'colsample_bytree' : [0.6, 0.8, 1.0],
    'max_depth' : [3, 4, 5]
}

In [84]:
xgb = XGBClassifier(tree_method = 'gpu_hist',
                       predictor = 'gpu_predictor',
                       gpu_id=0,
                    learning_rate = 0.02,
                    n_estimators = 600,
                    objective = 'binary:logistic',
                    silent = True,
                    nthread = 1)

In [86]:
folds = 3
param_comb = 5

skf = StratifiedKFold(n_splits = folds, shuffle = True, random_state = 1000)

random_search = RandomizedSearchCV(xgb, param_distributions = params,
                                  n_iter = param_comb,
                                  scoring = 'roc_auc',
                                  n_jobs = 4,
                                  cv = skf.split(x_train, y_train),
                                  verbose = 3,
                                  random_state = 1000)

random_search.fit(x_train, y_train)

Fitting 3 folds for each of 5 candidates, totalling 15 fits




Parameters: { "silent" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




RandomizedSearchCV(cv=<generator object _BaseKFold.split at 0x0000026D30DA55F0>,
                   estimator=XGBClassifier(base_score=None, booster=None,
                                           colsample_bylevel=None,
                                           colsample_bynode=None,
                                           colsample_bytree=None, gamma=None,
                                           gpu_id=0, importance_type='gain',
                                           interaction_constraints=None,
                                           learning_rate=0.02,
                                           max_delta_step=None, max_depth=None,
                                           min_child_weight=None, missin...
                                           random_state=None, reg_alpha=None,
                                           reg_lambda=None,
                                           scale_pos_weight=None, silent=True,
                                           subsa

In [91]:
print(random_search.best_estimator_)

print(random_search.best_params_)

results = pd.DataFrame(random_search.cv_results_)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.6, gamma=1.5, gpu_id=0,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.02, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=600, n_jobs=1, nthread=1, num_parallel_tree=1,
              predictor='gpu_predictor', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, silent=True, subsample=0.8,
              tree_method='gpu_hist', validate_parameters=1, verbosity=None)
{'subsample': 0.8, 'min_child_weight': 1, 'max_depth': 3, 'gamma': 1.5, 'colsample_bytree': 0.6}


In [93]:
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_subsample,param_min_child_weight,param_max_depth,param_gamma,param_colsample_bytree,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,6.582552,0.038681,0.030582,0.007187,1.0,5,4,0.5,1.0,"{'subsample': 1.0, 'min_child_weight': 5, 'max...",0.914512,0.920133,0.914823,0.916489,0.00258,2
1,5.375593,0.3907,0.01887,0.004869,0.6,10,5,5.0,1.0,"{'subsample': 0.6, 'min_child_weight': 10, 'ma...",0.91241,0.917811,0.911919,0.914046,0.002669,5
2,5.062209,0.013841,0.025245,0.003515,0.8,1,3,1.5,0.6,"{'subsample': 0.8, 'min_child_weight': 1, 'max...",0.915845,0.920343,0.915601,0.917263,0.00218,1
3,5.034774,0.102031,0.026027,0.005654,0.6,5,4,5.0,0.6,"{'subsample': 0.6, 'min_child_weight': 5, 'max...",0.913625,0.918297,0.912342,0.914755,0.002559,4
4,3.748296,0.200334,0.013665,0.004847,0.8,5,3,0.5,0.6,"{'subsample': 0.8, 'min_child_weight': 5, 'max...",0.913889,0.918813,0.913709,0.91547,0.002365,3


In [94]:
pred = random_search.predict(x_test)

acc = accuracy_score(y_test, pred)
print(acc)

0.8664187643020596


In [68]:
test = test.drop('id', axis = 1)
test.shape

(15081, 14)

In [95]:
columns = ('workclass', 'education', 'marital.status', 'occupation', 'relationship', 'race', 'sex', 'native.country')

for cols in columns:
    le = LabelEncoder()
    le.fit(list(test[cols]))
    test[cols] = le.transform(list(test[cols]))



fin_pred = random_search.predict(test)

In [96]:
submission.head()

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [97]:
xgb_submission = pd.DataFrame({'id': submission.id, 'target': fin_pred})
xgb_submission.to_csv('xgb_submission.csv', index = False)