# Import Libraries

In [27]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold, cross_val_score, cross_val_predict
from sklearn import metrics

from sklearn import svm
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

# Check Data

In [97]:
train = pd.read_csv('train.csv')

# 전처리
# column 삭제
train.drop(columns = ['fnlwgt','education','race','gender'],inplace = True)

# work
train['workclass'] = train['workclass'].fillna('None')
train = train.drop(train[(train['workclass'] == 'Never-worked')].index)

# 직업이 'Armed-Forces', 'Other-service', 'Priv-house-serv' 인 것 제거
train['occupation'] = train['occupation'].fillna('None')
train = train.drop(train[(train['occupation'] == 'Armed-Forces') | (train['occupation'] == 'Other-service') 
                         | (train['occupation'] == 'Priv-house-serv')].index)


# native - country 가 us 에 대해서만
train = train[train['native-country'] == 'United-States' ]
train= train.drop('native-country', axis = 1)

# 1에 대한 데이터는 많이 존재하지 않기 때문
train = train[train['educational-num'] != 1]

# column 명 바꿈
train = train.rename(columns={'income_>50K':'income'})

# for those of who whose native country is United-States
train = train[train['native-country'] == 'United-States' ]
train = train[train['educational-num'] != 1]
train= train.drop('native-country', axis = 1)

sorted(train['occupation'].unique().tolist())

train = train.drop(train[(train['workclass'] == 'Never-worked')].index)

sorted(train['workclass'].unique().tolist())

In [98]:
def edufunc(x):
    if x in [2,3]:
        return 0
        # return 'ES'
    elif x in [4]:
        return 1
        # return 'MS'
    elif x in [5,6,7,8]:
        return 2
        # return 'HS'
    elif x == 9:
        return 3
        # return 'HS-grad'
        
        
    elif x == 10:
        return 4
        # return 'Some-college'
    elif x == 11:
        return 5
        # return 'Assoc-voc'
    elif x == 12:
        return 6
        # return 'Assoc-acdm'
    elif x == 13:
        return 7
        # return 'Bachelors'
    elif x == 14:
        return 8
        # return 'Masters'
    elif x == 15:
        return 9
        # return 'Prof-school'
    elif x == 16:
        return 10
        # return 'Doctorate'
    

In [99]:
train['educational-num'] = train['educational-num'].map(edufunc)

In [100]:
def hw(x):
    if x == 'Husband' or x == 'Wife':
        return 'H&W'
    else:
        return x

In [101]:
train['relationship'] = train['relationship'].map(hw)

In [102]:
train.head()

Unnamed: 0,age,workclass,educational-num,marital-status,occupation,relationship,capital-gain,capital-loss,hours-per-week,income
0,67,Private,10,Divorced,Exec-managerial,Not-in-family,99999,0,60,1
2,31,Private,7,Married-civ-spouse,Exec-managerial,H&W,0,0,40,1
3,58,State-gov,1,Married-civ-spouse,Transport-moving,H&W,0,0,40,0
5,59,State-gov,3,Never-married,Adm-clerical,Own-child,0,0,40,0
6,70,Private,2,Married-civ-spouse,Machine-op-inspct,H&W,2653,0,40,0


In [103]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 35456 entries, 0 to 43956
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              35456 non-null  int64 
 1   workclass        35456 non-null  object
 2   educational-num  35456 non-null  int64 
 3   marital-status   35456 non-null  object
 4   occupation       35456 non-null  object
 5   relationship     35456 non-null  object
 6   capital-gain     35456 non-null  int64 
 7   capital-loss     35456 non-null  int64 
 8   hours-per-week   35456 non-null  int64 
 9   income           35456 non-null  int64 
dtypes: int64(6), object(4)
memory usage: 3.0+ MB


In [104]:
# categorical_col = ['workclass', 'marital-status', 'occupation', 'relationship', 'race', 'gender','edu']
# one-hot encoding
df = pd.get_dummies(train)
df.head()

Unnamed: 0,age,educational-num,capital-gain,capital-loss,hours-per-week,income,workclass_Federal-gov,workclass_Local-gov,workclass_None,workclass_Private,...,occupation_Prof-specialty,occupation_Protective-serv,occupation_Sales,occupation_Tech-support,occupation_Transport-moving,relationship_H&W,relationship_Not-in-family,relationship_Other-relative,relationship_Own-child,relationship_Unmarried
0,67,10,99999,0,60,1,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,31,7,0,0,40,1,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
3,58,1,0,0,40,0,0,0,0,0,...,0,0,0,0,1,1,0,0,0,0
5,59,3,0,0,40,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,70,2,2653,0,40,0,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0


In [105]:
df.shape

(35456, 38)

# Test Train Spilt

In [125]:
from sklearn.model_selection import train_test_split
X_df = df.drop(columns = 'educational-num',axis = 1)
y_df = df['educational-num']
# y = df[['educational-num']]

In [110]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.01, random_state = 42, stratify = y)

In [111]:
print(X_train.shape)
print(X_test.shape)

(35101, 37)
(355, 37)


# Classification

In [112]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

classifier1 = DecisionTreeClassifier(criterion='gini')  
classifier1.fit(X_train, y_train)

DecisionTreeClassifier()

In [113]:
y_pred_1 = classifier1.predict(X_test)  
print(y_pred_1)

[ 3  8  7  7  7  4  8  8  3  1  7  3  7  3  4  6  4  2  4  8  4  2  3  2
  2  3  3  4  7  3  4  4  3  3  7  8  7  7  3  7  3  2  7  3  7  6  3  3
  3  3  2  7  2  8  2  7  7 10  3  7  1  7  5  4  4  2  3  2  4  5  4  8
  1  3  3  3  2  7  7  7  4  4  4  8  3  3  3  4  3  2  3  4  4  9  5  2
  4  2  7  2  4  3  2  6  4  4  3  8  4  3  3  3  3  5  7  7  7  3  4 10
  4  3  3  4  3  3  7  4  3  4  3  3  1  7  3  3  2  7  3  3  7  3  3  3
  7  7  3  3  3  4  7  4  2  4  3  2  3  4  4  7  2  3  3  4  4  4  3  3
  4  4  7  4  4  3  3  3  9  3  7  3  7  3  2  3  3  4  4  4  2  2  3  4
  8  3  3  4  9  3  2  3  4  7  6  3  4  3  4 10  7 10  3  3  3  2  4  4
  3  7  3  7  7  8  4  2  4  7  3  4  2  3  3  3  3  4  2  3  3  3  2  3
  1  3  3  4  3  4  2  6  9  9  4  3  3  4  7  2  3  7  7  3  3  1  7  3
  2  7  2  4  3  1  4  1  3  3  3  4  4  4  4  8  3  4  9  3  3  7  3  7
  3  3  9  4  3  2  7  2  3  2  4  7  3  3  4  4  3  4  3  3  2  3  3  3
  3  4  3  4  3  3  2  2  3  4  3  3  4  4  2  2  4

In [114]:


acc_1 = accuracy_score(y_test,y_pred_1)
print("Accuracy for Gini model {} %".format(acc_1*100))

from sklearn.metrics import classification_report, confusion_matrix  
print(confusion_matrix(y_test, y_pred_1))


Accuracy for Gini model 31.83098591549296 %
[[ 0  0  1  0  1  0  0  0  0  0  0]
 [ 0  1  2  1  0  0  0  1  1  0  0]
 [ 0  2 11  9  5  0  0  2  0  0  0]
 [ 0  3 12 54 32  2  3  7  0  1  1]
 [ 0  0 10 35 20  2  2 10  0  2  0]
 [ 0  0  1  6  2  1  0  4  2  0  0]
 [ 0  0  1  4  5  0  1  1  0  0  0]
 [ 0  1  5 20 11  1  2 18  2  2  1]
 [ 0  0  0  3  3  0  0  5  7  2  1]
 [ 0  0  0  1  0  0  0  3  1  0  1]
 [ 0  1  0  0  1  0  0  2  0  0  0]]


print(y_pred_1)

In [115]:
from sklearn.neighbors import KNeighborsClassifier


classifier2 = KNeighborsClassifier(n_neighbors= 3)  
classifier2.fit(X_train, y_train) 

y_pred_2 = classifier2.predict(X_test)  

acc_2 = accuracy_score(y_test,y_pred_2)
print("Accuracy for KNN model {} %".format(acc_2*100))
print(confusion_matrix(y_test, y_pred_2))

  return self._fit(X, y)


Accuracy for KNN model 32.95774647887324 %
[[ 0  0  1  1  0  0  0  0  0  0  0]
 [ 0  1  1  4  0  0  0  0  0  0  0]
 [ 0  3  9 12  3  0  0  2  0  0  0]
 [ 1  1 15 71 21  1  0  5  0  0  0]
 [ 1  3  7 42 18  1  0  8  0  1  0]
 [ 0  0  1  6  3  0  1  4  1  0  0]
 [ 0  0  1  7  2  0  0  0  2  0  0]
 [ 0  2  5 25 11  0  2 14  4  0  0]
 [ 0  0  2  4  6  0  0  4  4  1  0]
 [ 0  0  0  3  2  0  0  1  0  0  0]
 [ 0  0  1  0  1  0  0  2  0  0  0]]


In [116]:
from sklearn.linear_model import LogisticRegression
classifier3 = LogisticRegression(random_state=0)
classifier3.fit(X_train, y_train) 

y_pred_3 = classifier3.predict(X_test)  

acc_3 = accuracy_score(y_test,y_pred_3)
print("Accuracy for LR model {} %".format(acc_3*100))
print(confusion_matrix(y_test, y_pred_3))

  return f(*args, **kwargs)


Accuracy for LR model 32.67605633802817 %
[[  0   0   0   2   0   0   0   0   0   0   0]
 [  0   0   0   6   0   0   0   0   0   0   0]
 [  0   0   0  29   0   0   0   0   0   0   0]
 [  0   0   0 113   0   0   0   2   0   0   0]
 [  0   0   0  80   0   0   0   1   0   0   0]
 [  0   0   0  15   0   0   0   1   0   0   0]
 [  0   0   0  11   0   0   0   1   0   0   0]
 [  0   0   0  59   0   0   0   3   0   1   0]
 [  0   0   0  21   0   0   0   0   0   0   0]
 [  0   0   0   6   0   0   0   0   0   0   0]
 [  0   0   0   4   0   0   0   0   0   0   0]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [117]:
from sklearn.ensemble import GradientBoostingClassifier  

classifier4 = GradientBoostingClassifier()  
classifier4.fit(X_train, y_train) 

y_pred_4 = classifier4.predict(X_test)  

acc_4 = accuracy_score(y_test,y_pred_4)
print("Accuracy for Gradient Boost model {} %".format(acc_4*100))
print(confusion_matrix(y_test, y_pred_4))

  return f(*args, **kwargs)


Accuracy for Gradient Boost model 43.66197183098591 %
[[ 0  0  1  1  0  0  0  0  0  0  0]
 [ 0  0  0  5  0  0  0  1  0  0  0]
 [ 0  0  4 23  1  0  0  1  0  0  0]
 [ 0  0  1 96  7  0  0 11  0  0  0]
 [ 0  0  0 54 12  0  0 15  0  0  0]
 [ 0  0  0 10  1  0  0  5  0  0  0]
 [ 0  0  0  8  2  0  0  1  1  0  0]
 [ 0  0  0 21  4  0  0 34  4  0  0]
 [ 0  0  0  1  0  0  0 14  6  0  0]
 [ 0  0  0  0  0  0  0  3  0  3  0]
 [ 0  0  0  1  0  0  0  1  1  1  0]]


In [118]:
from sklearn.ensemble import RandomForestClassifier  

classifier5 = RandomForestClassifier()  
classifier5.fit(X_train, y_train) 

y_pred_5 = classifier5.predict(X_test)  

acc_5 = accuracy_score(y_test,y_pred_5)
print("Accuracy for Random Forest model {} %".format(acc_5*100))
print(confusion_matrix(y_test, y_pred_5))

  classifier5.fit(X_train, y_train)


Accuracy for Random Forest model 39.436619718309856 %
[[ 0  0  1  1  0  0  0  0  0  0  0]
 [ 0  0  2  2  0  0  0  2  0  0  0]
 [ 0  2  7 16  2  0  0  2  0  0  0]
 [ 0  1  8 72 24  2  2  6  0  0  0]
 [ 0  0  4 38 23  0  1 11  2  2  0]
 [ 0  0  1  5  4  0  1  3  1  1  0]
 [ 0  0  1  5  5  0  0  1  0  0  0]
 [ 0  1  2 20  9  0  1 25  4  1  0]
 [ 0  0  0  1  3  0  0  4 12  0  1]
 [ 0  0  0  0  0  0  1  1  3  1  0]
 [ 0  1  0  0  0  0  0  3  0  0  0]]


In [120]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold, cross_val_score, GridSearchCV

In [122]:
# gradient boost, random forest, AdaBoostClassifier를 위한 사이킷런 classifier 클래스 생성
gbc_clf = GradientBoostingClassifier(random_state=11)
rf_clf = RandomForestClassifier(random_state=11)
ada_clf = AdaBoostClassifier(random_state=11)

# GradientBoostingClassifier 학습,예측,정확도 출력
gbc_clf.fit(X_train, y_train)
gbc_pred = gbc_clf.predict(X_test)
print('GradientBoostingClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test, gbc_pred)))

# RandomForestClassifier 학습,예측,정확도 출력
rf_clf.fit(X_train, y_train)
rf_pred = rf_clf.predict(X_test)
print('RandomForestClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test, rf_pred)))

# AdaBoostClassifier 학습,예측,정확도 출력
ada_clf.fit(X_train, y_train)
ada_pred = ada_clf.predict(X_test)
print('AdaBoostClassifier 정확도: {0:.4f}'.format(accuracy_score(y_test, ada_pred)))

  return f(*args, **kwargs)


GradientBoostingClassifier 정확도: 0.4366


  rf_clf.fit(X_train, y_train)


RandomForestClassifier 정확도: 0.3859


  return f(*args, **kwargs)


AdaBoostClassifier 정확도: 0.3887


In [126]:
# kfold 사용
def exec_kfold(clf, folds=5):
    # 폴드 세트가 5개인 KFold 객체 생성, 폴드 수만큼 예측결과 저장을 위한 리스트 생성
    kfold = KFold(n_splits=folds)
    scores = []
    
    # kFold 교차 검증 수행
    for iter_count, (train_index, test_index) in enumerate(kfold.split(X_df)):
        # X_df, y_df 데이터에서 교차 검증별로 학습과 검증 데이터를 가리키는 index 생성
        X_train, X_test = X_df.values[train_index], X_df.values[test_index]
        y_train, y_test = y_df.values[train_index], y_df.values[test_index]
        # Classifier 학습, 예측, 정확도 계산
        clf.fit(X_train, y_train)
        predictions = clf.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        scores.append(accuracy)
        print("교차 검증 {0} 정확도:  {1:.4f}".format(iter_count, accuracy))
        
    # 5개 fold에서의 평균 정확도 계산.
    mean_score = np.mean(scores)
    print("평균 정확도: {0:.4f}".format(mean_score))
        
# exec_kfold 호출
exec_kfold(gbc_clf, folds=5)

교차 검증 0 정확도:  0.4405
교차 검증 1 정확도:  0.4513
교차 검증 2 정확도:  0.4400
교차 검증 3 정확도:  0.4478
교차 검증 4 정확도:  0.4441
평균 정확도: 0.4447


In [127]:
# cross_val_score 사용
scores = cross_val_score(gbc_clf, X_df, y_df, cv=5)
for iter_count, accuracy in enumerate(scores):
    print("교차 검증 {0} 정확도: {1:.4f}".format(iter_count, accuracy))
    
print("평균 정확도: {0:.4f}".format(np.mean(scores)))

교차 검증 0 정확도: 0.4392
교차 검증 1 정확도: 0.4493
교차 검증 2 정확도: 0.4454
교차 검증 3 정확도: 0.4479
교차 검증 4 정확도: 0.4421
평균 정확도: 0.4448


# GradientBoostingClassifier 파라미터 서치
params = {
    'n_estimators' : [100, 200, 500],
    'max_depth' : [6, 8, 10, 12],
    'min_samples_leaf' : [3, 5, 7, 10],
    'min_samples_split' : [2, 3, 5, 10],
    'learning_rate' : [0.05, 0.1, 0.2]
}
grid_cv = GridSearchCV(gbc_clf, param_grid=params, scoring="accuracy", n_jobs=-1, verbose=1)
grid_cv.fit(X_train, y_train)
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

# GridSearchCV를 이용해 최적으로 학습된 estimator로 예측 수행.
gbc_pred = grid_cv.best_estimator_.predict(X_test)
gbc_accuracy = accuracy_score(y_test, gbc_pred)
print("GradientBoostingClassifier GridSearch 정확도: {0:.4f}".format(gbc_accuracy))

# GradientBoostingClassifier GridSearchCV 모델을 base_esimator로 한 앙상블 모델 비교

gbc_bagging_clf = BaggingClassifier(base_estimator=grid_cv.best_estimator_, n_estimators=10, random_state=0)
gbc_bagging_clf.fit(X_train, y_train)
gbc_bagging_pred = gbc_bagging_clf.predict(X_test)
gbc_bagging_accuracy = accuracy_score(y_test, gbc_bagging_pred)
print("GradientBoostingClassifier Bagging 정확도: {0:.4f}".format(gbc_bagging_accuracy))

gbc_ada_clf = AdaBoostClassifier(base_estimator=grid_cv.best_estimator_, n_estimators=10, random_state=0)
gbc_ada_clf.fit(X_train, y_train)
gbc_ada_pred = gbc_ada_clf.predict(X_test)
gbc_ada_accuracy = accuracy_score(y_test, gbc_ada_pred)
print("GradientBoostingClassifier Adaboost 정확도: {0:.4f}".format(gbc_ada_accuracy))

# GradientBoostingClassifier와 AdaBoostClassifier 보팅(하드)
vo_hard_clf = VotingClassifier( estimators=[('GBC', grid_cv.best_estimator_), ('ADA', ada_clf)])

# VotingClassifier 학습, 예측, 정확도 계산
vo_hard_clf.fit(X_train, y_train)
hard_pred = vo_hard_clf.predict(X_test)
print('VotingClassifier Hard 정확도: {0:.4f}'.format(accuracy_score(y_test, hard_pred)))

# GradientBoostingClassifier와 AdaBoostClassifier 보팅(소프트)
vo_soft_clf = VotingClassifier( estimators=[('GBC', grid_cv.best_estimator_), ('ADA', ada_clf)], voting='soft')

# VotingClassifier 학습, 예측, 정확도 계산
vo_soft_clf.fit(X_train, y_train)
soft_pred = vo_soft_clf.predict(X_test)
print('VotingClassifier Soft 정확도: {0:.4f}'.format(accuracy_score(y_test, soft_pred)))

# 