# __Package를 사용하여 bagging 코드 작성__

In [1]:
# sklearn으로 bagging 만들기
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html

import pandas as pd
import numpy as np
from sklearn import model_selection # cross-validation score를 가져오기 위함
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier # bagging
from sklearn.tree import DecisionTreeClassifier # 의사 결정 나무
from collections import Counter # count
from sklearn.metrics import f1_score

- 변수설명
    - preg: Number of times pregnant

    - plas: Plasma glucose concentration at 2 hours in an oral glucose tolerance test

    - pres: Diastolic blood pressure (mm Hg)

    - skin: Triceps skin fold thickness (mm)

    - test: 2-Hour serum insulin (mu U/ml)

    - mass: Body mass index (weight in kg/(height in m)^2)

    - pedi: Diabetes pedigree function

    - age: Age (years)

    - class = (1:tested positive for diabetes, 0: tested negative for diabetes)

In [2]:
# Location of the Pima Indians diabetes CSV, relative to this notebook.
filename = '../dataset/pima-indians-diabetes.data.csv'

# Column labels for the nine fields, in file order; the last one is the target.
column_names = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
                'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class']

# The raw file has no header row. `header=None` is the supported way to say so;
# the previous `header=-1` is rejected with a ValueError by current pandas.
# Passing `names=` labels the columns at read time instead of mutating the
# frame afterwards.
dataframe = pd.read_csv(filename, header=None, names=column_names)
dataframe.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Class
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
array = dataframe.values # convert to a NumPy array so columns can be sliced by position
array

array([[  6.   , 148.   ,  72.   , ...,   0.627,  50.   ,   1.   ],
       [  1.   ,  85.   ,  66.   , ...,   0.351,  31.   ,   0.   ],
       [  8.   , 183.   ,  64.   , ...,   0.672,  32.   ,   1.   ],
       ...,
       [  5.   , 121.   ,  72.   , ...,   0.245,  30.   ,   0.   ],
       [  1.   , 126.   ,  60.   , ...,   0.349,  47.   ,   1.   ],
       [  1.   ,  93.   ,  70.   , ...,   0.315,  23.   ,   0.   ]])

In [4]:
# Columns 0-7 are the independent variables; column 8 is the dependent variable.
n_features = 8
X = array[:, :n_features].astype(float)
Y = array[:, n_features].astype(int)

# Preview the first five rows of each.
print('X:', X[:5])
print('y:', Y[:5])

X: [[6.000e+00 1.480e+02 7.200e+01 3.500e+01 0.000e+00 3.360e+01 6.270e-01
  5.000e+01]
 [1.000e+00 8.500e+01 6.600e+01 2.900e+01 0.000e+00 2.660e+01 3.510e-01
  3.100e+01]
 [8.000e+00 1.830e+02 6.400e+01 0.000e+00 0.000e+00 2.330e+01 6.720e-01
  3.200e+01]
 [1.000e+00 8.900e+01 6.600e+01 2.300e+01 9.400e+01 2.810e+01 1.670e-01
  2.100e+01]
 [0.000e+00 1.370e+02 4.000e+01 3.500e+01 1.680e+02 4.310e+01 2.288e+00
  3.300e+01]]
y: [1 0 1 0 1]


In [5]:
# Hold out 30% of the rows as the test set; the fixed random_state keeps the
# split reproducible across runs.
train_x, test_x, train_y, test_y = train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=0,
)
print('Number of train set:', train_x.shape[0])
print('Number of test set:', test_x.shape[0])

Number of train set: 537
Number of test set: 231


In [6]:
# Sanity check: features and labels must have the same number of rows in both
# the training and the test partition.
assert (len(train_x), len(test_x)) == (len(train_y), len(test_y))

In [7]:
# Hyperparameters
seed = 1          # random seed handed to the estimators in later cells
k = 5             # number of cross-validation folds
max_exponent = 8  # largest power of two to try as an ensemble size

# Candidate ensemble sizes: 2**0, 2**1, ..., 2**max_exponent.
num_trees = np.power(2, range(max_exponent + 1))
num_trees

array([  1,   2,   4,   8,  16,  32,  64, 128, 256])

In [8]:
# K-fold splitter reused by every cross-validation run in this notebook.
# `random_state` is intentionally not passed: it only has an effect when
# shuffle=True, and recent scikit-learn releases raise ValueError if it is set
# while shuffle is left at its default of False. The folds themselves are
# unchanged (contiguous, unshuffled blocks of the training rows).
kfold = model_selection.KFold(n_splits=k)
kfold

KFold(n_splits=5, random_state=1, shuffle=False)

In [9]:
# Maps each candidate ensemble size to its mean cross-validated F1 score.
perf = {}

for n_tree in num_trees:
    # Base learner: a decision tree with default settings.
    DT = DecisionTreeClassifier()

    # Bagging ensemble of `n_tree` trees, each fit on a bootstrap sample half
    # the size of the training set.
    # The base learner is passed positionally because its keyword was renamed
    # from `base_estimator` to `estimator` in newer scikit-learn releases (and
    # the old keyword was later removed); the first positional slot works with
    # both spellings.
    bag_model = BaggingClassifier(DT, n_estimators=n_tree, random_state=seed, max_samples=0.5)
    results = model_selection.cross_val_score(bag_model, train_x, train_y, scoring='f1', cv=kfold)
    mean_f1 = results.mean()

    print('-'*80)
    print("Trees : ", n_tree)
    print("Each k-fold perf : ", results)
    # scoring='f1', so the summary is a mean F1 score, not an accuracy.
    print("Mean F1 : {:.4f}".format(mean_f1))

    perf[n_tree] = mean_f1

--------------------------------------------------------------------------------
Trees :  1
Each k-fold perf :  [0.51282051 0.73913043 0.56097561 0.52777778 0.58823529]
Mean Accuracy : 0.5858
--------------------------------------------------------------------------------
Trees :  2
Each k-fold perf :  [0.55737705 0.63888889 0.5483871  0.40677966 0.56140351]
Mean Accuracy : 0.5426
--------------------------------------------------------------------------------
Trees :  4
Each k-fold perf :  [0.54545455 0.75       0.59375    0.44444444 0.52631579]
Mean Accuracy : 0.5720
--------------------------------------------------------------------------------
Trees :  8
Each k-fold perf :  [0.50793651 0.70731707 0.67647059 0.49315068 0.5862069 ]
Mean Accuracy : 0.5942
--------------------------------------------------------------------------------
Trees :  16
Each k-fold perf :  [0.53968254 0.73417722 0.68493151 0.54794521 0.59016393]
Mean Accuracy : 0.6194
---------------------------------------

In [10]:
# Select the ensemble size with the highest mean cross-validation score.
best_n_tree = max(perf, key=perf.get)
best_n_tree

64

- 최적의 파라미터를 찾은 후 모델 결정

In [11]:
# Rebuild the bagging model with the selected number of trees.
DT = DecisionTreeClassifier()
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# first positional slot is accepted under both names.
best_bag_model = BaggingClassifier(DT, n_estimators=best_n_tree, random_state=seed, max_samples=0.5)

In [12]:
# Train the selected bagging model on the full training partition.
best_bag_model.fit(X=train_x, y=train_y)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.5, n_estimators=64, n_jobs=None, oob_score=False,
         random_state=1, verbose=0, warm_start=False)

In [13]:
# Predict class labels for the held-out test rows.
test_pred_y = best_bag_model.predict(X=test_x)
test_pred_y

array([1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1])

In [14]:
# F1 score of the bagging predictions on the held-out test set.
f1_score(test_y, test_pred_y)

0.6119402985074627

- 변수중요도
    - 모델이름.feature_importances_ (BaggingClassifier에는 이 속성이 없으므로 `estimators_`에 들어 있는 각 트리의 `feature_importances_`를 평균하여 계산)

In [15]:
def get_variable_importance(model):
    """Average the per-estimator feature importances of a fitted ensemble.

    Parameters
    ----------
    model : object
        Fitted ensemble exposing ``estimators_``, an iterable of fitted
        estimators that each provide ``feature_importances_``.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the members' importance vectors.
    """
    # Read from the `model` argument rather than the notebook-level
    # `best_bag_model`, so the helper reports on whichever ensemble is passed.
    return np.mean([tree.feature_importances_ for tree in model.estimators_], axis=0)

# Label each averaged importance with its feature name: every column of the
# frame except the trailing 'Class' target.
var_df = pd.Series(
    data=get_variable_importance(best_bag_model),
    index=dataframe.columns[:-1],
)

# Most important features first.
var_df.sort_values(ascending=False)

Glucose                     0.336738
BMI                         0.163339
DiabetesPedigreeFunction    0.130176
Age                         0.122654
BloodPressure               0.082784
Pregnancies                 0.061645
Insulin                     0.053006
SkinThickness               0.049658
dtype: float64

In [16]:
# Display the column labels; all but the last are used as feature names above.
dataframe.columns

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Class'],
      dtype='object')

# __Package를 사용하여 random forest 코드 작성__

In [17]:
# sklearn으로 random forest 만들기
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

In [18]:
# Maps each candidate forest size to its mean cross-validated F1 score.
perf = {}

for n_tree in num_trees:

    # Random forest with `n_tree` trees and a fixed seed for reproducibility.
    rf_model = RandomForestClassifier(n_estimators=n_tree, random_state=seed)
    results = model_selection.cross_val_score(rf_model, train_x, train_y, scoring='f1', cv=kfold)
    mean_f1 = results.mean()

    print('-'*80)
    print("Trees : ", n_tree)
    print("Each k-fold perf : ", results)
    # scoring='f1', so the summary is a mean F1 score, not an accuracy.
    print("Mean F1 : {:.4f}".format(mean_f1))

    perf[n_tree] = mean_f1

--------------------------------------------------------------------------------
Trees :  1
Each k-fold perf :  [0.52380952 0.68181818 0.50549451 0.58666667 0.48484848]
Mean Accuracy : 0.5565
--------------------------------------------------------------------------------
Trees :  2
Each k-fold perf :  [0.47058824 0.58974359 0.51612903 0.5        0.45283019]
Mean Accuracy : 0.5059
--------------------------------------------------------------------------------
Trees :  4
Each k-fold perf :  [0.5483871  0.69767442 0.57971014 0.48275862 0.52631579]
Mean Accuracy : 0.5670
--------------------------------------------------------------------------------
Trees :  8
Each k-fold perf :  [0.56716418 0.70588235 0.56338028 0.47058824 0.43636364]
Mean Accuracy : 0.5487
--------------------------------------------------------------------------------
Trees :  16
Each k-fold perf :  [0.6        0.6744186  0.62857143 0.52941176 0.45614035]
Mean Accuracy : 0.5777
---------------------------------------

In [19]:
# Select the forest size with the highest mean cross-validation score.
best_n_tree = max(perf, key=perf.get)
best_n_tree

256

- 최적의 파라미터를 찾은 후 모델 결정

In [20]:
# Rebuild the random forest with the selected number of trees.
best_rf_model = RandomForestClassifier(
    n_estimators=best_n_tree,
    random_state=seed,
)

In [21]:
# Train the selected random forest on the full training partition.
best_rf_model.fit(X=train_x, y=train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=256, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [22]:
# Predict class labels for the held-out test rows.
test_pred_y = best_rf_model.predict(X=test_x)
test_pred_y

array([1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0,
       1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1])

In [23]:
# F1 score of the random forest predictions on the held-out test set.
f1_score(test_y, test_pred_y)

0.5909090909090909

- 변수중요도
    - 모델이름.feature_importances_

In [24]:
# Display the fitted forest's feature importances, one value per input column.
best_rf_model.feature_importances_

array([0.08073718, 0.26055887, 0.08316885, 0.06775128, 0.07522758,
       0.16275789, 0.12784155, 0.14195681])

In [25]:
# Label each importance with its feature name: every column of the frame
# except the trailing 'Class' target, then list the largest first.
var_df = pd.Series(
    data=best_rf_model.feature_importances_,
    index=dataframe.columns[:-1],
)
var_df.sort_values(ascending=False)

Glucose                     0.260559
BMI                         0.162758
Age                         0.141957
DiabetesPedigreeFunction    0.127842
BloodPressure               0.083169
Pregnancies                 0.080737
Insulin                     0.075228
SkinThickness               0.067751
dtype: float64