# __1. Bagging code 작성__

In [1]:
# package 불러오기
import pandas as pd
import numpy as np
from sklearn import model_selection # cross-validation score를 가져오기 위함
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier # bagging
from sklearn.tree import DecisionTreeClassifier # 의사 결정 나무
from collections import Counter # count
from sklearn.metrics import accuracy_score

- sepal: 꽃받침
- petal: 꽃


In [5]:
filename = '../../dataset/iris.csv'

dataframe = pd.read_csv(filename, header= -1)
dataframe.columns = ['sepal_length', 'sepal_width', 'petal_length','petal_width', 'class_label']
dataframe.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class_label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [6]:
array = dataframe.values # 손 쉬운 indexing을 위하여 array로 변형

In [7]:
X = array[:,0:4].astype(float)  # 0 - 4 column은 독립변수
Y = array[:,4].astype(int) # 마지막 column은 종속변수

print('X:',X[:4])
print('y:',Y[:4])

X: [[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]]
y: [0 0 0 0]


In [8]:
train_x, test_x, train_y, test_y = train_test_split(X, Y, test_size=0.3, random_state=0)
print('Number of train set:', len(train_x))
print('Number of test set:', len(test_x))

Number of train set: 105
Number of test set: 45


In [9]:
assert len(train_x) == len(train_y)
assert len(test_x) == len(test_y)

In [10]:
# hyperparameters
seed = 1
k = 5
num_trees = np.power(2, range(9))
num_trees

array([  1,   2,   4,   8,  16,  32,  64, 128, 256])

In [11]:
kfold = model_selection.KFold(n_splits=k, random_state=seed)
kfold

KFold(n_splits=5, random_state=1, shuffle=False)

In [12]:
perf = {}

for n_tree in num_trees:
    # tree 생성
    DT = DecisionTreeClassifier()

    # bagging 모델 생성
    bag_model = BaggingClassifier(base_estimator=DT, n_estimators=n_tree, random_state=seed, max_samples=0.5)
    results = model_selection.cross_val_score(bag_model, train_x, train_y, scoring='accuracy', cv=kfold)
    print('-'*80)
    print("Trees : ", n_tree)
    print("Each k-fold perf : ", results)
    print("Mean Accuracy : {:.4f}".format(results.mean()))
    
    perf[n_tree] = results.mean()

--------------------------------------------------------------------------------
Trees :  1
Each k-fold perf :  [0.95238095 0.9047619  1.         0.95238095 0.9047619 ]
Mean Accuracy : 0.9429
--------------------------------------------------------------------------------
Trees :  2
Each k-fold perf :  [0.85714286 1.         0.95238095 0.95238095 0.9047619 ]
Mean Accuracy : 0.9333
--------------------------------------------------------------------------------
Trees :  4
Each k-fold perf :  [0.85714286 1.         0.95238095 0.95238095 0.9047619 ]
Mean Accuracy : 0.9333
--------------------------------------------------------------------------------
Trees :  8
Each k-fold perf :  [0.85714286 1.         0.95238095 1.         0.85714286]
Mean Accuracy : 0.9333
--------------------------------------------------------------------------------
Trees :  16
Each k-fold perf :  [0.85714286 1.         0.95238095 1.         0.85714286]
Mean Accuracy : 0.9333
---------------------------------------

In [13]:
best_n_tree = max(perf, key=lambda x: perf[x])
best_n_tree

1

- 최적의 파라미터를 찾은 후 모델 결정

In [14]:
DT = DecisionTreeClassifier()
best_bag_model = BaggingClassifier(base_estimator=DT, n_estimators=best_n_tree, random_state=seed, max_samples=0.5)

In [15]:
best_bag_model.fit(train_x,train_y)

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.5, n_estimators=1, n_jobs=None, oob_score=False,
         random_state=1, verbose=0, warm_start=False)

In [16]:
test_pred_y = best_bag_model.predict(test_x)
test_pred_y

array([2, 2, 0, 2, 0, 2, 0, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 2, 0, 2, 1, 0, 2, 2, 1, 0, 2, 1, 1, 2, 0, 2, 0,
       0])

In [17]:
accuracy_score(y_true= test_y, y_pred= test_pred_y)

0.9333333333333333

- 변수중요도
    - 모델이름.feature_importances_

In [18]:
def get_variable_importance(model):
    return np.mean([tree.feature_importances_ for tree in best_bag_model.estimators_], axis =0)

var_df = pd.Series(get_variable_importance(best_bag_model), index = dataframe.columns[:-1])

var_df.sort_values(ascending=False)

petal_length    0.854035
sepal_width     0.089825
petal_width     0.056140
sepal_length    0.000000
dtype: float64

# __2. Random Forest code 작성__

In [19]:
# sklearn으로 random forest 만들기
# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

import pandas as pd
from sklearn import model_selection
from sklearn.ensemble import RandomForestClassifier

In [29]:
perf = {}

for n_tree in num_trees:

    # Randomforest 모델 생성
    rf_model = RandomForestClassifier(n_estimators=n_tree, random_state=seed)
    results = model_selection.cross_val_score(rf_model, train_x, train_y, scoring='accuracy', cv=kfold)
    print('-'*80)
    print("Trees : ", n_tree)
    print("Each k-fold perf : ", results)
    print("Mean Accuracy : {:.4f}".format(results.mean()))
    
    perf[n_tree] = results.mean()

--------------------------------------------------------------------------------
Trees :  1
Each k-fold perf :  [0.85714286 0.85714286 0.95238095 0.85714286 0.9047619 ]
Mean Accuracy : 0.8857
--------------------------------------------------------------------------------
Trees :  2
Each k-fold perf :  [0.9047619  1.         0.9047619  0.95238095 0.95238095]
Mean Accuracy : 0.9429
--------------------------------------------------------------------------------
Trees :  4
Each k-fold perf :  [0.80952381 1.         1.         0.95238095 0.9047619 ]
Mean Accuracy : 0.9333
--------------------------------------------------------------------------------
Trees :  8
Each k-fold perf :  [0.80952381 1.         1.         0.95238095 0.9047619 ]
Mean Accuracy : 0.9333
--------------------------------------------------------------------------------
Trees :  16
Each k-fold perf :  [0.80952381 1.         1.         0.95238095 0.85714286]
Mean Accuracy : 0.9238
---------------------------------------

In [21]:
best_n_tree = max(perf, key=lambda x: perf[x])
best_n_tree

2

- 최적의 파라미터를 찾은 후 모델 결정

In [22]:
best_rf_model = RandomForestClassifier(n_estimators=best_n_tree, random_state=seed)

In [23]:
best_rf_model.fit(train_x,train_y)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=2, n_jobs=None,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [24]:
test_pred_y = best_rf_model.predict(test_x)
test_pred_y

array([2, 1, 0, 2, 0, 2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 2, 1,
       0, 0, 2, 0, 0, 1, 1, 0, 2, 1, 0, 2, 2, 1, 0, 1, 1, 1, 2, 0, 2, 0,
       0])

In [25]:
accuracy_score(y_true= test_y, y_pred= test_pred_y)

0.9777777777777777

- 변수중요도
    - 모델이름.feature_importances_

In [26]:
best_rf_model.feature_importances_

array([0.23930769, 0.1114275 , 0.39871625, 0.25054856])

In [27]:
var_df = pd.Series(best_rf_model.feature_importances_, index = dataframe.columns[:-1])
var_df.sort_values(ascending=False)

petal_length    0.398716
petal_width     0.250549
sepal_length    0.239308
sepal_width     0.111428
dtype: float64