# __1. AdaBoost code 작성__

### Q. Dataset 폴더에 있는 iris.csv 파일을 이용하여 50개의 모델을 사용하는 AdaBoost 분류기를 생성하고 모델을 평가하세요

In [1]:
# packages 불러오기 
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
import pandas as pd

In [2]:
# Load the iris data set
filename = '../../dataset/iris.csv'

# The file is read without a header row (the first record is already data, and
# the column names are assigned by hand just below). `header=None` is the
# supported way to say so; the earlier `header=-1` spelling is rejected with a
# ValueError by current pandas releases.
dataframe = pd.read_csv(filename, header=None)

# sepal: the leaf-like part beneath the flower (calyx)
# petal: the flower petal
dataframe.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'class_label']

# Preview the first rows to confirm the columns were parsed as intended
dataframe.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,class_label
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [3]:
# Work on the raw value array so the columns can be sliced by position
array = dataframe.values
# Every column but the last holds the independent variables;
# the last column is the dependent variable, cast to integer labels
X, y = array[:, :-1], array[:, -1].astype(int)

In [4]:
# Split into training and test sets: 20% of the rows are held out,
# and random_state is pinned so the same split is produced on every run
split_parts = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_test, y_train, y_test = split_parts

In [5]:
# Create the 50-estimator AdaBoost model and fit it on the training set
# in one step (`fit` returns the fitted estimator itself)
abc = AdaBoostClassifier(n_estimators=50).fit(X_train, y_train)

# Evaluate on the test set: predict a label for every held-out row
y_pred = abc.predict(X_test)

In [6]:
# Model accuracy: the fraction of test rows whose predicted label matches the true one
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9666666666666667


- 변수 중요도 산출

In [7]:
# Display the fitted AdaBoost model's importance score for each input column,
# in the same order as the columns of X
abc.feature_importances_

array([0.  , 0.  , 0.42, 0.58])

In [8]:
# Attach the feature names (every column except the class label) to the
# importance scores, then list them from most to least important
feature_names = dataframe.columns[:-1]
var_df = pd.Series(abc.feature_importances_, index=feature_names)
var_df.sort_values(ascending=False)

petal_width     0.58
petal_length    0.42
sepal_width     0.00
sepal_length    0.00
dtype: float64

---

# __2. GBM code 작성__

### Q. Dataset 폴더에 있는 iris.csv 파일을 이용하여 50개의 모델을 사용하는 GBM을 임의의 네 개의 learning rate에 대하여 성능을 평가하세요

In [9]:
# 패키지 불러오기
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd

In [10]:
# Split the data set into training and test parts (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [11]:
# Candidate learning rates: the reciprocals of 10^1 .. 10^4
exponents = np.arange(1, 5)
learning_rates = 1 / np.power(10, exponents)
learning_rates

array([0.1   , 0.01  , 0.001 , 0.0001])

In [12]:
# Fit one 50-estimator GBM per candidate learning rate and report its
# accuracy on the training set and on the held-out set
for learning_rate in learning_rates:
    gb = GradientBoostingClassifier(
        n_estimators=50,
        learning_rate=learning_rate,
        random_state=1,
    ).fit(X_train, y_train)  # `fit` returns the fitted estimator
    train_accuracy = gb.score(X_train, y_train)
    validation_accuracy = gb.score(X_test, y_test)
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(train_accuracy))
    print("Accuracy score (validation): {0:.3f}".format(validation_accuracy))
    print()

Learning rate:  0.1
Accuracy score (training): 1.000
Accuracy score (validation): 0.967

Learning rate:  0.01
Accuracy score (training): 0.992
Accuracy score (validation): 0.967

Learning rate:  0.001
Accuracy score (training): 0.983
Accuracy score (validation): 0.967

Learning rate:  0.0001
Accuracy score (training): 0.367
Accuracy score (validation): 0.200



### Q. 임의의 learning rate 하나를 사용하여 confusion matrix와 classification report를 구하세요

In [13]:
# Build the confusion matrix for a single GBM (learning rate 0.1)

# Fit the model; `fit` returns the fitted estimator
gb = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    random_state=0,
).fit(X_train, y_train)
# Predicted labels for the test set
predictions = gb.predict(X_test)

print("Confusion Matrix:")
matrix = confusion_matrix(y_test, predictions)
print(matrix)

Confusion Matrix:
[[11  0  0]
 [ 0 12  1]
 [ 0  0  6]]


In [20]:
print("Classification Report")
# Dict form of the report, kept so the averages can be re-derived by hand afterwards
output = classification_report(y_test, predictions, output_dict=True)
# Text form of the same report, for display
report_text = classification_report(y_test, predictions)
print(report_text)

Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        11
           1       1.00      0.92      0.96        13
           2       0.86      1.00      0.92         6

   micro avg       0.97      0.97      0.97        30
   macro avg       0.95      0.97      0.96        30
weighted avg       0.97      0.97      0.97        30



In [23]:
# micro avg: correctly classified samples (the confusion-matrix diagonal, 11+12+6)
# divided by all samples (the diagonal plus the six off-diagonal counts)
(11+12+6)/(11+12+6+0+0+1+0+0+0)

0.9666666666666667

In [28]:
# macro avg: the plain (unweighted) mean of the three per-class f1-scores
f1_class0 = output['0']['f1-score']
f1_class1 = output['1']['f1-score']
f1_class2 = output['2']['f1-score']
(f1_class0 + f1_class1 + f1_class2) / 3

0.9610256410256409

In [30]:
# weighted avg: each class's f1-score weighted by its share of the 30 test
# samples (supports 11, 13 and 6 in the classification report)
weighted_f1_class0 = 11/30 * output['0']['f1-score']
weighted_f1_class1 = 13/30 * output['1']['f1-score']
weighted_f1_class2 = 6/30 * output['2']['f1-score']
weighted_f1_class0 + weighted_f1_class1 + weighted_f1_class2

0.9672820512820512