[RandomForest 하이퍼 파라미터 조정(GridSearch)](https://injo.tistory.com/30)

[GridSearchCV](https://2-chae.github.io/category/1.ai/29)

# 1. GridSearchCV를 통한 랜덤포레스트의 하이퍼 파라미터 튜닝

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

# Candidate values for each random-forest hyperparameter. The grid search
# below tries every combination of these lists.
params = {
    'n_estimators': [10, 100],
    'max_depth': [6, 8, 10, 12],
    'min_samples_leaf': [8, 12, 18],
    'min_samples_split': [8, 16, 20],
}

# Base estimator with a fixed seed so repeated runs are comparable.
rf_clf = RandomForestClassifier(random_state=0, n_jobs=-1)

# Exhaustive search over `params` using 3-fold cross-validation.
grid_cv = GridSearchCV(estimator=rf_clf, param_grid=params, cv=3, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Report the selected parameter combination and its cross-validation score.
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)
print('최고 예측 정확도: {:.4f}'.format(grid_cv.best_score_))

# 2. 위의 결과로 나온 최적 하이퍼 파라미터로 다시 모델을 학습하여 테스트 세트 데이터에서 예측 성능을 측정

In [None]:
# Rebuild the forest from the hyperparameters the grid search actually
# selected (grid_cv.best_params_) rather than hand-copied literals, so this
# cell cannot drift out of sync when the parameter grid or the training data
# changes. The seed and parallelism match the estimator used in the search.
rf_clf1 = RandomForestClassifier(random_state=0,
                                 n_jobs=-1,
                                 **grid_cv.best_params_)
rf_clf1.fit(X_train, y_train)

# Measure accuracy on the test set (X_test / y_test), which the grid search
# was not fitted on in this notebook.
pred = rf_clf1.predict(X_test)
print('예측 정확도: {:.4f}'.format(accuracy_score(y_test, pred)))

# 3. Random Forest의 각 피처의 중요도 시각화 : `feature_importances_`

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Label each importance score from the fitted forest with the name of the
# training column it belongs to.
ftr_importances_values = rf_clf1.feature_importances_
ftr_importances = pd.Series(data=ftr_importances_values,
                            index=X_train.columns)

# Keep the 20 highest-scoring features, largest first.
ftr_top20 = ftr_importances.sort_values(ascending=False).head(20)

# Horizontal bar chart: importance on the x-axis, feature name on the y-axis.
plt.figure(figsize=(8, 6))
plt.title('Top 20 Feature Importances')
sns.barplot(x=ftr_top20, y=ftr_top20.index)
plt.show()

# 4. Confusion Matrix

In [None]:
# Imported here because the notebook never imported these names.
# plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix

# Confusion matrix of the tuned forest on the test set, using the objects the
# earlier cells create (y_test, pred). normalize='true' scales each true-class
# row so that it sums to 1.
conf_mat = confusion_matrix(y_test, pred, normalize='true')

# Plot the same normalised matrix straight from the fitted classifier; the
# tick labels are the class labels the classifier learned.
disp = ConfusionMatrixDisplay.from_estimator(
    rf_clf1, X_test, y_test,
    display_labels=rf_clf1.classes_, cmap=plt.cm.Blues, normalize='true')