# Day_044

### 練習時間

1. 試著調整 RandomForestClassifier(...) 中的參數，並觀察是否會改變結果？
2. 改用其他資料集 (boston, wine)，並與回歸模型與決策樹的結果進行比較

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn import datasets
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score

In [2]:
# Load scikit-learn's bundled wine dataset and wrap its feature matrix in a
# DataFrame whose columns are labelled with the dataset's feature names.
wine = datasets.load_wine()
df_wine = pd.DataFrame(data=wine.data, columns=wine.feature_names)

# Print the frame's shape, then show the first five rows as the cell output.
print(df_wine.shape)
df_wine.head(5)

(178, 13)


Unnamed: 0,alcohol,malic_acid,ash,alcalinity_of_ash,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanins,color_intensity,hue,od280/od315_of_diluted_wines,proline
0,14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0
1,13.2,1.78,2.14,11.2,100.0,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050.0
2,13.16,2.36,2.67,18.6,101.0,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185.0
3,14.37,1.95,2.5,16.8,113.0,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480.0
4,13.24,2.59,2.87,21.0,118.0,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735.0


In [3]:
# Class balance check: number of samples per target label, plus the distinct labels.
target_counts = pd.Series(wine.target).value_counts()
target_labels = np.unique(wine.target)

print(target_counts)
print("")
print(f"Unique values in wine.target = {target_labels}")

1    71
0    59
2    48
dtype: int64

Unique values in wine.target = [0 1 2]


#### RandomForestClassifier 內置參數預設
`RandomForestClassifier(n_estimators='warn', criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, min_impurity_split=None, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None)`

（以上為 scikit-learn 0.20／0.21 版的預設值；較新版本已有變動，例如 `n_estimators` 預設改為 100、`max_features` 預設改為 `'sqrt'`，且 `min_impurity_split` 已被移除，請以下方官方文件為準。）

[參考連結](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

In [4]:
# Split into train/test sets: 25% of the samples are held out for testing,
# and the fixed seed keeps the split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(wine.data, wine.target, test_size = 0.25, random_state = 4)

# Build the baseline model with default hyper-parameters.
# A random forest is stochastic, so the seed is pinned (same value as the split
# and the tuned model further down); without it the accuracy and the feature
# importances shown in the next cell change on every Restart & Run All.
rf = RandomForestClassifier(random_state = 4)

# Train the model on the training split.
rf.fit(x_train, y_train)

# Predict labels for the held-out test split.
y_pred = rf.predict(x_test)

# Report the fraction of test samples classified correctly.
print("Acuuracy: ", accuracy_score(y_test, y_pred))

Acuuracy:  1.0


In [5]:
# Tabulate the fitted forest's importance score for every wine feature.
# The frame is built from a dict of columns so the importances stay float64;
# stacking them beside the name strings with np.hstack coerced every score to
# a string, which breaks numeric sorting, rounding and plotting.
pd.DataFrame({'Wine_feature_names': wine.feature_names, 'Feature importance': rf.feature_importances_})

Unnamed: 0,Wine_feature_names,Feature importance
0,alcohol,0.3001626885496361
1,malic_acid,0.0126040747023967
2,ash,0.0044628141803176
3,alcalinity_of_ash,0.0339206944184519
4,magnesium,0.0460291329618679
5,total_phenols,0.0292740878006065
6,flavanoids,0.1156358591805507
7,nonflavanoid_phenols,0.022143878207057
8,proanthocyanins,0.0294630319171767
9,color_intensity,0.0471926786198086


In [6]:
# Second experiment: a 50-tree forest with at least 15 samples per leaf,
# trained on all CPU cores (n_jobs = -1) with a fixed seed.
# max_features uses "sqrt" instead of "auto": for classifiers the two are the
# same setting, but "auto" is deprecated in scikit-learn 1.1 and rejected from
# 1.3 onward, so "sqrt" keeps the cell runnable on old and new versions alike.
# NOTE(review): oob_score = True requests the out-of-bag estimate, but nothing
# in the visible cells reads it afterwards.
rf = RandomForestClassifier(n_estimators = 50, oob_score = True, n_jobs = -1, max_features = "sqrt", min_samples_leaf = 15, random_state = 4)

# Fit on the training split, predict the test split, and report accuracy.
rf.fit(x_train, y_train)
y_pred = rf.predict(x_test)
print("Acuuracy: ", accuracy_score(y_test, y_pred))

Acuuracy:  0.977777777778


In [7]:
# Tabulate the re-fitted forest's importance score for every wine feature.
# The frame is built from a dict of columns so the importances stay float64;
# stacking them beside the name strings with np.hstack coerced every score to
# a string, which breaks numeric sorting, rounding and plotting.
pd.DataFrame({'Wine_feature_names': wine.feature_names, 'Feature importance': rf.feature_importances_})

Unnamed: 0,Wine_feature_names,Feature importance
0,alcohol,0.1088130865851497
1,malic_acid,0.0054674699149984
2,ash,0.0134448222540351
3,alcalinity_of_ash,0.0394243934627089
4,magnesium,0.0129209445300042
5,total_phenols,0.0845510674151441
6,flavanoids,0.1479227959117544
7,nonflavanoid_phenols,0.0
8,proanthocyanins,0.0130600731366271
9,color_intensity,0.1479386223601565
