<a href="https://colab.research.google.com/github/Bingyy/MachineLearning/blob/master/%E9%9B%86%E6%88%90%E7%AE%97%E6%B3%95.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Bagged Decision Tree**

Bagging算法在数据有很大的方差时很有效，最常见的是决策树的Bagging算法。

In [0]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [0]:
# 导入数据
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg','plas','pres','skin','test','mass','pedi','age','class']
data = read_csv(filename, names=names)

In [3]:
data.shape

(768, 9)

In [0]:
# 数据划分
array = data.values
X = array[:, 0:8]
y = array[:, 8]

# 构建模型
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
cart = DecisionTreeClassifier()
num_tree = 100
model = BaggingClassifier(base_estimator=cart, n_estimators=num_tree, random_state=seed)

result = cross_val_score(model, X, y, cv=kfold)

In [5]:
print(result.mean())

0.770745044429255


### 随机森林

用随机的方式建立一个森林，森林由多棵决策树组成，每个决策树之间没有关联。新的输入进入到随机森林，会让每个决策树分别判断，看样本属于哪一类，最后看哪类被选择的最多，就预测这个样本为那个类。

In [0]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

In [0]:
# 导入数据
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg','plas','pres','skin','test','mass','pedi','age','class']
data = read_csv(filename, names=names)

In [0]:
# 数据划分
array = data.values
X = array[:, 0:8]
y = array[:, 8]

# 构建模型
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
num_tree = 100
max_features = 3
model = RandomForestClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)

result = cross_val_score(model, X, y, cv=kfold)

In [10]:
print(result.mean())

0.7733766233766234


### 极端随机树
与随机森林类似，都是由很多决策树组成，但是有两个重要区别：

- 随机森林是Bagging模型，ExtraTreesClassifier择时使用所有的训练样本得到每个决策树
- 随机森林是在一个随机子集内得到最优的分叉特征属性，ExtraTreesClassifier则是完全随机选择分叉特征属性

In [0]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import ExtraTreesClassifier

In [0]:
# 导入数据
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg','plas','pres','skin','test','mass','pedi','age','class']
data = read_csv(filename, names=names)

In [0]:
# 数据划分
array = data.values
X = array[:, 0:8]
y = array[:, 8]

# 构建模型
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
num_tree = 100
max_features = 7

model = ExtraTreesClassifier(n_estimators=num_tree, random_state=seed, max_features=max_features)

result = cross_val_score(model, X, y, cv=kfold)

In [14]:
print(result.mean())

0.762987012987013


### 提升算法

Boosting算法是用来提高弱分类算法准确度的方法。先构造一个预测函数序列，然后将他们组合成为一个预测函数。

In [0]:
### AdaBoost
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier

In [0]:
# 导入数据
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg','plas','pres','skin','test','mass','pedi','age','class']
data = read_csv(filename, names=names)



In [0]:
# 数据划分
array = data.values
X = array[:, 0:8]
y = array[:, 8]

# 构建模型
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
num_tree = 100

model = AdaBoostClassifier(n_estimators=num_tree, random_state=seed)

result = cross_val_score(model, X, y, cv=kfold)

In [20]:
print(result.mean())

0.7421565276828435


In [0]:
### 随机梯度提升算法

from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier


In [0]:
# 导入数据
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg','plas','pres','skin','test','mass','pedi','age','class']
data = read_csv(filename, names=names)

# 数据划分
array = data.values
X = array[:, 0:8]
y = array[:, 8]

# 构建模型
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
num_tree = 100

model = AdaBoostClassifier(n_estimators=num_tree, random_state=seed)

result = cross_val_score(model, X, y, cv=kfold)

In [24]:
print(result.mean())

0.7421565276828435


### 投票算法

将多个机器学习模型集成集成起来的算法。通过创建两个或两个以上的算法模型，用投票算法将这些算法包裹起来。

In [0]:
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [0]:
# 导入数据
filename = 'http://ftp.ics.uci.edu/pub/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data'
names = ['preg','plas','pres','skin','test','mass','pedi','age','class']
data = read_csv(filename, names=names)

# 数据划分
array = data.values
X = array[:, 0:8]
y = array[:, 8]

# 构建模型
num_folds = 10
seed = 7
kfold = KFold(n_splits=num_folds, random_state=seed)
cart = DecisionTreeClassifier()

models = {}
models['logistic'] = LogisticRegression()
models['cart'] = DecisionTreeClassifier()
models['svm'] = SVC()

ensemble_model = VotingClassifier(estimators=models)

result = cross_val_score(model, X, y, cv=kfold)

In [28]:
print(result.mean())

0.7421565276828435
