# 投票器集成--一个训练集多个分类器

1、硬投票：每个分类器各投一票，得票最多的类别作为结果返回  
2、软投票：将各分类器输出的类别概率按分类器权重加权后取平均，平均概率最大的类别作为结果返回

In [1]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

# Generate a noisy two-class "moons" toy dataset; the fixed seed keeps the
# notebook reproducible.
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)

# Split into train/test sets. No test_size is passed, so the library default
# proportion applies (the recorded outputs below are consistent with
# 375 training and 125 test samples).
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [3]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three different kinds of base classifier, all seeded with the same
# random_state so that repeated runs give the same models.
log_clf = LogisticRegression(solver="liblinear", random_state=42)
rnd_clf = RandomForestClassifier(n_estimators=10, random_state=42)
svm_clf = SVC(gamma="auto", random_state=42)

In [4]:
# Hard-voting ensemble over the three base classifiers: the class predicted
# by the most classifiers wins.
voting_clf_hard = VotingClassifier(
    estimators=[
        ("lr", log_clf),
        ("rf", rnd_clf),
        ("svc", svm_clf),
    ],
    voting="hard",
)


In [30]:
from sklearn.metrics import accuracy_score

# Fit each base classifier and the hard-voting ensemble on the training set,
# then print "<class name> <test accuracy>" for side-by-side comparison.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf_hard):
    # fit() returns the fitted estimator itself, so predict() can be chained.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.872
SVC 0.888
VotingClassifier 0.896


In [31]:
# Soft voting averages predicted class probabilities, so the SVC is rebuilt
# with probability=True to make it provide probability estimates.
svm_clf = SVC(gamma="auto", random_state=42, probability=True)

# Soft-voting ensemble over the same three kinds of base classifier.
voting_clf_soft = VotingClassifier(
    estimators=[
        ("lr", log_clf),
        ("rf", rnd_clf),
        ("svc", svm_clf),
    ],
    voting="soft",
)

# Fit every model on the training set and print
# "<class name> <test accuracy>" for comparison with the hard-voting run.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf_soft):
    # fit() returns the fitted estimator itself, so predict() can be chained.
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.864
RandomForestClassifier 0.872
SVC 0.888
VotingClassifier 0.912


# 投票器集成--不同训练集相同分类器

1、有放回采样--Bagging（对应 `bootstrap=True`）  
2、无放回采样--Pasting（对应 `bootstrap=False`）

In [2]:
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Bagging ensemble of 500 decision trees. Each tree is trained on 100
# training instances drawn with replacement (bootstrap=True); n_jobs=-1
# asks scikit-learn to use all available CPU cores.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,
    random_state=42,
)

# Train on the training split and predict the held-out test split.
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)

# Last expression of the cell: the notebook displays the test accuracy.
accuracy_score(y_test, y_pred)

0.904

In [6]:
# Baseline for comparison with the bagging ensemble: one decision tree
# trained on the full training set.
tree_clf = DecisionTreeClassifier(random_state=42)
y_pred_tree = tree_clf.fit(X_train, y_train).predict(X_test)

# Last expression of the cell: the notebook displays the test accuracy.
accuracy_score(y_test, y_pred_tree)

0.856

In [None]:
# Bagging 的采样方法：对含 m 个实例的训练集有放回地抽样 m 次。因此对每个实例，m 次都未被选中的概率是 (1-1/m)**m；当 m 趋于无穷大时，该概率趋近 1/e≈0.368，即 m 次中至少被选中 1 次的概率约为 0.632。
# 设训练集包含实例 a1~am，令 xi=0（ai 未被选中）或 1（ai 至少被选中 1 次），则 sum(xi) 就是被选中过的不同实例数。由期望的线性性，sum(xi) 的期望是 m 个 (1*0.632+0*0.368) 之和，即约 0.632m 个实例。

In [14]:
# NOTE: np is imported here but not used by this cell.
import numpy as np
# Numerically check the limit (1 - 1/m)**m -> 1/e by plugging in m = 1e10.
# The displayed value agrees with 1/e (about 0.36787944) only to roughly
# 7 significant digits; the gap is presumably floating-point rounding in
# 1 - 1/1e10 being amplified by the huge exponent -- NOTE(review): confirm.
(1-1/1e10)**1e10

0.36787941071455793

In [3]:
# Out-of-bag (OOB) evaluation: with oob_score=True the ensemble is scored on
# training instances that were left out of the trees' bootstrap samples, so
# no separate validation set is needed.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
    random_state=40,
)
bag_clf.fit(X_train, y_train)

# Last expression of the cell: the notebook displays the OOB accuracy estimate.
bag_clf.oob_score_

0.9013333333333333

In [4]:
# Show the out-of-bag decision function: class-probability estimates for the
# training instances (in the displayed output, one row per instance and one
# column per class).
bag_clf.oob_decision_function_

array([[0.31746032, 0.68253968],
       [0.34117647, 0.65882353],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.08379888, 0.91620112],
       [0.31693989, 0.68306011],
       [0.02923977, 0.97076023],
       [0.97687861, 0.02312139],
       [0.97765363, 0.02234637],
       [0.74404762, 0.25595238],
       [0.        , 1.        ],
       [0.71195652, 0.28804348],
       [0.83957219, 0.16042781],
       [0.97777778, 0.02222222],
       [0.0625    , 0.9375    ],
       [0.        , 1.        ],
       [0.97297297, 0.02702703],
       [0.95238095, 0.04761905],
       [1.        , 0.        ],
       [0.01704545, 0.98295455],
       [0.38947368, 0.61052632],
       [0.88700565, 0.11299435],
       [1.        , 0.        ],
       [0.96685083, 0.03314917],
       [0.        , 1.        ],
       [0.99428571, 0.00571429],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.64804469, 0.35195531],
       [0.

In [5]:
from sklearn.metrics import accuracy_score
# Predict the held-out test split with the ensemble currently bound to
# bag_clf, so its test accuracy can be compared with the out-of-bag estimate.
y_pred = bag_clf.predict(X_test)
# Last expression of the cell: the notebook displays the test accuracy.
accuracy_score(y_test, y_pred)

0.912

In [None]:
# BaggingClassifier 的随机贴片/随机子空间
随机贴片：