In [1]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, ExtraTreesClassifier
# ensemble methods combine the predictions of several base estimators (by averaging or voting)
from sklearn.neighbors import KNeighborsClassifier

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

# Load iris as plain arrays: data_i holds the features, targeti the class labels.
data_i, targeti = load_iris(return_X_y=True)

# Stratified split so each class keeps its proportion in both parts.
# NOTE(review): test_size=0.7 holds out 70% of the rows and trains on only 30% -- confirm this is intended.
data_train, data_test, target_train, target_test = train_test_split(data_i, targeti, random_state=42,
                                                                    stratify=targeti, test_size=0.7)

# The base estimator is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form is accepted by both.
bagging = BaggingClassifier(KNeighborsClassifier(),
                            max_samples=0.5,  # each estimator is trained on half of the rows
                            max_features=0.5,  # ... and on half of the columns
                            random_state=42)  # reproducible row/column draws

# Random forest vs. Bagging: both expose n_estimators; the difference is that the
# forest's base learner is fixed to a decision tree, while Bagging accepts any estimator.
rndForest = RandomForestClassifier(n_estimators=10, random_state=42)

# max_samples only applies when bootstrap=True. With the default bootstrap=False it is
# ignored or rejected with a ValueError, depending on the scikit-learn version.
exTree = ExtraTreesClassifier(bootstrap=True,
                              max_samples=0.5,  # half rows
                              max_features=0.5,  # half columns
                              n_estimators=10,
                              random_state=42)

# All three share the same estimator API.
for model in (bagging, rndForest, exTree):
    model.fit(data_train, target_train)

# cross_val_score clones each estimator and refits it on folds of the data it is given,
# so these numbers are 5-fold accuracies computed on the held-out split only;
# they do not evaluate the fits made above.
for model in (bagging, rndForest, exTree):
    print(cross_val_score(model, data_test, target_test, cv=5).mean())


0.9523809523809523
0.9523809523809523
0.9619047619047618


In [2]:
# Out-of-fold predictions (5-fold) on the held-out split, one array per ensemble.
for model in (bagging, rndForest, exTree):
    print(cross_val_predict(model, data_test, target_test, cv=5))

[0 0 2 0 0 0 2 1 1 1 1 1 0 2 1 0 2 1 0 0 0 0 2 2 1 1 1 1 0 2 1 2 1 2 0 1 1
 0 2 2 2 0 2 1 0 1 0 0 0 1 0 0 2 2 2 0 2 1 0 2 2 0 2 2 2 2 1 2 0 2 1 2 0 0
 2 1 0 1 0 2 2 1 1 2 1 0 1 1 1 0 0 2 1 2 1 2 1 2 1 0 0 1 0 1 1]
[0 0 2 0 0 0 2 1 1 1 1 1 0 2 1 0 2 1 0 0 0 0 2 2 1 1 1 1 0 2 1 2 1 1 0 1 1
 0 2 2 2 0 2 1 0 1 0 0 0 1 0 0 2 1 2 0 2 1 0 2 2 0 2 2 2 2 1 2 0 2 1 2 0 0
 2 1 0 1 0 2 2 1 1 2 1 0 1 1 1 0 0 2 1 2 2 2 1 2 1 0 0 1 0 1 1]
[0 0 2 0 0 0 2 1 1 1 1 1 0 2 1 0 2 1 0 0 0 0 2 2 1 1 2 1 0 2 1 2 1 1 0 1 1
 0 2 2 2 0 2 1 0 1 0 0 0 1 0 0 2 1 2 0 2 1 0 2 2 0 2 2 2 2 1 2 0 2 1 2 0 0
 2 1 0 1 0 2 2 1 1 2 1 0 1 1 1 0 0 2 1 2 2 2 1 2 1 0 0 1 0 1 1]


In [3]:
from sklearn.ensemble import AdaBoostClassifier

# AdaBoost trains its weak learners one after another, re-weighting the samples so that
# each new learner concentrates on the examples the previous ones got wrong.
# random_state makes the printed numbers reproducible across runs.
ada_clf = AdaBoostClassifier(n_estimators=100, random_state=42)
ada_clf.fit(data_train, target_train)

# Accuracy of the model fitted above on the held-out split.
print(ada_clf.score(data_test, target_test))
# The next two calls refit clones of ada_clf on folds of the held-out split,
# so they do not use the fit made above.
print(cross_val_score(ada_clf, data_test, target_test, cv=5).mean())
print(cross_val_predict(ada_clf, data_test, target_test, cv=5))

0.9714285714285714
0.9523809523809523
[0 0 2 0 0 0 2 1 1 1 1 1 0 2 1 0 2 1 0 0 0 0 2 2 1 1 2 1 0 2 1 2 1 1 0 2 1
 0 1 2 2 0 2 1 0 1 0 0 0 1 0 0 2 2 2 0 2 1 0 2 2 0 2 2 2 2 1 2 0 2 2 2 0 0
 2 1 0 1 0 2 2 1 1 2 1 0 1 1 1 0 0 2 1 2 2 2 1 2 1 0 0 1 0 1 1]


In [4]:
from sklearn.ensemble import GradientBoostingClassifier, GradientBoostingRegressor

# Gradient boosting classifier with 100 boosting stages.
# random_state makes the printed numbers reproducible across runs.
gbc = GradientBoostingClassifier(n_estimators=100, random_state=42)
gbc.fit(data_train, target_train)

# Accuracy of the model fitted above on the held-out split.
print(gbc.score(data_test, target_test))
# The next two calls refit clones of gbc on folds of the held-out split,
# so they do not use the fit made above.
print(cross_val_score(gbc, data_test, target_test, cv=5).mean())
print(cross_val_predict(gbc, data_test, target_test, cv=5))

0.9428571428571428
0.9523809523809523
[0 0 2 0 0 0 2 1 1 1 1 1 0 2 1 0 2 1 0 0 0 0 2 2 1 1 2 1 0 2 1 2 1 1 0 1 1
 0 2 2 2 0 2 1 0 1 0 0 0 1 0 0 2 1 2 0 2 1 0 2 2 0 2 2 2 2 1 2 0 2 1 2 0 0
 2 1 0 1 0 2 2 1 1 2 1 0 1 1 1 0 0 2 1 2 2 2 1 2 1 0 0 1 0 1 1]


In [5]:
# Gradient boosting regressor (an ensemble of regression trees, not a single tree),
# fitted here on the integer class labels: its predictions are real numbers close to
# 0, 1 and 2 rather than class ids. random_state makes repeated runs give the same numbers.
gbr = GradientBoostingRegressor(n_estimators=100, random_state=42)
gbr.fit(data_train, target_train)

# For a regressor, score and cross_val_score report R^2 rather than accuracy.
print(gbr.score(data_test, target_test))
# The next two calls refit clones of gbr on folds of the held-out split.
print(cross_val_score(gbr, data_test, target_test, cv=5).mean())
print(cross_val_predict(gbr, data_test, target_test, cv=5))

0.9278549928225018
0.9251851432530074
[-1.21625101e-04 -1.21625101e-04  2.02461465e+00 -1.21625101e-04
 -1.21625101e-04 -1.21625101e-04  1.99825322e+00  1.08891462e+00
  8.22063846e-01  9.82651125e-01  9.84849620e-01  9.29685159e-01
 -1.21625101e-04  2.00143557e+00  9.72966777e-01 -1.21625101e-04
  1.71339666e+00  9.99053070e-01 -1.21625101e-04 -1.21625101e-04
 -1.21625101e-04 -4.23738530e-04  1.94885833e+00  2.03346285e+00
  1.01180741e+00  1.00741305e+00  1.79850294e+00  1.00240761e+00
 -4.23738530e-04  1.94396564e+00  1.68211090e+00  2.00189718e+00
  1.00747087e+00  9.99429148e-01  5.23189001e-04  1.68350937e+00
  9.90413789e-01 -4.23738530e-04  1.93903375e+00  1.99823734e+00
  2.00213187e+00 -4.23738530e-04  1.99700378e+00  9.98702974e-01
  2.80764727e-04  1.00381518e+00  2.80764727e-04  2.80764727e-04
  2.80764727e-04  9.69247481e-01  2.80764727e-04  1.09882778e-03
  1.99924267e+00  1.01051826e+00  1.99784143e+00  2.80764727e-04
  2.00021800e+00  9.98992152e-01  1.42530170e-03  2.

In [7]:
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn import svm

# (label, estimator) pairs, written the same way as Pipeline steps.
voters = [
    ("someone", rndForest),
    ("bagging", bagging),
    ("LogisticRegression", LogisticRegression()),
    ("SVC", svm.SVC()),
]

# hard vote: the (weighted) majority of predicted labels wins; on a tie the first
#            class in ascending sort order is chosen
# soft vote: take the (weighted) average of the predicted probabilities
# weights line up with `voters`, so the forest and the logistic regression count double.
vc = VotingClassifier(estimators=voters, voting="hard", weights=[2, 1, 2, 1])
vc.fit(data_train, target_train)

# Hold-out accuracy of the fitted ensemble, then 5-fold results computed on the
# held-out split (cross-validation refits clones of vc).
holdout_accuracy = vc.score(data_test, target_test)
print(holdout_accuracy)
cv_accuracy = cross_val_score(vc, data_test, target_test, cv=5).mean()
print(cv_accuracy)
cv_labels = cross_val_predict(vc, data_test, target_test, cv=5)
print(cv_labels)

0.9047619047619048
0.9619047619047618
[0 0 2 0 0 0 2 1 1 1 1 1 0 2 1 0 2 1 0 0 0 0 2 2 1 1 2 1 0 2 1 2 1 1 0 1 1
 0 2 2 2 0 2 1 0 1 0 0 0 1 0 0 2 2 2 0 2 1 0 2 2 0 2 2 2 2 1 2 0 2 2 2 0 0
 2 1 0 1 0 2 2 1 1 2 1 0 1 1 1 0 0 2 1 2 2 2 1 2 1 0 0 1 0 1 1]


In [8]:
from sklearn.ensemble import VotingRegressor
from sklearn.linear_model import BayesianRidge, LassoLars, LinearRegression

# (label, regressor) pairs whose predictions are combined by a weighted average.
regressors = [
    ("BayesianRidge", BayesianRidge()),
    ("LassoLars", LassoLars(alpha=0.05)),
    ("LinearRegression", LinearRegression()),
    ("SVR", svm.SVR()),
]

# weights line up with `regressors`: BayesianRidge and LinearRegression count double.
vr = VotingRegressor(estimators=regressors, weights=[2, 1, 2, 1])
vr.fit(data_train, target_train)

# The targets are the integer class labels, so the predictions are real numbers.
# For a regressor, score and cross_val_score report R^2.
holdout_r2 = vr.score(data_test, target_test)
print(holdout_r2)
cv_r2 = cross_val_score(vr, data_test, target_test, cv=5).mean()
print(cv_r2)
cv_values = cross_val_predict(vr, data_test, target_test, cv=5)
print(cv_values)

0.919950556449805
0.9170393061332138
[ 0.02355464  0.12214374  1.75632896  0.03087309  0.06342169 -0.00372767
  1.7554274   1.20177498  1.23186683  1.14809371  1.13109518  1.32618274
  0.11523227  2.12312881  1.02127014 -0.05081407  1.71945602  1.02648813
  0.04677188  0.03824467  0.08749698  0.13576185  1.68296142  1.53909217
  1.2384281   1.27040085  1.51241599  1.15471058  0.26480985  1.91373247
  0.86676884  1.74125646  1.26770572  1.33029681  0.08758937  0.86342452
  1.11178497  0.08129878  1.49495255  1.85027691  1.91777894  0.0286841
  1.91699821  1.33815013  0.10722927  1.16157637  0.042848    0.0504366
  0.01760286  1.25007095  0.06451276  0.04348767  1.79723432  1.40575314
  1.62983006 -0.03149432  1.67751472  0.92409511 -0.0943819   1.51009515
  1.72839503  0.05708036  1.63000767  1.69138994  1.47779243  1.80518716
  1.31669814  1.88714268  0.12142905  1.89051299  1.48950406  1.9263772
  0.1079189   0.03803772  1.89714844  1.19768404  0.23359411  1.03869615
  0.08600836  1.7