In [112]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=10000, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [113]:
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

rnd_clf = RandomForestClassifier()
log_clf = LogisticRegression()
svc_clf = SVC(probability=True)
voting_clf = VotingClassifier(
  estimators=[('rnd', rnd_clf), ('log', log_clf), ('svc', svc_clf)],
  voting='soft',
)

voting_clf.fit(X_train, y_train)

In [114]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svc_clf, voting_clf):
  clf.fit(X_train, y_train)
  print(clf.__class__.__name__, accuracy_score(y_test, clf.predict(X_test)))

LogisticRegression 0.8315
RandomForestClassifier 0.85
SVC 0.867
VotingClassifier 0.862


In [115]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, accuracy_score

bag_clf = BaggingClassifier(
  DecisionTreeClassifier(),
  n_estimators=500,
  max_samples=100, bootstrap=True, n_jobs=-1, # Use all cores
  oob_score=True
)
bag_clf.fit(X_train, y_train)
print(bag_clf.oob_score_)
print(confusion_matrix(y_test, bag_clf.predict(X_test)))
print(accuracy_score(y_test, bag_clf.predict(X_test)))

0.864625
[[882 114]
 [153 851]]
0.8665


In [116]:
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, n_jobs=-1)
rnd_clf.fit(X_train, y_train)
print(confusion_matrix(y_test, rnd_clf.predict(X_test)))
print(accuracy_score(y_test, rnd_clf.predict(X_test)))

[[881 115]
 [149 855]]
0.868


In [117]:
import numpy as np
from sklearn.metrics import precision_score
from sklearn.ensemble import GradientBoostingClassifier

X_train, X_test, y_train, y_test = train_test_split(X, y)

gbrt = GradientBoostingClassifier(max_depth=2, n_estimators=120)
gbrt.fit(X_train, y_train)

errors = [precision_score(y_test, pred) for pred in gbrt.staged_predict(X_test)]
best_n_estimators = np.argmin(errors) + 1

gbrt_best = GradientBoostingClassifier(max_depth=2, n_estimators=best_n_estimators)
gbrt_best.fit(X_train, y_train)

print(confusion_matrix(y_test, gbrt.predict(X_test)))
print(confusion_matrix(y_test, gbrt_best.predict(X_test)))


[[1084  158]
 [ 183 1075]]
[[1074  168]
 [ 199 1059]]


In [118]:
import xgboost
xgb = xgboost.XGBClassifier()
xgb.fit(X_train, y_train)
print(confusion_matrix(y_test, gbrt_best.predict(X_test)))
print(confusion_matrix(y_test, xgb.predict(X_test)))

[[1074  168]
 [ 199 1059]]
[[1082  160]
 [ 207 1051]]


In [119]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
X, y = mnist['data'], mnist['target']

In [120]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=(1/7))
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=(1/6))
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_val.shape)

(50000, 784) (50000,) (10000, 784) (10000,) (10000, 784) (10000,)


In [121]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

rnd = RandomForestClassifier(max_leaf_nodes=12)
ext = ExtraTreesClassifier(max_leaf_nodes=12)
bag = BaggingClassifier(
  LogisticRegression(multi_class='multinomial'),
  n_estimators=10,
  max_samples=100, bootstrap=True, n_jobs=-1, # Use all cores
  oob_score=True
)
voting = VotingClassifier(
  estimators=[
    ('rnd', rnd), ('ext', ext), ('bag', bag),
  ],
  n_jobs=-1,
  voting='soft',
)

In [122]:
voting.fit(X_train, y_train)
print(confusion_matrix(y_val, voting.predict(X_val)))

[[ 976    1    5    6    4    1    9    1    9    2]
 [   0 1150    2    8    0    2    2    1    5    1]
 [  17   72  729   12   21    0   29   20   38    0]
 [  13   31   18  865    2    9    6   25   39   15]
 [   4   24    8    1  849    1    5    1    5   74]
 [  39   64    2  139   26  519   29   11   17   42]
 [  24   38   15    1   15   12  926    1    4    0]
 [  11   43   11    4   22    0    0  955    4   26]
 [   7   99    5   68    9    9    5    7  673   39]
 [  13   19    4   22   54    0    2   32    5  810]]


In [123]:
rnd.fit(X_train, y_train)
print(confusion_matrix(y_val, rnd.predict(X_val)))

[[ 978    2    5    5    1    0    5    3   15    0]
 [   0 1146    5    7    1    1    3    2    4    2]
 [  25   56  737   27   16    2   28   28   12    7]
 [  35   32   29  816    5    8    4   24   27   43]
 [   4    9    6    4  756    0   27   17   15  134]
 [  92   41    4  277   34  277   30   23   25   85]
 [  42   31   24   17   32    7  869    4   10    0]
 [  10   30   23    1   17    0    0  949    7   39]
 [   4  106   23   80   10    2   14   10  616   56]
 [  15    7    6   22   51    0    7   90   12  751]]


In [124]:
ext.fit(X_train, y_train)
print(confusion_matrix(y_val, ext.predict(X_val)))

[[ 965    0    4   10    1    0   19    7    8    0]
 [   0 1151    3    5    0    2    1    2    6    1]
 [  37   91  695   24   26    0   22   35    7    1]
 [  28   41   19  845    3    8    3   27   23   26]
 [   7   16    7    4  792    1   27   41    9   68]
 [  72   38    8  310   47  252   24   41   26   70]
 [  64   41   18   28   22    3  846    9    5    0]
 [  10   53   21    3   12    0    0  944    3   30]
 [  13  133   26   96   12    2   15   14  571   39]
 [  11   22    6   33   93    3   14  104    3  672]]


In [125]:
bag.fit(X_train, y_train)
print(confusion_matrix(y_val, bag.predict(X_val)))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


[[ 980    1    6    3    4   10    4    3    3    0]
 [   0 1148    3    1    1    3    2    1   12    0]
 [   8   64  758   20   29    1   20   19   19    0]
 [  11   28   30  809    2   48    3   21   43   28]
 [   1   47    3    0  846    1    4    1    2   67]
 [  26   68    7   38   22  664   18    6   13   26]
 [  19   69   14    0   17   42  872    0    3    0]
 [  11   49    9    4   36    2    0  911    2   52]
 [  18  100   21   27   19   35    4   10  649   38]
 [  14   26    4   13   93    3    0   32    3  773]]


In [126]:
for clf in (voting, rnd, ext, bag):
  print(clf.__class__.__name__, accuracy_score(y_val, clf.predict(X_val)))

VotingClassifier 0.8452
RandomForestClassifier 0.7895
ExtraTreesClassifier 0.7733
BaggingClassifier 0.841


In [127]:
for clf in (voting, rnd, ext, bag):
  print(clf.__class__.__name__, accuracy_score(y_test, clf.predict(X_test)))

VotingClassifier 0.8396
RandomForestClassifier 0.7841
ExtraTreesClassifier 0.7684
BaggingClassifier 0.8334


In [128]:
from sklearn.base import TransformerMixin

class MNIST_Transformer(TransformerMixin):
  def __init__(self, clfs=[]):
    self.clfs = clfs
  
  def fit(self, X, y=None):
    return self

  def transform(self, X, y=None):
    return np.concatenate([clf.predict(X).astype(np.int32).reshape(-1, 1) for clf in self.clfs], axis=1)

transformer = MNIST_Transformer(clfs=[rnd, ext, bag, voting])

X_meta = transformer.fit_transform(X_val)
X_meta_test = transformer.fit_transform(X_test)

In [129]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
  ('transformer', transformer),
  ('meta_clf', RandomForestClassifier()),
])

clf.fit(X_val, y_val)
print(confusion_matrix(y_test, clf.predict(X_test)))
print(accuracy_score(y_test, clf.predict(X_test)))

[[ 945    0   10    1    2   12   10    5    6    0]
 [   0 1102    5    5    0    2    2    6    7    2]
 [   8   24  848   21   23    2   27   30   23    4]
 [   8   23   32  774    3   44   12   12   45   15]
 [   1    7    7    1  852   10   12   11   12   61]
 [  21   34    6   58   29  694   12    8   28   31]
 [  19   20   26    4   11   18  870    3   11    2]
 [   7   19   12    6    9    4    2  902    2   31]
 [  11   60   30   50   12   22    9    8  799   30]
 [   7   11   10   12   58    6    0   60   16  816]]
0.8602
