In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, BaggingClassifier

# datasets
from sklearn.datasets import make_moons

# VotingClassifier

In [2]:
X, y = make_moons(n_samples=500, noise=0.3, random_state=42)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [4]:
voting_clf = VotingClassifier(
    estimators=[
        ('lr', LogisticRegression(random_state=42)),
        ('rf', RandomForestClassifier(random_state=42)),
        ('svc', SVC(random_state=42))
    ]
)

In [5]:
voting_clf.fit(X_train, y_train)

## access estimator and named_estimator

In [6]:
# list all estimators
voting_clf.estimators_

[LogisticRegression(random_state=42),
 RandomForestClassifier(random_state=42),
 SVC(random_state=42)]

In [7]:
# list all pair name and estimator
voting_clf.named_estimators

{'lr': LogisticRegression(random_state=42),
 'rf': RandomForestClassifier(random_state=42),
 'svc': SVC(random_state=42)}

In [8]:
# difference here is : with _ means that it is already trained
voting_clf.named_estimators_

{'lr': LogisticRegression(random_state=42),
 'rf': RandomForestClassifier(random_state=42),
 'svc': SVC(random_state=42)}

In [9]:
# show accuracy score for each estimator
for name, clf in voting_clf.named_estimators_.items():
    print("accuracy score of", name, "ist", clf.score(X_test, y_test))

accuracy score of lr ist 0.864
accuracy score of rf ist 0.896
accuracy score of svc ist 0.896


In [10]:
# score of voting
voting_clf.score(X_test, y_test)

0.912

In [11]:
# test result
voting_clf.predict(X_test[0:1])

array([1], dtype=int64)

In [12]:
# result on separate
[clf.predict(X_test[:1]) for clf in voting_clf.estimators_]

[array([1], dtype=int64), array([1], dtype=int64), array([0], dtype=int64)]

## using voting = soft

In [13]:
voting_clf.voting = 'soft'

In [14]:
# this will not work later
voting_clf.named_estimators_['svc'].probability = True

# if use this, then get error in 
# voting_clf.score(X_test, y_test)
# because _ means already train

In [15]:
# this will work
voting_clf.named_estimators['svc'].probability = True

In [16]:
voting_clf.fit(X_train, y_train)

In [17]:
voting_clf.score(X_test, y_test)

0.92

# Bagging and Pasting Classifier

In [18]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [19]:
X_train.shape

(375, 2)

In [20]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, 
                            max_samples=100, n_jobs=-1,random_state=42)

In [21]:
bag_clf.fit(X_train, y_train)

- Bagging uses automatically soft voting if predict_proba is availabel.
- if wanna use Pasting then bootstrap=False

In [22]:
bag_clf.score(X_test, y_test)

0.904

## oob evaluation

In [24]:
X_train.shape

(375, 2)

In [25]:
y_train.shape

(375,)

In [26]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, 
                            oob_score=True, n_jobs=-1,random_state=42)

In [27]:
bag_clf.fit(X_train, y_train)

In [28]:
bag_clf.oob_score_

0.896

In [29]:
bag_clf.score(X_test, y_test)

0.92

In [30]:
y_train[:3]

array([1, 0, 0], dtype=int64)

In [31]:
bag_clf.oob_decision_function_

array([[0.32352941, 0.67647059],
       [0.3375    , 0.6625    ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.06145251, 0.93854749],
       [0.35465116, 0.64534884],
       [0.01142857, 0.98857143],
       [0.98930481, 0.01069519],
       [0.97927461, 0.02072539],
       [0.75586854, 0.24413146],
       [0.0049505 , 0.9950495 ],
       [0.75520833, 0.24479167],
       [0.82122905, 0.17877095],
       [0.98461538, 0.01538462],
       [0.06315789, 0.93684211],
       [0.00490196, 0.99509804],
       [0.99004975, 0.00995025],
       [0.92513369, 0.07486631],
       [1.        , 0.        ],
       [0.03409091, 0.96590909],
       [0.35087719, 0.64912281],
       [0.91111111, 0.08888889],
       [1.        , 0.        ],
       [0.96319018, 0.03680982],
       [0.        , 1.        ],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.6635514 , 0.3364486 ],
       [0.

In [32]:
bag_clf.oob_decision_function_.shape

(375, 2)

In [37]:
bag_clf2 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500, max_samples=0.8,
                            oob_score=True, n_jobs=-1,random_state=42)

In [38]:
bag_clf2.fit(X_train, y_train)

In [39]:
bag_clf2.score(X_test, y_test)

0.904

In [40]:
bag_clf2.oob_score_

0.9066666666666666

In [42]:
bag_clf2.oob_decision_function_[:3]

array([[0.40758294, 0.59241706],
       [0.30541872, 0.69458128],
       [1.        , 0.        ]])

## method of random patch and of random subspace

In [33]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

In [34]:
bag_clf3 = BaggingClassifier(DecisionTreeClassifier(), n_estimators=500,
                             max_samples=0.6, bootstrap=True,
                             max_features=0.8, bootstrap_features=True, # random subspace
                             n_jobs=-1, random_state=42)

In [35]:
bag_clf3.fit(X_train, y_train)

In [36]:
bag_clf3.score(X_test, y_test)

0.856

# Random forest

- there is no param splitter=, because rf use 1 random feature, selected from max_features =, in Decision Tree, splitter = best/random
- comparision to BaggingClassifier, rf doesnt have bootstrap_features=