In [71]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.datasets import make_moons
# Synthetic two-class "moons" dataset: 3000 noisy points, seeded for reproducibility.
X, y = make_moons(
    n_samples=3000,
    noise=0.2,
    random_state=42,
)

In [72]:
from sklearn.model_selection import train_test_split
# Hold out 33% of the samples as a test set; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)


In [73]:
# Base learners for the voting ensemble. The stochastic estimators get a fixed
# seed (matching the seed used for the data and the split) so that accuracies
# are reproducible after a kernel restart.
log_clf = LogisticRegression()
# probability=True enables predict_proba(), which soft voting needs; the
# probability estimation shuffles data internally, hence the seed.
svm_clf = SVC(probability=True, random_state=42)
rnd_clf = RandomForestClassifier(random_state=42)

In [74]:
# Combine the three base learners; "soft" voting averages their predicted
# class probabilities rather than counting majority votes.
voting_clf = VotingClassifier(
    estimators=[
        ("lr", log_clf),
        ("rf", rnd_clf),
        ("svc", svm_clf),
    ],
    voting="soft",
)

# Evaluating the soft voting classifier

In [75]:
from sklearn.metrics import accuracy_score
# Train every individual model plus the ensemble on the same split and print
# each one's test accuracy for a side-by-side comparison.
for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, pred))
  
    

LogisticRegression 0.8606060606060606
RandomForestClassifier 0.9696969696969697
SVC 0.9707070707070707
VotingClassifier 0.9717171717171718


In [76]:
# We can use predict_proba() function to calculate the probability for each label

In [77]:
# To use hard voting instead, just change the voting parameter to "hard"

## Using Bagging and Pasting

In [93]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
# n_estimators : number of decision trees trained.
# max_samples  : number of training instances drawn for each tree.
# bootstrap    : True samples with replacement (bagging).
# n_jobs=-1    : use all available CPU cores for training.
# oob_score    : also score the ensemble on each tree's out-of-bag instances.
# random_state : fixed seed so the resampling and resulting scores are reproducible.
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=1000,
    max_samples=1000,
    bootstrap=True,
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)
bag_clf.fit(X_train, y_train)

In [94]:
# Predict test-set labels with the fitted bagging ensemble.
y_pred = bag_clf.predict(X_test)

In [95]:
# Test-set accuracy of the bagging ensemble's predictions.
accuracy_score(y_test,y_pred)

0.9696969696969697

In [96]:
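# BaggingClassifier automatically performs soft voting instead of hard voting when the base estimator can estimate class probabilities (i.e. it has a predict_proba() method), as DecisionTreeClassifier does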

In [97]:
# Accuracy estimated on the out-of-bag instances (available because oob_score=True).
bag_clf.oob_score_

0.9567164179104478

In [84]:
# Dimensions of the training feature matrix: (number of instances, number of features).
X_train.shape

(2010, 2)

In [98]:
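# When each bootstrap sample is as large as the training set, on average only about 63% of the training instances are drawn for any given predictor.
# The remaining ~37% are never seen by that predictor; they are called its out-of-bag (oob) instances and can act as a validation set.
# (Here max_samples=1000 out of 2010 training instances, so each tree samples only roughly 39% of them and the rest are out-of-bag.)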

In [99]:
# If the base estimator has a predict_proba() method, oob_decision_function_ returns the out-of-bag class probabilities for each training instance

In [100]:
# Out-of-bag class-probability estimates: one row per training instance, one column per class.
bag_clf.oob_decision_function_

array([[0.0049505 , 0.9950495 ],
       [0.00343643, 0.99656357],
       [1.        , 0.        ],
       ...,
       [0.        , 1.        ],
       [0.94098361, 0.05901639],
       [0.        , 1.        ]])

## Random Patches and Random Subspaces

In [102]:
# Random Patches and Random Subspaces are ensemble techniques related to feature sampling. They extend the idea of bagging by incorporating randomness not only in the training samples but also in the features used by each base learner.

In [103]:
# Sampling both training instances and features is called the Random Patches method
# Keeping all training instances but sampling features is called the Random Subspaces method


## Random Forests

In [104]:
# A Random Forest is an ensemble of Decision Trees, generally trained via the bagging
# method (or sometimes pasting), typically with max_samples set to the size of the training set.

In [105]:
# Instead of building a BaggingClassifier and passing it a DecisionTreeClassifier(), we can use RandomForestClassifier (or RandomForestRegressor for regression tasks)

In [107]:
from sklearn.ensemble import RandomForestClassifier
# 500 trees, each capped at 16 leaf nodes, trained on all CPU cores.
# random_state fixes the bootstrap and feature sampling so the reported
# accuracy is reproducible (same seed as the data and the split).
rnnd_clf = RandomForestClassifier(
    n_estimators=500,
    max_leaf_nodes=16,
    n_jobs=-1,
    random_state=42,
)

In [108]:
# Train the random forest on the training split.
rnnd_clf.fit(X_train,y_train)

In [109]:
# Predict test-set labels with the fitted random forest.
prediction = rnnd_clf.predict(X_test)

In [110]:
# Test-set accuracy of the random forest's predictions.
accuracy_score(y_test,prediction)

0.9676767676767677