In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,BaggingClassifier,ExtraTreesClassifier
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# **VOTING** **CLASSIFIER**

## **1.** **Hard** **Voting** **Classifier**

In [None]:
#load moons data
Xdata,ydata=datasets.make_moons(n_samples=100,noise=0.15)
X=pd.DataFrame(Xdata)
y=pd.DataFrame(ydata)

#to split the dataset
X_train, X_test, y_train, y_test =train_test_split(X,y,test_size=0.20, random_state=42)

In [None]:
X.head()

Unnamed: 0,0,1
0,1.193301,-0.447967
1,0.242773,0.255242
2,-0.521021,1.025164
3,1.702371,-0.283151
4,0.63662,0.854684


In [None]:
y.head()

Unnamed: 0,0
0,1
1,1
2,0
3,1
4,0


In [None]:
log_clf=LogisticRegression()
forest_clf=RandomForestClassifier()
svm_clf=SVC(probability=True)

In [None]:
voting_hard_clf=VotingClassifier([('lr',log_clf),('rf',forest_clf),('svm',svm_clf)],voting='hard')

In [None]:
for clf in (log_clf,forest_clf,svm_clf,voting_hard_clf):
  clf.fit(X_train,y_train)
  y_pred=clf.predict(X_test)
  print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

  y = column_or_1d(y, warn=True)
  
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LogisticRegression 0.9
RandomForestClassifier 1.0
SVC 0.95
VotingClassifier 0.95


## **2.** **Soft** **Voting** **Classifier**

In [None]:
voting_soft_clf=VotingClassifier([('lr',log_clf),('rf',forest_clf),('svm',svm_clf)],voting='soft')

In [None]:
for clf in (log_clf,forest_clf,svm_clf,voting_soft_clf):
  clf.fit(X_train,y_train)
  y_pred=clf.predict(X_test)
  print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

  y = column_or_1d(y, warn=True)
  
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LogisticRegression 0.9
RandomForestClassifier 1.0
SVC 0.95
VotingClassifier 0.95


# **BAGGING** **AND** **PASTING**

In [None]:
#training with bagging
bagging_clf=BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=50,bootstrap=True,n_jobs=-1)


In [None]:
#training with single decision tree classifier
tree_clf=DecisionTreeClassifier()

In [None]:
#compare accuracy scores in both
for clf in (tree_clf,bagging_clf):
  clf.fit(X_train,y_train)
  y_pred=clf.predict(X_test)
  print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

DecisionTreeClassifier 0.95


  y = column_or_1d(y, warn=True)


BaggingClassifier 1.0


In [None]:
#for training with pasting use bootstrap=False

### **Out** **of** **Bag** **Evaluation**

In [37]:
oob_eval_clf=BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=50,bootstrap=True,oob_score=True,n_jobs=-1)
oob_eval_clf.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [38]:
oob_eval_clf.oob_score_

0.925

In [39]:
#compare oob score with accuracy score
y_pred=oob_eval_clf.predict(X_test)

print(accuracy_score(y_test,y_pred))

0.95


In [40]:
#decision function

oob_eval_clf.oob_decision_function_

array([[0.68560606, 0.31439394],
       [0.09558824, 0.90441176],
       [0.06007067, 0.93992933],
       [1.        , 0.        ],
       [0.51526718, 0.48473282],
       [0.06690141, 0.93309859],
       [0.82706767, 0.17293233],
       [0.97435897, 0.02564103],
       [0.97358491, 0.02641509],
       [0.        , 1.        ],
       [0.76806084, 0.23193916],
       [0.07749077, 0.92250923],
       [0.01481481, 0.98518519],
       [0.38113208, 0.61886792],
       [0.1884058 , 0.8115942 ],
       [0.01858736, 0.98141264],
       [0.01754386, 0.98245614],
       [0.0648855 , 0.9351145 ],
       [0.96923077, 0.03076923],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.01470588, 0.98529412],
       [0.        , 1.        ],
       [0.12546125, 0.87453875],
       [0.25274725, 0.74725275],
       [0.81851852, 0.18148148],
       [0.        , 1.        ],
       [0.34586466, 0.65413534],
       [1.        , 0.        ],
       [1.        , 0.        ],
       [0.

# **RANDOM** **PATCHES** **AND** **RANDOM** **SUBSPACES**

In [41]:
#random patches means sampling both instances and features
#random subspaces means sampling features

### **Random** **Subspaces**

In [51]:
rnd_subspaces_clf=BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=1,bootstrap=False,bootstrap_features=True,max_features=0.5,n_jobs=-1)
rnd_subspaces_clf.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [52]:
y_pred=rnd_subspaces_clf.predict(X_test)

print(accuracy_score(y_test,y_pred))

0.55


# **RANDOM** **FOREST**

In [53]:
forest_clf=RandomForestClassifier(n_estimators=500,max_leaf_nodes=16,n_jobs=-1)
forest_clf.fit(X_train,y_train)

  


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=16, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [54]:
y_pred=forest_clf.predict(X_test)
print(accuracy_score(y_test,y_pred))

1.0


### **1.** **Extra** **Trees** **Classifiers**

In [56]:
extra_tree_clf=ExtraTreesClassifier(n_estimators=500,max_leaf_nodes=16,n_jobs=-1)
extra_tree_clf.fit(X_train,y_train)

  


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=16, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [57]:
y_pred=extra_tree_clf.predict(X_test)
print(accuracy_score(y_test,y_pred))

1.0


### **2.** **Feature** **Importance**

In [58]:
irisd=datasets.load_iris()
print(irisd.keys())

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [59]:
X=pd.DataFrame(irisd['data'])
y=pd.DataFrame(irisd['target'])

X.head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [60]:
forest_clf=RandomForestClassifier(n_estimators=500,n_jobs=-1)
forest_clf.fit(X,y)

  


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [61]:
forest_clf.feature_importances_

array([0.10312971, 0.02664204, 0.43344965, 0.43677861])

In [62]:
for name,score in zip(irisd['feature_names'],forest_clf.feature_importances_):
  print(name,score)

sepal length (cm) 0.10312970742659718
sepal width (cm) 0.026642035930907634
petal length (cm) 0.4334496465680227
petal width (cm) 0.43677861007447255
