In [61]:
import numpy as np
import pandas as pd
import xgboost
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier,VotingClassifier,BaggingClassifier,ExtraTreesClassifier,AdaBoostClassifier,GradientBoostingRegressor
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,mean_squared_error
from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor

# **VOTING** **CLASSIFIER**

## **1.** **Hard** **Voting** **Classifier**

In [None]:
#load moons data
Xdata,ydata=datasets.make_moons(n_samples=100,noise=0.15)
X=pd.DataFrame(Xdata)
y=pd.DataFrame(ydata)

#to split the dataset
X_train, X_test, y_train, y_test =train_test_split(X,y,test_size=0.20, random_state=42)

In [None]:
X.head()

Unnamed: 0,0,1
0,1.766394,0.047461
1,0.731197,1.046674
2,0.262887,1.07094
3,-0.743395,0.910862
4,-0.70717,0.900018


In [None]:
y.head()

Unnamed: 0,0
0,1
1,0
2,0
3,0
4,0


In [None]:
log_clf=LogisticRegression()
forest_clf=RandomForestClassifier()
svm_clf=SVC(probability=True)

In [None]:
voting_hard_clf=VotingClassifier([('lr',log_clf),('rf',forest_clf),('svm',svm_clf)],voting='hard')

In [None]:
for clf in (log_clf,forest_clf,svm_clf,voting_hard_clf):
  clf.fit(X_train,y_train)
  y_pred=clf.predict(X_test)
  print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

  y = column_or_1d(y, warn=True)
  
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LogisticRegression 0.9
RandomForestClassifier 1.0
SVC 1.0
VotingClassifier 1.0


## **2.** **Soft** **Voting** **Classifier**

In [None]:
voting_soft_clf=VotingClassifier([('lr',log_clf),('rf',forest_clf),('svm',svm_clf)],voting='soft')

In [None]:
for clf in (log_clf,forest_clf,svm_clf,voting_soft_clf):
  clf.fit(X_train,y_train)
  y_pred=clf.predict(X_test)
  print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

  y = column_or_1d(y, warn=True)
  
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


LogisticRegression 0.9
RandomForestClassifier 1.0
SVC 1.0
VotingClassifier 1.0


# **BAGGING** **AND** **PASTING**

In [None]:
#training with bagging
bagging_clf=BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=50,bootstrap=True,n_jobs=-1)


In [None]:
#training with single decision tree classifier
tree_clf=DecisionTreeClassifier()

In [None]:
#compare accuracy scores in both
for clf in (tree_clf,bagging_clf):
  clf.fit(X_train,y_train)
  y_pred=clf.predict(X_test)
  print(clf.__class__.__name__,accuracy_score(y_test,y_pred))

DecisionTreeClassifier 1.0


  y = column_or_1d(y, warn=True)


BaggingClassifier 1.0


In [None]:
#for training with pasting use bootstrap=False

### **Out** **of** **Bag** **Evaluation**

In [None]:
oob_eval_clf=BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=50,bootstrap=True,oob_score=True,n_jobs=-1)
oob_eval_clf.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [None]:
oob_eval_clf.oob_score_

0.9125

In [None]:
#compare oob score with accuracy score
y_pred=oob_eval_clf.predict(X_test)

print(accuracy_score(y_test,y_pred))

1.0


In [None]:
#decision function

oob_eval_clf.oob_decision_function_

array([[0.01976285, 0.98023715],
       [0.00357143, 0.99642857],
       [0.74444444, 0.25555556],
       [0.91633466, 0.08366534],
       [0.77859779, 0.22140221],
       [0.09689922, 0.90310078],
       [0.0620438 , 0.9379562 ],
       [0.15750916, 0.84249084],
       [0.99253731, 0.00746269],
       [0.99206349, 0.00793651],
       [0.98490566, 0.01509434],
       [0.06367041, 0.93632959],
       [0.85660377, 0.14339623],
       [0.99610895, 0.00389105],
       [0.02692308, 0.97307692],
       [0.90421456, 0.09578544],
       [0.99233716, 0.00766284],
       [0.04597701, 0.95402299],
       [0.65354331, 0.34645669],
       [0.81818182, 0.18181818],
       [0.07633588, 0.92366412],
       [1.        , 0.        ],
       [0.90545455, 0.09454545],
       [0.08394161, 0.91605839],
       [0.03543307, 0.96456693],
       [0.10909091, 0.89090909],
       [0.99638989, 0.00361011],
       [0.9822695 , 0.0177305 ],
       [0.07364341, 0.92635659],
       [0.50537634, 0.49462366],
       [0.

# **RANDOM** **PATCHES** **AND** **RANDOM** **SUBSPACES**

In [None]:
#random patches means sampling both instances and features
#random subspaces means sampling features

### **Random** **Subspaces**

In [None]:
rnd_subspaces_clf=BaggingClassifier(DecisionTreeClassifier(),n_estimators=500,max_samples=1,bootstrap=False,bootstrap_features=True,max_features=0.5,n_jobs=-1)
rnd_subspaces_clf.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


BaggingClassifier(base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                        class_weight=None,
                                                        criterion='gini',
                                                        max_depth=None,
                                                        max_features=None,
                                                        max_leaf_nodes=None,
                                                        min_impurity_decrease=0.0,
                                                        min_impurity_split=None,
                                                        min_samples_leaf=1,
                                                        min_samples_split=2,
                                                        min_weight_fraction_leaf=0.0,
                                                        presort='deprecated',
                                                        random_state=None,


In [None]:
y_pred=rnd_subspaces_clf.predict(X_test)

print(accuracy_score(y_test,y_pred))

0.3


# **RANDOM** **FOREST**

In [None]:
forest_clf=RandomForestClassifier(n_estimators=500,max_leaf_nodes=16,n_jobs=-1)
forest_clf.fit(X_train,y_train)

  


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=16, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [None]:
y_pred=forest_clf.predict(X_test)
print(accuracy_score(y_test,y_pred))

0.95


### **1.** **Extra** **Trees** **Classifiers**

In [None]:
extra_tree_clf=ExtraTreesClassifier(n_estimators=500,max_leaf_nodes=16,n_jobs=-1)
extra_tree_clf.fit(X_train,y_train)

  


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=16, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=500, n_jobs=-1,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [None]:
y_pred=extra_tree_clf.predict(X_test)
print(accuracy_score(y_test,y_pred))

1.0


### **2.** **Feature** **Importance**

In [None]:
irisd=datasets.load_iris()
print(irisd.keys())

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [None]:
X=pd.DataFrame(irisd['data'])
y=pd.DataFrame(irisd['target'])

X.head()

Unnamed: 0,0,1,2,3
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [None]:
forest_clf=RandomForestClassifier(n_estimators=500,n_jobs=-1)
forest_clf.fit(X,y)

  


RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=-1, oob_score=False, random_state=None, verbose=0,
                       warm_start=False)

In [None]:
forest_clf.feature_importances_

array([0.09417055, 0.0218612 , 0.44574412, 0.43822414])

In [None]:
for name,score in zip(irisd['feature_names'],forest_clf.feature_importances_):
  print(name,score)

sepal length (cm) 0.09417054607915214
sepal width (cm) 0.021861200539011753
petal length (cm) 0.44574411647720763
petal width (cm) 0.4382241369046285


# **BOOSTING**

## **1.AdaBoost**

In [None]:
ada_boost_clf=AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),n_estimators=200,learning_rate=0.5,algorithm='SAMME.R')
ada_boost_clf.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


AdaBoostClassifier(algorithm='SAMME.R',
                   base_estimator=DecisionTreeClassifier(ccp_alpha=0.0,
                                                         class_weight=None,
                                                         criterion='gini',
                                                         max_depth=1,
                                                         max_features=None,
                                                         max_leaf_nodes=None,
                                                         min_impurity_decrease=0.0,
                                                         min_impurity_split=None,
                                                         min_samples_leaf=1,
                                                         min_samples_split=2,
                                                         min_weight_fraction_leaf=0.0,
                                                         presort='deprecated',
                          

## **2.Gradient Boosting**

In [None]:
                            #doing manually

X_train,X_test,y_train,y_test=train_test_split(X,y,random_state=42)

tree_reg1=DecisionTreeRegressor(max_depth=2)
tree_reg1.fit(X,y)

#train second classifier on the residual errors made by the first predictor
y2=y-pd.DataFrame(tree_reg1.predict(X))
tree_reg2=DecisionTreeRegressor(max_depth=2)
tree_reg2.fit(X,y2)

#train third classifier on the residual error made by the second predictor
y3=y2-pd.DataFrame(tree_reg2.predict(X))
tree_reg3=DecisionTreeRegressor(max_depth=2)
tree_reg3.fit(X,y3)




DecisionTreeRegressor(ccp_alpha=0.0, criterion='mse', max_depth=2,
                      max_features=None, max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, presort='deprecated',
                      random_state=None, splitter='best')

In [None]:
#making predictions summing all the predictions

y_pred=sum(tree.predict(X_test) for tree in (tree_reg1,tree_reg2,tree_reg3))

print(y_pred)

[ 0.99103926 -0.10155333  2.          0.99103926  0.99103926 -0.00505885
  0.99103926  2.00345395  0.99103926  0.99103926  2.00345395  0.03703704
 -0.10155333  0.03703704 -0.00505885  0.99103926  2.00345395  0.99103926
  0.99103926  2.00345395  0.03703704  1.87670754 -0.00505885  2.00345395
  2.          2.00345395  2.00345395  2.00345395  0.03703704  0.03703704
  0.03703704 -0.10155333  0.99103926  0.03703704  0.03703704  2.00345395
  0.99103926 -0.00505885]


In [None]:
                                  #using sklearn
grbt=GradientBoostingRegressor(max_depth=2,n_estimators=3,learning_rate=1)
grbt.fit(X,y)


  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=3,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
print(grbt.predict(X_test))

[ 0.99103926 -0.10155333  2.          0.99103926  0.99103926 -0.00505885
  0.99103926  2.00345395  0.99103926  0.99103926  2.00345395  0.03703704
 -0.10155333  0.03703704 -0.00505885  0.99103926  2.00345395  0.99103926
  0.99103926  2.00345395  0.03703704  1.87670754 -0.00505885  2.00345395
  2.          2.00345395  2.00345395  2.00345395  0.03703704  0.03703704
  0.03703704 -0.10155333  0.99103926  0.03703704  0.03703704  2.00345395
  0.99103926 -0.00505885]


In [None]:
#to find the optimal number of trees

#1. Early stopping using staged_predict()

grbt=GradientBoostingRegressor(max_depth=2,n_estimators=120)
grbt.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=120,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
errors=[mean_squared_error(y_test,y_pred) for y_pred in grbt.staged_predict(X_test)]
best_n_estimators=np.argmax(errors)+1

grbt_best=GradientBoostingRegressor(max_depth=2,n_estimators=best_n_estimators)
grbt_best.fit(X_train,y_train)

  y = column_or_1d(y, warn=True)


GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=2,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=1,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [None]:
#to actually stopping training
grbt=GradientBoostingRegressor(max_depth=2,warm_start=True)

min_val_error=float('inf')
error_going_up=0
for n_estimators in range(1,120):
  grbt.n_estimators=n_estimators
  grbt.fit(X_train,y_train)
  y_pred=grbt.predict(X_test)
  val_error=mean_squared_error(y_test,y_pred)
  if val_error < min_val_error:
    min_val_error=val_error
    error_going_up=0
  else:
    error_going_up+=1
    if error_going_up==5:
      break


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = colu

In [None]:
#using xgboost(Extreme Gradient Boosting)
xgb_reg=xgboost.XGBRegressor()
xgb_reg.fit(X_train,y_train,eval_set=[(X_test,y_test)],early_stopping_rounds=2)

y_pred=xgb_reg.predict(X_test)


[0]	validation_0-rmse:0.848216
Will train until validation_0-rmse hasn't improved in 2 rounds.
[1]	validation_0-rmse:0.76637
[2]	validation_0-rmse:0.692923
[3]	validation_0-rmse:0.628601
[4]	validation_0-rmse:0.568572
[5]	validation_0-rmse:0.515465
[6]	validation_0-rmse:0.467516
[7]	validation_0-rmse:0.423988
[8]	validation_0-rmse:0.38486
[9]	validation_0-rmse:0.347354
[10]	validation_0-rmse:0.313519
[11]	validation_0-rmse:0.284549
[12]	validation_0-rmse:0.257516
[13]	validation_0-rmse:0.234319
[14]	validation_0-rmse:0.211719
[15]	validation_0-rmse:0.191369
[16]	validation_0-rmse:0.173853
[17]	validation_0-rmse:0.157659
[18]	validation_0-rmse:0.143506
[19]	validation_0-rmse:0.131328
[20]	validation_0-rmse:0.123011
[21]	validation_0-rmse:0.114566
[22]	validation_0-rmse:0.108455
[23]	validation_0-rmse:0.103333
[24]	validation_0-rmse:0.097454
[25]	validation_0-rmse:0.093965
[26]	validation_0-rmse:0.08907
[27]	validation_0-rmse:0.084827
[28]	validation_0-rmse:0.083042
[29]	validation_0-rms

In [None]:
print(y_pred)

[1.0566008  0.00761837 1.8801653  1.0058978  1.1085618  0.00761837
 0.99229443 1.8643548  1.0894842  0.9980558  1.900697   0.01715434
 0.00761837 0.00761837 0.00761837 0.9470267  1.9823997  0.9980558
 1.0477362  1.9917287  0.00761837 1.827092   0.00761837 1.9917287
 1.9763035  1.9823997  1.8801653  1.9763035  0.01715434 0.00761837
 0.00761837 0.00761837 0.96778154 0.00761837 0.00761837 1.8530235
 0.9636893  0.00761837]


In [None]:
print(xgb_reg.evals_result_)

{'validation_0': {'rmse': [0.848216, 0.76637, 0.692923, 0.628601, 0.568572, 0.515465, 0.467516, 0.423988, 0.38486, 0.347354, 0.313519, 0.284549, 0.257516, 0.234319, 0.211719, 0.191369, 0.173853, 0.157659, 0.143506, 0.131328, 0.123011, 0.114566, 0.108455, 0.103333, 0.097454, 0.093965, 0.08907, 0.084827, 0.083042, 0.080006, 0.077219, 0.074193, 0.072466, 0.07109, 0.070957, 0.070007, 0.069179, 0.069454, 0.067092, 0.066669, 0.066411, 0.06456, 0.063012, 0.063769, 0.062601, 0.061182, 0.061261]}}
