<a href="https://colab.research.google.com/github/Aftabgazali/Combining-Different-Models-for-Ensemble-Learning.ipynb/blob/main/Combining_Different_Models_for_Ensemble_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the 'iris' dataset through seaborn's dataset loader.
iris_data = sns.load_dataset('iris')

# Preview the first rows to check the columns before slicing by position below.
iris_data.head()

# Preprocessing & Model Building

In [None]:
# Keep only the first 100 rows. Features are the columns at positions 1 and 2;
# the labels come from the last column (left as a pandas Series here).
X = iris_data.iloc[:100, [1, 2]].values
y = iris_data.iloc[:100, -1]

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Encode the label values as integer class ids.
encoder = LabelEncoder()

# `y` is overwritten with the encoded array returned by the encoder.
y = encoder.fit_transform(y)

# Show the distinct encoded class ids.
print(f"Class labels {np.unique(y)}")

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing; `stratify=y` keeps the class
# proportions the same in both parts.
# A fixed `random_state` makes the split -- and every score computed from it
# further down -- identical on each "Restart & Run All".
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Inspect the training labels produced by the split.
y_train

# Building Simple Models

***Note: ROC AUC is a valuable metric for evaluating binary classifiers, but it may not give a complete picture in multi-class classification. In that case, consider micro-averaged or macro-averaged ROC AUC to aggregate performance across the classes. Call `get_scorer_names()` to see the list of scoring names you can use.***

In [None]:
from sklearn.metrics import get_scorer_names
# Display the scorer names that can be passed as `scoring=...`.
get_scorer_names()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline

# Three deliberately simple base learners that are later combined into an ensemble.
#
# `multi_class='ovr'` is intentionally not passed: the parameter is deprecated
# in scikit-learn >= 1.5, and for a binary target (required by the 'roc_auc'
# scorer used below) the default fits the same model.
model_v0 = LogisticRegression(penalty='l2', C=0.001, solver='lbfgs')
# `random_state` pins the tree's internal randomness (e.g. tie-breaking between
# equally good splits) so that repeated runs give the same stump.
model_v1 = DecisionTreeClassifier(max_depth=1, criterion='entropy', random_state=42)
model_v2 = KNeighborsClassifier(n_neighbors=1, p=2, metric='minkowski')

# Logistic regression and KNN are fed standardized features; the decision tree
# is used directly on the raw features.
pipeline_1 = make_pipeline(StandardScaler(), model_v0)
pipeline_2 = make_pipeline(StandardScaler(), model_v2)

model_labels = ['Logistic Regression', 'Decision Tree', 'KNN']

# 10-fold cross-validated ROC AUC on the training split, one line per model.
for model, label in zip([pipeline_1, model_v1, pipeline_2], model_labels):
  scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10, scoring='roc_auc')
  print(f"ROC AUC: {scores.mean():.2f} | for Model: {label}")


# Building Custom Ensemble Model

***Note:*** *`'hard'` voting predicts the class chosen by the majority of the classifiers; `'soft'` voting averages the classifiers' predicted class probabilities and predicts the class with the highest mean.*

In [None]:
from sklearn.ensemble import VotingClassifier

# Soft-voting ensemble over the three base learners.
# NOTE: the estimator names are kept exactly as they are because the
# grid-search keys further down (e.g. 'pipeline-2__max_depth') refer to them,
# even though 'pipeline-2' is the bare decision tree rather than a pipeline.
ensemble_classifier = VotingClassifier(estimators=[
    ('pipeline-1', pipeline_1),
    ('pipeline-2', model_v1),
    ('pipeline-3', pipeline_2)
], voting='soft')

# Guarded append: re-running this cell must not add a second 'Ensemble' label.
if 'Ensemble' not in model_labels:
  model_labels.append('Ensemble')

# Score with the same metric as the individual models ('roc_auc') so the
# printed "ROC AUC" label is accurate and the numbers are directly comparable.
for model, label in zip([pipeline_1, model_v1, pipeline_2, ensemble_classifier], model_labels):
  scores = cross_val_score(estimator=model, X=X_train, y=y_train, cv=10, scoring='roc_auc')
  print(f"ROC AUC: {scores.mean():.2f} | for Model: {label}")



# Evaluating and tuning the ensemble classifier

*Plotting the ROC curves (and their AUC) shows how each model trades off the true positive rate against the false positive rate.*

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn.base import clone
# Per-classifier plot styling, matched by position with `all_classifiers`
# and `model_labels`.
colors = ['green', 'yellow', 'red', 'blue']
line_styles = [':', '--', '-.', '-']

# put all classifiers into one list
all_classifiers = [pipeline_1, model_v1, pipeline_2, ensemble_classifier]

for model, label, color, line_style in zip(all_classifiers, model_labels, colors, line_styles):
  # Fit an unfitted copy so the estimators defined above are left untouched.
  cloned_model = clone(model).fit(X_train, y_train)
  # The second probability column is used as the score passed to roc_curve.
  y_pred = cloned_model.predict_proba(X_test)[:, 1]
  fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=y_pred)
  roc_auc = auc(x=fpr, y=tpr)
  curve_label = f"{label} (auc = {roc_auc:.2f})"
  plt.plot(fpr, tpr, color=color, linestyle=line_style, label=curve_label)

# Legend lists the labelled model curves only.
plt.legend(loc='lower right')
# Unlabelled gray diagonal as a visual reference.
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', linewidth=2)

# Slightly padded axes so curves hugging the borders stay visible.
plt.xlim([-0.1, 1.1])
plt.ylim([-0.1, 1.1])
plt.grid(alpha=0.5)
plt.xlabel('False positive rate (FPR)')
plt.ylabel('True positive rate (TPR)')
plt.show()

# Hyper-parameter tuning

In [None]:
# List the ensemble's parameters; the nested names shown here are what the
# grid-search keys (e.g. 'pipeline-2__max_depth') are built from.
ensemble_classifier.get_params()

In [None]:
from sklearn.model_selection import GridSearchCV

# Grid over the decision tree's depth ('pipeline-2' is the tree inside the
# ensemble) and the logistic regression's C ('pipeline-1' is its pipeline).
params = {
    'pipeline-2__max_depth': list(range(1, 9)),
    'pipeline-1__logisticregression__C': [0.0001, 0.1, 1, 100.0],
}

# Exhaustive search with 10-fold cross-validation, ranked by ROC AUC.
grid = GridSearchCV(
    estimator=ensemble_classifier,
    param_grid=params,
    cv=10,
    scoring='roc_auc',
)
grid.fit(X_train, y_train)

In [None]:
# Parameter combination selected by the grid search.
print(f"Best params: {grid.best_params_}")

In [None]:
# Best cross-validated score of the search (the search was configured with
# scoring='roc_auc'), shown on a 0-100 scale.
print(f"Best score: {grid.best_score_*100:.2f}%")

# Bagging in Action

## Import Wine dataset

In [None]:
from sklearn.datasets import load_wine

# Load the wine dataset object from scikit-learn.
wine_data = load_wine()

# One row per sample, feature columns first; the class labels are appended as
# the final 'target' column so they can be sliced off by position later.
df = pd.DataFrame(wine_data.data, columns=wine_data.feature_names)
df['target'] = wine_data.target

df.head()

In [None]:
# Show the distinct values of the 'target' column.
print(f"Class labels: {np.unique(df['target'])}")

*Let's drop one class, because ROC AUC does not directly apply to a multi-class classification problem*

In [None]:
# Keep every row whose target is not 2, leaving the other classes only.
keep_rows = df['target'] != 2
new_df = df[keep_rows]

new_df.head()

In [None]:
# Confirm which target values remain after filtering.
print(f"Class labels: {np.unique(new_df['target'])}")

In [None]:
# Features: every column except the last; labels: the last column.
# Both are converted to NumPy arrays.
X = new_df.iloc[:, :-1].values
y = new_df.iloc[:, -1].values

*Split the data into 80% training and 20% test sets*

In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 split of the filtered wine data.
# A fixed `random_state` keeps the split, and therefore every accuracy
# reported in the bagging/boosting sections, reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=42
)

In [None]:
# Number of training samples after the split.
len(X_train)

In [None]:
# Count of training samples for each integer class label.
np.bincount(y_train)

*We will use an unpruned decision tree as the base classifier and create an
ensemble of 500 decision trees fit on different bootstrap samples of the training dataset*

In [None]:
from sklearn.ensemble import BaggingClassifier

# Entropy-based decision tree with no depth limit; it serves both as a
# standalone baseline and as the base learner for bagging.
# `random_state` makes the fitted tree reproducible.
tree = DecisionTreeClassifier(criterion='entropy', random_state=42)

# 500 copies of the tree, each trained on a bootstrap sample of the rows
# (bootstrap=True); features are not bootstrapped (bootstrap_features=False).
# `random_state` fixes the bootstrap draws so the ensemble is reproducible.
bag_model = BaggingClassifier(
    estimator=tree,
    n_estimators=500,
    bootstrap=True,
    bootstrap_features=False,
    random_state=42,
)

In [None]:
from sklearn.metrics import accuracy_score
# Train the single tree and the bagged ensemble on the same training split.
tree = tree.fit(X_train, y_train)
bag_model = bag_model.fit(X_train, y_train)

# Predict the held-out test samples with each model.
y_pred_tree = tree.predict(X_test)
y_pred_bag = bag_model.predict(X_test)

# Report the test accuracy of both models as percentages.
for model_name, test_pred in (("Tree", y_pred_tree), ("Bagging", y_pred_bag)):
  print(f"{model_name} Model Accuracy: {accuracy_score(y_test, test_pred)*100:.2f}")

# ADA Boosting in Action
**AdaBoost trains decision tree stumps based on the errors of the previous decision tree stump. In particular, the errors are used to compute sample weights in each round, as well as a classifier weight for each decision tree stump when combining the individual stumps into an ensemble. We stop training once a maximum number of iterations (decision tree stumps) is reached.**

In [None]:
from sklearn.ensemble import AdaBoostClassifier
# Create a depth-1 decision tree (a stump) as the weak learner for AdaBoost.
# `random_state` is fixed on both estimators so the fitted models and the
# accuracies derived from them are reproducible.
tree = DecisionTreeClassifier(criterion='entropy', max_depth=1, random_state=42)
ada = AdaBoostClassifier(estimator=tree, n_estimators=500, learning_rate=0.01, random_state=42)

# Baseline: the single stump, evaluated on the test and training splits.
y_pred_tree = tree.fit(X_train, y_train).predict(X_test)
y_pred_train_tree = tree.predict(X_train)

# Boosted ensemble of stumps, evaluated on the same splits.
y_pred_boost = ada.fit(X_train, y_train).predict(X_test)
y_pred_train_boost = ada.predict(X_train)

*You can see that the decision tree stump underfits, unlike the unpruned decision tree used in bagging*

**AdaBoost works well on the training examples; however, you can also see that we introduced additional variance with our attempt to reduce the model bias — a greater gap between training and test performance**

In [None]:
# Report all four accuracies on the same scale (percent, two decimals) so the
# training and testing numbers can be compared at a glance.
print(f"Tree Stump Training Accuracy: {accuracy_score(y_train, y_pred_train_tree)*100:.2f}  | Testing Accuracy: {accuracy_score(y_test, y_pred_tree)*100:.2f}")
print(f"Ada Boost Training Accuracy: {accuracy_score(y_train, y_pred_train_boost)*100:.2f} | Testing Accuracy: {accuracy_score(y_test, y_pred_boost)*100:.2f}")

# Gradient Boost in Action

**Like AdaBoost,
gradient boosting fits decision trees in an iterative fashion using prediction errors. However, gradient
boosting trees are usually deeper than decision tree stumps and have typically a maximum depth of
3 to 6 (or a maximum number of 8 to 64 leaf nodes). Also, in contrast to AdaBoost, gradient boosting
does not use the prediction errors for assigning sample weights; they are used directly to form the
target variable for fitting the next tree. Moreover, instead of having an individual weighting term for
each tree, like in AdaBoost, gradient boosting uses a global learning rate that is the same for each tree.**

In [None]:
import xgboost as xgb
# Gradient-boosted trees: 200 boosting rounds, trees up to depth 4, and a
# learning rate of 0.1.
xgboost = xgb.XGBClassifier(n_estimators=200, learning_rate=0.1, max_depth=4)

xgboost = xgboost.fit(X_train, y_train)
y_train_pred = xgboost.predict(X_train)
y_test_pred = xgboost.predict(X_test)

# Print both accuracies as percentages with two decimals; the training value
# was previously printed as a raw fraction next to a percentage.
print(f"XGBoost Training Accuracy: {accuracy_score(y_train, y_train_pred)*100:.2f}  | Testing Accuracy: {accuracy_score(y_test, y_test_pred)*100:.2f}")