# Ensemble classifiers

In [None]:
import statistics
import pandas as pd
import numpy as np
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import make_blobs
from classification_utils import *
from sklearn.model_selection import cross_validate
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.ensemble import ExtraTreesClassifier
import statistics
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.decomposition import PCA
import pickle
from sklearn.metrics import RocCurveDisplay

## Load Data

The only preprocessing step done is the one-hot-encoding of the **Lang** feature.

In [None]:
# Read the cleaned user profiles; the first CSV column is used as the index.
data = pd.read_csv("dataset/cleaned_user_profiles.csv", index_col=0)
# One-hot encode "lang" and append the indicator columns to the frame.
lang_dummies = pd.get_dummies(data["lang"])
data = data.join(lang_dummies)
# Split into train/test features and labels with the project helper.
train_set, test_set, train_label, test_label = prepare_data(data)

## Bagging

Bagging methods form a class of algorithms which build several instances of a black-box estimator on random subsets of the original training set and then aggregate their individual predictions to form a final prediction. These methods are used as a way to reduce the variance of a base estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it.

We will run a Grid-Search Cross-Validation over several parameters:
- **n_estimators**: The number of base estimators in the ensemble.
- **max_features**: The number of features to draw from X to train each base estimator.
- **max_samples**: The number of samples to draw from X to train each base estimator.

In [None]:
# Hyper-parameter grid explored for the bagging ensemble.
param_grid = {
    'n_estimators': np.arange(2, 40),
    'max_features': [0.7, 0.8, 0.85, 0.9, 1.0],
    'max_samples': [0.7, 0.8, 0.85, 0.9, 1.0],
}

bagging = BaggingClassifier()

# Exhaustive 3-fold cross-validated search, ranked by accuracy.
grid = GridSearchCV(
    estimator=bagging,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    return_train_score=False,
    verbose=4,
)
grid.fit(train_set, train_label)

In [None]:
# Keep the best bagging configuration found by the grid search.
bagging = grid.best_estimator_
# Trailing bare expression: the notebook displays the selected estimator.
bagging

## Print Metrics

In [None]:
# Predict both splits with the tuned bagging ensemble, then print the
# train/test metrics with the project helper.
train_pred, test_pred = bagging.predict(train_set), bagging.predict(test_set)
print_metrics(train_label, train_pred, test_label, test_pred)

In [None]:
# Report the scores of the bagging test-set predictions against the true labels.
report_scores(test_label, test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the bagging ensemble on the test set.
predictions = bagging.predict(test_set)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the bagging ensemble evaluated on the test set.
RocCurveDisplay.from_estimator(estimator=bagging, X=test_set, y=test_label)
plt.show()

### PCA Blobs

In [None]:
# Project the test features onto two principal components for plotting.
# The PCA is fitted on the test set itself.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(X=test_set.values)

In [None]:
# Test points in PCA space, coloured by their true label.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Test points in PCA space, coloured by the bagging predictions.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_pred, s=25);

## Extremely Randomized Trees

A meta estimator that fits a number of randomized decision trees (a.k.a. extra-trees) on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting.

We will run a Grid-Search Cross-Validation over several parameters:
- **max_features**: The number of features to consider when looking for the best split.
- **min_samples_split**: The minimum number of samples required to split an internal node.
- **min_samples_leaf**: The minimum number of samples required to be at a leaf node.
- **n_estimators**: The number of trees in the forest.
- **ccp_alpha**: Complexity parameter used for Minimal Cost-Complexity Pruning.
- **min_weight_fraction_leaf**: The minimum weighted fraction of the sum total of weights (of all the input samples) required to be at a leaf node.
- **criterion**: The function to measure the quality of a split.

In [None]:
# Hyper-parameter grid explored for the extremely randomized trees.
# NOTE: an integer `min_samples_split` must be >= 2 in scikit-learn; a value
# of 1 makes every candidate using it fail to fit (scored as NaN by the grid
# search), so the range starts at 2.
param_grid = {'max_features': ['sqrt', 'log2', None],
              'min_samples_split': [2, 3, 4, 5],
              'min_samples_leaf': [1, 2, 3, 4],
              'n_estimators': np.arange(50, 200, 10),
              'ccp_alpha': [0, 0.1, 0.05, 0.3, 0.5],
              'min_weight_fraction_leaf': [0, 0.1, 0.05, 0.3, 0.5],
              'criterion': ['gini', 'entropy', 'log_loss']
             }

ert = ExtraTreesClassifier()

# Exhaustive 3-fold cross-validated search, ranked by accuracy.
grid = GridSearchCV(
    ert,
    param_grid,
    cv=3,
    scoring='accuracy',
    return_train_score=False,
    verbose=4
)

grid.fit(train_set, train_label)

In [None]:
# Keep the best extra-trees configuration found by the grid search.
ert = grid.best_estimator_
# Trailing bare expression: the notebook displays the selected estimator.
ert

## Print Metrics

In [None]:
# Predict both splits with the tuned extra-trees model, then print the
# train/test metrics with the project helper.
train_pred, test_pred = ert.predict(train_set), ert.predict(test_set)
print_metrics(train_label, train_pred, test_label, test_pred)

In [None]:
# Report the scores of the extra-trees test-set predictions against the true labels.
report_scores(test_label, test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the extra-trees model on the test set.
predictions = ert.predict(test_set)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the extra-trees model evaluated on the test set.
RocCurveDisplay.from_estimator(estimator=ert, X=test_set, y=test_label)
plt.show()

### PCA Blobs

In [None]:
# Project the test features onto two principal components for plotting.
# The PCA is fitted on the test set itself.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(X=test_set.values)

In [None]:
# Test points in PCA space, coloured by their true label.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Test points in PCA space, coloured by the extra-trees predictions.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_pred, s=25);

## AdaBoost

A meta-estimator that begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases.

We will run a Grid-Search Cross-Validation over several parameters:
- **n_estimators**: The maximum number of estimators at which boosting is terminated.
- **learning_rate**: Weight applied to each classifier at each boosting iteration.
- **algorithm**: If ‘SAMME.R’ then use the SAMME.R real boosting algorithm. estimator must support calculation of class probabilities. If ‘SAMME’ then use the SAMME discrete boosting algorithm. The SAMME.R algorithm typically converges faster than SAMME, achieving a lower test error with fewer boosting iterations.

In [None]:
# Hyper-parameter grid explored for AdaBoost.
param_grid = {
    'n_estimators': np.arange(5, 100, 5),
    'learning_rate': [0.8, 0.9, 1.0, 1.1, 1.2],
    'algorithm': ['SAMME', 'SAMME.R'],
}

ada = AdaBoostClassifier()

# Exhaustive 3-fold cross-validated search, ranked by accuracy.
grid = GridSearchCV(
    estimator=ada,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    return_train_score=False,
    verbose=4,
)
grid.fit(train_set, train_label)

In [None]:
# Keep the best AdaBoost configuration found by the grid search.
ada = grid.best_estimator_
# Trailing bare expression: the notebook displays the selected estimator.
ada

## Print Metrics

In [None]:
# Predict both splits with the tuned AdaBoost model, then print the
# train/test metrics with the project helper.
train_pred, test_pred = ada.predict(train_set), ada.predict(test_set)
print_metrics(train_label, train_pred, test_label, test_pred)

In [None]:
# Report the scores of the AdaBoost test-set predictions against the true labels.
report_scores(test_label, test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the AdaBoost model on the test set.
predictions = ada.predict(test_set)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the AdaBoost model evaluated on the test set.
RocCurveDisplay.from_estimator(estimator=ada, X=test_set, y=test_label)
plt.show()

### PCA Blobs

In [None]:
# Project the test features onto two principal components for plotting.
# The PCA is fitted on the test set itself.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(X=test_set.values)

In [None]:
# Test points in PCA space, coloured by their true label.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Test points in PCA space, coloured by the AdaBoost predictions.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_pred, s=25);

## Gradient Tree Boosting

This algorithm builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage n_classes_ regression trees are fit on the negative gradient of the loss function.

We will run the model with its default parameters, without a Grid Search, since this model is not mandatory and time is limited.

In [None]:
# Gradient boosting with default hyper-parameters, trained on the training split.
# `fit` returns the estimator itself, so construction and training are chained.
gbc = GradientBoostingClassifier().fit(train_set, train_label)
# Trailing bare expression: the notebook displays the fitted estimator.
gbc

## Print Metrics

In [None]:
# Predict both splits with the gradient boosting model, then print the
# train/test metrics with the project helper.
train_pred, test_pred = gbc.predict(train_set), gbc.predict(test_set)
print_metrics(train_label, train_pred, test_label, test_pred)

In [None]:
# Report the scores of the gradient boosting test-set predictions against the true labels.
report_scores(test_label, test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the gradient boosting model on the test set.
predictions = gbc.predict(test_set)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the gradient boosting model evaluated on the test set.
RocCurveDisplay.from_estimator(estimator=gbc, X=test_set, y=test_label)
plt.show()

### PCA Blobs

In [None]:
# Project the test features onto two principal components for plotting.
# The PCA is fitted on the test set itself.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(X=test_set.values)

In [None]:
# Test points in PCA space, coloured by their true label.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Test points in PCA space, coloured by the gradient boosting predictions.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_pred, s=25);

##  Histogram-Based Gradient Boosting

During training, the tree grower learns at each split point whether samples with missing values should go to the left or right child, based on the potential gain. When predicting, samples with missing values are assigned to the left or right child consequently.

We will run the model with its default parameters, without a Grid Search, since this model is not mandatory and time is limited.

In [None]:
# Histogram-based gradient boosting with default hyper-parameters, trained on
# the training split. `fit` returns the estimator itself, so the calls are chained.
hgbc = HistGradientBoostingClassifier().fit(train_set, train_label)
# Trailing bare expression: the notebook displays the fitted estimator.
hgbc

## Print Metrics

In [None]:
# Predict both splits with the histogram-based gradient boosting model, then
# print the train/test metrics with the project helper.
train_pred, test_pred = hgbc.predict(train_set), hgbc.predict(test_set)
print_metrics(train_label, train_pred, test_label, test_pred)

In [None]:
# Report the scores of the histogram-based boosting test-set predictions against the true labels.
report_scores(test_label, test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the histogram-based gradient boosting model on the test set.
predictions = hgbc.predict(test_set)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the histogram-based gradient boosting model evaluated on the test set.
RocCurveDisplay.from_estimator(estimator=hgbc, X=test_set, y=test_label)
plt.show()

### PCA Blobs

In [None]:
# Project the test features onto two principal components for plotting.
# The PCA is fitted on the test set itself.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(X=test_set.values)

In [None]:
# Test points in PCA space, coloured by their true label.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Test points in PCA space, coloured by the histogram-based boosting predictions.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_pred, s=25);

## Save models

In [None]:
# Persist the bagging ensemble trained with the one-hot encoded "lang" columns.
with open("models_checkpoints/bagging_lang.bin", "wb") as bagging_file:
    pickle.dump(bagging, bagging_file)

# Persist the AdaBoost model trained on the same feature set.
with open("models_checkpoints/adaboost_lang.bin", "wb") as adaboost_file:
    pickle.dump(ada, adaboost_file)

## Removed "lang" attribute for classification

### Load Data

In [None]:
# Reload the cleaned user profiles; the first CSV column is used as the index.
data = pd.read_csv("dataset/cleaned_user_profiles.csv", index_col=0)
# DataFrame.drop returns a new frame rather than modifying in place, so the
# result must be re-bound; otherwise "lang" would silently stay in `data`.
data = data.drop(columns=['lang'])
# Split into train/test features and labels with the project helper.
train_set, test_set, train_label, test_label = prepare_data(data)

## Bagging

In [None]:
# Hyper-parameter grid explored for the bagging ensemble.
param_grid = {
    'n_estimators': np.arange(2, 40),
    'max_features': [0.7, 0.8, 0.85, 0.9, 1.0],
    'max_samples': [0.7, 0.8, 0.85, 0.9, 1.0],
}

bagging = BaggingClassifier()

# Exhaustive 3-fold cross-validated search, ranked by accuracy.
grid = GridSearchCV(
    estimator=bagging,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    return_train_score=False,
    verbose=4,
)
grid.fit(train_set, train_label)

In [None]:
# Keep the best bagging configuration found by the grid search.
bagging = grid.best_estimator_
# Trailing bare expression: the notebook displays the selected estimator.
bagging

## Print Metrics

In [None]:
# Predict both splits with the tuned bagging ensemble, then print the
# train/test metrics with the project helper.
train_pred, test_pred = bagging.predict(train_set), bagging.predict(test_set)
print_metrics(train_label, train_pred, test_label, test_pred)

In [None]:
# Report the scores of the bagging test-set predictions against the true labels.
report_scores(test_label, test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the bagging ensemble on the test set.
predictions = bagging.predict(test_set)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the bagging ensemble evaluated on the test set.
RocCurveDisplay.from_estimator(estimator=bagging, X=test_set, y=test_label)
plt.show()

### PCA Blobs

In [None]:
# Project the test features onto two principal components for plotting.
# The PCA is fitted on the test set itself.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(X=test_set.values)

In [None]:
# Test points in PCA space, coloured by their true label.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Test points in PCA space, coloured by the bagging predictions.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_pred, s=25);

## AdaBoost

In [None]:
# Hyper-parameter grid explored for AdaBoost.
param_grid = {
    'n_estimators': np.arange(5, 100, 5),
    'learning_rate': [0.8, 0.9, 1.0, 1.1, 1.2],
    'algorithm': ['SAMME', 'SAMME.R'],
}

ada = AdaBoostClassifier()

# Exhaustive 3-fold cross-validated search, ranked by accuracy.
grid = GridSearchCV(
    estimator=ada,
    param_grid=param_grid,
    scoring='accuracy',
    cv=3,
    return_train_score=False,
    verbose=4,
)
grid.fit(train_set, train_label)

In [None]:
# Keep the best AdaBoost configuration found by the grid search.
ada = grid.best_estimator_
# Trailing bare expression: the notebook displays the selected estimator.
ada

## Print Metrics

In [None]:
# Predict both splits with the tuned AdaBoost model, then print the
# train/test metrics with the project helper.
train_pred, test_pred = ada.predict(train_set), ada.predict(test_set)
print_metrics(train_label, train_pred, test_label, test_pred)

In [None]:
# Report the scores of the AdaBoost test-set predictions against the true labels.
report_scores(test_label, test_pred)

### Confusion Matrix

In [None]:
# Confusion matrix of the AdaBoost model on the test set.
predictions = ada.predict(test_set)
cm = confusion_matrix(y_true=test_label, y_pred=predictions)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()
plt.show()

### ROC-Curve

In [None]:
# ROC curve of the AdaBoost model evaluated on the test set.
RocCurveDisplay.from_estimator(estimator=ada, X=test_set, y=test_label)
plt.show()

### PCA Blobs

In [None]:
# Project the test features onto two principal components for plotting.
# The PCA is fitted on the test set itself.
pca = PCA(n_components=2)
test_set_reduced = pca.fit_transform(X=test_set.values)

In [None]:
# Test points in PCA space, coloured by their true label.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_label, s=25);

In [None]:
# Test points in PCA space, coloured by the AdaBoost predictions.
plt.scatter(x=test_set_reduced[:, 0], y=test_set_reduced[:, 1], c=test_pred, s=25);

## Save models

In [None]:
# Persist the bagging ensemble from the "no lang" experiment.
with open("models_checkpoints/bagging_no_lang.bin", "wb") as bagging_file:
    pickle.dump(bagging, bagging_file)

# Persist the AdaBoost model from the same experiment.
with open("models_checkpoints/adaboost_no_lang.bin", "wb") as adaboost_file:
    pickle.dump(ada, adaboost_file)