In [None]:
!pip install tqdm xlrd

In [None]:
from pathlib import Path
from typing import List, Tuple

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
!pip install sklearn

In [None]:
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from tqdm.notebook import tqdm

In [None]:
# Render matplotlib figures directly below the cell that creates them.
%matplotlib inline

## Data

We'll use a dataset to do with protein expression in mice for multiclass classification made available by the [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php), documented [here](https://archive.ics.uci.edu/ml/datasets/Mice+Protein+Expression).

Dataset: Higuera C, Gardiner KJ, Cios KJ (2015) Self-Organizing Feature Maps Identify Proteins Critical to Learning in a Mouse Model of Down Syndrome. PLoS ONE 10(6).

The task is to predict the class of each observation, based on the associated protein expression levels.

In [None]:
# Download the dataset from S3 if it isn't already in the working directory.
# Pure Python is used instead of `!wget` so the cell also works where `wget`
# is not installed. The file is fetched under a temporary name and renamed
# only once complete, so an interrupted download is never mistaken for the
# finished dataset on the next run.
from urllib.request import urlretrieve

DATA_URL = (
    "https://s3-eu-west-1.amazonaws.com/faculty-client-teaching-materials/"
    "bagging-and-boosting/Data_Cortex_Nuclear.xls"
)
data_path = Path("Data_Cortex_Nuclear.xls")

if not data_path.exists():
    partial_path = data_path.with_name(data_path.name + ".part")
    urlretrieve(DATA_URL, partial_path)
    partial_path.replace(data_path)

In [None]:
# Read the spreadsheet, using its first column as the row index.
df = pd.read_excel(io="Data_Cortex_Nuclear.xls", index_col=0)

In [None]:
# Peek at the first five rows.
df.head(n=5)

In [None]:
# Summary statistics for the numeric columns of the raw frame.
df.describe()

In [None]:
# Rows per class: the counts show no major class imbalance to worry about.
df.loc[:, "class"].value_counts()

In [None]:
# Use a restricted set of features for our analysis. I'm not a domain expert,
# but I think the point is that we only want to use protein expression information
# to determine the class associated to each measurement.
NON_FEATURE_COLUMNS = {"Genotype", "Treatment", "Behavior", "class"}
feats = [c for c in df.columns if c not in NON_FEATURE_COLUMNS]

# Fill missing values with an indicator flag. We're not being careful about this,
# since the purpose of the notebook is to understand algorithms rather than datasets,
# but in a real world context, we would need to make sure that filling in missing
# values in this way did not damage the dataset. This could occur if the distribution
# of missing values was correlated with the class distribution within the dataset, for
# instance.
#
# The split is seeded so that "Restart Kernel -> Run All" always produces the same
# train/test partition and results can be compared between runs.
train, test = train_test_split(
    df.fillna(-1000), test_size=0.2, random_state=42
)

X_train, y_train = train[feats].to_numpy(), train["class"].to_numpy()
X_test, y_test = test[feats].to_numpy(), test["class"].to_numpy()

# 1. Bagging trees

Let's build a classifier consisting of bagged decision trees by hand.
1. Using the function `np.random.choice`, train 100 different `DecisionTreeClassifier` objects on 100 different bootstrap samples of the training set (that is, samples obtained using `np.random.choice(range(len(data)), len(data), replace=True)`).
2. Use each of these classifiers to generate class predictions for data points in the test set. You should end up with 100 sets of `len(X_test)` predictions.
3. Now produce an aggregate/bagged classification for each element of the test set by majority vote from the individual `DecisionTreeClassifier` instances. The `scipy.stats` function `mode` will be helpful here.
4. Use the function `plt.hist` to plot a histogram of the 100 classifier accuracies from your 100 individually trained `DecisionTreeClassifier` instances. How does the accuracy of the bagged classifications produced in step 3 compare to the individual accuracies of its constituent `DecisionTreeClassifier` members?
5. [Optional] Package steps 1-3 into a single class, `BaggedDT`, with `.fit` and `.predict` methods.

In [None]:
# 1
def train_forest(
    X: np.ndarray, y: np.ndarray, n_trees: int = 100, random_state=None
) -> Tuple[List[DecisionTreeClassifier], List[np.ndarray]]:
    """Fit `n_trees` decision trees, each on its own bootstrap resample of `(X, y)`.

    Args:
        X: Feature matrix, indexed by row.
        y: Targets aligned with the rows of `X`.
        n_trees: Number of trees (and bootstrap resamples) to fit.
        random_state: Optional integer seed. When given, both the bootstrap
            draws and the trees are driven by one seeded generator, making
            the forest reproducible. When `None` (the default) the global
            NumPy random state is used, exactly as before.

    Returns:
        `(forest, training_indices)`: the fitted trees, and for each tree the
        array of row indices (drawn with replacement, length `len(X)`) it
        was trained on.
    """
    rng = None if random_state is None else np.random.RandomState(random_state)
    draw = np.random.choice if rng is None else rng.choice
    n_samples = len(X)

    training_indices, forest = [], []
    for _ in range(n_trees):
        # Passing the population size directly avoids materialising
        # `range(len(X))` as an array on every iteration.
        ix = draw(n_samples, n_samples, replace=True)
        training_indices.append(ix)
        forest.append(DecisionTreeClassifier(random_state=rng).fit(X[ix], y[ix]))
    return forest, training_indices

In [None]:
# 2
# Fit the bootstrap ensemble (the bootstrap indices are not needed here) and
# collect one row of test-set predictions per tree.
forest, _ = train_forest(X=X_train, y=y_train)
per_tree_preds = []
for tree in forest:
    per_tree_preds.append(tree.predict(X_test))
preds = np.array(per_tree_preds)

In [None]:
# 3
from scipy.stats import mode  # kept: later cells in this notebook still call `mode`

# Majority vote per test point, computed with NumPy rather than
# `scipy.stats.mode`: SciPy >= 1.11 rejects non-numeric arrays, so `mode`
# fails when the class labels are strings. Encoding the labels as integer
# codes works for string and numeric labels alike. Ties go to the smallest
# label, which matches what `mode` returns.
vote_labels, vote_codes = np.unique(preds, return_inverse=True)
vote_codes = vote_codes.reshape(preds.shape)
vote_counts = np.stack(
    [(vote_codes == k).sum(axis=0) for k in range(len(vote_labels))]
)
majority_votes = vote_labels[vote_counts.argmax(axis=0)]

print(f"majority vote accuracy: {accuracy_score(y_test, majority_votes)}")

In [None]:
# 4
# Fraction of test labels each tree gets right: one accuracy per row of `preds`.
individual_accuracies = (preds == y_test).mean(axis=1)
_ = plt.hist(individual_accuracies, bins="fd")

In [None]:
# 5
class BaggedDT:
    """Hand-rolled bagging ensemble of decision trees with majority-vote prediction."""

    def __init__(self, n_trees: int) -> None:
        # Number of bootstrap-trained trees to fit.
        self.n_trees = n_trees

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit `n_trees` trees on bootstrap resamples of `(X, y)` via `train_forest`."""
        self.clfs, self.training_indices = train_forest(X, y, self.n_trees)

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the majority-vote label for each row of `X`.

        Votes are counted with NumPy rather than `scipy.stats.mode`, which
        rejects non-numeric (e.g. string) labels from SciPy 1.11 onwards.
        Ties go to the smallest label, matching `mode`.
        """
        preds = np.array([clf.predict(X) for clf in self.clfs])
        if preds.size == 0:
            # No trees or no rows: nothing to vote on.
            return preds.reshape(-1)
        labels, codes = np.unique(preds, return_inverse=True)
        # Reshape explicitly: the shape of `return_inverse` for N-d input
        # differs between NumPy versions.
        codes = codes.reshape(preds.shape)
        counts = np.stack(
            [(codes == k).sum(axis=0) for k in range(len(labels))]
        )
        return labels[counts.argmax(axis=0)]

In [None]:
# Fit the hand-rolled ensemble and score its majority vote on the test set.
clf = BaggedDT(n_trees=100)
clf.fit(X=X_train, y=y_train)
accuracy_score(y_true=y_test, y_pred=clf.predict(X=X_test))

`sklearn` will do the above for you, via the class `sklearn.ensemble.BaggingClassifier`- you won't have to bag supervised machine learning models by hand in real life! 

6. Reimplement `1-3` above using `BaggingClassifier`- you should obtain very similar results!

In [None]:
# 6
# `fit` returns the estimator, so construction and fitting can be chained.
bagged_trees = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=100
).fit(X_train, y_train)
preds = bagged_trees.predict(X_test)

In [None]:
# Test accuracy of the sklearn bagged trees.
accuracy_score(y_true=y_test, y_pred=preds)

## 2. Out Of Bag Estimates

* Each tree in our bagged classifier was trained on a bootstrapped sample of the training data.
* Because bootstrap samples are created by sampling with replacement, this means that not every data point will necessarily have been shown to every base estimator during training.
* In fact, in a dataset of size N, the probability that any one data point is not included in the bootstrap sample used to train a given tree is $\left(1 - \frac{1}{N}\right)^N$. So, on average, in a bagged classifier with K base estimators, a given data point will not have been shown to $K\cdot\left(1 - \frac{1}{N}\right)^N$ base estimators during training. Since $\left(1 - \frac{1}{N}\right)^N\approx e^{-1}$, we see that any given datum will not have encountered approximately $\frac{K}{e}\approx 0.37\cdot K$ base estimators during training.
* We can therefore use these $0.37\cdot K$ base estimators to generate an essentially unbiased classification score prediction for our data point. Repeating this process N times, we can produce an essentially unbiased score for each point in the training set. Since we know the target variable for points in the training set, this lets us produce what is known as an **out of bag** (or **OOB**) estimate of our bagged classifier's performance.
* Out of bag estimates are absolutely fantastic. Like cross validation, they let us estimate our model's generalisation accuracy (which helps us ensure that we don't overfit) using just the training data that we have at hand. Unlike cross validation, however, they come for free whenever we train our model.

1. [Optional] Modify the code for the `BaggedDT` classifier that you have already written so that, when `BaggedDT().fit` is called, out of bag classification scores are recorded for each data point in the training set. Use this to estimate the generalisation accuracy of your classifier and compare it to the accuracy you obtain on the test set.
* In order to record out of bag classification scores for the training data, you will need to record which data points are shown to which tree during training. One way to do this would be to use your bootstrapping code to bootstrap indices from `list(range(len(X_train)))`, rather than sampling from `X_train` directly. The indices can then be used both to select data points from `X_train` for training trees and to record which points were selected.

In [None]:
# 1
def train_forest(
    X: np.ndarray, y: np.ndarray, n_trees: int = 100, random_state=None
) -> Tuple[List[DecisionTreeClassifier], List[np.ndarray]]:
    """Fit `n_trees` decision trees, each on its own bootstrap resample of `(X, y)`.

    The bootstrap indices are returned alongside the trees so that callers can
    work out which rows each tree did *not* see (its out-of-bag rows).

    Args:
        X: Feature matrix, indexed by row.
        y: Targets aligned with the rows of `X`.
        n_trees: Number of trees (and bootstrap resamples) to fit.
        random_state: Optional integer seed. When given, both the bootstrap
            draws and the trees are driven by one seeded generator, making
            the forest reproducible. When `None` (the default) the global
            NumPy random state is used, exactly as before.

    Returns:
        `(forest, training_indices)`: the fitted trees, and for each tree the
        array of row indices (drawn with replacement, length `len(X)`) it
        was trained on.
    """
    rng = None if random_state is None else np.random.RandomState(random_state)
    draw = np.random.choice if rng is None else rng.choice
    n_samples = len(X)

    training_indices, forest = [], []
    for _ in range(n_trees):
        # Passing the population size directly avoids materialising
        # `range(len(X))` as an array on every iteration.
        ix = draw(n_samples, n_samples, replace=True)
        training_indices.append(ix)
        forest.append(DecisionTreeClassifier(random_state=rng).fit(X[ix], y[ix]))

    return forest, training_indices


class BaggedDT:
    """Bagged decision trees that also record out-of-bag (OOB) predictions.

    After `fit`, `oob_preds` holds, for every training row, the majority vote
    of the trees whose bootstrap sample did not contain that row.
    """

    def __init__(self, n_trees: int) -> None:
        # Number of bootstrap-trained trees to fit.
        self.n_trees = n_trees

    def fit(self, X: np.ndarray, y: np.ndarray) -> None:
        """Fit the trees via `train_forest` and store OOB predictions for `X`."""
        self.clfs, self.training_indices = train_forest(X, y, self.n_trees)
        self.oob_preds = self.oob_predictions(X)

    @staticmethod
    def _winning_labels(rows, votes, n_samples):
        """Tally one vote `votes[j]` for sample `rows[j]`.

        Returns `(winners, has_vote)`: the most-voted label per sample (ties go
        to the smallest label) and a boolean mask of samples that received at
        least one vote. `votes` must be non-empty.

        Labels are encoded as integer codes so that string labels work;
        `scipy.stats.mode` rejects non-numeric arrays from SciPy 1.11 onwards.
        """
        labels, codes = np.unique(votes, return_inverse=True)
        counts = np.zeros((n_samples, len(labels)), dtype=np.intp)
        # `np.add.at` accumulates correctly when a (row, label) pair repeats.
        np.add.at(counts, (rows, codes.reshape(-1)), 1)
        return labels[counts.argmax(axis=1)], counts.any(axis=1)

    def oob_predictions(self, X: np.ndarray) -> np.ndarray:
        """Majority vote for each row of `X` using only trees that never saw it.

        `X` must be the matrix the trees were fitted on, so that row `i` of `X`
        corresponds to index `i` in `self.training_indices`.

        Each tree predicts all of its out-of-bag rows in a single call, instead
        of one `predict` call per (row, tree) pair.

        If `self.n_trees` is small, some rows may have been included in the
        bootstrap sample of every tree; no OOB prediction is possible for them
        and their entry is `np.nan` (the result is then an object array).
        """
        n_samples = len(X)
        row_chunks, vote_chunks = [], []
        for clf, ix in zip(self.clfs, self.training_indices):
            unseen = np.ones(n_samples, dtype=bool)
            unseen[ix] = False
            rows = np.flatnonzero(unseen)
            if rows.size:
                row_chunks.append(rows)
                vote_chunks.append(np.asarray(clf.predict(X[rows])))

        if not row_chunks:
            # Not a single out-of-bag vote anywhere.
            return np.full(n_samples, np.nan)

        winners, has_vote = self._winning_labels(
            np.concatenate(row_chunks), np.concatenate(vote_chunks), n_samples
        )
        if has_vote.all():
            return winners
        oob_preds = winners.astype(object)
        oob_preds[~has_vote] = np.nan
        return oob_preds

    def predict(self, X: np.ndarray) -> np.ndarray:
        """Return the majority-vote label of all trees for each row of `X`."""
        preds = np.array([clf.predict(X) for clf in self.clfs])
        if preds.size == 0:
            # No trees or no rows: nothing to vote on.
            return preds.reshape(-1)
        n_trees, n_samples = preds.shape
        # `preds` is flattened tree by tree, so the sample index simply cycles.
        rows = np.tile(np.arange(n_samples), n_trees)
        winners, _ = self._winning_labels(rows, preds.reshape(-1), n_samples)
        return winners

In [None]:
# 100 bootstrap-trained trees.
bdt = BaggedDT(n_trees=100)

In [None]:
# Fitting also computes `bdt.oob_preds` for the training rows.
bdt.fit(X=X_train, y=y_train)

In [None]:
# Compare the out-of-bag estimate (training data only) with the held-out
# test accuracy.
oob_accuracy = accuracy_score(y_train, bdt.oob_preds)
test_accuracy = accuracy_score(y_test, bdt.predict(X_test))
print(
    f"Out of Bag accuracy estimate: {oob_accuracy:.3f}, "
    f"test accuracy estimate: {test_accuracy:.3f}"
)

Again, `sklearn` will do this for you, so you will not need to implement this by hand in the real world: simply set `oob_score=True` and then access `.oob_score_` after having fit your model!

2. Use out of bag estimates to choose how many trees to use in `BaggingClassifier(DecisionTreeClassifier())`- do this by plotting a graph of a range of `n_estimators` values against `clf.oob_score_` for a classifier trained using that many estimators. You should find that adding more trees will never really make performance worse, but you will encounter diminishing returns that make it not worthwhile eventually. 
3. Use `plt.plot` to plot a graph comparing the out of bag performance estimates that you obtain this way as well as the true generalisation performance of your classifiers on the test set `(X_test, y_test)` as a function of the number of trees.
4. How long does it take to estimate the generalisation performance of `BaggingClassifier(DecisionTreeClassifier())` using 5-fold cross validation? How does the result compare to the OOB estimate? You will want to use the function `sklearn.model_selection.cross_val_score` to implement 5-fold cross validation. Use the jupyter cell magic command `%%time` to estimate how long these computations take.

In [None]:
# 2 and # 3

n_estimator_candidates = [20, 50, 200, 300, 350, 400]

# One (test accuracy, OOB accuracy) pair per candidate ensemble size. Each
# ensemble is fitted, scored and discarded in turn, so only one is held in
# memory at a time.
scores = []
for n in tqdm(n_estimator_candidates):
    bagger = BaggingClassifier(
        DecisionTreeClassifier(), oob_score=True, n_estimators=n
    ).fit(X_train, y_train)
    # `accuracy_score` takes (y_true, y_pred), as in the rest of the notebook.
    scores.append(
        (accuracy_score(y_test, bagger.predict(X_test)), bagger.oob_score_)
    )

In [None]:
# Each entry of `scores` is (test accuracy, OOB accuracy); split them into two
# series so each line gets its own label rather than relying on line order.
test_acc, oob_acc = zip(*scores)

fig, ax = plt.subplots()
ax.plot(n_estimator_candidates, test_acc, label="test_accuracy")
ax.plot(n_estimator_candidates, oob_acc, label="oob_accuracy")
ax.set_xlabel("n_estimators")
ax.set_ylabel("accuracy")
ax.legend();

In [None]:
# 4
from sklearn.model_selection import cross_val_score

# 200 bagged trees with OOB scoring enabled, used for the timing comparison below.
base_tree = DecisionTreeClassifier()
clf = BaggingClassifier(base_tree, oob_score=True, n_estimators=200)

In [None]:
%%time
scores = cross_val_score(clf, X_train, y_train, cv=5)
scores.mean()

In [None]:
%%time
# A single fit is enough here: `clf` was built with `oob_score=True`, so the
# out-of-bag accuracy is available as `oob_score_` once fitting finishes.
clf.fit(X_train, y_train)
clf.oob_score_

Finally, `BaggingClassifier` objects record the individual out of bag predictions made for each data point in the training set, as well as the overall resulting accuracy. These can be accessed via the (strangely named) `clf.oob_decision_function_` attribute, and can be very useful if you want more granular information than that contained in `clf.oob_score_`.

5. Use `clf.oob_decision_function_` together with `sklearn.metrics.confusion_matrix` to display a confusion matrix computed using out of bag estimates.
* You will need to convert the `(len(X_train), n_classes)` shape array `clf.oob_decision_function_` to a length `len(X_train)` list of predicted class labels.
* Do this by using `np.argmax` to convert `clf.oob_decision_function_` to an array with shape `(len(X_train),)` and then use `clf.classes_` to find out which `argmax` values to map to which class label.

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
np.set_printoptions(precision=2)
# Column with the highest OOB score per training row, mapped back to its label.
oob_columns = clf.oob_decision_function_.argmax(axis=1)
oob_labels = clf.classes_[oob_columns]
confusion_matrix(y_train, oob_labels, normalize="true")

In [None]:
# for comparison
np.set_printoptions(precision=2)
# Same matrix, but from predictions on the held-out test set.
test_labels = clf.predict(X_test)
confusion_matrix(y_test, test_labels, normalize="true")

# 3. Bagging generic estimators

1. Create a new classifier by bagging 50 logistic regression classifiers and compare its performance on the testing set with that of a single logistic regression classifier. 
    * You should find that there is essentially no performance gain - there are only theoretical reasons to believe that bagging will lead to improvements when small changes in the training data can lead to large changes in the structure of the base estimators.
    * In fact, there are some arguments that bagging stable estimators could degrade their performance. A discussion of this as well as a simulation in support of the hypothesis features in the [original paper introducing bagging](https://www.stat.berkeley.edu/~breiman/bagging.pdf), but this isn't a phenomenon that the author of this notebook (or anyone he knows) has encountered in the wild. There isn't a good reason to bag stable classifiers, but I've never seen it drastically hurt anyone.

In [None]:
%%time
# 1
# This takes a while. ~ 90s on a 4 core machine 🙄.
clf = BaggingClassifier(
    LogisticRegression(
        solver="lbfgs", max_iter=3000, multi_class="multinomial", n_jobs=-1
    ),
    n_estimators=50,
    oob_score=True,
).fit(X_train, y_train)
accuracy_score(clf.predict(X_test), y_test)

In [None]:
# Single logistic regression for comparison. `multi_class` (deprecated in
# recent scikit-learn; multinomial is already the lbfgs default for multiclass
# data) and `n_jobs` (no effect on a multinomial fit) are no longer passed.
clf = LogisticRegression(solver="lbfgs", max_iter=3000).fit(X_train, y_train)
accuracy_score(y_test, clf.predict(X_test))

# 4. Random Forests as bagging + feature subselection

We've seen so far that bagging decision trees is a very powerful technique. This works fundamentally because decision trees trained on different bootstrapped datasets find different pathways to the truth, and so aggregating their predictions results in a 'wisdom of the crowds' type of benefit. Phrased technically: the base estimator decision trees are _decorrelated_. 
* Leo Breiman's Random Forests take this notion even further by ensuring that the decision trees have access only to randomly selected **partial information** whenever they create new nodes.
* In a Random Forest, the base estimator decision trees are made even more decorrelated than in a bagged decision tree classifier by ensuring that they not only have access to different data sets during training but also different **features** at each stage of training.
* Partially blinding the decision trees at each stage might impair them as individual learners, but the benefit of decorrelating their predictions makes up for this: indeed, [Condorcet's Jury Theorem](https://en.wikipedia.org/wiki/Condorcet%27s_jury_theorem) tells us that, if we were able to make the trees fully independent, then a Random Forest would become arbitrarily accurate as the number of trees becomes larger.
* Leo Breiman's original paper subsamples features at each split point by picking $\mathrm{log}_2(F)$ features at each point, where $F$ is the total number of features available - the field has since found that the right number of features to subsample can be problem specific. `sklearn` makes a variety of options available through the `max_features` option.
* **Note**: the default setting for `max_features` for `RandomForestClassifier` objects in `sklearn` is a sensible default $\mathrm{floor}(\sqrt{F})$. However, the default `max_features` setting for `RandomForestRegressor` objects is $F$, so that no subsampling takes place! This is a mistake/the result of a misunderstanding (as discussed [here](https://stats.stackexchange.com/questions/324370/references-on-number-of-features-to-use-in-random-forest-regression)) - if you don't set this argument manually then you'll simply be bagging decision tree regressors as opposed to training a Random Forest.

1. Train a `RandomForestClassifier` on our training data. If you like, use OOB estimates (as before, you'll need to set `oob_score=True`) to choose the smallest value of `max_features` which doesn't result in degraded performance.
2. Use the `%%timeit` Jupyter cell magic command to time how long it takes a RandomForest with 100 trees to train. Compare this to how long it takes a bagged decision tree classifier with the same number of trees to train. What do you notice?
    * You should notice that training a RandomForest is faster: this will always be the case, since there are fewer features to consider at each node when a new split is made. This can result in substantial speed improvements when a lot of features are present.

In [None]:
# 1
# Fractions 0.1, 0.2, ..., 1.0 of the features to consider at each split.
feat_fractions = [tenths / 10 for tenths in range(1, 11)]

# Fit one 200-tree forest per fraction, keeping only its OOB accuracy so that
# a single forest is in memory at any time.
oob_scores = []
for fraction in feat_fractions:
    forest_clf = RandomForestClassifier(
        n_estimators=200, max_features=fraction, oob_score=True
    )
    forest_clf.fit(X_train, y_train)
    oob_scores.append(forest_clf.oob_score_)

In [None]:
# OOB accuracy is already high when only 10% of the available features are
# considered at each split. With 77 features in our data, that is close to
# the sklearn default of sqrt(N_features), so we'll stick with the default.
p = plt.plot(feat_fractions, oob_scores)
plt.ylabel("oob_accuracy")
plt.xlabel("max_features")

In [None]:
%%timeit
# 2
# Time fitting a 100-tree random forest with OOB scoring; the next cell times
# a bagged decision tree ensemble of the same size for comparison.
RandomForestClassifier(n_estimators=100, oob_score=True).fit(X_train, y_train)

In [None]:
%%timeit
BaggingClassifier(
    base_estimator=DecisionTreeClassifier(), n_estimators=100, oob_score=True
).fit(X_train, y_train)

## Random Forest failure modes

Random Forests are incredibly robust- they inherit all of the advantages of decision trees (handle mixed data types, correlated features, features on very different scales, fast to train) whilst also being stable, highly performant, needing very little parameter tuning (and even then, OOB examples make this easy!), whilst also providing an estimate of their generalisation accuracy for free. If you had to pick an algorithm to use in order to achieve good performance on a supervised learning task without knowing what the dataset would be beforehand and under time pressure, a Random Forest will almost certainly be the best option. No algorithm is perfect, however, and there are some situations where Random Forests will not perform excellently:
* Very sparse or redundant high dimensional data: if a dataset has a huge number of features and very many of them are useless, the fact that Random Forest decision trees subsample features at each node will result in very weak base estimators and a low performing forest. This is relatively easy to diagnose, however: if you obtain poor OOB performance on a training set with very many features, set `max_features=None` to reduce your Random Forest to a standard bagged decision tree estimator and see if performance improves. If this is not computationally feasible, perform dimension reduction (try Principal Component Analysis as a cheap first option) in advance.
* Very complicated, unstructured, high dimensional datasets like images and audio: in situations like this, feature engineering is essential and so tree based methods (which don't do any feature engineering) are poorly suited to this kind of situation. Use a neural network instead.

1. Add `10_000` redundant features to each point in your dataset using `np.random.normal(size=len(df) * 10_000).reshape(len(df), 10_000)`, before using this data to train a Random Forest in order to explore how Random Forests perform in this situation.
* Examine performance first with the default value for `max_features` and then with `max_features=None`. Be aware that it might take a long time to fit a forest of any size when the latter option is used!
* You can use `np.hstack` to concatenate your redundant features onto your training and test sets.

In [None]:
# 1
useless_feats = 10_000

# One row of pure Gaussian noise per row of `df`; the first `len(X_train)`
# rows are attached to the training set and the remainder to the test set.
noise = np.random.normal(size=(len(df), useless_feats))

n_train = len(X_train)
X_train_noisey = np.hstack((X_train, noise[:n_train]))
X_test_noisey = np.hstack((X_test, noise[n_train:]))

In [None]:
%%time
clf = RandomForestClassifier(n_estimators=200, oob_score=True, n_jobs=-1)
clf.fit(X_train_noisey, y_train)

# The OOB estimate is way too low here because it's effectively being made
# with a forest of size 200*0.38 = 76 trees, which is just too few to achieve
# good performance when there are this many usless features.
print(
    f"Out of bag accuracy estimate: {clf.oob_score_},"
    f"test accuracy estimate: {accuracy_score(clf.predict(X_test_noisey), y_test)}"
)

In [None]:
%%time
# This will take a long time. Approx ~13min on a 4 core machine.
# Note that we still get worse performance than without having added any junk
# features. This is because we've added so many junk features that it's highly
# likely that random patterns exist within the new features that genuinely
# help to discriminate between classes in the training set, but which do not
# generalise to the test set. We've made the signal in our data weaker by adding
# noise, and would need to find more data to compensate.
clf = RandomForestClassifier(
    n_estimators=200, oob_score=True, n_jobs=-1, max_features=None
)
clf.fit(X_train_noisey, y_train)
print(
    f"Out of bag accuracy estimate: {clf.oob_score_},"
    f"test accuracy estimate: {accuracy_score(clf.predict(X_test_noisey), y_test)}"
)