<a href="https://colab.research.google.com/github/ABCDeath/study/blob/main/y_training_2.0/assignment05_bagging_and_oob/assignment_bagging_and_oob.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Home assignment 05: Bagging and OOB score

Please fill in the missing lines in the code below.
This is a simplified version of `BaggingRegressor` from `sklearn`. Please note that the `sklearn` API is **not preserved**.

Your algorithm should be able to train different instances of the same model class on bootstrapped datasets and to provide [OOB score](https://en.wikipedia.org/wiki/Out-of-bag_error) for the training set.

The model should be passed as a model class, with no explicit parameters and no parentheses.

Example:
```
import numpy as np
from sklearn.linear_model import LinearRegression

bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
bagging_regressor.fit(LinearRegression, X, y)

```

In [None]:
import numpy as np

In [None]:
class SimplifiedBaggingRegressor:
    '''
    Simplified bagging ensemble for regression with optional out-of-bag (OOB) scoring.

    Every bag is a bootstrap sample (drawn with replacement) of the training set and has
    the same number of elements as the training set. One fresh model is fitted per bag;
    the ensemble prediction is the plain average of the per-model predictions.

    Parameters:
        num_bags: number of bootstrap samples (and therefore models) to build.
        oob: if True, `fit` keeps references to the training data and target so that
            `OOB_score` can be computed afterwards.
    '''

    def __init__(self, num_bags, oob=False):
        self.num_bags = num_bags
        self.oob = oob

    def _generate_splits(self, data: np.ndarray):
        '''
        Generate indices for every bag and store them in self.indices_list.

        The result is an integer array of shape (num_bags, len(data)); row `b` holds the
        indices of the objects drawn (with replacement) for bag `b`.
        '''
        data_length = len(data)
        # `high` is EXCLUSIVE in Generator.integers: it has to be `data_length`, otherwise
        # the last object could never be drawn and a one-object dataset would raise.
        self.indices_list = np.random.default_rng().integers(
            low=0, high=data_length, size=(self.num_bags, data_length), dtype=int,
        )

    def fit(self, model_constructor, data, target):
        '''
        Fit model on every bag.
        Model constructor with no parameters (and with no ()) is passed to this function.
        `data` and `target` must support NumPy integer-array indexing.

        example:

        bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
        bagging_regressor.fit(LinearRegression, X, y)
        '''
        # Drop references from a previous fit; they are restored below only if OOB is on.
        self.data = None
        self.target = None
        self._generate_splits(data)
        bag_sizes = {len(bag_indices) for bag_indices in self.indices_list}
        assert len(bag_sizes) == 1, 'All bags should be of the same length!'
        assert bag_sizes == {len(data)}, 'All bags should contain `len(data)` number of elements!'

        self.models_list = []
        for bag_indices in self.indices_list:
            model = model_constructor()
            # The value returned by `fit` is what gets stored (sklearn estimators return self).
            self.models_list.append(model.fit(data[bag_indices], target[bag_indices]))

        if self.oob:
            self.data = data
            self.target = target

    def predict(self, data):
        '''
        Get average prediction for every object from passed dataset.

        The output is shaped by the predictions themselves, so this works for inputs of
        any length, does not require `oob=True`, and never casts predictions to the
        dtype of the training data.
        '''
        per_model_predictions = [model.predict(data) for model in self.models_list]
        return np.mean(per_model_predictions, axis=0)

    def _get_oob_predictions_from_every_model(self):
        '''
        Generates list of lists, where list i contains predictions for self.data[i] object
        from all models, which have not seen this object during training phase.

        The result is stored in self.list_of_predictions_lists as a 1-D object array of
        Python lists (one list per training object).

        Raises:
            RuntimeError: if the training data was not stored (`oob=False` or no `fit` yet).
        '''
        train_data = getattr(self, 'data', None)
        if train_data is None or getattr(self, 'target', None) is None:
            raise RuntimeError(
                'OOB predictions are unavailable: create the regressor with `oob=True` '
                'and call `fit` first.'
            )

        num_objects = len(train_data)
        per_object_predictions = [[] for _ in range(num_objects)]

        for bag_indices, model in zip(self.indices_list, self.models_list):
            # Mark every object drawn into this bag; whatever stays True is out-of-bag.
            is_oob = np.ones(num_objects, dtype=bool)
            is_oob[bag_indices] = False
            oob_ids = np.flatnonzero(is_oob)
            if oob_ids.size == 0:
                # The bag covered the whole dataset: nothing to predict for this model.
                continue
            for object_id, prediction in zip(oob_ids.tolist(), model.predict(train_data[oob_ids])):
                per_object_predictions[object_id].append(prediction)

        # Fill element by element so the result is always a 1-D array of lists, even when
        # all lists happen to have the same length (np.array(...) would build a 2-D array).
        self.list_of_predictions_lists = np.empty(num_objects, dtype=object)
        for object_id, object_predictions in enumerate(per_object_predictions):
            self.list_of_predictions_lists[object_id] = object_predictions

    def _get_averaged_oob_predictions(self):
        '''
        Compute average OOB prediction for every object from training set.
        If object has been used in all bags on training phase, NaN is stored instead of
        a prediction. The result is stored in self.oob_predictions.
        '''
        self._get_oob_predictions_from_every_model()
        self.oob_predictions = np.array(
            [
                np.mean(object_predictions) if len(object_predictions) else np.nan
                for object_predictions in self.list_of_predictions_lists
            ],
            dtype=np.float64,
        )

    def OOB_score(self):
        '''
        Compute mean square error for all objects, which have at least one prediction.

        Returns NaN when no object has an OOB prediction.
        '''
        self._get_averaged_oob_predictions()
        has_prediction = ~np.isnan(self.oob_predictions)
        if not has_prediction.any():
            return np.nan
        residuals = self.target[has_prediction] - self.oob_predictions[has_prediction]
        return np.mean(np.square(residuals))

### Local tests:

In [None]:
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm

#### Simple tests:

In [None]:
# Simple tests: the target is an exact linear function of the features, so both the
# training error and the OOB error of a linear model must be numerically zero.
for _ in tqdm(range(100)):
    X = np.random.randn(2000, 10)
    y = X.mean(axis=1)

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    train_mse = np.mean(np.square(predictions - y))
    assert train_mse < 1e-6, 'Linear dependency should be fitted with almost zero error!'
    assert bagging_regressor.oob, 'OOB feature must be turned on'

    oob_score = bagging_regressor.OOB_score()
    print(oob_score)
    assert oob_score < 1e-6, 'OOB error for linear dependency should be also close to zero!'

    # Average share of models for which an object was out-of-bag; expected to be ~1/e.
    oob_counts = [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
    oob_share = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_share - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Simple tests done!')

  0%|          | 0/100 [00:00<?, ?it/s]

3.4507745074746925e-32
5.147171180264844e-32
6.301091494672124e-32
8.393864317453715e-32
7.716055323710112e-32
6.170940732616233e-32
4.192920558985487e-32
1.2381276672320687e-31
7.127172540499857e-32
5.180307224217972e-32
1.2109931422031605e-31
7.669290168941904e-32
3.3713247787680627e-31
4.1356104791221945e-32
1.2638647183507749e-31
4.982939035510674e-32
5.113165342546647e-32
5.632939121190884e-32
4.7459366719792655e-32
3.661240128602008e-31
1.2467099006446537e-31
1.748297888027149e-31
4.801476000578712e-32
4.67150405300507e-32
4.7467068955474775e-32
8.26870611121933e-32
3.5939406209253154e-32
7.072757980160087e-32
8.369042517524301e-32
7.779177701334332e-32
4.042978028243481e-32
7.049898222396165e-32
1.0247827048603034e-31
5.202396302119953e-32
3.643894340538111e-32
4.2783442560313734e-32
2.0682673590907027e-31
7.127908425470044e-32
5.8100944895611e-32
5.089924182864851e-32
4.071274275819612e-32
1.7842472430444307e-31
1.8225538004765704e-31
8.377314949051941e-32
1.1084207726297019e-3

#### Medium tests

In [None]:
# Medium tests: the target is pure noise and there are almost as many features as
# objects, so the ensemble overfits and the OOB error must exceed the training error.
for _ in tqdm(range(10)):
    X = np.random.randn(200, 150)
    y = np.random.randn(len(X))

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=20, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    average_train_error = np.mean(np.square(predictions - y))
    assert bagging_regressor.oob, 'OOB feature must be turned on'

    oob_score = bagging_regressor.OOB_score()
    assert oob_score > average_train_error, 'OOB error must be higher than train error due to overfitting!'

    # Average share of models for which an object was out-of-bag; expected to be ~1/e.
    oob_counts = [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
    oob_share = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_share - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Medium tests done!')

  0%|          | 0/10 [00:00<?, ?it/s]

Medium tests done!


#### Complex tests:

In [None]:
# Complex tests: with many bags the observed out-of-bag share must match the
# theoretical 1/e much more tightly (tolerance 1e-2).
for _ in tqdm(range(10)):
    X = np.random.randn(2000, 15)
    y = np.random.randn(len(X))

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=100, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    # Both calls are kept purely to exercise the API; only the OOB bookkeeping is checked.
    predictions = bagging_regressor.predict(X)
    oob_score = bagging_regressor.OOB_score()

    oob_counts = [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
    oob_share = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_share - 1/np.exp(1)) < 1e-2, 'Probability of missing a bag should be close to theoretical value!'

print('Complex tests done!')

  0%|          | 0/10 [00:00<?, ?it/s]

Complex tests done!


In [None]:
# Deviation of the observed out-of-bag share (for the last fitted regressor) from the
# theoretical value 1/e; left as the final expression so the notebook displays it.
np.mean(
    [len(object_predictions) for object_predictions in bagging_regressor.list_of_predictions_lists]
) / bagging_regressor.num_bags - 1/np.exp(1)

np.float64(-0.0013644411714423543)

Great job! Please save `SimplifiedBaggingRegressor` to `bagging.py` and submit your solution to the grading system!