## Home assignment 05: Bagging and OOB score

Please fill in the missing lines in the code below.
This is a simplified version of `BaggingRegressor` from `sklearn`. Please note that the `sklearn` API is **not preserved**.

Your algorithm should be able to train different instances of the same model class on bootstrapped datasets and to provide the [OOB score](https://en.wikipedia.org/wiki/Out-of-bag_error) for the training set.

The model should be passed as a model class, with no explicit parameters and no parentheses.

Example:
```
import numpy as np
from sklearn.linear_model import LinearRegression

bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
bagging_regressor.fit(LinearRegression, X, y)

```

In [None]:
import numpy as np

In [None]:
class SimplifiedBaggingRegressor:
    """Bagging ensemble of regressors with an out-of-bag (OOB) error estimate.

    Every bag is a bootstrap sample (drawn with replacement) of the training
    set, and one fresh model instance is fitted per bag. Predictions are the
    plain average over all fitted models.

    Attributes created by the methods below:
        indices_list: one integer index array per bag (``_generate_splits``).
        models_list: one fitted model per bag (``fit``).
        data, target: the training set, kept only when ``oob`` is true (``fit``).
        list_of_predictions_lists: 1-D object array; element ``i`` is the list
            of predictions for object ``i`` made by models that did not train
            on it.
        oob_predictions: 1-D object array with the averaged OOB prediction per
            object, or ``None`` where an object appeared in every bag.
    """

    def __init__(self, num_bags, oob=False):
        """Remember the number of bags and whether OOB scoring is enabled."""
        self.num_bags = num_bags
        self.oob = oob

    def _generate_splits(self, data: np.ndarray):
        """Generate indices for every bag and store them in ``self.indices_list``.

        Each bag holds ``len(data)`` indices drawn uniformly with replacement
        from ``range(len(data))``.
        """
        data_length = len(data)
        self.indices_list = [
            np.random.choice(data_length, size=data_length, replace=True)
            for _ in range(self.num_bags)
        ]

    def fit(self, model_constructor, data, target):
        """Fit one model per bag.

        The model constructor is passed with no parameters (and with no ``()``).

        example:

        bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
        bagging_regressor.fit(LinearRegression, X, y)

        Args:
            model_constructor: callable returning a new, unfitted model that
                exposes ``fit(X, y)`` and ``predict(X)``.
            data: training objects, indexable along the first axis.
            target: training targets aligned with ``data``.
        """
        # Array-likes (e.g. plain lists) do not support index-array selection.
        data = np.asarray(data)
        target = np.asarray(target)

        # Drop any training set kept by a previous call.
        self.data = None
        self.target = None
        self._generate_splits(data)
        assert len(set(list(map(len, self.indices_list)))) == 1, 'All bags should be of the same length!'
        assert list(map(len, self.indices_list))[0] == len(data), 'All bags should contain `len(data)` number of elements!'

        self.models_list = []
        for indices in self.indices_list:
            model = model_constructor()
            model.fit(data[indices], target[indices])
            # Keep the model object itself: ``fit`` is not required to return it.
            self.models_list.append(model)

        if self.oob:
            # The training set is needed later to compute OOB predictions.
            self.data = data
            self.target = target

    def predict(self, data):
        """Return the prediction for ``data`` averaged over all fitted models.

        Raises:
            ValueError: if ``fit`` has not been called yet.
        """
        if not hasattr(self, 'models_list'):
            raise ValueError("Модели еще не обучены. Вызовите метод fit() перед predict().")

        per_model = np.array([model.predict(data) for model in self.models_list])
        # Axis 0 enumerates the models, so this averages across the ensemble.
        return np.mean(per_model, axis=0)

    def _require_oob_data(self):
        """Raise ``ValueError`` unless a training set was stored by ``fit``."""
        if not hasattr(self, 'models_list'):
            raise ValueError('OOB predictions are unavailable: call fit() first.')
        if getattr(self, 'data', None) is None or getattr(self, 'target', None) is None:
            raise ValueError(
                'OOB predictions are unavailable: create the regressor with '
                'oob=True so that fit() keeps the training set.'
            )

    def _get_oob_predictions_from_every_model(self):
        """Collect, for every training object, predictions of models that never saw it.

        Stores a 1-D object array in ``self.list_of_predictions_lists`` where
        element ``i`` is the list of predictions for ``self.data[i]`` from all
        models whose bag does not contain index ``i`` (ordered by bag).

        Raises:
            ValueError: if the model is not fitted or was created with ``oob=False``.
        """
        self._require_oob_data()
        num_objects = len(self.data)
        per_object = [[] for _ in range(num_objects)]

        for model, indices in zip(self.models_list, self.indices_list):
            # Boolean mask of objects absent from this bag.
            unseen = np.ones(num_objects, dtype=bool)
            unseen[indices] = False
            oob_indices = np.flatnonzero(unseen)
            if oob_indices.size == 0:
                continue
            # One batched call per model instead of one call per object.
            oob_values = model.predict(self.data[oob_indices])
            for object_idx, value in zip(oob_indices, oob_values):
                per_object[object_idx].append(value)

        # Fill element by element so the result stays a 1-D array of lists even
        # when all lists happen to have the same length.
        as_array = np.empty(num_objects, dtype=object)
        for object_idx, values in enumerate(per_object):
            as_array[object_idx] = values

        self.list_of_predictions_lists = as_array
        # Historical side effect of this method; overwritten with the averages
        # by ``_get_averaged_oob_predictions``.
        self.oob_predictions = as_array.copy()

    def _get_averaged_oob_predictions(self):
        """Compute the average OOB prediction for every object of the training set.

        If an object was used in all bags during training, ``None`` is stored
        instead of a prediction. The result is saved to ``self.oob_predictions``.
        """
        self._get_oob_predictions_from_every_model()

        # An empty object array is pre-filled with None.
        averaged = np.empty(len(self.list_of_predictions_lists), dtype=object)
        for object_idx, values in enumerate(self.list_of_predictions_lists):
            if len(values) > 0:
                averaged[object_idx] = np.mean(values)

        self.oob_predictions = averaged

    def OOB_score(self):
        """Compute mean squared error over all objects with at least one OOB prediction.

        Returns:
            The MSE between averaged OOB predictions and targets, or ``None``
            when no object has an OOB prediction.

        Raises:
            ValueError: if the model is not fitted or was created with ``oob=False``.
        """
        self._get_averaged_oob_predictions()

        has_prediction = np.array(
            [pred is not None for pred in self.oob_predictions], dtype=bool
        )
        if not has_prediction.any():
            return None

        predictions = self.oob_predictions[has_prediction].astype(float)
        true_values = np.asarray(self.target)[has_prediction]
        return np.mean((predictions - true_values) ** 2)

### Local tests:

In [None]:
from sklearn.linear_model import LinearRegression
from tqdm.auto import tqdm

#### Simple tests:

In [None]:
for _ in tqdm(range(100)):
    # Noise-free target: the row mean is an exact linear function of the features.
    X = np.random.randn(2000, 10)
    y = X.mean(axis=1)

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=10, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    train_mse = np.mean((predictions - y)**2)
    assert train_mse < 1e-6, 'Linear dependency should be fitted with almost zero error!'
    assert bagging_regressor.oob, 'OOB feature must be turned on'

    oob_score = bagging_regressor.OOB_score()
    assert oob_score < 1e-6, 'OOB error for linear dependency should be also close to zero!'

    # Average share of bags that missed an object, compared against 1/e.
    oob_counts = [len(preds) for preds in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Simple tests done!')

  0%|          | 0/100 [00:00<?, ?it/s]

Simple tests done!


#### Medium tests

In [None]:
for _ in tqdm(range(10)):
    # Pure-noise target with 150 features on 200 objects.
    X = np.random.randn(200, 150)
    y = np.random.randn(len(X))

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=20, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    average_train_error = np.mean((predictions - y)**2)
    assert bagging_regressor.oob, 'OOB feature must be turned on'

    oob_score = bagging_regressor.OOB_score()
    assert oob_score > average_train_error, 'OOB error must be higher than train error due to overfitting!'

    # Average share of bags that missed an object, compared against 1/e.
    oob_counts = [len(preds) for preds in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 0.1, 'Probability of missing a bag should be close to theoretical value!'

print('Medium tests done!')

  0%|          | 0/10 [00:00<?, ?it/s]

Medium tests done!


#### Complex tests:

In [None]:
for _ in tqdm(range(10)):
    # Larger run (2000 objects, 100 bags) with a tighter tolerance on the OOB fraction.
    X = np.random.randn(2000, 15)
    y = np.random.randn(len(X))

    bagging_regressor = SimplifiedBaggingRegressor(num_bags=100, oob=True)
    bagging_regressor.fit(LinearRegression, X, y)

    predictions = bagging_regressor.predict(X)
    oob_score = bagging_regressor.OOB_score()

    # Average share of bags that missed an object, compared against 1/e.
    oob_counts = [len(preds) for preds in bagging_regressor.list_of_predictions_lists]
    oob_fraction = np.mean(oob_counts) / bagging_regressor.num_bags
    assert abs(oob_fraction - 1/np.exp(1)) < 1e-2, 'Probability of missing a bag should be close to theoretical value!'

print('Complex tests done!')

  0%|          | 0/10 [00:00<?, ?it/s]

Complex tests done!


In [None]:
# Deviation of the observed out-of-bag fraction (last fitted regressor) from 1/e.
np.mean([len(preds) for preds in bagging_regressor.list_of_predictions_lists]) / bagging_regressor.num_bags - 1/np.exp(1)

np.float64(-0.0005294411714423242)

Great job! Please save `SimplifiedBaggingRegressor` to `bagging.py` and submit your solution to the grading system!