# BAGGING

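Bagging (bootstrap aggregating) improves the performance of a basic algorithm by training several copies of it on resampled versions of the training data and combining their predictions, which makes the final model more robust.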

We will look at Linear Regression and Decision Tree Regressor models as our base models for Bagging.

We will use a housing price dataset (regression problem) with 13 features and 506 data samples.

In [32]:
# imports

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split

In [33]:
# Deterministic pseudo-random integer generator (simple linear congruential scheme), used to pick row indices

def random_generator(seed = 0, low = 0, high = None, size = None):
    """Generate deterministic pseudo-random integers in ``[low, high)``.

    The values come from a linear congruential recurrence
    ``state -> (11 * state + 13) % (high - low)`` started at ``seed``; each
    emitted value is ``low + state``. With the default ``low = 0`` the output
    is exactly ``(11 * s + 13) % high`` applied repeatedly.

    Parameters
    ----------
    seed : int
        Starting state of the recurrence.
    low : int
        Inclusive lower bound of the generated values.
    high : int
        Exclusive upper bound of the generated values (required).
    size : int
        Number of values to generate (required).

    Returns
    -------
    int or list of int
        A single integer when ``size == 1`` (kept for backward
        compatibility), otherwise a list with ``size`` integers
        (empty when ``size <= 0``).

    Raises
    ------
    ValueError
        If ``high`` or ``size`` is missing, or if ``high <= low``.

    Notes
    -----
    The sequence is periodic and the period can be much shorter than the
    range (it depends on the modulus), so the output may contain many
    repeated values.
    """
    # Fail loudly instead of returning an error string that a caller would
    # otherwise try to use as a list of indices.
    if high is None:
        raise ValueError("Error. Upper Limit not found")
    if size is None:
        raise ValueError("Error. Size not found")

    span = high - low
    if span <= 0:
        raise ValueError("Error. Upper Limit must be greater than Lower Limit")

    multiplier = 11
    increment = 13
    state = seed

    if size == 1:
        return low + (multiplier * state + increment) % span

    random_list = []
    for _ in range(size):
        # Reducing the state modulo `span` yields the same outputs as keeping
        # the unreduced value, but stops the integer from growing without bound.
        state = (multiplier * state + increment) % span
        random_list.append(low + state)
    return random_list

In [34]:
# Loading the dataset
# reg_dataset -> Regression Dataset : Boston Dataset with Housing price as the target and 13 Features related to the houses.

def load_dataset():
    """Return the housing data as one DataFrame.

    The frame has the columns ``feature1`` ... ``feature13`` (the feature
    matrix returned by ``load_boston``) followed by ``price(target)`` (the
    target vector returned by the same call).
    """
    features, target = load_boston(return_X_y = True)

    column_names = [f"feature{idx}" for idx in range(1, 14)]
    column_names.append("price(target)")

    # Place the target next to the features as the last column.
    table = np.column_stack((features, target))
    return pd.DataFrame(data = table, columns = column_names)

# Build the DataFrame once at notebook level; the split cell below reads `reg_dataset`.
reg_dataset = load_dataset()
# print() emits the plain-text repr of the frame (pandas elides the middle rows).
print("REGRESSION DATASET : \n", reg_dataset)

REGRESSION DATASET : 
      feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0     0.00632      18.0      2.31       0.0     0.538     6.575      65.2   
1     0.02731       0.0      7.07       0.0     0.469     6.421      78.9   
2     0.02729       0.0      7.07       0.0     0.469     7.185      61.1   
3     0.03237       0.0      2.18       0.0     0.458     6.998      45.8   
4     0.06905       0.0      2.18       0.0     0.458     7.147      54.2   
..        ...       ...       ...       ...       ...       ...       ...   
501   0.06263       0.0     11.93       0.0     0.573     6.593      69.1   
502   0.04527       0.0     11.93       0.0     0.573     6.120      76.7   
503   0.06076       0.0     11.93       0.0     0.573     6.976      91.0   
504   0.10959       0.0     11.93       0.0     0.573     6.794      89.3   
505   0.04741       0.0     11.93       0.0     0.573     6.030      80.8   

     feature8  feature9  feature10  feature11  featu

In [35]:
# Splitting into training and testing

# Column layout of reg_dataset: 13 feature columns followed by the target column.
cols = [f"feature{i}" for i in range(1, 14)] + ["price(target)"]

# Everything but the last column is an input; the last column is the target.
X = reg_dataset.loc[:, cols[:-1]]
Y = reg_dataset.loc[:, cols[-1]]

print(X, Y.values, sep="\n")

# Plain NumPy arrays for scikit-learn.
TrainX = X.to_numpy()
TrainY = Y.to_numpy()

# Hold out 10% of the rows for testing; the fixed random_state keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    TrainX,
    TrainY,
    test_size=0.1,
    random_state=10,
)

     feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0     0.00632      18.0      2.31       0.0     0.538     6.575      65.2   
1     0.02731       0.0      7.07       0.0     0.469     6.421      78.9   
2     0.02729       0.0      7.07       0.0     0.469     7.185      61.1   
3     0.03237       0.0      2.18       0.0     0.458     6.998      45.8   
4     0.06905       0.0      2.18       0.0     0.458     7.147      54.2   
..        ...       ...       ...       ...       ...       ...       ...   
501   0.06263       0.0     11.93       0.0     0.573     6.593      69.1   
502   0.04527       0.0     11.93       0.0     0.573     6.120      76.7   
503   0.06076       0.0     11.93       0.0     0.573     6.976      91.0   
504   0.10959       0.0     11.93       0.0     0.573     6.794      89.3   
505   0.04741       0.0     11.93       0.0     0.573     6.030      80.8   

     feature8  feature9  feature10  feature11  feature12  feature13  
0    

#### **BAGGING WITH LINEAR REGRESSION MODELS**

In [36]:
# Three independent base learners for the bagging ensemble.
linreg1 = LinearRegression()
linreg2 = LinearRegression()
linreg3 = LinearRegression()

# One index sample per base learner, drawn with seeds 1, 3 and 5.
# After the loop `row_index` holds the sample for the last seed (5).
# NOTE(review): 404 is hard-coded as both the upper bound and the sample size,
# while x_train comes from a test_size=0.1 split of the 506-row dataset --
# confirm that 404 is the intended value rather than len(x_train).
for row_index in (random_generator(s, 0, 404, 404) for s in (1, 3, 5)):
    print(row_index)
    print(len(row_index))

[24, 277, 232, 141, 352, 249, 328, 389, 252, 361, 348, 205, 248, 317, 268, 133, 264, 89, 184, 17, 200, 193, 116, 77, 52, 181, 388, 241, 240, 229, 108, 393, 296, 37, 16, 189, 72, 401, 384, 197, 160, 157, 124, 165, 212, 325, 356, 293, 4, 57, 236, 185, 28, 321, 312, 213, 336, 73, 8, 101, 316, 257, 12, 145, 396, 329, 400, 373, 76, 41, 60, 269, 144, 385, 208, 281, 276, 221, 20, 233, 152, 69, 368, 21, 244, 273, 188, 61, 280, 265, 100, 305, 136, 297, 48, 137, 308, 169, 256, 1, 24, 277, 232, 141, 352, 249, 328, 389, 252, 361, 348, 205, 248, 317, 268, 133, 264, 89, 184, 17, 200, 193, 116, 77, 52, 181, 388, 241, 240, 229, 108, 393, 296, 37, 16, 189, 72, 401, 384, 197, 160, 157, 124, 165, 212, 325, 356, 293, 4, 57, 236, 185, 28, 321, 312, 213, 336, 73, 8, 101, 316, 257, 12, 145, 396, 329, 400, 373, 76, 41, 60, 269, 144, 385, 208, 281, 276, 221, 20, 233, 152, 69, 368, 21, 244, 273, 188, 61, 280, 265, 100, 305, 136, 297, 48, 137, 308, 169, 256, 1, 24, 277, 232, 141, 352, 249, 328, 389, 252, 361, 34

In [37]:
dt1 = DecisionTreeRegressor(max_depth = 3, random_state = 10)
row_index = random_generator(8, 0, 404, 404)