In [47]:
import numpy as np
from sklearn.datasets import load_boston
from sklearn.metrics import mean_squared_error

In [48]:
#Loading data set
boston = load_boston()
X_data = boston.data
y_target = boston.target

# Task 1

In [65]:
from sklearn.tree import DecisionTreeRegressor
import random
import math

In [68]:
def random_forest(x, y):
    '''
    This function implements random forest
    '''
    y_predicted_val = []
    oob = []
    y_pred = np.zeros_like(y)
    for i in range(0, 30):  #For creating 30 samples
        x_bagged = []
        y_bagged = []
        rows_60 = random.sample(range(0, len(x)), math.floor((len(x) * 60) / 100))  #Randomly creates indices for 60% data
        rows_40 = random.sample(rows_60, math.ceil((len(x) * 40) / 100))  #Randomly creates indices for 40% data
        number_cols = random.sample(range(3, 14), 1)   #Randomly selects number of columns
        cols = random.sample(range(0, 13), number_cols[0])  #Randomly creates indices for columns
        for index in rows_60: #Iterates through each index of 60% data
            x_data = []
            y_data = []
            for k in sorted(cols):   #Iterates through each index of column of a data point
                x_data.append(x[index][k])
            x_bagged.append(x_data)
            y_bagged.append(y[index])
        for index in rows_40:  #Iterates through each index of 40% data
            x_data = []
            y_data = []
            for k in sorted(cols):    #Iterates through each index of column of a data point
                x_data.append(x[index][k])
            x_bagged.append(x_data)
            y_bagged.append(y[index])
        x_pred = x[:, sorted(cols)]   #Getting data points with selected columns for prediction
        dtr = DecisionTreeRegressor()
        dtr.fit(np.array(x_bagged), np.array(y_bagged))
        prediction = dtr.predict(x_pred)
        y_predicted_val.append(prediction)  #y_predicted_val contains predicted values of each model
        y_pred = y_pred + prediction
        temp = []
        for j in range(0, len(x)):
            if j not in rows_60:   #Checking if a data point is oob point, a data point is a oob point if is not in rows_60
                temp.append(j)
        oob.append(temp)          #Collecting oob points for a model
    y_pred = y_pred / 30          #Predicted scores of training data points
    val = 0
    for i in range(0, len(y)):
        val += ((y[i] - y_pred[i]) ** 2)    #Computing squared error
    mse = val / len(y)       #Computing mean squared error
    oob_pred = []
    for i in range(0, len(x)):
        count = 0
        val = 0
        for j in range(0, len(oob)):
            if i in oob[j]:    #Checking whether a point is in oob points
                val += y_predicted_val[j][i]  # val accumulates the predicted values of a oob point
                count += 1                    # count stores the number of models of whose oob points have the data 
                 # point i
        if count != 0:
            oob_pred.append(val / count)
        else:
            oob_pred.append(0)
    val = 0
    for i in range(0, len(x)):
        val += ((y[i] - oob_pred[i]) ** 2) #Computing squared error for oob data points
    oob_score = val / len(y)          #Computing OOBScore
    return mse, oob_score

In [72]:
MSE, OOBScore = random_forest(X_data, y_target)
print('MSE =', MSE)
print('OOBScore =', OOBScore)

MSE = 1.969132358768848
OOBScore = 12.61180773839129


# Task 2

In [52]:
from tqdm import tqdm
MSE = []
OOBScore = []
for i in tqdm(range(0, 35)): #Computing 35 times mse and oob_score
    mse, oob_score = random_forest(X_data, y_target)
    MSE.append(mse)
    OOBScore.append(oob_score)

100%|██████████| 35/35 [00:06<00:00,  5.41it/s]


In [53]:
from prettytable import PrettyTable
print('Confidence Interval for MSE')
x = PrettyTable()
x = PrettyTable(['Sample Size', 'Sample mean', 'Sample std', 'Left CI', 'Right CI'])
sample = np.array(MSE)
sample_size = len(sample)
sample_mean = sample.mean()
sample_std = sample.std()
left_limit = np.round(sample_mean - 2 * (sample_std / np.sqrt(sample_size)), 3)
right_limit = np.round(sample_mean + 2 * (sample_std / np.sqrt(sample_size)), 3)
row = []
row.append(sample_size)
row.append(sample_mean)
row.append(sample_std)
row.append(left_limit)
row.append(right_limit)
x.add_row(row)
print(x)

Confidence Interval for MSE
+-------------+--------------------+---------------------+---------+----------+
| Sample Size |    Sample mean     |      Sample std     | Left CI | Right CI |
+-------------+--------------------+---------------------+---------+----------+
|      35     | 2.4232556386617206 | 0.24990501883446786 |  2.339  |  2.508   |
+-------------+--------------------+---------------------+---------+----------+


In [54]:
print('Confidence Interval for OOBScore')
x = PrettyTable()
x = PrettyTable(['Sample Size', 'Sample mean', 'Sample std', 'Left CI', 'Right CI'])
sample = np.array(OOBScore)
sample_size = len(sample)
sample_mean = sample.mean()
sample_std = sample.std()
left_limit = np.round(sample_mean - 2 * (sample_std / np.sqrt(sample_size)), 3)
right_limit = np.round(sample_mean + 2 * (sample_std / np.sqrt(sample_size)), 3)
row = []
row.append(sample_size)
row.append(sample_mean)
row.append(sample_std)
row.append(left_limit)
row.append(right_limit)
x.add_row(row)
print(x)

Confidence Interval for OOBScore
+-------------+--------------------+-------------------+---------+----------+
| Sample Size |    Sample mean     |     Sample std    | Left CI | Right CI |
+-------------+--------------------+-------------------+---------+----------+
|      35     | 14.059912432668275 | 1.157808047832362 |  13.669 |  14.451  |
+-------------+--------------------+-------------------+---------+----------+


# Task 3

In [73]:
def predict_house_price(x, y, xq):
    '''
    This function predicts the house price of a query point
    '''
    y_pred = 0
    for i in range(0, 30):  #For creating 30 samples
        x_bagged = []
        y_bagged = []
        x_query = []
        rows_60 = random.sample(range(0, 506), math.floor((len(x) * 60) / 100)) #Randomly creates indices for 60% data
        rows_40 = random.sample(rows_60, math.ceil((len(x) * 40) / 100))  #Randomly creates indices for 40% data
        number_cols = random.sample(range(3, 14), 1) #Randomly selects number of columns
        cols = random.sample(range(0, 13), number_cols[0]) #Randomly creates indices for columns
        for index in rows_60: #Iterates through each index of 60% data
            x_data = []
            y_data = []
            for k in sorted(cols):  #Iterates through each index of the column of a data point
                x_data.append(x[index][k])
            x_bagged.append(x_data)
            y_bagged.append(y[index])
        for index in rows_40:    #Iterates through each index of 40% data
            x_data = []
            y_data = []
            for k in cols:  #Iterates through each index of the column of a data point
                x_data.append(x[index][k])
            x_bagged.append(x_data)
            y_bagged.append(y[index])
        for k in cols:  #Iterates through each index of the column of the query data point
            x_query.append(xq[k])
        dtr = DecisionTreeRegressor()
        dtr.fit(np.array(x_bagged), np.array(y_bagged))
        prediction = dtr.predict((np.array(x_query)).reshape(1, -1))
        y_pred += prediction[0]
    y_pred /= 30
    return y_pred

In [74]:
x_query = np.array([0.18,20.0,5.00,0.0,0.421,5.60,72.2,7.95,7.0,30.0,19.1,372.13,18.60])
predicted_house_price = predict_house_price(X_data, y_target, x_query)
print('The predicted house price is ', predicted_house_price)

The predicted house price is  20.203333333333333
