In [56]:
# imports

import json
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_boston
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [57]:
# Function to generate a list of random numbers (not necessarily unique)

def random_generator(seed = 0, low = 0, high = None, size = None):
    """Generate deterministic pseudo-random integers (repeats allowed).

    Values follow the recurrence ``s -> 11*s + 13`` and are mapped into the
    half-open range ``[low, high)``. The same arguments always produce the
    same output.

    Parameters
    ----------
    seed : int
        Starting state of the recurrence.
    low : int
        Inclusive lower bound of the generated values. Earlier versions
        accepted this argument but ignored it; it is now honoured. With the
        default ``low = 0`` the output is identical to before.
    high : int
        Exclusive upper bound. Required.
    size : int
        Number of values to generate. Required.

    Returns
    -------
    list of int, int, or str
        * a list with ``size`` values in the general case;
        * a single bare int (not a list) when ``size == 1``;
        * an error-message string when ``high`` or ``size`` is missing
          (kept for backward compatibility, so callers must check for it).

    Raises
    ------
    ZeroDivisionError
        If ``high == low`` and at least one value is requested.
    """
    a = 11
    b = 13

    if high is None:
        return ("Error. Upper Limit not found")
    if size is None:
        return ("Error. Size not found")

    # Width of the target range; every draw is reduced modulo this width and
    # then shifted by ``low``.
    span = high - low

    if size == 1:
        return low + (a*seed + b) % span

    random_list = []
    s = seed
    for _ in range(size):
        # Keeping the state reduced modulo ``span`` yields exactly the same
        # residues as the unreduced recurrence (for integer inputs) while
        # stopping the state from growing by a factor of 11 on every step.
        s = (a*s + b) % span
        random_list.append(low + s)
    return random_list

# Function to generate a list of random numbers (all unique)

def unique_random_generator(seed = 0, low = 0, high = None, size = None):
    """Generate deterministic pseudo-random integers with no repeats.

    Candidates follow the recurrence ``s -> 3*s + b`` (``b`` starts at 1)
    reduced into ``[low, high)``. When a candidate has already been produced,
    ``b`` is incremented and the state is re-derived before trying again.

    Parameters
    ----------
    seed : int
        Starting state of the recurrence.
    low : int
        Inclusive lower bound of the generated values. Earlier versions
        accepted this argument but ignored it; it is now honoured. With the
        default ``low = 0`` the output is identical to before.
    high : int
        Exclusive upper bound. Required.
    size : int
        Number of distinct values to generate. Required.

    Returns
    -------
    list of int, int, or str
        * a list with ``size`` distinct values in the general case;
        * a single bare int (not a list) when ``size == 1``;
        * an error-message string when ``high`` or ``size`` is missing
          (kept for backward compatibility, so callers must check for it).

    Raises
    ------
    ValueError
        If more distinct values are requested than the range contains.
        Previously this case looped forever.
    ZeroDivisionError
        If ``size == 1`` and ``high == low``.
    """
    a = 3
    b = 1

    if high is None:
        return ("Error. Upper Limit not found")
    if size is None:
        return ("Error. Size not found")

    # Width of the target range; candidates are reduced modulo this width and
    # shifted by ``low`` when stored.
    span = high - low

    if size == 1:
        return low + (a*seed + b) % span

    # The range only holds ``abs(span)`` distinct residues, so asking for more
    # can never succeed.
    if size > abs(span):
        raise ValueError(
            "Cannot draw %d unique values from a range of width %d" % (size, abs(span))
        )

    s = seed
    random_list = []
    seen = set()  # O(1) duplicate check; ``random_list`` preserves draw order
    while len(random_list) < size:
        z = (a*s + b) % span
        if z in seen:
            # Collision: perturb the increment and the state, then retry.
            b += 1
            s = (a*s + b) % span
            continue
        seen.add(z)
        random_list.append(low + z)
        # ``%`` binds tighter than ``+``: the next state is the residue plus one.
        s = (a*s + b) % span + 1
    return random_list
  

In [58]:
# reg_dataset -> Regression Dataset : Boston Dataset with Housing price as the target and 13 Features related to the houses.

def load_dataset():
    """Return the Boston housing data as a single DataFrame.

    The thirteen feature columns are named ``feature1`` .. ``feature13`` and
    the target is appended as the last column, ``price(target)``.

    NOTE(review): ``load_boston`` is believed to be deprecated in
    scikit-learn 1.0 and removed in 1.2 -- confirm the pinned version before
    re-running this notebook on a fresh environment.
    """
    features, target = load_boston(return_X_y = True)

    # Turn the 1-D target into a column so it can sit next to the features.
    target_column = np.array(target).reshape(-1, 1)
    table = np.concatenate((features, target_column), axis = 1)

    column_names = ["feature"+str(k) for k in range(1, 14)] + ["price(target)"]
    return pd.DataFrame(data = table, columns = column_names)

# Build the regression DataFrame once at module level; the split cell below reads `reg_dataset`.
reg_dataset = load_dataset()
# Plain-text echo of the frame, preceded by a banner line.
print("REGRESSION DATASET : \n", reg_dataset)

REGRESSION DATASET : 
      feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0     0.00632      18.0      2.31       0.0     0.538     6.575      65.2   
1     0.02731       0.0      7.07       0.0     0.469     6.421      78.9   
2     0.02729       0.0      7.07       0.0     0.469     7.185      61.1   
3     0.03237       0.0      2.18       0.0     0.458     6.998      45.8   
4     0.06905       0.0      2.18       0.0     0.458     7.147      54.2   
..        ...       ...       ...       ...       ...       ...       ...   
501   0.06263       0.0     11.93       0.0     0.573     6.593      69.1   
502   0.04527       0.0     11.93       0.0     0.573     6.120      76.7   
503   0.06076       0.0     11.93       0.0     0.573     6.976      91.0   
504   0.10959       0.0     11.93       0.0     0.573     6.794      89.3   
505   0.04741       0.0     11.93       0.0     0.573     6.030      80.8   

     feature8  feature9  feature10  feature11  featu

In [59]:
# Splitting into training and testing

# Column layout of reg_dataset: thirteen feature columns followed by the target.
cols = ["feature"+str(i) for i in range(1, 14)] + ["price(target)"]

# Every column except the last is a model input; the last one is the label.
X = reg_dataset.loc[:, cols[:-1]]
Y = reg_dataset.loc[:, cols[-1]]

print(X)
print(Y.values)

# NumPy arrays of the inputs and labels for scikit-learn.
TrainX, TrainY = np.asarray(X), np.asarray(Y)

# Hold out 10% of the rows for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    TrainX,
    TrainY,
    test_size=0.1,
    random_state=10,
)

     feature1  feature2  feature3  feature4  feature5  feature6  feature7  \
0     0.00632      18.0      2.31       0.0     0.538     6.575      65.2   
1     0.02731       0.0      7.07       0.0     0.469     6.421      78.9   
2     0.02729       0.0      7.07       0.0     0.469     7.185      61.1   
3     0.03237       0.0      2.18       0.0     0.458     6.998      45.8   
4     0.06905       0.0      2.18       0.0     0.458     7.147      54.2   
..        ...       ...       ...       ...       ...       ...       ...   
501   0.06263       0.0     11.93       0.0     0.573     6.593      69.1   
502   0.04527       0.0     11.93       0.0     0.573     6.120      76.7   
503   0.06076       0.0     11.93       0.0     0.573     6.976      91.0   
504   0.10959       0.0     11.93       0.0     0.573     6.794      89.3   
505   0.04741       0.0     11.93       0.0     0.573     6.030      80.8   

     feature8  feature9  feature10  feature11  feature12  feature13  
0    

# Random Forest

In [60]:
# Two hand-built members of a random forest. Each tree is trained on its own
# row sample (drawn with repeats) and its own 4-column feature subset.
# Previously the first tree's indices were overwritten before being used and
# neither tree was ever fit, so the index lists now live under separate names.
#
# NOTE(review): row indices are drawn from [0, 404), but a 10% hold-out of the
# 506-row frame leaves more than 404 training rows, so the tail of x_train is
# never sampled -- confirm whether 404 is intentional.
dt1 = DecisionTreeRegressor(max_depth = 3, random_state = 10)
row_index_1 = random_generator(0, 0, 404, 404)
col_index_1 = unique_random_generator(0, 0, 13, 4)
dt1.fit(x_train[row_index_1][:, col_index_1], y_train[row_index_1])

dt2 = DecisionTreeRegressor(max_depth = 3, random_state = 10)
row_index_2 = random_generator(2, 0, 404, 404)
col_index_2 = unique_random_generator(2, 0, 13, 4)
dt2.fit(x_train[row_index_2][:, col_index_2], y_train[row_index_2])

# Keep the original names bound to the second tree's indices, exactly as they
# were left by the earlier version of this cell.
row_index, col_index = row_index_2, col_index_2


In [61]:
# fifty different Decision Trees

# One seed per tree; each seed drives both the row sample and the column subset.
subset_seed = random_generator(10, 0, 400, 50)

# Fitted trees paired with the feature columns each one was trained on.
# Previously every tree and its indices were discarded at the end of the
# iteration without being fit.
forest = []
for i in range(50):
    dt_temp = DecisionTreeRegressor(max_depth = 3, random_state = 3)
    row_index = random_generator(subset_seed[i], 0, 404, 404)
    col_index = unique_random_generator(subset_seed[i], 0, 13, 4)
    dt_temp.fit(x_train[row_index][:, col_index], y_train[row_index])
    forest.append((dt_temp, col_index))

# The forest prediction is the plain average of the individual tree predictions;
# each tree only sees the columns it was trained on.
forest_test_pred = np.mean(
    [tree.predict(x_test[:, feat_idx]) for tree, feat_idx in forest],
    axis = 0,
)
forest_test_mse = mean_squared_error(y_test, forest_test_pred)
print("Hand-built forest (50 trees) test MSE :", forest_test_mse)

In [62]:
# varying number of decision trees from 4 to 148

# Test-set MSE of the hand-built forest, keyed by the number of trees.
# Previously the trees were constructed and thrown away, so the sweep
# produced no result.
forest_mse_by_size = {}
for n in range (4, 150, 2):
    subset_seed = random_generator(15, 0, 400, n)
    tree_test_preds = []
    for i in range(n):
        dt_temp = DecisionTreeRegressor(max_depth = 3, random_state = 3)
        row_index = random_generator(subset_seed[i], 0, 404, 404)
        col_index = unique_random_generator(subset_seed[i], 0, 13, 4)
        dt_temp.fit(x_train[row_index][:, col_index], y_train[row_index])
        tree_test_preds.append(dt_temp.predict(x_test[:, col_index]))
    # Ensemble prediction = average over the n trees.
    forest_mse_by_size[n] = mean_squared_error(y_test, np.mean(tree_test_preds, axis = 0))

fig, ax = plt.subplots()
ax.plot(list(forest_mse_by_size.keys()), list(forest_mse_by_size.values()))
ax.set(
    title = "Hand-built forest: test MSE vs. number of trees",
    xlabel = "number of trees",
    ylabel = "test MSE",
)
plt.show()

In [63]:
# using sklearn Random Forest Regressor

# Reference model with the same settings as the hand-built forest above:
# 50 trees of depth 3, 4 candidate features per split, 404 rows drawn per tree.
# RandomForestRegressor is imported in the imports cell at the top of the notebook.
rf = RandomForestRegressor(n_estimators = 50, max_depth = 3, max_features = 4, max_samples = 404, random_state = 10)
rf.fit(x_train, y_train)

RandomForestRegressor(max_depth=3, max_features=4, max_samples=404,
                      n_estimators=50, random_state=10)

# Boosting



Unlike Random Forest, the trees are built sequentially.

**Sample weights**: Weight given to each training sample.

**Learning Rate**: A controlling rate for updating sample weights.

**beta**: $\frac{\text{total error}}{1 - \text{total error}}$

**alpha**: $0.5 \cdot learning\_rate \cdot \log_e\left(\frac{1}{beta}\right)$

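You want to increase the relative sample weights of the predictions that have a large error. Since $beta < 1$ whenever the total error is below 0.5, the update below shrinks the weights of low-error samples the most, so high-error samples carry more weight once the weights are re-normalized to sum to 1.

$sample\_weight_{(n+1)th} = sample\_weight_{nth} \cdot beta^{(1 - error) \cdot learning\_rate}$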

In [None]:
n = 80
learning_rate = 0.01
decision_trees = [None]*n
alphas = [None]*n

# Template estimator. Each boosting round fits a fresh tree with the same
# hyper-parameters so that the entries of `decision_trees` stay distinct
# (re-fitting one shared object would overwrite every stored tree).
dt_temp = DecisionTreeRegressor(max_depth = 3, random_state = 10)

# Training and testing data / labels: the split produced earlier in the notebook.
boost_x_train, boost_y_train = x_train, y_train
boost_x_test, boost_y_test = x_test, y_test

# Sample weights start out equal and add up to 1.
sample_weights = np.full(len(boost_y_train), 1.0/len(boost_y_train))

# Alpha-weighted running sums of the per-tree predictions.
train_pred_sum = np.zeros(len(boost_y_train))
test_pred_sum = np.zeros(len(boost_y_test))
alpha_sum = 0.0

for i in range(n):
    # Fit this round's tree on the currently weighted training set.
    tree = DecisionTreeRegressor(**dt_temp.get_params())
    tree.fit(boost_x_train, boost_y_train, sample_weight = sample_weights)

    # Predict on the training set and the test set.
    train_pred = tree.predict(boost_x_train)
    test_pred = tree.predict(boost_x_test)

    # Absolute error of each training prediction, scaled by the largest error
    # so that every entry lies in [0, 1].
    errors = np.abs(train_pred - boost_y_train)
    max_error = errors.max()
    if max_error > 0:
        errors = errors/max_error

    # Total (weighted) error of this round.
    totalerror = np.sum(errors*sample_weights)

    # beta and alpha are undefined for a zero total error, and alpha is not
    # positive once the total error reaches 0.5; stop boosting in either case.
    if totalerror <= 0 or totalerror >= 0.5:
        break

    beta = totalerror/(1 - totalerror)
    alpha = 0.5*learning_rate*np.log(1/beta)

    # weight_(n+1) = weight_n * beta^((1 - error) * learning_rate).
    # beta < 1 here, so low-error samples shrink the most; re-normalising keeps
    # the weights summing to 1, which the total-error formula relies on.
    sample_weights = sample_weights*np.power(beta, (1 - errors)*learning_rate)
    sample_weights = sample_weights/sample_weights.sum()

    # Store the tree and its alpha.
    decision_trees[i] = tree
    alphas[i] = alpha

    # Add this tree's predictions to the ensemble, scaled by alpha.
    train_pred_sum += alpha*train_pred
    test_pred_sum += alpha*test_pred
    alpha_sum += alpha

if alpha_sum > 0:
    # Divide by the total alpha so the ensemble is a weighted average on the
    # same scale as the labels.
    boost_train_pred = train_pred_sum/alpha_sum
    boost_test_pred = test_pred_sum/alpha_sum
    print("Boosting train MSE :", mean_squared_error(boost_y_train, boost_train_pred))
    print("Boosting test MSE  :", mean_squared_error(boost_y_test, boost_test_pred))
else:
    print("Boosting stopped before any tree received a weight.")