In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [7]:
df = pd.read_csv('train.csv')
print(df.head(5))

   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

**Selecting Features and Splitting Data Set into Training and Testing Sets**

In [8]:
#selection of features from the data frame
df = df[['LotArea','OverallQual','OverallCond','YearBuilt', 'GrLivArea', 'TotRmsAbvGrd','YrSold','SalePrice']]
#seperating features and targets
X = df.iloc[:, :-1].values
Y = df.iloc[:, -1].values.reshape(-1,1)
#getting train and test split
xTrain, xTest, yTrain, yTest = train_test_split(X,Y, test_size=.1)

**Node Class**

In [9]:
class Node:
    def __init__(self, featureIndex=None, threshold=None, left=None, right=None, varianceReduction = None,value=None):
        self.featureIndex = featureIndex
        self.threshold = threshold
        self.left = left
        self.right = right
        self.varianceReduction = varianceReduction
        self.value=value


**Decision Tree Class**

In [11]:
class DecisionTree:
    def __init__(self, minSamplesSplit=2, maxDepth=2):
        self.minSamplesSplit=minSamplesSplit
        self.maxDepth=maxDepth
        self.root=None

    def buildTree(self, df, currentDepth=0):
        bestSplitNode={}
        Xdata, ydata = df[:,:-1], df[:,-1]
        numSamples, numFeatures = np.shape(Xdata)
        #split condisions
        if numSamples >= self.minSamplesSplit and currentDepth <= self.maxDepth:
            #find best split
            bestSplitNode = self.getBestSplit(Xdata, ydata, df)
            # create left and right nodes
            if bestSplitNode['varianceReduction'] > 0:
                lSubTree = self.buildTree(bestSplitNode['dataLeft'], currentDepth + 1)
                rSubTree = self.buildTree(bestSplitNode['dataRight'], currentDepth + 1)
                return Node(bestSplitNode['featureIndex'], bestSplitNode['threshold'], lSubTree, rSubTree, bestSplitNode['varianceReduction'])

        #value for leaf node (prediction)
        value = np.mean(ydata)
        return Node(value=value)

    def getBestSplit(self, X,y, df):
        bestSplitNode = {}
        bestSplitNode['varianceReduction'] = 0

        numSamples, numFeatures = np.shape(X)
        maxVariance = float('-inf')
        minVariance = float('inf')
        leftList =[]
        rightList =[]
        #iterates through features
        for i in range(numFeatures):
            values = X[:,i]
            uniqueValues = np.unique(values)
            #iterates through all posible thresholds for that feature in the data set
            for threshold in uniqueValues:
                queryFor = str(threshold)
                #splits the data at threshold
                dataLeft = df[df[:, i] <= threshold]
                dataRight = df[df[:, i] > threshold]

                if len(dataLeft) >0 and len(dataRight) > 0:
                    leftY = dataLeft[:,-1]
                    rightY = dataRight[:,-1]
                    #finds the variance of the threshold
                    currVariance = self.varianceReduction(y,leftY, rightY)
                    #the node with the highest variance will be returned
                    if currVariance > maxVariance:
                        maxVariance = currVariance
                        bestSplitNode['featureIndex'] = i
                        bestSplitNode['threshold'] = threshold
                        bestSplitNode['dataLeft'] = dataLeft
                        bestSplitNode['dataRight'] = dataRight
                        bestSplitNode['varianceReduction'] = currVariance

        return bestSplitNode

    def varianceReduction(self, parent, lChild, rChild):
        weightL = len(lChild) / len(parent)
        weightR = len(rChild) / len(parent)

        return np.var(parent) - (weightL * np.var(lChild) + weightR * np.var(rChild))

    def fit(self, X, y):
        #puts data back into a 2d array
        data = np.concatenate((X,y), axis=1)
        #sets root
        self.root = self.buildTree(data)

    def makePrediction(self, x, tree):
        #traverses through tree based on the values of the features in x

        if tree.value!=None: return tree.value
        featureValue = x[tree.featureIndex]
        if featureValue <= tree.threshold:
            return self.makePrediction(x, tree.left)
        else:
            return self.makePrediction(x, tree.right)

    def predict(self, X):
        prediction = [self.makePrediction(x, self.root) for x in X]

        return prediction

    def print_tree(self, tree=None, indent=" "):

        if not tree:
            tree = self.root
            if tree is None:
                print("The tree has not been built or is empty.")
                return
            print('Root node here')
        if tree.value is not None:
            print(tree.value)
        else:
            print(f"X_"+str(tree.featureIndex), "<=", tree.threshold, "?", tree.varianceReduction)
            print(f"%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print(f"%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)


**Testing Decision Tree**

In [12]:
decisionTree = DecisionTree(minSamplesSplit=2,maxDepth=3)
#building the tree with the train data
decisionTree.fit(xTrain, yTrain)
#decisionTree.print_tree() #uncomment line to print the tree
#making prediction
decisionTreePrediction = decisionTree.predict(xTest)
#RMSE of prediction
print('RMSE of Decision Tree prediction:', np.sqrt(mean_squared_error(yTest,decisionTreePrediction)))

RMSE of Decision Tree prediction: 49258.097819269744


**Random Forest Tree Class**

In [13]:
class RandomForest:
    def __init__(self, nTrees=5, maxDepth=3, minSamplesSpit=2, nFeatures=None):
        self.nTrees=nTrees
        self.maxDepth =maxDepth
        self.minSamplesSplit = minSamplesSpit
        self.nFeatures=nFeatures
        self.trees = []

    def fit(self, X, y):
        self.trees = []
        #iterates through trees specified when the random forest was substantiated
        for i in range(self.nTrees):
            #makes decision tree
            tree = DecisionTree(minSamplesSplit=2,maxDepth=3)
            #bootstraps the data
            X, y = self.bootstrap(X,y)
            #fits the tree to the data
            tree.fit(X,y)
            #adds it to the list of trees in forest
            self.trees.append(tree)


    def bootstrap(self,X,y):
        #gets number of samples
        numSamples = np.shape(X)[0]
        #randomizes the data set
        ids = np.random.choice(numSamples,numSamples,replace=True)
        return X[ids], y[ids]

    def predict(self, X):
        predictions = []
        finalPredictions =[]
        #iterates through the trees in forest
        for tree in self.trees:
            predictions.append(tree.predict(X))
        sum = 0
        #averages the predictions made by all the trees in the forest
        for i in range((len(X))):
            for j in range(len(predictions)):
                sum += predictions[j][i]
            finalPredictions.append(sum/self.nTrees)
            sum = 0
        return finalPredictions

**Testing Random Forest**

In [14]:
randomForest = RandomForest(nTrees=15, maxDepth=3,minSamplesSpit=2)
#building forest from training data
randomForest.fit(xTrain,yTrain)
#making prediction
randomForestPrediction = randomForest.predict(xTest)
print('RMSE of Random Forest prediction', np.sqrt(mean_squared_error(yTest,randomForestPrediction)))


RMSE of Random Forest prediction 40431.7141322193


**Average RMSE Over N Runs**

In [15]:
decisionTreeRMSE =0
forestRMSE = 0

# runs through this number of iterations
runAmount= 5
for i in range(runAmount):
    print(i)
    #get train / test split
    xTrain, xTest, yTrain, yTest = train_test_split(X,Y, test_size=.1)
    #make decision tree
    treeModel = DecisionTree(minSamplesSplit=2,maxDepth=4)
    #fit decision tree
    treeModel.fit(xTrain, yTrain)
    #make prediction
    pred = treeModel.predict(xTest)
    #get RMSE of decision tree
    decisionTreeRMSE += np.sqrt(mean_squared_error(yTest,pred))
    #make random forest
    randomForest = RandomForest(nTrees=10, maxDepth=4,minSamplesSpit=2)
    #fit random forest to data
    randomForest.fit(xTrain,yTrain)
    #make prediction
    randomForestPrediction = randomForest.predict(xTest)
    #random forest RMSE
    forestRMSE += np.sqrt(mean_squared_error(yTest,randomForestPrediction))


print('Average RMSE for decision tree', (decisionTreeRMSE/runAmount))
print('Average RMSE for forest', (forestRMSE/runAmount))

0
1
2
3
4
Average RMSE for decision tree 40248.729594001285
Average RMSE for forest 38737.53014858185
