Random Forest & Decision Tree Project

* This Project contains three datasets, housing_price_test, housing_price_train, and sample_submission

* The goal of the project is to predict the price of a house given its attributes.

* Going to build a random forest that consists of multiple decision trees from the training data set. Then, apply it on the test set and submit your code to generate predictions.

Preprocessing

In [14]:
import pandas as pd
import numpy as np

# Loading the dataset
df = pd.read_csv('housing_price_train.csv')
df_test = pd.read_csv('housing_price_test.csv')
df_test_saleprice = pd.read_csv('sample_submission.csv')
df_test_saleprice.drop('Id', axis=1, inplace=True)
df_test = pd.concat((df_test, df_test_saleprice), axis=1) # This line is used to add the SalePrice column to the test dataset
# Printing amount of rows and columns in the dataset
print(df.shape)

# Printing the first 10 rows of the dataset
#Printing all of the columns in the dataset
df.head(10)
#print(df.head(5))

(1460, 81)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000
5,6,50,RL,85.0,14115,Pave,,IR1,Lvl,AllPub,...,0,,MnPrv,Shed,700,10,2009,WD,Normal,143000
6,7,20,RL,75.0,10084,Pave,,Reg,Lvl,AllPub,...,0,,,,0,8,2007,WD,Normal,307000
7,8,60,RL,,10382,Pave,,IR1,Lvl,AllPub,...,0,,,Shed,350,11,2009,WD,Normal,200000
8,9,50,RM,51.0,6120,Pave,,Reg,Lvl,AllPub,...,0,,,,0,4,2008,WD,Abnorml,129900
9,10,190,RL,50.0,7420,Pave,,Reg,Lvl,AllPub,...,0,,,,0,1,2008,WD,Normal,118000


### Cleaning the dataset

In [15]:
def PreprocessingData(db):
    # Showing NULL values for each column in the dataset and their percentage
    null_values = db.isnull().sum().sort_values(ascending=False)
    null_values = pd.DataFrame(data=null_values, columns=["NullValueCount"]) 
    missing = null_values[null_values["NullValueCount"] > 0] 
    print(missing, "\n", missing/len(db)*100)

    # Dropping all columns and rows which have null values >= 70%
    if missing.empty:
        print("There is no missing value in the dataset")
    else:
        threshold = int(len(db)*0.7) 
        db.dropna(thresh=threshold, axis=1, inplace=True)
        db.dropna(thresh=len(db.columns) * 0.7, axis=0, inplace=True)

        for col in list(missing.index):
            if col in db.columns:  # Check if the column exists in the DataFrame
                if db[col].dtype == 'object': # Categorical
                    db[col].fillna(value=db[col].mode()[0], inplace=True) # most frequent value
                else: # Numerical
                    db[col].fillna(value=db[col].mean(), inplace=True) # avg value
            else:
                print(f"Column '{col}' not found in the DataFrame.")

    return db


In [17]:
Data = PreprocessingData(df)
Data_test = PreprocessingData(df_test)

Empty DataFrame
Columns: [NullValueCount]
Index: [] 
 Empty DataFrame
Columns: [NullValueCount]
Index: []
There is no missing value in the dataset
Empty DataFrame
Columns: [NullValueCount]
Index: [] 
 Empty DataFrame
Columns: [NullValueCount]
Index: []
There is no missing value in the dataset


### Correlations - Shows which feature affects the saleprice the most.

In [23]:
correlation = df[df.columns].corr()["SalePrice"].sort_values() 

most_correlated = correlation[correlation >= 0.5]  # Adjust the correlation threshold as needed

# Removing lowest correlation features
dropped = df.drop(correlation[correlation < 0.5].index, axis=1)

# Print the filtered dataframe
most_correlated
#dropped

# Print amount of columns and rows in the filtered dataset
#print(dropped.shape)

  correlation = df[df.columns].corr()["SalePrice"].sort_values()


YearRemodAdd    0.507101
YearBuilt       0.522897
TotRmsAbvGrd    0.533723
FullBath        0.560664
1stFlrSF        0.605852
TotalBsmtSF     0.613581
GarageArea      0.623431
GarageCars      0.640409
GrLivArea       0.708624
OverallQual     0.790982
SalePrice       1.000000
Name: SalePrice, dtype: float64

#### Decision Tree
#### Building a Decision Tree:
A Decision tree consists of nodes connected by edges. A decision tree is typically, a binary tree.

1- DecisionNode class used to save some values for each node we do the spliting on it until we reach the leaf node.


In [5]:
class DecisionNode():
    def __init__(self, feature_idx=None, threshold=None, value=None, true_branch=None, false_branch=None):
        self.feature_idx = feature_idx # index of the feature that is used
        self.threshold = threshold     # threshold value for feature when making the decision
        self.value = value # Average value if the node is a leaf in the tree
        self.true_branch = true_branch # the node we go to if decision returns True
        self.false_branch = false_branch # the node we go to if decision returns False

# Decision Tree Class
This Class consists the following functions:
<ol>
<li> <b>build_tree</b>: used to create the decision tree nodes</li> 
<li> <b>calc_variance_reduction</b> : measure the impurity by using variance reduction measure (like MSE) </li> 
the function takes three parameters (parentRec: the records for the target before split,and the left and right records after splitting. This function used to measure the impurity for each node and decide if we will split or not.
<li> <b>majority_vote</b>: used to calculate values for the leaf nodes records which equal to the mean of these records.</li> 
<li><b>split_by_feature</b>: this function take the feature and the threshold and check if the feature is numerical so it split the records into two node (true which is the left edge and false which is the right edge)
if the feature is categorical so it split where the values equal to the threshold</li>
<li> <b>fit</b>: Used to train the dataset after spliting the data into two part x: features, y: target</li>
<li><b>predict_value</b>: used to predict the value for each record, it is a recursive method to find the leaf node that corresponds to prediction
<li><b>predict</b>: take all records for the test data and iterate into each record to predit the y(target) value and save the result into a prediction list. 

In [6]:
class RegressionDecisionTree():
    # constructor
    def __init__(self, min_VarianceReduction=1e-7, max_depth=5):        
        self.root = None # root of this tree
        self.min_VarianceReduction = min_VarianceReduction # minimum VarianceReduction to allow splitt
        self.max_depth = max_depth
 

    # used to create the decision tree nodes
    def build_tree(self, X, y, current_depth=0):
        # we will use decision dictionary to save the feature and the threshold we build the tree on 
        decision = None
        # we will use subtrees dictionary to save the feature and the threshold we build the tree on 
        subtrees = None
        largest_variance_Reduction = 0
        # add y as last column of X
        df = pd.concat((X, y), axis=1)
        n_rows, n_features = X.shape
        if current_depth <= self.max_depth:
            # iterate through every feature
            for feature_idx in range(n_features):
                # values of that column
                feature_values = X.iloc[:, feature_idx]                
                unique_values = feature_values.unique()                
                for threshold in unique_values:
                    X_trueEdge, X_falseEdge = self.split_by_feature(df, feature_idx, threshold)
                    if len(X_trueEdge) > 0 and len(X_falseEdge) > 0:
                        y_true = X_trueEdge.iloc[:,-1]
                        y_false = X_falseEdge.iloc[:,-1]                        
                        # Calculate impurity
                        VarianceRed = self.Calc_variance_reduction(y, y_true, y_false)
                        # Keep track of which feature gave the largest information gain
                        if VarianceRed > largest_variance_Reduction:
                            largest_variance_Reduction = VarianceRed
                            decision = {"feature_idx":feature_idx, "threshold":threshold}
                            subtrees = {"X_true":X_trueEdge.iloc[:,:-1],
                                        "y_true":y_true,
                                        "X_false":X_falseEdge.iloc[:,:-1],
                                        "y_false":y_false}

        # we will construct new branch of tree if the variance_Reduction is larger than minimum variance_Reduction that we've defined
        if largest_variance_Reduction > self.min_VarianceReduction:
            true_branch = self.build_tree(subtrees["X_true"], subtrees["y_true"], current_depth+1)
            false_branch = self.build_tree(subtrees["X_false"], subtrees["y_false"], current_depth+1)
            return DecisionNode(feature_idx=decision["feature_idx"], threshold=decision["threshold"], true_branch=true_branch, false_branch=false_branch)

        # at leaf node we calculate the mean for the records
        leaf_value = self.majority_vote(y)
        return DecisionNode(value=leaf_value)
                        
    # measure the impurity by using variance reduction measure (like MSE)
    # left_edgeRec= True edge: where condition is true
    # Right_edgeRec= False edge: where condition is false
    def Calc_variance_reduction(self, parentRec, left_edgeRec, Right_edgeRec):  

        parent_variance = np.var(parentRec) # np.var is used to calculate the variance of the parent node, which is the same as entropy for the parent node
        p = len(left_edgeRec) / len(parentRec) # weight of the left edge
        VarReduction = parent_variance - (p * np.var(left_edgeRec) + (1 - p) * np.var(Right_edgeRec))

        # return the VarReduction = variance for parent - (Weight * var(leftEdge) + Weight * var(RightEdge)   
        return VarReduction
        
    
    def majority_vote(self, Y): 
        return round(np.mean(Y), 2)
        # return the majority_vote for the leaf nodes  

    def split_by_feature(self, X, feature_idx, threshold): # 
        
        if isinstance(threshold, int) or isinstance(threshold, float): #
            X_true = X[X.iloc[:, feature_idx] >= threshold] 
            X_false = X[X.iloc[:, feature_idx] < threshold] 
        # if the feature is categorical
        else:
            X_true = X[X.iloc[:, feature_idx] == threshold] 
            X_false = X[X.iloc[:, feature_idx] != threshold]
        
        return X_true, X_false

        # split the data into left_edge & right_edge depends one specified feature and the threshold
        # return left & right edges

    # Used to train the dataset after spliting the data into x: features, y: target
    def fit(self, X, y):
        self.root = self.build_tree(X, y)
        #return self.root


    def predict_value(self, x_test, tree=None):
        if tree is None:
            tree = self.root

        if tree.value is not None:
            return tree.value

        feature_value = x_test[tree.feature_idx]
        branch = tree.false_branch

        if isinstance(feature_value, int) or isinstance(feature_value, float):
            if feature_value >= tree.threshold:
                branch = tree.true_branch
        elif feature_value == tree.threshold:
            branch = tree.true_branch

        return self.predict_value(x_test, branch)

    def predict(self, X_test):
        y_pred = []

        for idx, row in X_test.iterrows(): 
            y_pred.append(self.predict_value(row.values)) 

        return y_pred
        


- To Check the Accuracy for our prediction we use CalcAccuracy function which take the actual values for the test dataset and the predicted values and apply the RMSE formula.

In [7]:
def CalcAccuracy(Actual_Y, Predicted_y): # Calculate the accuracy of the model using RMSE which gives us the error 
    # Calculate the accuracy of the model using RMSE
    rmse_model = np.sqrt(np.mean((Actual_Y - Predicted_y) ** 2))
    return rmse_model

In [8]:
x_train = df.iloc[:, :-1]
y_train = df.iloc[:, -1]

tree = RegressionDecisionTree()
tree.fit(x_train, y_train)

In [22]:
x_test = df_test.iloc[:, :-1]
y_test = df_test.iloc[:, -1]
predicted_y = tree.predict(x_test)
predicted_y 

#actual_y = pd.to_numeric(y_test, errors='coerce')
#predicted_y = pd.to_numeric(predicted_y, errors='coerce')
# Calculate the accuracy of the model using RMSE
#accuracy = CalcAccuracy(y_test, predicted_y)
CalcAccuracy(y_test, predicted_y)

86417.71870902876

Random Forest
#### Random forest class
- the Class consist of the following functions:
<ul>
    <li>Constructor </li>    
    <li>Subsampling </li>
    <li>build_model</li>
    <li>predict </li>

</ul>

In [10]:
class RF(object):
    def __init__(self):
        self.Traindata = None  # training data set (loaded into memory)
        self.Testdata = None  # Test data set for prediction        
        self.trees = []  # list of decision trees 
          

     # This function generate a subsample with replacement
    def __subsampling(self, train_set, sample_size_ratio):       
        sample_number = round(len(train_set) * sample_size_ratio)
        return train_set.sample(n = sample_number, replace=True) #sample_number is the number of rows to return
        
        
    def build_model(self, train_set, sample_size_ratio, number_of_trees):
        for i in range(number_of_trees):
            TrainingSample = self.__subsampling(train_set, sample_size_ratio) 
            featurespart = TrainingSample.iloc[:, :-1]
            targetpart = TrainingSample.iloc[:, -1]
            tree = RegressionDecisionTree()
            tree.fit(featurespart, targetpart)
            self.trees.append(tree)

            
               
    def predict(self, test_set):
        # Predict for each instance in the test set
        predictions = []
        for idx, row in test_set.iterrows():
            instance_prediction = []
            for tree in self.trees:
                instance_prediction.append(tree.predict_value(row))
            predictions.append(np.mean(instance_prediction))
        return predictions


### Create Random Forest

In [11]:
# Instantiate Random Forest
train_data_set = df
test_data_set = df_test
random_forest = RF()
#Create Decision Trees

build_model = random_forest.build_model(train_data_set, sample_size_ratio=0.8, number_of_trees=5) 

#Use the random forest to predict the test dataset
#predictions = random_forest.predict(test_data_set)
#print(predictions)


# Assuming train_data and test_data are your training and test datasets respectively
# Train the random forest with the dataset
#random_forest.build_model(train_data, sample_size_ratio=0.8, number_of_trees=10)

# Use the random forest to predict the test dataset
#predictions = random_forest.predict(test_data)
#print(predictions)


In [12]:
predict = random_forest.predict(test_data_set)
predict
#Calculating Accuracy
CalcAccuracy(y_test, predict)

58739.12989257164