In [168]:
import pandas as pd 

# Array math
import numpy as np 
from enum import Enum
import statistics

In [169]:
# Impute missing LotFrontage values with the column median and save the result
# to a separate file, leaving the raw training CSV untouched.
df = pd.read_csv('housing_price_train.csv')
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())
# The original cell called df.set_index('Id') here without keeping the result.
# set_index returns a new frame, so that call had no effect and is removed;
# 'Id' stays an ordinary column and is therefore still written out below.
df.to_csv('housing_price_train1.csv', index=False)

In [170]:
# Load the training data that gets encoded below.
# NOTE(review): this reads the raw 'housing_price_train.csv', not the
# 'housing_price_train1.csv' file written by the imputation cell — confirm
# which of the two is meant to feed the model.
df = pd.read_csv('housing_price_train.csv')
def df_to_numerical(df):
    """Return an all-integer version of ``df`` without modifying ``df``.

    Columns whose dtype is exactly ``int64`` are carried over under their
    original name. Every other column (strings, but also floats) is converted
    to a pandas categorical and replaced by its integer category codes under
    the name ``"<column>_Cat"``; missing values receive the code ``-1``.

    The codes are derived from the values present in ``df`` itself, so two
    different frames encoded separately do not necessarily share a mapping.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode. It is left untouched (the original implementation
        overwrote its non-integer columns with categorical ones).

    Returns
    -------
    pandas.DataFrame
        New frame with one column per input column, in the same order.
    """
    encoded = {}
    for col in df.columns:
        column = df[col]
        if column.dtype == "int64":
            encoded[col] = column
        else:
            # Encode on a temporary categorical instead of writing it back
            # into the caller's frame.
            new_col = '{val}_Cat'.format(val=col)
            encoded[new_col] = column.astype("category").cat.codes
    # Build the result in one go rather than inserting columns one at a time.
    return pd.DataFrame(encoded)
# Encode the training frame; the bare name on the last line displays the result.
numerical_df = df_to_numerical(df)
numerical_df

Unnamed: 0,Id,MSSubClass,MSZoning_Cat,LotFrontage_Cat,LotArea,Street_Cat,Alley_Cat,LotShape_Cat,LandContour_Cat,Utilities_Cat,...,PoolArea,PoolQC_Cat,Fence_Cat,MiscFeature_Cat,MiscVal,MoSold,YrSold,SaleType_Cat,SaleCondition_Cat,SalePrice
0,1,60,3,36,8450,1,-1,3,3,0,...,0,-1,-1,-1,0,2,2008,8,4,208500
1,2,20,3,51,9600,1,-1,3,3,0,...,0,-1,-1,-1,0,5,2007,8,4,181500
2,3,60,3,39,11250,1,-1,0,3,0,...,0,-1,-1,-1,0,9,2008,8,4,223500
3,4,70,3,31,9550,1,-1,0,3,0,...,0,-1,-1,-1,0,2,2006,8,0,140000
4,5,60,3,55,14260,1,-1,0,3,0,...,0,-1,-1,-1,0,12,2008,8,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,33,7917,1,-1,3,3,0,...,0,-1,-1,-1,0,8,2007,8,4,175000
1456,1457,20,3,56,13175,1,-1,3,3,0,...,0,-1,2,-1,0,2,2010,8,4,210000
1457,1458,70,3,37,9042,1,-1,3,3,0,...,0,-1,0,2,2500,5,2010,8,4,266500
1458,1459,20,3,39,9717,1,-1,3,3,0,...,0,-1,-1,-1,0,4,2010,8,4,142125


In [171]:
class DecisionNode:
    """One node of a decision tree.

    A leaf carries ``value``; a decision node carries the split description
    (``feature_idx``, ``threshold``), its two children and the ``mse`` that
    was stored for the split. All fields default to ``None``.
    """

    def __init__(self, feature_idx=None, threshold=None, value=None, left_branch=None, right_branch=None, mse=None):
        # Split description: which feature is tested and against which threshold.
        self.feature_idx, self.threshold = feature_idx, threshold
        # Children: left is followed when the decision is True, right when it is False.
        self.left_branch, self.right_branch = left_branch, right_branch
        # Prediction stored on the node when it is a leaf.
        self.value = value
        # Error figure attached to the node by whoever builds the tree.
        self.mse = mse


In [172]:
class DecisionTree():
    """Regression tree grown greedily by minimising the MSE of each split.

    Parameters
    ----------
    min_samples_split : int
        A node holding fewer samples than this becomes a leaf.
    max_depth : int
        A node is only split while ``curr_depth <= max_depth`` (the root is
        depth 0), so the tree can contain up to ``max_depth + 1`` levels of
        decision nodes.
    verbose : bool
        When True (the default, matching the historical behaviour) the number
        of samples and features of every visited node is printed while fitting.

    Attributes
    ----------
    root : DecisionNode or None
        Root of the fitted tree, ``None`` before ``load_model`` is called.
    ymean, mse : float or None
        Mean of the training targets and MSE of predicting that mean, i.e.
        the statistics of the root node.
    """

    def __init__(self, min_samples_split=2, max_depth=2, verbose=True):
        self.root = None
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.verbose = verbose
        self.mse = None
        self.ymean = None

    def load_model(self, X, y):
        """Fit the tree on features ``X`` (n_samples x n_features) and targets ``y``.

        Both arguments may be pandas objects or anything ``np.asarray`` accepts.
        """
        X = np.asarray(X)
        y = np.asarray(y).reshape(-1, 1)
        # The target travels as the last column so rows can be split as a unit.
        dataset = np.concatenate((X, y), axis=1)
        self.root = self.build_tree(dataset)

    def build_tree(self, df, curr_depth=0):
        """Recursively build the subtree for ``df`` and return its root node.

        ``df`` is a 2-D array whose last column is the target.
        """
        X = df[:, :-1]
        Y = df[:, -1]

        node_mean = np.mean(Y)
        node_mse = self.get_mse(Y, node_mean)
        if curr_depth == 0:
            # Only the root describes the whole training set. Writing these on
            # every recursive call left the attributes describing whichever
            # node happened to be visited last.
            self.ymean = node_mean
            self.mse = node_mse

        num_samples, num_features = np.shape(X)
        if self.verbose:
            print(num_samples, num_features)

        # split until stopping conditions are met
        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            best_split = self._find_best_split(df, num_features, node_mse)
            # None means no candidate lowered the MSE (e.g. constant target);
            # fall through to a leaf instead of subscripting None.
            if best_split is not None:
                left_subtree = self.build_tree(best_split["dataset_left"], curr_depth + 1)
                right_subtree = self.build_tree(best_split["dataset_right"], curr_depth + 1)
                return DecisionNode(feature_idx=best_split["feature_index"], threshold=best_split["threshold"],
                                    left_branch=left_subtree, right_branch=right_subtree, mse=best_split["mse_split"])

        # leaf node: predict the mean target of the samples that reached it
        return DecisionNode(value=self.calculate_leaf_value(Y))

    def _find_best_split(self, dataset, num_features, parent_mse):
        """Return the split with the lowest MSE strictly below ``parent_mse``, or None."""
        targets = dataset[:, -1]
        best_mse = parent_mse
        best_feature = None
        best_threshold = None
        for feature_index in range(num_features):
            feature_values = dataset[:, feature_index]
            # every distinct value present in the data is a candidate threshold
            for threshold in np.unique(feature_values):
                goes_left = feature_values <= threshold
                left_y = targets[goes_left]
                right_y = targets[~goes_left]
                # a split that leaves one side empty is not a split
                if len(left_y) == 0 or len(right_y) == 0:
                    continue
                residuals = np.concatenate((left_y - np.mean(left_y), right_y - np.mean(right_y)), axis=None)
                mse_split = np.mean(residuals ** 2)
                if mse_split < best_mse:
                    best_mse = mse_split
                    best_feature = feature_index
                    best_threshold = threshold
        if best_feature is None:
            return None
        # Only the winning split is materialised; candidates were scored on the
        # target column alone.
        dataset_left, dataset_right = self.split(dataset, best_feature, best_threshold)
        return {"feature_index": best_feature,
                "threshold": best_threshold,
                "mse_split": best_mse,
                "dataset_left": dataset_left,
                "dataset_right": dataset_right}

    def split(self, dataset, feature_index, threshold):
        """Partition the rows of ``dataset`` on ``feature <= threshold``.

        Returns ``(dataset_left, dataset_right)``. Rows whose feature value is
        not ``<= threshold`` (including NaN) go right, which is the branch
        ``apply_model`` follows for such a value; no row is dropped.
        """
        dataset = np.asarray(dataset)
        if dataset.size == 0:
            # nothing to partition; keeps the empty-input behaviour of the
            # row-by-row implementation
            return dataset, dataset
        goes_left = dataset[:, feature_index] <= threshold
        return dataset[goes_left], dataset[~goes_left]

    def get_mse(self, y, y_hat):
        """Mean squared error between ``y`` and ``y_hat``."""
        return np.mean((y - y_hat) ** 2)

    def calculate_leaf_value(self, Y):
        """Prediction stored in a leaf: the mean of the targets that reached it."""
        return np.mean(Y)

    def print_tree(self, tree=None, indent=" "):
        """Print the tree rooted at ``tree`` (the fitted root when omitted)."""
        if tree is None:
            tree = self.root

        if tree.value is not None:
            print(tree.value)
        else:
            print("X_"+str(tree.feature_idx), "<=", tree.threshold, "?", tree.mse)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left_branch, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right_branch, indent + indent)

    def apply_model(self, x, tree):
        """Route one sample ``x`` down ``tree`` and return the leaf value it reaches."""
        if tree.value is not None:
            return tree.value
        if x[tree.feature_idx] <= tree.threshold:
            return self.apply_model(x, tree.left_branch)
        return self.apply_model(x, tree.right_branch)

    def predict(self, X):
        """Return a list with one prediction per row of ``X``.

        ``X`` is converted with ``np.asarray`` first: iterating a DataFrame
        directly would yield its column labels rather than its rows.
        """
        return [self.apply_model(row, self.root) for row in np.asarray(X)]

In [173]:
# Features are every column except the last one; the target is the last column.
# Both stay pandas objects here.
# NOTE(review): the 'Id' column is still among the features — confirm that is intended.
X_train, y_train = numerical_df.iloc[:, :-1], numerical_df.iloc[:, -1]

In [174]:
# Fit a tree with the default hyper-parameters (min_samples_split=2, max_depth=2).
# While fitting, build_tree prints "<num_samples> <num_features>" for each node it visits.
dt = DecisionTree()
dt.load_model(X_train, y_train)

1460 80
1231 80
912 80
564 80
348 80
319 80
239 80
80 80
229 80
168 80
103 80
65 80
61 80
58 80
3 80


In [175]:
# Show the fitted tree: decision nodes print "X_<feature index> <= <threshold> ? <split MSE>",
# leaves print their stored value.
dt.print_tree()

X_17 <= 7 ? 3441133618.712952
 left:X_17 <= 6 ? 1556530001.5802906
  left:X_46 <= 1376 ? 1023363183.1749283
    left:124907.77836879433
    right:165466.08908045976
  right:X_46 <= 1935 ? 1428377106.2897131
    left:194238.74476987447
    right:247980.9875
 right:X_17 <= 8 ? 6364467959.870638
  left:X_46 <= 1970 ? 3041012927.3791227
    left:249392.46601941748
    right:314894.55384615384
  right:X_44 <= 1538 ? 7423102286.750613
    left:371942.2586206897
    right:708333.3333333334


In [176]:
# Load the test file and encode it with the same helper used for the training data.
# NOTE(review): df_to_numerical derives category codes (and decides which columns
# are int64) from whichever frame it is given, so the encoding produced here is
# independent of the one produced for the training frame — confirm that columns
# and codes actually line up before trusting the predictions.
X_test = pd.read_csv('housing_price_test.csv')
X_test = df_to_numerical(X_test).values


In [177]:
# Predict one value per test row; the returned list is displayed in full.
dt.predict(X_test) 

[124907.77836879433,
 124907.77836879433,
 165466.08908045976,
 165466.08908045976,
 249392.46601941748,
 165466.08908045976,
 124907.77836879433,
 165466.08908045976,
 194238.74476987447,
 124907.77836879433,
 194238.74476987447,
 124907.77836879433,
 124907.77836879433,
 165466.08908045976,
 194238.74476987447,
 371942.2586206897,
 249392.46601941748,
 371942.2586206897,
 249392.46601941748,
 371942.2586206897,
 314894.55384615384,
 249392.46601941748,
 124907.77836879433,
 194238.74476987447,
 194238.74476987447,
 194238.74476987447,
 314894.55384615384,
 249392.46601941748,
 194238.74476987447,
 165466.08908045976,
 194238.74476987447,
 124907.77836879433,
 165466.08908045976,
 314894.55384615384,
 314894.55384615384,
 194238.74476987447,
 194238.74476987447,
 194238.74476987447,
 194238.74476987447,
 124907.77836879433,
 165466.08908045976,
 194238.74476987447,
 249392.46601941748,
 249392.46601941748,
 194238.74476987447,
 165466.08908045976,
 165466.08908045976,
 165466.08908045