In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the training data; the path is relative to the notebook's working directory.
df = pd.read_csv('housing_price_train.csv')
def df_to_numerical(df):
    """Return an integer-encoded copy of ``df``.

    Columns whose dtype is exactly ``int64`` are carried over unchanged under
    their original name. Every other column (strings, floats, ...) is
    converted to a pandas categorical and replaced by its integer category
    codes, stored under the name ``<col>_Cat``. Missing values receive the
    code ``-1``.

    The input frame is left untouched; earlier revisions overwrote the
    non-integer columns of ``df`` with their categorical version as a side
    effect.

    NOTE(review): the codes are derived from the categories present in the
    frame passed in, so encoding two frames separately (e.g. train and test)
    does not guarantee that equal codes mean equal categories.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to encode.

    Returns
    -------
    pandas.DataFrame
        New frame with one column per input column, in the same order.
    """
    # Collect the columns first and build the frame once, instead of
    # inserting into an empty DataFrame column by column.
    encoded = {}
    for col in df.columns:
        column = df[col]
        if column.dtype == "int64":
            encoded[col] = column
        else:
            new_col = '{val}_Cat'.format(val=col)
            # astype returns a new Series, so the caller's frame is not modified.
            encoded[new_col] = column.astype("category").cat.codes
    return pd.DataFrame(encoded)
# Encode the training frame; the bare name on the last line makes the
# notebook render the resulting frame as the cell output.
numerical_df = df_to_numerical(df)
numerical_df

Unnamed: 0,Id,MSSubClass,MSZoning_Cat,LotFrontage_Cat,LotArea,Street_Cat,Alley_Cat,LotShape_Cat,LandContour_Cat,Utilities_Cat,...,PoolArea,PoolQC_Cat,Fence_Cat,MiscFeature_Cat,MiscVal,MoSold,YrSold,SaleType_Cat,SaleCondition_Cat,SalePrice
0,1,60,3,36,8450,1,-1,3,3,0,...,0,-1,-1,-1,0,2,2008,8,4,208500
1,2,20,3,51,9600,1,-1,3,3,0,...,0,-1,-1,-1,0,5,2007,8,4,181500
2,3,60,3,39,11250,1,-1,0,3,0,...,0,-1,-1,-1,0,9,2008,8,4,223500
3,4,70,3,31,9550,1,-1,0,3,0,...,0,-1,-1,-1,0,2,2006,8,0,140000
4,5,60,3,55,14260,1,-1,0,3,0,...,0,-1,-1,-1,0,12,2008,8,4,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,3,33,7917,1,-1,3,3,0,...,0,-1,-1,-1,0,8,2007,8,4,175000
1456,1457,20,3,56,13175,1,-1,3,3,0,...,0,-1,2,-1,0,2,2010,8,4,210000
1457,1458,70,3,37,9042,1,-1,3,3,0,...,0,-1,0,2,2500,5,2010,8,4,266500
1458,1459,20,3,39,9717,1,-1,3,3,0,...,0,-1,-1,-1,0,4,2010,8,4,142125


In [3]:
class Node:
    """One node of the decision tree.

    All fields default to ``None``. The tree code in this notebook creates
    internal nodes with the split fields and children filled in, and leaf
    nodes with only ``value`` set.
    """

    def __init__(self, feature_index=None, threshold=None, left=None, right=None, var_red=None, value=None):
        # Split description: which feature column is tested and against what value.
        self.feature_index, self.threshold = feature_index, threshold
        # Child nodes for the two outcomes of the test.
        self.left, self.right = left, right
        # Variance reduction recorded for the split.
        self.var_red = var_red
        # Prediction stored at the node.
        self.value = value

In [4]:
class DecisionTree():
    """Regression tree grown greedily by variance reduction.

    Each internal node tests ``x[feature_index] <= threshold``; rows that
    satisfy the test go to the left child, the others to the right child.
    A leaf predicts the mean target of the training rows that reached it.

    Parameters
    ----------
    min_samples_split : int
        A node holding fewer rows than this is turned into a leaf.
    max_depth : int
        Nodes at depth ``0 .. max_depth`` (root is depth 0) may be split, so
        leaves can sit as deep as ``max_depth + 1``.

    Attributes
    ----------
    root : Node or None
        Root of the fitted tree; ``None`` until ``build_tree`` has run.
    dataset : numpy.ndarray or None
        Training matrix with the target stored as the last column.
    curr_depth : int
        Depth of the deepest node visited during the last ``build_tree`` call.
    """

    def __init__(self, min_samples_split=2, max_depth=2):
        self.root = None
        self.dataset = None
        self.curr_depth = 0
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def load_data(self, X, Y):
        """Store features and target side by side in ``self.dataset``.

        ``Y`` may be a column vector of shape ``(n, 1)`` or a flat vector of
        length ``n``; it always ends up as the last column.
        """
        X = np.asarray(X)
        Y = np.asarray(Y)
        if Y.ndim == 1:
            # A flat target cannot be concatenated along axis 1 directly.
            Y = Y.reshape(-1, 1)
        self.dataset = np.concatenate((X, Y), axis=1)

    def build_tree(self, X, Y):
        """Fit the tree on features ``X`` and target ``Y``; result is kept in ``self.root``."""
        self.load_data(X, Y)
        self.curr_depth = 0
        dataset = self.dataset[:, :-1]
        target = self.dataset[:, -1]

        self.root = self.building(dataset, target, curr_depth=0)

    def building(self, dataset, target, curr_depth=0):
        """Recursively grow the subtree for the given rows and return its root node.

        Parameters
        ----------
        dataset : numpy.ndarray
            Feature matrix (no target column) of the rows reaching this node.
        target : numpy.ndarray
            Target values of those rows, flat or as a column vector.
        curr_depth : int
            Depth of the node being built (root is 0).
        """
        num_samples, num_features = np.shape(dataset)
        self.curr_depth = max(self.curr_depth, curr_depth)

        if num_samples >= self.min_samples_split and curr_depth <= self.max_depth:
            # get_best_split expects the target as the last column, and it
            # must see only the rows of this node, not the full training set.
            node_data = np.column_stack((dataset, target))
            best_split = self.get_best_split(node_data, num_samples, num_features)
            # best_split is empty when no threshold separates the rows
            # (e.g. every feature is constant), hence .get with a default.
            if best_split.get("var_red", 0) > 0:
                rows_left = best_split["dataset_left"]
                rows_right = best_split["dataset_right"]
                # The depth is passed down explicitly so that the right
                # subtree is not affected by how deep the left one grew.
                left_subtree = self.building(rows_left[:, :-1], rows_left[:, -1], curr_depth + 1)
                right_subtree = self.building(rows_right[:, :-1], rows_right[:, -1], curr_depth + 1)
                return Node(best_split["feature_index"], best_split["threshold"],
                            left_subtree, right_subtree, best_split["var_red"])

        leaf_value = self.calculate_leaf_value(target)
        return Node(value=leaf_value)

    def get_best_split(self, dataset, num_samples, num_features):
        """Search every feature/threshold pair for the largest variance reduction.

        Parameters
        ----------
        dataset : numpy.ndarray
            Rows to split, with the target as the last column.
        num_samples : int
            Number of rows; kept for interface compatibility, not used.
        num_features : int
            Number of leading columns to consider as features.

        Returns
        -------
        dict
            Keys ``feature_index``, ``threshold``, ``dataset_left``,
            ``dataset_right`` and ``var_red`` for the best split, or an empty
            dict when no threshold leaves rows on both sides.
        """
        best_split = {}
        max_var_red = -float("inf")
        y = dataset[:, -1]
        for feature_index in range(num_features):
            # Every distinct value present in the data is a candidate threshold.
            possible_thresholds = np.unique(dataset[:, feature_index])
            for threshold in possible_thresholds:
                dataset_left, dataset_right = self.split(dataset, feature_index, threshold)
                # A split that leaves one side empty separates nothing.
                if len(dataset_left) == 0 or len(dataset_right) == 0:
                    continue
                curr_var_red = self.variance_reduction(y, dataset_left[:, -1], dataset_right[:, -1])
                if curr_var_red > max_var_red:
                    best_split = {
                        "feature_index": feature_index,
                        "threshold": threshold,
                        "dataset_left": dataset_left,
                        "dataset_right": dataset_right,
                        "var_red": curr_var_red,
                    }
                    max_var_red = curr_var_red

        return best_split

    def split(self, dataset, feature_index, threshold):
        """Partition rows into ``<= threshold`` (left) and ``> threshold`` (right)."""
        column = dataset[:, feature_index]
        # Two explicit masks (rather than mask / ~mask) so that a row whose
        # value compares False both ways, such as NaN, lands on neither side.
        dataset_left = dataset[column <= threshold]
        dataset_right = dataset[column > threshold]
        return dataset_left, dataset_right

    def variance_reduction(self, parent, l_child, r_child):
        """Parent variance minus the size-weighted variance of the two children."""
        weight_l = len(l_child) / len(parent)
        weight_r = len(r_child) / len(parent)
        reduction = np.var(parent) - (weight_l * np.var(l_child) + weight_r * np.var(r_child))
        return reduction

    def calculate_leaf_value(self, Y):
        """Prediction of a leaf: the mean of the targets that reached it."""
        val = np.mean(Y)
        return val

    def print_tree(self, tree=None, indent=" "):
        """Print the subtree rooted at ``tree`` (default: the whole tree)."""
        if tree is None:
            tree = self.root

        if tree.value is not None:
            print(tree.value)
        else:
            print("X_"+str(tree.feature_index), "<=", tree.threshold, "?", tree.var_red)
            print("%sleft:" % (indent), end="")
            self.print_tree(tree.left, indent + indent)
            print("%sright:" % (indent), end="")
            self.print_tree(tree.right, indent + indent)

    def apply_model(self, x, tree):
        """Route one sample ``x`` down the subtree ``tree`` and return the leaf value."""
        if tree.value is not None:
            return tree.value
        feature_val = x[tree.feature_index]
        # Descend into the matching child; recursing on ``tree`` itself
        # would never terminate.
        if feature_val <= tree.threshold:
            return self.apply_model(x, tree.left)
        return self.apply_model(x, tree.right)

    def predict(self, X):
        """Return a list with one prediction per row of ``X``.

        Raises
        ------
        RuntimeError
            If ``build_tree`` has not been called yet.
        """
        if self.root is None:
            raise RuntimeError("build_tree must be called before predict")
        return [self.apply_model(x, self.root) for x in X]

In [5]:
# Features: every column of the encoded frame except the last one.
# NOTE(review): the displayed frame lists `Id` as its first column, so the row
# identifier is part of the feature matrix — confirm that is intended.
X_train = numerical_df.iloc[:, :-1].values
# Target: the last column, reshaped to a column vector so that
# DecisionTree.load_data can concatenate it to the features along axis 1.
y_train = numerical_df.iloc[:, -1].values.reshape(-1,1)

In [6]:
# Configure the tree: nodes with fewer than 3 rows are not split, and the
# depth counter passed to the builder is compared against max_depth=3.
tree = DecisionTree(min_samples_split=3, max_depth=3)
# build_tree returns nothing; the fitted tree is stored on tree.root.
tree.build_tree(X_train, y_train)

In [7]:
# Load the test data; the path is relative to the notebook's working directory.
X_test = pd.read_csv('housing_price_test.csv')
# Encode with the same helper used for the training frame and drop to a plain array.
# NOTE(review): df_to_numerical derives category codes from each frame
# separately, so a code here is not guaranteed to denote the same category as
# in the training frame — confirm the two encodings line up.
X_test = df_to_numerical(X_test).values
# One prediction per test row, returned as a Python list.
Y_pred = tree.predict(X_test) 
Ypred = pd.Series(Y_pred)
# NOTE(review): the Series carries the default 0..n-1 index, which is what
# gets written next to the predictions — confirm this is the intended layout.
Ypred.to_csv('samples.csv')