## Data Preprocessing

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import *

In [None]:
# Load the training and test CSV files of the house-prices dataset
train, test = (
    pd.read_csv(path)
    for path in ("house-prices/train.csv", "house-prices/test.csv")
)

In [None]:
# Remove every column that is null in more than a quarter of the training rows.
# The same columns are dropped from the test frame as well.
null_values = train.isna().sum()
to_delete = null_values[null_values > len(train) / 4]
train.drop(columns=list(to_delete.index), inplace=True)
test.drop(columns=list(to_delete.index), inplace=True)

In [None]:
# Separate the object-dtype (categorical) columns from the remaining (numerical) ones
categorical = [name for name, dtype in train.dtypes.items() if dtype == 'object']
train_num = train.drop(columns=categorical)
train_cat = train[categorical]

In [None]:
# Impute missing numerical values with the column mean
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
train_num = pd.DataFrame(imp.fit_transform(train_num), columns=train_num.columns)

# Impute missing categorical values with the most frequent category, then
# one-hot encode. The imputed array must be labelled with the categorical
# column names: using the numerical names here either fails on a length
# mismatch or silently mislabels the features.
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
train_cat = pd.DataFrame(imp.fit_transform(train_cat), columns=train_cat.columns)
train_cat = pd.get_dummies(train_cat)

# Join the two dataframes side by side (both carry a fresh default index)
train = pd.concat([train_num, train_cat], axis=1)

In [None]:
# Separate the target (SalePrice) from the feature matrix
train_values = train['SalePrice'].values
train_data = train.drop(columns=['SalePrice']).values

# Hold out 20% of the first 500 rows as the evaluation set
X_train, X_test, y_train, y_test = train_test_split(
    train_data[:500],
    train_values[:500],
    test_size=0.20,
    shuffle=True,
)

# The tree code reads features through `.values`, so wrap them in dataframes
X_train, X_test = pd.DataFrame(X_train), pd.DataFrame(X_test)

## Regression Tree
Much of this code was taken from this source: https://levelup.gitconnected.com/building-a-decision-tree-from-scratch-in-python-machine-learning-from-scratch-part-ii-6e2e56265b19


In [None]:
class Loss_Function:
    """Split criteria for a candidate binary partition of the targets.

    Each side of the split is predicted by the mean of its own targets.
    Every criterion method returns the error of the left side plus the error
    of the right side, computed with the scikit-learn metric of the same
    name. Lower is better.

    Attributes:
        left, right: observed target values on each side of the split.
        y_left, y_right: constant arrays holding each side's mean, i.e. the
            prediction a leaf would make for that side.
    """

    def __init__(self, left, right):
        self.left = left
        self.right = right
        self.y_left = np.array([np.mean(left)] * len(left))
        self.y_right = np.array([np.mean(right)] * len(right))

    def _split_error(self, metric):
        """Return ``metric`` summed over both sides of the split.

        scikit-learn metrics take ``(y_true, y_pred)``, so the observed
        targets go first and the per-side mean second. The order matters for
        the asymmetric deviance metrics (Poisson, gamma).
        """
        return metric(self.left, self.y_left) + metric(self.right, self.y_right)

    def mse(self):
        """Mean squared error."""
        return self._split_error(mean_squared_error)

    def mae(self):
        """Mean absolute error."""
        # Previously delegated to mean_squared_error, duplicating mse().
        return self._split_error(mean_absolute_error)

    def msle(self):
        """Mean squared logarithmic error."""
        return self._split_error(mean_squared_log_error)

    def median_ae(self):
        """Median absolute error."""
        return self._split_error(median_absolute_error)

    def mpd(self):
        """Mean Poisson deviance."""
        return self._split_error(mean_poisson_deviance)

    def mgd(self):
        """Mean gamma deviance."""
        return self._split_error(mean_gamma_deviance)

    def mtd(self):
        """Mean Tweedie deviance (scikit-learn's default power)."""
        return self._split_error(mean_tweedie_deviance)

    def me(self):
        """Maximum residual error."""
        return self._split_error(max_error)

In [None]:
class Node:
    """A regression-tree node that grows its own subtree on construction.

    Parameters:
        x: feature table; read through ``x.values`` and ``x.shape``.
        y: array of targets, indexed by row position.
        idxs: integer row positions that belong to this node.
        loss_fn: name of the ``Loss_Function`` method used to score splits.
        min_leaf: minimum number of rows each side of a split must keep.

    A node with no admissible split keeps ``score == inf`` and acts as a
    leaf predicting ``val``, the mean target of its rows. Otherwise
    ``var_idx`` / ``split`` describe the chosen test and ``lhs`` / ``rhs``
    hold the child nodes.
    """

    def __init__(self, x, y, idxs, loss_fn, min_leaf=5):
        self.x = x
        self.y = y
        self.loss_fn = loss_fn
        self.idxs = idxs
        self.min_leaf = min_leaf
        self.row_count = len(idxs)
        self.col_count = x.shape[1]
        self.val = np.mean(y[idxs])
        self.score = float('inf')
        self.find_varsplit()

    def find_varsplit(self):
        """Search every feature for the best split and build the children."""
        for column in range(self.col_count):
            self.find_better_split(column)

        if self.is_leaf:
            return

        # Positions (relative to this node's rows) falling on each side
        feature = self.split_col
        left_rows = np.nonzero(feature <= self.split)[0]
        right_rows = np.nonzero(feature > self.split)[0]
        self.lhs = Node(self.x, self.y, self.idxs[left_rows], self.loss_fn, self.min_leaf)
        self.rhs = Node(self.x, self.y, self.idxs[right_rows], self.loss_fn, self.min_leaf)

    def find_better_split(self, var_idx):
        """Try each value of feature ``var_idx`` in this node as a threshold.

        Records the threshold in ``split`` (with ``var_idx`` and ``score``)
        whenever it scores strictly lower than the best split seen so far.
        """
        feature = self.x.values[self.idxs, var_idx]

        for threshold in feature:
            goes_left = feature <= threshold
            goes_right = feature > threshold

            # Both sides must keep at least `min_leaf` rows
            if min(goes_left.sum(), goes_right.sum()) < self.min_leaf:
                continue

            candidate = self.find_score(goes_left, goes_right)
            if candidate < self.score:
                self.var_idx = var_idx
                self.score = candidate
                self.split = threshold

    def find_score(self, lhs, rhs):
        """Score a split given boolean masks for its left and right rows."""
        targets = self.y[self.idxs]
        criterion = Loss_Function(targets[lhs], targets[rhs])
        return getattr(criterion, self.loss_fn)()

    @property
    def split_col(self):
        """Values of the chosen split feature for this node's rows."""
        return self.x.values[self.idxs, self.var_idx]

    @property
    def is_leaf(self):
        """True when no admissible split was found for this node."""
        return self.score == float('inf')

    def predict(self, x):
        """Predict one value per row of the 2-D array ``x``."""
        return np.array([self.predict_row(row) for row in x])

    def predict_row(self, xi):
        """Walk from this node down to a leaf and return the leaf's value."""
        node = self
        while not node.is_leaf:
            node = node.lhs if xi[node.var_idx] <= node.split else node.rhs
        return node.val

In [None]:
class DecisionTreeRegressor:
    """Thin estimator-style wrapper around a recursively grown ``Node`` tree."""

    def fit(self, X, y, loss_fn, min_leaf = 5):
        """Grow the tree on all rows of ``X`` / ``y`` and return ``self``.

        ``loss_fn`` and ``min_leaf`` are passed straight through to ``Node``.
        """
        all_rows = np.arange(len(y))
        self.dtree = Node(X, y, all_rows, loss_fn, min_leaf)
        return self

    def predict(self, X):
        """Predict one value per row of ``X`` (read through ``X.values``)."""
        features = X.values
        return self.dtree.predict(features)

In [None]:
# Sample run: grow a tree with the 'mae' criterion and report R^2 on the held-out rows
regressor = DecisionTreeRegressor()
regressor = regressor.fit(X_train, y_train, 'mae')
preds = regressor.predict(X_test)
metrics.r2_score(y_test, preds)

In [None]:
# Held-out R^2 for each split criterion. Keys must match Loss_Function method
# names because Node resolves them with getattr; the median absolute error
# method is `median_ae` (a `median_se` key raises AttributeError).
functions = {'mse': 0, 'mae': 0, 'msle': 0, 'median_ae': 0, 'mpd': 0, 'mgd': 0, 'mtd': 0, 'me': 0}

for fn in functions:
    regressor = DecisionTreeRegressor().fit(X_train, y_train, fn)
    preds = regressor.predict(X_test)
    score = metrics.r2_score(y_test, preds)
    functions[fn] = score