In [97]:
from scipy.special import huber
class Node:

    def __init__(self, x, y, idxs, min_leaf=5):
        self.x = x 
        self.y = y
        self.idxs = idxs 
        self.min_leaf = min_leaf
        self.row_count = len(idxs)
        self.col_count = x.shape[1]
        self.val = np.mean(y[idxs])
        self.score = float('inf')
        self.find_varsplit()
        
    def find_varsplit(self):
        for c in range(self.col_count): self.find_better_split(c)
        if self.is_leaf: return
        x = self.split_col
        lhs = np.nonzero(x <= self.split)[0]
        rhs = np.nonzero(x > self.split)[0]
        self.lhs = Node(self.x, self.y, self.idxs[lhs], self.min_leaf)
        self.rhs = Node(self.x, self.y, self.idxs[rhs], self.min_leaf)
        
    def find_better_split(self, var_idx):
      
        x = self.x.values[self.idxs, var_idx]

        for r in range(self.row_count):
            lhs = x <= x[r]
            rhs = x > x[r]
            if rhs.sum() < self.min_leaf or lhs.sum() < self.min_leaf: continue

            curr_score = self.find_score(lhs, rhs)
            if curr_score < self.score: 
                self.var_idx = var_idx
                self.score = curr_score
                self.split = x[r]
                
    def find_score(self, lhs, rhs):
        y = self.y[self.idxs]
        left = y[lhs]
        right = y[rhs]
#         r_l = np.mean(left) - left
#         r_r = np.mean(right) - right
        return np.sum(np.log(np.cosh(np.mean(left) - left))) + np.sum(np.log(np.cosh(np.mean(right) - right)))
#         return np.mean(huber(100, r_l)) + np.mean(huber(100, r_r))
#         return np.mean(np.mean(left) - left) + 1.5* np.mean(np.mean(right) - right)
#         lhs_std = y[lhs].std()
#         rhs_std = y[rhs].std()
#         print(9)
#         return lhs_std * lhs.sum() + rhs_std * rhs.sum()
                
    @property
    def split_col(self): return self.x.values[self.idxs,self.var_idx]
                
    @property
    def is_leaf(self): return self.score == float('inf')                

    def predict(self, x):
        return np.array([self.predict_row(xi) for xi in x])

    def predict_row(self, xi):
        if self.is_leaf: return self.val
        node = self.lhs if xi[self.var_idx] <= self.split else self.rhs
        return node.predict_row(xi)

In [98]:
left = [10, 5, 10]
right = [15, 13, 24]
np.sum(np.log(np.cosh(np.mean(left) - left))) + np.sum(np.log(np.cosh(np.mean(right) - right)))

15.922027034802923

In [99]:
class DecisionTreeRegressor:
  
    def fit(self, X, y, min_leaf = 5):
        self.dtree = Node(X, y, np.array(np.arange(len(y))), min_leaf)
        return self

    def predict(self, X):
        return self.dtree.predict(X.values)

In [100]:
regressor = DecisionTreeRegressor().fit(X_train, y_train)
preds = regressor.predict(X_train)



In [101]:
# log
from sklearn import metrics
preds = regressor.predict(X_test)
metrics.r2_score(y_test, preds)

-0.0019696687624668474

In [82]:
# Huber delta
from sklearn import metrics
preds = regressor.predict(X_test)
metrics.r2_score(y_test, preds)

0.6994409555343706

In [66]:
# Penalize higher values more
from sklearn import metrics
preds = regressor.predict(X_test)
metrics.r2_score(y_test, preds)

0.5948540117299304

In [58]:
# Penalize lower values more
from sklearn import metrics
preds = regressor.predict(X_test)
metrics.r2_score(y_test, preds)

0.5760245501165678

In [50]:
# Median AE
from sklearn import metrics
preds = regressor.predict(X_test)
metrics.r2_score(y_test, preds)

0.7048962380664936

In [42]:
# MSLE
from sklearn import metrics
preds = regressor.predict(X_test)
metrics.r2_score(y_test, preds)

0.7312843315913936

In [36]:
# MAE
from sklearn import metrics
preds = regressor.predict(X_test)
metrics.r2_score(y_test, preds)

0.6994409555343706

In [32]:
# MSE
from sklearn import metrics
preds = regressor.predict(X_test)
metrics.r2_score(y_test, preds)

0.7606213544731929

In [28]:
# ME
from sklearn import metrics
preds = regressor.predict(X_test)
metrics.r2_score(y_test, preds)

0.5850524083592148

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

reg = RandomForestRegressor(
  n_estimators=1, 
  max_depth=2, 
  bootstrap=False, 
  random_state=10
)
reg.fit(X_train, y_train)

In [None]:
preds = reg.predict(X_test)
metrics.r2_score(y_test, preds)

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split

In [4]:
# Importing data
train = pd.read_csv("house-prices/train.csv")
test = pd.read_csv("house-prices/test.csv")

In [5]:
# Drop columns with significant number of null values
null_values = train.isnull().sum()
to_delete = null_values[null_values > (train.shape[0] / 4)]
train.drop(list(to_delete.index), axis=1, inplace=True)
test.drop(list(to_delete.index), axis=1, inplace=True)

In [6]:
# Divide into categorical and numerical features
categorical = [col for col in train.columns.values if train[col].dtype == 'object']
train_cat = train[categorical]
train_num = train.drop(categorical, axis=1)

In [7]:
# Impute missing numerical values
imp = SimpleImputer(missing_values=np.nan, strategy='mean')
train_num = pd.DataFrame(imp.fit_transform(train_num), columns=train_num.columns)

# Impute missing categorical values and one-hot encode
imp = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
train_cat = pd.DataFrame(imp.fit_transform(train_cat), columns=train_num.columns)
train_cat = pd.get_dummies(train_cat)

# Join two dataframes
train = pd.concat([train_num, train_cat], axis=1)

In [8]:
# Split into training and test datasets
train_data = train.drop(['SalePrice'], axis=1).values
train_values = train['SalePrice'].values
X_train, X_test, y_train, y_test = train_test_split(train_data[:500], train_values[:500], shuffle=True, test_size=0.20)

In [9]:
X_train = pd.DataFrame(X_train)

In [10]:
X_test = pd.DataFrame(X_test)