In [27]:
import pandas as pd 
import numpy as np 
from collections import Counter

from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split


In [11]:
data = pd.read_csv("Real-estate.csv",index_col=0)
data.columns = ['X1','X2','X3','X4','X5','X6','Y']

In [12]:
X = data.drop(data.columns[6],axis=1)
y = data.iloc[:,6]
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [13]:
def rss(y_left, y_right):
    def squared_residual_sum(y):
        r = y - np.mean(y)
        r = r ** 2
        r = np.sum(r)
        return r
    
    return squared_residual_sum(y_left) + squared_residual_sum(y_right)

In [14]:
def compute_rss_by_threshold(feature):
    features_rss = []
    # gather thresholds
    thresholds = X_train[feature].unique().tolist()
    #sort thresholds
    thresholds.sort()
    #for each
    for t in thresholds:
        
        y_left_ix = X_train[feature] < t
        y_left, y_right = y_train[y_left_ix], y_train[~y_left_ix]
        features_rss.append(rss(y_left, y_right))
    return thresholds, features_rss

In [15]:
def find_best_rule(X_train, y_train):
    best_feature, best_threshold, min_rss = None, None, np.inf
    for feature in X_train.columns:
        thresholds = X_train[feature].unique().tolist()
        thresholds.sort()
        thresholds = thresholds[1:]
        for t in thresholds:
            y_left_ix = X_train[feature] < t
            y_left, y_right = y_train[y_left_ix], y_train[~y_left_ix]
            t_rss = rss(y_left, y_right)
            if t_rss < min_rss:
                min_rss = t_rss
                best_threshold = t
                best_feature = feature
    
    return {'feature': best_feature, 'threshold': best_threshold}

In [16]:
def split(X_train, y_train, depth, max_depth):
    if depth == max_depth or len(X_train) < 2:
        return {'prediction': np.mean(y_train)}
    
    rule = find_best_rule(X_train, y_train)
    left_ix = X_train[rule['feature']] < rule['threshold']
    rule['left'] = split(X_train[left_ix], y_train[left_ix], depth + 1, max_depth)
    rule['right'] = split(X_train[~left_ix], y_train[~left_ix], depth + 1, max_depth)
    return rule

rules = split(X_train, y_train, 0, 3)

In [17]:
def predict(sample, rules):
    prediction = None
    while prediction is None:
        feature, threshold = rules['feature'], rules['threshold']
        if sample[feature] < threshold:
            rules = rules['left']
        else:
            rules = rules['right']
        prediction = rules.get('prediction', None)
    return prediction

In [25]:
def evaluate_r2(X, y):
    preds = X.apply(predict, axis='columns', rules=rules.copy())
    return r2_score(preds, y)

def evaluate_mse(X, y):
    preds = X.apply(predict, axis='columns', rules=rules.copy())
    return mean_squared_error(preds, y)

In [29]:
rules = split(X_train, y_train, 0, 3)

train_r2 = evaluate_r2(X_train, y_train)
test_r2 = evaluate_r2(X_test, y_test)

train_mse = evaluate_mse(X_train, y_train)
test_mse = evaluate_mse(X_test, y_test)

print(" Regression Tree:")
print("------------------")
print('Max Depth', 3, 'Training R2:', train_r2, 'Test R2:',test_r2)
print("------------------")
print('Max Depth', 3, 'Training mse:', train_mse, 'Test mse:',test_mse)

 Regression Tree:
------------------
Max Depth 3 Training R2: 0.680572872455621 Test R2: 0.4230872599515264
------------------
Max Depth 3 Training mse: 45.46795328977166 Test mse: 110.4858448470182


In [20]:
print(rules)

{'feature': 'X3', 'threshold': 837.7233, 'left': {'feature': 'X2', 'threshold': 11.8, 'left': {'feature': 'X6', 'threshold': 121.53058999999999, 'left': {'prediction': 117.5}, 'right': {'prediction': 52.37377049180329}}, 'right': {'feature': 'X3', 'threshold': 333.3679, 'left': {'prediction': 45.80754716981131}, 'right': {'prediction': 38.84390243902438}}}, 'right': {'feature': 'X5', 'threshold': 24.984070000000003, 'left': {'feature': 'X3', 'threshold': 4066.587, 'left': {'prediction': 25.66986301369862}, 'right': {'prediction': 16.97916666666667}}, 'right': {'feature': 'X2', 'threshold': 12.5, 'left': {'prediction': 42.38}, 'right': {'prediction': 34.75454545454546}}}}


In [21]:
import json
print(json.dumps(rules,indent=4))

{
    "feature": "X3",
    "threshold": 837.7233,
    "left": {
        "feature": "X2",
        "threshold": 11.8,
        "left": {
            "feature": "X6",
            "threshold": 121.53058999999999,
            "left": {
                "prediction": 117.5
            },
            "right": {
                "prediction": 52.37377049180329
            }
        },
        "right": {
            "feature": "X3",
            "threshold": 333.3679,
            "left": {
                "prediction": 45.80754716981131
            },
            "right": {
                "prediction": 38.84390243902438
            }
        }
    },
    "right": {
        "feature": "X5",
        "threshold": 24.984070000000003,
        "left": {
            "feature": "X3",
            "threshold": 4066.587,
            "left": {
                "prediction": 25.66986301369862
            },
            "right": {
                "prediction": 16.97916666666667
            }
        },
        "