In [43]:
import pandas as pd
import numpy as np
import math
from copy import deepcopy
from scipy.stats import mode
from sklearn.model_selection import StratifiedKFold

# data = pd.read_csv('2.txt',sep='\t',names=list(range(5)))


def two_attribute_std(data, feature):
    """Size-weighted standard deviation of the target after grouping by one feature.

    Rows of ``data`` are grouped by the distinct values found in column
    ``feature``. For every group the population standard deviation of the last
    column (the target) is computed and weighted by the share of rows that fall
    into the group; the weighted values are summed.

    Parameters
    ----------
    data : 2-D numpy array whose last column is the target.
    feature : int, index of the column to group by.

    Returns
    -------
    The weighted sum of per-group standard deviations (``0`` when ``data`` has
    no rows, because there are no groups to add).
    """
    column = data[:, feature]
    n_rows = data.shape[0]
    weighted = 0
    for value in np.unique(column):
        group = data[column == value]
        share = group.shape[0] / n_rows
        weighted = weighted + (share * group[:, -1].std())
    return weighted
        
        
def sdr_calc(data, feature):
    """Standard deviation reduction obtained by splitting ``data`` on ``feature``.

    This is the standard deviation of the target (last column) over all rows
    minus the size-weighted standard deviation returned by
    ``two_attribute_std`` for the same feature. Larger values mean the feature
    separates the target into more homogeneous groups.
    """
    return data[:, -1].std() - two_attribute_std(data, feature)

def calc_cv(data):
    """Coefficient of variation of the target column, in percent.

    Computed as ``std / |mean| * 100`` over the last column of ``data``.

    The magnitude of the mean is used so the result is never negative: with a
    signed mean, a negative-mean target would yield a negative CV, which always
    compares as "below the threshold" and would wrongly mark the node as
    homogeneous.

    Edge cases
    ----------
    * ``std == 0`` (all targets identical, including all zeros) -> ``0.0``;
      the values do not vary at all.
    * ``mean == 0`` with ``std > 0`` -> ``inf``; the relative spread is
      unbounded, so the node is never considered homogeneous.

    Parameters
    ----------
    data : 2-D numpy array whose last column is the target.

    Returns
    -------
    float, the coefficient of variation expressed as a percentage.
    """
    target = data[:, -1]
    std = target.std()
    if std == 0:
        # Constant target: avoids 0/0 -> nan when the mean is zero as well.
        return 0.0
    mean = abs(target.mean())
    if mean == 0:
        # Spread around a zero mean has no finite relative size.
        return float("inf")
    return (std / mean) * 100

class Node:
    """A single node of the regression tree.

    Attributes
    ----------
    data : rows that reached this node; the last column is the target.
    parent : the node this one was split from (``None`` for the root).
    children : child nodes, filled in through ``add_children``.
    f_index : column index this node is split on (``None`` until assigned).
    f_value : value of the parent's split feature that leads to this node.
    count : a node holding this many rows or fewer is a leaf.
    threshold : a node whose coefficient of variation is at or below this
        percentage is a leaf.
    value : prediction stored on leaf nodes, ``None`` otherwise.
    leaf : result of ``isleaf()`` evaluated once at construction time.
    """

    def __init__(self, data, parent=None, f_index = None, f_value = None, threshold = 10, count = 3):
        self.data = data
        self.parent = parent
        self.f_index = f_index
        self.f_value = f_value
        self.threshold = threshold
        self.count = count
        self.children = []
        self.value = None
        # Must run last: isleaf() reads count/threshold and may set value.
        self.leaf = self.isleaf()

    def isleaf(self):
        """Decide whether this node is a leaf; on success also store ``value``.

        A node is a leaf when its data is 1-D, when at most one column is left
        (no feature to split on), when it holds ``count`` rows or fewer, or
        when ``calc_cv`` of its data is at or below ``threshold``.
        """
        rows = self.data
        # No feature columns remain: average everything that is left.
        if rows.ndim == 1 or rows.shape[1] <= 1:
            self.value = rows.mean()
            return True
        # Too few rows, or the target is already homogeneous enough.
        if rows.shape[0] <= self.count or calc_cv(rows) <= self.threshold:
            self.value = rows[:, -1].mean()
            return True
        return False

    def add_children(self, node):
        """Append ``node`` to this node's list of children."""
        self.children.append(node)

    def get_best_feature_to_split(self):
        """Return the index of the feature column with the largest positive SDR.

        Every column except the last (the target) is scored with ``sdr_calc``.
        Index ``0`` is returned when no column scores above zero.
        """
        winner, top_sdr = 0, 0
        for candidate in range(self.data.shape[1] - 1):
            gain = sdr_calc(self.data, candidate)
            if gain > top_sdr:
                winner, top_sdr = candidate, gain
        return winner

    def visited(self):
        """Return ``True`` once the node has at least one child."""
        return bool(self.children)

    
class DTRegression:
    """Multi-way regression tree for categorical features.

    The last column of ``data`` is the regression target. Every internal node
    is split on the feature chosen by ``Node.get_best_feature_to_split`` and
    receives one child per distinct value of that feature; the chosen column
    is removed from the data handed to the children.

    Parameters
    ----------
    data : 2-D numpy array, target in the last column.
    count : forwarded to every ``Node``; nodes with this many rows or fewer
        become leaves.
    threshold : forwarded to every ``Node``; nodes whose coefficient of
        variation (percent) is at or below this value become leaves.
    """

    def __init__(self, data, count=3, threshold=10):
        self.count = count
        self.threshold = threshold
        self.root = Node(data, count=self.count, threshold= self.threshold)
        # FIFO queue of nodes still to be expanded, and the nodes already handled.
        self.unvisited_nodes = [self.root]
        self.visited_nodes = []

    def fit(self):
        """Grow the tree breadth-first until no expandable node is left."""
        while self.unvisited_nodes: # While list is not Empty
            n = self.unvisited_nodes.pop(0)
            self.visited_nodes.append(n)
            if n.leaf:
                # Only the root can be queued while already being a leaf.
                # Splitting it would be wasted work (predict stops at a leaf)
                # and fails outright when its data has no feature column.
                continue
            best_feature = n.get_best_feature_to_split()
            n.f_index = best_feature
            self.split_and_make_children(n, best_feature)

    def split_and_make_children(self, node, f_index):
        """Create one child of ``node`` per distinct value in column ``f_index``.

        Each child receives the matching rows with column ``f_index`` removed.
        Leaf children go straight to ``visited_nodes``; the others are queued
        in ``unvisited_nodes`` for later expansion.
        """
        data = node.data
        unique_values = np.unique(data[:, f_index])
        for v in unique_values:
            new_data = data[data[:,f_index] == v]
            new_data = np.delete(new_data, f_index, axis=1)
            n = Node(new_data, parent=node, f_index=None, f_value=v, count=self.count, threshold= self.threshold)
            node.add_children(n)

            if n.leaf:
                self.visited_nodes.append(n)
            else:
                self.unvisited_nodes.append(n)

    def predict(self, X):
        """Return a list with one prediction per row of ``X``.

        Rows must use the same column order as the training data; any trailing
        target column is ignored because only feature positions are indexed.
        """
        return [self._predict(x) for x in X]

    def _predict(self, x):
        """Predict the target for a single sample ``x``.

        Walks down from the root, at every node following the child whose
        ``f_value`` equals the sample's value for the node's split feature and
        dropping that feature from ``x`` (children were built on data without
        that column, so their indices are shifted accordingly).

        If no child matches -- a feature value never seen while fitting, or a
        node that was never expanded -- the mean target of the rows stored in
        the current node is returned instead of searching forever.
        """
        current_node = self.root
        while not current_node.leaf:
            f_index = current_node.f_index
            match = None
            if f_index is not None:
                for ch in current_node.children:
                    if ch.f_value == x[f_index]:
                        match = ch
                        break
            if match is None:
                # Best available estimate: average of the deepest node reached.
                return current_node.data[:, -1].mean()
            x = np.delete(x, f_index)
            current_node = match
        return current_node.value

# EnjoySport Dataset Regression From Scratch

In [51]:
# Imported in this cell rather than with the imports at the top of the notebook.
from sklearn.metrics import mean_squared_error


# Tab-separated file read with explicit column labels 0..4; DTRegression treats
# the last column of the resulting array as the regression target.
data = pd.read_csv('2.txt',sep='\t',names=list(range(5))).to_numpy()
# count=3: nodes holding 3 rows or fewer become leaves; threshold keeps its default (10).
dtr = DTRegression(data, count=3)
dtr.fit()
# Predictions are made on the same rows the tree was built from, so the value
# printed below is a training error, not a held-out estimate.
preds = dtr.predict(data)
print('Enjoy sport MSE: ', mean_squared_error(data[:,-1], preds))

Enjoy sport MSE:  14.208333333333334


# Automobile Dataset Regression From Scratch

In [53]:
# Imported in this cell rather than with the imports at the top of the notebook.
from sklearn.model_selection import train_test_split
# Tab-separated file read with explicit column labels 0..11; the last column is the target.
# NOTE(review): this cell reads '31.txt' while the sklearn cells further down read '3.txt' -- confirm
# both refer to the intended dataset before comparing their errors.
data = pd.read_csv('31.txt',sep='\t',names=list(range(12))).to_numpy()


# 70/30 split. No random_state is passed, so the split (and every number printed
# below) changes from run to run.
train, test = train_test_split(data, test_size=0.3 )
print(test.shape)
# count=5: nodes with 5 rows or fewer become leaves; threshold=1: nodes whose CV is <= 1 % become leaves.
dtr = DTRegression(train, count=5, threshold=1)
dtr.fit()
# test rows still contain the target as their last column; the tree only indexes feature positions.
preds = dtr.predict(test)
print(mean_squared_error(test[:,-1], preds))
# Side-by-side frame of predictions and true targets; the sums give a rough check of overall bias.
dif = pd.DataFrame({'pred': preds, 'real':test[:,-1]})
print(dif['pred'].sum(), ', ', dif['real'].sum())

(657, 12)
2733208.48681576
8465942.878553528 ,  8363341


In [59]:
# Repeat the 70/30 hold-out evaluation 10 times with a fresh (unseeded) split each
# time and report mean and standard deviation of the test MSE.
results = []
for _ in range(10):
    train, test = train_test_split(data, test_size=0.3)
    # count=8: nodes with 8 rows or fewer become leaves; threshold keeps its default (10).
    dtr = DTRegression(train, count=8)
    dtr.fit()
    preds = dtr.predict(test)
    results.append(mean_squared_error(test[:,-1], preds))
print('Average + STD = ', np.mean(results), ' ± ', np.std(results))

Average + STD =  2781598.6418455904  ±  325501.93123312865


# Automobile Dataset Regression With DecisionTreeRegressor from Sklearn

In [20]:
# Imported in this cell rather than with the imports at the top of the notebook.
from sklearn.preprocessing import OrdinalEncoder

# Rebinds `data`: from here on it holds the encoded matrix built below, not the raw array.
data = pd.read_csv('3.txt',sep='\t',names=list(range(12))).to_numpy()
# Encode every feature column (all but the last) as ordinal numbers so the sklearn tree can consume them.
# The encoder is fitted on the full dataset, before the train/test split.
enc = OrdinalEncoder()
enc.fit(data[:, :-1])
d = enc.transform(data[:, :-1])
# Re-attach the untouched target as column index 11, i.e. after the 11 encoded feature columns.
data = np.insert(d, 11 ,data[:,-1], axis=1)

# 70/30 split without random_state: the split differs between runs.
train, test = train_test_split(data, test_size=0.3 )



# Imported in this cell rather than with the imports at the top of the notebook.
from sklearn.tree import DecisionTreeRegressor
# random_state=0 fixes the regressor's own randomness only; the split above is still unseeded.
regressor = DecisionTreeRegressor(random_state=0)
regressor.fit(train[:,:-1], train[:, -1])
preds = regressor.predict(test[:,:-1])
print(mean_squared_error(test[:,-1], preds))

39119908.91157408


In [21]:
# Repeat the 70/30 hold-out evaluation 10 times for the sklearn regressor (default
# settings, unseeded split and model) and report mean and standard deviation of the test MSE.
results = []
for _ in range(10):
    train, test = train_test_split(data, test_size=0.3)
    regressor = DecisionTreeRegressor()
    regressor.fit(train[:,:-1], train[:, -1])
    preds = regressor.predict(test[:,:-1])
    results.append(mean_squared_error(test[:,-1], preds))
print('Average + STD = ', np.mean(results), ' ± ', np.std(results))

Average + STD =  18002941.053611107  ±  8366768.374052806
