# Implementing Decision tree from Scratch

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score, precision_score, accuracy_score

In [2]:
!pip install ucimlrepo
from ucimlrepo import fetch_ucirepo 
# fetch dataset 
adult = fetch_ucirepo(id=2)   
# data (as pandas dataframes) 
dataset = adult.data.original

print(dataset.shape)
# metadata 
print(adult.metadata) 

# variable information 
print(adult.variables) 

(48842, 15)
{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether income exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Mon Aug 07 2023', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': 'Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the following condi

In [3]:
target_column = 'income'
test_size = 0.2
random_state = 42

In [17]:
def remove_nans(df):
    '''
    this fucntion removes rows with nans 
    '''
    return df.dropna()


def numerical_to_categorical(df, n=2, ignore=[target_column]):
    '''
    change the type of the column to categorical 
    if it has <= n unique values
    '''
    # TODO
    df_dropped =df.drop(columns=ignore, axis=1)
    df_converted = df_dropped.apply(lambda x: x.astype('category')\
                                            if (str(x.dtype)=='int64' )\
                                            and (len(x.unique())<=n) else x)
    for col in ignore:
        df_converted[col] = df[col]
    return df_converted


def remove_columns_by_n(df, condition='pass', n=10, direction='less', 
                        ignore=[target_column]):
    '''
    Remove columns with more or less than n unique values. 
    Usually it makes sense to apply this function to columns with categorical values
    (see below where it is called).
    With the default values we remove all numerical columns which have
    less than 10 unique values (except for the target_column).
    '''
    if ignore is None:
        ignore = []
    
    # Ensure ignored columns are not processed
    df_ignored = df.drop(columns=ignore, axis=1)
    
    columns_to_drop = []
    for col in df_ignored.columns:
        unique_count = len(df_ignored[col].unique())
        if direction == 'less' and unique_count <= n and str(df_ignored[col].dtype) =='int64' :
            columns_to_drop.append(col)
        elif direction == 'more' and unique_count >= n and str(df_ignored[col].dtype) =='category':
            columns_to_drop.append(col)
    
    df_dropped = df.drop(columns=columns_to_drop)
    
    # Adding the ignored columns back if they were dropped
    for col in ignore:
        if col in df.columns:
            df_dropped[col] = df[col]
    
    return df_dropped


def object_to_categorical(df):
    '''
    Make columns with the 'object' type categorical 
    and replace categories with label encodings
    '''
    # TODO
    return df.apply(lambda x : x.astype('category') if str(x.dtype)=='object' else x)

In [5]:
df = dataset
df = remove_nans(df)
#TODO-SELF:in ds some classes has dot at the end i.e <=50K. that makes to have 4 unique values
#but we should have 2 because our target is binary
df.loc[:, target_column] = df[target_column].apply(lambda x: x.replace('.', '') if isinstance(x, str) and '.' in x else x)
df = numerical_to_categorical(df, n=2, ignore=[target_column])
df = remove_columns_by_n(df, n=10, direction='less', ignore=[target_column], condition='pass')
df = object_to_categorical(df)
df = remove_columns_by_n(df, n=40, direction='more',ignore=[target_column], condition='pass')
# #TODO-SELF: label encoding 
from sklearn.preprocessing import LabelEncoder                        
encoder = LabelEncoder()
# Encode the categorical column
df=df.apply(lambda x: encoder.fit_transform(x))
assert not df.isna().any().any(), 'There are still nans in the dataframe'

In [6]:
# TODO : make train-test split from the dataframe using the parameters above
# expected results variable names - train_X, test_X, train_y, test_y

X = df.loc[:, df.columns != target_column]
y = df[target_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=random_state, stratify=y
)
print(X.shape)

(47621, 13)


In [7]:
# n_samples = 100000 # our implementation took ~1 min with this amount of samples, you can reduce the number if neccessary
n_samples = 10000
X_train_tree, X_test_tree, y_train_tree, y_test_tree = X_train.to_numpy()[:n_samples], X_test.to_numpy(), y_train.to_numpy()[:n_samples], y_test.to_numpy()

In [8]:
def gini_impurity(y, labels=(0, 1)):
    gini=0
    for k in labels:
         #calculate proportion
        indicator = y == k
        #count I(tn =k)
        true_cases = np.sum(indicator)
        p = true_cases / len(y)
        gini += p*(1-p)
    return gini
        

def weighted_difference(y, split, labels=(0, 1)):
    #weighted average
    l_a_weight = (np.sum(split)/len(y))
    #gini of left side of split
    gini_minus = gini_impurity(y[split])
    # not symbol ~ corresponds the other side of split
    l_b_weight = (np.sum(~split)/len(y))
    gini_posit = gini_impurity(y[~split])
    weighted_diff = gini_impurity(y)-(l_a_weight*gini_minus)-(l_b_weight*gini_posit)
    import math
    return weighted_diff if not math.isnan(weighted_diff )else float('-inf')

In [9]:
print(gini_impurity(y_train_tree, labels=(0, 1)))

split = X_train_tree[:,0] < 40
print(weighted_difference(y_train_tree, split))


0.36417528
0.0005766475406871335


In [10]:
# for each new node to be considred we calculate the exaustive search for the node , and i guess for the last node leading
# to leafs we just calculate gini_impurity
def exhaustive_search(x, y):
    '''
        for each node it provides full list of possible midpoints and their impurities
    '''
    # midpoints (potential split points)
    midpoint_ls = [np.unique(x[:,idx]) for idx in range(x.shape[1])]

    # which feature each midpoints/impurities corresponds to.
    feature_array = np.hstack([np.full(len(midpt_per_feat), idx) for idx,midpt_per_feat in enumerate(midpoint_ls)])
    
    #numpying and flattening midpoints_ls  
    midpoint_array = np.hstack(midpoint_ls)
    
    #impurity reduction values corresponding to each midpoint.        
    impurity_array = np.array([weighted_difference(y,x[:,feature_array[idx]] >= midpoint)
                              for idx,midpoint in enumerate(midpoint_array)],dtype='float32')
             
    return feature_array,midpoint_array,impurity_array
                                  
feat_arr,midpoint_arr,impurt_arr = exhaustive_search(X_train_tree,y_train_tree)

def find_best_split(impurities_array, midpoints_array, features_array, verbose=False):
    
    #idx of best impurity reduction score
    optimal_idx = np.argmax(impurities_array)
    feature_idx = features_array[optimal_idx]
    best_midpoint = midpoints_array[optimal_idx]
    return (feature_idx,best_midpoint)
    
best_split= find_best_split(impurt_arr, midpoint_arr, feat_arr, verbose=False)
print(best_split)

  p = true_cases / len(y)


(7, 1)


In [12]:
class Node:
    def __init__(self, left, right, n_feat, threshold):
        self.left = left
        self.right = right
        self.n_feat = n_feat #prpbablity it means the idx of feature chosen
        self.threshold = threshold


class Leaf:
    def __init__(self, label):
        self.label = label

In [13]:
# TODO: implement the function build_tree and predict_tree

# Implement recursive tree function
#QUESTIONS: what is the output of buildtree ? i think it may be a Node object which has nested nodes and leaf (i.e in self.left / self.right)
def build_tree(x, y, current_depth, max_depth=3, n_labels=2):
    if current_depth >= max_depth:
        leaf =Leaf(np.argmax(np.bincount(y)))
        return leaf
        
    feat_arr,midpoint_arr,impurt_arr = exhaustive_search(x,y)
    feat_idx,threshold = find_best_split(impurt_arr, midpoint_arr, feat_arr)
    print(feat_idx,threshold)
    split_mask = x[:,feat_idx]<threshold
    
    left_x = x[split_mask]
    left_y = y[split_mask]
    right_x = x[~split_mask]
    right_y = y[~split_mask]
    tree = Node(build_tree(left_x, left_y, current_depth=current_depth+1),
                build_tree(right_x, right_y, current_depth=current_depth+1),feat_idx,threshold)
    return tree
    
def show_tree(tree,):
    'DFS based construction of tree'
    pass


def predict_tree(node, x):
    while True:
        if node.__class__.__name__ == 'Leaf':
            return node.label
            
        feat_idx,threshold = node.n_feat,node.threshold
        is_left = x[feat_idx] <threshold
        if is_left :
            node = node.left
            continue
        node = node.right
    

In [14]:
# Build tree
tree = build_tree(X_train_tree,y_train_tree,0)

apply_pred = lambda x_dpoint : predict_tree(tree, x_dpoint)
predictions_train = np.apply_along_axis(apply_pred, axis=1, arr=X_train_tree)
predictions_test = np.apply_along_axis(apply_pred, axis=1, arr=X_test_tree)

  p = true_cases / len(y)


7 1
4 12
10 81
10 81
10 94
7 5
0 6


In [15]:
# sklearn decision tree
sk_tree = DecisionTreeClassifier(max_depth=3).fit(X_train_tree,y_train_tree)
sk_predictions_train = sk_tree.predict(X_train_tree)
sk_predictions_test = sk_tree.predict(X_test_tree)

In [16]:
# Calculate training and test scores
print('------------------MY MODEL -----------------')
print('Accuracy Training: ', accuracy_score(y_train_tree, predictions_train))
print('Accuracy: ', accuracy_score(y_test_tree, predictions_test))
print('Precision: ', precision_score(y_test_tree, predictions_test, average='macro'))
print('Recall: ', recall_score(y_test_tree, predictions_test, average='macro'))
print('------------------SK LEARN -----------------')
print('Accuracy Training: ', accuracy_score(y_train_tree, sk_predictions_train))
print('Accuracy: ', accuracy_score(y_test_tree, sk_predictions_test))
print('Precision: ', precision_score(y_test_tree, sk_predictions_test, average='macro'))
print('Recall: ', recall_score(y_test_tree, sk_predictions_test, average='macro'))



------------------MY MODEL -----------------
Accuracy Training:  0.8408
Accuracy:  0.8371653543307087
Precision:  0.80782704980924
Recall:  0.7130646540555481
------------------SK LEARN -----------------
Accuracy Training:  0.8408
Accuracy:  0.8372703412073491
Precision:  0.8079616592624852
Recall:  0.7132812918371773
