In [16]:
import pandas as pd
import numpy as np
from collections import Counter
import math

In [17]:
class DecisionTree:
    """Binary decision-tree classifier that splits every node at a feature median.

    At each node the feature whose median split yields the largest positive
    information gain (entropy reduction) is chosen. Rows with a value
    ``<= median`` go to the left child, rows with a value ``> median`` go to
    the right child.

    The fitted tree is stored in ``self.tree`` as nested dicts with the keys
    ``'attribute'``, ``'split_value'``, ``'left'`` and ``'right'``; a leaf is
    stored as the bare class label.

    Parameters
    ----------
    max_depth : int or None
        Depth at which a node is forced to become a leaf. ``None`` disables
        the limit.
    min_samples_split : int
        A node holding fewer samples than this becomes a leaf.
    min_samples_leaf : int
        A split is rejected (the node becomes a leaf) when either child would
        hold fewer samples than this.
    """

    def __init__(self, max_depth=None, min_samples_split=2, min_samples_leaf=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.min_samples_leaf = min_samples_leaf
        # Root of the fitted tree; stays None until fit() has succeeded.
        self.tree = None

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in the non-empty label sequence ``y``."""
        return Counter(y).most_common(1)[0][0]

    def entropy(self, y):
        """Return the Shannon entropy (in bits) of the labels in ``y``.

        An empty ``y`` has entropy 0.
        """
        n_samples = len(y)
        if n_samples == 0:
            return 0.0
        probs = np.array([count / n_samples for count in Counter(y).values()])
        return -sum(probs * np.log2(probs))

    def information_gain(self, X, y, split_attribute):
        """Return the entropy reduction from splitting on one feature's median.

        Parameters
        ----------
        X : 2-D numpy array of features (rows are samples).
        y : 1-D numpy array of labels aligned with the rows of ``X``.
        split_attribute : int
            Column index of the feature to evaluate.
        """
        split_col = X[:, split_attribute]
        # The threshold is loop-invariant for this column, so compute it once.
        threshold = np.median(split_col)
        H_total = self.entropy(y)

        # Partition the labels by the median of the candidate feature
        y_left = y[split_col <= threshold]
        y_right = y[split_col > threshold]

        # Entropy of each side, weighted by its share of the samples
        H_left, H_right = self.entropy(y_left), self.entropy(y_right)
        IG = H_total - (len(y_left) / len(y)) * H_left - (len(y_right) / len(y)) * H_right
        return IG

    def best_split(self, X, y):
        """Return the column index with the highest positive information gain.

        Returns ``None`` when no feature improves on a gain of 0, i.e. when no
        median split separates the classes at all. Ties keep the lowest index.
        """
        best_IG = 0
        best_attribute = None

        for i in range(X.shape[1]):
            IG = self.information_gain(X, y, i)
            if IG > best_IG:
                best_IG = IG
                best_attribute = i

        return best_attribute

    def build_tree(self, X, y, depth=0):
        """Recursively build the (sub)tree for the samples ``X`` / ``y``.

        Parameters
        ----------
        X : 2-D numpy array of features.
        y : 1-D numpy array of labels aligned with the rows of ``X``.
        depth : int
            Depth of the node being built; the root is at depth 0.

        Returns
        -------
        dict or label
            A decision-node dict, or a bare class label for a leaf.

        Raises
        ------
        ValueError
            If ``y`` is empty: there is no label to predict from.
        """
        if len(y) == 0:
            raise ValueError("cannot build a decision tree from zero samples")

        # Depth limit reached: predict the majority class
        if depth == self.max_depth:
            return self._majority_class(y)

        # Pure node: nothing left to separate
        if len(np.unique(y)) == 1:
            return y[0]

        # Too few samples to attempt a split
        if len(y) < self.min_samples_split:
            return self._majority_class(y)

        best_attribute = self.best_split(X, y)
        # No feature gives a positive gain (e.g. every feature is constant
        # while the labels are mixed): further splitting is pointless.
        if best_attribute is None:
            return self._majority_class(y)

        split_value = np.median(X[:, best_attribute])

        # Split the data on the chosen feature and threshold
        left_indices = X[:, best_attribute] <= split_value
        right_indices = X[:, best_attribute] > split_value
        X_left, y_left = X[left_indices], y[left_indices]
        X_right, y_right = X[right_indices], y[right_indices]

        # Reject the split when a child would be smaller than min_samples_leaf.
        # A child must always hold at least one sample, otherwise the
        # recursion would be handed an empty node.
        min_child = max(self.min_samples_leaf, 1)
        if len(y_left) < min_child or len(y_right) < min_child:
            return self._majority_class(y)

        left_tree = self.build_tree(X_left, y_left, depth + 1)
        right_tree = self.build_tree(X_right, y_right, depth + 1)

        return {'attribute': best_attribute, 'split_value': split_value, 'left': left_tree, 'right': right_tree}

    def fit(self, X, y):
        """Build the tree from training data and store it in ``self.tree``.

        ``X`` (2-D) and ``y`` (1-D) may be numpy arrays or anything
        ``np.asarray`` can convert, such as nested lists or pandas objects.

        Raises
        ------
        ValueError
            If there are no training samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        self.tree = self.build_tree(X, y)

    def predict(self, X):
        """Return a numpy array with the predicted label for every row of ``X``.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")

        X = np.asarray(X)
        y_pred = []
        for row in X:
            node = self.tree
            # Walk down until a leaf (any non-dict value) is reached
            while isinstance(node, dict):
                if row[node['attribute']] <= node['split_value']:
                    node = node['left']
                else:
                    node = node['right']
            y_pred.append(node)
        return np.array(y_pred)


In [23]:
# Read the semicolon-delimited cardio CSV; its first column becomes the row index.
cardio_df = pd.read_csv(
    './data/cardio_train.csv',
    index_col=0,
    sep=';',
)
# Bare last expression so the notebook renders the frame.
cardio_df

Unnamed: 0_level_0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
99993,19240,2,168,76.0,120,80,1,1,1,0,1,0
99995,22601,1,158,126.0,140,90,2,2,0,0,1,1
99996,19066,2,183,105.0,180,90,3,1,0,1,0,1
99998,22431,1,163,72.0,135,80,1,2,0,0,0,1


In [26]:
from sklearn.model_selection import train_test_split

# Target is the 'cardio' column; every remaining column is a feature.
y = cardio_df['cardio'].values
X = cardio_df.drop(columns='cardio').values

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Train the tree (capped at max_depth=5) on the training portion only.
tree = DecisionTree(max_depth=5)
tree.fit(X_train, y_train)

# Label the held-out rows with the fitted tree.
y_pred = tree.predict(X_test)

In [27]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fraction of held-out rows whose predicted label matches the true label.
my_accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(my_accuracy)

0.7245714285714285
