In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

### Importing Data

In [2]:
# Load the dataset; the file uses ";" as its field separator.
df = pd.read_csv("cardio_train.csv",sep=";")
# Preview the first rows.
df.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,4,17474,1,156,56.0,100,60,1,1,0,0,0,0


### Exploring and cleaning Data

In [3]:
# Count the distinct values in each column.
df.nunique()

id             70000
age             8076
gender             2
height           109
weight           287
ap_hi            153
ap_lo            157
cholesterol        3
gluc               3
smoke              2
alco               2
active             2
cardio             2
dtype: int64

In [4]:
# Summary statistics for each numeric column.
df.describe()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0,70000.0
mean,49972.4199,19468.865814,1.349571,164.359229,74.20569,128.817286,96.630414,1.366871,1.226457,0.088129,0.053771,0.803729,0.4997
std,28851.302323,2467.251667,0.476838,8.210126,14.395757,154.011419,188.47253,0.68025,0.57227,0.283484,0.225568,0.397179,0.500003
min,0.0,10798.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,25006.75,17664.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,50001.5,19703.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,74889.25,21327.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,99999.0,23713.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [5]:
# Drop the "id" column (reassign instead of mutating in place).
df = df.drop(columns="id")
# Express age in whole years by dividing by 365 and truncating.
df["age"] = (df["age"] / 365).astype(int)

# Remove extreme values from systolic and diastolic blood pressure:
# keep only rows between the 5th and 95th percentiles (inclusive).
# systolic mask
low_sys, high_sys = df["ap_hi"].quantile([0.05, 0.95])
mask_systolic = df["ap_hi"].between(low_sys, high_sys)
# diastolic mask
low_dia, high_dia = df["ap_lo"].quantile([0.05, 0.95])
mask_diastolic = df["ap_lo"].between(low_dia, high_dia)

# Remove extreme values from height and weight:
# keep only rows between the 5th and 99th percentiles (inclusive).
# height mask
low_height, high_height = df["height"].quantile([0.05, 0.99])
mask_height = df["height"].between(low_height, high_height)
# weight mask
low_weight, high_weight = df["weight"].quantile([0.05, 0.99])
mask_weight = df["weight"].between(low_weight, high_weight)

# Apply all masks at once. The explicit .copy() makes the filtered frame own
# its data, so the column assignments below are unambiguous writes rather than
# writes into a view of the unfiltered frame.
df = df[mask_systolic & mask_diastolic & mask_height & mask_weight].copy()


def snap_to_bins(values, bins):
    """Replace each value lying in a half-open bin by that bin's representative.

    Parameters
    ----------
    values : pandas.Series
        Numeric column to discretize.
    bins : iterable of (low, high, representative)
        Every value ``v`` with ``low <= v < high`` becomes ``representative``.
        Bins are applied in order, each one against the result of the previous.

    Returns
    -------
    pandas.Series
        A new Series; values outside every bin are left unchanged.
    """
    snapped = values
    for low, high, representative in bins:
        snapped = snapped.mask((snapped >= low) & (snapped < high), representative)
    return snapped


# Discretizing data: (low, high, representative) triples per column.
DISCRETIZATION_BINS = {
    # age
    "age": [(29, 40, 35), (40, 50, 45), (50, 65, 55)],
    # height
    "height": [(152, 160, 156), (160, 168, 164), (168, 176, 172), (176, 185, 180)],
    # weight
    "weight": [(55, 70, 63), (70, 85, 78), (85, 100, 93), (100, 118, 109)],
    # systolic bp
    "ap_hi": [(100, 115, 107), (115, 130, 122), (130, 145, 137), (145, 161, 152)],
    # diastolic bp
    "ap_lo": [(70, 80, 75), (80, 90, 85), (90, 101, 95)],
}

# Assign the discretized column back explicitly. Calling
# df[col].mask(..., inplace=True) is chained assignment: it triggers pandas
# chained-assignment warnings and, with copy-on-write enabled, only modifies a
# temporary object and leaves df untouched.
for column, bins in DISCRETIZATION_BINS.items():
    df[column] = snap_to_bins(df[column], bins)

In [6]:
# Re-count the distinct values per column after cleaning and discretization.
df.nunique()

age            3
gender         2
height         4
weight         4
ap_hi          4
ap_lo          3
cholesterol    3
gluc           3
smoke          2
alco           2
active         2
cardio         2
dtype: int64

In [7]:
# Preview the first rows of the cleaned, discretized frame.
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,55,2,172,63.0,107,85,1,1,0,0,1,0
1,55,1,156,93.0,137,95,3,1,0,0,1,1
2,55,1,164,63.0,137,75,3,1,0,0,0,1
3,45,2,172,78.0,152,95,1,1,0,0,1,1
6,55,1,156,93.0,137,85,3,1,0,0,1,0


### Building Tree

#### splitting the data

In [8]:
# Separate the label column from the predictor columns, then hold out 20% of
# the rows as a test set (fixed random_state so the split is reproducible).
target_attribute = "cardio"
attributes = [column for column in df.columns if column != 'cardio']
X, y = df[attributes], df[target_attribute]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

#### First, build a model with scikit-learn

In [9]:
from sklearn.tree import DecisionTreeClassifier
import time

# Baseline: scikit-learn decision tree using the entropy criterion.
model = DecisionTreeClassifier(criterion="entropy", random_state=42)

# Time with time.perf_counter(): it is monotonic and uses the highest
# resolution clock available, whereas time.time() is a wall clock that can be
# adjusted while the cell runs and may be too coarse for sub-second intervals.
fit_start = time.perf_counter()
model.fit(X_train, y_train)
fit_elapsed = time.perf_counter() - fit_start
print("Training Time: ", fit_elapsed, ' sec')

predict_start = time.perf_counter()
y_pred = model.predict(X_test)
predict_elapsed = time.perf_counter() - predict_start
print("Prediction Time: ", predict_elapsed, ' sec')

Training Time:  0.12961459159851074  sec
Prediction Time:  0.00801229476928711  sec


In [10]:
from sklearn.metrics import accuracy_score

# Accuracy of the scikit-learn tree on the held-out test set
# (y_pred here holds the predictions of `model` from the previous cell).
score = accuracy_score(y_test,y_pred)
score

0.693318253001566

#### Second, a decision tree model built from scratch with the ID3 algorithm

In [11]:
class DecisionTree:
    """Binary decision tree classifier grown by greedy information-gain splits.

    Every internal node tests ``row[feature] < threshold``: rows for which the
    test is true go to the left subtree, all others to the right subtree.
    Candidate thresholds are the distinct values of each feature.

    The fitted tree is stored in ``self.tree`` as nested dicts. A leaf is
    ``{'leaf': True, 'class': label}``; an internal node is
    ``{'leaf': False, 'feature': ..., 'threshold': ..., 'left': ..., 'right': ...}``.
    """

    def __init__(self):
        # Root node of the fitted tree; stays None until fit() is called.
        self.tree = None

    def fit(self, X, y):
        """Grow the tree from training data.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature columns. Must share its index with ``y`` because rows are
            selected from both with the same boolean masks.
        y : pandas.Series
            Class label for every row of ``X``.

        Raises
        ------
        ValueError
            If the training set is empty or ``X`` and ``y`` differ in length.
        """
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty training set.")
        if len(X) != len(y):
            raise ValueError("X and y must contain the same number of rows.")
        self.tree = self._build_tree(X, y)

    def predict(self, X):
        """Predict a class label for every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Rows to classify; must contain the feature columns used in fit().

        Returns
        -------
        numpy.ndarray
            One predicted label per row, in row order.

        Raises
        ------
        RuntimeError
            If called before fit().
        """
        if self.tree is None:
            raise RuntimeError("This DecisionTree is not fitted yet; call fit() first.")

        # Walk the rows as plain tuples and wrap each in a dict keyed by column
        # name. This avoids building a pandas Series per row (X.iloc[i]).
        columns = list(X.columns)
        predictions = [
            self._predict_row(dict(zip(columns, values)))
            for values in X.itertuples(index=False, name=None)
        ]
        return np.array(predictions)

    def _build_tree(self, X, y):
        """Recursively build and return the node dict for the rows in (X, y)."""
        # Pure node: every row has the same label.
        if y.nunique(dropna=False) == 1:
            return {'leaf': True, 'class': y.iloc[0]}

        # No features to split on: fall back to the most frequent label.
        if X.shape[1] == 0:
            return {'leaf': True, 'class': y.value_counts().index[0]}

        best_feature, best_threshold = self._select_best_feature(X, y)

        # No threshold separates the rows into two non-empty groups
        # (e.g. identical feature values with conflicting labels).
        if best_feature is None:
            return {'leaf': True, 'class': y.value_counts().index[0]}

        left_indices = X[best_feature] < best_threshold
        right_indices = X[best_feature] >= best_threshold

        left_tree = self._build_tree(X[left_indices], y[left_indices])
        right_tree = self._build_tree(X[right_indices], y[right_indices])

        return {
            'leaf': False,
            'feature': best_feature,
            'threshold': best_threshold,
            'left': left_tree,
            'right': right_tree,
        }

    def _select_best_feature(self, X, y):
        """Return the (feature, threshold) pair with the highest information gain.

        Returns ``(None, None)`` when no candidate threshold yields two
        non-empty groups.
        """
        best_feature = None
        best_threshold = None
        best_information_gain = -np.inf

        # The parent entropy is the same for every candidate split of this
        # node, so compute it once instead of once per threshold.
        entropy_parent = self._entropy(y)

        for feature in X.columns:
            column = X[feature]

            for threshold in column.unique():
                left_y = y[column < threshold]
                right_y = y[column >= threshold]

                # A split that leaves one side empty separates nothing.
                if len(left_y) == 0 or len(right_y) == 0:
                    continue

                information_gain = self._information_gain(
                    y, left_y, right_y, entropy_parent
                )

                # Strict ">" keeps the first of several equally good splits.
                if information_gain > best_information_gain:
                    best_feature = feature
                    best_threshold = threshold
                    best_information_gain = information_gain

        return best_feature, best_threshold

    def _information_gain(self, parent, left, right, entropy_parent=None):
        """Entropy of ``parent`` minus the size-weighted entropy of its two children.

        ``entropy_parent`` may be passed in when the caller has already
        computed it; otherwise it is computed from ``parent``.
        """
        if entropy_parent is None:
            entropy_parent = self._entropy(parent)
        weight_left = len(left) / len(parent)
        weight_right = len(right) / len(parent)
        return (
            entropy_parent
            - weight_left * self._entropy(left)
            - weight_right * self._entropy(right)
        )

    def _entropy(self, y):
        """Shannon entropy (base 2) of the label distribution in ``y``."""
        counts = y.value_counts().to_numpy()
        # value_counts() reports unobserved categories of a categorical Series
        # with a count of 0; dropping them avoids 0 * log2(0) = NaN.
        counts = counts[counts > 0]
        probabilities = counts / len(y)
        return -np.sum(probabilities * np.log2(probabilities))

    def _predict_row(self, row):
        """Follow the tree from the root to a leaf for one row and return its class.

        ``row`` is anything indexable by feature name (dict or pandas Series).
        """
        node = self.tree
        while not node['leaf']:
            if row[node['feature']] < node['threshold']:
                node = node['left']
            else:
                node = node['right']
        return node['class']


In [12]:
# Train and evaluate the hand-written tree on the same split as the
# scikit-learn baseline.
my_model = DecisionTree()

# Time with time.perf_counter(): it is monotonic and uses the highest
# resolution clock available, whereas time.time() is a wall clock that can be
# adjusted while the cell runs.
fit_start = time.perf_counter()
my_model.fit(X_train, y_train)
fit_elapsed = time.perf_counter() - fit_start
print("Training Time of my model: ", fit_elapsed, ' sec')

# NOTE: this overwrites y_pred from the scikit-learn cell.
predict_start = time.perf_counter()
y_pred = my_model.predict(X_test)
predict_elapsed = time.perf_counter() - predict_start
print("Prediction Time of my model: ", predict_elapsed, ' sec')

Training Time of my model:  121.85528492927551  sec
Prediction Time of my model:  2.823422431945801  sec


#### My model took much longer than scikit-learn both to train and to predict

In [13]:
# Accuracy of the hand-written tree on the held-out test set
# (y_pred was overwritten in the previous cell with my_model's predictions).
score = accuracy_score(y_test,y_pred)
score

0.6960153123368714

#### The accuracy of my model is slightly higher than that of the scikit-learn model (about 0.696 vs. 0.693 on this single train/test split)