In [74]:
#import libaries
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler

In [75]:
# Load dataset
df = pd.read_csv('cardio_train.csv', sep=";")
df = df.drop('id', axis=1)
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,18393,2,168,62.0,110,80,1,1,0,0,1,0
1,20228,1,156,85.0,140,90,3,1,0,0,1,1
2,18857,1,165,64.0,130,70,3,1,0,0,0,1
3,17623,2,169,82.0,150,100,1,1,0,0,1,1
4,17474,1,156,56.0,100,60,1,1,0,0,0,0


In [76]:
df.info() # checking if there is any null values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 70000 entries, 0 to 69999
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          70000 non-null  int64  
 1   gender       70000 non-null  int64  
 2   height       70000 non-null  int64  
 3   weight       70000 non-null  float64
 4   ap_hi        70000 non-null  int64  
 5   ap_lo        70000 non-null  int64  
 6   cholesterol  70000 non-null  int64  
 7   gluc         70000 non-null  int64  
 8   smoke        70000 non-null  int64  
 9   alco         70000 non-null  int64  
 10  active       70000 non-null  int64  
 11  cardio       70000 non-null  int64  
dtypes: float64(1), int64(11)
memory usage: 6.4 MB


In [77]:
df.duplicated().sum() # Checking if there are any duplicates and printing the duplicated rows count

24

In [78]:
df.drop_duplicates(inplace=True) # dropping duplicates

In [79]:
df["age"] = round(df["age"] / 365) # convert age to years
df.head()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
0,50.0,2,168,62.0,110,80,1,1,0,0,1,0
1,55.0,1,156,85.0,140,90,3,1,0,0,1,1
2,52.0,1,165,64.0,130,70,3,1,0,0,0,1
3,48.0,2,169,82.0,150,100,1,1,0,0,1,1
4,48.0,1,156,56.0,100,60,1,1,0,0,0,0


In [80]:
df.describe()

Unnamed: 0,age,gender,height,weight,ap_hi,ap_lo,cholesterol,gluc,smoke,alco,active,cardio
count,69976.0,69976.0,69976.0,69976.0,69976.0,69976.0,69976.0,69976.0,69976.0,69976.0,69976.0,69976.0
mean,53.338945,1.349648,164.359152,74.208519,128.820453,96.636261,1.366997,1.226535,0.088159,0.05379,0.803718,0.499771
std,6.765633,0.476862,8.211218,14.397211,154.037729,188.504581,0.680333,0.572353,0.283528,0.225604,0.397187,0.500004
min,30.0,1.0,55.0,10.0,-150.0,-70.0,1.0,1.0,0.0,0.0,0.0,0.0
25%,48.0,1.0,159.0,65.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
50%,54.0,1.0,165.0,72.0,120.0,80.0,1.0,1.0,0.0,0.0,1.0,0.0
75%,58.0,2.0,170.0,82.0,140.0,90.0,2.0,1.0,0.0,0.0,1.0,1.0
max,65.0,2.0,250.0,200.0,16020.0,11000.0,3.0,3.0,1.0,1.0,1.0,1.0


In [81]:
# Remove outliers
df.drop(df[ (df['height'] < df['height'].quantile(0.025)) | (df['height'] > df['height'].quantile(0.975))].index,inplace=True)
df.drop(df[ (df['weight'] < df['weight'].quantile(0.025)) | (df['weight'] > df['weight'].quantile(0.975))].index,inplace=True)
df.drop(df[ (df['ap_hi'] < df['ap_hi'].quantile(0.025)) | (df['ap_hi'] > df['ap_hi'].quantile(0.975))].index,inplace=True)
df.drop(df[ (df['ap_lo'] < df['ap_lo'].quantile(0.025)) | (df['ap_lo'] > df['ap_lo'].quantile(0.975))].index,inplace=True)

In [82]:
# 0 for under, 1 for healty , 2 for over , 3 for obese
df.loc[(round((df['weight']/(df['height']/100)**2), 2) < 18.50), "bmi"] = 0
df.loc[(round((df['weight']/(df['height']/100)**2), 2)>= 18.50) & (round((df['weight']/(df['height']/100)**2), 2) < 25) ,"bmi"] = 1
df.loc[(round((df['weight']/(df['height']/100)**2), 2) >= 25) & (round((df['weight']/(df['height']/100)**2), 2) < 30) ,"bmi"]= 2
df.loc[(round((df['weight']/(df['height']/100)**2), 2) >= 30), "bmi"] = 3

# 0 for young, 1 for mature , 2 for old
df.loc[(df["age"] < 18), "age_cat"] = 0
df.loc[(df["age"] >= 18) & (df["age"] < 56), "age_cat"] = 1
df.loc[(df["age"] >= 56), "age_cat"] = 2

# 0 for normal , 1 for elevated , 2 for high 1, 3 for high 2, 4 for high 3
def BPCategorize(x,y):
    if x<=120 and y<=80:
        return 0
    elif x<=129 and y<=80:
        return 1
    elif x<=139 or y<=89:
        return 2
    elif x<=180 or y<=120:
        return 3
    elif x>180 or y>120:
        return 4
    else:
        return None
    
df.insert(8, "blood_pressure", df.apply(lambda row: BPCategorize(row['ap_hi'], row['ap_lo']), axis=1))
df.drop(['age', 'height','weight','ap_hi','ap_lo'], axis=1, inplace=True)
df.head()

Unnamed: 0,gender,cholesterol,gluc,blood_pressure,smoke,alco,active,cardio,bmi,age_cat
0,2,1,1,0,0,0,1,0,1.0,1.0
1,1,3,1,3,0,0,1,1,3.0,1.0
2,1,3,1,2,0,0,0,1,1.0,1.0
3,2,1,1,3,0,0,1,1,2.0,1.0
4,1,1,1,0,0,0,0,0,1.0,1.0


In [83]:
class Id3Classifier:

  def entropy(self, column):
    vals, count = np.unique(column, return_counts=True)
    
    #intialize entropy
    entropy_values = []

    # calculate entropy
    for i in range(len(vals)):
      entropy_values.append(-(count[i]/np.sum(count))*np.log2((count[i]/np.sum(count))))

    # calculate total entropy
    entropy_total = np.sum(entropy_values)

    return entropy_total # return total entropy

  def info_gain(self, data, feature, target):
    # calculate total entropy of subset
    entropy_total = self.entropy(data[target])
    vals, count = np.unique(data[feature], return_counts=True)

    #intialize weighted entropy
    weighted_entropy = []
    
    # calculate weighted entropy of subset
    for i in range(len(vals)):
      entropy_sub = self.entropy(data.where(data[feature]==vals[i]).dropna()[target])
      weighted_entropy.append((count[i]/np.sum(count))*entropy_sub)

    # calculate information gain
    ig = entropy_total - np.sum(weighted_entropy)

    return ig # return information gain

  def fit(self, input, output):
    data = input.copy()
    data[output.name] = output
    self.tree = self.decision_tree(data, data, input.columns, output.name)

  def decision_tree(self, data, orginal_data, features, target, node_parent=None):
    classes = np.unique(data[target])
    if len(classes) <= 1:
      return classes[0]
    elif len(data) == 0:
      class_index = np.argmax(np.unique(original_data[target], return_counts=True)[1])
      return np.unique(original_data[target])[class_index]
    elif len(features) == 0:
      return node_parent
    else:
      class_index = np.argmax(np.unique(data[target], return_counts=True)[1])
      node_parent = classes[class_index]
      info_gains = [self.info_gain(data, feature, target) for feature in features]
      best_index = np.argmax(info_gains)
      best = features[best_index]

      # intialize tree
      tree = {best: {}}

      # remove best feature 
      features = [i for i in features if i != best]

      # create nodes 
      parent_attribute_values = np.unique(data[best])
      for value in parent_attribute_values:
        sub_data = data.where(data[best] == value).dropna()

        # call recursively
        subtree = self.decision_tree(sub_data, orginal_data, features, target, node_parent)
        tree[best][value] = subtree

      return tree

  def predict(self, input):
    values = input.to_dict(orient='records')
    results = []
    for value in values:
      results.append(self.prediction(value, self.tree, 1.0))

    return results
 #make predicition
  def prediction(self, sample, tree, default=1):
    for attribute in list(sample.keys()):
      if attribute in list(tree.keys()):
        try:
          res = tree[attribute][sample[attribute]]
        except:
          return default

        result = tree[attribute][sample[attribute]]

        if isinstance(result, dict):
          return self.prediction(sample, result)
        else:
          return res

In [84]:
from sklearn.metrics import accuracy_score
y = df["cardio"]
X = df.drop(["cardio"], axis=1)

# Split the data into training and testing (30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)

# To use
model = Id3Classifier() # intialize the model
model.fit(X_train, y_train) # fit the model (pass the traing data)


Y_pred = model.predict(X_test) # evaluate the model by passing test data
print('Accuracy using model from scratch = ' + str(accuracy_score(y_test, Y_pred)*100)) # return accuracy score

# comparison : The model from scratch obtains comparable accuarcy (70.7) to the accuracy of scikit-learn library (71.52)
# but the model from scratch takes more time in training and prediction due to optimizations in the scikit-learn library

Accuracy using model from scratch = 70.73076070082058


In [85]:
y = df["cardio"]
X = df.drop(["cardio"], axis=1)

# Split the data into training and testing (30% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=2018)

# Decision Tree Classifier using sklearn

decision_tree = DecisionTreeClassifier(criterion='entropy',max_depth=8,splitter='best',random_state=2018) # intialize the model
decision_tree.fit(X_train, y_train) # fit the model
# evaluate the model
acc_decision_tree_train = round(decision_tree.score(X_train, y_train) * 100, 2)
acc_decision_tree_test = round(decision_tree.score( X_test, y_test) * 100, 2)
# print results
print( 'Testing accuracy using sklearn = ' + str(acc_decision_tree_test))
print( 'Training accuracy using sklearn = ' + str(acc_decision_tree_train))

Testing accuracy using sklearn = 71.52
Training accuracy using sklearn = 71.62
