In [2]:
# import all the necessary libraries here
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv('decision-tree.csv')
print(df.shape)

(768, 9)


In [4]:
print(df.columns)

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome'],
      dtype='object')


In [5]:
print(df.head())

   Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [None]:
df['Outcome'].value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [7]:
df.tail()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.34,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1
767,1,93,70,31,0,30.4,0.315,23,0


In [8]:
# target variable ->Outcome
y=df['Outcome']
# removing the target variable from the dataset
X=df.drop('Outcome', axis=1)
# split the data in 80% to train , 20% to test using sklearn
from sklearn.model_selection import train_test_split
# X is your feature matrix and y is your target vector
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [9]:
#Entropy
def entropy(target):
    count = target.value_counts()/target.shape[0]
    return np.sum(-count*np.log2(count+1e-9))

#Information Gain
def IGSplit(target,f):
    #target= y classes
    # f= contain true if xvalue < split point otherwise false
    set1=sum(f)
    set2=f.shape[0]-set1
    if(set1==0 or set2 ==0):
        return 0
    # caluclating the information gain formula
    return entropy(target)-set1/(set1+set2)*entropy(target[f])-set2/(set1+set2)*entropy(target[-f])

# ID3 Decision tree algorithm
def ID3(x,y):
    # which fetaure have the highest information gain and best split point of it
    f_name=None
    split_point = None
    max_ig = float('-inf')
    for j in x.columns:
        # information gain for each of unique points
        IG=[]
        # which split points gIGives what information gain
        sp=[]
        #find all unique points for the columns
        unique_values = x[j].sort_values().unique()[1:]
        if(len(unique_values)<=0):
            continue
        for i in unique_values:
            # list containing true if less then the split point
            # and total target
            gain=IGSplit(y,x[j]<i)
            IG.append(gain)
            sp.append(i)
        if(max_ig < max(IG)):
            max_ig = max(IG)
            split_point=sp[IG.index(max(IG))]
            f_name=j
    return f_name, split_point, max_ig

In [11]:
class Node:
    def __init__(self, featureName=None, splitPoint=None, label=None):
        self.featureName = featureName
        self.splitPoint = splitPoint
        self.label = label
        self.child = {}

#constructing Decision-tree
def buildTree(feature, target):
    if feature.shape[0] <= 10 or feature.shape[1] <= 1:
        # Create a leaf node with the majority class label
        majority_class = target.value_counts().idxmax()
        return Node(label=majority_class)

    f_name, splitPoint, gain = ID3(feature, target)
    split_condition = feature[f_name] < splitPoint
    data_1 = feature[split_condition]
    target_1 = target[split_condition]
    data_2 = feature[~split_condition]
    target_2 = target[~split_condition]

    node = Node(featureName=f_name, splitPoint=splitPoint)

    # Recursively building left and right subtrees
    node.child['<= ' + str(splitPoint)] = buildTree(data_1, target_1)
    node.child['> ' + str(splitPoint)] = buildTree(data_2, target_2)

    return node


In [14]:
x=buildTree(X_train,y_train)

In [17]:
def predict_tree(node, i):
    if node.featureName is None:
        # If it's a leaf node, return the class label
        return node.label

    split_feature = node.featureName
    splitPoint = node.splitPoint

    if i[split_feature] <= splitPoint:
        # Traverse the left subtree
        return predict_tree(node.child['<= ' + str(splitPoint)], i)
    else:
        # Traverse the right subtree
        return predict_tree(node.child['> ' + str(splitPoint)], i)

# Example usage:
# Assuming X_test is your test data
predictions = [predict_tree(x, i) for _, i in X_test.iterrows()]
print(predictions)
print(y_test)

[0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1]
52     0
505    0
167    0
309    1
690    0
      ..
730    1
461    0
116    1
110    1
737    0
Name: Outcome, Length: 154, dtype: int64


# Report the mean macro accuracy, macro precision and macro recall for the classifier.

In [22]:
def calc_metrics(Y, Y_train):

    macroPrecision = 0
    macroAccuracy = 0
    macroRecall = 0

    # to get unique classes label
    unique_class = np.unique(Y)

    # Calculate matrix for each class
    for i in unique_class:
        # true positives
        tp = np.sum((Y==i) & (Y_train== i))
        # false positives
        fp = np.sum((Y!= i) & (Y_train== i))
        # false negatives
        fn = np.sum((Y ==i) & (Y_train!= i))

        # for precision
        if (tp + fp) > 0:
          precision =tp/(tp+fp)
        else:
          precision=0
        macroPrecision = macroPrecision + precision

        # for recall
        if (tp+fn)>0:
          recall=tp/(tp+fn)
        else:
          recall=0
        macroRecall = macroRecall + recall

        # for Accuracy
        classAccuracy = tp/np.sum(Y== i)
        macroAccuracy = macroAccuracy + classAccuracy

    # mean macro accuracy, macro precision and macro recall
    num_class = len(unique_class)
    macroAccuracy = macroAccuracy/num_class
    macroPrecision = macroPrecision/num_class
    macroRecall = macroRecall/num_class

    return macroAccuracy, macroPrecision, macroRecall

mean_macro_accuracy, mean_macro_precision, mean_macro_recall = calc_metrics(y_test, predictions)

print("Mean Macro Accuracy = ", mean_macro_accuracy)
print("Mean Macro Recall = ",  mean_macro_recall)
print("Mean Macro Precision = ", mean_macro_precision)


Mean Macro Accuracy =  0.7193877551020409
Mean Macro Recall =  0.7193877551020409
Mean Macro Precision =  0.7065534396980614
