## Assignment 1 Part 2
### Name:
### Roll Number:

In [90]:
# importing all the necessary libraries
# pandas for reading the csv file, numpy for mathematical operations, matplotlib for plotting the data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [91]:
# Load the dataset from the csv file and report its (rows, columns) shape.
df = pd.read_csv('../../dataset/decision-tree.csv')
print(df.shape)

# Shuffle the rows before the positional train/test split. A fixed random_state
# makes the shuffle -- and therefore the split, the learned tree and every
# number derived from them -- identical on each "Restart & Run All".
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

print("First 5 rows of the dataset:")
df.head()

(768, 9)
First 5 rows of the dataset:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,0,138,0,0,0,36.3,0.933,25,1
1,2,129,84,0,0,28.0,0.284,27,0
2,0,102,52,0,0,25.1,0.078,21,0
3,2,98,60,17,120,34.7,0.198,22,0
4,2,87,58,16,52,32.7,0.166,25,0


In [92]:
# Positional train/test split of the shuffled frame:
# the first 80% of the rows are used for training, the remaining 20% for testing.
train_data = df.head(int(0.8 * len(df)))
test_data = df.iloc[len(train_data):, :]


# sanity check: the two shapes should add up to the full dataset
print("Shape of the training data:", train_data.shape)
print("Shape of the testing data:", test_data.shape)

Shape of the training data: (614, 9)
Shape of the testing data: (154, 9)


In [93]:
# Separate each split into its feature columns (every column but the last)
# and its target (the last column).
train_features, train_target = train_data.iloc[:, :-1], train_data.iloc[:, -1]
test_features, test_target = test_data.iloc[:, :-1], test_data.iloc[:, -1]

print("First 5 rows of the training features:")
train_features.head()

First 5 rows of the training features:


Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0,138,0,0,0,36.3,0.933,25
1,2,129,84,0,0,28.0,0.284,27
2,0,102,52,0,0,25.1,0.078,21
3,2,98,60,17,120,34.7,0.198,22
4,2,87,58,16,52,32.7,0.166,25


In [94]:
# entropy function
def entropy(data):
    """Return the Shannon entropy (base 2) of the values in a pandas Series.

    Parameters
    ----------
    data : pandas.Series
        The labels whose distribution is measured.

    Returns
    -------
    number
        ``-sum(p * log2(p))`` over the observed values, where ``p`` is the
        count of a value divided by ``len(data)``. Returns 0 when there is
        nothing to count (e.g. an empty Series).
    """
    counts = data.value_counts()
    total = len(data)
    result = 0
    for count in counts:
        # value_counts() on a categorical Series also lists categories that
        # never occur (count 0). Their contribution is defined as 0; without
        # this guard 0 * log2(0) evaluates to NaN and poisons the result.
        if count == 0:
            continue
        probability = count / total
        result -= probability * np.log2(probability)
    return result

# Entropy of the training target column (train_target), computed with entropy() above
initial_entropy = entropy(train_target)
print("Initial entropy:", initial_entropy)

# information gain function
def information_gain(data, feature, target):
    """Return the information gain obtained by splitting `data` on `feature`.

    The gain is the entropy of the `target` column minus the size-weighted
    sum of the target entropies of the subsets produced by grouping the rows
    on each distinct value of `feature`.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to evaluate; must contain both `feature` and `target` columns.
    feature : column label
        Column whose distinct values define the subsets.
    target : column label
        Column holding the class labels.
    """
    n_rows = len(data)
    column = data[feature]
    # one partition per distinct value, in order of first appearance
    partitions = (data[column == level] for level in column.unique())
    children_entropy = sum(
        (len(part) / n_rows) * entropy(part[target]) for part in partitions
    )
    return entropy(data[target]) - children_entropy

# pick the feature whose split yields the largest information gain
def best_feature(data, features, target):
    """Return the entry of `features` with the highest information gain on `data`.

    When several features share the highest gain, the one that comes first in
    `features` is returned (the behaviour of the built-in ``max``).
    """
    def gain_of(candidate):
        return information_gain(data, candidate, target)

    return max(features, key=gain_of)

# build the decision tree
def build_tree(data, features, target, min_samples=10):
    """Recursively build a multiway decision tree as nested dictionaries.

    Each internal node is ``{feature: {value: subtree, ...}}`` with one branch
    per distinct (non-missing) value of the chosen feature; each leaf is a
    single class label.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows to fit; must contain the `features` columns and the `target` column.
    features : list
        Candidate feature column labels. The list is NOT modified.
    target : column label
        Column holding the class labels.
    min_samples : int, default 10
        Nodes with this many rows or fewer are not split further and become a
        leaf holding the most frequent label.

    Returns
    -------
    dict or scalar
        The nested-dict tree, or a class label when the node is a leaf.
    """
    labels = data[target]

    # Stopping rules: node too small, node already pure, or nothing left to split on.
    if len(data) <= min_samples:
        return labels.mode()[0]
    if len(labels.unique()) == 1:
        return labels.iloc[0]
    if len(features) == 0:
        return labels.mode()[0]

    # Reuse the shared selection helper instead of repeating its logic here.
    split_feature = best_feature(data, features, target)

    # Build a new list rather than calling features.remove(): removing in place
    # would silently shrink the list object owned by the caller.
    remaining = [name for name in features if name != split_feature]

    branches = {}
    # unique() also reports NaN, but `column == NaN` matches no row; the
    # resulting empty subset would make mode()[0] raise, so skip missing values.
    for value in data[split_feature].dropna().unique():
        subset = data[data[split_feature] == value]
        branches[value] = build_tree(subset, remaining, target, min_samples)

    # Every value of the chosen feature was missing: fall back to a majority leaf
    # instead of returning a node without branches.
    if not branches:
        return labels.mode()[0]
    return {split_feature: branches}

# fit the tree on the training split, using every feature column as a candidate
# and the 'Outcome' column as the label
tree = build_tree(
    data=train_data,
    features=list(train_features.columns),
    target='Outcome',
)

# show the learned tree (nested dicts)
print(tree)

Initial entropy: 0.942629099420442
{'DiabetesPedigreeFunction': {0.933: 1, 0.284: 0, 0.078: 0, 0.198: 0, 0.166: 0, 0.28: 0, 1.095: 0, 0.207: 0, 0.263: 0, 0.677: 0, 0.219: 0, 0.148: 0, 1.021: 0, 0.304: 0, 0.587: 0, 0.096: 0, 0.133: 0, 0.336: 0, 0.539: 1, 1.224: 1, 2.288: 1, 0.441: 1, 0.233: 1, 0.432: 0, 0.735: 0, 0.452: 0, 1.893: 1, 0.491: 0, 1.159: 0, 1.698: 0, 0.238: 0, 0.282: 0, 0.734: 1, 0.127: 1, 0.19: 0, 1.394: 1, 0.24: 1, 1.39: 1, 0.302: 1, 0.141: 0, 0.254: 1, 0.583: 0, 0.698: 0, 0.126: 0, 0.46: 0, 0.247: 0, 0.248: 0, 0.328: 1, 0.43: 0, 0.875: 1, 0.258: 1, 0.867: 1, 0.805: 1, 0.4: 0, 0.38: 0, 1.781: 0, 0.607: 0, 0.719: 1, 0.431: 1, 0.265: 0, 0.107: 0, 0.144: 0, 0.614: 0, 0.409: 0, 0.52: 0, 0.203: 1, 0.167: 0, 0.586: 0, 0.658: 0, 1.318: 1, 0.246: 0, 0.654: 0, 0.649: 0, 0.647: 0, 0.297: 1, 0.212: 1, 0.368: 0, 0.493: 0, 0.526: 0, 0.278: 1, 1.189: 1, 0.153: 0, 2.137: 1, 0.296: 1, 0.084: 0, 0.813: 0, 0.559: 0, 0.501: 0, 0.307: 0, 0.703: 0, 0.398: 1, 0.285: 0, 1.292: 1, 0.497: 0, 0.516