In [1]:

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split

In [2]:
!pip install --upgrade --no-cache-dir gdown
!gdown 1HOVOCneG14yRdGot0fYyuMzNi0YuxQcZ


Collecting gdown
  Downloading gdown-5.0.1-py3-none-any.whl (16 kB)
Installing collected packages: gdown
  Attempting uninstall: gdown
    Found existing installation: gdown 4.7.3
    Uninstalling gdown-4.7.3:
      Successfully uninstalled gdown-4.7.3
Successfully installed gdown-5.0.1
Downloading...
From: https://drive.google.com/uc?id=1CQAOCneG14yRdGot0fYyuMzNi0Yux4MJ
To: /content/covid.csv
100% 258/258 [00:00<00:00, 1.17MB/s]


In [3]:

df = pd.read_csv('/content/covid.csv')
df

Unnamed: 0,Fever,Cough,Breathing issues,Infected
0,No,No,No,No
1,Yes,Yes,Yes,Yes
2,Yes,Yes,No,No
3,Yes,No,Yes,Yes
4,Yes,Yes,Yes,Yes
5,No,Yes,No,No
6,Yes,No,Yes,Yes
7,Yes,No,Yes,Yes
8,No,Yes,Yes,Yes
9,Yes,Yes,No,Yes


In [4]:
def entropy(labels):
    p = labels.value_counts() / len(labels)
    return -sum(p * np.log2(p))


def information_gain(data, feature, target):
    # Entropy of parent
    entropy_parent = entropy(data[target])

    # Entropy of child
    entropy_child = 0
    for value in data[feature].unique():
        subset = data[data[feature] == value]
        wi = len(subset) / len(data)
        entropy_child += wi * entropy(subset[target])

    return entropy_parent - entropy_child

In [5]:
a = information_gain(df, 'Fever', 'Infected')
b = information_gain(df, 'Cough', 'Infected')
c = information_gain(df, 'Breathing issues', 'Infected')

print(f"IG_Fever: {a}")
print(f"IG_Cough: {b}")
print(f"IG_Breathing_Issues: {c}")

IG_Fever: 0.12808527889139443
IG_Cough: 0.0391486719030707
IG_Breathing_Issues: 0.39603884492804464


In [6]:
class Node:

    def __init__(self, feature=None, label=None):
        self.feature = feature
        self.label = label
        self.children = {}

    def __repr__(self):
        if self.feature is not None:
            return f'DecisionNode(feature="{self.feature}", children={self.children})'
        else:
            return f'LeafNode(label="{self.label}")'

In [7]:
def make_tree(data, target):

  if (len(data[target].unique()) == 1 or len(data.columns) == 1):
    return Node(label = data[target].iloc[0])

  # Calculate IG
  features = data.drop(target, axis=1).columns
  gains = [information_gain(data, feature, target) for feature in features]

  # Greedy Search for findinf Best Feature
  max_gain_idx = np.argmax(gains)
  best_feature = features[max_gain_idx]

  # Make a Node
  node = Node(feature = best_feature)

  # Loop over the Best Feature
  for value in data[best_feature].unique():
    subset = data[data[best_feature] == value].drop(best_feature, axis=1)
    display(subset)

    node.children[value] = make_tree(subset, target)

  return node

In [8]:
tree = make_tree(df, 'Infected')
tree

Unnamed: 0,Fever,Cough,Infected
0,No,No,No
2,Yes,Yes,No
5,No,Yes,No
9,Yes,Yes,Yes
10,No,Yes,No
13,Yes,Yes,No


Unnamed: 0,Cough,Infected
0,No,No
5,Yes,No
10,Yes,No


Unnamed: 0,Cough,Infected
2,Yes,No
9,Yes,Yes
13,Yes,No


Unnamed: 0,Infected
2,No
9,Yes
13,No


Unnamed: 0,Fever,Cough,Infected
1,Yes,Yes,Yes
3,Yes,No,Yes
4,Yes,Yes,Yes
6,Yes,No,Yes
7,Yes,No,Yes
8,No,Yes,Yes
11,No,Yes,Yes
12,No,Yes,No


Unnamed: 0,Cough,Infected
1,Yes,Yes
3,No,Yes
4,Yes,Yes
6,No,Yes
7,No,Yes


Unnamed: 0,Cough,Infected
8,Yes,Yes
11,Yes,Yes
12,Yes,No


Unnamed: 0,Infected
8,Yes
11,Yes
12,No


DecisionNode(feature="Breathing issues", children={'No': DecisionNode(feature="Fever", children={'No': LeafNode(label="No"), 'Yes': DecisionNode(feature="Cough", children={'Yes': LeafNode(label="No")})}), 'Yes': DecisionNode(feature="Fever", children={'Yes': LeafNode(label="Yes"), 'No': DecisionNode(feature="Cough", children={'Yes': LeafNode(label="Yes")})})})

In [9]:
from graphviz import Digraph, nohtml

g = Digraph('g', filename='decision-tree.gv', node_attr={'shape': 'record', 'height': '.1'})

def plot_tree(tree, g):
    root_node = tree.feature
    if root_node is None:
        return g
    g.node(root_node, nohtml(root_node))
    child_nodes = tree.children.keys()
    for i, child in enumerate(child_nodes):
        node = tree.children[child]
        name = node.feature if node.feature is not None else child+node.label
        label = node.feature if node.feature is not None else node.label
        g.node(name, nohtml(label))
        g.edge(root_node, name, label=child)
        plot_tree(node, g)
    return g

g = plot_tree(tree, g)
g.render('decision_tree', format='png', view=True)

'decision_tree.png'