## **Q1** — Decision tree from scratch on the advertising dataset (entropy vs. Gini)

In [5]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from anytree import Node, RenderTree
import math

In [1]:
import pandas as pd

# Location of the advertising dataset (presumably a Colab upload path — adjust when running elsewhere).
ADVERTISING_CSV_PATH = "/content/advertising.csv"
data = pd.read_csv(ADVERTISING_CSV_PATH)

In [2]:
# Quick look at the first five rows to confirm the columns loaded as expected.
data.head(n=5)

Unnamed: 0,Daily Time Spent on Site,Age,Area Income,Daily Internet Usage,Ad Topic Line,City,Male,Country,Timestamp,Clicked on Ad
0,68.95,35,61833.9,256.09,Cloned 5thgeneration orchestration,Wrightburgh,0,Tunisia,2016-03-27 00:53:11,0
1,80.23,31,68441.85,193.77,Monitored national standardization,West Jodi,1,Nauru,2016-04-04 01:39:02,0
2,69.47,26,59785.94,236.5,Organic bottom-line service-desk,Davidton,0,San Marino,2016-03-13 20:35:42,0
3,74.15,29,54806.18,245.89,Triple-buffered reciprocal time-frame,West Terrifurt,1,Italy,2016-01-10 02:31:19,0
4,68.37,35,73889.99,225.58,Robust logistical utilization,South Manuel,0,Iceland,2016-06-03 03:36:18,0


In [6]:
# Discard the text/timestamp columns; the tree below only splits on numeric thresholds.
# errors='ignore' keeps this cell idempotent: because `data` is overwritten here,
# re-running the cell would otherwise raise KeyError on the already-removed columns.
data = data.drop(columns=['Ad Topic Line', 'City', 'Country', 'Timestamp'], errors='ignore')

# Features vs. binary target
X = data.drop(columns='Clicked on Ad')
y = data['Clicked on Ad']

# 80/20 hold-out split with a fixed seed so the results are reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# build_tree expects features and target in a single frame, so re-attach the labels
# (assign returns a new frame and aligns the labels on the row index).
train_data = X_train.assign(**{'Clicked on Ad': y_train})
test_data = X_test.assign(**{'Clicked on Ad': y_test})

In [7]:
def entropy(y):
    """Return the Shannon entropy (in bits) of the class labels in ``y``.

    Parameters
    ----------
    y : array-like
        Class labels. Any label type that ``np.unique`` can sort is accepted
        (non-negative ints, negative ints, strings, ...).

    Returns
    -------
    float
        ``-sum(p * log2(p))`` over the observed classes; ``0.0`` for empty input.
    """
    n_samples = len(y)
    if n_samples == 0:
        # No samples means no uncertainty; also avoids dividing by zero below.
        return 0.0

    # np.unique counts arbitrary labels, whereas np.bincount only accepts
    # non-negative integers. Only observed classes are returned, so every p > 0.
    _, counts = np.unique(np.asarray(y), return_counts=True)
    probabilities = counts / n_samples
    return -np.sum([p * math.log2(p) for p in probabilities])

def gini(y):
    """Return the Gini impurity of the class labels in ``y``.

    Parameters
    ----------
    y : array-like
        Class labels. Any label type that ``np.unique`` can sort is accepted
        (non-negative ints, negative ints, strings, ...).

    Returns
    -------
    float
        ``1 - sum(p ** 2)`` over the observed classes; ``0.0`` for empty input.
    """
    n_samples = len(y)
    if n_samples == 0:
        # An empty node holds no mixed classes, so report it as pure.
        return 0.0

    # np.unique counts arbitrary labels, whereas np.bincount only accepts
    # non-negative integers.
    _, counts = np.unique(np.asarray(y), return_counts=True)
    probabilities = counts / n_samples
    return 1 - np.sum([p**2 for p in probabilities])


In [8]:
class DecisionNode:
    """One node of a binary decision tree.

    An internal node stores a split rule (``feature``, ``threshold``) and its two
    subtrees (``left``, ``right``). A leaf stores only ``value``, the predicted label.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split rule evaluated at an internal node
        self.feature, self.threshold = feature, threshold
        # Child subtrees
        self.left, self.right = left, right
        # Prediction; only set on leaf nodes
        self.value = value


In [9]:
def information_gain(data, feature, threshold, target_col, criterion='entropy'):
    """Impurity reduction obtained by splitting ``data`` on ``feature <= threshold``.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows reaching the current node, including the target column.
    feature : str
        Column to split on.
    threshold : scalar
        Rows with ``data[feature] <= threshold`` go left; rows with
        ``data[feature] > threshold`` go right.
    target_col : str
        Name of the label column.
    criterion : {'entropy', 'gini'}
        Impurity measure to use.

    Returns
    -------
    float or int
        Parent impurity minus the size-weighted impurity of the two children,
        or ``0`` when either side of the split would be empty.

    Raises
    ------
    ValueError
        If ``criterion`` is not recognised. This is checked up front so that a
        misspelled criterion cannot slip through on a degenerate split.
    """
    if criterion not in ('entropy', 'gini'):
        raise ValueError("Invalid criterion")

    # Boolean masks instead of two filtered copies of the whole frame.
    column = data[feature]
    goes_left = (column <= threshold).to_numpy()
    goes_right = (column > threshold).to_numpy()
    n_left = int(goes_left.sum())
    n_right = int(goes_right.sum())

    # A split that sends every row to one side separates nothing.
    if n_left == 0 or n_right == 0:
        return 0

    impurity = entropy if criterion == 'entropy' else gini
    y = data[target_col]
    n_total = len(y)
    return (
        impurity(y)
        - (n_left / n_total) * impurity(y[goes_left])
        - (n_right / n_total) * impurity(y[goes_right])
    )


In [10]:
def build_tree(data, target_col='Clicked on Ad', depth=0, max_depth=5, criterion='entropy'):
    """Recursively grow a binary decision tree by greedy impurity reduction.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows for this subtree; every column except ``target_col`` is
        treated as a candidate feature.
    target_col : str
        Name of the label column.
    depth : int
        Depth of the node being built (0 for the root).
    max_depth : int or None
        Depth at which growth stops; ``None`` disables the limit.
    criterion : {'entropy', 'gini'}
        Impurity measure forwarded to ``information_gain``.

    Returns
    -------
    DecisionNode
        A leaf holding the most frequent label, or an internal node with the
        best ``feature <= threshold`` split and its two subtrees.

    Raises
    ------
    ValueError
        If ``data`` has no rows (there is no label to predict).
    """
    y = data[target_col]
    if len(y) == 0:
        raise ValueError("Cannot build a decision tree from an empty dataset")

    # Stop condition: pure node, or depth budget used up (>= also covers a
    # starting depth that is already past the limit).
    if len(set(y)) == 1 or (max_depth is not None and depth >= max_depth):
        return DecisionNode(value=y.mode()[0])

    best_gain = -1
    best_feat, best_thresh = None, None
    # Column names only; no need to copy the frame just to list its features.
    feature_cols = [col for col in data.columns if col != target_col]
    for feature in feature_cols:
        for val in data[feature].unique():
            gain = information_gain(data, feature, val, target_col, criterion)
            # Strict '>' keeps the first candidate among equally good splits.
            if gain > best_gain:
                best_gain = gain
                best_feat = feature
                best_thresh = val

    # No feature columns at all, or no split that improves purity: make a leaf.
    if best_feat is None or best_gain <= 0:
        return DecisionNode(value=y.mode()[0])

    left_data = data[data[best_feat] <= best_thresh]
    right_data = data[data[best_feat] > best_thresh]
    left = build_tree(left_data, target_col, depth + 1, max_depth, criterion)
    right = build_tree(right_data, target_col, depth + 1, max_depth, criterion)

    return DecisionNode(feature=best_feat, threshold=best_thresh, left=left, right=right)


In [11]:
def predict_tree(row, node):
    """Route one sample down the tree and return the label of the leaf it reaches.

    ``row`` must support ``row[feature_name]`` lookups. At each internal node
    the sample goes left when ``row[feature] <= threshold`` and right otherwise.
    """
    current = node
    # A node is a leaf as soon as it carries a value.
    while current.value is None:
        if row[current.feature] <= current.threshold:
            current = current.left
        else:
            current = current.right
    return current.value

def predict_batch(X, tree):
    """Predict a label for every row of ``X`` and return them as a list, in row order."""
    predictions = []
    for _index, sample in X.iterrows():
        predictions.append(predict_tree(sample, tree))
    return predictions


In [12]:
# Held-out features and labels shared by both evaluations
eval_features = test_data.drop(columns='Clicked on Ad')
eval_labels = test_data['Clicked on Ad']

# Entropy criterion: fit on the training split, score on the test split
tree_entropy = build_tree(train_data, criterion='entropy')
pred_entropy = predict_batch(eval_features, tree_entropy)
print("Entropy - Accuracy:", accuracy_score(eval_labels, pred_entropy))
print(classification_report(eval_labels, pred_entropy))

# Gini criterion: same procedure
tree_gini = build_tree(train_data, criterion='gini')
pred_gini = predict_batch(eval_features, tree_gini)
print("Gini - Accuracy:", accuracy_score(eval_labels, pred_gini))
print(classification_report(eval_labels, pred_gini))


Entropy - Accuracy: 0.92
              precision    recall  f1-score   support

           0       0.89      0.93      0.91        89
           1       0.94      0.91      0.93       111

    accuracy                           0.92       200
   macro avg       0.92      0.92      0.92       200
weighted avg       0.92      0.92      0.92       200

Gini - Accuracy: 0.925
              precision    recall  f1-score   support

           0       0.90      0.93      0.92        89
           1       0.94      0.92      0.93       111

    accuracy                           0.93       200
   macro avg       0.92      0.93      0.92       200
weighted avg       0.93      0.93      0.93       200



In [13]:
def print_tree(node, spacing=""):
    """Print the tree rooted at ``node`` as indented text.

    Each internal node prints its ``[feature <= threshold]`` test followed by
    its True (left) and False (right) branches, indented two spaces deeper;
    each leaf prints its predicted label.
    """
    if node.value is not None:
        print(f"{spacing}Predict: {node.value}")
    else:
        child_spacing = spacing + "  "
        print(f"{spacing}[{node.feature} <= {node.threshold}]")
        for branch_label, child in (('--> True:', node.left), ('--> False:', node.right)):
            print(spacing + branch_label)
            print_tree(child, child_spacing)

# Dump both fitted trees, each under its own heading
for heading, fitted_tree in (
    ("\nTree using Entropy:", tree_entropy),
    ("\nTree using Gini:", tree_gini),
):
    print(heading)
    print_tree(fitted_tree)



Tree using Entropy:
[Daily Internet Usage <= 178.35]
--> True:
  [Daily Time Spent on Site <= 71.86]
  --> True:
    [Area Income <= 76893.84]
    --> True:
      [Daily Internet Usage <= 175.37]
      --> True:
        Predict: 1
      --> False:
        [Daily Time Spent on Site <= 52.62]
        --> True:
          Predict: 1
        --> False:
          Predict: 0
    --> False:
      Predict: 0
  --> False:
    [Daily Internet Usage <= 160.33]
    --> True:
      [Daily Time Spent on Site <= 85.73]
      --> True:
        [Age <= 26]
        --> True:
          Predict: 0
        --> False:
          Predict: 1
      --> False:
        [Daily Time Spent on Site <= 87.97]
        --> True:
          Predict: 0
        --> False:
          Predict: 1
    --> False:
      [Age <= 42]
      --> True:
        [Area Income <= 40763.13]
        --> True:
          Predict: 1
        --> False:
          Predict: 0
      --> False:
        Predict: 1
--> False:
  [Daily Time Spent on Sit

## **Q2** — Decision tree from scratch on the Iris dataset (entropy vs. Gini)

In [14]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import math

# Iris measurements as a DataFrame, with the class label appended as `target`
iris = load_iris()
data = pd.DataFrame(iris.data, columns=iris.feature_names).assign(target=iris.target)


In [15]:
def entropy(y):
    """Shannon entropy, in bits, of the label collection ``y``.

    ``y`` may hold any labels ``np.unique`` can sort (ints of either sign,
    strings, ...). Returns ``-sum(p * log2(p))`` over the classes that occur,
    and ``0.0`` when ``y`` is empty.
    """
    total = len(y)
    if total == 0:
        # Nothing to be uncertain about; also sidesteps a division by zero.
        return 0.0

    # np.bincount would reject negative or non-integer labels; np.unique does
    # not, and it reports only classes that occur, so no zero probabilities arise.
    _, class_counts = np.unique(np.asarray(y), return_counts=True)
    probabilities = class_counts / total
    return -np.sum([p * math.log2(p) for p in probabilities])

def gini(y):
    """Gini impurity of the label collection ``y``.

    ``y`` may hold any labels ``np.unique`` can sort (ints of either sign,
    strings, ...). Returns ``1 - sum(p ** 2)`` over the classes that occur,
    and ``0.0`` when ``y`` is empty.
    """
    total = len(y)
    if total == 0:
        # Treat an empty node as pure rather than reporting an impurity of 1.
        return 0.0

    # np.bincount would reject negative or non-integer labels; np.unique does not.
    _, class_counts = np.unique(np.asarray(y), return_counts=True)
    probabilities = class_counts / total
    return 1 - np.sum([p**2 for p in probabilities])


In [16]:
class DecisionNode:
    """Node of a binary decision tree.

    Internal nodes hold the split (``feature``, ``threshold``) plus the
    ``left`` and ``right`` subtrees; leaves hold the predicted ``value``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split definition (internal nodes)
        self.feature, self.threshold = feature, threshold
        # Subtrees below this node
        self.left, self.right = left, right
        # Predicted label (leaves)
        self.value = value


In [17]:
def information_gain(data, feature, threshold, target_col, criterion='entropy'):
    """Score the split ``data[feature] <= threshold`` by its impurity reduction.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows at the current node, target column included.
    feature : str
        Column being split.
    threshold : scalar
        Rows with a value ``<= threshold`` form the left child, rows with a
        value ``> threshold`` form the right child.
    target_col : str
        Label column name.
    criterion : {'entropy', 'gini'}
        Which impurity function to apply.

    Returns
    -------
    float or int
        Impurity of the parent minus the size-weighted impurity of the
        children; ``0`` if one child would be empty.

    Raises
    ------
    ValueError
        For an unknown ``criterion``. Validation happens first so the error is
        raised even when the split is degenerate.
    """
    if criterion not in ('entropy', 'gini'):
        raise ValueError("Invalid criterion")

    # Work with boolean masks; filtering the full frame twice per candidate
    # threshold copies every column for no benefit.
    values = data[feature]
    left_mask = (values <= threshold).to_numpy()
    right_mask = (values > threshold).to_numpy()
    left_size = int(left_mask.sum())
    right_size = int(right_mask.sum())

    # Everything landed on one side: the split carries no information.
    if left_size == 0 or right_size == 0:
        return 0

    impurity = entropy if criterion == 'entropy' else gini
    y = data[target_col]
    total = len(y)
    return (
        impurity(y)
        - (left_size / total) * impurity(y[left_mask])
        - (right_size / total) * impurity(y[right_mask])
    )


In [18]:
def build_tree(data, target_col='target', depth=0, max_depth=5, criterion='entropy'):
    """Grow a binary decision tree top-down, choosing the best split at each node.

    Parameters
    ----------
    data : pandas.DataFrame
        Training rows for this subtree; all columns other than ``target_col``
        are candidate features.
    target_col : str
        Label column name.
    depth : int
        Depth of the node under construction (root = 0).
    max_depth : int or None
        Growth stops once this depth is reached; ``None`` means no limit.
    criterion : {'entropy', 'gini'}
        Impurity measure passed through to ``information_gain``.

    Returns
    -------
    DecisionNode
        Either a leaf carrying the most frequent label, or an internal node
        with the chosen ``feature <= threshold`` test and two subtrees.

    Raises
    ------
    ValueError
        If ``data`` contains no rows.
    """
    y = data[target_col]
    if len(y) == 0:
        raise ValueError("Cannot build a decision tree from an empty dataset")

    # Leaf when the node is pure or the depth budget is spent (>= also stops a
    # call that starts beyond the limit).
    if len(set(y)) == 1 or (max_depth is not None and depth >= max_depth):
        return DecisionNode(value=y.mode()[0])

    best_gain = -1
    best_feat, best_thresh = None, None
    # List the feature names directly instead of copying the frame via drop().
    candidate_features = [col for col in data.columns if col != target_col]
    for feature in candidate_features:
        for val in data[feature].unique():
            gain = information_gain(data, feature, val, target_col, criterion)
            # Strict '>' means the earliest of several equally good splits wins.
            if gain > best_gain:
                best_gain = gain
                best_feat = feature
                best_thresh = val

    # Leaf when there is nothing to split on or no split improves purity.
    if best_feat is None or best_gain <= 0:
        return DecisionNode(value=y.mode()[0])

    left_data = data[data[best_feat] <= best_thresh]
    right_data = data[data[best_feat] > best_thresh]
    left = build_tree(left_data, target_col, depth + 1, max_depth, criterion)
    right = build_tree(right_data, target_col, depth + 1, max_depth, criterion)

    return DecisionNode(feature=best_feat, threshold=best_thresh, left=left, right=right)


In [19]:
def predict_tree(row, node):
    """Return the label predicted for ``row`` by the tree rooted at ``node``.

    Descends from ``node``: left when ``row[feature] <= threshold``, right
    otherwise, until a node carrying a ``value`` is reached.
    """
    cursor = node
    while cursor.value is None:
        takes_left = row[cursor.feature] <= cursor.threshold
        cursor = cursor.left if takes_left else cursor.right
    return cursor.value

def predict_batch(X, tree):
    """Run ``predict_tree`` on each row of ``X``; returns the labels as a list in row order."""
    labels = []
    for _idx, record in X.iterrows():
        labels.append(predict_tree(record, tree))
    return labels


In [20]:
# 80/20 hold-out split with a fixed seed
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

# Held-out features and labels shared by both evaluations
eval_features = test_data.drop(columns='target')
eval_labels = test_data['target']

# Entropy criterion
tree_entropy = build_tree(train_data, target_col='target', criterion='entropy')
pred_entropy = predict_batch(eval_features, tree_entropy)

print("=== Entropy ===")
print("Accuracy:", accuracy_score(eval_labels, pred_entropy))
print(classification_report(eval_labels, pred_entropy))

# Gini criterion
tree_gini = build_tree(train_data, target_col='target', criterion='gini')
pred_gini = predict_batch(eval_features, tree_gini)

print("=== Gini ===")
print("Accuracy:", accuracy_score(eval_labels, pred_gini))
print(classification_report(eval_labels, pred_gini))


=== Entropy ===
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

=== Gini ===
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30



In [21]:
def print_tree(node, spacing=""):
    """Pretty-print the tree rooted at ``node``, indenting two spaces per level.

    Leaves print ``Predict: <value>``; internal nodes print their
    ``[feature <= threshold]`` test and then the True (left) and False (right)
    branches.
    """
    if node.value is not None:
        print(f"{spacing}Predict: {node.value}")
    else:
        deeper = spacing + "  "
        print(f"{spacing}[{node.feature} <= {node.threshold}]")
        branches = (('--> True:', node.left), ('--> False:', node.right))
        for marker, subtree in branches:
            print(spacing + marker)
            print_tree(subtree, deeper)

# Dump both fitted trees, each under its own heading
for heading, fitted_tree in (
    ("Tree with Entropy:", tree_entropy),
    ("Tree with Gini:", tree_gini),
):
    print(heading)
    print_tree(fitted_tree)


Tree with Entropy:
[petal length (cm) <= 1.9]
--> True:
  Predict: 0
--> False:
  [petal length (cm) <= 4.7]
  --> True:
    [petal width (cm) <= 1.6]
    --> True:
      Predict: 1
    --> False:
      Predict: 2
  --> False:
    [petal width (cm) <= 1.7]
    --> True:
      [petal length (cm) <= 5.1]
      --> True:
        [sepal width (cm) <= 2.2]
        --> True:
          Predict: 2
        --> False:
          Predict: 1
      --> False:
        Predict: 2
    --> False:
      [petal length (cm) <= 4.8]
      --> True:
        [sepal length (cm) <= 5.9]
        --> True:
          Predict: 1
        --> False:
          Predict: 2
      --> False:
        Predict: 2
Tree with Gini:
[petal length (cm) <= 1.9]
--> True:
  Predict: 0
--> False:
  [petal length (cm) <= 4.7]
  --> True:
    [petal width (cm) <= 1.6]
    --> True:
      Predict: 1
    --> False:
      Predict: 2
  --> False:
    [petal width (cm) <= 1.7]
    --> True:
      [petal length (cm) <= 5.1]
      --> True:
