# Lab09 : DECISION TREE [ C4.5 AND CART ]

## Q1

Write a python function program to demonstrate the working of the decision tree based  C4.5 algorithms 
without  using  scikit-learn  library.  Use  following  data  set  for  building  the  decision  tree  and  apply  this 
knowledge to classify a new sample.  
The dataset has three attributes: Outlook (Sunny, Overcast, Rainy), Temperature, Humidity and Wind (Weak, 
Strong). The target attribute is Play Tennis (Yes/No). 

In [1]:
import pandas as pd
import numpy as np

class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, prediction=None):
        self.feature = feature  
        self.threshold = threshold            
        self.left = left              
        self.right = right            
        self.prediction = prediction            

class SimpleTree:
    def __init__(self):
        self.root = None

    def fit(self, dataset):
        self.root = self._create_tree(dataset)

    def _calculate_entropy(self, labels):
        value_counts = labels.value_counts(normalize=True)
        return -np.sum(value_counts * np.log2(value_counts + 1e-9))

    def _compute_gain(self, dataset, split_feature, target):
        initial_entropy = self._calculate_entropy(dataset[target])
        value_counts = dataset[split_feature].value_counts(normalize=True)
        
        weighted_entropy = sum(value_counts[v] * self._calculate_entropy(dataset[dataset[split_feature] == v][target])
                                for v in value_counts.index)
        
        return initial_entropy - weighted_entropy

    def _find_best_split(self, dataset, target):
        max_gain = -1
        best_feature = None
        
        for feature in dataset.columns[:-1]:
            gain = self._compute_gain(dataset, feature, target)
            if gain > max_gain:
                max_gain = gain
                best_feature = feature
                
        return best_feature

    def _create_tree(self, dataset):
        target = dataset.columns[-1]
        labels = dataset[target]

        if len(labels.unique()) == 1:
            return Node(prediction=labels.iloc[0])

        if len(dataset.columns) == 1:
            return Node(prediction=labels.mode()[0])

        best_feature = self._find_best_split(dataset, target)

        tree_node = Node(feature=best_feature)

        for threshold in dataset[best_feature].unique():
            subset = dataset[dataset[best_feature] == threshold]
            child_node = self._create_tree(subset.drop(columns=[best_feature]))
            if tree_node.left is None:
                tree_node.left = child_node
                tree_node.threshold = threshold
            else:
                tree_node.right = child_node

        return tree_node

    def _classify_instance(self, node, instance):
        if node.prediction is not None:
            return node.prediction
        
        feature_value = instance[node.feature]
        
        if feature_value == node.threshold:
            return self._classify_instance(node.left, instance) if node.left else node.prediction
        else:
            return self._classify_instance(node.right, instance) if node.right else node.prediction

    def predict(self, instance):
        return self._classify_instance(self.root, instance)

data = {
    'Weather': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 
                'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'Humidity': [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80],
    'Breeze': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 
               'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 
             'Yes', 'Yes', 'Yes', 'No']
}

df = pd.DataFrame(data)

tree = SimpleTree()
tree.fit(df)

new_instance = {
    'Weather': 'Sunny',
    'Temperature': 75,
    'Humidity': 70,
    'Breeze': 'Weak'
}

new_instance_df = pd.DataFrame([new_instance])

prediction = tree.predict(new_instance_df.iloc[0])
print(f"The predicted decision for the new instance is: {prediction}")


The predicted decision for the new instance is: No


## Q2

Write a python function program to demonstrate the working of the decision tree based  CART algorithms 
without using scikit-learn library. Use Q. No. 1  data set for building the decision tree and apply this knowledge 
to classify a new sample.  
The dataset has three attributes: Outlook (Sunny, Overcast, Rainy), Temperature, Humidity and Wind (Weak, 
Strong). The target attribute is Play Tennis (Yes/No).

In [3]:
import pandas as pd
import numpy as np

class TreeNode:
    def __init__(self, feature=None, threshold=None, left=None, right=None, output=None):
        self.feature = feature  
        self.threshold = threshold            
        self.left = left              
        self.right = right
        self.output = output            

class SimpleDecisionTree:
    def __init__(self):
        self.root = None

    def fit(self, dataset):
        self.root = self._construct_tree(dataset)

    def _calculate_gini(self, outcomes):
        total_count = len(outcomes)
        if total_count == 0:
            return 0
        proportions = outcomes.value_counts(normalize=True)
        return 1 - sum(proportions ** 2)

    def _compute_gain(self, dataset, split_feature, outcome_col):
        initial_gini = self._calculate_gini(dataset[outcome_col])
        weighted_gini = 0
        
        for threshold in dataset[split_feature].unique():
            subset = dataset[dataset[split_feature] == threshold]
            weighted_gini += (len(subset) / len(dataset)) * self._calculate_gini(subset[outcome_col])
        
        return initial_gini - weighted_gini

    def _find_best_split(self, dataset, outcome_col):
        max_gain = -1
        best_feature = None
        
        for feature in dataset.columns[:-1]:
            gain = self._compute_gain(dataset, feature, outcome_col)
            if gain > max_gain:
                max_gain = gain
                best_feature = feature
                
        return best_feature

    def _construct_tree(self, dataset):
        outcome_col = dataset.columns[-1]
        outcomes = dataset[outcome_col]

        if len(outcomes.unique()) == 1:
            return TreeNode(output=outcomes.iloc[0])

        if len(dataset.columns) == 1:
            return TreeNode(output=outcomes.mode()[0])

        best_feature = self._find_best_split(dataset, outcome_col)
        node = TreeNode(feature=best_feature)

        for threshold in dataset[best_feature].unique():
            subset = dataset[dataset[best_feature] == threshold]
            child_node = self._construct_tree(subset.drop(columns=[best_feature]))
            if node.left is None:
                node.left = child_node
                node.threshold = threshold
            else:
                node.right = child_node

        return node

    def _classify_sample(self, node, sample):
        if node.output is not None:
            return node.output
        
        feature_value = sample[node.feature]
        
        if feature_value == node.threshold:
            return self._classify_sample(node.left, sample) if node.left else node.output
        else:
            return self._classify_sample(node.right, sample) if node.right else node.output

    def predict(self, sample):
        return self._classify_sample(self.root, sample)

data = {
    'Weather': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain', 'Overcast', 
                'Sunny', 'Sunny', 'Rain', 'Sunny', 'Overcast', 'Overcast', 'Rain'],
    'Temperature': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'Humidity': [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80],
    'Breeze': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 
               'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'Play': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 'Yes', 
             'Yes', 'Yes', 'Yes', 'No']
}

df = pd.DataFrame(data)

decision_tree = SimpleDecisionTree()
decision_tree.fit(df)

new_data = {
    'Weather': 'Sunny',
    'Temperature': 75,
    'Humidity': 70,
    'Breeze': 'Weak'
}

new_data_df = pd.DataFrame([new_data])

prediction = decision_tree.predict(new_data_df.iloc[0])
print(f"The predicted decision for the new sample is: {prediction}")


The predicted decision for the new sample is: No
