In [3]:
import pandas as pd
import numpy as np

## Q1

In [4]:
class Node:
    def __init__(self, feature=None, left=None, right=None, output=None):
        self.feature = feature
        self.left = left
        self.right = right
        self.output = output

def entropy(target):
    value_counts = target.value_counts(normalize=True)
    return -sum(value_counts * np.log2(value_counts + 1e-9))

def information_gain(data, feature, target):
    total_entropy = entropy(data[target])
    values = data[feature].unique()
    weighted_entropy = 0
    for value in values:
        subset = data[data[feature] == value]
        weighted_entropy += (len(subset) / len(data)) * entropy(subset[target])
    return total_entropy - weighted_entropy

def best_feature(data, features, target):
    best_gain = -1
    best_feature = None
    for feature in features:
        gain = information_gain(data, feature, target)
        if gain > best_gain:
            best_gain = gain
            best_feature = feature
    return best_feature

def build_tree(data, features, target):
    if len(data[target].unique()) == 1:
        return Node(output=data[target].iloc[0])
    
    if len(features) == 0:
        return Node(output=data[target].mode()[0])
    
    best_feat = best_feature(data, features, target)
    tree = Node(feature=best_feat)
    
    for value in data[best_feat].unique():
        subset = data[data[best_feat] == value]
        if subset.empty:
            leaf_output = data[target].mode()[0]
            if value in data[best_feat].unique():
                tree.left = Node(output=leaf_output)
            else:
                tree.right = Node(output=leaf_output)
        else:
            subtree = build_tree(subset, features.drop(best_feat), target)
            if tree.left is None:
                tree.left = subtree
            else:
                tree.right = subtree

    return tree

def classify(tree, instance):
    if tree.output is not None:
        return tree.output
    feature_value = instance[tree.feature]
    
    # Directly check which side of the tree to traverse
    if feature_value == tree.left.output:
        return classify(tree.left, instance)
    else:
        return classify(tree.right, instance)


In [5]:

# Load the dataset
data = pd.read_csv('weather.csv')

# Strip whitespace from column names
data.columns = data.columns.str.strip()

# Check if 'Decision' is present
if 'Decision' not in data.columns:
    raise KeyError("'Decision' column is not found in the dataset.")

features = data.columns[:-1]  # Exclude the target column
target = 'Decision'

# Build the decision tree
decision_tree = build_tree(data, features, target)

# Classifying a new sample
new_sample = {
    'Outlook': 'Sunny',
    'Temp': 85,
    'Humidity': 80,
    'Wind': 'Weak'
}

# Convert new sample into a Series for classification
new_sample_df = pd.Series(new_sample)
result = classify(decision_tree, new_sample_df)
print(f"The classification result for the new sample is: {result}")

The classification result for the new sample is: No


## Q2

In [6]:
class TreeNode:
    def __init__(self, feature=None, threshold=None, left=None, right=None, output=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.output = output

def gini_index(target):
    total = len(target)
    if total == 0:
        return 0
    proportions = target.value_counts(normalize=True)
    return 1 - sum(proportions ** 2)

def split_data(data, feature, threshold):
    left_subset = data[data[feature] <= threshold]
    right_subset = data[data[feature] > threshold]
    return left_subset, right_subset

def best_split(data, features, target):
    best_gini = float('inf')
    best_feature = None
    best_threshold = None
    for feature in features:
        if data[feature].dtype in ['int64', 'float64']:
            thresholds = data[feature].unique()
            for threshold in thresholds:
                left_subset, right_subset = split_data(data, feature, threshold)
                gini_left = gini_index(left_subset[target])
                gini_right = gini_index(right_subset[target])
                weighted_gini = (len(left_subset) * gini_left + len(right_subset) * gini_right) / len(data)
                
                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    best_feature = feature
                    best_threshold = threshold
                    
    return best_feature, best_threshold

def build_tree(data, features, target, depth=0, max_depth=5):
    if len(data[target].unique()) == 1 or depth >= max_depth:
        return TreeNode(output=data[target].mode()[0])
    
    best_feature, best_threshold = best_split(data, features, target)
    
    if best_feature is None:
        return TreeNode(output=data[target].mode()[0])
    
    left_subset, right_subset = split_data(data, best_feature, best_threshold)
    
    left_node = build_tree(left_subset, features, target, depth + 1, max_depth)
    right_node = build_tree(right_subset, features, target, depth + 1, max_depth)
    
    return TreeNode(feature=best_feature, threshold=best_threshold, left=left_node, right=right_node)

def classify(tree, instance):
    if tree.output is not None:
        return tree.output
    feature_value = instance[tree.feature]
    if feature_value <= tree.threshold:
        return classify(tree.left, instance)
    else:
        return classify(tree.right, instance)

In [7]:
# Load the dataset
data = pd.DataFrame({
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast', 
                'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy'],
    'Temp': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'Humidity': [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 
             'Weak', 'Weak', 'Strong', 'Strong', 'Weak', 'Strong'],
    'Decision': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes', 'No', 'Yes', 
                 'Yes', 'Yes', 'Yes', 'Yes', 'No']
})

# Convert categorical variables to numerical for the CART implementation
data['Outlook'] = data['Outlook'].astype('category').cat.codes
data['Wind'] = data['Wind'].astype('category').cat.codes

features = ['Outlook', 'Temp', 'Humidity', 'Wind']
target = 'Decision'

# Build the decision tree
decision_tree = build_tree(data, features, target)

# Classifying a new sample
new_sample = {
    'Outlook': 0,  # Sunny (encoded as 0)
    'Temp': 85,
    'Humidity': 80,
    'Wind': 0  # Weak (encoded as 0)
}

# Convert new sample into a Series for classification
new_sample_df = pd.Series(new_sample)
result = classify(decision_tree, new_sample_df)
print(f"The classification result for the new sample is: {result}")

The classification result for the new sample is: Yes


## Q3

In [8]:
class TreeNode:
    def __init__(self, feature=None, threshold=None, left=None, right=None, output=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.output = output

# CART: Gini Index Calculation
def gini_index(target):
    total = len(target)
    if total == 0:
        return 0
    proportions = target.value_counts(normalize=True)
    return 1 - sum(proportions ** 2)

# Split Data for CART
def split_data(data, feature, threshold):
    left_subset = data[data[feature] <= threshold]
    right_subset = data[data[feature] > threshold]
    return left_subset, right_subset

# Best Split Calculation for CART
def best_split(data, features, target):
    best_gini = float('inf')
    best_feature = None
    best_threshold = None
    for feature in features:
        if data[feature].dtype in ['int64', 'float64']:
            thresholds = data[feature].unique()
            for threshold in thresholds:
                left_subset, right_subset = split_data(data, feature, threshold)
                gini_left = gini_index(left_subset[target])
                gini_right = gini_index(right_subset[target])
                weighted_gini = (len(left_subset) * gini_left + len(right_subset) * gini_right) / len(data)
                
                if weighted_gini < best_gini:
                    best_gini = weighted_gini
                    best_feature = feature
                    best_threshold = threshold
                    
    return best_feature, best_threshold

# Build Tree for CART
def build_tree(data, features, target, depth=0, max_depth=5):
    if len(data[target].unique()) == 1 or depth >= max_depth:
        return TreeNode(output=data[target].mode()[0])
    
    best_feature, best_threshold = best_split(data, features, target)
    
    if best_feature is None:
        return TreeNode(output=data[target].mode()[0])
    
    left_subset, right_subset = split_data(data, best_feature, best_threshold)
    
    left_node = build_tree(left_subset, features, target, depth + 1, max_depth)
    right_node = build_tree(right_subset, features, target, depth + 1, max_depth)
    
    return TreeNode(feature=best_feature, threshold=best_threshold, left=left_node, right=right_node)

# Classify Sample for CART
def classify(tree, instance):
    if tree.output is not None:
        return tree.output
    feature_value = instance[tree.feature]
    if feature_value == tree.threshold:
        return classify(tree.left, instance)
    else:
        return classify(tree.right, instance)

In [9]:
# Load the dataset
data = pd.read_csv('loan.csv')

# Convert categorical variables to numerical for the CART implementation
data['Income'] = data['Income'].astype('category').cat.codes
data['Credit'] = data['Credit'].astype('category').cat.codes

# Prepare Features and Target
features = ['Income', 'Credit']
target = 'Loan_Approved'

# Build the CART Decision Tree
decision_tree_cart = build_tree(data, features, target)

# Classifying a new sample using CART
new_sample_cart = {
    'Income': 1,  # Medium (encoded)
    'Credit': 0   # Good (encoded)
}

new_sample_cart_df = pd.Series(new_sample_cart)
result_cart = classify(decision_tree_cart, new_sample_cart_df)
print(f"CART classification result for the new sample: {result_cart}")

CART classification result for the new sample: Yes


In [10]:
def entropy(target):
    value_counts = target.value_counts(normalize=True)
    return -sum(value_counts * np.log2(value_counts + 1e-9))

def information_gain(data, feature, target):
    total_entropy = entropy(data[target])
    values = data[feature].unique()
    weighted_entropy = 0
    for value in values:
        subset = data[data[feature] == value]
        weighted_entropy += (len(subset) / len(data)) * entropy(subset[target])
    return total_entropy - weighted_entropy

def best_feature(data, features, target):
    best_gain = -1
    best_feature = None
    for feature in features:
        gain = information_gain(data, feature, target)
        if gain > best_gain:
            best_gain = gain
            best_feature = feature
    return best_feature

def build_tree_c45(data, features, target):
    if len(data[target].unique()) == 1:
        return TreeNode(output=data[target].iloc[0])
    
    if len(features) == 0:
        return TreeNode(output=data[target].mode()[0])
    
    best_feat = best_feature(data, features, target)
    tree = TreeNode(feature=best_feat)

    for value in data[best_feat].unique():
        subset = data[data[best_feat] == value]
        subtree = build_tree_c45(subset, [feat for feat in features if feat != best_feat], target)
        if tree.left is None:
            tree.left = subtree
        else:
            tree.right = subtree

    return tree

In [32]:
# Build the C4.5 Decision Tree
decision_tree_c45 = build_tree_c45(data, features, target)

# Classifying a new sample using C4.5
result_c45 = classify(decision_tree_c45, new_sample_cart_df)
print(f"C4.5 classification result for the new sample: {result_c45}")

C4.5 classification result for the new sample: No
