In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [19]:
# Opt in to the pandas "future" behaviour in which results are not silently
# downcast to a narrower dtype; this affects the `replace` call used later to
# encode the target column.
pd.set_option('future.no_silent_downcasting', True)

In [20]:
# Load the raw dataset; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv('bank_data.csv')

In [21]:
# Encode the target as integers: 'no' -> 0, 'yes' -> 1.
# Because `future.no_silent_downcasting` is enabled in this notebook, `replace`
# leaves the column as object dtype, so the cast to int is made explicit. The
# cast also fails loudly if any label other than 'no'/'yes' is present.
# `replace` (rather than `map`) keeps this cell safe to re-run.
df['y'] = df['y'].replace({'no': 0, 'yes': 1}).astype(int)

# Class balance of the encoded target.
count_zeros = (df['y'] == 0).sum()
count_ones = (df['y'] == 1).sum()

In [22]:
class Node:
    """One vertex of the decision tree.

    A node built with only ``value`` acts as a leaf; a node built with
    ``feature``, ``threshold``, ``left`` and ``right`` acts as a split.
    All fields default to ``None``.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split description: which feature is tested and against which value.
        self.feature, self.threshold = feature, threshold
        # Child nodes reached when the test succeeds (left) or fails (right).
        self.left, self.right = left, right
        # Prediction stored on the node.
        self.value = value

In [31]:
def fit_tree(X, y):
    """Grow a decision tree for features ``X`` and labels ``y``.

    Thin entry point that delegates to ``_grow_tree`` starting at its
    default depth and returns the resulting root node.
    """
    return _grow_tree(X, y)

def _grow_tree(X, y, depth=0):
        num_samples, num_features = X.shape
        num_samples_per_class = [np.sum(y == i) for i in range(num_classes)]
        most_common_class = np.argmax(num_samples_per_class)
        
        if len(np.unique(y)) == 1 or depth == max_depth:
            return Node(value=most_common_class)
        
        best_gain = -1
        best_feature = None
        best_threshold = None
        for feature_name in X.columns:
            feature_values = np.unique(X[feature_name])
            
            if len(feature_values) == 1: 
                continue
                
            for value in feature_values:
                left_indices = X.index[X[feature_name] == value]
                right_indices = X.index[X[feature_name] != value]
                left_entropy = _entropy(y[left_indices])
                right_entropy = _entropy(y[right_indices])
                entropy = (len(left_indices) / num_samples) * left_entropy + (len(right_indices) / num_samples) * right_entropy
                       
                gain = _information_gain(_entropy(y), entropy)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_name
                    best_threshold = value
        
        if best_gain == 0:  
            return Node(value=most_common_class)
        
        left_indices = X.index[X[best_feature] == best_threshold]
        right_indices = X.index[X[best_feature] != best_threshold]
        
        left_child = _grow_tree(X.loc[left_indices], y.loc[left_indices], depth + 1)
        right_child = _grow_tree(X.loc[right_indices], y.loc[right_indices], depth + 1)
        
        return Node(feature=best_feature, threshold=best_threshold, left=left_child, right=right_child)

def _entropy(y):
    if len(y) == 0:
        return 0
    num_samples = len(y)
    num_samples_per_class = [np.sum(y == i) for i in range(num_classes)]
    class_probabilities = [num_samples_i / num_samples for num_samples_i in num_samples_per_class]
    entropy = -sum(p * np.log2(p) for p in class_probabilities if p != 0)
    return entropy

def _information_gain(parent_entropy, children_entropy):
    return parent_entropy - children_entropy

def predict(root, X):
    """Predict a label for every row of ``X`` using the tree rooted at ``root``.

    Rows are visited in order with ``X.iterrows()`` and each one is routed
    through the tree by ``_predict_tree``; the results are returned as a
    NumPy array.
    """
    labels = []
    for _, row in X.iterrows():
        labels.append(_predict_tree(row, root))
    return np.array(labels)

def _predict_tree(x, node):
    if node.value is not None:
        return node.value
    if x[node.feature] == node.threshold:
        return _predict_tree(x, node.left)
    else:
        return _predict_tree(x, node.right)

In [32]:
# Separate the target column from the predictor columns.
y = df['y']
X = df.drop(columns=['y'])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [33]:
def accuracy_s(y_pred, y_true):
    """Return the fraction of positions at which two label sequences agree.

    The comparison is purely positional and symmetric, so the arguments may
    be given in either order and may be any 1-D array-like (pandas Series,
    NumPy array, list). A Series' index labels are ignored, which matters for
    shuffled train/test splits.

    Parameters
    ----------
    y_pred : array-like
        Predicted labels.
    y_true : array-like
        Reference labels, same length as ``y_pred``.

    Returns
    -------
    float
        Number of matching positions divided by the number of samples.

    Raises
    ------
    ValueError
        If the inputs differ in shape or are empty.
    """
    predicted = np.asarray(y_pred)
    actual = np.asarray(y_true)

    if predicted.shape != actual.shape:
        raise ValueError(
            f"y_pred and y_true must have the same shape, got {predicted.shape} and {actual.shape}"
        )
    if predicted.size == 0:
        raise ValueError("accuracy is undefined for empty inputs")

    # Integer count divided by integer size gives the same value as the
    # original decrement-per-mismatch loop.
    return np.count_nonzero(predicted == actual) / predicted.size

In [None]:
# `num_classes` and `max_depth` are read as module-level globals by the tree
# helpers (`_grow_tree`, `_entropy`), so they are assigned here rather than
# passed as arguments. The class count does not depend on the depth, so it is
# computed once before the sweep.
num_classes = len(np.unique(y_train))

# Sweep the depth limit from 1 to 16 and report train/test accuracy for each.
for i in range(1, 17):
    print(f"max_depth = {i}")
    max_depth = i

    root = fit_tree(X_train, y_train)

    # Note the argument order: the label Series is passed first and the
    # prediction array second.
    train_predictions = predict(root, X_train)
    train_accuracy = accuracy_s(y_train, train_predictions)
    print("Training Accuracy:", train_accuracy)

    test_predictions = predict(root, X_test)
    test_accuracy = accuracy_s(y_test, test_predictions)
    print("Testing Accuracy:", test_accuracy)

max_depth = 1
Training Accuracy: 0.8924225663716814
Testing Accuracy: 0.8950276243093923
max_depth = 2
Training Accuracy: 0.8924225663716814
Testing Accuracy: 0.8950276243093923
max_depth = 3
Training Accuracy: 0.8935287610619469
Testing Accuracy: 0.8961325966850828
max_depth = 4
Training Accuracy: 0.8957411504424779
Testing Accuracy: 0.8906077348066298
max_depth = 5
Training Accuracy: 0.8979535398230089
Testing Accuracy: 0.8928176795580111
max_depth = 6
Training Accuracy: 0.9037610619469026
Testing Accuracy: 0.8961325966850828
max_depth = 7
Training Accuracy: 0.90625
Testing Accuracy: 0.8928176795580111
max_depth = 8
Training Accuracy: 0.9081858407079646
Testing Accuracy: 0.8895027624309392
max_depth = 9
Training Accuracy: 0.9112278761061947
Testing Accuracy: 0.8906077348066298
max_depth = 10
Training Accuracy: 0.9153761061946902
Testing Accuracy: 0.8883977900552487
max_depth = 11
Training Accuracy: 0.9217367256637168
Testing Accuracy: 0.887292817679558
max_depth = 12
Training Accurac

In [None]:
class Node:
    """Vertex of a decision tree: either a split or a leaf.

    Attributes
    ----------
    feature : label of the feature (column) tested at this node
    threshold : value the feature is compared against
    left : child node for the left branch
    right : child node for the right branch
    value : value held by the node when it is a leaf
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        # Split description.
        self.feature, self.threshold = feature, threshold
        # Children.
        self.left, self.right = left, right
        # Leaf payload.
        self.value = value
        
class DecisionTreeID3:
    """Binary decision tree classifier grown greedily by information gain.

    Every internal node tests one feature for equality with a single value:
    rows that match go to the left child, all others go to the right child.
    Class labels must be the integers ``0 .. num_classes - 1`` because class
    counts are taken with ``y == i``.
    """

    def __init__(self, max_depth=None):
        # Depth at which growth stops. With None the depth check never
        # matches, so growth is limited only by purity / zero gain.
        self.max_depth = max_depth

    def fit(self, X, y):
        """Build the tree from a feature DataFrame ``X`` and label Series ``y``.

        Stores the number of distinct labels in ``self.num_classes`` and the
        root node in ``self.root``.
        """
        self.num_classes = len(np.unique(y))
        self.root = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        """Recursively build the subtree for the samples in ``X`` / ``y``.

        ``y`` must be positionally aligned with the rows of ``X``. Returns a
        leaf ``Node`` holding the majority class when the node is pure, the
        depth limit is reached, or no split improves entropy; otherwise an
        internal ``Node`` for the best split (first candidate wins ties).
        """
        num_samples = len(X)
        num_samples_per_class = [np.sum(y == i) for i in range(self.num_classes)]
        most_common_class = np.argmax(num_samples_per_class)

        # If only one class is present or maximum depth reached, create a leaf node
        if len(np.unique(y)) == 1 or depth == self.max_depth:
            return Node(value=most_common_class)

        # The parent entropy is the same for every candidate: compute it once.
        parent_entropy = self._entropy(y)

        # Select the best split based on information gain
        best_gain = -1
        best_feature = None
        best_threshold = None
        for feature_name in X.columns:
            column = X[feature_name]
            feature_values = np.unique(column)

            if len(feature_values) == 1:  # Skip features with only one value
                continue

            for value in feature_values:
                # Positional boolean mask: immune to duplicate index labels.
                goes_left = np.asarray(column == value, dtype=bool)
                num_left = int(goes_left.sum())
                num_right = num_samples - num_left
                children_entropy = (
                    (num_left / num_samples) * self._entropy(y[goes_left])
                    + (num_right / num_samples) * self._entropy(y[~goes_left])
                )

                gain = self._information_gain(parent_entropy, children_entropy)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature_name
                    best_threshold = value

        # No usable split: either every feature is constant on this subset
        # (best_feature is still None) or the best split gains nothing.
        # `<= 0` also covers tiny negative gains from floating-point rounding.
        if best_feature is None or best_gain <= 0:
            return Node(value=most_common_class)

        goes_left = np.asarray(X[best_feature] == best_threshold, dtype=bool)
        left_child = self._grow_tree(X.loc[goes_left], y[goes_left], depth + 1)
        right_child = self._grow_tree(X.loc[~goes_left], y[~goes_left], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_child, right=right_child)

    def _entropy(self, y):
        """Shannon entropy (base 2) of the labels ``y``; 0 for an empty ``y``."""
        if len(y) == 0:
            return 0
        num_samples = len(y)
        num_samples_per_class = [np.sum(y == i) for i in range(self.num_classes)]
        class_probabilities = [num_samples_i / num_samples for num_samples_i in num_samples_per_class]
        # Absent classes are skipped because log2(0) is undefined.
        entropy = -sum(p * np.log2(p) for p in class_probabilities if p != 0)
        return entropy

    def _information_gain(self, parent_entropy, children_entropy):
        """Entropy removed by a split: parent minus weighted children."""
        return parent_entropy - children_entropy

    def predict(self, X):
        """Return a NumPy array with one predicted label per row of ``X``."""
        return np.array([self._predict_tree(x, self.root) for _, x in X.iterrows()])

    def _predict_tree(self, x, node):
        """Route sample ``x`` from ``node`` down to a leaf and return its value."""
        if node.value is not None:
            return node.value
        if x[node.feature] == node.threshold:
            return self._predict_tree(x, node.left)
        else:
            return self._predict_tree(x, node.right)