In [213]:
import numpy as np

In [214]:
# Reading in the datasets
# Both files are parsed with np.loadtxt's default settings; the paths are
# relative to the notebook's working directory.
clean_dataset = np.loadtxt('Dataset/clean_dataset.txt')
noisy_dataset = np.loadtxt('Dataset/noisy_dataset.txt')

In [215]:
# Sanity check: (rows, columns) of the clean dataset
print(clean_dataset.shape)

(2000, 8)


In [216]:
# Sanity check: (rows, columns) of the noisy dataset
print(noisy_dataset.shape)

(2000, 8)


In [217]:
# Peek at rows 3 and 4 of the clean dataset (features followed by the label)
print(clean_dataset[3:5])

[[-61. -60. -68. -62. -77. -90. -80.   1.]
 [-63. -65. -60. -63. -77. -81. -87.   1.]]


In [218]:
# Distinct values found in the last column, which holds the class labels
np.unique(clean_dataset[:, -1])

array([1., 2., 3., 4.])

In [219]:
# Count how many samples carry each class label (last column)
nique, counts = np.unique(clean_dataset[:, -1], return_counts=True)
print(nique.size)
print(counts)

4
[500 500 500 500]


In [220]:
# Shannon entropy (in bits) of the class labels stored in a dataset's last column
def cal_entropy(dataset):
    """Return the Shannon entropy, in bits, of the labels in the last column.

    Parameters
    ----------
    dataset : array-like
        2-D array whose last column holds the class label of every row.
        An empty dataset (including the 1-D result of ``np.array([])``)
        is accepted.

    Returns
    -------
    float
        ``-sum(p_k * log2(p_k))`` over the distinct labels ``k``, where
        ``p_k`` is the fraction of rows carrying label ``k``. The result is
        0.0 for an empty dataset and for a dataset with a single label.
    """
    dataset = np.asarray(dataset)

    # An empty dataset carries no uncertainty; returning early also avoids
    # indexing a column of a 1-D empty array.
    if dataset.size == 0:
        return 0.0

    _, counts = np.unique(dataset[:, -1], return_counts=True)
    probabilities = counts / counts.sum()

    # np.unique only reports labels that actually occur, so every probability
    # is strictly positive and log2 is always finite here.
    entropy = -np.sum(probabilities * np.log2(probabilities))

    # Adding 0.0 turns the -0.0 produced by a pure dataset into 0.0.
    return entropy + 0.0

In [221]:
# Function to calculate the best split feature and value
# We will iterate through all possible split points
def find_split(dataset):
    """Search every feature for the threshold with the highest information gain.

    Candidate thresholds are the midpoints between consecutive distinct
    values of each feature column. Rows whose feature value is ``<=`` the
    threshold go to the left subset, all other rows go to the right subset.

    Parameters
    ----------
    dataset : np.ndarray
        2-D array; every column except the last is a feature and the last
        column is the class label.

    Returns
    -------
    tuple
        ``(best_info_gain, best_feature_index, best_value, best_left_data,
        best_right_data)``. When no candidate split exists (for example every
        feature column is constant) the initial values
        ``(-9999, None, None, [], [])`` are returned.
    """
    # Initialisation
    best_info_gain = -9999
    best_feature_index = None
    best_value = None
    best_left_data = []
    best_right_data = []

    n_rows = len(dataset)
    entropy_complete_dataset = cal_entropy(dataset)

    # Loop through all features (the last column is the label)
    for feature_index in range(dataset.shape[1] - 1):
        column = dataset[:, feature_index]

        # np.unique returns sorted values, so neighbouring entries are
        # consecutive distinct values; their midpoints are the candidates.
        unique_feature_values = np.unique(column)
        potential_splits = (unique_feature_values[:-1] + unique_feature_values[1:]) / 2

        for split_value in potential_splits:
            # A boolean mask partitions the rows in one vectorised step
            # while preserving their original order.
            goes_left = column <= split_value
            left_dataset = dataset[goes_left]
            right_dataset = dataset[~goes_left]

            # A midpoint can round onto one of its neighbours for values
            # that are extremely close; such a split separates nothing.
            if len(left_dataset) == 0 or len(right_dataset) == 0:
                continue

            prob_left = len(left_dataset) / n_rows
            prob_right = len(right_dataset) / n_rows

            information_gain = entropy_complete_dataset - (
                prob_left * cal_entropy(left_dataset)
                + prob_right * cal_entropy(right_dataset)
            )

            # Strict comparison keeps the first split found among ties.
            if information_gain > best_info_gain:
                best_info_gain = information_gain
                best_feature_index = feature_index
                best_value = split_value
                best_left_data = left_dataset
                best_right_data = right_dataset

    return best_info_gain, best_feature_index, best_value, best_left_data, best_right_data

In [222]:
# Sanity check: four labels with proportions 0.3 / 0.2 / 0.3 / 0.2
print(cal_entropy(np.array([[1],[1],[1],[2],[2],[3],[3],[3],[4],[4]])))

1.9709505944546686


In [223]:
# Sanity check: a dataset containing a single label must have zero entropy
print(cal_entropy(np.array([[2],[2],[2],[2],[2]])) == 0)


True


In [224]:
# Building block of the decision tree
class Node:
    """A single node of a binary decision tree.

    A node is either an internal decision node (``attributeIndex`` and
    ``splitValue`` describe the test, ``left``/``right`` hold the children)
    or a leaf (``isLeaf`` is True and ``prediction`` holds the label).
    """

    def __init__(
        self,
        left=None,
        right=None,
        splitValue=None,
        attributeIndex=None,
        isLeaf=False,
        prediction=None,
    ) -> None:
        # Child sub-trees
        self.left, self.right = left, right
        # Decision rule stored on the node
        self.splitValue, self.attributeIndex = splitValue, attributeIndex
        # Leaf information
        self.isLeaf, self.prediction = isLeaf, prediction

In [225]:
def decision_tree_learning(dataset, depth):
    """Recursively build a decision tree from ``dataset``.

    Parameters
    ----------
    dataset : np.ndarray
        2-D array; every column except the last is a feature and the last
        column is the class label.
    depth : int
        Depth of the node being built (the caller passes 0 for the root).

    Returns
    -------
    tuple
        ``(node, max_depth)`` where ``node`` is the root ``Node`` of the
        (sub)tree and ``max_depth`` is the largest depth reached by any leaf
        below it.

    Raises
    ------
    ValueError
        If ``dataset`` contains no rows.
    """
    labels, counts = np.unique(dataset[:, -1], return_counts=True)

    if len(labels) == 0:
        raise ValueError("cannot build a decision tree from an empty dataset")

    # All samples share one label: nothing left to separate.
    if len(labels) == 1:
        return Node(isLeaf=True, prediction=labels[0]), depth

    _, best_feature_index, best_value, left_data, right_data = find_split(dataset)

    # No usable split exists when the remaining rows have identical features
    # but different labels (possible with noisy data). Recursing would fail
    # on the empty subsets, so fall back to a majority-vote leaf.
    if best_feature_index is None or len(left_data) == 0 or len(right_data) == 0:
        return Node(isLeaf=True, prediction=labels[np.argmax(counts)]), depth

    node = Node(splitValue=best_value,
                attributeIndex=best_feature_index)
    node.left, l_depth = decision_tree_learning(left_data, depth + 1)
    node.right, r_depth = decision_tree_learning(right_data, depth + 1)

    return (node, max(l_depth, r_depth))



# # result = dict(zip(clean_dataset, counts))
# print(len(nique))
# print(counts)
    

In [235]:
# NOTE: get_prediction is defined in a later cell of this notebook; that cell
# must be executed before this one, otherwise the call below raises NameError.
# Train a tree on the full noisy dataset, starting at depth 0.
node, depth = decision_tree_learning(noisy_dataset, 0)

print(vars(node))
print(depth)

# Compare the tree's prediction for the first clean sample with its true label
print(get_prediction(node, clean_dataset[0]))
print(clean_dataset[0][-1])

{'left': <__main__.Node object at 0x108432d90>, 'right': <__main__.Node object at 0x1085f15b0>, 'splitValue': -54.5, 'attributeIndex': 0, 'isLeaf': False, 'prediction': None}
18
2.0
1.0


In [227]:
# function to shuffle the rows and split them into train/test folds
def kfold_datasets_generator(dataset, n_folds=10, random_state=None):
    """Shuffle ``dataset`` and build the train/test pairs for k-fold evaluation.

    Parameters
    ----------
    dataset : array-like
        2-D array of samples (one row per sample). It is copied, so the
        caller's array is never modified.
    n_folds : int, optional
        Number of folds to create (default 10). The row count does not have
        to be a multiple of ``n_folds``; fold sizes then differ by at most
        one row.
    random_state : int or None, optional
        Seed for a dedicated random generator, giving a reproducible
        shuffle. With ``None`` (default) the global NumPy random state is
        used, as before.

    Returns
    -------
    list of dict
        One dict per fold with the keys ``'testing'`` (the held-out fold) and
        ``'training'`` (all remaining folds concatenated row-wise).

    Raises
    ------
    ValueError
        If ``n_folds`` is smaller than 2 or larger than the number of rows.
    """
    working_data = np.array(dataset, copy=True)
    print(working_data.shape)

    if not 2 <= n_folds <= len(working_data):
        raise ValueError(
            f"n_folds must be between 2 and the number of rows "
            f"({len(working_data)}), got {n_folds}"
        )

    # shuffling the rows (in place on the private copy)
    if random_state is None:
        np.random.shuffle(working_data)
    else:
        np.random.default_rng(random_state).shuffle(working_data)

    # array_split tolerates row counts that are not a multiple of n_folds
    split_datasets = np.array_split(working_data, n_folds)
    print(len(split_datasets))
    print(split_datasets[0].shape)

    folds = []
    for i, testing_set in enumerate(split_datasets):
        # The ith set is the testing set, the rest form the training set
        training_set = np.concatenate(split_datasets[:i] + split_datasets[i + 1:], axis=0)
        folds.append({'testing': testing_set, 'training': training_set})

    return folds

In [228]:
def get_prediction(node: Node, input):
    """Return the label predicted by the tree rooted at ``node`` for one sample.

    Starting at ``node``, the sample is routed to the left child when its
    value at the node's ``attributeIndex`` is ``<=`` the node's
    ``splitValue`` and to the right child otherwise, until a leaf is reached.
    The leaf's ``prediction`` is returned.
    """
    ## TODO: Add a loop to loop through every row in the input set
    current = node
    while not current.isLeaf:
        goes_left = input[current.attributeIndex] <= current.splitValue
        current = current.left if goes_left else current.right
    return current.prediction
    

In [233]:
def kfold_evaluator(datasets, n_classes=4):
    """Train and evaluate a decision tree on every fold and print the metrics.

    For each fold a tree is trained on the ``'training'`` array and used to
    predict every row of the ``'testing'`` array. The confusion matrix,
    per-class precision/recall and the fold accuracy are printed, followed by
    the mean accuracy over all folds.

    Parameters
    ----------
    datasets : iterable of dict
        Folds with the keys ``'training'`` and ``'testing'``, each a 2-D
        array whose last column is the class label.
    n_classes : int, optional
        Number of classes (default 4). Labels are used as 1-based indices
        into the confusion matrix, so they are expected to be the integers
        ``1 .. n_classes``.

    Returns
    -------
    None
        All results are printed.
    """
    all_fold_accuracies = []

    for fold in datasets:
        node, depth = decision_tree_learning(fold['training'], 0)
        print(vars(node))
        print(depth)

        # Rows index the predicted class, columns index the actual class.
        confusion_matrix = np.zeros((n_classes, n_classes), dtype=np.int64)

        for sample in fold['testing']:
            predicted = int(get_prediction(node, sample))
            actual = int(sample[-1])
            # Labels start at 1, matrix indices at 0.
            confusion_matrix[predicted - 1, actual - 1] += 1
        print(confusion_matrix)

        print("-" * 30)

        # --- Integrated Statistics Calculation ---

        print("--- Per-Class Statistics ---")

        precision_scores = []
        recall_scores = []

        for k in range(n_classes):
            class_name = f"Class {k+1}"

            # True Positives: Correctly predicted for this class
            true_positives = confusion_matrix[k, k]

            # False Positives: Predicted as this class, but was actually another
            # Sum of row 'k', minus the TP
            false_positives = np.sum(confusion_matrix[k, :]) - true_positives

            # False Negatives: Actually this class, but predicted as another
            # Sum of column 'k', minus the TP
            false_negatives = np.sum(confusion_matrix[:, k]) - true_positives

            # Precision = TP / (TP + FP); defined as 0 when the class was never predicted
            precision_denominator = true_positives + false_positives
            if precision_denominator == 0:
                precision = 0.0
            else:
                precision = true_positives / precision_denominator
            precision_scores.append(precision)

            # Recall = TP / (TP + FN); defined as 0 when the class never occurs
            recall_denominator = true_positives + false_negatives
            if recall_denominator == 0:
                recall = 0.0
            else:
                recall = true_positives / recall_denominator
            recall_scores.append(recall)

            print(f"{class_name}:")
            print(f"  True Positives:  {true_positives}")
            print(f"  False Positives: {false_positives}")
            print(f"  False Negatives: {false_negatives}")
            print(f"  Precision:       {precision:.4f}")
            print(f"  Recall:          {recall:.4f}")

        print("-" * 30)
        print("--- Overall Statistics (for this fold) ---")

        # Overall Accuracy = (Sum of all correct) / (Total samples)
        total_correct = np.trace(confusion_matrix)  # Sum of the diagonal
        total_samples = np.sum(confusion_matrix)

        if total_samples == 0:
            overall_accuracy = 0.0
        else:
            overall_accuracy = total_correct / total_samples

        all_fold_accuracies.append(overall_accuracy)

        print(f"Overall Accuracy: {overall_accuracy:.4f}")
        print(f"Average Precision: {np.mean(precision_scores):.4f}")
        print(f"Average Recall: {np.mean(recall_scores):.4f}")
        print("=" * 30 + "\n")

    print("--- Final K-Fold Summary ---")
    if len(all_fold_accuracies) > 0:
        print(f"Average Overall Accuracy across all folds: {np.mean(all_fold_accuracies):.4f}")
    else:
        print("No datasets were processed.")

In [236]:
# Seed NumPy's global generator so the fold shuffle, and therefore every
# metric printed below, is reproducible from one run to the next.
np.random.seed(42)
datasets = kfold_datasets_generator(noisy_dataset)
kfold_evaluator(datasets)

(2000, 8)
10
(200, 8)
{'left': <__main__.Node object at 0x1084fa850>, 'right': <__main__.Node object at 0x108cf3130>, 'splitValue': -54.5, 'attributeIndex': 0, 'isLeaf': False, 'prediction': None}
21
[[57  5  2  2]
 [ 0 40  5  1]
 [ 3  5 36  2]
 [ 4  4  3 31]]
------------------------------
--- Per-Class Statistics ---
Class 1:
  True Positives:  57
  False Positives: 9
  False Negatives: 7
  Precision:       0.8636
  Recall:          0.8906
Class 2:
  True Positives:  40
  False Positives: 6
  False Negatives: 14
  Precision:       0.8696
  Recall:          0.7407
Class 3:
  True Positives:  36
  False Positives: 10
  False Negatives: 10
  Precision:       0.7826
  Recall:          0.7826
Class 4:
  True Positives:  31
  False Positives: 11
  False Negatives: 5
  Precision:       0.7381
  Recall:          0.8611
------------------------------
--- Overall Statistics (for this fold) ---
Overall Accuracy: 0.8200
Average Precision: 0.8135
Average Recall: 0.8188

{'left': <__main__.Node ob

In [None]:
# Scratch check: np.random.shuffle reorders the rows of a 2-D array in place
# while leaving the contents of each row intact.
arr = np.arange(9).reshape((3, 3))
print(arr)
np.random.shuffle(arr)
arr

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[3, 4, 5],
       [6, 7, 8],
       [0, 1, 2]])

In [None]:
# Scratch check of 2-D slicing and broadcasting on a small matrix
cf = np.array([[1, 2, 3, 4], [3, 2, 3, 5], [4, 2, 3, 4], [5, 2, 3, 4]])
print(cf)
print("==")
# cf[1:][0]
# Indexing the column with the list [0] keeps the result 2-D: cf[:1, [0]] is
# the first row's column 0 and cf[2:, [0]] is column 0 of the rows from index
# 2 onward; np.add broadcasts the single value over those rows.
print(np.add(cf[:1, [0]], cf[2:, [0]]))
print("==")
print(cf[2:, [0]])
print("==")
print(cf[:1, [0]])
print("==")
np.add(cf[:1, [0]], cf[2:, [0]])

[[1 2 3 4]
 [3 2 3 5]
 [4 2 3 4]
 [5 2 3 4]]
==
[[5]
 [6]]
==
[[4]
 [5]]
==
[[1]]
==


array([[5],
       [6]])