In [237]:
import pandas as pd
from sklearn.datasets import load_breast_cancer, load_digits, load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.datasets import make_classification

import numpy as np

#### Dataset

In [238]:
# Load the Breast Cancer Wisconsin (Diagnostic) dataset
data = load_breast_cancer()
X = data.data
y = data.target

# First split the data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## 2a) Implement a program that automatically creates a set of if- then clauses from the training table of a binary dataset of your choice. Implement different strategies to minimize the number of if-then clauses. Document your strategies, the number of resulting conditional clauses, and the accuracy achieved.

I implement 5 different strategies. The first is to generate a rule for each feature and data point. This doesn't generalize well, but is akin to nearest neighbors. I also implemented algorithm 8 as described in the textbook, which was to sort the tables by sum of parameters and then deciding thresholds. The next two strategies I implement are decision trees. The first rule I use to split is maximum information gain. In essence, at each node, we find the feature that gives us the most information and split there. The second decision tree strategy is using the Gini impurity, which is very similar to information gain. Gini impurity is a measure of the probability of incorrectly classifying an element if it was labeled randomly according to the class distribution in the dataset. Finally, I tried pruning as a way to limit the number of if/else clauses. See below for the resulting number of clauses and accuracy achieved. In general, the decision trees performed similarly and pruning resulted in worse accuracy for fewer conditional cluses.

### Generate a rule for each feature
This is a sanity check. We generate a rule for each feature, then compare the test value to the closes value in the training dataset and take the most reported result. (think nearest neighbors)

In [239]:
# Load the Breast Cancer Wisconsin (Diagnostic) dataset
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

# Split the dataset into a training set and a test set
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Create a dictionary to store the rules
rules = {}

# For each feature in the training set
for feature in train_df.columns[:-1]:
    # For each unique value of the feature
    for value in train_df[feature].unique():
        # Create a rule that predicts the most common target value for this feature value
        rules[(feature, value)] = train_df[train_df[feature] == value]['target'].mode()[0]

# Apply the rules to the test set and calculate the accuracy
y_pred = []
for i in range(len(test_df)):
    predictions = [rules.get((feature, test_df.iloc[i][feature]), 0) for feature in test_df.columns[:-1]]
    y_pred.append(max(set(predictions), key=predictions.count))

y_true = test_df['target'].values
accuracy = accuracy_score(y_true, y_pred)

print(f"Number of rules: {len(rules)}, Accuracy: {accuracy}")

Number of rules: 12497, Accuracy: 0.37719298245614036


### Algorithm 8

In [240]:
def memorize(data, labels):
    table = [(sum(x), y) for x, y in zip(data, labels)]
    sorted_table = sorted(table, key=lambda x: x[0])

    thresholds = [sorted_table[0]]
    for row in sorted_table:
        if row[1] != thresholds[-1][0]:
            thresholds.append((row[0], row[1]))
    
    minthreshs = np.log2(len(thresholds) + 1)
    mec = (minthreshs * (len(data[0]) + 1)) + (minthreshs + 1)
    
    return thresholds, mec

def predict_alg8(data, thresholds):
    predictions = []
    for x in data:
        x_sum = np.sum(x)
        index = 0
        for threshold in thresholds:
            if x_sum <= threshold[0]:
                break
            index += 1
        predictions.append(thresholds[index][1])
    return predictions

def alg8(X_train, y_train, X_test, y_test):
    thresholds, _ = memorize(X_train, y_train)
    print("num thresholds = " + str(len(thresholds)))
    y_pred = predict_alg8(X_test, thresholds)
    accuracy = np.mean(y_pred == y_test)
    print("accuracy: " + str(accuracy))


alg8(X_train, y_train, X_test, y_test)

num thresholds = 513
accuracy: 0.8596491228070176


### Decision tree implementation with information gain
Split based on max information gain

In [241]:
# Function to count the nodes in a tree
def count_nodes(tree):
    if 'prediction' in tree:
        return 0
    return 1 + count_nodes(tree['left']) + count_nodes(tree['right'])

# Function to make a prediction for a single sample
def predict(sample, node):
    if 'prediction' in node:
        return node['prediction']
    if sample[node['feature']] <= node['value']:
        return predict(sample, node['left'])
    else:
        return predict(sample, node['right'])


In [242]:
def entropy(y):
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / counts.sum()
    return -np.sum(probabilities * np.log2(probabilities))

def information_gain(left, right, parent_entropy):
    p = len(left) / (len(left) + len(right))
    return parent_entropy - p * entropy(left) - (1 - p) * entropy(right)

def best_split(X, y):
    best_feature = None
    best_value = None
    best_info_gain = -np.inf
    parent_entropy = entropy(y)
    for feature in range(X.shape[1]):
        values = np.unique(X[:, feature])
        for value in values:
            left_mask = X[:, feature] <= value
            right_mask = X[:, feature] > value
            info_gain = information_gain(y[left_mask], y[right_mask], parent_entropy)
            if info_gain > best_info_gain:
                best_feature, best_value, best_info_gain = feature, value, info_gain
    return best_feature, best_value

def build_tree_info(X, y, max_depth=np.inf, depth=0):
    entropy_node = entropy(y)
    if entropy_node == 0 or depth == max_depth:
        return {'prediction': np.round(np.mean(y)), 'entropy': entropy_node, 'samples': len(y)}
    feature, value = best_split(X, y)
    left_mask = X[:, feature] <= value
    right_mask = X[:, feature] > value
    left = build_tree_info(X[left_mask], y[left_mask], max_depth, depth + 1)
    right = build_tree_info(X[right_mask], y[right_mask], max_depth, depth + 1)
    return {'feature': feature, 'value': value, 'entropy': entropy_node, 'samples': len(y), 'left': left, 'right': right}


### Decision Tree with Gini impurity
The Gini impurity of a set is the probability of misclassifying a randomly chosen element if it was randomly labeled according to the distribution of labels in the set.

In [243]:
# Function to calculate Gini impurity
def gini_impurity(y):
    _, counts = np.unique(y, return_counts=True)
    probabilities = counts / len(y)
    return 1 - np.sum(probabilities**2)

# Function to calculate the best split for a node
def best_split(X, y):
    best_gini = 1
    best_feature = -1
    best_value = -1
    for feature in range(X.shape[1]):
        for value in np.unique(X[:, feature]):
            left_mask = X[:, feature] <= value
            right_mask = X[:, feature] > value
            left_gini = gini_impurity(y[left_mask])
            right_gini = gini_impurity(y[right_mask])
            gini = (len(y[left_mask]) * left_gini + len(y[right_mask]) * right_gini) / len(y)
            if gini < best_gini:
                best_gini = gini
                best_feature = feature
                best_value = value
    return best_feature, best_value

# Recursive function to build the decision tree
def build_tree_gini(X, y, max_depth, depth=0):
    if gini_impurity(y) == 0 or depth == max_depth:
        return {'prediction': np.round(np.mean(y))}
    feature, value = best_split(X, y)
    left_mask = X[:, feature] <= value
    right_mask = X[:, feature] > value
    return {'feature': feature, 'value': value, 'left': build_tree_gini(X[left_mask], y[left_mask], max_depth, depth + 1), 'right': build_tree_gini(X[right_mask], y[right_mask], max_depth, depth + 1)}



#### Post process pruning

In [244]:
def test(node, X, y):
    predictions = [predict(x, node) for x in X]  # Make predictions for each feature vector in X
    return np.mean(predictions == y)  # Compare predictions to actual targets and calculate the mean accuracy

def prune(node, X_train, y_train, delta):
    # Base case: if the node is a leaf node, don't prune it
    if 'prediction' in node:
        return node

    # Calculate the current score of the node
    current_score = test(node, X_train, y_train)

    # Create a new leaf node that represents the majority class of the current node's instances
    leaf_node = {'prediction': 1 if np.mean(y_train) > 0.5 else 0}

    # Calculate the score of the new leaf node
    leaf_score = test(leaf_node, X_train, y_train)

    # If the leaf node's score is lower than the current node's score, prune the node
    if current_score - leaf_score < delta:
        return leaf_node

    # Otherwise, recursively prune the left and right subtrees
    left_mask = X_train[:, node['feature']] <= node['value']
    right_mask = X_train[:, node['feature']] > node['value']
    if left_mask.any():
        node['left'] = prune(node['left'], X_train[left_mask], y_train[left_mask], delta)
    if right_mask.any():
        node['right'] = prune(node['right'], X_train[right_mask], y_train[right_mask], delta)

    # Return the pruned node
    return node

#### Max depth test for info decision tree

In [245]:
for max_depth in range(5):
    # Build the decision tree
    tree = build_tree_info(X_train, y_train, max_depth=max_depth)

    # Use the decision tree to make predictions on the test set
    y_pred = [predict(sample, tree) for sample in X_test]

    # Calculate the accuracy
    accuracy = round(accuracy_score(y_test, y_pred),3)
    print(f"Max Depth: {max_depth}, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

Max Depth: 0, Accuracy: 0.702, Node (conditional clause) Count: 0
Max Depth: 1, Accuracy: 0.877, Node (conditional clause) Count: 1
Max Depth: 2, Accuracy: 0.93, Node (conditional clause) Count: 3
Max Depth: 3, Accuracy: 0.93, Node (conditional clause) Count: 7
Max Depth: 4, Accuracy: 0.912, Node (conditional clause) Count: 12


#### Max depth test for gini decision tree

In [246]:
for max_depth in range(5):
    # Build the decision tree
    tree = build_tree_gini(X_train, y_train, max_depth=max_depth)

    # Use the decision tree to make predictions on the test set
    y_pred = [predict(sample, tree) for sample in X_test]

    # Calculate the accuracy
    accuracy = round(accuracy_score(y_test, y_pred),3)
    print(f"Max Depth: {max_depth}, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

Max Depth: 0, Accuracy: 0.702, Node (conditional clause) Count: 0
Max Depth: 1, Accuracy: 0.877, Node (conditional clause) Count: 1
Max Depth: 2, Accuracy: 0.93, Node (conditional clause) Count: 3
Max Depth: 3, Accuracy: 0.93, Node (conditional clause) Count: 7
Max Depth: 4, Accuracy: 0.912, Node (conditional clause) Count: 12


#### Pruning

In [247]:
tree = build_tree_info(X_train, y_train, max_depth=5)
y_pred = [predict(sample, tree) for sample in X_test]

# Calculate the accuracy
accuracy = round(accuracy_score(y_test, y_pred),3)
print(f"Max Depth: 5, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

# Prune tree and test
for delta in np.arange(0,.2,.01):
    pruned_tree = prune(tree, X_train, y_train, delta=delta)
    y_pred = [predict(sample, pruned_tree) for sample in X_test]

    accuracy = round(accuracy_score(y_test, y_pred),3)
    print(f"Delta: {round(delta,2)}, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

Max Depth: 5, Accuracy: 0.912, Node (conditional clause) Count: 15
Delta: 0.0, Accuracy: 0.912, Node (conditional clause) Count: 15
Delta: 0.01, Accuracy: 0.93, Node (conditional clause) Count: 11
Delta: 0.02, Accuracy: 0.947, Node (conditional clause) Count: 8
Delta: 0.03, Accuracy: 0.947, Node (conditional clause) Count: 8
Delta: 0.04, Accuracy: 0.93, Node (conditional clause) Count: 5
Delta: 0.05, Accuracy: 0.93, Node (conditional clause) Count: 4
Delta: 0.06, Accuracy: 0.93, Node (conditional clause) Count: 4
Delta: 0.07, Accuracy: 0.93, Node (conditional clause) Count: 4
Delta: 0.08, Accuracy: 0.93, Node (conditional clause) Count: 4
Delta: 0.09, Accuracy: 0.93, Node (conditional clause) Count: 4
Delta: 0.1, Accuracy: 0.877, Node (conditional clause) Count: 1
Delta: 0.11, Accuracy: 0.877, Node (conditional clause) Count: 1
Delta: 0.12, Accuracy: 0.877, Node (conditional clause) Count: 1
Delta: 0.13, Accuracy: 0.877, Node (conditional clause) Count: 1
Delta: 0.14, Accuracy: 0.877, 

## Testing on different datasets

##### Digits dataset

In [248]:
digits = load_digits()
X = digits.data
y = digits.target

# Modify the target vector to have only two classes
y_binary = np.where(y < 5, 0, 1)

# First split the data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y_binary, test_size=0.1, random_state=42)

#### Algorithm 8

In [249]:
alg8(X_train, y_train, X_test, y_test)

num thresholds = 1618
accuracy: 0.4444444444444444


#### Max depth test for info decision tree

In [250]:
for max_depth in range(10):
    # Build the decision tree
    tree = build_tree_info(X_train, y_train, max_depth=max_depth)

    # Use the decision tree to make predictions on the test set
    y_pred = [predict(sample, tree) for sample in X_test]

    # Calculate the accuracy
    accuracy = round(accuracy_score(y_test, y_pred),3)
    print(f"Max Depth: {max_depth}, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

Max Depth: 0, Accuracy: 0.483, Node (conditional clause) Count: 0
Max Depth: 1, Accuracy: 0.683, Node (conditional clause) Count: 1
Max Depth: 2, Accuracy: 0.767, Node (conditional clause) Count: 3
Max Depth: 3, Accuracy: 0.811, Node (conditional clause) Count: 7
Max Depth: 4, Accuracy: 0.872, Node (conditional clause) Count: 14
Max Depth: 5, Accuracy: 0.889, Node (conditional clause) Count: 27
Max Depth: 6, Accuracy: 0.861, Node (conditional clause) Count: 44
Max Depth: 7, Accuracy: 0.889, Node (conditional clause) Count: 64
Max Depth: 8, Accuracy: 0.906, Node (conditional clause) Count: 79
Max Depth: 9, Accuracy: 0.906, Node (conditional clause) Count: 96


#### Max depth test for gini decision tree

In [251]:
for max_depth in range(10):
    # Build the decision tree
    tree = build_tree_gini(X_train, y_train, max_depth=max_depth)

    # Use the decision tree to make predictions on the test set
    y_pred = [predict(sample, tree) for sample in X_test]

    # Calculate the accuracy
    accuracy = round(accuracy_score(y_test, y_pred),3)
    print(f"Max Depth: {max_depth}, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

Max Depth: 0, Accuracy: 0.483, Node (conditional clause) Count: 0
Max Depth: 1, Accuracy: 0.683, Node (conditional clause) Count: 1
Max Depth: 2, Accuracy: 0.767, Node (conditional clause) Count: 3
Max Depth: 3, Accuracy: 0.811, Node (conditional clause) Count: 7
Max Depth: 4, Accuracy: 0.872, Node (conditional clause) Count: 14
Max Depth: 5, Accuracy: 0.889, Node (conditional clause) Count: 27
Max Depth: 6, Accuracy: 0.861, Node (conditional clause) Count: 44
Max Depth: 7, Accuracy: 0.889, Node (conditional clause) Count: 64
Max Depth: 8, Accuracy: 0.906, Node (conditional clause) Count: 79
Max Depth: 9, Accuracy: 0.906, Node (conditional clause) Count: 96


#### Pruning

In [252]:
tree = build_tree_info(X_train, y_train, max_depth=9)
y_pred = [predict(sample, tree) for sample in X_test]

# Calculate the accuracy
accuracy = round(accuracy_score(y_test, y_pred),3)
print(f"Max Depth: 9, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

# Prune tree and test
for delta in np.arange(0,2,.01):
    pruned_tree = prune(tree, X_train, y_train, delta=delta)
    y_pred = [predict(sample, pruned_tree) for sample in X_test]

    accuracy = round(accuracy_score(y_test, y_pred),3)
    print(f"Delta: {round(delta,2)}, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

Max Depth: 9, Accuracy: 0.906, Node (conditional clause) Count: 96
Delta: 0.0, Accuracy: 0.906, Node (conditional clause) Count: 96
Delta: 0.01, Accuracy: 0.906, Node (conditional clause) Count: 86
Delta: 0.02, Accuracy: 0.911, Node (conditional clause) Count: 72
Delta: 0.03, Accuracy: 0.9, Node (conditional clause) Count: 66
Delta: 0.04, Accuracy: 0.9, Node (conditional clause) Count: 57
Delta: 0.05, Accuracy: 0.9, Node (conditional clause) Count: 56
Delta: 0.06, Accuracy: 0.894, Node (conditional clause) Count: 49
Delta: 0.07, Accuracy: 0.894, Node (conditional clause) Count: 30
Delta: 0.08, Accuracy: 0.9, Node (conditional clause) Count: 27
Delta: 0.09, Accuracy: 0.9, Node (conditional clause) Count: 27
Delta: 0.1, Accuracy: 0.9, Node (conditional clause) Count: 26
Delta: 0.11, Accuracy: 0.872, Node (conditional clause) Count: 18
Delta: 0.12, Accuracy: 0.839, Node (conditional clause) Count: 12
Delta: 0.13, Accuracy: 0.778, Node (conditional clause) Count: 3
Delta: 0.14, Accuracy: 0

##### Iris dataset

In [253]:
# Load the Iris dataset
iris = load_iris()

# The features are stored in iris.data and the targets in iris.target
X = iris.data
y = iris.target

# Filter the data to only include the first two classes
mask = y < 2
X = X[mask]
y = y[mask]

# First split the data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

#### Algorithm 8

In [254]:
alg8(X_train, y_train, X_test, y_test)

num thresholds = 91
accuracy: 1.0


#### Max depth test for info decision tree

In [255]:
for max_depth in range(10):
    # Build the decision tree
    tree = build_tree_info(X_train, y_train, max_depth=max_depth)

    # Use the decision tree to make predictions on the test set
    y_pred = [predict(sample, tree) for sample in X_test]

    # Calculate the accuracy
    accuracy = round(accuracy_score(y_test, y_pred),3)
    print(f"Max Depth: {max_depth}, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

Max Depth: 0, Accuracy: 0.4, Node (conditional clause) Count: 0
Max Depth: 1, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 2, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 3, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 4, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 5, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 6, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 7, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 8, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 9, Accuracy: 1.0, Node (conditional clause) Count: 1


#### Max depth test for gini decision tree

In [256]:
for max_depth in range(10):
    # Build the decision tree
    tree = build_tree_gini(X_train, y_train, max_depth=max_depth)

    # Use the decision tree to make predictions on the test set
    y_pred = [predict(sample, tree) for sample in X_test]

    # Calculate the accuracy
    accuracy = round(accuracy_score(y_test, y_pred),3)
    print(f"Max Depth: {max_depth}, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

Max Depth: 0, Accuracy: 0.4, Node (conditional clause) Count: 0
Max Depth: 1, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 2, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 3, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 4, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 5, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 6, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 7, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 8, Accuracy: 1.0, Node (conditional clause) Count: 1
Max Depth: 9, Accuracy: 1.0, Node (conditional clause) Count: 1


#### Pruning

In [257]:
tree = build_tree_info(X_train, y_train, max_depth=2)
y_pred = [predict(sample, tree) for sample in X_test]

# Calculate the accuracy
accuracy = round(accuracy_score(y_test, y_pred),3)
print(f"Max Depth: 2, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

# Prune tree and test
for delta in np.arange(0,2,.01):
    pruned_tree = prune(tree, X_test, y_test, delta=delta)
    y_pred = [predict(sample, pruned_tree) for sample in X_test]

    accuracy = round(accuracy_score(y_test, y_pred),3)
    print(f"Delta: {round(delta,2)}, Accuracy: {accuracy}, Node (conditional clause) Count: {count_nodes(tree)}")

Max Depth: 2, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.0, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.01, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.02, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.03, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.04, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.05, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.06, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.07, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.08, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.09, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.1, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.11, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.12, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.13, Accuracy: 1.0, Node (conditional clause) Count: 1
Delta: 0.14, Accuracy: 1.0, Node (conditional clause) Co

## 2c Completely Random Artificial dataset

In [258]:
def test_trees(X_train, X_test, y_train, y_test, max_depth):
    # Build the decision tree
    tree_gini = build_tree_gini(X_train, y_train, max_depth=max_depth)

    # Use the decision tree to make predictions on the test set
    y_pred_gini = [predict(sample, tree_gini) for sample in X_test]

    # Calculate the accuracy
    accuracy_gini = round(accuracy_score(y_test, y_pred_gini),3)

    return count_nodes(tree_gini), accuracy_gini

In [259]:
X, y = make_classification(n_samples=1000, n_features=10, n_informative=10, n_redundant=0, n_classes=2, random_state=42)

# First split the data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print("alg8:")
alg8(X_train, y_train, X_test, y_test)

print("Gini:")
for max_depth in range(10):
    nodes_gini, accuracy_gini = test_trees(X_train, X_test, y_train, y_test, max_depth)
    print(f"Max Depth: {max_depth}, Node count: {nodes_gini}, Accuracy (Gini): {accuracy_gini}")

alg8:
num thresholds = 901
accuracy: 0.52
Gini:
Max Depth: 0, Node count: 0, Accuracy (Gini): 0.45
Max Depth: 1, Node count: 1, Accuracy (Gini): 0.69
Max Depth: 2, Node count: 3, Accuracy (Gini): 0.71
Max Depth: 3, Node count: 7, Accuracy (Gini): 0.76
Max Depth: 4, Node count: 15, Accuracy (Gini): 0.79
Max Depth: 5, Node count: 27, Accuracy (Gini): 0.82
Max Depth: 6, Node count: 41, Accuracy (Gini): 0.81
Max Depth: 7, Node count: 55, Accuracy (Gini): 0.83
Max Depth: 8, Node count: 64, Accuracy (Gini): 0.84
Max Depth: 9, Node count: 72, Accuracy (Gini): 0.87


### vary instances

In [260]:
X, y = make_classification(n_samples=100, n_features=10, n_informative=10, n_redundant=0, n_classes=2, random_state=42)

# First split the data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print("alg8:")
alg8(X_train, y_train, X_test, y_test)

print("Gini:")

for max_depth in range(10):
    nodes_gini, accuracy_gini = test_trees(X_train, X_test, y_train, y_test, max_depth)
    print(f"Max Depth: {max_depth}, Node count: {nodes_gini}, Accuracy (Gini): {accuracy_gini}")

alg8:
num thresholds = 91
accuracy: 0.5
Gini:
Max Depth: 0, Node count: 0, Accuracy (Gini): 0.5
Max Depth: 1, Node count: 1, Accuracy (Gini): 0.7
Max Depth: 2, Node count: 3, Accuracy (Gini): 0.8
Max Depth: 3, Node count: 6, Accuracy (Gini): 0.8
Max Depth: 4, Node count: 8, Accuracy (Gini): 0.8
Max Depth: 5, Node count: 10, Accuracy (Gini): 0.8
Max Depth: 6, Node count: 11, Accuracy (Gini): 0.8
Max Depth: 7, Node count: 12, Accuracy (Gini): 0.8
Max Depth: 8, Node count: 12, Accuracy (Gini): 0.8
Max Depth: 9, Node count: 12, Accuracy (Gini): 0.8


In [261]:
X, y = make_classification(n_samples=500, n_features=10, n_informative=10, n_redundant=0, n_classes=2, random_state=42)

# First split the data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

print("alg8:")
alg8(X_train, y_train, X_test, y_test)

print("Gini:")

for max_depth in range(10):
    nodes_gini, accuracy_gini = test_trees(X_train, X_test, y_train, y_test, max_depth)
    print(f"Max Depth: {max_depth}, Node count: {nodes_gini}, Accuracy (Gini): {accuracy_gini}")

alg8:
num thresholds = 451
accuracy: 0.58
Gini:
Max Depth: 0, Node count: 0, Accuracy (Gini): 0.4
Max Depth: 1, Node count: 1, Accuracy (Gini): 0.58
Max Depth: 2, Node count: 3, Accuracy (Gini): 0.66
Max Depth: 3, Node count: 7, Accuracy (Gini): 0.68
Max Depth: 4, Node count: 11, Accuracy (Gini): 0.7
Max Depth: 5, Node count: 16, Accuracy (Gini): 0.64
Max Depth: 6, Node count: 21, Accuracy (Gini): 0.78
Max Depth: 7, Node count: 30, Accuracy (Gini): 0.74
Max Depth: 8, Node count: 37, Accuracy (Gini): 0.78
Max Depth: 9, Node count: 42, Accuracy (Gini): 0.76


### Vary input columns

In [262]:
X, y = make_classification(n_samples=1000, n_features=2, n_informative=2, n_redundant=0, n_classes=2, random_state=42)

# First split the data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


print("alg8:")
alg8(X_train, y_train, X_test, y_test)

print("Gini:")

for max_depth in range(10):
    nodes_gini, accuracy_gini = test_trees(X_train, X_test, y_train, y_test, max_depth)
    print(f"Max Depth: {max_depth}, Node count: {nodes_gini}, Accuracy (Gini): {accuracy_gini}")

alg8:
num thresholds = 901
accuracy: 0.64
Gini:
Max Depth: 0, Node count: 0, Accuracy (Gini): 0.51
Max Depth: 1, Node count: 1, Accuracy (Gini): 0.91
Max Depth: 2, Node count: 3, Accuracy (Gini): 0.91
Max Depth: 3, Node count: 7, Accuracy (Gini): 0.95
Max Depth: 4, Node count: 13, Accuracy (Gini): 0.92
Max Depth: 5, Node count: 22, Accuracy (Gini): 0.92
Max Depth: 6, Node count: 32, Accuracy (Gini): 0.92
Max Depth: 7, Node count: 45, Accuracy (Gini): 0.92
Max Depth: 8, Node count: 57, Accuracy (Gini): 0.91
Max Depth: 9, Node count: 67, Accuracy (Gini): 0.91


In [263]:
X, y = make_classification(n_samples=1000, n_features=50, n_informative=50, n_redundant=0, n_classes=2, random_state=42)

# First split the data into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)


print("alg8:")
alg8(X_train, y_train, X_test, y_test)

print("Gini:")

for max_depth in range(10):
    nodes_gini, accuracy_gini = test_trees(X_train, X_test, y_train, y_test, max_depth)
    print(f"Max Depth: {max_depth}, Node count: {nodes_gini}, Accuracy (Gini): {accuracy_gini}")

alg8:
num thresholds = 901
accuracy: 0.48
Gini:
Max Depth: 0, Node count: 0, Accuracy (Gini): 0.44
Max Depth: 1, Node count: 1, Accuracy (Gini): 0.49
Max Depth: 2, Node count: 3, Accuracy (Gini): 0.57
Max Depth: 3, Node count: 7, Accuracy (Gini): 0.56
Max Depth: 4, Node count: 15, Accuracy (Gini): 0.61
Max Depth: 5, Node count: 28, Accuracy (Gini): 0.58
Max Depth: 6, Node count: 47, Accuracy (Gini): 0.57
Max Depth: 7, Node count: 66, Accuracy (Gini): 0.55
Max Depth: 8, Node count: 82, Accuracy (Gini): 0.56
Max Depth: 9, Node count: 91, Accuracy (Gini): 0.56
