# **Base Python Decision Tree Implementation (Working)**

In [None]:
import pandas as pd
import numpy as np
import DecisionTreeFromScratch as dt_scratch

Run the algorithm on dummy data.

In [None]:
# Pin the global NumPy RNG state so the synthetic sample is identical on every run
np.random.seed(42)

# Size of the synthetic training set
n_samples = 100

# Two Gaussian features: N(mean=0, sd=1) and N(mean=5, sd=2).
# The draw order (feature_1, feature_2, then labels) fixes the values produced
# under the seed, so these statements must stay in this sequence.
feature_1 = np.random.normal(0, 1, n_samples)
feature_2 = np.random.normal(5, 2, n_samples)

# Binary labels: one Bernoulli(p=0.5) draw per sample, generated separately from the features
labels = np.random.binomial(1, 0.5, n_samples)

# Package the features as a DataFrame and the labels as a named Series
df = pd.DataFrame(dict(Feature_1=feature_1, Feature_2=feature_2))
label_series = pd.Series(data=labels, name="Label")

# Instantiate the from-scratch decision tree on the dummy data and fit it
tree = dt_scratch.DecisionTree(
    df,
    label_series,
    depth=0,
    max_depth=3,
    min_data_split=2,
)
tree.Train()

Investigate the trained tree.

In [None]:
# Grab the records the trained tree stored in `predictions`; per the output
# below, each one holds a predicted label, depth, Gini impurity and 0/1 ratio.
yhat = tree.predictions
# Display the records together with how many there are
yhat, len(yhat)

([{'predicted_label': np.int64(1),
   'depth': 3,
   'gini': np.float64(0.4856804733727811),
   'zero_one_ratio': np.float64(0.7105263157894737)},
  {'predicted_label': np.int64(0),
   'depth': 3,
   'gini': np.float64(0.48),
   'zero_one_ratio': np.float64(1.5)},
  {'predicted_label': np.int64(0),
   'depth': 3,
   'gini': np.float64(0.0),
   'zero_one_ratio': np.float64(1.0)},
  {'predicted_label': np.int64(0),
   'depth': 3,
   'gini': np.float64(0.0),
   'zero_one_ratio': np.float64(2.0)},
  {'predicted_label': np.int64(0),
   'depth': 2,
   'gini': np.float64(0.5),
   'zero_one_ratio': np.float64(1.0)},
  {'predicted_label': np.int64(1),
   'depth': 3,
   'gini': np.float64(0.0),
   'zero_one_ratio': np.float64(0.0)},
  {'predicted_label': np.int64(1),
   'depth': 3,
   'gini': np.float64(0.0),
   'zero_one_ratio': np.float64(0.0)}],
 7)

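For ```depth = 3``` we have 7 leaf nodes. In general, a decision tree of depth $n$ has at most $2^n$ leaf nodes. Having fewer leaves than this maximum means that at least one branch stopped splitting before reaching the maximum depth (here one leaf sits at depth 2), for example because the node was already pure or another stopping criterion was met. The maximum total number of nodes in a tree of depth $n$ scales according to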

$$
T(n) = 2^{n+1} - 1
$$

To calculate the growth rate of the tree size (the total number of nodes), we compute its rate of change, i.e. differentiate $T(n)$ with respect to $n$.

\begin{align}
\frac{dT}{dn} &= \frac{d}{dn}(2^{n+1} - 1) \\
&= 2^{n+1}\ln(2)
\end{align}

The $2^{n+1}$ term dominates $T(n)$, and the derivative is itself proportional to $2^{n+1}$, so the number of nodes in a decision tree grows exponentially with its depth.


## Prediction on Unseen Data



Now let's "predict" our label on unseen data using our ```DecisionTree``` class.

In [None]:
# Create new data to classify.
# NOTE(review): Feature_2 is drawn from N(1, 1.1) here, whereas the training
# cell drew it from N(5, 2) — confirm this distribution shift is intentional.
n_out_sample = 200
new_data = pd.DataFrame({
    'Feature_1': np.random.normal(loc=0, scale=1, size=n_out_sample),
    'Feature_2': np.random.normal(loc=1, scale=1.1, size=n_out_sample)
})

# Get predictions (presumably one metadata record per row of new_data — verify
# against DecisionTree.Predict)
predictions = tree.Predict(new_data)

# Print only a short preview: dumping every record (5 lines each) would bury
# the class-count summary under roughly a thousand lines of output.
n_preview = 5
print("Predictions with metadata:")
for idx, pred in enumerate(predictions):
    if idx >= n_preview:
        break
    print(f"Predicted label: {pred['predicted_label']}")
    print(f"Node depth: {pred['depth']}")
    print(f"Gini impurity: {pred['gini']}")
    print(f"Ratio of 0s to 1s: {pred['zero_one_ratio']}")
    print("---")

# If you just want the class labels
class_labels = [pred['predicted_label'] for pred in predictions]

# Tell the reader how many records the preview left out
n_hidden = len(class_labels) - n_preview
if n_hidden > 0:
    print(f"... ({n_hidden} more predictions not shown)")

# Tally the predicted classes. `.tolist()` converts NumPy scalars to plain
# Python values so the dict prints as {0: ..., 1: ...} instead of
# {np.int64(0): np.int64(...), ...}.
unique_labels, counts = np.unique(class_labels, return_counts=True)
counts_dict = dict(zip(unique_labels.tolist(), counts.tolist()))
print("Counts of each class label:", counts_dict)

Predictions with metadata:
Predicted label: 1
Node depth: 3
Gini impurity: 0.4856804733727811
Ratio of 0s to 1s: 0.7105263157894737
---
Predicted label: 1
Node depth: 3
Gini impurity: 0.4856804733727811
Ratio of 0s to 1s: 0.7105263157894737
---
Predicted label: 1
Node depth: 3
Gini impurity: 0.4856804733727811
Ratio of 0s to 1s: 0.7105263157894737
---
Predicted label: 1
Node depth: 3
Gini impurity: 0.4856804733727811
Ratio of 0s to 1s: 0.7105263157894737
---
Predicted label: 0
Node depth: 2
Gini impurity: 0.5
Ratio of 0s to 1s: 1.0
---
Predicted label: 0
Node depth: 3
Gini impurity: 0.0
Ratio of 0s to 1s: 2.0
---
Predicted label: 1
Node depth: 3
Gini impurity: 0.4856804733727811
Ratio of 0s to 1s: 0.7105263157894737
---
Predicted label: 1
Node depth: 3
Gini impurity: 0.4856804733727811
Ratio of 0s to 1s: 0.7105263157894737
---
Predicted label: 0
Node depth: 2
Gini impurity: 0.5
Ratio of 0s to 1s: 1.0
---
Predicted label: 1
Node depth: 3
Gini impurity: 0.4856804733727811
Ratio of 0s to 