# In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seed NumPy's global generator so each run draws the same synthetic data
np.random.seed(42)

# Ground-truth line the samples are scattered around: y = 4 + 3x
true_intercept = 4
true_slope = 3

# Size of the synthetic training set
num_samples = 100

# Features: one column of uniform draws, stretched from [0, 1) to [0, 2)
x_train = np.random.rand(num_samples, 1) * 2

# Targets: the ground-truth line plus standard-normal noise.
# The noise is drawn after the features so the random stream is consumed
# in the same order on every run.
noise = np.random.randn(num_samples, 1)
y_train = (true_slope * x_train + true_intercept) + noise

# Scatter plot of the generated samples
plt.figure(figsize=(8, 5))
plt.scatter(x_train, y_train, label="Training data", color="blue", alpha=0.7)
plt.xlabel("x_train")
plt.ylabel("y_train")
plt.title("Dummy Linear Regression Data")
plt.grid(True)
plt.legend()
plt.show()

# Report the array shapes of the features and the targets
print("Features shape is:", x_train.shape)
print("Target shape is:", y_train.shape)

# -----------------------------
# Node Class Definition
# -----------------------------
class Node:
    """One node of the regression tree, holding the samples routed to it.

    A freshly built node is an unsplit non-leaf: ``threshold``, ``left`` and
    ``right`` are ``None`` and ``is_leaf`` is ``False`` until the code that
    grows the tree fills them in.
    """

    def __init__(self, x, y, depth=0):
        # Samples that reached this node and its level in the tree
        self.x, self.y = x, y
        self.depth = depth

        # Split information, unset at construction time
        self.threshold = None
        self.left = self.right = None
        self.is_leaf = False

        # Value predicted at this node: the mean of its targets
        self.prediction = np.mean(y)

# -----------------------------
# Find Best Threshold Function
# -----------------------------
def find_best_threshold(x, y):
    """Return the threshold on ``x`` giving the largest variance reduction in ``y``.

    Candidate thresholds are the midpoints between neighbouring feature
    values taken in ascending order. For each candidate the samples are split
    into ``x <= t`` and ``x > t``; the gain is the variance of all targets
    minus the size-weighted variance of the two sides.

    Parameters
    ----------
    x : array-like
        Two-dimensional; only column 0 is used as the feature.
    y : array-like
        Targets, one entry along the first axis per row of ``x``.

    Returns
    -------
    float or None
        The best midpoint, or ``None`` when there are fewer than two samples
        or no candidate leaves both sides non-empty (e.g. all feature values
        are equal). On ties the smallest such threshold wins.
    """
    x = np.asarray(x)
    y = np.asarray(y)

    n = x.shape[0]
    if n < 2:
        # Nothing to split (this also avoids dividing by n == 0 below)
        return None

    feature = x[:, 0]

    # Sort here rather than relying on the caller: midpoints of unsorted
    # neighbours would skip some of the valid split points.
    ordered = np.sort(feature)
    candidates = (ordered[:-1] + ordered[1:]) / 2

    # Impurity before splitting is the variance of the targets, not of x
    total_variance = np.var(y)

    best_gain = -np.inf
    best_t = None
    for t in candidates:
        left_idx = feature <= t
        right_idx = feature > t
        n_left = np.sum(left_idx)
        n_right = np.sum(right_idx)

        # Repeated feature values can put every sample on one side
        if n_left == 0 or n_right == 0:
            continue

        left_var = np.var(y[left_idx])
        right_var = np.var(y[right_idx])
        gain = total_variance - (n_left * left_var + n_right * right_var) / n

        # Strict comparison keeps the first (smallest) threshold on ties
        if gain > best_gain:
            best_gain = gain
            best_t = t

    return best_t

# -----------------------------
# Build Tree up to Depth 4
# -----------------------------
def build_tree_depth4(x, y, max_depth=4):
    """Grow a regression tree breadth-first and return its root ``Node``.

    Each node is split at the threshold returned by ``find_best_threshold``:
    samples with ``x[:, 0] <= threshold`` go left, the rest go right. A node
    becomes a leaf when it sits at ``max_depth``, holds at most one sample,
    or no threshold is found for it.

    Parameters
    ----------
    x : numpy.ndarray
        Two-dimensional features; column 0 is the split feature.
    y : numpy.ndarray
        Targets, indexed along the first axis in step with ``x``.
    max_depth : int, optional
        Depth at which nodes are no longer split (the root has depth 0).
        Defaults to 4, the limit this function has always used.

    Returns
    -------
    Node
        Root of the built tree.
    """
    # Function-scope import: a deque pops from the left in O(1),
    # whereas list.pop(0) shifts every remaining element.
    from collections import deque

    root = Node(x, y, depth=0)
    queue = deque([root])

    while queue:
        current = queue.popleft()

        # Stopping rules: depth limit reached or nothing left to separate
        if current.depth >= max_depth or len(current.x) <= 1:
            current.is_leaf = True
            continue

        # Hand the samples over in ascending x order so candidate thresholds
        # fall between neighbouring feature values
        sorted_idx = np.argsort(current.x[:, 0])
        x_sorted = current.x[sorted_idx]
        y_sorted = current.y[sorted_idx]
        t = find_best_threshold(x_sorted, y_sorted)

        # No usable split for this node
        if t is None:
            current.is_leaf = True
            continue

        current.threshold = t
        left_idx = current.x[:, 0] <= t
        right_idx = current.x[:, 0] > t

        current.left = Node(
            current.x[left_idx], current.y[left_idx], depth=current.depth + 1
        )
        current.right = Node(
            current.x[right_idx], current.y[right_idx], depth=current.depth + 1
        )

        queue.append(current.left)
        queue.append(current.right)

    return root

# -----------------------------
# Print Tree Structure
# -----------------------------
def print_tree_iterative(root):
    """Print the tree as an indented outline, one node per line.

    Nodes are visited in pre-order (node, then its left subtree, then its
    right subtree) using an explicit stack, so every child is listed directly
    beneath its parent and the indentation reflects the real structure. A
    level-order walk would instead group all nodes of one depth together,
    leaving it unclear which parent each child belongs to.

    Parameters
    ----------
    root : Node or None
        Root of the tree. Each node must provide ``is_leaf``; leaves also
        ``prediction`` and ``x``; internal nodes ``threshold``, ``left`` and
        ``right``. ``None`` prints nothing.
    """
    if root is None:
        return

    # Each entry: (node, label shown for it, depth used for indentation)
    stack = [(root, "Root", 0)]
    while stack:
        node, label, depth = stack.pop()
        indent = "    " * depth

        if node.is_leaf:
            print(f"{indent}{label} → Predict: {node.prediction:.3f}, Samples: {len(node.x)}")
            continue

        print(f"{indent}{label} → x <= {node.threshold:.3f}")
        # Push right before left so the left subtree is popped (printed) first
        stack.append((node.right, "Right", depth + 1))
        stack.append((node.left, "Left", depth + 1))

# -----------------------------
# Build and Show Tree
# -----------------------------
# Grow the tree on the training data and print its structure
tree_root = build_tree_depth4(x_train, y_train)
print("DECISION TREE STRUCTURE (up to depth 4):")
print_tree_iterative(tree_root)

# -----------------------------
# One-Split Tree (Best Threshold)
# -----------------------------
# Candidate thresholds: midpoints between neighbouring sorted feature values
x_sorted = np.sort(x_train, axis=0)
thresholds = (x_sorted[1:] + x_sorted[:-1]) / 2

# Variance of all targets before any split
total_variance = np.var(y_train)

# Best split found so far; a gain of -inf lets the first valid candidate win
best_threshold = best_left_value = best_right_value = None
best_gain = -np.inf

for t in thresholds:
    # Each t is a one-element row, hence the [0]
    left_idx = x_train[:, 0] <= t[0]
    right_idx = x_train[:, 0] > t[0]
    y_left, y_right = y_train[left_idx], y_train[right_idx]

    # A candidate that leaves one side empty does not split anything
    if min(len(y_left), len(y_right)) == 0:
        continue

    # Size-weighted variance of the two sides
    left_var, right_var = np.var(y_left), np.var(y_right)
    weighted_var = (len(y_left) * left_var + len(y_right) * right_var) / len(y_train)

    # Variance reduction achieved by this candidate
    info_gain = total_variance - weighted_var

    if info_gain > best_gain:
        best_gain, best_threshold = info_gain, t[0]
        best_left_value = np.mean(y_left)
        best_right_value = np.mean(y_right)

# Predict with the one-split tree: each sample gets the mean of its side
falls_left = x_train[:, 0] <= best_threshold
y_test_pred = np.where(falls_left, best_left_value, best_right_value)

# -----------------------------
# Print Decision Stump Tree
# -----------------------------
def print_decision_tree(threshold, x_train, y_train):
    """Print a one-split tree: the root test and a summary line per branch.

    Samples with ``x_train[:, 0] <= threshold`` form the left branch, the
    others the right branch. Each branch line shows the mean target (its
    prediction), the number of samples and their feature values rounded to
    two decimals.
    """
    feature = x_train[:, 0]

    def summarise(mask):
        # Rows selected by the mask, their mean target, and rounded features
        members = x_train[mask]
        return members, np.mean(y_train[mask]), np.round(members[:, 0], 2)

    X_left, left_pred, left_data = summarise(feature <= threshold)
    X_right, right_pred, right_data = summarise(feature > threshold)

    print(f"[Root] if x <= {threshold:.3f}")
    print(f"├── Left  → Predict: {left_pred:.3f}, Samples: {len(X_left)}, Data: {left_data}")
    print(f"└── Right → Predict: {right_pred:.3f}, Samples: {len(X_right)}, Data: {right_data}")

print("\nDECISION STUMP TREE:")
print_decision_tree(best_threshold, x_train, y_train)

# -----------------------------
# Plot Prediction (1-Split)
# -----------------------------
# plt.plot joins points in the order they are given. x_train is not sorted,
# so plotting it directly makes the dashed line zigzag between the two
# prediction levels. Draw the predictions in ascending-x order instead.
plot_order = np.argsort(x_train[:, 0])

plt.figure(figsize=(8, 5))
plt.scatter(x_train, y_train, color='blue', label='Training data')
plt.plot(
    x_train[plot_order, 0],
    y_test_pred[plot_order],
    color='red',
    label='Prediction (1-split)',
    linestyle='--',
)
plt.title('Simple 1-Split Decision Tree Prediction')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.grid(True)
plt.show()
