# Simple Decision Tree Regressor from Scratch

In this notebook, we will implement a simple decision tree regressor from scratch. 
This will give us a deeper understanding of the inner workings of decision trees.


## Import Necessary Libraries

We will use NumPy for numerical calculations, Matplotlib for plotting, and scikit-learn's `make_regression` to generate a synthetic dataset.


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_regression


## Generate Synthetic Dataset

We will create a synthetic dataset using the `make_regression` function. 
This dataset will be used to train and visualize the predictions of our decision tree.


In [None]:
# Draw 200 noisy samples of a single feature; the fixed seed keeps runs reproducible.
X, y = make_regression(n_samples=200, n_features=1, noise=10, random_state=42)

# Plot the raw data as star markers (no connecting line).
plt.plot(X, y, '*')
plt.xlabel('Feature')
plt.ylabel('Target')
# Display the figure explicitly, as the other plotting cells do, instead of
# relying on the notebook rendering it implicitly at the end of the cell.
plt.show()

## Implement the Decision Tree Regressor

Next, we'll define our decision tree regressor. The tree will be built recursively by choosing the best split based on variance reduction.


In [None]:
# Define a simple decision tree regressor class.
class SimpleDecisionTreeRegressor:
    """Regression tree grown greedily by minimising the children's squared error.

    The fitted tree is stored in ``self.tree`` as nested dictionaries. An
    internal node is ``{'feature', 'threshold', 'left', 'right'}``: samples
    whose value for ``feature`` is strictly below ``threshold`` go left, all
    others go right. A leaf is a plain number, the mean target of the training
    samples that reached it.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Maximum number of splits on any root-to-leaf path.
                ``None`` (the default) lets the tree grow until no split
                lowers the squared error any further.
        """
        self.max_depth = max_depth  # Maximum depth of the tree
        self.tree = None  # Set by fit(); None means "not fitted yet"

    def fit(self, X, y):
        """Build the tree from training data.

        Args:
            X: Feature matrix of shape (n_samples, n_features); any array-like
                accepted by ``np.asarray``.
            y: Target values, one per row of ``X``.

        Returns:
            This estimator, so that calls can be chained.

        Raises:
            ValueError: If ``X`` is not two-dimensional, if ``X`` and ``y``
                have different lengths, or if there are no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-dimensional (n_samples, n_features), got ndim={X.ndim}"
            )
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same length, got {len(X)} and {len(y)}"
            )
        if len(y) == 0:
            raise ValueError("Cannot fit a tree on an empty dataset")

        # Build the tree starting from depth 0.
        self.tree = self._build_tree(X, y, 0)
        return self

    def _build_tree(self, X, y, depth):
        """Recursively grow the subtree for the samples ``(X, y)``.

        Returns either a leaf (the mean of ``y``) or a node dictionary with
        the keys ``'feature'``, ``'threshold'``, ``'left'`` and ``'right'``.
        """
        # Stop when the depth budget is used up or every target is identical.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if depth_exhausted or np.unique(y).size <= 1:
            return np.mean(y)

        best_feature, best_threshold, _, best_splits = self._best_split(X, y)
        # No candidate split lowered the squared error: keep this node a leaf.
        if best_feature is None:
            return np.mean(y)

        (left_X, left_y), (right_X, right_y) = best_splits
        return {
            'feature': best_feature,
            'threshold': best_threshold,
            'left': self._build_tree(left_X, left_y, depth + 1),
            'right': self._build_tree(right_X, right_y, depth + 1),
        }

    def _best_split(self, X, y):
        """Search every feature and threshold for the lowest-error split.

        A split's score is the sum over both children of ``variance * size``,
        i.e. the total squared deviation from each child's own mean. Lower is
        better, and a split is accepted only if it beats the unsplit node.

        Returns:
            ``(feature, threshold, score, splits)``. ``feature``, ``threshold``
            and ``splits`` are ``None`` when no split improves on the unsplit
            node, in which case ``score`` is the unsplit node's own score.
        """
        # Baseline to beat: total squared deviation of the unsplit node.
        best_score = np.var(y) * len(y)
        best_feature, best_threshold, best_splits = None, None, None

        for feature in range(X.shape[1]):
            # np.unique returns sorted values. The smallest one is skipped
            # because "value < smallest" sends nothing left, and the variance
            # of an empty child is NaN (with a RuntimeWarning), so that
            # candidate could never be selected anyway.
            for threshold in np.unique(X[:, feature])[1:]:
                splits = self._split(X, y, feature, threshold)
                score = sum(np.var(part_y) * len(part_y) for _, part_y in splits)

                if score < best_score:
                    best_score = score
                    best_feature = feature
                    best_threshold = threshold
                    best_splits = splits

        return best_feature, best_threshold, best_score, best_splits

    def _split(self, X, y, feature, threshold):
        """Partition ``(X, y)`` on ``X[:, feature] < threshold``.

        Returns:
            ``((X_left, y_left), (X_right, y_right))`` where the left part
            holds the rows strictly below the threshold and the right part
            holds all remaining rows.
        """
        left_mask = X[:, feature] < threshold
        right_mask = ~left_mask
        return (X[left_mask], y[left_mask]), (X[right_mask], y[right_mask])

    def predict(self, X):
        """Predict the target for every row of ``X``.

        Args:
            X: Feature matrix of shape (n_samples, n_features).

        Returns:
            A NumPy array with one prediction per row of ``X``.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError(
                "This SimpleDecisionTreeRegressor is not fitted yet; call fit() first."
            )
        return np.array(
            [self._predict_one(record, self.tree) for record in np.asarray(X)]
        )

    def _predict_one(self, x, tree):
        """Walk ``tree`` for the single sample ``x`` and return the leaf value."""
        node = tree
        # Internal nodes are dictionaries; anything else is a leaf value.
        while isinstance(node, dict):
            goes_left = x[node['feature']] < node['threshold']
            node = node['left'] if goes_left else node['right']
        return node



## Train the Model and Make Predictions

With our decision tree defined, we can now train it on our dataset and make predictions.


In [None]:
# Create a tree limited to depth 3 and fit it on the full synthetic dataset.
tree = SimpleDecisionTreeRegressor(max_depth=3)
tree.fit(X, y)

# Predict on the same samples the tree was trained on, so these are in-sample
# fits rather than a measure of how well the tree generalises.
predictions = tree.predict(X)


## Plotting the Results

Finally, we will plot the actual data points against the predictions made by our decision tree to visually assess its performance.


In [None]:
# Overlay the observed targets (blue) and the tree's predictions (red)
# against the single feature.
for series, colour, legend_name in (
    (y, 'blue', 'Actual'),
    (predictions, 'red', 'Predicted'),
):
    plt.scatter(X, series, color=colour, label=legend_name)

# Title, axis labels and legend.
plt.title('Decision Tree Regression')
plt.xlabel('Feature')
plt.ylabel('Target')
plt.legend()

# Render the figure.
plt.show()


### Comparison with Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

# Hold out 20% of the samples so both models are scored on data they did not see.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Our from-scratch tree: fit, predict on the held-out set, and score.
simple_tree = SimpleDecisionTreeRegressor(max_depth=3)
simple_tree.fit(X_train, y_train)
simple_tree_predictions = simple_tree.predict(X_test)
simple_tree_mse = mean_squared_error(y_test, simple_tree_predictions)

# scikit-learn's random forest with the same depth limit: fit, predict, and score.
rf_regressor = RandomForestRegressor(max_depth=3, n_estimators=10, random_state=42)
rf_regressor.fit(X_train, y_train)
rf_predictions = rf_regressor.predict(X_test)
rf_mse = mean_squared_error(y_test, rf_predictions)

# Report the test-set mean squared error of each model.
print(f"Simple Decision Tree MSE: {simple_tree_mse}")
print(f"Random Forest MSE: {rf_mse}")

# Side-by-side panels of actual vs predicted values, one panel per model.
plt.figure(figsize=(14, 7))
panels = (
    ('Simple Decision Tree Regression', simple_tree_predictions, 'red'),
    ('Random Forest Regression', rf_predictions, 'green'),
)
for position, (heading, predicted, colour) in enumerate(panels, start=1):
    plt.subplot(1, 2, position)
    plt.scatter(X_test, y_test, color='blue', label='Actual')
    plt.scatter(X_test, predicted, color=colour, label='Predicted')
    plt.title(heading)
    plt.xlabel('Feature')
    plt.ylabel('Target')
    plt.legend()

plt.tight_layout()
plt.show()