To implement a decision tree from scratch in Python using pandas and NumPy, we start by defining a small `DecisionNode` helper class and a `DecisionTree` class with its associated methods. The implementation supports both classification and regression; the mode is selected by the `task` parameter passed when creating a `DecisionTree` instance.

In [1]:
import numpy as np
import pandas as pd

class DecisionNode:
    """One node of a binary decision tree.

    The node is a plain record: it stores whatever it is given and performs
    no validation. ``DecisionTree._build_tree`` creates leaves by passing only
    ``value`` and creates internal nodes by passing ``feature``,
    ``threshold``, ``left`` and ``right``.
    """

    def __init__(self, feature=None, threshold=None, value=None, left=None, right=None):
        """Record the split rule, the stored value and the two children.

        Args:
            feature: Column index tested at this node, or ``None``.
            threshold: Cut-off compared against the feature, or ``None``.
            value: Value stored on the node, or ``None``.
            left: Child node for the left branch, or ``None``.
            right: Child node for the right branch, or ``None``.
        """
        # Split description and stored value.
        self.feature, self.threshold, self.value = feature, threshold, value
        # Links to the two subtrees.
        self.left, self.right = left, right

class DecisionTree:
    """Binary decision tree for classification or regression.

    Every internal node tests ``X[:, feature] <= threshold``. When ``task`` is
    ``'classification'`` nodes are scored with Gini impurity and leaves store
    the most frequent label; any other ``task`` value is treated as
    regression, scored with variance, and leaves store the mean target.
    """

    def __init__(self, task='classification', min_samples_split=2, max_depth=None):
        """Store the hyper-parameters; the tree itself is built by ``fit``.

        Args:
            task: ``'classification'`` or (any other value) regression.
            min_samples_split: Nodes with fewer samples than this become
                leaves.
            max_depth: Depth at which growth stops (the root is depth 0), or
                ``None`` for no limit.
        """
        self.task = task
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth
        self.root = None

    @staticmethod
    def _as_arrays(X, y):
        """Convert the training inputs to NumPy arrays.

        Accepts anything ``np.asarray`` understands (lists, pandas
        DataFrame/Series, arrays). A single-column 2-D ``y`` is flattened to
        1-D; other shapes are left untouched.

        Raises:
            ValueError: If ``y`` contains no samples.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        if y.ndim == 2 and y.shape[1] == 1:
            y = y.ravel()
        if len(y) == 0:
            raise ValueError('Cannot fit a decision tree on an empty dataset.')
        return X, y

    def _split(self, X, y, feature, threshold):
        """Partition ``X`` and ``y`` on ``X[:, feature] <= threshold``.

        Returns:
            ``(left_X, left_y, right_X, right_y)`` where the left part holds
            the rows that satisfy the test.
        """
        mask = X[:, feature] <= threshold
        return X[mask], y[mask], X[~mask], y[~mask]

    def _calculate_impurity(self, y):
        """Return the Gini impurity (classification) or variance (regression).

        An empty ``y`` has impurity ``0.0`` so that an empty partition never
        contributes NaN to a weighted sum.
        """
        if len(y) == 0:
            return 0.0
        if self.task == 'classification':
            _, counts = np.unique(y, return_counts=True)
            probabilities = counts / counts.sum()
            return 1 - np.sum(probabilities ** 2)
        return np.var(y)  # regression

    def _calculate_information_gain(self, y, left_y, right_y):
        """Return the impurity decrease achieved by splitting ``y`` in two.

        The children's impurities are weighted by their share of the parent's
        samples. Returns ``0.0`` when ``y`` is empty.
        """
        total = len(y)
        if total == 0:
            return 0.0
        parent_impurity = self._calculate_impurity(y)
        left_impurity = self._calculate_impurity(left_y)
        right_impurity = self._calculate_impurity(right_y)
        return (parent_impurity
                - len(left_y) / total * left_impurity
                - len(right_y) / total * right_impurity)

    def _find_best_split(self, X, y):
        """Search every feature and observed value for the best split.

        Returns:
            ``(feature, threshold)`` of the first candidate with the highest
            gain, or ``(None, None)`` when no candidate separates the samples
            into two non-empty groups.
        """
        best_gain = -np.inf
        best_feature = None
        best_threshold = None

        for feature in range(X.shape[1]):
            column = X[:, feature]
            for threshold in np.unique(column):
                # Only the targets are needed to score a candidate, so
                # partition y directly instead of copying X through _split.
                goes_left = column <= threshold
                left_y, right_y = y[goes_left], y[~goes_left]
                if len(left_y) == 0 or len(right_y) == 0:
                    continue
                gain = self._calculate_information_gain(y, left_y, right_y)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold

    def _is_pure(self, y):
        """Return True when every target in ``y`` is identical."""
        return len(y) <= 1 or bool(np.all(y == y[0]))

    def _leaf_value(self, y):
        """Return the prediction stored on a leaf holding targets ``y``.

        Classification returns the most frequent label (the smallest one on a
        tie). ``np.unique`` is used rather than ``np.bincount`` so negative,
        float and string labels work as well. Regression returns the mean.
        """
        if self.task == 'classification':
            labels, counts = np.unique(y, return_counts=True)
            return labels[np.argmax(counts)]
        return np.mean(y)  # regression

    def _build_tree(self, X, y, depth=0):
        """Recursively grow the subtree for the samples ``X``, ``y``.

        A leaf is produced when the node has fewer than
        ``min_samples_split`` samples, ``max_depth`` has been reached, all
        targets are identical, or no valid split exists.
        """
        too_small = len(y) < self.min_samples_split
        too_deep = self.max_depth is not None and depth >= self.max_depth
        # Splitting a pure node cannot change any prediction, so stop early.
        if too_small or too_deep or self._is_pure(y):
            return DecisionNode(value=self._leaf_value(y))

        feature, threshold = self._find_best_split(X, y)
        if feature is None:
            return DecisionNode(value=self._leaf_value(y))

        left_X, left_y, right_X, right_y = self._split(X, y, feature, threshold)
        left = self._build_tree(left_X, left_y, depth + 1)
        right = self._build_tree(right_X, right_y, depth + 1)
        return DecisionNode(feature=feature, threshold=threshold, left=left, right=right)

    def fit(self, X, y):
        """Build the tree from training data and store it in ``self.root``.

        Args:
            X: 2-D samples-by-features data (array, nested list or pandas
                DataFrame).
            y: Targets, one per sample (array, list or pandas Series).

        Raises:
            ValueError: If ``y`` contains no samples.
        """
        X, y = self._as_arrays(X, y)
        self.root = self._build_tree(X, y)
       
