In [1]:
import pandas as pd
import numpy as np
import tree
from sklearn.tree import DecisionTreeClassifier as DTC
from sklearn.preprocessing import LabelEncoder

In [2]:
class DecisionTreeClassifier:
    """Binary-split decision tree classifier that minimises Gini impurity.

    Class labels must be the integers ``0 .. n_classes - 1``: they are used
    directly as indices into per-class count lists.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum depth of the tree. ``None`` means no depth limit: nodes are
        split until no candidate split lowers the weighted Gini impurity.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """Build the tree from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix.
        y : array-like of shape (n_samples,)
            Integer class labels in ``0 .. n_classes - 1``.
        """
        # Accept DataFrames / Series / nested lists; the split search below
        # relies on NumPy-style ``X[:, idx]`` and boolean-mask indexing.
        X = np.asarray(X)
        y = np.asarray(y)
        self.n_classes_ = len(set(y))
        self.n_features_ = X.shape[1]
        self.tree_ = self._grow_tree(X, y)

    def predict(self, X):
        """Return a list with the predicted class for each row of ``X``."""
        return [self._predict(inputs) for inputs in np.asarray(X)]

    def debug(self, feature_names, class_names, show_details=True):
        """Delegate to the root node's ``debug`` method (provided by ``tree.Node``)."""
        self.tree_.debug(feature_names, class_names, show_details)

    def _gini(self, y):
        """Gini impurity of the labels ``y``: ``1 - sum_k p_k ** 2``."""
        m = y.size
        return 1.0 - sum((np.sum(y == c) / m) ** 2 for c in range(self.n_classes_))

    def _best_split(self, X, y):
        """Find the split that gives the lowest weighted Gini impurity.

        Returns
        -------
        (best_idx, best_thr)
            Feature index and threshold of the best split, or ``(None, None)``
            when the node has fewer than two samples or no split is strictly
            better than leaving the node unsplit.
        """
        # Need at least two elements to split a node.
        m = y.size
        if m <= 1:
            return None, None

        # Count of each class in the current node.
        num_parent = [np.sum(y == c) for c in range(self.n_classes_)]

        # Gini of current node; a split must beat this to be accepted.
        best_gini = 1.0 - sum((n / m) ** 2 for n in num_parent)
        best_idx, best_thr = None, None

        # Loop through all features.
        for idx in range(self.n_features_):
            # Sort data along selected feature.
            thresholds, classes = zip(*sorted(zip(X[:, idx], y)))

            # Sweep the split position left to right, moving one sample at a
            # time from the right child to the left child.
            num_left = [0] * self.n_classes_
            num_right = num_parent.copy()
            for i in range(1, m):  # possible split positions
                c = classes[i - 1]
                num_left[c] += 1
                num_right[c] -= 1

                # Equal neighbouring values cannot be separated by a
                # threshold; the counts above are still updated.
                if thresholds[i] == thresholds[i - 1]:
                    continue

                gini_left = 1.0 - sum(
                    (num_left[x] / i) ** 2 for x in range(self.n_classes_)
                )
                gini_right = 1.0 - sum(
                    (num_right[x] / (m - i)) ** 2 for x in range(self.n_classes_)
                )

                # Impurity of the split: children weighted by their size.
                gini = (i * gini_left + (m - i) * gini_right) / m

                if gini < best_gini:
                    best_gini = gini
                    best_idx = idx
                    best_thr = (thresholds[i] + thresholds[i - 1]) / 2  # midpoint

        return best_idx, best_thr

    def _grow_tree(self, X, y, depth=0):
        """Build a decision tree by recursively finding the best split."""
        # Population for each class in current node. The predicted class is the one with
        # largest population.
        num_samples_per_class = [np.sum(y == i) for i in range(self.n_classes_)]
        predicted_class = np.argmax(num_samples_per_class)
        node = tree.Node(
            gini=self._gini(y),
            num_samples=y.size,
            num_samples_per_class=num_samples_per_class,
            predicted_class=predicted_class,
        )

        # Split recursively until maximum depth is reached; max_depth=None
        # means "no limit" (comparing an int with None would raise TypeError).
        if self.max_depth is None or depth < self.max_depth:
            idx, thr = self._best_split(X, y)
            if idx is not None:
                indices_left = X[:, idx] < thr
                # A midpoint between two adjacent floats can round onto one of
                # them; refuse a split that would leave one child empty, since
                # it makes no progress and would recurse without end.
                if indices_left.any() and not indices_left.all():
                    X_left, y_left = X[indices_left], y[indices_left]
                    X_right, y_right = X[~indices_left], y[~indices_left]
                    node.feature_index = idx
                    node.threshold = thr
                    node.left = self._grow_tree(X_left, y_left, depth + 1)
                    node.right = self._grow_tree(X_right, y_right, depth + 1)
        return node

    def _predict(self, inputs):
        """Predict class for a single sample."""
        node = self.tree_
        # Internal nodes have a left child; descend until a leaf is reached.
        while node.left:
            if inputs[node.feature_index] < node.threshold:
                node = node.left
            else:
                node = node.right
        return node.predicted_class

In [3]:
# Load the training table; "ml2.csv" is resolved relative to the kernel's
# current working directory.
df = pd.read_csv("ml2.csv")
print(df)

      AGE  INCOME  Gender Maritial Buys
0     <21    High    Male   Single   No
1     <21    High    Male  Married   No
2   21-35    High    Male   Single  Yes
3     >35  Medium    Male   Single  Yes
4     >35     Low  Female   Single  Yes
5     >35     Low  Female  Married   No
6   21-35     Low  Female  Married  Yes
7     <21  Medium    Male   Single   No
8     <21     Low  Female  Married  Yes
9     >35  Medium  Female   Single  Yes
10    <21  Medium  Female  Married  Yes
11  21-35  Medium    Male  Married  Yes
12  21-35    High  Female   Single  Yes
13    >35  Medium    Male  Married   No


In [4]:
# Label-encode every column so each distinct value becomes an integer code
# (the encoded frame is printed in the next cell).
# NOTE: this rebinds `df`, so the string-valued frame is no longer available
# afterwards, and the fitted encoder is not kept, so codes cannot be mapped
# back with inverse_transform.
#
# Equivalent per-column form, kept for reference:
# for col in df.columns:
#     df[col] = LabelEncoder().fit_transform(df[col])

df = df.apply(LabelEncoder().fit_transform)

In [5]:
# Show the label-encoded frame to check the integer codes assigned per column.
print(df)

    AGE  INCOME  Gender  Maritial  Buys
0     1       0       1         1     0
1     1       0       1         0     0
2     0       0       1         1     1
3     2       2       1         1     1
4     2       1       0         1     1
5     2       1       0         0     0
6     0       1       0         0     1
7     1       2       1         1     0
8     1       1       0         0     1
9     2       2       0         1     1
10    1       2       0         0     1
11    0       2       1         0     1
12    0       0       0         1     1
13    2       2       1         0     0


In [6]:
# Features: every column except the last one. Target: the 'Buys' column.
X = df.iloc[ : ,  : -1]
y = df['Buys']

In [7]:
# Convert the feature frame to a NumPy array: the hand-written classifier
# slices columns with X[:, idx].
X = np.asarray(X)
print(X)

[[1 0 1 1]
 [1 0 1 0]
 [0 0 1 1]
 [2 2 1 1]
 [2 1 0 1]
 [2 1 0 0]
 [0 1 0 0]
 [1 2 1 1]
 [1 1 0 0]
 [2 2 0 1]
 [1 2 0 0]
 [0 2 1 0]
 [0 0 0 1]
 [2 2 1 0]]


In [8]:
# Show the encoded target labels (still a pandas Series).
print(y)

0     0
1     0
2     1
3     1
4     1
5     0
6     1
7     0
8     1
9     1
10    1
11    1
12    1
13    0
Name: Buys, dtype: int64


In [9]:
# Train the hand-written decision tree, limited to two levels of splits.
clf = DecisionTreeClassifier(max_depth = 2)
clf.fit(X, y)

In [10]:
# Predict for one hand-encoded sample, columns in order AGE, INCOME, Gender,
# Maritial. Per the frames printed above: AGE 2 = '>35', INCOME 0 = 'High',
# Gender 1 = 'Male', Maritial 0 = 'Married'; for Buys, 0 = 'No' and 1 = 'Yes'.
clf.predict([[2, 0, 1, 0]])

[0]

In [11]:
# Reference model: scikit-learn's decision tree fitted on the same data and
# queried with the same encoded sample as the hand-written tree.
# random_state is pinned because scikit-learn permutes the features at each
# split, so equally good splits could otherwise be chosen differently from
# run to run.
# Note: no max_depth is passed here, so unlike the max_depth=2 tree above
# this model is not depth-limited.
c = DTC(random_state=0)
c = c.fit(X, y)
c.predict([[2, 0, 1, 0]])

array([0])