<h1><center> Supervised learning : decision tree </center></h1>
<h2><center> GL </center></h2> 
<h3><center> ASMAE KARMOUCHI </center></h3>
<h3><center> MOHAMMED AMINE KENDI</center></h3>
<h5><center> Academic Year: 2022 - 2023</center></h5>

In [1]:
import numpy as np

class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index  # Index de la caractéristique utilisée pour la division
        self.threshold = threshold  # Seuil utilisé pour la division
        self.left = left  # Sous-arbre gauche (valeurs inférieures ou égales au seuil)
        self.right = right  # Sous-arbre droit (valeurs supérieures au seuil)
        self.value = value  # Valeur de la feuille (uniquement pour les feuilles)

In [2]:
class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        self.num_classes = len(set(y))
        self.num_features = X.shape[1]
        self.tree = self._grow_tree(X, y)

    def _grow_tree(self, X, y, depth=0):
        num_samples_per_class = [np.sum(y == i) for i in range(self.num_classes)]
        predicted_class = np.argmax(num_samples_per_class)
        node = Node(value=predicted_class)

        # Conditions d'arrêt
        if depth < self.max_depth:
            best_feature, best_threshold = self._best_criteria(X, y)
            if best_feature is not None:
                left_indices, right_indices = self._split(X[:, best_feature], best_threshold)
                left = self._grow_tree(X[left_indices, :], y[left_indices], depth + 1)
                right = self._grow_tree(X[right_indices, :], y[right_indices], depth + 1)
                node = Node(feature_index=best_feature, threshold=best_threshold, left=left, right=right)
        return node

    def _best_criteria(self, X, y):
        best_gini = 1
        best_feature = None
        best_threshold = None

        for feature_index in range(self.num_features):
            thresholds = np.unique(X[:, feature_index])
            for threshold in thresholds:
                left_indices, right_indices = self._split(X[:, feature_index], threshold)
                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue
                gini = self._gini_impurity(y[left_indices], y[right_indices])
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature_index
                    best_threshold = threshold

        return best_feature, best_threshold

    def _split(self, feature, threshold):
        left_indices = np.where(feature <= threshold)[0]
        right_indices = np.where(feature > threshold)[0]
        return left_indices, right_indices

    def _gini_impurity(self, left_labels, right_labels):
        total_samples = len(left_labels) + len(right_labels)
        p_left = len(left_labels) / total_samples
        p_right = len(right_labels) / total_samples
        gini = 1.0 - p_left**2 - p_right**2
        return gini

    def _predict_sample(self, x, tree):
        if tree.value is not None:
            return tree.value
        feature_value = x[tree.feature_index]
        if feature_value <= tree.threshold:
            return self._predict_sample(x, tree.left)
        else:
            return self._predict_sample(x, tree.right)

    def predict(self, X):
        y_pred = np.array([self._predict_sample(x, self.tree) for x in X])
        return y_pred



![](destr.png)

## Exemple d'utilisation
#### Breast Cancer Wisconsin

In [3]:

if __name__ == "__main__":
    from sklearn.datasets import load_breast_cancer
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import accuracy_score

    data = load_breast_cancer()
    X = data.data
    y = data.target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    tree = DecisionTree(max_depth=5)
    tree.fit(X_train, y_train)
    y_pred = tree.predict(X_test)

    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)


Accuracy: 0.6228070175438597
