In [1]:
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Read the dataset from a CSV file located next to the notebook.
path = 'iris.csv'
iris = pd.read_csv(path)

# Features: every column except the last one. Labels: the 'target' column.
X, y = iris.iloc[:, :-1], iris['target']

# Show the feature frame and the label series as the cell output.
X, y

(     sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
 0                  5.1               3.5                1.4               0.2
 1                  4.9               3.0                1.4               0.2
 2                  4.7               3.2                1.3               0.2
 3                  4.6               3.1                1.5               0.2
 4                  5.0               3.6                1.4               0.2
 ..                 ...               ...                ...               ...
 145                6.7               3.0                5.2               2.3
 146                6.3               2.5                5.0               1.9
 147                6.5               3.0                5.2               2.0
 148                6.2               3.4                5.4               2.3
 149                5.9               3.0                5.1               1.8
 
 [150 rows x 4 columns],
 0      0
 1      0
 2   

In [20]:
class MyDecisionTree:
    """Small decision-tree classifier for numeric features held in a pandas DataFrame.

    Every internal node tests ``feature < threshold``: rows for which the test
    is true go to the left child, rows with ``feature >= threshold`` go to the
    right child. Candidate thresholds are the midpoints between consecutive
    distinct values of each feature, and the split with the lowest weighted
    Gini impurity is kept.

    Parameters
    ----------
    criterion : str, default 'random'
        Stored on the instance for interface compatibility. It is not
        consulted anywhere in this class: splits always minimise Gini impurity.
    max_depth : int, default 10
        A node created at this depth is always a leaf (the root has depth 0).
    min_sample_split : int, default 2
        A node holding fewer samples than this is always a leaf.
    """

    def __init__(self, criterion='random', max_depth=10, min_sample_split=2) -> None:
        self.criterion = criterion
        self.max_depth = max_depth
        self.min_sample_split = min_sample_split

    def fit(self, X, y):
        """Build the tree from the training data and store it in ``self.tree``.

        Parameters
        ----------
        X : pandas.DataFrame
            Numeric feature columns, addressed by position.
        y : pandas.Series
            Class labels. Boolean row masks computed on ``X`` are applied to
            ``y`` directly, so ``y`` must carry the same index as ``X``.
        """
        self.tree = self._build_tree(X, y)

    def _build_tree(self, X, y, depth=0):
        """Recursively grow a subtree and return its root node.

        Returns a ``LeafNode`` when the depth limit is reached, when too few
        samples remain, when all labels are identical, or when no split
        separates the samples; otherwise returns a ``SplitNode``.
        """
        if depth == self.max_depth or len(y) < self.min_sample_split:
            return LeafNode(self._get_leaf_value(y))

        # A pure node cannot be improved by splitting. Stopping here keeps the
        # tree small; any deeper leaves would all carry this same label anyway.
        if y.nunique() == 1:
            return LeafNode(self._get_leaf_value(y))

        feature_index, threshold = self._choose_feature(X, y)

        if feature_index is None or threshold is None:
            return LeafNode(self._get_leaf_value(y))

        column = X.iloc[:, feature_index]
        left_indices = column < threshold
        right_indices = column >= threshold

        if left_indices.sum() == 0 or right_indices.sum() == 0:
            return LeafNode(self._get_leaf_value(y))

        left_tree = self._build_tree(X[left_indices], y[left_indices], depth + 1)
        right_tree = self._build_tree(X[right_indices], y[right_indices], depth + 1)

        return SplitNode(feature_index, threshold, left_tree, right_tree)

    def _choose_feature(self, X, y):
        """Return ``(feature_index, threshold)`` of the lowest-Gini split.

        Returns ``(None, None)`` when no candidate threshold yields a finite
        score, e.g. when every feature is constant. On ties the first
        candidate encountered wins.
        """
        best_feature = None
        best_threshold = None
        best_gini = float('inf')

        for feature_index in range(X.shape[1]):
            feature_values = X.iloc[:, feature_index].sort_values().unique()
            # Midpoints between consecutive distinct values.
            thresholds = (feature_values[:-1] + feature_values[1:]) / 2
            for threshold in thresholds:
                gini = self._calculate_gini(X, y, feature_index, threshold)
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature_index
                    best_threshold = threshold
        return best_feature, best_threshold

    def _calculate_gini(self, X, y, feature_index, threshold):
        """Return the weighted Gini impurity of splitting at ``threshold``.

        Returns ``float('inf')`` when one side of the split would be empty.
        """
        column = X.iloc[:, feature_index]
        left_indices = column < threshold
        right_indices = column >= threshold

        n = len(y)
        n_left = left_indices.sum()
        n_right = right_indices.sum()

        # A split that leaves one side empty separates nothing: score it as
        # infinitely bad so it is never selected.
        if n_left == 0 or n_right == 0:
            return float('inf')

        classes = np.unique(y)  # computed once and shared by both sides
        left_gini = self._gini_impurity(y[left_indices], classes, n_left)
        right_gini = self._gini_impurity(y[right_indices], classes, n_right)

        return (n_left / n) * left_gini + (n_right / n) * right_gini

    @staticmethod
    def _gini_impurity(labels, classes, size):
        """Return ``1 - sum(p_c ** 2)`` where ``p_c`` is the share of class ``c`` among ``size`` labels."""
        return 1 - sum((np.sum(labels == c) / size) ** 2 for c in classes)

    def _get_leaf_value(self, y):
        """Return the most frequent label in ``y``, or ``None`` when ``y`` is empty.

        When several labels are tied, the first entry of ``y.mode()`` is used.
        """
        if y.empty:
            return None
        return y.mode().iloc[0]

    def predict(self, X):
        """Predict a label for every row of ``X``.

        Parameters
        ----------
        X : pandas.DataFrame
            Feature columns in the same positional order used for ``fit``.

        Returns
        -------
        numpy.ndarray
            One predicted label per row; an empty 1-D array for an empty frame.
        """
        if len(X) == 0:
            return np.array([])
        return np.array(X.apply(self._predict_one, axis=1))

    def _predict_one(self, x):
        """Walk the tree for a single row (a pandas Series) and return the leaf value.

        Raises
        ------
        ValueError
            If the feature value inspected at a split is not numeric.
        """
        node = self.tree
        while isinstance(node, SplitNode):
            feature_value = x.iloc[node.feature_index]
            # Rows handed over by DataFrame.apply hold NumPy scalars; NumPy
            # integer scalars are not instances of the built-in int, so
            # np.number has to be accepted explicitly.
            if not isinstance(feature_value, (int, float, np.number)):
                raise ValueError(f"Feature value {feature_value} at index {node.feature_index} is not numeric")
            # Same rule as training: strictly below the threshold goes left,
            # everything else (including equality) goes right.
            if feature_value < node.threshold:
                preferred, fallback = node.left, node.right
            else:
                preferred, fallback = node.right, node.left
            # Hand-built trees may lack a child; fall back to the other one.
            node = preferred if preferred is not None else fallback
        return node.value


class SplitNode:
    """Internal tree node holding a ``feature < threshold`` test and two children.

    Attributes
    ----------
    feature_index : positional index of the feature column being tested.
    threshold : value the feature is compared against.
    left : subtree for rows with ``feature < threshold``.
    right : subtree for rows with ``feature >= threshold``.
    """

    def __init__(self, feature_index, threshold, left, right) -> None:
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right


class LeafNode:
    """Terminal tree node; ``value`` is the label returned for rows that reach it."""

    def __init__(self, value) -> None:
        self.value = value

In [4]:
# Keep 20% of the samples aside for evaluation; the fixed random_state makes
# the split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Train the hand-written tree on the training split.
obj = MyDecisionTree(max_depth=15)
obj.fit(X=X_train, y=y_train)

# Score it on the held-out split: the first line printed is the accuracy,
# the second is the raw array of predicted labels.
my_prediction = obj.predict(X=X_test)
my_accuracy = accuracy_score(y_true=y_test, y_pred=my_prediction)
print("My prediction:", my_accuracy)
print(my_prediction)

My prediction: 1.0
[1 0 2 1 1 0 1 2 1 1 2 0 0 0 0 1 2 1 1 2 0 2 0 2 2 2 2 2 0 0]


In [10]:
# Reference model: scikit-learn's decision tree trained on the same split.
# NOTE(review): max_depth is 10 here while the custom tree is built with
# max_depth=15 - confirm whether the two limits are meant to differ.
clf = DecisionTreeClassifier(
    max_depth=10,
    random_state=42,
)
clf.fit(X=X_train, y=y_train)

# Accuracy of the reference model on the held-out split.
sk_predict = clf.predict(X=X_test)
sk_accuracy = accuracy_score(y_true=y_test, y_pred=sk_predict)
print("\n Sk-learn prediction:", sk_accuracy)


 Sk-learn prediction: 1.0


In [7]:
# Side-by-side table of the true labels and both models' predictions.
# The frame takes its index from y_test, so rows are labelled with the
# original sample indices.
print("\nComparaison des prédictions:")
comparison = pd.DataFrame(
    data={
        'Vraie valeur': y_test,
        'Ma prédiction': my_prediction,
        'Prédiction scikit-learn': sk_predict,
    }
)
print(comparison.iloc[:10])  # first ten rows only

# Absolute gap between the two accuracy scores, shown with four decimals.
print(f"\nDifférence de précision: {abs(my_accuracy - sk_accuracy):.4f}")


Comparaison des prédictions:
     Vraie valeur  Ma prédiction  Prédiction scikit-learn
73              1              1                        1
18              0              0                        0
118             2              2                        2
78              1              1                        1
76              1              1                        1
31              0              0                        0
64              1              1                        1
141             2              2                        2
68              1              1                        1
82              1              1                        1

Différence de précision: 0.0000
