In [40]:
# pandas is imported in this cell as well so that it runs on a fresh kernel:
# the notebook's main import cell appears further down.
import pandas as pd

PATH = "/data/notebook_files"
elections_data_path = f"{PATH}/data/elections.csv"

# Load the dataset, then separate the features (X) from the labels (y)
elections_data = pd.read_csv(elections_data_path)
X = elections_data.drop("Label", axis=1)
y = elections_data["Label"]

# Last expression of the cell: rich display of the loaded frame
elections_data

Unnamed: 0,Adresse,Majeur?,Nationalite,Label
0,Paris,oui,Francais,1
1,Paris,non,Francais,-1
2,Montpellier,oui,Italien,1
3,Paris,oui,Suisse,-1
4,Strasbourg,non,Italien,-1
5,Strasbourg,non,Francais,-1
6,Strasbourg,oui,Francais,1
7,Montpellier,oui,Suisse,-1


In [113]:
from collections import Counter
from copy import copy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

class SymbolicDecisionTree:
    """Binary decision tree with textual explanations of its decisions.

    Every internal node holds an equality test ``feature == value``: entries
    that satisfy it follow the true branch, all others follow the false
    branch. Splits are selected by maximising the reduction in Gini impurity.

    Besides plain prediction, the tree can report the conditions an entry
    went through (``predict_xai``) and the conditions an entry would have to
    satisfy to reach a leaf of another class (``counter_fact``).
    """

    def __init__(self):
        self.tree = None  # root Node, set by fit()
        self.data = None  # training frame (features + label as last column), set by fit()

    class Node:
        """A node of the tree: either a decision (feature/value) or a leaf (result)."""

        def __init__(
            self,
            feature=None,
            value=None,
            true_branch=None,
            false_branch=None,
            result=None,
            entropy=None,
        ):
            self.feature = feature  # column tested by this node (None on a leaf)
            self.value = value  # value the column is compared to with ==
            self.true_branch = true_branch  # child followed when the test holds
            self.false_branch = false_branch  # child followed when the test fails
            self.result = result  # predicted class on a leaf, None otherwise
            # Despite its name, this stores the Gini impurity computed by
            # build_tree for the data that reached this node.
            self.entropy = entropy

        def is_leaf(self):
            """Return True when the node has no children.

            Checking the children rather than ``result`` also recognises the
            empty node built from an empty training set, whose result is None.
            """
            return self.true_branch is None and self.false_branch is None

    def _fitted_root(self):
        """Return the root node, or raise a clear error if fit() was not called.

        Raises:
            AttributeError: when the tree has not been fitted. This is the
                exception type that using the missing root produced before,
                now with an explicit message.
        """
        if self.tree is None:
            raise AttributeError(
                "SymbolicDecisionTree is not fitted yet; call fit() first."
            )
        return self.tree

    def fit(self, X, y):
        """Build the tree from the features ``X`` and the labels ``y``.

        ``X`` and ``y`` are concatenated column-wise; the label therefore ends
        up as the last column, which is what build_tree relies on.
        """
        # Combine features and labels
        training_data = pd.concat([X, y], axis=1)
        self.data = training_data
        self.tree = self.build_tree(training_data)

    def predict(self, X):
        """Return the list of predicted classes, one per row of ``X``."""
        return [self.predict_single_entry(entry) for _, entry in X.iterrows()]

    def predict_xai(self, entry):
        """Return the conditions (as strings) that ``entry`` went through.

        Each element looks like ``"feature == value"`` or ``"feature != value"``,
        in root-to-leaf order.
        """
        explanation, _ = self.trace_tree(entry, self._fitted_root(), explanation=[])
        return explanation

    def counter_fact(self, entry):
        """Return the conditions ``entry`` must satisfy to get another class.

        The result lists, as strings, the conditions that ``entry`` currently
        does not satisfy on the root-to-leaf path that leads to a class other
        than the predicted one with the fewest unmet conditions. The list is
        empty when the tree has no leaf of another class.
        """
        explanation, _ = self.untrace_tree(entry, self._fitted_root(), explanation=[])
        return explanation

    def build_tree(self, data):
        """Recursively build the (sub)tree for ``data`` and return its root node.

        ``data`` holds the features followed by the label as its last column.
        An empty frame yields an empty node (no feature, no result).
        """
        if len(data) == 0:
            return self.Node()

        labels = data.iloc[:, -1]
        current_uncertainty = self.gini(labels)
        best_gain = 0
        best_criteria = None
        best_sets = None

        # Try every (feature, value) equality test; the last column is the label.
        for position, column in enumerate(data.columns[:-1]):
            for val in data.iloc[:, position].unique():
                true_data, false_data = self.partition(data, column, val)

                # A split that leaves one side empty separates nothing.
                if len(true_data) == 0 or len(false_data) == 0:
                    continue

                # Impurity reduction, weighted by the size of each side
                p = float(true_data.shape[0]) / data.shape[0]
                gain = (
                    current_uncertainty
                    - p * self.gini(true_data.iloc[:, -1])
                    - (1 - p) * self.gini(false_data.iloc[:, -1])
                )

                # Strict comparison: on ties the first candidate found is kept.
                if gain > best_gain:
                    best_gain = gain
                    best_criteria = (column, val)
                    best_sets = (true_data, false_data)

        if best_gain > 0:
            return self.Node(
                feature=best_criteria[0],
                value=best_criteria[1],
                true_branch=self.build_tree(best_sets[0]),
                false_branch=self.build_tree(best_sets[1]),
                entropy=current_uncertainty,
            )

        # No split improves purity: this is a leaf predicting the most frequent class.
        outcome = labels.value_counts().idxmax()
        return self.Node(result=outcome, entropy=current_uncertainty)

    def predict_single_entry(self, entry):
        """Walk the tree for one entry and return the class of the reached leaf.

        Returns None when the tree was fitted on empty data.
        """
        node = self._fitted_root()
        while not node.is_leaf():
            if entry[node.feature] == node.value:
                node = node.true_branch
            else:
                node = node.false_branch
        return node.result

    def trace_tree(self, entry, node, explanation):
        """Follow ``entry`` down from ``node``, recording every test outcome.

        The conditions met on the way are appended to ``explanation`` (which is
        modified in place). Returns ``(explanation, result)`` where ``result``
        is the class of the leaf that was reached.
        """
        while not node.is_leaf():
            if entry[node.feature] == node.value:
                explanation.append(f"{node.feature} == {node.value}")
                node = node.true_branch
            else:
                explanation.append(f"{node.feature} != {node.value}")
                node = node.false_branch
        return explanation, node.result

    def untrace_tree(self, entry, node, explanation):
        """Find the closest path below ``node`` ending on another class.

        Among all root-to-leaf paths whose leaf class differs from the class
        predicted for ``entry``, the one with the fewest conditions unmet by
        ``entry`` is selected (the first one found wins on ties, true branches
        being explored first). Its unmet conditions are appended to
        ``explanation`` (modified in place) as ``"feature == value"`` or
        ``"feature != value"`` strings.

        Returns ``(explanation, result)`` where ``result`` is the class of the
        selected leaf, or None when no leaf of another class exists.
        """
        _, predicted = self.trace_tree(entry, node, [])

        best_unmet = None
        best_result = None
        for conditions, result in self._leaf_paths(node):
            if result == predicted:
                continue
            # A condition is unmet when the equality outcome for the entry
            # disagrees with the operator required by the path.
            unmet = [
                (feature, operator, value)
                for feature, operator, value in conditions
                if bool(entry[feature] == value) != (operator == "==")
            ]
            if best_unmet is None or len(unmet) < len(best_unmet):
                best_unmet, best_result = unmet, result

        if best_unmet is not None:
            explanation.extend(
                f"{feature} {operator} {value}"
                for feature, operator, value in best_unmet
            )
        return explanation, best_result

    def _leaf_paths(self, node, prefix=()):
        """Yield ``(conditions, result)`` for every leaf below ``node``.

        ``conditions`` is the list of ``(feature, operator, value)`` tuples
        leading to the leaf, with operator ``"=="`` or ``"!="``.
        """
        if node.is_leaf():
            yield list(prefix), node.result
            return
        yield from self._leaf_paths(
            node.true_branch, prefix + ((node.feature, "==", node.value),)
        )
        yield from self._leaf_paths(
            node.false_branch, prefix + ((node.feature, "!=", node.value),)
        )

    @staticmethod
    def partition(data, feature, value):
        """Split ``data`` into the rows where ``feature == value`` and the others."""
        true_data = data[data[feature] == value]
        false_data = data[data[feature] != value]
        return true_data, false_data

    @staticmethod
    def gini(labels):
        """Return the Gini impurity of ``labels`` (1 for an empty collection)."""
        impurity = 1
        label_counts = Counter(labels)
        for label in label_counts:
            prob_of_label = label_counts[label] / float(len(labels))
            impurity -= prob_of_label**2
        return impurity

    def display_tree(self, node=None, indent="", branch=""):
        """
        Print the structure of the tree in a readable, indented layout.

        Output format:
        "├──" introduces the child reached when the condition holds.
        "└──" introduces the child reached when the condition does not hold.
        Leaves show the predicted class.
        Internal nodes show the decision criterion and the impurity stored on
        the node (labelled "Entropy" in the output, computed as Gini impurity).

        Example:
        feature_name1 == feature_value1? (Entropy = 0.1234)
            ├── Class: Class_A
            └── feature_name2 == feature_value2? (Entropy = 0.5678)
                ├── Class: Class_B
                └── Class: Class_C

        In this example:
        - If feature_name1 == feature_value1, the entry is classified as
        'Class_A'.
        - Otherwise the tree checks feature_name2 == feature_value2; when it
        holds the entry is classified as 'Class_B', else as 'Class_C'.
        """

        if node is None:
            node = self._fitted_root()

        # Base case: a leaf only prints its class
        if node.is_leaf():
            print(f"{indent}{branch}Class: {node.result}")
            return

        # Print the criterion for the current node
        print(
            f"{indent}{branch}{node.feature} == {node.value}? (Entropy = {node.entropy:.4f})"
        )

        # Children of a true branch keep a vertical bar so that the sibling
        # false branch printed later stays visually connected.
        new_indent = indent + ("│   " if branch == "├── " else "    ")

        # True branch uses '├──', false branch uses '└──'
        self.display_tree(node.true_branch, new_indent, "├── ")
        self.display_tree(node.false_branch, new_indent, "└── ")

In [72]:
# Hold out 30% of the rows for evaluation; the seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create the symbolic decision tree and fit it on the training split
tree_xai = SymbolicDecisionTree()
tree_xai.fit(X_train, y_train)

# Measure the accuracy of the tree on the held-out rows
predictions = tree_xai.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)
print(f"Accuracy: {accuracy * 100:.4f}%")

Accuracy: 100.0000%


In [94]:

def untrace_tree(example_class, trace_path, node, before):
    """Collect every root-to-leaf path of the tree into ``trace_path``.

    Args:
        example_class: not used by this function; kept in the signature.
        trace_path: list that receives one path per leaf (modified in place).
        node: current node; a node whose ``result`` is not None is a leaf.
        before: list of the ``(feature, operator, value)`` conditions that lead
            to ``node``. On a leaf it is extended in place with
            ``(None, result)`` and stored in ``trace_path``.

    The false branch of each node is explored before its true branch, and a
    message is printed for every leaf reached.
    """
    if node.result is None:
        # Internal node: extend a private copy of the prefix for each child so
        # that sibling paths never share the same list.
        for operator, child in (("!=", node.false_branch), ("==", node.true_branch)):
            prefix = copy(before)
            prefix.append((node.feature, operator, node.value))
            untrace_tree(example_class, trace_path, child, prefix)
        return

    # Leaf: close the path with its class and record it
    before.append((None, node.result))
    trace_path.append(before)
    print(f"End of tree at {node.result}")

# Enumerate every root-to-leaf path of the fitted tree
trace_path = []
untrace_tree(example_class=1, trace_path=trace_path, node=tree_xai.tree, before=[])
trace_path

End of tree at -1
End of tree at 1
End of tree at -1


[[('Nationalite', '!=', 'Suisse'), ('Majeur?', '!=', 'oui'), (None, -1)],
 [('Nationalite', '!=', 'Suisse'), ('Majeur?', '==', 'oui'), (None, 1)],
 [('Nationalite', '==', 'Suisse'), (None, -1)]]

In [108]:
def filter_trace_path(class_to_not_keep, trace_path):
    """Return the paths whose final class differs from ``class_to_not_keep``.

    Each path is a sequence whose last element is a pair-like item holding the
    leaf class at index 1. The input list is left untouched; the returned list
    references the same path objects.
    """
    return [path for path in trace_path if path[-1][1] != class_to_not_keep]
# Keep only the paths that do not end on class 1
filtered_trace_path = filter_trace_path(class_to_not_keep=1, trace_path=trace_path)
filtered_trace_path

[[('Nationalite', '!=', 'Suisse'), ('Majeur?', '!=', 'oui'), (None, -1)],
 [('Nationalite', '==', 'Suisse'), (None, -1)]]

In [118]:
def count_rules(to_explain, trace_path):
    """Count, for every path, how many of its rules the example violates.

    Args:
        to_explain: mapping-like example, indexed by feature name.
        trace_path: list of paths; each path is a sequence of
            ``(feature, operator, value)`` rules followed by a final element
            (the leaf entry) that is ignored here.

    Returns:
        A numpy array with one violation count per path, in the same order.
    """

    def is_violated(rule):
        feature, operator, expected = rule
        if operator == '!=':
            satisfied = to_explain[feature] != expected
        else:  # any other operator is treated as '=='
            satisfied = to_explain[feature] == expected
        return not satisfied

    violations = [
        sum(1 for rule in path[:-1] if is_violated(rule))
        for path in trace_path
    ]
    return np.array(violations)
# Example to explain and the class it belongs to
sample = X.iloc[0]
class_to_not_keep = y.iloc[0]
print(sample)
print(class_to_not_keep)

# Discard the paths ending on the sample's own class. The class computed above
# is used here instead of a hard-coded 1, so the cell stays correct when another
# sample is selected.
filtered_trace_path = filter_trace_path(class_to_not_keep, trace_path)

# Number of rules of each remaining path that the sample does not satisfy
n_r_list = count_rules(sample, filtered_trace_path)

# Indices of the path(s) with the highest number of violated rules.
# NOTE(review): a counterfactual usually looks for the fewest changes, which
# would be np.amin here - confirm that the maximum is really intended.
np.argwhere(n_r_list == np.amax(n_r_list))

Adresse           Paris
Majeur?             oui
Nationalite    Francais
Name: 0, dtype: object
1
