In [1]:
import pandas as pd
import numpy as np
import random
from dotenv import load_dotenv
load_dotenv(r'.env')
import os
data_dir = os.environ.get('DATA_DIR')

In [5]:
data = pd.read_csv(data_dir + "/sign_mnist_train.csv")
data.head(5)

Unnamed: 0,label,pixel1,pixel2,pixel3,pixel4,pixel5,pixel6,pixel7,pixel8,pixel9,...,pixel775,pixel776,pixel777,pixel778,pixel779,pixel780,pixel781,pixel782,pixel783,pixel784
0,3,107,118,127,134,139,143,146,150,153,...,207,207,207,207,206,206,206,204,203,202
1,6,155,157,156,156,156,157,156,158,158,...,69,149,128,87,94,163,175,103,135,149
2,2,187,188,188,187,187,186,187,188,187,...,202,201,200,199,198,199,198,195,194,195
3,2,211,211,212,212,211,210,211,210,210,...,235,234,233,231,230,226,225,222,229,163
4,13,164,167,170,172,176,179,180,184,185,...,92,105,105,108,133,163,157,163,164,179


In [6]:
data.isna().any().any()

False

### Random forest from scratch

In [14]:
def split_dataset(dataset, split_ratio=0.8):
    """Permet de separer les données en jeux d'entrainement et de validation selon un pourcentage pour le jeu d'entrainement"""

    random.shuffle(dataset)

    split_index = int(len(dataset) * split_ratio)

    train_set = dataset[:split_index]
    validation_set = dataset[split_index:]

    return train_set, validation_set


train_set, validation_set = split_dataset(np.array(data), split_ratio=0.8)

#Division des labels et des données
X_train = train_set[:,1:]
y_train = train_set[::,0]
X_val = validation_set[:,1:]
y_val = validation_set[::,0]

In [19]:

class RandomForest:
    def __init__(self, n_trees=10, max_depth=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.trees = []

    def fit(self, X, y):
        for _ in range(self.n_trees):
            # Sélection aléatoire des échantillons avec remplacement
            indices = np.random.choice(len(X), len(X), replace=True)
            X_bootstrap = X.iloc[indices]
            y_bootstrap = y.iloc[indices]

            # Construction d'un arbre de décision
            tree = DecisionTree(max_depth=self.max_depth)
            tree.fit(X_bootstrap, y_bootstrap)

            # Ajout de l'arbre à la forêt
            self.trees.append(tree)

    def predict(self, X):
        # Prédiction en utilisant la moyenne des prédictions de chaque arbre
        predictions = np.zeros(len(X))

        for tree in self.trees:
            predictions += tree.predict(X)

        return predictions / len(self.trees)


class DecisionTree:
    def __init__(self, max_depth=None):
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y, depth=0):
        if depth == self.max_depth or len(set(y)) == 1:
            # Si la profondeur maximale est atteinte ou si toutes les étiquettes sont les mêmes
            return np.mean(y)

        # Sélection de la meilleure division
        feature_index, threshold = self.find_best_split(X, y)

        if feature_index is None:
            # Aucune division possible
            return np.mean(y)

        # Division des données
        mask = X.iloc[:, feature_index] <= threshold
        X_left, y_left = X[mask], y[mask]
        X_right, y_right = X[~mask], y[~mask]

        # Construction récursive des sous-arbres
        left_tree = self.fit(X_left, y_left, depth + 1)
        right_tree = self.fit(X_right, y_right, depth + 1)

        # Stockage de la division dans le format [index de la fonctionnalité, seuil, sous-arbre gauche, sous-arbre droit]
        self.tree = [feature_index, threshold, left_tree, right_tree]

        return self.tree

    def find_best_split(self, X, y):
        best_feature, best_threshold, best_mse = None, None, float('inf')

        for feature in X.columns:
            thresholds = X[feature].unique()

            for threshold in thresholds:
                mask = X[feature] <= threshold
                y_left, y_right = y[mask], y[~mask]

                if len(y_left) > 0 and len(y_right) > 0:
                    mse = self.mean_squared_error(y_left) + self.mean_squared_error(y_right)

                    if mse < best_mse:
                        best_mse = mse
                        best_feature = feature
                        best_threshold = threshold

        return best_feature, best_threshold

    def mean_squared_error(self, y):
        return np.mean((y - np.mean(y))**2)

    def predict(self, X):
        predictions = []
        for _, row in X.iterrows():
            predictions.append(self.predict_single(row))

        return np.array(predictions)

    def predict_single(self, data_point):
        if self.tree is None:
            return 0  # Valeur par défaut si l'arbre n'a pas été entraîné

        feature_index, threshold, left_tree, right_tree = self.tree

        if data_point[feature_index] <= threshold:
            return left_tree if not isinstance(left_tree, list) else self.predict_single(data_point, left_tree)
        else:
            return right_tree if not isinstance(right_tree, list) else self.predict_single(data_point, right_tree)


# Tests
random_forest = RandomForest(n_trees=10)
random_forest.fit(pd.DataFrame(X_train), pd.DataFrame(y_train))
predictions = random_forest.predict(pd.DataFrame(X_val))

In [20]:
from sklearn.metrics import accuracy_score

In [21]:
accuracy_score(predictions, y_val)

0.04279730468038609

0.04279730468038609