In [3]:
import numpy as np
import pandas as pd
from collections import Counter
import math

# Node class used to build decision tree structure
class Node:
    """
    One node of a decision tree.

    Attributes:
        feature (str): Name of the feature this node splits on.
        threshold (float or int): Split point used for that feature.
        left (Node): Child node on the left branch.
        right (Node): Child node on the right branch.
        value (any): Value stored on the node; a node counts as a leaf
            whenever this is not None.
    """
    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        # Description of the split performed at this node.
        self.feature, self.threshold = feature, threshold
        # Sub-trees reached through the two branches.
        self.left, self.right = left, right
        # Leaf payload (keyword-only in the signature).
        self.value = value

    def is_leaf_node(self):
        """
        Tell whether this node is a leaf.

        Returns:
            bool: False when ``value`` is None, True otherwise.
        """
        if self.value is None:
            return False
        return True

# --- Helper functions for classification and regression ---
def entropy(y):
    """
    Compute the Shannon entropy (in bits) of the values in ``y``.

    Args:
        y (array-like): The target values.

    Returns:
        float: The entropy of the value distribution; 0.0 when ``y`` is empty
        or holds a single distinct value.
    """
    n_samples = len(y)
    # Relative frequency of every distinct value.
    probabilities = [freq / n_samples for freq in Counter(y).values()]

    result = 0.0
    for prob in probabilities:
        result -= prob * math.log2(prob)
    return result

def information_gain(parent, left, right):
    """
    Compute how much entropy a split removes.

    The gain is the parent's entropy minus the entropy of the two children,
    each weighted by its share of the parent's samples.

    Args:
        parent (array-like): The parent node target values.
        left (array-like): The left child node target values.
        right (array-like): The right child node target values.

    Returns:
        float: The information gain from the split.
    """
    n_parent = len(parent)
    left_share = len(left) / n_parent
    right_share = len(right) / n_parent
    children_entropy = left_share * entropy(left) + right_share * entropy(right)
    return entropy(parent) - children_entropy

def mse(y):
    """
    Compute the mean squared deviation of ``y`` from its own mean.

    Args:
        y (array-like): The target values.

    Returns:
        float: The mean of the squared differences between each value and
        the mean of ``y``.
    """
    center = np.mean(y)
    deviations = y - center
    return np.mean(deviations * deviations)

def mse_reduction(parent, left, right):
    """
    Compute how much mean squared error a split removes.

    The reduction is the parent's MSE minus the MSE of the two children,
    each weighted by its share of the parent's samples.

    Args:
        parent (array-like): The parent node target values.
        left (array-like): The left child node target values.
        right (array-like): The right child node target values.

    Returns:
        float: The reduction in mean squared error from the split.
    """
    n_parent = len(parent)
    left_share = len(left) / n_parent
    right_share = len(right) / n_parent
    children_mse = left_share * mse(left) + right_share * mse(right)
    return mse(parent) - children_mse

def calculate_gain(y_parent, y_left, y_right, task='classification'):
    """
    Score a candidate split with the criterion that matches the task.

    Args:
        y_parent (array-like): The parent node target values.
        y_left (array-like): The left child node target values.
        y_right (array-like): The right child node target values.
        task (str): 'classification' selects information gain; any other
            value selects MSE reduction.

    Returns:
        float: The gain from the split, or 0 when either child is empty.
    """
    both_sides_populated = len(y_left) != 0 and len(y_right) != 0
    if not both_sides_populated:
        # A split that leaves one side empty separates nothing.
        return 0

    if task == 'classification':
        return information_gain(y_parent, y_left, y_right)
    return mse_reduction(y_parent, y_left, y_right)

# --- Simple Decision Tree (supports classification and regression) ---
class SimpleDecisionTree:
    """
    A simple implementation of a decision tree that supports both classification and regression.

    The fitted tree is stored as nested dicts: every internal node is a dict with
    the keys 'feature', 'value', 'left' and 'right', and every leaf is the bare
    predicted value. Classification trees send rows where ``feature == value`` to
    the left branch; regression trees send rows where ``feature <= value`` left.

    Attributes:
        task (str): The task type ('classification', 'regression', or 'auto').
            'auto' is replaced by the inferred task when ``fit`` runs.
        max_depth (int or None): The maximum depth of the tree; None disables the limit.
        tree (dict or any): The root of the fitted tree (a dict, or a bare leaf
            value), or None before ``fit`` has been called.
    """
    def __init__(self, task='auto', max_depth=3):
        self.task = task
        self.max_depth = max_depth
        self.tree = None

    def fit(self, X, y):
        """
        Fit the decision tree to the training data.

        Args:
            X (DataFrame): The feature matrix. The working copy stores the target
                in a column named 'target', so a feature with that name would be
                overwritten.
            y (Series or array-like): The target values. A Series is matched to
                the rows of X by index label; any other array-like is matched by
                position.
        """
        if isinstance(y, pd.Series):
            target = y
        else:
            # Give positional targets X's index; a default RangeIndex would be
            # label-aligned against X below and turn into NaN / shuffled values
            # whenever X carries a non-default index.
            target = pd.Series(y, index=X.index)
        if self.task == 'auto':
            # Non-numeric targets cannot be averaged, so they are always classes;
            # numeric targets with few distinct values are treated as classes too.
            non_numeric = not pd.api.types.is_numeric_dtype(target)
            self.task = 'classification' if non_numeric or len(np.unique(target)) <= 10 else 'regression'
        data = X.copy()
        data['target'] = target
        self.tree = self._build_tree(data, depth=0)

    def _leaf_value(self, y):
        """Return the prediction for a leaf: the most frequent class, or the mean for regression."""
        return y.mode()[0] if self.task == 'classification' else y.mean()

    def _split_masks(self, column, value):
        """
        Build the boolean row masks of a candidate split.

        Args:
            column (Series): The values of the feature being split.
            value (any): The split value.

        Returns:
            tuple: (left mask, right mask). Classification uses ``==`` / ``!=``,
            regression uses ``<=`` / ``>``.
        """
        if self.task == 'classification':
            return column == value, column != value
        return column <= value, column > value

    def _build_tree(self, data, depth):
        """
        Recursively build the decision tree.

        Args:
            data (DataFrame): The dataset at the current node, including the 'target' column.
            depth (int): The current depth of the tree.

        Returns:
            dict or value: A split node (dict) or a bare leaf value.
        """
        y = data['target']
        # Pure node: nothing left to separate.
        if len(y.unique()) == 1:
            return y.iloc[0]
        if self.max_depth is not None and depth >= self.max_depth:
            return self._leaf_value(y)
        best_feature, best_value = self._find_best_split(data)
        # No split leaves rows on both sides.
        if best_feature is None:
            return self._leaf_value(y)
        left_mask, right_mask = self._split_masks(data[best_feature], best_value)
        return {
            'feature': best_feature,
            'value': best_value,
            'left': self._build_tree(data[left_mask], depth + 1),
            'right': self._build_tree(data[right_mask], depth + 1)
        }

    def _find_best_split(self, data):
        """
        Find the best feature and value to split the data.

        Every distinct value of every feature column is tried as a split value.
        Candidates that leave one side empty are skipped; among the rest, the
        first candidate with the strictly highest gain wins.

        Args:
            data (DataFrame): The dataset to split, including the 'target' column.

        Returns:
            tuple: The best feature and value to split on, or (None, None) when
            no candidate puts rows on both sides.
        """
        target = data['target']
        best_gain = -1
        best_feature, best_value = None, None
        for col in data.columns:
            if col == 'target':
                continue
            column = data[col]
            for val in np.unique(column):
                # Mask only the target column instead of filtering the whole frame.
                left_mask, right_mask = self._split_masks(column, val)
                left = target[left_mask]
                right = target[right_mask]
                if len(left) == 0 or len(right) == 0:
                    continue
                gain = calculate_gain(target, left, right, self.task)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = col
                    best_value = val
        return best_feature, best_value

    def predict(self, X):
        """
        Predict target values for the given data.

        Args:
            X (DataFrame): The feature matrix.

        Returns:
            Series: The predicted target values, one per row of X.
        """
        return X.apply(lambda row: self._predict_row(row, self.tree), axis=1)

    def _predict_row(self, row, node):
        """
        Predict the target value for a single data point.

        Args:
            row (Series): The feature values of the data point.
            node (dict or any): The current node of the tree.

        Returns:
            any: The predicted target value.
        """
        # Anything that is not a dict is a leaf holding the prediction itself.
        if not isinstance(node, dict):
            return node
        val = row[node['feature']]
        if self.task == 'classification':
            go_left = val == node['value']
        else:
            go_left = val <= node['value']
        return self._predict_row(row, node['left'] if go_left else node['right'])

# --- Random Forest class (classification & regression) ---
class RandomForest:
    """
    A simple implementation of a Random Forest algorithm that supports both classification and regression.

    Each tree is trained on a bootstrap sample of the rows and on one random
    subset of the columns, drawn once per tree.

    Attributes:
        n_estimators (int): The number of trees in the forest.
        max_depth (int or None): The maximum depth of each tree.
        task (str): The task type ('classification' or 'regression').
        max_features (str or int or float): The number of features given to each
            tree: 'sqrt', an int count, or a float fraction of the columns. Any
            other value selects every column.
        random_state (int or None): Seed for the row/feature sampling. None (the
            default) draws from the global ``np.random`` state, as before.
        trees (list): The list of (fitted tree, selected features) pairs.
    """
    def __init__(self, n_estimators=10, max_depth=5, task='classification', max_features='sqrt',
                 random_state=None):
        self.n_estimators = n_estimators
        self.max_depth = max_depth
        self.task = task
        self.max_features = max_features
        self.random_state = random_state
        self.trees = []
        self._reset_rng()

    def _reset_rng(self):
        """(Re)create the private generator from ``random_state``; None means the global NumPy state is used."""
        if self.random_state is None:
            self._rng = None
        else:
            self._rng = np.random.RandomState(self.random_state)

    def _choice(self, *args, **kwargs):
        """Call ``choice`` on the seeded generator when there is one, else on the global ``np.random`` module."""
        rng = getattr(self, '_rng', None)
        source = np.random if rng is None else rng
        return source.choice(*args, **kwargs)

    def _get_bootstrap_sample(self, X, y):
        """
        Create a bootstrap sample from the dataset.

        Args:
            X (DataFrame): The feature matrix.
            y (Series): The target values, in the same row order as X.

        Returns:
            tuple: The bootstrap sample of features and target values; both are
            taken at the same row positions, drawn with replacement.
        """
        indices = self._choice(len(X), size=len(X), replace=True)
        return X.iloc[indices], y.iloc[indices]

    def _get_feature_subset(self, X):
        """
        Select a random subset of features for one tree.

        Args:
            X (DataFrame): The feature matrix.

        Returns:
            array: The selected feature names, drawn without replacement.
        """
        n_features = X.shape[1]
        if self.max_features == 'sqrt':
            k = int(np.sqrt(n_features))
        elif isinstance(self.max_features, float):
            k = int(self.max_features * n_features)
        elif isinstance(self.max_features, int):
            k = self.max_features
        else:
            k = n_features
        # Keep k usable: a small fraction must not round down to zero features,
        # and asking for more columns than exist must not make the draw fail.
        k = min(max(k, 1), n_features)
        return self._choice(X.columns, size=k, replace=False)

    def fit(self, X, y):
        """
        Fit the Random Forest to the training data.

        Args:
            X (DataFrame): The feature matrix.
            y (Series or array-like): The target values, in the same row order as X.
        """
        y = pd.Series(y)
        if not y.index.equals(X.index):
            # Rows are sampled by position, so the targets must carry X's labels
            # for the per-tree fit to line them up again.
            y = y.copy()
            y.index = X.index
        # Restart the generator so that refitting with a seed rebuilds the same forest.
        self._reset_rng()
        self.trees = []
        for _ in range(self.n_estimators):
            X_sample, y_sample = self._get_bootstrap_sample(X, y)
            selected_features = self._get_feature_subset(X_sample)
            tree = SimpleDecisionTree(task=self.task, max_depth=self.max_depth)
            tree.fit(X_sample[selected_features], y_sample)
            self.trees.append((tree, selected_features))

    def predict(self, X):
        """
        Predict target values for the given data.

        Args:
            X (DataFrame): The feature matrix.

        Returns:
            array: The predicted target values: the majority vote of the trees
            for classification, the mean of the trees for regression.

        Raises:
            RuntimeError: If the forest holds no fitted trees.
        """
        if not self.trees:
            raise RuntimeError("RandomForest.predict() called before fit()")
        # One row per sample, one column per tree.
        tree_preds = np.array([tree.predict(X[features]).values for tree, features in self.trees]).T
        if self.task == 'classification':
            return np.array([Counter(row).most_common(1)[0][0] for row in tree_preds])
        return np.mean(tree_preds, axis=1)

# --- Accuracy metric for classification tasks ---
def accuracy_score(y_true, y_pred):
    """
    Compute the fraction of predictions that equal the true labels.

    Args:
        y_true (array-like): The true target values.
        y_pred (array-like): The predicted target values.

    Returns:
        float: The accuracy score.
    """
    # Both inputs become plain arrays, so the comparison is by position.
    expected = np.array(y_true)
    predicted = np.array(y_pred)
    hits = expected == predicted
    return np.mean(hits)


In [4]:
from sklearn.model_selection import train_test_split

data = pd.read_csv("DATA/gender_classification_v7.csv")
df = pd.DataFrame(data)

# The last column is the target.
# NOTE(review): the feature slice stops two columns before the end, so the
# column right before the target is not used as a feature either -- confirm
# this is intentional and not meant to be `:-1`.
X = df.iloc[:, :-2]
y = df.iloc[:, -1]

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# RandomForest draws its bootstrap rows and feature subsets from the global
# NumPy random state; seed it so the reported accuracy is reproducible, like
# the train/test split above.
np.random.seed(42)

# Train Random Forest
rf = RandomForest(n_estimators=10, max_depth=None, task='classification')
rf.fit(X_train, y_train)

# Predict and evaluate
y_pred = rf.predict(X_test)

# Accuracy
print("Predictions:", y_pred)
print("Actual:", y_test)
print("Accuracy:", accuracy_score(y_test, y_pred))


Predictions: ['Male' 'Female' 'Female' ... 'Female' 'Male' 'Female']
Actual: 1501      Male
2586    Female
2653    Female
1055      Male
705       Male
         ...  
2313    Female
3214      Male
2732    Female
1926      Male
4227    Female
Name: gender, Length: 1001, dtype: object
Accuracy: 0.9140859140859141
