In [19]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from ucimlrepo import fetch_ucirepo
from typing import List, Dict
from scipy.special import gammaln

class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class SMCDecisionTree:
    def __init__(self, max_depth=5, min_samples_split=2, n_particles=10, alpha_value=0.1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_particles = n_particles
        self.tree = None
        self.alpha_value = alpha_value

    def _log_dirichlet(self, dirichlet_params):
        return np.sum(gammaln(dirichlet_params)) - gammaln(np.sum(dirichlet_params)) 

    def _initialize_particles(self, n_features):
        return {
            'feature_index': np.random.randint(0, n_features, self.n_particles),
            'threshold': np.random.uniform(0, 1, self.n_particles)
        }

    def _move_particles(self, particles, n_features):
        move_mask = np.random.random(self.n_particles) < 0.5
        particles['feature_index'][move_mask] = np.random.randint(0, n_features, np.sum(move_mask))
        particles['threshold'][~move_mask] += np.random.normal(0, 0.1, np.sum(~move_mask))
        return particles

    def _evaluate_split(self, X, y, feature_index, threshold):
        left_mask = X[:, feature_index] <= threshold
        left_y, right_y = y[left_mask], y[~left_mask]
        
        if len(left_y) == 0 or len(right_y) == 0:
            return -np.inf
        
        left_counts = np.zeros(len(self.classes_))
        right_counts = np.zeros(len(self.classes_))
        
        for i, c in enumerate(self.classes_):
            left_counts[i] = np.sum(left_y == c)
            right_counts[i] = np.sum(right_y == c)
        
        alpha = np.ones(len(self.classes_)) * self.alpha_value
        
        log_likelihood = (
            self._log_dirichlet(left_counts + alpha) - self._log_dirichlet(alpha) +
            self._log_dirichlet(right_counts + alpha) - self._log_dirichlet(alpha)
        )
        
        log_prior = 0 # -(np.log2(4) + np.log2(X.shape[1]))*self.num_nodes()
         
        return log_likelihood + log_prior

    def _smc_split(self, X, y):
        particles = self._initialize_particles(X.shape[1])
        
        for _ in range(5):  # Number of SMC iterations
            particles = self._move_particles(particles, X.shape[1])
            
            weights = np.array([self._evaluate_split(X, y, p_f, p_t) 
                                for p_f, p_t in zip(particles['feature_index'], particles['threshold'])])
            weights = np.where(np.isfinite(weights), np.exp(weights), 0)
            
            # Handle potential zero sum of weights
            if np.sum(weights) == 0:
                weights = np.ones_like(weights) / len(weights)
            else:
                weights /= np.sum(weights)
            
            # Check for NaN values and replace with uniform probabilities if necessary
            if np.any(np.isnan(weights)):
                print("Warning: NaN weights encountered. Using uniform probabilities.")
                weights = np.ones_like(weights) / len(weights)
            
            indices = np.random.choice(self.n_particles, size=self.n_particles, p=weights)
            particles = {k: v[indices] for k, v in particles.items()}
        
        best_index = np.argmax(weights)
        return particles['feature_index'][best_index], particles['threshold'][best_index]

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        unique_classes = np.unique(y)

        if depth >= self.max_depth or n_samples < self.min_samples_split or len(unique_classes) == 1:
            counts = np.zeros(len(self.classes_))
            for i, c in enumerate(self.classes_):
                counts[i] = np.sum(y == c)
            return Node(value=counts)

        feature_index, threshold = self._smc_split(X, y)
        
        left_mask = X[:, feature_index] <= threshold
        X_left, y_left = X[left_mask], y[left_mask]
        X_right, y_right = X[~left_mask], y[~left_mask]
        
        left_subtree = self._grow_tree(X_left, y_left, depth + 1)
        right_subtree = self._grow_tree(X_right, y_right, depth + 1)

        return Node(feature_index=feature_index, threshold=threshold, left=left_subtree, right=right_subtree)

    def fit(self, X, y):
        self.n_features = X.shape[1]
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)
        print(f"Number of classes: {self.n_classes}")
        print(f"Unique classes: {self.classes_}")
        print(f"Input y shape: {y.shape}")
        print(f"Input y unique values: {np.unique(y)}")
        self.tree = self._grow_tree(X, y)

    def _predict_sample(self, x, node):
        if node.value is not None:
            return self.classes_[np.argmax(node.value)]
        
        if x[node.feature_index] <= node.threshold:
            return self._predict_sample(x, node.left)
        else:
            return self._predict_sample(x, node.right)

    def predict(self, X):
        return np.array([self._predict_sample(x, self.tree) for x in X])

    def num_nodes(self):
        return self._count_nodes(self.tree)

    def _count_nodes(self, node):
        if node is None:
            return 0
        return 1 + self._count_nodes(node.left) + self._count_nodes(node.right)

def run_smc_dt_on_dataset(X_train, y_train, X_test, y_test, seed=42):
    try:
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        smc_dt = SMCDecisionTree(max_depth=5, n_particles=100)
        smc_dt.fit(X_train_scaled, y_train)

        y_pred = smc_dt.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        return accuracy, smc_dt.num_nodes()
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None

def experiment_on_datasets(seeds: List[int], n_trials=10, UCI=True) -> Dict[str, Dict[str, float]]:
    if UCI:
        datasets = [
            #(17, "BCW-D"),
            (109, "Wine"),
            # (53, "Iris"),
            # (850, "Raisin"),
        ]
    else:
        # Specify CSV files directly
        datasets = [
            # ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor.csv", "HiddenXOR"),
            # ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/S1.csv", "BasicSanityCheck"), 
            # ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/S2.csv", "HarderSanityCheck")
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_lv1.csv", "HiddenXORLV1"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_lv2.csv", "HiddenXORLV2"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_lv3.csv", "HiddenXORLV3"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_real_lv1.csv", "HiddenXORLV1Real"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_real_lv2.csv", "HiddenXORLV2Real"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_real_lv3.csv", "HiddenXORLV3Real")
        ]

    results = {}

    for dataset_info in datasets:
        if UCI:
            dataset_id, dataset_name = dataset_info
            print(f"\nRunning experiment on {dataset_name} dataset...")
            dataset = fetch_ucirepo(id=dataset_id)
            X = dataset.data.features.values
            y = dataset.data.targets.values.ravel()
        else:
            file_path, dataset_name = dataset_info
            print(f"\nRunning experiment on {dataset_name} dataset from {file_path}...")
            if not os.path.exists(file_path):
                print(f"File {file_path} not found.")
                continue
            data = pd.read_csv(file_path)
            X = data.iloc[:, :-2].values  # Assuming last two columns are 'train'/'test' and the target
            y = data.iloc[:, -2].values
            split_column = data.iloc[:, -1].values  # 'train'/'test' column

            # Split based on the 'train'/'test' column
            X_train = X[split_column == 'train']
            y_train = y[split_column == 'train']
            X_test = X[split_column == 'test']
            y_test = y[split_column == 'test']

        # print(f"Original y shape: {y.shape}")
        # print(f"Original y unique values: {np.unique(y)}")
        
        # print(f"Dataset shape: {X.shape}")
        # print(f"Unique classes in dataset: {np.unique(y)}")
        
        accuracies = []
        n_nodes_list = []
        for seed in seeds:
            print(f"\nRunning with seed {seed}")
            if UCI: 
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
            
            if y_train.dtype == object:
                le = LabelEncoder()
                y_train = le.fit_transform(y_train)
                y_test = le.transform(y_test)
                # print(f"After LabelEncoder - y_train unique values: {np.unique(y_train)}")
                # print(f"After LabelEncoder - y_test unique values: {np.unique(y_test)}")
            try:
                accs, sizes = [], []
                for _ in range(n_trials):
                    accuracy, n_nodes = run_smc_dt_on_dataset(X_train, y_train, X_test, y_test, seed)
                    if accuracy is not None and n_nodes is not None:
                        accs.append(accuracy)
                        sizes.append(n_nodes)
                    accuracies.append(np.mean(accs))
                    n_nodes_list.append(np.mean(sizes))
                    print(f"Seed {seed} - Accuracy: {np.mean(accs):.4f}, Nodes: {np.mean(sizes)}")
            except Exception as e:
                print(f"Error occurred with seed {seed}: {str(e)}")
        
        if accuracies and n_nodes_list:
            mean_accuracy = np.mean(accuracies)
            std_accuracy = np.std(accuracies)
            mean_nodes = np.mean(n_nodes_list)
            std_nodes = np.std(n_nodes_list)
            
            results[dataset_name] = {
                "mean_accuracy": mean_accuracy,
                "std_accuracy": std_accuracy,
                "mean_nodes": mean_nodes,
                "std_nodes": std_nodes
            }
            
            print(f"{dataset_name} - Mean Accuracy: {mean_accuracy:.4f}, Std: {std_accuracy:.4f}")
            print(f"{dataset_name} - Mean Nodes: {mean_nodes:.2f}, Std: {std_nodes:.2f}")
        else:
            print(f"Failed to process {dataset_name} dataset")
    
    return results

# Run the experiments
seeds = np.array([1, 2, 3, 4, 5])  # Using multiple seeds for more robust results
results = experiment_on_datasets(seeds, n_trials=10, UCI=True)

# Print final results
print("\nFinal Results:")
for dataset, metrics in results.items():
    print(f"{dataset}:")
    print(f"  Accuracy - Mean: {metrics['mean_accuracy']:.4f}, Std: {metrics['std_accuracy']:.4f}")
    print(f"  Nodes    - Mean: {metrics['mean_nodes']:.2f}, Std: {metrics['std_nodes']:.2f}")


Running experiment on Wine dataset...

Running with seed 1
Number of classes: 3
Unique classes: [1 2 3]
Input y shape: (142,)
Input y unique values: [1 2 3]
Seed 1 - Accuracy: 0.9722, Nodes: 17.0
Number of classes: 3
Unique classes: [1 2 3]
Input y shape: (142,)
Input y unique values: [1 2 3]
Seed 1 - Accuracy: 0.9722, Nodes: 16.0
Number of classes: 3
Unique classes: [1 2 3]
Input y shape: (142,)
Input y unique values: [1 2 3]
Seed 1 - Accuracy: 0.9537, Nodes: 15.666666666666666
Number of classes: 3
Unique classes: [1 2 3]
Input y shape: (142,)
Input y unique values: [1 2 3]
Seed 1 - Accuracy: 0.9444, Nodes: 16.0
Number of classes: 3
Unique classes: [1 2 3]
Input y shape: (142,)
Input y unique values: [1 2 3]
Seed 1 - Accuracy: 0.9500, Nodes: 15.0
Number of classes: 3
Unique classes: [1 2 3]
Input y shape: (142,)
Input y unique values: [1 2 3]
Seed 1 - Accuracy: 0.9444, Nodes: 15.666666666666666
Number of classes: 3
Unique classes: [1 2 3]
Input y shape: (142,)
Input y unique values: 

## **MCMC** ##

In [22]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from ucimlrepo import fetch_ucirepo
from typing import List, Dict
from scipy.special import gammaln

class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class MCMCDecisionTree:
    def __init__(self, max_depth=5, min_samples_split=2, mcmc_iterations=1000, alpha_value=0.1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.mcmc_iterations = mcmc_iterations
        self.tree = None
        self.alpha_value = alpha_value

    def _log_dirichlet(self, dirichlet_params):
        return np.sum(gammaln(dirichlet_params)) - gammaln(np.sum(dirichlet_params))

    def _evaluate_split(self, X, y, feature_index, threshold):
        left_mask = X[:, feature_index] <= threshold
        left_y, right_y = y[left_mask], y[~left_mask]
        
        if len(left_y) == 0 or len(right_y) == 0:
            return -np.inf
        
        left_counts = np.zeros(len(self.classes_))
        right_counts = np.zeros(len(self.classes_))
        
        for i, c in enumerate(self.classes_):
            left_counts[i] = np.sum(left_y == c)
            right_counts[i] = np.sum(right_y == c)
        
        alpha = np.ones(len(self.classes_)) * self.alpha_value
        
        log_likelihood = (
            self._log_dirichlet(left_counts + alpha) - self._log_dirichlet(alpha) +
            self._log_dirichlet(right_counts + alpha) - self._log_dirichlet(alpha)
        )

        log_prior = -(np.log2(4) + np.log2(X.shape[1]))*self.num_nodes()
         
        return log_likelihood + log_prior

    def _mcmc_split(self, X, y):
        n_features = X.shape[1]
        current_feature = np.random.randint(0, n_features)
        current_threshold = np.random.choice(X[:, current_feature])
        current_score = self._evaluate_split(X, y, current_feature, current_threshold)
        
        best_feature, best_threshold, best_score = current_feature, current_threshold, current_score
        
        for _ in range(self.mcmc_iterations):
            if np.random.random() < 0.5:
                proposed_feature = np.random.randint(0, n_features)
                proposed_threshold = np.random.choice(X[:, proposed_feature])
            else:
                proposed_feature = current_feature
                proposed_threshold = current_threshold + np.random.normal(0, 0.1 * (np.max(X[:, current_feature]) - np.min(X[:, current_feature])))
            
            proposed_score = self._evaluate_split(X, y, proposed_feature, proposed_threshold)
            
            if np.isfinite(proposed_score) and np.isfinite(current_score):
                acceptance_prob = min(1, np.exp(proposed_score - current_score))
            elif np.isfinite(proposed_score):
                acceptance_prob = 1
            else:
                acceptance_prob = 0
            
            if np.random.random() < acceptance_prob:
                current_feature, current_threshold, current_score = proposed_feature, proposed_threshold, proposed_score
                
                if current_score > best_score:
                    best_feature, best_threshold, best_score = current_feature, current_threshold, current_score
        
        return best_feature, best_threshold

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        unique_classes = np.unique(y)

        if depth >= self.max_depth or n_samples < self.min_samples_split or len(unique_classes) == 1:
            counts = np.zeros(len(self.classes_))
            for i, c in enumerate(self.classes_):
                counts[i] = np.sum(y == c)
            return Node(value=counts)

        feature_index, threshold = self._mcmc_split(X, y)
        
        left_mask = X[:, feature_index] <= threshold
        X_left, y_left = X[left_mask], y[left_mask]
        X_right, y_right = X[~left_mask], y[~left_mask]
        
        left_subtree = self._grow_tree(X_left, y_left, depth + 1)
        right_subtree = self._grow_tree(X_right, y_right, depth + 1)

        return Node(feature_index=feature_index, threshold=threshold, left=left_subtree, right=right_subtree)

    def fit(self, X, y):
        self.n_features = X.shape[1]
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)
        print(f"Number of classes: {self.n_classes}")
        print(f"Unique classes: {self.classes_}")
        print(f"Input y shape: {y.shape}")
        print(f"Input y unique values: {np.unique(y)}")
        self.tree = self._grow_tree(X, y)

    def _predict_sample(self, x, node):
        if node.value is not None:
            return self.classes_[np.argmax(node.value)]
        
        if x[node.feature_index] <= node.threshold:
            return self._predict_sample(x, node.left)
        else:
            return self._predict_sample(x, node.right)

    def predict(self, X):
        return np.array([self._predict_sample(x, self.tree) for x in X])

    def num_nodes(self):
        return self._count_nodes(self.tree)

    def _count_nodes(self, node):
        if node is None:
            return 0
        return 1 + self._count_nodes(node.left) + self._count_nodes(node.right)

def run_mcmc_dt_on_dataset(X_train, y_train, X_test, y_test):
    
    try:
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        mcmc_dt = MCMCDecisionTree(max_depth=5)
        mcmc_dt.fit(X_train_scaled, y_train)

        y_pred = mcmc_dt.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        return accuracy, mcmc_dt.num_nodes()
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None
    
# ... [rest of the code remains the same] ...
def experiment_on_datasets(seeds: List[int], n_trials: int=1, UCI: bool = False) -> Dict[str, Dict[str, float]]:

    if UCI:
        datasets = [
            (17, "BCW-D"),
            (109, "Wine"),
            (53, "Iris"),
            (850, "Raisin"),
            ]
    else:
        # Specify CSV files directly
        datasets = [
            # ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor.csv", "HiddenXOR"),
            # ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/S1.csv", "BasicSanityCheck"), 
            # ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/S2.csv", "HarderSanityCheck")
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_lv1.csv", "HiddenXORLV1"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_lv2.csv", "HiddenXORLV2"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_lv3.csv", "HiddenXORLV3"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_real_lv1.csv", "HiddenXORLV1Real"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_real_lv2.csv", "HiddenXORLV2Real"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_real_lv3.csv", "HiddenXORLV3Real")
        ]


    results = {}

    for dataset_info in datasets:
        if UCI:
            dataset_id, dataset_name = dataset_info
            print(f"\nRunning experiment on {dataset_name} dataset...")
            dataset = fetch_ucirepo(id=dataset_id)
            X = dataset.data.features.values
            y = dataset.data.targets.values.ravel()
        else:
            file_path, dataset_name = dataset_info
            print(f"\nRunning experiment on {dataset_name} dataset from {file_path}...")
            if not os.path.exists(file_path):
                print(f"File {file_path} not found.")
                continue
            data = pd.read_csv(file_path)
            X = data.iloc[:, :-2].values  # Assuming last two columns are 'train'/'test' and the target
            y = data.iloc[:, -2].values
            split_column = data.iloc[:, -1].values  # 'train'/'test' column

            # Split based on the 'train'/'test' column
            X_train = X[split_column == 'train']
            y_train = y[split_column == 'train']
            X_test = X[split_column == 'test']
            y_test = y[split_column == 'test']

        print(f"Original y shape: {y.shape}")
        print(f"Original y unique values: {np.unique(y)}")

        if y.dtype == object:
            le = LabelEncoder()
            y = le.fit_transform(y)
            print(f"After LabelEncoder - y unique values: {np.unique(y)}")
        
        print(f"Dataset shape: {X.shape}")
        print(f"Unique classes in dataset: {np.unique(y)}")
        
        accuracies = []
        n_nodes_list = []
        for seed in seeds:
            print(f"\nRunning with seed {seed}")
            if UCI: 
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
            accs, sizes = [], []
            for _ in range(n_trials): 
                accuracy, n_nodes = run_mcmc_dt_on_dataset(X_train, y_train, X_test, y_test)
                if accuracy is not None and n_nodes is not None:
                    accs.append(accuracy)
                    sizes.append(n_nodes)
                accuracies.append(np.mean(accs))
                n_nodes_list.append(np.mean(sizes))
                print(f"Seed {seed} - Accuracy: {np.mean(accs):.4f}, Nodes: {np.mean(sizes)}")
            if accuracy is not None and n_nodes is not None:
                accuracies.append(accuracy)
                n_nodes_list.append(n_nodes)
                print(f"Seed {seed} - Accuracy: {accuracy:.4f}, Nodes: {n_nodes}")
        
        if accuracies and n_nodes_list:
            mean_accuracy = np.mean(accuracies)
            std_accuracy = np.std(accuracies)
            mean_nodes = np.mean(n_nodes_list)
            std_nodes = np.std(n_nodes_list)
            
            results[dataset_name] = {
                "mean_accuracy": mean_accuracy,
                "std_accuracy": std_accuracy,
                "mean_nodes": mean_nodes,
                "std_nodes": std_nodes
            }
            
            print(f"{dataset_name} - Mean Accuracy: {mean_accuracy:.4f}, Std: {std_accuracy:.4f}")
            print(f"{dataset_name} - Mean Nodes: {mean_nodes:.2f}, Std: {std_nodes:.2f}")
        else:
            print(f"Failed to process {dataset_name} dataset")
    
    return results

# Run the experiments
np.random.seed(42)
seeds = np.array([1])  # Using multiple seeds for more robust results
results = experiment_on_datasets(seeds, n_trials=5)

# Print final results
print("\nFinal Results:")
for dataset, metrics in results.items():
    print(f"{dataset}:")
    print(f"  Accuracy - Mean: {metrics['mean_accuracy']:.4f}, Std: {metrics['std_accuracy']:.4f}")
    print(f"  Nodes    - Mean: {metrics['mean_nodes']:.2f}, Std: {metrics['std_nodes']:.2f}")


Running experiment on HiddenXORLV1 dataset from /Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_lv1.csv...
Original y shape: (1000,)
Original y unique values: [0 1]
Dataset shape: (1000, 5)
Unique classes in dataset: [0 1]

Running with seed 1
Number of classes: 2
Unique classes: [0 1]
Input y shape: (700,)
Input y unique values: [0 1]
Seed 1 - Accuracy: 1.0000, Nodes: 15.0
Number of classes: 2
Unique classes: [0 1]
Input y shape: (700,)
Input y unique values: [0 1]
Seed 1 - Accuracy: 1.0000, Nodes: 15.0
Number of classes: 2
Unique classes: [0 1]
Input y shape: (700,)
Input y unique values: [0 1]
Seed 1 - Accuracy: 1.0000, Nodes: 15.0
Number of classes: 2
Unique classes: [0 1]
Input y shape: (700,)
Input y unique values: [0 1]
Seed 1 - Accuracy: 1.0000, Nodes: 15.0
Number of classes: 2
Unique classes: [0 1]
Input y shape: (700,)
Input y unique values: [0 1]
Seed 1 - Accuracy: 1.0000, Nodes: 15.0
Seed 1 - Accuracy: 1.0000, Nodes: 15
HiddenXORLV1 - M

In [23]:
results

{'HiddenXORLV1': {'mean_accuracy': 1.0,
  'std_accuracy': 0.0,
  'mean_nodes': 15.0,
  'std_nodes': 0.0},
 'HiddenXORLV2': {'mean_accuracy': 0.6266666666666668,
  'std_accuracy': 1.1102230246251565e-16,
  'mean_nodes': 55.0,
  'std_nodes': 0.0},
 'HiddenXORLV3': {'mean_accuracy': 0.5499999999999999,
  'std_accuracy': 1.1102230246251565e-16,
  'mean_nodes': 63.0,
  'std_nodes': 0.0},
 'HiddenXORLV1Real': {'mean_accuracy': 0.5438148148148149,
  'std_accuracy': 0.02917674722917502,
  'mean_nodes': 15.81111111111111,
  'std_nodes': 0.6996471773969406},
 'HiddenXORLV2Real': {'mean_accuracy': 0.4684074074074074,
  'std_accuracy': 0.0025872961772700743,
  'mean_nodes': 19.400000000000002,
  'std_nodes': 0.7302967433402214},
 'HiddenXORLV3Real': {'mean_accuracy': 0.5331296296296296,
  'std_accuracy': 0.001780572648749831,
  'mean_nodes': 17.400000000000002,
  'std_nodes': 0.7302967433402214}}

## **Synthetic Data Generation for testing** ##

In [None]:
import pandas as pd
import numpy as np

np.random.seed(42)

# Generate synthetic data
n_samples = 1000
n_features = 5
n_classes = 4

# Generate features
X = np.random.randint(0, 4, size=(n_samples, n_features))

# Generate target variable with some noise
y = np.zeros(n_samples, dtype=int)
y[(X[:, 0] == 0) & (X[:, 1] == 1)] = 0
y[(X[:, 0] == 1) & (X[:, 2] == 2)] = 1
y[(X[:, 1] == 2) & (X[:, 3] == 0)] = 2
y[(X[:, 2] == 3) & (X[:, 4] == 1)] = 3

# Add some noise
noise = np.random.rand(n_samples) < 0.1
y[noise] = np.random.randint(0, 4, size=sum(noise))

# Create a DataFrame
df = pd.DataFrame(X, columns=[f'Feature_{i+1}' for i in range(n_features)])
df['Target'] = y

# Split into train and test
train_mask = np.random.rand(len(df)) < 0.8
df['Split'] = np.where(train_mask, 'train', 'test')

# Save to CSV
df.to_csv('/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/S2.csv', index=False)

print(df.head(10))
print("\nFeature distributions:")
print(df[['Feature_1', 'Feature_2', 'Feature_3', 'Feature_4', 'Feature_5']].describe())
print("\nTarget distribution:")
print(df['Target'].value_counts(normalize=True))


   Feature_1  Feature_2  Feature_3  Feature_4  Feature_5  Target  Split
0          2          3          0          2          2       0  train
1          3          0          0          2          1       0  train
2          2          2          2          2          3       0  train
3          0          3          3          3          2       0   test
4          1          0          1          3          3       0  train
5          1          1          1          3          3       0   test
6          0          0          3          1          1       3  train
7          0          3          0          0          2       0  train
8          2          2          1          3          3       0  train
9          3          3          2          1          1       0  train

Feature distributions:
         Feature_1    Feature_2    Feature_3    Feature_4    Feature_5
count  1000.000000  1000.000000  1000.000000  1000.000000  1000.000000
mean      1.531000     1.428000     1.4360

## **Potentially Updated SMC Code** ##

In [20]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.metrics import accuracy_score
from ucimlrepo import fetch_ucirepo
from typing import List, Dict
from scipy.special import gammaln

class Node:
    def __init__(self, feature_index=None, threshold=None, left=None, right=None, value=None):
        self.feature_index = feature_index
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

class SMCDecisionTree:
    def __init__(self, max_depth=5, min_samples_split=2, n_particles=10, alpha_value=0.1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.n_particles = n_particles
        self.tree = None
        self.alpha_value = alpha_value

    def _log_dirichlet(self, dirichlet_params):
        return np.sum(gammaln(dirichlet_params)) - gammaln(np.sum(dirichlet_params)) 

    def _initialize_particles(self, n_features):
        return {
            'feature_index': np.random.randint(0, n_features, self.n_particles),
            'threshold': np.random.uniform(0, 1, self.n_particles)
        }

    def _move_particles(self, particles, n_features):
        move_mask = np.random.random(self.n_particles) < 0.5
        particles['feature_index'][move_mask] = np.random.randint(0, n_features, np.sum(move_mask))
        particles['threshold'][~move_mask] += np.random.normal(0, 0.1, np.sum(~move_mask))
        return particles

    def _evaluate_split(self, X, y, feature_index, threshold):
        left_mask = X[:, feature_index] <= threshold
        left_y, right_y = y[left_mask], y[~left_mask]
        
        if len(left_y) == 0 or len(right_y) == 0:
            return -np.inf
        
        left_counts = np.zeros(len(self.classes_))
        right_counts = np.zeros(len(self.classes_))
        
        for i, c in enumerate(self.classes_):
            left_counts[i] = np.sum(left_y == c)
            right_counts[i] = np.sum(right_y == c)
        
        alpha = np.ones(len(self.classes_)) * self.alpha_value
        
        log_likelihood = (
            self._log_dirichlet(left_counts + alpha) - self._log_dirichlet(alpha) +
            self._log_dirichlet(right_counts + alpha) - self._log_dirichlet(alpha)
        )
        
        log_prior = 0 #-(np.log2(4) + np.log2(X.shape[1]))*self.num_internal_nodes() 
        return log_likelihood + log_prior

    def _smc_split(self, X, y):
        particles = self._initialize_particles(X.shape[1])
        
        for _ in range(20):  # Number of SMC iterations
            particles = self._move_particles(particles, X.shape[1])
            
            weights = np.array([self._evaluate_split(X, y, p_f, p_t) 
                                for p_f, p_t in zip(particles['feature_index'], particles['threshold'])])
            weights = np.where(np.isfinite(weights), np.exp(weights), 0)
            
            # Handle potential zero sum of weights
            if np.sum(weights) == 0:
                weights = np.ones_like(weights) / len(weights)
            else:
                weights /= np.sum(weights)
            
            # Check for NaN values and replace with uniform probabilities if necessary
            if np.any(np.isnan(weights)):
                print("Warning: NaN weights encountered. Using uniform probabilities.")
                weights = np.ones_like(weights) / len(weights)
            
            indices = np.random.choice(self.n_particles, size=self.n_particles, p=weights)
            particles = {k: v[indices] for k, v in particles.items()}
        
        best_index = np.argmax(weights)
        return particles['feature_index'][best_index], particles['threshold'][best_index]

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        unique_classes = np.unique(y)

        if depth >= self.max_depth or n_samples < self.min_samples_split or len(unique_classes) == 1:
            counts = np.zeros(len(self.classes_))
            for i, c in enumerate(self.classes_):
                counts[i] = np.sum(y == c)
            return Node(value=counts)

        feature_index, threshold = self._smc_split(X, y)
        
        left_mask = X[:, feature_index] <= threshold
        X_left, y_left = X[left_mask], y[left_mask]
        X_right, y_right = X[~left_mask], y[~left_mask]
        
        left_subtree = self._grow_tree(X_left, y_left, depth + 1)
        right_subtree = self._grow_tree(X_right, y_right, depth + 1)

        return Node(feature_index=feature_index, threshold=threshold, left=left_subtree, right=right_subtree)

    def _compute_log_posterior(self, node, alpha):
        if node is None:
            return 0
        
        if node.value is not None:
            # Leaf node
            counts = node.value
            log_likelihood = self._log_dirichlet(counts + alpha) - self._log_dirichlet(alpha)
            log_prior = 0 # -(np.log2(4) + np.log2(self.n_features)) * self.num_internal_nodes()
            return log_likelihood + log_prior
        
        left_log_post = self._compute_log_posterior(node.left, alpha)
        right_log_post = self._compute_log_posterior(node.right, alpha)
        
        return left_log_post + right_log_post

    def log_posterior(self):
        alpha = np.ones(len(self.classes_)) * self.alpha_value
        return self._compute_log_posterior(self.tree, alpha) 

    def fit(self, X, y):
        self.n_features = X.shape[1]
        self.classes_ = np.unique(y)
        self.n_classes = len(self.classes_)
        print(f"Number of classes: {self.n_classes}")
        print(f"Unique classes: {self.classes_}")
        print(f"Input y shape: {y.shape}")
        print(f"Input y unique values: {np.unique(y)}")
        self.tree = self._grow_tree(X, y)
        log_posterior = self.log_posterior()
        print(f"Log Posterior of the fitted tree: {log_posterior}")

    def _predict_sample(self, x, node):
        if node.value is not None:
            return self.classes_[np.argmax(node.value)]
        
        if x[node.feature_index] <= node.threshold:
            return self._predict_sample(x, node.left)
        else:
            return self._predict_sample(x, node.right)

    def predict(self, X):
        return np.array([self._predict_sample(x, self.tree) for x in X])

    def num_nodes(self):
        return self._count_nodes(self.tree)

    def _count_nodes(self, node):
        if node is None:
            return 0
        return 1 + self._count_nodes(node.left) + self._count_nodes(node.right)

    def _count_internal_nodes(self, node):
        if node is None or (node.left is None and node.right is None):
            return 0
        return 1 + self._count_internal_nodes(node.left) + self._count_internal_nodes(node.right)

    def num_internal_nodes(self):
        return self._count_internal_nodes(self.tree)

def run_smc_dt_on_dataset(X_train, y_train, X_test, y_test, max_depth=5, n_particles=100, seed=42):
    try:
        scaler = MinMaxScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        smc_dt = SMCDecisionTree(max_depth=max_depth, n_particles=n_particles)
        smc_dt.fit(X_train_scaled, y_train)

        y_pred = smc_dt.predict(X_test_scaled)
        accuracy = accuracy_score(y_test, y_pred)

        return accuracy, smc_dt.num_nodes()
    except Exception as e:
        print(f"Error occurred: {str(e)}")
        import traceback
        traceback.print_exc()
        return None, None

def experiment_on_datasets(seeds: List[int], n_trials=10, UCI=True) -> Dict[str, Dict[str, float]]:
    if UCI:
        datasets = [
            # (17, "BCW-D"),
            (109, "Wine"),
            # (53, "Iris"),
            # (850, "Raisin"),
        ]
    else:
        # Specify CSV files directly
        datasets = [
            # ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor.csv", "HiddenXOR"),
            # ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/S1.csv", "BasicSanityCheck"), 
            # ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/S2.csv", "HarderSanityCheck")
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_lv1.csv", "HiddenXORLV1"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_lv2.csv", "HiddenXORLV2"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_lv3.csv", "HiddenXORLV3"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_real_lv1.csv", "HiddenXORLV1Real"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_real_lv2.csv", "HiddenXORLV2Real"),
            ("/Users/momac18/Dropbox/Mac/Desktop/CodeBases/RF-GFN/RF-GFN/baselines/data/hidden_xor_real_lv3.csv", "HiddenXORLV3Real")
        ]

    results = {}

    for dataset_info in datasets:
        if UCI:
            dataset_id, dataset_name = dataset_info
            print(f"\nRunning experiment on {dataset_name} dataset...")
            dataset = fetch_ucirepo(id=dataset_id)
            X = dataset.data.features.values
            y = dataset.data.targets.values.ravel()
        else:
            file_path, dataset_name = dataset_info
            print(f"\nRunning experiment on {dataset_name} dataset from {file_path}...")
            if not os.path.exists(file_path):
                print(f"File {file_path} not found.")
                continue
            data = pd.read_csv(file_path)
            X = data.iloc[:, :-2].values  # Assuming last two columns are 'train'/'test' and the target
            y = data.iloc[:, -2].values
            split_column = data.iloc[:, -1].values  # 'train'/'test' column

            # Split based on the 'train'/'test' column
            X_train = X[split_column == 'train']
            y_train = y[split_column == 'train']
            X_test = X[split_column == 'test']
            y_test = y[split_column == 'test']

        # print(f"Original y shape: {y.shape}")
        # print(f"Original y unique values: {np.unique(y)}")
        
        # print(f"Dataset shape: {X.shape}")
        # print(f"Unique classes in dataset: {np.unique(y)}")
        
        accuracies = []
        n_nodes_list = []
        for seed in seeds:
            print(f"\nRunning with seed {seed}")
            if UCI: 
                X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=seed)
            
            if y_train.dtype == object:
                le = LabelEncoder()
                y_train = le.fit_transform(y_train)
                y_test = le.transform(y_test)
                # print(f"After LabelEncoder - y_train unique values: {np.unique(y_train)}")
                # print(f"After LabelEncoder - y_test unique values: {np.unique(y_test)}")
            try:
                accs, sizes = [], []
                for _ in range(n_trials):
                    accuracy, n_nodes = run_smc_dt_on_dataset(X_train, y_train, X_test, y_test, seed)
                    if accuracy is not None and n_nodes is not None:
                        accs.append(accuracy)
                        sizes.append(n_nodes)
                    accuracies.append(np.mean(accs))
                    n_nodes_list.append(np.mean(sizes))
                    print(f"Seed {seed} - Accuracy: {np.mean(accs):.4f}, Nodes: {np.mean(sizes)}")
            except Exception as e:
                print(f"Error occurred with seed {seed}: {str(e)}")
        
        if accuracies and n_nodes_list:
            mean_accuracy = np.mean(accuracies)
            std_accuracy = np.std(accuracies)
            mean_nodes = np.mean(n_nodes_list)
            std_nodes = np.std(n_nodes_list)
            
            results[dataset_name] = {
                "mean_accuracy": mean_accuracy,
                "std_accuracy": std_accuracy,
                "mean_nodes": mean_nodes,
                "std_nodes": std_nodes
            }
            
            print(f"{dataset_name} - Mean Accuracy: {mean_accuracy:.4f}, Std: {std_accuracy:.4f}")
            print(f"{dataset_name} - Mean Nodes: {mean_nodes:.2f}, Std: {std_nodes:.2f}")
        else:
            print(f"Failed to process {dataset_name} dataset")
    
    return results

# Run the experiments
seeds = np.array([1])  # Using multiple seeds for more robust results
results = experiment_on_datasets(seeds, n_trials=1, UCI=True)

# Print final results
print("\nFinal Results:")
for dataset, metrics in results.items():
    print(f"{dataset}:")
    print(f"  Accuracy - Mean: {metrics['mean_accuracy']:.4f}, Std: {metrics['std_accuracy']:.4f}")
    print(f"  Nodes    - Mean: {metrics['mean_nodes']:.2f}, Std: {metrics['std_nodes']:.2f}")


Running experiment on Wine dataset...

Running with seed 1
Number of classes: 3
Unique classes: [1 2 3]
Input y shape: (142,)
Input y unique values: [1 2 3]
Log Posterior of the fitted tree: -100.44632524007334
Seed 1 - Accuracy: 0.6389, Nodes: 3.0
Wine - Mean Accuracy: 0.6389, Std: 0.0000
Wine - Mean Nodes: 3.00, Std: 0.00

Final Results:
Wine:
  Accuracy - Mean: 0.6389, Std: 0.0000
  Nodes    - Mean: 3.00, Std: 0.00
