In [None]:
## import
import treeple.tree._honest_tree
from treeple.ensemble._supervised_forest import ObliqueRandomForestClassifier
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import time

In [3]:
## Feature normalization
# Load the per-subject parcellated thickness table. The code below treats the
# first two columns as labels and every remaining column as a numeric feature.
df_human = pd.read_excel('data/Human.parcellated_thickness.xlsx')

# Feature columns start at the third column (positional index 2).
features = df_human.columns[2:]

# Z-score each feature column independently: (x - column mean) / column std.
# Series.std() is the sample standard deviation (ddof=1) and, like mean(),
# skips NaNs. A constant column has std == 0 and therefore becomes NaN.
normalized_columns = {
    feature: (df_human[feature] - df_human[feature].mean()) / df_human[feature].std()
    for feature in features
}

# Put the two label columns back in front of the normalized features and save.
# NOTE: to_excel does not create missing directories, so 'data/normalized/'
# must already exist.
label_human = df_human.iloc[:, :2]
df_human_normalize = pd.concat([label_human, pd.DataFrame(normalized_columns)], axis=1)
df_human_normalize.to_excel('data/normalized/Human_normalized_parcellated_thickness.xlsx', index=False)

# Keep only the columns whose name does not start with 'Schaefer'.
df_human_normalize_markov = df_human_normalize.loc[:, ~df_human_normalize.columns.str.startswith('Schaefer')]


In [4]:
df_sex = pd.read_excel('data/subjects_age_sex_data_MRI.xlsx')

## set up training data
# Number of leading feature values that go into X1; everything after goes into X2.
# NOTE(review): 182 is taken as-is from the original code - confirm it matches the
# intended split of the feature columns.
N_X1_FEATURES = 182

X1 = []
X2 = []
y_human = []
IDs = set(df_human_normalize_markov['sid'])
ref_IDs = set(df_sex['ID'])

# Build the ID -> Sex lookup once instead of filtering df_sex for every subject.
# keep='first' mirrors taking the first matching row when an ID is repeated.
sex_by_id = df_sex.drop_duplicates(subset='ID', keep='first').set_index('ID')['Sex']

# Visit subjects in order of first appearance in the table (sort=False) rather
# than in set-iteration order: the sample order is then the same on every run,
# which the random_state-seeded train/test splits further down rely on.
for subject, subject_rows in tqdm(df_human_normalize_markov.groupby('sid', sort=False)):
    if subject not in ref_IDs:
        continue  # no sex label available for this subject

    # Flatten the subject's row(s) and drop the two leading label values.
    subject_features = subject_rows.to_numpy().reshape(-1)[2:]
    # Binary target: 1 when the recorded Sex is exactly 'FEMALE', otherwise 0.
    sex = int(sex_by_id.loc[subject] == 'FEMALE')

    X1.append(list(subject_features[:N_X1_FEATURES]))
    X2.append(list(subject_features[N_X1_FEATURES:]))
    y_human.append(sex)

X1_human = np.array(X1)
X2_human = np.array(X2)

100%|██████████| 14465/14465 [00:10<00:00, 1427.51it/s]


In [6]:
# Clean both feature blocks with np.nan_to_num (NaN -> 0.0, +/-inf -> the largest
# finite floats), then join them column-wise into a single feature matrix.
X1_human, X2_human = (np.nan_to_num(block) for block in (X1_human, X2_human))
X_human = np.concatenate([X1_human, X2_human], axis=1)

In [None]:


# Import the supervised tree estimators from the treeple package.
# (Make sure treeple is installed and the estimators below exist in your version.)
from treeple import (ObliqueRandomForestClassifier, 
                     ObliqueDecisionTreeClassifier, 
                     RotationForestClassifier, 
                     ObliqueGradientBoostingClassifier)

# --- Parameters ---
reps = 5  # number of repetitions for each configuration
n_estimator = 2000  # parameter for ensemble classifiers
# List of noise dimensions to add (0 means no extra noise; higher values add more noise)
noise_dims_list = [0, 10, 20, 30, 40]

# Define a dictionary of estimator constructors.
# Each estimator is wrapped in a lambda so that a fresh, unfitted instance is created for each run.
# NOTE(review): the classifiers themselves are not seeded, so accuracies can still vary
# slightly between runs; pass random_state to the constructors if exact repeatability is needed.
estimators = {
    "ObliqueRandomForest": lambda: ObliqueRandomForestClassifier(n_estimators=n_estimator, n_jobs=-1, feature_combinations=2.3),
    "ObliqueDecisionTree": lambda: ObliqueDecisionTreeClassifier(),  # single tree, so no n_estimators
    "RotationForest": lambda: RotationForestClassifier(n_estimators=n_estimator, n_jobs=-1),
    "ObliqueGradientBoosting": lambda: ObliqueGradientBoostingClassifier(n_estimators=n_estimator, n_jobs=-1)
}

# results[estimator_name][noise_dim] = {"accuracy": avg_accuracy, "time": avg_train_time}
results = {est_name: {} for est_name in estimators}

# NOTE(review): only X1_human is used as the base feature matrix below (not the
# combined X_human) - confirm that this is intended.
n_samples = X1_human.shape[0]

# For each noise level, append that many standard-normal columns to the original data.
for noise_dim in noise_dims_list:
    print(f"\n=== Testing with {noise_dim} noisy dimensions added ===")

    # For each estimator from treeple
    for est_name, est_constructor in estimators.items():
        accuracies = []
        train_times = []

        # Repeat the experiment several times to average out randomness.
        for rep in tqdm(range(reps), desc=f"{est_name}, noise dims={noise_dim}"):
            # Create noise features: shape = (n_samples, noise_dim).
            # The generator is seeded from (noise_dim, rep), so the noise is reproducible
            # and every estimator is evaluated on exactly the same noisy data for a given
            # noise level and repetition.
            if noise_dim > 0:
                rng = np.random.default_rng([noise_dim, rep])
                noise_features = rng.normal(0, 1, size=(n_samples, noise_dim))
                X_sim = np.concatenate([X1_human, noise_features], axis=1)
            else:
                X_sim = X1_human.copy()

            # Split the data (80% training, 20% testing) with stratification
            X_train, X_test, y_train, y_test = train_test_split(
                X_sim, y_human, train_size=0.8, random_state=rep, stratify=y_human)

            # Initialize the classifier
            clf = est_constructor()

            # Record training time with a monotonic, high-resolution clock
            # (time.time() can jump if the system clock is adjusted).
            start_time = time.perf_counter()
            clf.fit(X_train, y_train)
            runtime = time.perf_counter() - start_time

            # Compute accuracy on the test set; y_test may be a plain list, so
            # convert it explicitly before the element-wise comparison.
            y_pred = clf.predict(X_test)
            accuracy = np.mean(y_pred == np.asarray(y_test))

            accuracies.append(accuracy)
            train_times.append(runtime)

        # Save average results for this estimator and noise level.
        avg_accuracy = np.mean(accuracies)
        avg_time = np.mean(train_times)
        results[est_name][noise_dim] = {"accuracy": avg_accuracy, "time": avg_time}
        print(f"{est_name} | Noise dims: {noise_dim} | Avg. Accuracy: {avg_accuracy:.4f} | Avg. Training Time: {avg_time:.4f} sec")

# Print a summary of all results.
print("\n=== Summary of Results ===")
for est_name, noise_results in results.items():
    print(f"\nEstimator: {est_name}")
    for noise_dim, metrics in noise_results.items():
        print(f"  Noise dims: {noise_dim} | Accuracy: {metrics['accuracy']:.4f} | Training Time: {metrics['time']:.4f} sec")


In [None]:
### SPORF ###
# Train an oblique random forest on `reps` different stratified 80/20 splits of
# X1_human and collect the test accuracy of each repetition.
reps = 5
n_estimator = 2000
sporf_accuracy = []
accuracies = []
for ii in tqdm(range(reps)):
    # random_state=ii makes each repetition use a different but repeatable split.
    x_train, x_test, y_train, y_test = train_test_split(
        X1_human, y_human, train_size=0.8, random_state=ii, stratify=y_human)
    # NOTE(review): the forest itself is not seeded, so accuracies may differ
    # slightly between runs even though the splits are fixed.
    clf = ObliqueRandomForestClassifier(n_estimators=n_estimator, n_jobs=-1, feature_combinations=3.47, max_features=0.993)
    clf.fit(x_train, y_train)
    # y_test may be a plain list; convert it explicitly for the element-wise comparison.
    accuracy = np.mean(clf.predict(x_test) == np.asarray(y_test))
    accuracies.append(accuracy)
sporf_accuracy = np.concatenate((sporf_accuracy, accuracies))
print('Accuracy for n_estimator = ', n_estimator,' is ', accuracies)

# One row per evaluated configuration, one column per repetition. The row count
# is inferred (-1): a hard-coded (5, 5) shape raises ValueError when only a
# single configuration (5 values) has been collected, as is the case here.
sporf_accuracy = sporf_accuracy.reshape(-1, reps)
print(sporf_accuracy)

  0%|          | 0/5 [00:00<?, ?it/s]