In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import shap
import matplotlib.pyplot as plt
import os
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Load the data 🛸🧐👽

In [2]:
# Both input tables are located relative to the current working directory.
notebook_dir = os.getcwd()
features_file, labels_file = (
    os.path.join(notebook_dir, relative_path)
    for relative_path in ('../data/toy_expression.tsv', '../data/toy_labels.tsv')
)

# Tab-separated files; the first column of each file becomes the index.
features_df, labels_df = (
    pd.read_csv(tsv_path, sep='\t', index_col=0)
    for tsv_path in (features_file, labels_file)
)

## Set up for cross-validation 🫡🎯☺️

In [3]:
# Single seed reused by the fold shuffling here (and by anything else that reads SEED).
SEED = 33

# Transpose the feature table, then pick its rows by the label table's index so
# that row i of X and entry i of y refer to the same index label.
# NOTE(review): presumably features_df is features x samples with sample IDs as
# column labels -- confirm against the input files.
X = features_df.transpose().loc[labels_df.index]
y = labels_df.loc[:, "label"].values

# Shuffled, stratified 5-fold splitter.
cv = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=SEED,
)


## Train a Random Forest 🌲🌳📉

In [4]:
# Per-fold results: one accuracy per fold; SHAP explanations for the first fold only.
accuracies = []
shap_values_all = []
explainer = None

# The progress-bar total is taken from the splitter itself rather than repeated
# as a literal, so it stays correct if the number of folds in `cv` is changed.
for fold, (train_idx, test_idx) in tqdm(
    enumerate(cv.split(X, y), 1),
    total=cv.get_n_splits(),
    desc='Cross-validation',
):
    # X is a DataFrame (positional row selection); y is indexed directly by position.
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y[train_idx], y[test_idx]

    # Train model
    model = RandomForestClassifier(random_state=SEED) # change the model here
    model.fit(X_train, y_train)

    # Predict and evaluate on the held-out fold
    preds = model.predict(X_test)
    acc = accuracy_score(y_test, preds)
    accuracies.append(acc)
    print(f"Fold {fold} Accuracy: {acc:.4f}")

    # SHAP interpretation (only for first fold to save time).
    # The explainer is built with the fold's training data and then applied to
    # the fold's held-out samples.
    if fold == 1:
        explainer = shap.Explainer(model, X_train)
        shap_values = explainer(X_test)
        shap_values_all.append(shap_values)

# Summary of performance: mean and standard deviation across folds
print(f"\nMean Accuracy 🫡: {np.mean(accuracies):.4f} ± {np.std(accuracies):.4f}")

Cross-validation:  80%|████████  | 4/5 [00:00<00:00, 15.12it/s]

Fold 1 Accuracy: 0.0000
Fold 2 Accuracy: 0.0000
Fold 3 Accuracy: 0.5000
Fold 4 Accuracy: 1.0000


Cross-validation: 100%|██████████| 5/5 [00:00<00:00, 15.11it/s]

Fold 5 Accuracy: 0.0000

Mean Accuracy 🫡: 0.3000 ± 0.4000





## SHAP evaluation 🧐😳☃️

In [None]:
# SHAP summary plot for first fold (class 1 in binary classification).
# The beeswarm plot only accepts a 2-D explanation (samples x features). Some
# models yield an explanation with an extra trailing per-class axis, others do
# not, so the class axis is sliced only when it is actually present instead of
# being indexed unconditionally.
if shap_values_all:
    first_fold_explanation = shap_values_all[0]
    if len(first_fold_explanation.shape) == 3:
        # Keep the contributions towards class index 1.
        first_fold_explanation = first_fold_explanation[:, :, 1]
    shap.plots.beeswarm(first_fold_explanation)

ValueError: The beeswarm plot does not support plotting explanations with instances that have more than one dimension!