The inputs should be the gene networks obtained from GRN inference techniques, one file per method. Each file should have the columns "Gene1", "Gene2", "EdgeWeight" and "Direction". Sample files are provided in the data folder.

In [None]:
import csv
def contains_evaluation_results_bayesian(subfolder):
    """Report whether ``evaluation_results_BCU.csv`` exists anywhere under *subfolder*.

    The whole directory tree rooted at *subfolder* is searched, so a result
    file in a nested folder also counts.

    Args:
        subfolder: Path of the directory to search.

    Returns:
        bool: True if the result file is found at any depth, False otherwise
        (including when *subfolder* does not exist, because ``os.walk`` then
        yields nothing).
    """
    # any() stops at the first directory that holds the file and yields an
    # explicit False when none does, instead of falling through to None.
    return any(
        "evaluation_results_BCU.csv" in files
        for _root, _dirs, files in os.walk(subfolder)
    )
def smart_read_csv(filepath):
    """Read a delimited text file into a DataFrame, guessing comma vs. tab.

    The first 2048 characters are given to ``csv.Sniffer`` with the candidate
    delimiters restricted to comma and tab. If the sniffer cannot decide, the
    delimiter of the built-in 'excel' dialect (a comma) is used instead.

    Args:
        filepath: Path of a UTF-8 encoded text file.

    Returns:
        The DataFrame produced by ``pd.read_csv`` with the chosen separator.
    """
    with open(filepath, 'r', encoding='utf-8') as handle:
        head = handle.read(2048)
        try:
            separator = csv.Sniffer().sniff(head, delimiters=[',', '\t']).delimiter
        except csv.Error:
            # Sniffing failed: fall back to the comma of the 'excel' dialect.
            separator = csv.get_dialect('excel').delimiter
        # Rewind so pandas parses the file from its first byte.
        handle.seek(0)
        return pd.read_csv(handle, sep=separator)

In [None]:
import pandas as pd
import os
import glob
import itertools
from sklearn.metrics import precision_recall_curve, roc_curve, auc, precision_score, recall_score, f1_score, matthews_corrcoef, log_loss, cohen_kappa_score, top_k_accuracy_score
import numpy as np
from scipy.stats import beta

def compute_method_weights(method_auprs):
    """Min-max normalise per-method AUPR values into weights for Bayesian updating.

    Each value is rescaled to [0, 1] across the supplied methods and then
    shifted by 0.1, so the lowest-scoring method keeps a non-zero weight and
    all weights fall in [0.1, 1.1].

    Args:
        method_auprs: Mapping of method name to its AUPR value.

    Returns:
        dict: Mapping of method name to weight. Empty for an empty input.
        When every value is identical (including a single method) there is no
        spread to normalise, so each method gets the same mid-range weight
        0.5 + 0.1.
    """
    if not method_auprs:
        return {}

    lowest = min(method_auprs.values())
    spread = max(method_auprs.values()) - lowest

    if spread == 0:
        # All scores are equal: dividing by the spread would raise
        # ZeroDivisionError, and no method deserves more weight than another.
        return {method: 0.5 + 0.1 for method in method_auprs}

    # The +0.1 offset avoids a zero weight for the weakest method.
    return {
        method: (aupr - lowest) / spread + 0.1
        for method, aupr in method_auprs.items()
    }

# Per-method scores. merge_edges_with_bayesian_inference normalises them (via
# compute_method_weights) into the weight carried by each method's vote.
method_alphas = {'GRNBOOST2': 0.55, 'SINCERITIES': 0.42, 'GENIE3': 0.56, 'LEAP': 0.43}


def merge_edges_with_bayesian_inference(methods, method_names, credibility_threshold=0.9, method_auprs=None):
    """Pool edges from several methods and score each edge with a Beta posterior.

    All input frames are stacked and grouped by ``Direction``. Inside a group
    the median ``EdgeWeight`` acts as the cut-off: rows strictly above it add
    their method's weight to alpha, rows at or below it add their method's
    weight to beta, both starting from a Beta(1, 1) prior. The edge
    probability is the posterior mean alpha / (alpha + beta).

    Args:
        methods: Sequence of DataFrames, each with the columns "Direction",
            "Gene1", "Gene2" and "EdgeWeight".
        method_names: Method names aligned with *methods*; each name needs an
            entry in the score table.
        credibility_threshold: Minimum probability an edge needs to be kept.
        method_auprs: Optional mapping of method name to score. Defaults to
            the module-level ``method_alphas`` table.

    Returns:
        DataFrame with the columns "Direction", "Alpha", "Beta",
        "Probability", "Gene1" and "Gene2", restricted to edges whose
        probability is at least *credibility_threshold*.

    Raises:
        ValueError: If *methods* is empty or its length differs from
            *method_names*.
        KeyError: If a method that contributes edges has no configured score.
    """
    # NOTE(review): this function used to read a global named `method_auprs`
    # that is not defined in this file (NameError on every call).
    # `method_alphas` is the only per-method score table here and is assumed
    # to be the intended input -- confirm.
    if method_auprs is None:
        method_auprs = method_alphas
    method_weights = compute_method_weights(method_auprs)

    if len(methods) == 0:
        raise ValueError("At least one method DataFrame is required.")
    if len(methods) != len(method_names):
        raise ValueError("methods and method_names must have the same length.")

    # Tag every frame with its method name and stack them in a single concat
    # (instead of re-copying the growing frame once per method).
    merged_df = pd.concat(
        [df.assign(Method=name) for df, name in zip(methods, method_names)],
        ignore_index=True,
    )

    # Fail early with a readable message rather than a bare KeyError raised
    # from deep inside the per-edge loop.
    unknown = set(merged_df["Method"]) - set(method_weights)
    if unknown:
        raise KeyError(
            "No weight configured for method(s): " + ", ".join(sorted(map(str, unknown)))
        )

    def posterior_parameters(edge_weights, voters):
        """Return (alpha, beta) of the Beta posterior for one edge."""
        alpha_prior = 1
        beta_prior = 1
        threshold = np.median(edge_weights)
        positive_support = sum(
            method_weights[voter]
            for weight, voter in zip(edge_weights, voters)
            if weight > threshold
        )
        negative_support = sum(
            method_weights[voter]
            for weight, voter in zip(edge_weights, voters)
            if weight <= threshold
        )
        return alpha_prior + positive_support, beta_prior + negative_support

    edge_summary = []
    for direction, group in merged_df.groupby("Direction"):
        a, b = posterior_parameters(group["EdgeWeight"].values, group["Method"].values)
        # Mean of Beta(a, b) in closed form; avoids one scipy call per edge.
        edge_summary.append([direction, a, b, a / (a + b)])

    edge_df = pd.DataFrame(edge_summary, columns=["Direction", "Alpha", "Beta", "Probability"])

    # Attach the gene names. Keeping exactly one lookup row per Direction
    # guarantees the left merge cannot duplicate an edge when two sources
    # disagree on the gene columns for the same Direction.
    gene_lookup = merged_df[["Direction", "Gene1", "Gene2"]].drop_duplicates(subset="Direction")
    edge_df = edge_df.merge(gene_lookup, on="Direction", how="left")

    # Keep only edges that reach the requested credibility.
    return edge_df[edge_df["Probability"] >= credibility_threshold]

def load_files(basefolder):
    """Load every ``*.csv`` directly inside *basefolder* and normalise edge weights.

    For each file:
      * the method name is the file name up to its first dot;
      * lower-case ``gene1``/``gene2`` headers are renamed to
        ``Gene1``/``Gene2`` so later steps can rely on one spelling;
      * a ``Direction`` column ("<Gene1> -> <Gene2>") is created when absent;
      * ``EdgeWeight`` (when present) is min-max scaled to [0, 1], or set to
        0.5 when the column has no spread.

    Args:
        basefolder: Directory that holds the per-method CSV files.

    Returns:
        dict: Mapping of method name to its prepared DataFrame, in sorted
        file-path order. Empty when the folder has no CSV files.

    Raises:
        ValueError: If a file has no ``Direction`` column and lacks the gene
            columns needed to build one.
    """
    canonical_gene_columns = {"gene1": "Gene1", "gene2": "Gene2"}
    method_data = {}

    # glob returns paths in arbitrary order; sorting makes the method order
    # (and everything derived from it) reproducible across runs and machines.
    for file in sorted(glob.glob(os.path.join(basefolder, "*.csv"))):
        method_name = os.path.basename(file).split('.')[0]
        df = smart_read_csv(file)
        print(df.columns)

        # Accepting lower-case headers is only useful if downstream code can
        # find the columns afterwards, so rename them to the canonical
        # spelling (unless that spelling already exists in the file).
        df = df.rename(columns={
            lower: proper
            for lower, proper in canonical_gene_columns.items()
            if lower in df.columns and proper not in df.columns
        })

        if "Direction" not in df.columns:
            if "Gene1" in df.columns and "Gene2" in df.columns:
                df["Direction"] = df["Gene1"].astype(str) + " -> " + df["Gene2"].astype(str)
            else:
                raise ValueError("Required columns 'Gene1'/'gene1' and/or 'Gene2'/'gene2' are missing.")
        print(df)

        if "EdgeWeight" in df.columns:
            min_val = df["EdgeWeight"].min()
            max_val = df["EdgeWeight"].max()
            if max_val > min_val:
                df["EdgeWeight"] = (df["EdgeWeight"] - min_val) / (max_val - min_val)
            else:
                # No spread: scaling would divide by zero, use a constant.
                df["EdgeWeight"] = 0.5

        method_data[method_name] = df

    return method_data


def evaluate_edges(predicted_edges, ref_network):
    """Score predicted edges against a reference network.

    An edge is labelled positive when its ``Direction`` string occurs in
    ``ref_network["Direction"]``. Ranking metrics (AUPR, AUROC, log loss) use
    the ``Probability`` column directly; the thresholded metrics binarise it
    at 0.5.

    Args:
        predicted_edges: DataFrame with "Direction" and "Probability" columns.
        ref_network: DataFrame with a "Direction" column listing true edges.

    Returns:
        dict: Keys "AUPR", "AUROC", "Precision", "Recall", "F1-Score", "MCC",
        "Log Loss", "Kappa" and "Top-K Acc". "Log Loss" is None when only one
        class is present among the labels. Every value is None when
        *predicted_edges* is empty, because no metric is defined without
        predictions.
    """
    metric_names = (
        "AUPR", "AUROC", "Precision", "Recall", "F1-Score",
        "MCC", "Log Loss", "Kappa", "Top-K Acc",
    )

    # With no surviving edges the scikit-learn curve functions raise; report
    # "undefined" so a sweep over many method combinations can carry on.
    if len(predicted_edges) == 0:
        return dict.fromkeys(metric_names, None)

    true_edges = set(ref_network["Direction"])

    # Binary labels: 1 if the edge exists in the reference, else 0.
    y_true = predicted_edges["Direction"].isin(true_edges).astype(int).to_numpy()
    y_scores = predicted_edges["Probability"].to_numpy(dtype=float)

    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    aupr = auc(recall, precision)

    fpr, tpr, _ = roc_curve(y_true, y_scores)
    auroc = auc(fpr, tpr)

    threshold = 0.5  # Default threshold for binary classification
    y_pred = (y_scores >= threshold).astype(int)

    # Log loss needs both classes among the labels.
    has_both_classes = np.unique(y_true).size > 1
    log_loss_metric = log_loss(y_true, y_scores, labels=[0, 1]) if has_both_classes else None

    return {
        "AUPR": aupr,
        "AUROC": auroc,
        "Precision": precision_score(y_true, y_pred, zero_division=0),
        "Recall": recall_score(y_true, y_pred, zero_division=0),
        "F1-Score": f1_score(y_true, y_pred, zero_division=0),
        "MCC": matthews_corrcoef(y_true, y_pred),
        "Log Loss": log_loss_metric,
        "Kappa": cohen_kappa_score(y_true, y_pred),
        # Top-1 accuracy computed from the probability scores.
        "Top-K Acc": top_k_accuracy_score(y_true, y_scores.reshape(-1, 1), k=1),
    }

def main(basefolder, ref_network_file, credibility_threshold=0.9):
    """Merge every combination of methods in *basefolder* and evaluate each one.

    Loads the per-method networks and the reference network, runs the
    Bayesian merge for every combination of 2 up to 5 methods (fewer if fewer
    methods are available), evaluates each merged network against the
    reference, and writes one row per combination to
    ``<basefolder>/myresult/evaluation_results_BCU.csv``.

    Args:
        basefolder: Directory holding the method CSV files and the reference.
        ref_network_file: Reference network file name, relative to *basefolder*.
        credibility_threshold: Forwarded to the Bayesian merge as the minimum
            edge probability.

    Raises:
        ValueError: If the reference network has no "Direction" column and
            lacks the gene columns needed to build one.
    """
    method_data = load_files(basefolder)
    ref_network = smart_read_csv(os.path.join(basefolder, ref_network_file))

    if "Direction" not in ref_network.columns:
        # Accept either capitalised or lower-case gene headers.
        ref_columns = ref_network.columns
        gene1_col = next((c for c in ("Gene1", "gene1") if c in ref_columns), None)
        gene2_col = next((c for c in ("Gene2", "gene2") if c in ref_columns), None)
        if not (gene1_col and gene2_col):
            raise ValueError("Required columns 'Gene1'/'gene1' and/or 'Gene2'/'gene2' are missing.")
        ref_network["Direction"] = (
            ref_network[gene1_col].astype(str) + " -> " + ref_network[gene2_col].astype(str)
        )

    # CSV files in the folder that are not individual inference methods.
    excluded_files = {"Final_XGBoost_MPCM", "refNetwork", "coffee_rankedEdges", "Final_XGBoost_PCM"}
    method_names = [name for name in method_data if name not in excluded_files]

    MAX_COMBO_SIZE = 5  # Largest number of methods merged at once
    largest_size = min(len(method_names), MAX_COMBO_SIZE)

    results = []
    combo_count = 0
    for size in range(2, largest_size + 1):
        # Order inside a combination is irrelevant, hence combinations.
        for combo in itertools.combinations(method_names, size):
            combo_count += 1
            frames = [method_data[method] for method in combo]
            merged_edges = merge_edges_with_bayesian_inference(
                frames, combo, credibility_threshold=credibility_threshold
            )
            metrics = evaluate_edges(merged_edges, ref_network)
            results.append({"Combination": combo, **metrics})

    # Persist one row per evaluated combination.
    output_dir = os.path.join(basefolder, "myresult")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, "evaluation_results_BCU.csv")
    pd.DataFrame(results).to_csv(output_file, index=False)

    print(f"Evaluation results saved to {output_file}")
    print("Total I =", combo_count)

def process_all_subfolders(main_function, base_folder, csv_file, weight_threshold):
    """Run *main_function* on each eligible direct subfolder of *base_folder*.

    A subfolder is eligible when it is a directory, its name does not end in
    "-50" or "-70", and no ``evaluation_results_BCU.csv`` exists anywhere
    beneath it (i.e. it has not been processed yet). The list of eligible
    folders is fixed before the first one is processed.

    Args:
        main_function: Callable invoked as
            ``main_function(subfolder, csv_file, weight_threshold)``.
        base_folder: Directory whose direct children are scanned.
        csv_file: Passed through unchanged as the second argument.
        weight_threshold: Passed through unchanged as the third argument.
    """
    pending = []
    for entry in os.listdir(base_folder):
        candidate = os.path.join(base_folder, entry)
        if not os.path.isdir(candidate):
            continue
        if entry.endswith("-50") or entry.endswith("-70"):
            continue
        # Checked last: it walks the whole subtree.
        if contains_evaluation_results_bayesian(candidate):
            continue
        pending.append(candidate)

    for subfolder in pending:
        print(f"Processing folder: {subfolder}")
        main_function(subfolder, csv_file, weight_threshold)

# Driver: run the Bayesian merge and evaluation for each dataset family.
# The folder prefix and the reference file name are placeholders to replace
# with real paths. The threshold of 0 is handed to `main` as its credibility
# threshold, so no merged edge is filtered out before evaluation.
names = ["GSD"]
for name in names:
    print(name)
    process_all_subfolders(main, "your folder path" + name, "reference network path", 0)

