# In [None]:
import pandas as pd
import os
import glob
import itertools
import numpy as np
from scipy.optimize import minimize
from sklearn.metrics import precision_recall_curve, roc_curve, auc, precision_score, recall_score, f1_score, matthews_corrcoef, log_loss, cohen_kappa_score, top_k_accuracy_score

# Per-method AUPR scores carried over from a previous script, keyed by method name.
method_alphas = {
    'GRNBOOST2': 0.55,
    'SINCERITIES': 0.42,
    'GENIE3': 0.56,
    'LEAP': 0.43,
}

def contains_evaluation_results_mrf(subfolder):
    """Report whether an MRF evaluation results file exists under a folder.

    The directory tree rooted at ``subfolder`` is walked recursively, so the
    results file is found at any depth.

    Args:
        subfolder: Path of the directory to search.

    Returns:
        bool: True if any directory in the tree contains a file named
        "evaluation_results_MRF_Fuse.csv", otherwise False (including when
        ``subfolder`` does not exist, since ``os.walk`` then yields nothing).
    """
    # any() always yields a real bool; the previous loop fell off the end and
    # returned None when nothing was found.
    return any(
        "evaluation_results_MRF_Fuse.csv" in files
        for _root, _dirs, files in os.walk(subfolder)
    )

def compute_method_weights(method_auprs):
    """Compute a min-max normalized weight for each method from its AUPR.

    Each AUPR is rescaled to [0, 1] across the supplied methods and then
    shifted by 0.1, so the weakest method gets 0.1 and the strongest 1.1.

    Args:
        method_auprs: Mapping of method name to its AUPR value.

    Returns:
        dict: Mapping of method name to weight. Empty if ``method_auprs`` is
        empty. When every method has the same AUPR (including the
        single-method case) all methods get the same mid-range weight of 0.6.
    """
    if not method_auprs:
        return {}

    lowest = min(method_auprs.values())
    span = max(method_auprs.values()) - lowest

    if span == 0:
        # All AUPRs tie, so min-max scaling would divide by zero. Use the
        # middle of the normalized range (0.5) plus the usual 0.1 offset,
        # mirroring how tied edge weights are handled when files are loaded.
        return {method: 0.5 + 0.1 for method in method_auprs}

    return {
        method: (aupr - lowest) / span + 0.1
        for method, aupr in method_auprs.items()
    }

def mrf_optimization(edge_weights, method_names, method_weights):
    """Refine the weights reported for one edge by minimizing an energy function.

    The energy is the sum of squared differences between consecutive weights
    (a smoothness term) minus the sum of each weight times the log of its
    method's weight. It is minimized with L-BFGS-B, each value bounded to
    [0, 1], starting from the given weights.

    Args:
        edge_weights: Sequence of weights, one per reporting method.
        method_names: Method name for each entry of ``edge_weights``.
        method_weights: Mapping of method name to its weight; a missing name
            raises KeyError.

    Returns:
        ``edge_weights`` itself, unchanged, when it has fewer than two
        entries; otherwise the optimized array, or the starting weights as an
        array if the optimizer reports failure.
    """
    n_weights = len(edge_weights)
    if n_weights < 2:
        # Nothing to smooth against with zero or one value.
        return edge_weights

    per_method = np.array([method_weights[name] for name in method_names])
    # Constant across optimizer iterations; the floor keeps the log finite.
    log_method_weights = np.log(np.maximum(per_method, 1e-6))

    def energy_function(candidate):
        roughness = np.sum(np.diff(candidate) ** 2)
        fit = -np.sum(candidate * log_method_weights)
        return roughness + fit

    start = np.array(edge_weights)
    outcome = minimize(
        energy_function,
        start,
        method='L-BFGS-B',
        bounds=[(0, 1)] * n_weights,
    )

    if outcome.success:
        return outcome.x
    return start

def merge_edges_with_mrf(methods, method_names, method_auprs, weight_threshold=0):
    """Fuse the edge lists of several methods into one scored edge table.

    All method tables are stacked, grouped by (Gene1, Gene2), and the
    EdgeWeight values of each group are passed through ``mrf_optimization``.
    The mean of the optimized values becomes the edge's "Probability".

    Args:
        methods: Sequence of DataFrames with "Gene1", "Gene2" and
            "EdgeWeight" columns, one per method.
        method_names: Method name for each DataFrame, in the same order.
        method_auprs: Mapping of method name to AUPR, used to derive the
            per-method weights.
        weight_threshold: Edges with a Probability below this are dropped.

    Returns:
        DataFrame with columns "Gene1", "Gene2", "Probability" holding the
        edges whose probability is at least ``weight_threshold``.
    """
    weights_by_method = compute_method_weights(method_auprs)

    # Stack every method's rows and tag each row with the method it came from.
    stacked = pd.concat(methods, ignore_index=True)
    stacked["Method"] = [
        label
        for label, frame in zip(method_names, methods)
        for _ in range(len(frame))
    ]

    rows = []
    for (source, target), edge_rows in stacked.groupby(["Gene1", "Gene2"]):
        refined = mrf_optimization(
            edge_rows["EdgeWeight"].values,
            edge_rows["Method"].values,
            weights_by_method,
        )
        rows.append([source, target, np.mean(refined)])

    summary = pd.DataFrame(rows, columns=["Gene1", "Gene2", "Probability"])
    return summary[summary["Probability"] >= weight_threshold]

def evaluate_edges(predicted_edges, ref_network):
    """Score predicted edges against a reference network.

    Labels are derived only for the rows of ``predicted_edges``: a row is
    positive when its (Gene1, Gene2) pair, in that order, appears in
    ``ref_network``. Reference edges that were never predicted therefore do
    not enter any metric.

    Threshold-dependent metrics (precision, recall, F1, MCC, kappa) use the
    threshold, out of 100 evenly spaced values in [0, 1], that maximizes F1.

    Args:
        predicted_edges: DataFrame with "Gene1", "Gene2" and "Probability"
            columns.
        ref_network: DataFrame with "Gene1" and "Gene2" columns.

    Returns:
        dict: Keys "AUPR", "AUROC", "Precision", "Recall", "F1-Score", "MCC",
        "Log Loss", "Kappa" and "Top-K Acc". "Log Loss" is None when the
        labels contain a single class. For an empty ``predicted_edges`` the
        same keys are returned with zero scores and a None log loss.
    """
    if predicted_edges.empty:
        # Keep the same dict shape as the normal path so callers can always
        # merge or unpack the result the same way.
        return {
            "AUPR": 0,
            "AUROC": 0,
            "Precision": 0,
            "Recall": 0,
            "F1-Score": 0,
            "MCC": 0,
            "Log Loss": None,
            "Kappa": 0,
            "Top-K Acc": 0,
        }

    true_edges = set(zip(ref_network["Gene1"], ref_network["Gene2"]))
    y_true = np.array([
        1 if edge in true_edges else 0
        for edge in zip(predicted_edges["Gene1"], predicted_edges["Gene2"])
    ])
    y_scores = predicted_edges["Probability"].to_numpy(dtype=float)

    precision, recall, _ = precision_recall_curve(y_true, y_scores)
    aupr = auc(recall, precision)
    fpr, tpr, _ = roc_curve(y_true, y_scores)
    auroc = auc(fpr, tpr)

    # Sweep thresholds and keep the first one with the best F1. If F1 never
    # exceeds 0 the threshold stays at 0, i.e. every edge is predicted positive.
    best_f1, best_thresh = 0, 0
    for thresh in np.linspace(0, 1, 100):
        f1 = f1_score(y_true, (y_scores >= thresh).astype(int), zero_division=0)
        if f1 > best_f1:
            best_f1, best_thresh = f1, thresh
    y_pred = (y_scores >= best_thresh).astype(int)

    has_both_classes = np.unique(y_true).size > 1

    precision_metric = precision_score(y_true, y_pred, zero_division=0)
    recall_metric = recall_score(y_true, y_pred, zero_division=0)
    f1_metric = f1_score(y_true, y_pred, zero_division=0)
    mcc_metric = matthews_corrcoef(y_true, y_pred)
    log_loss_metric = log_loss(y_true, y_scores, labels=[0, 1]) if has_both_classes else None
    kappa_metric = cohen_kappa_score(y_true, y_pred)
    # labels=[0, 1] lets the call succeed when y_true holds a single class;
    # without it scikit-learn raises on the class-count mismatch.
    top_k_acc = top_k_accuracy_score(y_true, y_scores, k=1, labels=[0, 1])  # Top-1 accuracy

    return {
        "AUPR": aupr,
        "AUROC": auroc,
        "Precision": precision_metric,
        "Recall": recall_metric,
        "F1-Score": f1_metric,
        "MCC": mcc_metric,
        "Log Loss": log_loss_metric,
        "Kappa": kappa_metric,
        "Top-K Acc": top_k_acc
    }


def load_files(basefolder):
    """Read every CSV directly inside ``basefolder`` into a dict of DataFrames.

    Each table is keyed by the part of its file name before the first dot.
    Tables that have an "EdgeWeight" column get it min-max scaled to [0, 1];
    tables without that column are stored as read.

    Args:
        basefolder: Directory whose ``*.csv`` files are loaded (not recursive).

    Returns:
        dict: Mapping of name to DataFrame, in the order glob returned the files.
    """
    frames_by_name = {}

    for csv_path in glob.glob(os.path.join(basefolder, "*.csv")):
        frame = pd.read_csv(csv_path)

        if "EdgeWeight" in frame.columns:
            lowest = frame["EdgeWeight"].min()
            highest = frame["EdgeWeight"].max()
            if highest > lowest:
                frame["EdgeWeight"] = (frame["EdgeWeight"] - lowest) / (highest - lowest)
            else:
                # No spread to scale by; fall back to a constant mid value.
                frame["EdgeWeight"] = 0.5

        label = os.path.basename(csv_path).partition('.')[0]
        frames_by_name[label] = frame

    return frames_by_name

def main(basefolder, ref_network_file, weight_threshold=0):
    """Evaluate MRF-based edge fusion for every combination of methods in a folder.

    Loads all CSVs in ``basefolder``, drops the names listed in
    ``excluded_files``, and for every combination of 2 up to 5 of the
    remaining methods merges their edges, scores the merged edges against the
    reference network, and collects the metrics. The results are written to
    ``<basefolder>/myresult/evaluation_results_MRF_Fuse.csv`` (the directory
    is created if needed) and the output path is printed.

    Args:
        basefolder: Directory holding the per-method CSV files.
        ref_network_file: File name of the reference network CSV, relative to
            ``basefolder``.
        weight_threshold: Minimum fused probability for an edge to be kept.

    Returns:
        None. Combinations whose merged edge table is empty are skipped.

    NOTE(review): the per-method weights come from the module-level
    ``method_alphas`` mapping; every method CSV in the folder presumably needs
    an entry there — verify against the data folders.
    """
    method_data = load_files(basefolder)
    ref_network = pd.read_csv(os.path.join(basefolder, ref_network_file))
    excluded_files = {"Final_XGBoost_MPCM", "refNetwork", "coffee_rankedEdges", "Final_XGBoost_PCM"}
    method_names = [name for name in method_data if name not in excluded_files]

    results = []
    MAX_COMBO_SIZE = 5  # Caps the combination size to bound the number of runs

    for r in range(2, min(len(method_names), MAX_COMBO_SIZE) + 1):
        for combo in itertools.combinations(method_names, r):
            selected_methods = [method_data[method] for method in combo]
            # The AUPR table defined in this file is `method_alphas`; the
            # previously referenced `method_auprs` global is not defined here
            # and raised NameError on the first combination.
            merged_edges = merge_edges_with_mrf(
                selected_methods,
                combo,
                method_alphas,
                weight_threshold=weight_threshold,
            )

            if not merged_edges.empty:
                metrics = evaluate_edges(merged_edges, ref_network)
                results.append({"Combination": combo, **metrics})

    results_df = pd.DataFrame(results)
    output_dir = os.path.join(basefolder, "myresult")
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, "evaluation_results_MRF_Fuse.csv")
    results_df.to_csv(output_file, index=False)
    print(f"Evaluation results saved to {output_file}")

def process_all_subfolders(main_function, base_folder, csv_file, weight_threshold):
    """Run ``main_function`` on each eligible subfolder of ``base_folder``.

    A direct child of ``base_folder`` is eligible when it is a directory, its
    name does not end in "-50" or "-70", and it does not already contain an
    MRF evaluation results file. The full list of eligible folders is built
    before any of them is processed.

    Args:
        main_function: Callable invoked as
            ``main_function(subfolder, csv_file, weight_threshold)``.
        base_folder: Directory whose immediate children are scanned.
        csv_file: Passed through to ``main_function`` unchanged.
        weight_threshold: Passed through to ``main_function`` unchanged.
    """
    pending = []
    for entry in os.listdir(base_folder):
        candidate = os.path.join(base_folder, entry)
        if not os.path.isdir(candidate):
            continue
        if entry.endswith(("-50", "-70")):
            continue
        # Skip folders that were already evaluated to avoid redundant work.
        if contains_evaluation_results_mrf(candidate):
            continue
        pending.append(candidate)

    for subfolder in pending:
        print(f"Processing folder: {subfolder}")
        main_function(subfolder, csv_file, weight_threshold)

# Script driver: run the MRF evaluation over the subfolders of each dataset in `names`.
# NOTE(review): "your folder path" and "reference network path" look like
# placeholders to be replaced before running — confirm. The dataset name is
# appended to the base path by plain string concatenation, so no path
# separator is inserted between them.
names = ["GSD"]
for name in names:
    process_all_subfolders(main_function=main, base_folder="your folder path" + name, csv_file="reference network path", weight_threshold=0)
