In [None]:
# Load libraries

import itertools
import os
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, accuracy_score, roc_curve, auc, precision_recall_curve, average_precision_score, f1_score
from sklearn.model_selection import KFold

In [None]:
# Location of the CSV file holding the HE stain results
csv_file_path = "IPH_HE_results.csv"

# Load the HE results into a DataFrame
df_he_results = pd.read_csv(csv_file_path)

# Derive the study number (SNR) from every case_id: the leading run of characters
# accepted by the pattern below. Note that the pattern is a character class, so it
# accepts any mix of "(", ")", "A", "E" and digits at the start of the string.
df_he_results["SNR"] = df_he_results["case_id"].map(
    lambda case_id: re.search("^[(AE)(0-9)]*", case_id).group(0)
)

# Show the resulting DataFrame
df_he_results

In [None]:
# Choose the secondary stain whose results are combined with HE.
# The result files follow the naming pattern "IPH_<STAIN>_results.csv"; the stains
# listed for this notebook are: EVG, FIBRIN, SMA, GLYCC, SR, CD68, CD34, CD66b.
# Change this single value (instead of commenting lines in and out) to switch stains.
sec_stain_name = "GLYCC"

# Read the secondary stain results from the CSV file
df_sec_stain_results = pd.read_csv(f"IPH_{sec_stain_name}_results.csv")

# Extract the StudyNumber from the case_id using regular expressions
# (same pattern as for the HE results, so both tables share comparable SNR keys)
df_sec_stain_results["SNR"] = [
    re.search("^[(AE)(0-9)]*", case_id).group(0)
    for case_id in df_sec_stain_results["case_id"]
]

# Display the DataFrame
df_sec_stain_results


In [None]:
# Study numbers that occur in both tables: only these samples have an HE result
# as well as a secondary stain result and can therefore be paired
intersection = set(df_he_results['SNR']).intersection(df_sec_stain_results['SNR'])

In [None]:
# Keep only the HE rows whose study number also occurs in the secondary stain results
df_he_results = df_he_results.loc[df_he_results['SNR'].isin(intersection)]

# Keep only the secondary stain rows whose study number also occurs in the HE results
df_sec_stain_results = df_sec_stain_results.loc[df_sec_stain_results['SNR'].isin(intersection)]

In [None]:
# Pair HE rows with secondary stain rows that share the same study number ('SNR').
# Columns present in both tables are disambiguated with the "_HE" / "_STAIN" suffixes.
df_combined = pd.merge(
    df_he_results,
    df_sec_stain_results,
    on='SNR',
    how='inner',
    suffixes=("_HE", "_STAIN"),
)

# Show the paired table
df_combined


In [None]:
# Class balance check: how often each ground-truth label occurs in the
# 'gt_HE' column of the paired table
df_combined.loc[:, "gt_HE"].value_counts()

In [None]:
# Convert the ground truth labels for HE stain to binary format
# ('no' is converted to 0 and 'yes' is converted to 1).
# An explicit mapping is used rather than `replace`, because `replace` relies on an
# implicit object -> int downcast that is deprecated in recent pandas versions.
# Any label other than 'no'/'yes' becomes NaN here, so `astype(int)` fails loudly
# instead of silently passing an unexpected label through.
y_true = df_combined["gt_HE"].map({'no': 0, 'yes': 1}).astype(int)

# Extract the predicted probabilities for the secondary stain
prob_STAIN = df_combined["prob_STAIN"]

# Extract the predicted probabilities for the HE stain
prob_HE = df_combined["prob_HE"]

In [None]:
# Build the two-column feature matrix fed to the ensemble model:
# column 0 = secondary stain probability, column 1 = HE probability
X_ensemble = np.stack((np.asarray(prob_STAIN), np.asarray(prob_HE)), axis=1)

# Show the feature matrix
X_ensemble

In [None]:
# 10-fold cross-validation splitter; rows are shuffled before splitting and the
# random state is fixed so that the folds are identical on every run
kf = KFold(10, shuffle=True, random_state=42)

# Per-fold evaluation metrics, filled by the cross-validation loop:
# area under the ROC curve and accuracy
auc_scores, accuracy_scores = [], []

In [None]:
# Common x-grids (100 evenly spaced points on [0, 1]) onto which the per-fold
# curves are interpolated so that they can be averaged point by point
mean_fpr = np.linspace(0.0, 1.0, num=100)     # false positive rate grid (ROC)
mean_recall = np.linspace(0.0, 1.0, num=100)  # recall grid (Precision-Recall)

# Per-fold interpolated curves, filled by the cross-validation loop:
# true positive rates on `mean_fpr` and precision values on `mean_recall`
tprs, precisions = [], []

In [None]:
# Perform k-fold cross-validation
# KFold yields *positional* row indices. Indexing the label Series with `[...]` would
# do a label-based lookup, which only coincides with positions while the index is a
# default RangeIndex; a plain array makes the positional intent explicit and robust.
y_labels = np.asarray(y_true)

for train_index, test_index in kf.split(X_ensemble):
    # Split the data into training and testing sets
    X_train, X_test = X_ensemble[train_index], X_ensemble[test_index]
    y_train, y_test = y_labels[train_index], y_labels[test_index]

    # Print the size of the training and testing sets
    print(f"train set size: {len(X_train)}")
    print(f"test set size: {len(X_test)}")

    # Train the logistic regression model on the two stain probabilities
    ensemble_model = LogisticRegression()
    ensemble_model.fit(X_train, y_train)

    # Predict the positive-class probability on the test set
    test_probs = ensemble_model.predict_proba(X_test)[:, 1]
    # Hard predictions at the 0.5 probability threshold
    test_preds = (test_probs > 0.5).astype(int)

    # Evaluate the model
    # NOTE(review): the folds are not stratified, so a test fold could in principle
    # contain a single class, in which case the ROC AUC is undefined - confirm that
    # this cannot happen with the data at hand.
    auc_s = roc_auc_score(y_test, test_probs)
    accuracy = accuracy_score(y_test, test_preds)

    # Append the AUC and accuracy scores to their respective lists
    auc_scores.append(auc_s)
    accuracy_scores.append(accuracy)

    # Compute the ROC curve of this fold
    fpr, tpr, _ = roc_curve(y_test, test_probs)

    # Pad the curve with explicit (0, 0) and (1, 1) end points
    fpr = np.insert(fpr, 0, 0.0)
    tpr = np.insert(tpr, 0, 0.0)
    fpr = np.append(fpr, 1.0)
    tpr = np.append(tpr, 1.0)

    # Interpolate the true positive rates onto the common FPR grid
    tprs.append(np.interp(mean_fpr, fpr, tpr))

    # Compute Precision-Recall curve
    precision, recall, _ = precision_recall_curve(y_test, test_probs)

    # Interpolate the precision values onto the common recall grid; both arrays are
    # reversed because np.interp needs increasing x-coordinates
    precisions.append(np.interp(mean_recall, recall[::-1], precision[::-1]))

# Average results across folds
print("Average AUC:", np.mean(auc_scores), " --  Stdev: ", np.std(auc_scores))
print("Average Accuracy:", np.mean(accuracy_scores), " --  Stdev: ", np.std(accuracy_scores))

In [None]:
# Point-wise average of the per-fold curves on the common grids
mean_tpr = np.mean(tprs, axis=0)
mean_precision = np.mean(precisions, axis=0)

# One figure with two side-by-side panels: ROC (left) and Precision-Recall (right)
fig, (ax_roc, ax_pr) = plt.subplots(1, 2, figsize=(14, 6))

# Left panel: mean ROC curve plus the diagonal of a random classifier
roc_label = f"Mean ROC (AUC = {np.mean(auc_scores):.2f} ± {np.std(auc_scores):.2f})"
ax_roc.plot(mean_fpr, mean_tpr, color="blue", label=roc_label)
ax_roc.plot([0, 1], [0, 1], color="gray", linestyle="--", label="Random Classifier")
ax_roc.set_xlabel("False Positive Rate")
ax_roc.set_ylabel("True Positive Rate")
ax_roc.set_title("Mean ROC Curve (10-Fold CV)")
ax_roc.legend(loc="lower right")

# Right panel: mean Precision-Recall curve
ax_pr.plot(mean_recall, mean_precision, color="green", label="Mean PR Curve")
ax_pr.set_xlabel("Recall")
ax_pr.set_ylabel("Precision")
ax_pr.set_title("Mean Precision-Recall Curve (10-Fold CV)")
ax_pr.legend(loc="lower left")

# Shared cosmetics: unit axes, grid, and no top/right borders
for ax in (ax_roc, ax_pr):
    ax.set_xlim(0, 1)
    ax.set_ylim(0, 1)
    ax.grid()
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)

fig.tight_layout()
plt.show()


In [None]:
## Combined Version of the Code

In this section, we combine the HE stain results with secondary stain results based on common SNR values. We then perform k-fold cross-validation to evaluate the performance of the ensemble model using logistic regression. The results are visualized using ROC and Precision-Recall curves.

### Steps:
1. **Load Libraries and Data**: Import necessary libraries and read the CSV files containing HE and secondary stain results.
2. **Data Preprocessing**: Extract SNR values from the `case_id` column using regular expressions and find the intersection of SNR values between the two datasets.
3. **Data Filtering**: Filter the datasets to include only the common SNR values and merge them on the `SNR` column.
4. **Feature Extraction**: Extract ground truth labels and predicted probabilities for HE and secondary stains.
5. **Ensemble Model**: Combine the probabilities into a feature matrix and perform k-fold cross-validation using logistic regression.
6. **Evaluation**: Calculate evaluation metrics such as AUC, accuracy, and F1 score for each fold. Compute mean ROC and Precision-Recall curves.
7. **Visualization**: Plot the mean ROC and Precision-Recall curves of every stain combination in two shared figures, and record which combination achieves the best mean AUC and the best mean average precision.

In [None]:
# Maps each stain combination label ("HE+<secondary stain>") to the CSV file that
# holds the results of the secondary stain. Insertion order is preserved.
# NOTE(review): the SR entry points to "IPH_SR_sb.csv" while every other entry uses
# the "IPH_<STAIN>_results.csv" naming - confirm that this file name is intended.
stain_combos = {
    f"HE+{stain}": csv_name
    for stain, csv_name in (
        ("EVG", "IPH_EVG_results.csv"),
        ("CD66b", "IPH_CD66b_results.csv"),
        ("CD34", "IPH_CD34_results.csv"),
        ("CD68", "IPH_CD68_results.csv"),
        ("SMA", "IPH_SMA_results.csv"),
        ("GLYCC", "IPH_GLYCC_results.csv"),
        ("FIBRIN", "IPH_FIBRIN_results.csv"),
        ("SR", "IPH_SR_sb.csv"),
        # Add more combinations here
    )
}

# Note: Ensure that the CSV files are available in the same directory or provide the correct path.
# This dictionary can be expanded with additional stain combinations as needed.

In [None]:
def _read_results_with_snr(csv_path):
    """Read one results CSV and add an 'SNR' column derived from its 'case_id' column."""
    results = pd.read_csv(csv_path)
    # The SNR is the leading run of characters accepted by the pattern. The pattern is a
    # character class, i.e. any mix of "(", ")", "A", "E" and digits; because of the "*"
    # it always matches (possibly the empty string), so `.group(0)` never fails.
    snr_pattern = re.compile("^[(AE)(0-9)]*")
    results["SNR"] = [snr_pattern.search(case_id).group(0) for case_id in results["case_id"]]
    return results


def get_stains_data_combo(stain_path, printing=False, he_path="IPH_HE_sb.csv"):
    """
    Function to read and combine HE results with secondary stain results based on common SNR values.

    Both CSV files must contain a 'case_id' column of strings, from which the SNR is
    extracted. Rows are paired with an inner join on 'SNR', so only SNR values present
    in both files are kept; if an SNR occurs several times in a file, every HE row is
    paired with every secondary stain row of that SNR.

    Parameters:
    stain_path (str): Path to the CSV file containing secondary stain results.
    printing (bool): If True, prints the total number of samples and IPH splits
        (the value counts of the 'gt_HE' column, which must then exist in the result).
    he_path (str): Path to the CSV file containing HE results. Defaults to
        "IPH_HE_sb.csv", the file this function has always read.

    Returns:
    pd.DataFrame: Combined DataFrame with HE and secondary stain results. Columns that
        occur in both files (other than 'SNR') carry the suffix "_HE" or "_STAIN".
    """
    df_he_results = _read_results_with_snr(he_path)
    df_sec_stain_results = _read_results_with_snr(stain_path)

    # An inner merge keeps exactly the SNR values common to both DataFrames, so no
    # separate intersection / pre-filtering step is required.
    df_combined = df_he_results.merge(
        df_sec_stain_results, how='inner', on='SNR', suffixes=["_HE", "_STAIN"]
    )

    # Print the total number of samples and IPH splits if printing is True
    if printing:
        print(f"\tTotal amount of samples: {len(df_combined)}")
        print(f"\tIPH splits: {df_combined['gt_HE'].value_counts()}")

    return df_combined


In [None]:
# Evaluate every HE + secondary stain combination with 10-fold cross-validated logistic
# regression on the two model probabilities, then draw the mean ROC and Precision-Recall
# curves of all combinations into two shared figures and save them as PDF.
# Each combination is loaded and cross-validated exactly once; its mean curves are
# cached in `mean_curves` and reused for plotting.

# Update matplotlib parameters for better readability
plt.rcParams.update({'font.size': 18})

# Variables to store the best-performing combination across all combinations
# (by AUC of the mean ROC curve and by mean average precision, respectively)
best_auc = 0
best_auc_combo = ""
best_ap = 0
best_ap_combo = ""

# Define colorblind-friendly palette (CUD palette)
colorblind_palette = ['#CC79A7', '#E69F00', '#56B4E9', '#009E73', '#F0E442', '#0072B2', '#D55E00', '#FE88B1', '#000000']
color_cycle = itertools.cycle(colorblind_palette)

# Assign one color per stain combo; the same color is used in the ROC and the PR figure
color_map_auc = {}
color_map_ap = {}
for stain_combo in stain_combos.keys():
    color = next(color_cycle)
    color_map_auc[stain_combo] = color
    color_map_ap[stain_combo] = color

# Common x-grids onto which the per-fold curves are interpolated before averaging
mean_fpr = np.linspace(0, 1, 100)
mean_recall = np.linspace(0, 1, 100)

# stain combo -> (mean TPR on `mean_fpr`, mean precision on `mean_recall`)
mean_curves = {}

# Evaluation pass: cross-validate every stain combination and cache its mean curves
for stain_combo, folder_path in stain_combos.items():
    # Underlined header for this combination (ANSI escape codes), then reset formatting
    print("\033[4m" + stain_combo)
    print("\033[0m")
    combined_data = get_stains_data_combo(folder_path, printing=True)

    # Binary labels ('no' -> 0, 'yes' -> 1). An explicit mapping avoids the deprecated
    # implicit downcasting of `replace`; unexpected labels make `astype(int)` fail.
    y_true = combined_data["gt_HE"].map({'no': 0, 'yes': 1}).astype(int)
    prob_STAIN = combined_data["prob_STAIN"]
    prob_HE = combined_data["prob_HE"]

    X_ensemble = np.column_stack((prob_STAIN, prob_HE))
    # KFold yields positional indices, so the labels are indexed as a plain array
    y_labels = y_true.to_numpy()

    # k-fold cross-validation (fixed random state -> identical folds on every run)
    # NOTE(review): the folds are not stratified, so a test fold could in principle
    # contain a single class, which makes the ROC undefined - confirm for the data used.
    kf = KFold(n_splits=10, shuffle=True, random_state=42)

    auc_scores = []
    accuracy_scores = []
    tprs = []
    precisions = []
    aps = []
    f1_scores = []

    for fold_number, (train_index, test_index) in enumerate(kf.split(X_ensemble)):
        # Split the data into training and testing sets
        X_train, X_test = X_ensemble[train_index], X_ensemble[test_index]
        y_train, y_test = y_labels[train_index], y_labels[test_index]
        # Report the split sizes once per combination (first fold only)
        if fold_number == 0:
            print(f"\ttrain set size: {len(X_train)}")
            print(f"\ttest set size: {len(X_test)}")

        # Train the logistic regression model
        ensemble_model = LogisticRegression()
        ensemble_model.fit(X_train, y_train)

        # Predict the positive-class probability on the test set, and the hard
        # predictions at the 0.5 probability threshold
        test_probs = ensemble_model.predict_proba(X_test)[:, 1]
        test_preds = (test_probs > 0.5).astype(int)

        # Accuracy of this fold
        accuracy = accuracy_score(y_test, test_preds)
        accuracy_scores.append(accuracy)

        # ROC curve of this fold, interpolated onto the common FPR grid
        fpr, tpr, _ = roc_curve(y_test, test_probs)
        roc_auc = auc(fpr, tpr)
        auc_scores.append(roc_auc)
        interp_tpr = np.interp(mean_fpr, fpr, tpr)
        interp_tpr[0] = 0.0  # force the curve to start at (0, 0)
        tprs.append(interp_tpr)

        # Precision-Recall curve of this fold, interpolated onto the common recall grid;
        # both arrays are reversed because np.interp needs increasing x-coordinates
        precision, recall, _ = precision_recall_curve(y_test, test_probs)
        ap = average_precision_score(y_test, test_probs)
        aps.append(ap)
        interp_precision = np.interp(mean_recall, recall[::-1], precision[::-1])
        precisions.append(interp_precision)

        # F1 score of this fold
        fold_f1 = f1_score(y_test, test_preds)
        f1_scores.append(fold_f1)

    # Mean ROC curve of this stain combo (forced to end at (1, 1))
    mean_tpr = np.mean(tprs, axis=0)
    mean_tpr[-1] = 1.0
    mean_auc = auc(mean_fpr, mean_tpr)
    if mean_auc > best_auc:  # Update the best-performing ROC curve
        best_auc = mean_auc
        best_auc_combo = stain_combo

    # Mean Precision-Recall curve of this stain combo
    mean_precision = np.mean(precisions, axis=0)
    mean_ap = np.mean(aps)
    if mean_ap > best_ap:  # Update the best-performing Precision-Recall curve
        best_ap = mean_ap
        best_ap_combo = stain_combo

    mean_curves[stain_combo] = (mean_tpr, mean_precision)

    # Average results across folds
    print("\tAverage AUC:", np.mean(auc_scores), " --  Stdev: ", np.std(auc_scores))
    print("\tAverage F1 Score:", np.mean(f1_scores), " --  Stdev: ", np.std(f1_scores))
    print("\tAverage Accuracy:", np.mean(accuracy_scores), " --  Stdev: ", np.std(accuracy_scores))

# Plotting pass: draw the cached mean curves of all stain combinations
fig1, ax1 = plt.subplots(figsize=(10, 8))  # ROC curves
fig2, ax2 = plt.subplots(figsize=(10, 8))  # Precision-Recall curves

line_width = 2  # all curves share the same width; no curve is emphasized
for stain_combo, (mean_tpr, mean_precision) in mean_curves.items():
    ax1.plot(mean_fpr, mean_tpr, lw=line_width, color=color_map_auc[stain_combo], label=f'{stain_combo}')
    ax2.plot(mean_recall, mean_precision, lw=line_width, color=color_map_ap[stain_combo], label=f'{stain_combo}')

# Finalizing merged ROC Curve (dashed diagonal = random classifier)
ax1.plot([0, 1], [0, 1], linestyle='--', lw=3, color='gray', alpha=.8)
ax1.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], xlabel='False Positive Rate', ylabel='True Positive Rate')
ax1.xaxis.label.set_size(24)
ax1.yaxis.label.set_size(24)
ax1.tick_params(axis='both', which='major', labelsize=16)
# Hide the right and top spines
ax1.spines[['right', 'top']].set_visible(False)
roc_save_path = './OUT_PLOTS/merged_per_2stain_joint_ROC_Curves_color_blind.pdf'
# savefig does not create missing directories, so make sure the output folder exists
os.makedirs(os.path.dirname(roc_save_path), exist_ok=True)
fig1.savefig(roc_save_path, bbox_inches='tight', dpi=300)

# Finalizing merged Precision-Recall Curve
ax2.set(xlim=[-0.05, 1.05], ylim=[-0.05, 1.05], xlabel='Recall', ylabel='Precision')
ax2.xaxis.label.set_size(24)
ax2.yaxis.label.set_size(24)
ax2.tick_params(axis='both', which='major', labelsize=16)
ax2.legend(loc="center left", bbox_to_anchor=(1, 0.5), fontsize=18)  # Move legend to the right
# Hide the right and top spines
ax2.spines[['right', 'top']].set_visible(False)
pr_save_path = './OUT_PLOTS/merged_per_2stain_joint_PR_Curves_color_blind.pdf'
os.makedirs(os.path.dirname(pr_save_path), exist_ok=True)
fig2.savefig(pr_save_path, bbox_inches='tight', dpi=300)

plt.show()
