In [None]:
#importing libraries

import pyreadr
import pandas as pd
import collections
import numpy as np
import re

import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
import seaborn as sns
from matplotlib.patches import Patch
import pyreadr
from scipy.stats import pearsonr

In [None]:
import os

# Directory that receives every file this notebook writes.
output_dir = "./OUT/"

# exist_ok=True makes this a no-op when the directory is already present, so the
# cell can be re-run safely and there is no gap between "check" and "create".
os.makedirs(output_dir, exist_ok=True)

In [None]:
import os

# CSV file holding the IPH results
input_file = "./HE_samples_IPH.csv"

# Stop right away, with an explicit message, when the input is missing
if not os.path.exists(input_file):
    raise FileNotFoundError(f"The file {input_file} does not exist.")

# Load the IPH results into a DataFrame
df_iph = pd.read_csv(input_file)

# Show the loaded table
df_iph

In [None]:
# Derive the SNR (StudyNumber) of every row from its 'case_id':
# the regular expression keeps the leading run of characters drawn from
# 'A', 'E' and the digits 0-9 (an empty string when there is no such prefix).
df_iph["SNR"] = list(
    map(lambda case_id: re.search(r"^[AE0-9]*", case_id).group(0), df_iph["case_id"])
)

In [None]:
# Load the <stain> CellProfiler data, preferring the preprocessed CSV in ./OUT/
if not os.path.exists("./OUT/<STAIN>_CellProfiler.csv"):
    # No preprocessed file yet: fall back to the raw, tab-separated
    # ExpressScan SlideToolkit output file.
    # Note: Replace 'your_file_path.gct' with the actual file path
    df_stain = pd.read_csv("your_file_path.gct", sep="\t")
else:
    # Preprocessed file is available: read it directly
    df_stain = pd.read_csv("./OUT/<STAIN>_CellProfiler.csv")

# Show the loaded table
df_stain

In [None]:
import os

# Build (or reload) the per-SNR CellProfiler summary.
#
# Cache miss: the rows of `df_stain` are summed per SNR, the two ratio columns
# are added, and the result is written to `output_file`.
# Cache hit:  `output_file` is read back instead.
# Either way `df_stain_together` ends up with one row per SNR and the columns
# Stain, SNR, <STAIN>_area, <STAIN>_count, TISSUE, <STAIN>_area/TISSUE and
# <STAIN>_count/TISSUE.
output_file = "./OUT/<STAIN>_CellProfiler.csv"
if not os.path.exists(output_file):
    # Accumulate the totals in a dict keyed by SNR and build the DataFrame once
    # at the end. This replaces growing a DataFrame with pd.concat inside the
    # loop (quadratic) and keeps the <STAIN> placeholder out of variable names,
    # so the cell stays valid Python before the placeholder is substituted.
    # Dicts preserve insertion order, so rows keep first-appearance order.
    totals_by_snr = {}
    stain_rows = zip(
        df_stain["NAME"],
        df_stain["Metadata_STAIN"],
        df_stain["AreaOccupied_AreaOccupied_<STAIN>"],
        df_stain["Count_<STAIN>"],
        df_stain["AreaOccupied_AreaOccupied_Tissue"],
    )
    for name, stain, area, count, tissue in stain_rows:
        # Same SNR pattern as used for 'case_id' in the IPH table
        snr = re.search(r"^[AE0-9]*", name).group(0)
        totals = totals_by_snr.get(snr)
        if totals is None:
            # First row for this SNR: its stain label is the one that is kept
            totals_by_snr[snr] = {"Stain": stain,
                                  "SNR": snr,
                                  "<STAIN>_area": area,
                                  "<STAIN>_count": count,
                                  "TISSUE": tissue}
        else:
            # Further rows for an SNR already seen: add to the running totals
            totals["<STAIN>_area"] += area
            totals["<STAIN>_count"] += count
            totals["TISSUE"] += tissue

    # Passing `columns` fixes the column order and keeps the columns present
    # even when `df_stain` has no rows.
    df_stain_together = pd.DataFrame(
        list(totals_by_snr.values()),
        columns=["Stain", "SNR", "<STAIN>_area", "<STAIN>_count", "TISSUE"],
    )

    # Ratios of the summed <STAIN>_area and <STAIN>_count to the summed TISSUE
    df_stain_together["<STAIN>_area/TISSUE"] = df_stain_together["<STAIN>_area"] / df_stain_together["TISSUE"]
    df_stain_together["<STAIN>_count/TISSUE"] = df_stain_together["<STAIN>_count"] / df_stain_together["TISSUE"]

    # Cache the summary for later runs
    df_stain_together.to_csv(output_file, index=False)
else:
    # Read SNR back as text: without the explicit dtype, all-digit study numbers
    # are parsed as integers (dropping leading zeros) and no longer match the
    # string SNRs extracted from 'case_id' when the tables are intersected/merged.
    df_stain_together = pd.read_csv(output_file, dtype={"SNR": str})

# Display the summary. This has to be the last top-level expression of the cell:
# an expression nested inside the if/else above is not rendered by Jupyter.
df_stain_together


In [None]:
# Collect the SNR values that occur in both the stain summary and the IPH table;
# only these can be matched up between the two datasets
intersection = set(df_stain_together['SNR']).intersection(df_iph['SNR'])

In [None]:
# Keep only the stain-summary rows whose 'SNR' is part of the intersection set
df_stain_together = df_stain_together.loc[
    lambda frame: frame['SNR'].isin(list(intersection))
]

# Keep only the IPH rows whose 'SNR' is part of the intersection set
df_iph = df_iph.loc[
    lambda frame: frame['SNR'].isin(list(intersection))
]

In [None]:
# Inner-join the IPH table (left) with the stain summary (right) on 'SNR';
# only SNR values present in both tables end up in the combined DataFrame
df_combined = pd.merge(df_iph, df_stain_together, how='inner', on='SNR')


In [None]:
# Overwrite '<STAIN>_count/TISSUE' with log2(<STAIN>_count + 1) / log2(TISSUE + 1).
# Rows where TISSUE is zero (which would make the denominator log2(1) = 0)
# are set to NaN instead of keeping the result of the division.
df_combined["<STAIN>_count/TISSUE"] = (
    np.log2(df_combined['<STAIN>_count'] + 1)
    .div(np.log2(df_combined['TISSUE'] + 1))
    .where(df_combined["TISSUE"] != 0)  # NaN wherever the condition is False
)


In [None]:
# Display the merged IPH / stain DataFrame
df_combined

In [None]:
# Ratio columns of the combined DataFrame that the analysis iterates over
columns = [
    "<STAIN>_area/TISSUE",
    "<STAIN>_count/TISSUE",
]

In [None]:
# For every analysis column: split the values by the 'IPH' flag, draw a two-box
# plot (No / Yes), run a two-sided Mann-Whitney U test, print the U statistic,
# p-value, effect size r = Z / sqrt(N) and per-group mean / standard deviation,
# and annotate the plot with a significance bar when p < 0.05.
for column in columns:
    # Undefined ratios (NaN, or +/-inf from a zero TISSUE denominator) cannot be
    # ranked, averaged or drawn, and would turn every statistic below into NaN,
    # so they are excluded before the groups are formed.
    valid = df_combined[column].replace([np.inf, -np.inf], np.nan).notna()

    # Separate the remaining data into two groups based on the 'IPH' column
    mean_yes_coord = df_combined.loc[valid & (df_combined['IPH'] == True), column].tolist()
    mean_no_coord = df_combined.loc[valid & (df_combined['IPH'] == False), column].tolist()

    # One fresh figure per column; 'No' is drawn at x = 0 and 'Yes' at x = 1
    positions = np.linspace(0, 1, 2)
    fig, ax = plt.subplots()
    box = ax.boxplot([mean_no_coord, mean_yes_coord], positions=positions, widths=0.5)

    # Set plot labels and title
    ax.set_ylabel(column)
    ax.set_xlabel('IPH Yes/No')
    ax.set_title(f'{column} - Grouped by IPH prediction')
    ax.set_xticks([0.0, 1.0])
    ax.set_xticklabels(['No', 'Yes'])

    # Get the y-axis limits (used to place the significance bar above the data)
    bottom, top = ax.get_ylim()
    y_range = top - bottom

    n1 = len(mean_no_coord)
    n2 = len(mean_yes_coord)
    if n1 == 0 or n2 == 0:
        # The test and the Z-score (std_U would be 0) are undefined without
        # observations in both groups; show the plot and move on.
        print(f"Skipping statistics for {column}: an IPH group has no valid values.")
        plt.show()
        continue

    # Calculate significance between the two groups using Mann-Whitney U test
    stat, p_value = mannwhitneyu(mean_no_coord, mean_yes_coord, alternative='two-sided')

    # Z-score of U under the normal approximation (no tie correction applied)
    mean_U = n1 * n2 / 2
    std_U = np.sqrt(n1 * n2 * (n1 + n2 + 1) / 12)
    z = (stat - mean_U) / std_U

    # Calculate effect size r
    N = n1 + n2
    r = z / np.sqrt(N)

    # Print statistical results
    print(f"U-statistic: {stat}")
    print(f"P-value: {p_value}")
    print(f"Effect size r: {r}")
    print(f'Mean IPH yes: {np.mean(mean_yes_coord)} - Stdev: {np.std(mean_yes_coord)}')
    print(f'Mean IPH no: {np.mean(mean_no_coord)} - Stdev: {np.std(mean_no_coord)}')

    # Plot significance bars if p-value is below threshold
    if p_value < 0.05:  # Adjust significance level as needed
        # The bar spans the centres of the two boxes
        x1, x2 = float(positions[0]), float(positions[1])
        bar_height = top + (y_range * 0.07)
        bar_tips = bar_height - (y_range * 0.02)
        ax.plot([x1, x1, x2, x2], [bar_tips, bar_height, bar_height, bar_tips], lw=1, c='k')

        # Determine significance symbol (p < 0.05 is guaranteed in this branch)
        if p_value < 0.001:
            sig_symbol = '<0.001'
        elif p_value < 0.01:
            sig_symbol = '<0.01'
        else:
            sig_symbol = '<0.05'

        # Add significance text to the plot
        text_height = bar_height + (y_range * 0.01)
        txt = ax.text((x1 + x2) * 0.5, text_height, sig_symbol, ha='center', va='bottom', c='k')
        txt.set_fontsize(7)

    # Show the plot
    plt.show()
