In [None]:
# importing libraries

import pyreadr
import pandas as pd
import collections
import numpy as np
import re

import matplotlib.pyplot as plt
from scipy.stats import mannwhitneyu
import seaborn as sns
from matplotlib.patches import Patch
import pyreadr
from scipy.stats import pearsonr

In [None]:
import os

# Make sure the output directory 'OUT' exists.
# exist_ok=True keeps this cell safe to re-run and removes the window between a
# separate "does it exist?" check and the creation call; it also raises if the
# path exists but is not a directory instead of silently continuing.
output_dir = "./OUT/"
os.makedirs(output_dir, exist_ok=True)

In [None]:
# Read the IPH results dataset from CSV.
# Note: replace '<path_to_your_dataset>' with the actual location of the file.
iph_csv_path = "<path_to_your_dataset>/HE_samples_IPH.csv"
df_iph = pd.read_csv(iph_csv_path)

# Show the loaded table
df_iph

In [None]:
# Derive the SNR (StudyNumber) column from the leading part of each 'case_id'.
# The pattern is anchored at the start of the string and matches either the
# literal prefix 'AE' or a run of digits.
# NOTE(review): as written, an id beginning with 'AE' yields just "AE" (the digits
# after it are not captured), and '[0-9]*' can match the empty string, so an id
# with neither prefix yields "". Confirm this is the intended key format.
snr_pattern = re.compile(r"^(AE|[0-9]*)")
df_iph["SNR"] = [snr_pattern.search(case_id).group(0) for case_id in df_iph["case_id"]]

In [None]:
# Read the per-stain measurements from CSV.
# Note: replace '<path_to_your_dataset>' with the actual location of the file.
stains_csv_path = "<path_to_your_dataset>/stains_cellprofiler_output.csv"
df_stain_together = pd.read_csv(stains_csv_path)

# Show the loaded table
df_stain_together

In [None]:
# Normalize every stain against its tissue measurement and store the result in a
# new '<stain>-TISSUE' column, so stains can be compared relative to tissue area.
#
# Two formulas are used:
#   * EVG, FIBRIN, GLYCC, SR : <stain> / <stain>_TISSUE
#   * all other stains       : log2(<stain>_count + 1) / log2(<stain>_TISSUE + 1)
# NOTE(review): the second form is a ratio of log2 values, not the log2 of a
# ratio -- confirm this is the intended normalization.

# Stains to normalize (processing order determines the order of the new columns)
stains = ['CD34', 'CD68', 'CD66b', 'SMA', 'HE', 'EVG', 'FIBRIN', 'GLYCC', 'SR']

# Stains normalized with the plain (non-log) ratio
direct_ratio_stains = ('EVG', 'FIBRIN', 'GLYCC', 'SR')

for stain in stains:
    if stain in direct_ratio_stains:
        numerator = df_stain_together[stain]
        denominator = df_stain_together[f'{stain}_TISSUE']
    else:
        # The +1 keeps log2 defined when the raw value is 0
        numerator = np.log2(df_stain_together[f'{stain}_count'] + 1)
        denominator = np.log2(df_stain_together[f'{stain}_TISSUE'] + 1)
    df_stain_together[f'{stain}-TISSUE'] = numerator / denominator

In [None]:
# Collect the 'SNR' values that occur in both the stain table and the IPH table,
# i.e. the samples for which both kinds of data are available.
# NOTE(review): df_iph['SNR'] holds strings (regex output); this assumes the stain
# table's 'SNR' column is also string-typed -- confirm, otherwise nothing matches.
intersection = set(df_stain_together['SNR']).intersection(df_iph['SNR'])

In [None]:
# Restrict both tables to the samples whose 'SNR' is in the shared set.
shared_snrs = list(intersection)

# Stain data: keep rows for shared samples only
stain_in_both = df_stain_together['SNR'].isin(shared_snrs)
df_stain_together = df_stain_together.loc[stain_in_both]

# IPH results: keep rows for shared samples only
iph_in_both = df_iph['SNR'].isin(shared_snrs)
df_iph = df_iph.loc[iph_in_both]

In [None]:
# Join the IPH results (left) with the stain data (right) on the shared 'SNR' key.
# An inner join keeps only keys present in both tables.
# NOTE(review): if either table has several rows per SNR, the join yields every
# row combination for that SNR -- confirm SNR is unique where that matters.
df_combined = pd.merge(df_iph, df_stain_together, on='SNR', how='inner')


In [None]:
# Columns used in the analysis: the normalized '<stain>-TISSUE' values plus the
# 'area' and 'prob' columns.
selected_columns = [
    "CD34-TISSUE",
    "CD68-TISSUE",
    "CD66b-TISSUE",
    "SMA-TISSUE",
    "EVG-TISSUE",
    "FIBRIN-TISSUE",
    "GLYCC-TISSUE",
    "SR-TISSUE",
    "HE-TISSUE",
    "area",
    "prob",
]

# Analysis table restricted to those columns
df_test = df_combined[selected_columns]

# Number of non-NA/null observations per column
df_counts = df_test.count()

# Show the per-column counts
df_counts

In [None]:
# Display the analysis DataFrame: the normalized '<stain>-TISSUE' columns together
# with the 'area' and 'prob' columns picked via selected_columns.
df_test

In [None]:
# Pairwise Pearson correlation coefficients between all columns of df_test.
# pandas computes each coefficient on the rows where both columns are non-null.
corr_outcome = df_test.corr(method="pearson")

# Show the correlation matrix
corr_outcome

In [None]:
# Matrix of p-values for the pairwise Pearson correlations in df_test.
# DataFrame.corr accepts a callable that receives two columns and returns one
# number; here that number is the p-value (second element of scipy's pearsonr
# result) rather than the coefficient.

def _pearson_pvalue(x, y):
    """Return the p-value of the Pearson correlation between x and y."""
    return pearsonr(x, y)[1]


# With a callable method pandas fills the diagonal with 1, so subtracting the
# identity matrix sets the self-correlation entries to 0 (no p-value is needed there).
n_rows, n_cols = corr_outcome.shape
pval = df_test.corr(method=_pearson_pvalue) - np.eye(n_rows, n_cols)

# Show the matrix of p-values
pval

In [None]:
from scipy.stats import norm

def pearson_ci(r, n, confidence=0.95):
    """
    Calculate the confidence interval for a Pearson correlation coefficient.

    The interval is built with Fisher's z-transformation: z = arctanh(r) is
    treated as approximately normal with standard error 1 / sqrt(n - 3), the
    interval is formed in z-space and mapped back to r-space with tanh.

    Parameters:
    r (float or array-like): Pearson correlation coefficient(s), expected in [-1, 1].
    n (int or array-like): Number of paired observations r was computed from.
    confidence (float): Confidence level for the interval (default is 0.95).

    Returns:
    tuple: Lower and upper bounds of the confidence interval.
        * Both bounds are NaN where the interval is undefined: n <= 3 (the
          standard error does not exist), or r is NaN / outside [-1, 1].
        * For r equal to -1 or 1 (with n > 3) both bounds equal r.
    """
    r = np.asarray(r, dtype=float)
    n = np.asarray(n, dtype=float)

    # Two-sided critical value of the standard normal for the requested level
    z_crit = norm.ppf(1 - (1 - confidence) / 2)

    # The degenerate inputs handled below (|r| == 1, |r| > 1, n <= 3) are mapped to
    # inf/NaN on purpose, so silence the floating-point warnings they would emit.
    with np.errstate(divide="ignore", invalid="ignore"):
        # Fisher's Z-transformation; arctanh(r) == 0.5 * log((1 + r) / (1 - r))
        z = np.arctanh(r)

        # Standard error in Z-space, defined only for n > 3
        se_z = np.where(n > 3, 1.0 / np.sqrt(n - 3), np.nan)

        # Interval in Z-space, back-transformed to r-space.
        # tanh(z) == (exp(2z) - 1) / (exp(2z) + 1), but stays finite for z = +/-inf.
        r_lower = np.tanh(z - z_crit * se_z)
        r_upper = np.tanh(z + z_crit * se_z)

    return r_lower, r_upper

In [None]:
# For each normalized stain, report the 95% confidence interval of its Pearson
# correlation with the 'area' column.
stain_columns = [
    "CD34-TISSUE", "CD68-TISSUE", "CD66b-TISSUE", "SMA-TISSUE", "EVG-TISSUE",
    "FIBRIN-TISSUE", "GLYCC-TISSUE", "SR-TISSUE", "HE-TISSUE",
]

for stain in stain_columns:
    # Pearson correlation coefficient between the stain and 'area'
    r = corr_outcome.loc[stain, "area"]

    # Sample size behind that coefficient. DataFrame.corr drops missing values
    # pairwise, so r is based only on rows where BOTH the stain and 'area' are
    # present; counting the stain's non-null values alone would overstate n (and
    # make the interval too narrow) whenever 'area' has additional gaps.
    n = int(df_test[[stain, "area"]].notna().all(axis=1).sum())

    # 95% confidence interval for the correlation coefficient
    r_lower, r_upper = pearson_ci(r, n, 0.95)

    # Print the confidence interval for the stain
    print(f"[{stain}] 95% Confidence Interval: [{r_lower:.5f}, {r_upper:.5f}]")