In [13]:
import os
import pandas as pd
import numpy as np
from scipy.stats import mannwhitneyu, rankdata

In [None]:
dir_control = "/mnt/c/Users/donna/Downloads/Thesis/correlation_results/scrna/fslr_coad"
dir_coad = "/mnt/c/Users/donna/Downloads/Thesis/correlation_results/scrna/control/features/fslr"

In [61]:
def load_correlation_matrices(directory):
    correlation_matrices = {}
    for filename in os.listdir(directory):
        if filename.endswith(".csv"):
            tissue = os.path.splitext(filename)[0].lower()
            matrix = pd.read_csv(os.path.join(directory, filename), header=0).values
            correlation_matrices[tissue] = matrix
    return correlation_matrices

In [None]:
def compute_ranks(correlation_matrices):
    sample_wise_corrs = {}
    for tissue_name, matrix in correlation_matrices.items():
        n_cell_types, n_samples = matrix.shape
        for sample_idx in range(n_samples):
            for cell_idx in range(n_cell_types):
                value = matrix[cell_idx, sample_idx]
                if not np.isnan(value):
                    sample_wise_corrs.setdefault(sample_idx, []).append((f'{tissue_name}_cell_{cell_idx}', value))

    rank_data = []
    for sample_idx, items in sample_wise_corrs.items():
        labels, values = zip(*items)
        values = np.array(values)
        ranks = rankdata(values, method='ordinal')
        # ranks = len(ranks) + 1 - ranks  # flip: most negative = rank 1
        for label, rank in zip(labels, ranks):
            tissue = label.split('_cell_')[0]
            rank_data.append({'tissue': tissue, 'sample': f'sample_{sample_idx}', 'rank': rank})

    df_ranks = pd.DataFrame(rank_data)
    return df_ranks


In [66]:
control_matrices = load_correlation_matrices(dir_control)
coad_matrices = load_correlation_matrices(dir_coad)

control_ranks = compute_ranks(control_matrices)
coad_ranks = compute_ranks(coad_matrices)

In [67]:
control_ranks['group'] = 'control'
coad_ranks['group'] = 'coad'

In [68]:
df_all_ranks = pd.concat([control_ranks, coad_ranks], ignore_index=True)

In [69]:
results = {}
for tissue in ['small_intestine', 'large_intestine']:
    coad_vals = df_all_ranks[(df_all_ranks['group'] == 'coad') & (df_all_ranks['tissue'] == tissue)]['rank']
    control_vals = df_all_ranks[(df_all_ranks['group'] == 'control') & (df_all_ranks['tissue'] == tissue)]['rank']

    stat, p_value = mannwhitneyu(coad_vals, control_vals, alternative='less')
    results[tissue] = {
        'U_statistic': stat,
        'p_value': p_value,
        'significant': p_value < 0.05
    }

In [70]:
results_df = pd.DataFrame.from_dict(results, orient='index')
print(results_df)

                 U_statistic   p_value  significant
small_intestine    1212131.5  0.643584        False
large_intestine     839743.0  0.887612        False
