In [1]:
import os
import numpy as np
import pandas as pd
from scipy.stats import rankdata, spearmanr
from scipy.stats import wilcoxon
import pprint
from collections import defaultdict

In [2]:
# Load the reference rank table (only the four columns used here) and
# lowercase the tissue / cell-type labels in the same expression.
paper_df = (
    pd.read_csv(
        '/mnt/c/Users/donna/Downloads/Thesis/rankjes/paper_ranks/03x_matched_reference_ranks.csv',
        usecols=['tissue', 'rank', "cell_type", "GC_code"],
    )
    .assign(
        tissue=lambda frame: frame['tissue'].str.lower(),
        cell_type=lambda frame: frame['cell_type'].str.lower(),
    )
)

In [16]:
# Directory whose *.csv files are read as correlation matrices (one file per
# tissue name) by the cell that lists `corr_matrix_dir`.
corr_matrix_dir = '/mnt/c/Users/donna/Downloads/Thesis/correlation_results/scrna/cov_spread_brca'

In [4]:
# Directory for rank tables; created up front if it does not exist yet.
ranks_dir = "/mnt/c/Users/donna/Downloads/Thesis/rankjes"
os.makedirs(ranks_dir, exist_ok=True)

# Empty result accumulator.
results = list()

In [5]:
# Median reference rank per tissue, returned as a two-column frame
# (`tissue`, `rank`).
paper_medians = paper_df.groupby("tissue", as_index=False)["rank"].median()

In [6]:
# Tissue names made of two underscore-joined words.
two_word_tissues = set((
    'small_intestine',
    'large_intestine',
    'salivary_gland',
    'bone_marrow',
    'lymph_node',
))

In [17]:
# This cell ranks a single directory of matrices; it does NOT loop over
# fragment features.
#
# Load all correlation matrices (one CSV per tissue; the lower-cased file stem
# is the tissue name). os.listdir returns entries in arbitrary order, so the
# listing is sorted: the ordinal ranking below breaks ties by order of
# appearance, which would otherwise depend on the filesystem.
correlation_matrices = {}
for file in sorted(os.listdir(corr_matrix_dir)):
    if file.endswith('.csv'):
        tissue_name = os.path.splitext(file)[0].lower()
        matrix = pd.read_csv(os.path.join(corr_matrix_dir, file), index_col=0)
        correlation_matrices[tissue_name] = matrix

# Gather, per sample, every (cell type, tissue, correlation) triple across all
# tissues. Rows are samples and columns are cell types in these matrices.
# Iterating rows/items visits the cells in the same row-major order as nested
# label loops would, without a scalar .loc lookup per cell.
sample_wise_corrs = {}
for tissue_name, matrix in correlation_matrices.items():
    for sample_name, row in matrix.iterrows():  # sample = row
        for cell_type, value in row.items():  # cell type = column
            if not np.isnan(value):
                sample_wise_corrs.setdefault(sample_name, []).append(
                    (cell_type, tissue_name, value)  # keep tissue separate
                )

# Rank all cell types across tissues within each sample
rank_data = []
for sample_name, items in sample_wise_corrs.items():
    cell_types, tissues, values = zip(*items)  # unpack all three fields
    values = np.array(values)
    # 'ordinal' gives every value a distinct rank; ties are broken by position.
    ranks = rankdata(values, method='ordinal')
    ranks = len(ranks) + 1 - ranks  # flip: highest correlation = rank 1

    for cell_type, tissue, rank in zip(cell_types, tissues, ranks):
        rank_data.append({
            'sample': sample_name,
            'cell_type': f"{cell_type}_{tissue}",
            'tissue': tissue,
            'rank': rank,
        })

# Save to CSV (create the target directory here so the cell does not rely on
# an earlier cell having done it).
df_ranks = pd.DataFrame(rank_data)
output_path = "/mnt/c/Users/donna/Downloads/Thesis/rankjes/brca_cov_spread_ranks.csv"
os.makedirs(os.path.dirname(output_path), exist_ok=True)
df_ranks.to_csv(output_path, index=False)


In [10]:
# Root directory scanned by the per-feature ranking cell: every sub-directory
# of `base_dir` is treated as one fragment feature.
base_dir = "/mnt/c/Users/donna/Downloads/Thesis/correlation_results/scrna/control/abs_cors"

In [14]:
# Loop over fragment features: every sub-directory of base_dir is one feature
# and holds one correlation-matrix CSV per tissue. For each feature, rank all
# (cell type, tissue) pairs within each sample and write one CSV of ranks.
results = []

# Create the output directory up front; to_csv does not create missing folders.
output_dir = "/mnt/c/Users/donna/Downloads/Thesis/rankjes/abscor_ranks_per_feature"
os.makedirs(output_dir, exist_ok=True)

# Listings are sorted because os.listdir order is arbitrary and the ordinal
# ranking below breaks ties by order of appearance.
for feature in sorted(os.listdir(base_dir)):
    feature_path = os.path.join(base_dir, feature)
    if not os.path.isdir(feature_path):
        continue

    # Load all correlation matrices for this feature
    # (lower-cased file stem = tissue name).
    correlation_matrices = {}
    for file in sorted(os.listdir(feature_path)):
        if file.endswith('.csv'):
            tissue_name = os.path.splitext(file)[0].lower()
            matrix = pd.read_csv(os.path.join(feature_path, file), index_col=0)
            correlation_matrices[tissue_name] = matrix

    # Gather, per sample, every (cell type, tissue, correlation) triple.
    # Here columns are samples and rows are cell types. The tissue is carried
    # as its own field instead of being glued onto the label and parsed back
    # out later, so tissue names with any number of underscores stay intact.
    sample_wise_corrs = {}
    for tissue_name, matrix in correlation_matrices.items():
        for sample_name, column in matrix.items():  # sample = column
            for cell_type, value in column.items():  # cell type = row
                if not np.isnan(value):
                    sample_wise_corrs.setdefault(sample_name, []).append(
                        (cell_type, tissue_name, value)
                    )

    rank_data = []
    for sample_name, items in sample_wise_corrs.items():
        # Every sample contains all cell types with corresponding correlation values
        cell_types, tissues, values = zip(*items)
        values = np.array(values)

        # Calculates ranks per sample
        ranks = rankdata(values, method='ordinal')
        ranks = len(ranks) + 1 - ranks  # flip: highest correlation = rank 1

        for cell_type, tissue, rank in zip(cell_types, tissues, ranks):
            rank_data.append({
                'cell_type': f"{cell_type}_{tissue}",
                'rank': rank,
                'sample': sample_name,
                'tissue': tissue,
            })

    # A feature folder without any usable value would produce a column-less
    # CSV that cannot be read back; skip it and say so.
    if not rank_data:
        print(f"Skipping feature '{feature}': no correlation values found")
        continue

    # Convert to DataFrame
    df_ranks = pd.DataFrame(rank_data)

    # Save DataFrame to CSV
    output_path = os.path.join(output_dir, f"{feature}_rank_results.csv")
    df_ranks.to_csv(output_path, index=False)
   

In [9]:
# Spearman correlation between this notebook's median ranks (one CSV per
# fragment feature in my_path) and the reference median ranks (one CSV per
# coverage depth in paper_path). Writes one row per feature with a
# corr_<depth> / pval_<depth> column pair per depth.
my_path = '/mnt/c/Users/donna/Downloads/Thesis/rankjes/abscor_ranks_per_feature'
paper_path = '/mnt/c/Users/donna/Downloads/Thesis/rankjes/paper_ranks'

# Extract coverage depths: the depth label is the file-name part before the
# first underscore. Sorted so the column order of the summary is deterministic.
paper_depth_files = sorted(f for f in os.listdir(paper_path) if f.endswith('.csv'))
paper_depths = [f.split('_')[0] for f in paper_depth_files]


def _lowercase_last_part(label):
    """Lower-case only the text after the last underscore of `label`.

    Labels without an underscore are lower-cased entirely.
    NOTE(review): the Wilcoxon and Mann-Whitney cells lower-case the last TWO
    underscore-separated parts instead - confirm which normalisation matches
    the labels in my_path.
    """
    if '_' not in label:
        return label.lower()
    head, tail = label.rsplit('_', 1)
    return f"{head}_{tail.lower()}"


# The reference medians do not depend on the feature, so compute them once per
# depth here instead of re-reading every paper CSV for every feature.
paper_medians_by_depth = []
for paper_file, depth in zip(paper_depth_files, paper_depths):
    paper_df = pd.read_csv(os.path.join(paper_path, paper_file))
    paper_df['tissue'] = paper_df['tissue'].str.lower()
    paper_df['cell_type'] = paper_df['cell_type'].apply(_lowercase_last_part)

    # Median rank per cell_type for this coverage depth
    paper_median = (
        paper_df.groupby('cell_type')['rank']
        .median()
        .reset_index()
        .rename(columns={'rank': f'median_rank_{depth}'})
    )
    paper_medians_by_depth.append((depth, paper_median))

summary_results = []

# Loop over the fragment feature CSVs (sorted for a deterministic row order)
for my_file in sorted(os.listdir(my_path)):
    if not my_file.endswith('.csv'):
        continue
    # NOTE(review): the ranking cell names its files '<feature>_rank_results.csv',
    # so this label keeps a trailing '_rank'. Left unchanged because the label
    # is written to the summary CSV.
    feature = my_file.split('_results')[0]
    my_df = pd.read_csv(os.path.join(my_path, my_file))

    # Median rank per cell_type for the current feature
    my_median = my_df.groupby('cell_type')['rank'].median().reset_index().rename(columns={'rank': 'median_rank_my'})

    feature_result = {'feature': feature}
    # Loop over the different coverage depths
    for depth, paper_median in paper_medians_by_depth:
        # Inner merge: only cell types present in both tables are compared
        merged_df = pd.merge(my_median, paper_median, on='cell_type')

        my_ranks = merged_df['median_rank_my']
        paper_ranks = merged_df[f'median_rank_{depth}']

        # Correlation and p-value
        spearman_corr, spearman_p = spearmanr(my_ranks, paper_ranks)

        # Store the result per coverage depth
        feature_result[f'corr_{depth}'] = spearman_corr
        feature_result[f'pval_{depth}'] = spearman_p

    summary_results.append(feature_result)


summary_df = pd.DataFrame(summary_results)
summary_df.to_csv('/mnt/c/Users/donna/Downloads/Thesis/rankjes/spearmann_rank_corr.csv', index=False)



In [47]:
## Paired Rank Sum Test
# Wilcoxon signed-rank test between this notebook's median ranks (one CSV per
# fragment feature) and the reference median ranks (one CSV per coverage
# depth), paired by cell_type.
from scipy.stats import wilcoxon
import os
import pandas as pd

my_path = '/mnt/c/Users/donna/Downloads/Thesis/rankjes/abscor_ranks_per_feature'
paper_path = '/mnt/c/Users/donna/Downloads/Thesis/rankjes/paper_ranks'

# Extract coverage depths: the depth label is the file-name part before the
# first underscore. Sorted so the column order of the summary is deterministic.
paper_depth_files = sorted(f for f in os.listdir(paper_path) if f.endswith('.csv'))
paper_depths = [f.split('_')[0] for f in paper_depth_files]


def _lowercase_last_two_parts(label):
    """Lower-case the last two underscore-separated parts of `label`.

    Labels with fewer than two underscores are lower-cased entirely.
    """
    if label.count('_') < 2:
        return label.lower()
    head, second_last, last = label.rsplit('_', 2)
    return '_'.join([head, second_last.lower(), last.lower()])


# The reference medians do not depend on the feature, so compute them once per
# depth here instead of re-reading every paper CSV for every feature.
paper_medians_by_depth = []
for paper_file, depth in zip(paper_depth_files, paper_depths):
    paper_df = pd.read_csv(os.path.join(paper_path, paper_file))
    paper_df['tissue'] = paper_df['tissue'].str.lower()
    paper_df['cell_type'] = paper_df['cell_type'].apply(_lowercase_last_two_parts)

    # Median rank per cell_type for this coverage depth
    paper_median = (
        paper_df.groupby('cell_type')['rank']
        .median()
        .reset_index()
        .rename(columns={'rank': f'median_rank_{depth}'})
    )
    paper_medians_by_depth.append((depth, paper_median))

summary_results = []

# Loop over the fragment feature CSVs (sorted for a deterministic row order)
for my_file in sorted(os.listdir(my_path)):
    if not my_file.endswith('.csv'):
        continue
    feature = my_file.split('_results')[0]
    my_df = pd.read_csv(os.path.join(my_path, my_file))

    # Median rank per cell_type for the current feature
    my_median = my_df.groupby('cell_type')['rank'].median().reset_index().rename(columns={'rank': 'median_rank_my'})

    feature_result = {'feature': feature}

    # Loop over the different coverage depths
    for depth, paper_median in paper_medians_by_depth:
        # Inner merge: only cell types present in both tables form a pair
        merged_df = pd.merge(my_median, paper_median, on='cell_type')

        my_ranks = merged_df['median_rank_my']
        paper_ranks = merged_df[f'median_rank_{depth}']

        # Paired rank sum test (Wilcoxon signed-rank). Only run it with at
        # least 5 matched pairs. This checks the number of pairs, not the
        # number of non-zero differences; a ValueError raised by wilcoxon is
        # recorded as a missing result below.
        # The 'corr_<depth>' key holds the test statistic (name kept so the
        # output columns stay the same).
        if len(my_ranks) >= 5:
            try:
                w_stat, w_p = wilcoxon(my_ranks, paper_ranks)
                feature_result[f'corr_{depth}'] = w_stat
                feature_result[f'pval_{depth}'] = w_p
            except ValueError:
                feature_result[f'corr_{depth}'] = None
                feature_result[f'pval_{depth}'] = None
        else:
            feature_result[f'corr_{depth}'] = None
            feature_result[f'pval_{depth}'] = None

    summary_results.append(feature_result)

summary_df = pd.DataFrame(summary_results)
summary_df.to_csv('/mnt/c/Users/donna/Downloads/Thesis/rankjes/abscor_rank_wilcoxon.csv', index=False)


In [45]:
# Mann-Whitney U test between this notebook's median ranks (one CSV per
# fragment feature) and the reference median ranks (one CSV per coverage
# depth), restricted to the cell types present in both tables.
from scipy.stats import mannwhitneyu
import os
import pandas as pd

my_path = '/mnt/c/Users/donna/Downloads/Thesis/rankjes/abscor_ranks_per_feature'
paper_path = '/mnt/c/Users/donna/Downloads/Thesis/rankjes/paper_ranks'

# Extract coverage depths: the depth label is the file-name part before the
# first underscore. Sorted so the column order of the summary is deterministic.
paper_depth_files = sorted(f for f in os.listdir(paper_path) if f.endswith('.csv'))
paper_depths = [f.split('_')[0] for f in paper_depth_files]


def _lowercase_last_two_parts(label):
    """Lower-case the last two underscore-separated parts of `label`.

    Labels with fewer than two underscores are lower-cased entirely.
    """
    if label.count('_') < 2:
        return label.lower()
    head, second_last, last = label.rsplit('_', 2)
    return '_'.join([head, second_last.lower(), last.lower()])


# The reference medians do not depend on the feature, so compute them (and the
# set of reference cell types) once per depth instead of re-reading every
# paper CSV for every feature.
paper_medians_by_depth = []
for paper_file, depth in zip(paper_depth_files, paper_depths):
    paper_df = pd.read_csv(os.path.join(paper_path, paper_file))
    paper_df['tissue'] = paper_df['tissue'].str.lower()
    paper_df['cell_type'] = paper_df['cell_type'].apply(_lowercase_last_two_parts)

    # Median rank per cell_type for this coverage depth
    paper_median = (
        paper_df.groupby('cell_type')['rank']
        .median()
        .reset_index()
        .rename(columns={'rank': f'median_rank_{depth}'})
    )
    paper_medians_by_depth.append((depth, paper_median, set(paper_median['cell_type'])))

summary_results = []

# Loop over the fragment feature CSVs (sorted for a deterministic row order)
for my_file in sorted(os.listdir(my_path)):
    if not my_file.endswith('.csv'):
        continue
    feature = my_file.split('_results')[0]
    my_df = pd.read_csv(os.path.join(my_path, my_file))

    # Median rank per cell_type for the current feature
    my_median = my_df.groupby('cell_type')['rank'].median().reset_index().rename(columns={'rank': 'median_rank_my'})
    my_celltypes = set(my_median['cell_type'])

    feature_result = {'feature': feature}

    # Loop over the different coverage depths
    for depth, paper_median, paper_celltypes in paper_medians_by_depth:
        # Determine and report cell types that will be dropped by the merge
        shared_celltypes = my_celltypes & paper_celltypes
        only_in_my = my_celltypes - paper_celltypes
        only_in_paper = paper_celltypes - my_celltypes

        if only_in_my or only_in_paper:
            print(f"\nFeature: {feature}, Depth: {depth}")
            if only_in_my:
                print(f"  Cell types only in your data ({len(only_in_my)}): {sorted(only_in_my)}")
            if only_in_paper:
                print(f"  Cell types only in paper data ({len(only_in_paper)}): {sorted(only_in_paper)}")

        # Merge on shared cell types only (inner join)
        merged_df = pd.merge(my_median, paper_median, on='cell_type')

        my_ranks = merged_df['median_rank_my']
        paper_ranks = merged_df[f'median_rank_{depth}']

        # Mann-Whitney U test (non-paired); needs at least 3 values per group.
        # The 'corr_<depth>' key holds the U statistic (name kept so the output
        # columns stay the same).
        if len(my_ranks) >= 3 and len(paper_ranks) >= 3:
            try:
                u_stat, u_p = mannwhitneyu(my_ranks, paper_ranks, alternative='two-sided')
                feature_result[f'corr_{depth}'] = u_stat
                feature_result[f'pval_{depth}'] = u_p
            except ValueError:
                feature_result[f'corr_{depth}'] = None
                feature_result[f'pval_{depth}'] = None
        else:
            feature_result[f'corr_{depth}'] = None
            feature_result[f'pval_{depth}'] = None

    summary_results.append(feature_result)

summary_df = pd.DataFrame(summary_results)
summary_df.to_csv('/mnt/c/Users/donna/Downloads/Thesis/rankjes/abscor_rank_mannwhitney.csv', index=False)

In [46]:
# Row count of `merged_df` as left behind by whichever comparison cell ran
# last. Those loops rebind `merged_df` on every feature/depth iteration, so
# this reflects only the final merge, not all of them.
print(len(merged_df))

323
