# In [4]:
import pyBigWig
import numpy as np
import h5py
import os
from scipy.stats import ks_2samp
from multiprocessing import Pool

# Bin sizes in base pairs. Each large bin is subdivided into
# large_bin_size / small_bin_size small bins (100 with the values below).
large_bin_size = 1_000_000
small_bin_size = 10_000

# Alternative, finer resolution (disabled):
#large_bin_size = 250_000  # 250 kb
#small_bin_size = 2_500   # 2.5 kb, gives 100 smaller bins per large bin

# Autosome lengths in base pairs, keyed by chromosome name without a "chr"
# prefix. Iteration order (1..22) is the order chromosomes are processed in.
# NOTE(review): values presumably correspond to the b37/GRCh37 assembly named
# in the input file suffix - confirm against the BigWig headers.
chromosomes = {
    '1': 249250621,
    '2': 243199373,
    '3': 198022430,
    '4': 191154276,
    '5': 180915260,
    '6': 171115067,
    '7': 159138663,
    '8': 146364022,
    '9': 141213431,
    '10': 135534747,
    '11': 135006516,
    '12': 133851895,
    '13': 115169878,
    '14': 107349540,
    '15': 102531392,
    '16': 90354753,
    '17': 81195210,
    '18': 78077248,
    '19': 59128983,
    '20': 63025520,
    '21': 48129895,
    '22': 51304566,
}

def correlation_matrix(data):
    """Return the correlation-coefficient matrix of the columns of ``data``.

    ``rowvar=False`` makes ``np.corrcoef`` treat each column as a variable
    and each row as an observation.
    """
    return np.corrcoef(data, rowvar=False)

def calculate_integrals(args):
    """Integrate a BigWig signal over every small bin of one large bin.

    Written as a ``Pool.map`` worker: all inputs arrive packed in one tuple
    and the BigWig file is opened (and closed) inside the call, so only the
    file path has to be sent to the worker process.

    Args:
        args: Tuple ``(bigwigfile_path, chrom, chrom_length, large_bin_start)``:
            path of the BigWig file, chromosome name as passed to
            ``values()``, chromosome length in bp, and the 0-based start
            coordinate of the large bin.

    Returns:
        1-D float array with ``large_bin_size // small_bin_size`` entries,
        one trapezoidal integral per small bin. Small bins that start at or
        beyond ``chrom_length``, and bins whose integral is NaN, are 0.
    """
    bigwigfile_path, chrom, chrom_length, large_bin_start = args
    num_small_bins = large_bin_size // small_bin_size
    integrals = np.zeros(num_small_bins)

    # np.trapz was renamed np.trapezoid in NumPy 2.0; use whichever exists.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    bw = pyBigWig.open(bigwigfile_path)  # Open the BigWig file here
    try:
        for j in range(num_small_bins):
            small_bin_start = large_bin_start + j * small_bin_size
            # values() takes a 0-based half-open interval [start, end), so
            # the end is exclusive and is clipped to the chromosome length.
            small_bin_end = min(small_bin_start + small_bin_size, chrom_length)

            if small_bin_start >= small_bin_end:
                # Starts only increase, so every later small bin is also
                # past the chromosome end and keeps its initial 0.
                break

            small_bin_data = bw.values(chrom, small_bin_start, small_bin_end)
            integral = integrate(small_bin_data) if small_bin_data is not None else 0
            # NOTE(review): pyBigWig reports bases without data as NaN, so a
            # single uncovered base turns the whole bin into NaN and hence 0.
            # Confirm this is intended rather than treating such bases as 0.
            integrals[j] = integral if not np.isnan(integral) else 0
    finally:
        # Release the file handle even when values() raises.
        bw.close()

    return integrals

def _ks_p_value_matrix(samples):
    """Return the symmetric matrix of pairwise two-sample KS p-values.

    Entry ``[i, j]`` is the ``ks_2samp`` p-value between rows ``i`` and ``j``
    of ``samples``. Only the off-diagonal entries are computed; the diagonal
    is left at 0.
    """
    num_rows = samples.shape[0]
    p_values = np.zeros((num_rows, num_rows))

    for i in range(num_rows):
        for j in range(i + 1, num_rows):
            _, ks_p_value = ks_2samp(samples[i], samples[j])
            p_values[i, j] = p_values[j, i] = ks_p_value

    return p_values


def process_chrom(bigwigfile_path, chrom, file_prefix):
    """Compute and store the pairwise KS p-value matrix for one chromosome.

    The chromosome is tiled with large bins; ``calculate_integrals`` is run
    on each large bin in a pool of 4 worker processes. The results form a
    matrix with one row per small-bin offset and one column per large bin,
    and every pair of rows is compared with a two-sample KS test. The
    resulting p-value matrix is written to the HDF5 dataset ``'matrix'``.

    If the output file already exists, the chromosome is skipped.

    NOTE(review): the file name says "adjusted" and the directory name
    mentions "correlation", but the matrix written here is the raw KS
    p-value matrix - confirm whether an adjustment or
    ``correlation_matrix(...)`` output was meant to be saved as well.

    Args:
        bigwigfile_path: Path of the BigWig file to read.
        chrom: Chromosome name; must be a key of ``chromosomes``.
        file_prefix: Prefix used to build the output file name.

    Returns:
        None.
    """
    output_dir = f'/home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/methylation_1Mb_p_value_ks_correlation_dir/chr{chrom}/'
    p_value_filename = os.path.join(output_dir, f'{file_prefix}_{chrom}_ks_p_value_adjusted.h5')

    # Check if the file already exists
    if os.path.exists(p_value_filename):
        print(f"File {p_value_filename} already exists. Skipping.")
        return

    chrom_length = chromosomes[chrom]
    args_list = [
        (bigwigfile_path, chrom, chrom_length, large_bin_start)
        for large_bin_start in range(0, chrom_length, large_bin_size)
    ]

    with Pool(processes=4) as pool:
        integrals_list = pool.map(calculate_integrals, args_list)

    # pool.map preserves input order; transpose so that rows are small-bin
    # offsets and columns are large bins.
    integrals = np.array(integrals_list).T

    p_value_ks_matrix = _ks_p_value_matrix(integrals)

    # Save the matrix. Write to a temporary name and rename afterwards so an
    # interrupted run never leaves a partial file that the existence check
    # above would mistake for a finished result.
    os.makedirs(output_dir, exist_ok=True)
    tmp_filename = p_value_filename + '.tmp'
    with h5py.File(tmp_filename, 'w') as hf:
        hf.create_dataset('matrix', data=p_value_ks_matrix)
    os.replace(tmp_filename, p_value_filename)

def process_directory(base_path):
    """Run ``process_chrom`` on every methylation BigWig found in ``base_path``.

    Files are selected by the ``.methy_count.b37.bw`` suffix; the file name
    with that suffix removed becomes the output prefix. Each selected file
    is processed once per entry of ``chromosomes``.
    """
    suffix = '.methy_count.b37.bw'

    for entry in os.listdir(base_path):
        if not entry.endswith(suffix):
            continue

        bigwig_path = os.path.join(base_path, entry)
        file_prefix = entry[:len(entry) - len(suffix)]
        for chrom in chromosomes:
            # Hand over the path, not an open handle: the workers open the
            # file themselves.
            process_chrom(bigwig_path, chrom, file_prefix)
            
# Script entry point: runs only when the file is executed directly, not when
# it is imported as a module.
if __name__ == "__main__":
    # Directory scanned for '*.methy_count.b37.bw' input files.
    base_path = '/home/dwk681/workspace/cluster_cells_from_GSE189158_NOMe_HiC/filesFromCluster/bam/methylation/'
    process_directory(base_path)
