In [5]:
import os
import glob
import h5py
import numpy as np
from scipy.sparse import coo_matrix, csr_matrix, triu, tril
import multiprocessing
from matplotlib import pyplot as plt
from concurrent.futures import ProcessPoolExecutor, as_completed

def load_hic_data(filepath):
    """Loads Hi-C data from a text file."""
    data = []
    with open(filepath, 'r') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) == 3:
                try:
                    i, j, reads = int(parts[0]), int(parts[1]), int(parts[2])
                    data.append((i, j, reads))
                except ValueError:
                    # Log error or pass if data format is incorrect
                    pass
    return data

def create_matrix(data):
    """Creates a symmetric matrix from Hi-C data using sparse format."""
    if not data:
        return None
    max_index = max(max(i, j) for i, j, _ in data)
    rows, cols, vals = zip(*data)
    matrix = coo_matrix((vals, (rows, cols)), shape=(max_index + 1, max_index + 1), dtype=int).tocsr()
    matrix += matrix.T - coo_matrix((matrix.diagonal(), (range(max_index + 1), range(max_index + 1))), shape=(max_index + 1, max_index + 1))
    return matrix

def emphasize_interactions(matrix, max_distance):
    """Highlights interactions by adding offsets of contacts."""
    emphasized_matrix = matrix.copy()
    for offset in range(1, max_distance + 1):
        emphasized_matrix += triu(matrix, offset) + tril(matrix, -offset)
    return emphasized_matrix

def csr_pearson_correlation(csr_mat):
    """Calculates Pearson correlation from a CSR matrix."""
    csc_mat = csr_mat.tocsc()
    mean = np.array(csc_mat.mean(axis=1)).flatten()
    std_dev = np.sqrt(csc_mat.power(2).mean(axis=1).A1 - mean**2)
    valid_std_dev = std_dev != 0
    rows, cols = csr_mat.nonzero()
    standardized_data = np.divide(csr_mat.data - mean[rows], std_dev[rows], where=valid_std_dev[rows])
    standardized_csr = csr_matrix((standardized_data, (rows, cols)), shape=csr_mat.shape)
    correlation_matrix = standardized_csr.dot(standardized_csr.T).toarray()
    diag = np.sqrt(np.diag(correlation_matrix))

    # Adjusting diagonal for valid standard deviations
    diag = np.where(valid_std_dev, diag, 1)  # replace zero with one to avoid division by zero
    correlation_matrix /= diag[:, None]
    correlation_matrix /= diag[None, :]
    return csr_matrix(np.nan_to_num(correlation_matrix))  # Replace NaNs with zero, caused by division by zero

def third_order_cumulant_matrix(data):
    """Calculates the third-order cumulant matrix for 3D data interaction."""
    if not isinstance(data, np.ndarray):
        data = data.toarray()  # Assuming data is a sparse matrix
    symmetric_matrix = data + data.T - np.diag(data.diagonal())

    n_columns = symmetric_matrix.shape[1]
    means = np.mean(symmetric_matrix, axis=0)
    cumulants = np.zeros((n_columns, n_columns, n_columns), dtype=np.float64)  # Ensure type is float64 for HDF5 compatibility

    for i in range(n_columns):
        for j in range(i, n_columns):
            for k in range(j, n_columns):
                x, y, z = symmetric_matrix[:, i], symmetric_matrix[:, j], symmetric_matrix[:, k]
                cumulant_ijk = np.mean((x - means[i]) * (y - means[j]) * (z - means[k])) - \
                               means[i] * np.mean((y - means[j]) * (z - means[k])) - \
                               means[j] * np.mean((x - means[i]) * (z - means[k])) - \
                               means[k] * np.mean((x - means[i]) * (y - means[j])) + \
                               2 * means[i] * means[j] * means[k]

                cumulants[i, j, k] = cumulants[i, k, j] = cumulants[j, i, k] = \
                cumulants[j, k, i] = cumulants[k, i, j] = cumulants[k, j, i] = cumulant_ijk

    return cumulants


def process_file(file_path, output_dirs, max_distance, chromosome):
    print(f"Processing file: {file_path}")
    data = load_hic_data(file_path)
    if not data:
        print(f"No data loaded from {file_path}")
        return "No data loaded"

    csr_mat = create_matrix(data)
    if csr_mat is None:
        print(f"Failed to create matrix from data in {file_path}")
        return "Failed to create matrix"

    base_name = os.path.splitext(os.path.basename(file_path))[0]
    file_paths = [os.path.join(base_dir, chromosome, f"{base_name}.hdf5") for base_dir in output_dirs]

    matrices_needed = [not os.path.exists(path) for path in file_paths]

    if any(matrices_needed):
        emphasized_matrix = None
        if matrices_needed[1] or matrices_needed[2]:
            emphasized_matrix = emphasize_interactions(csr_mat, max_distance)
        
        matrices = [
            csr_pearson_correlation(csr_mat) if matrices_needed[0] else None,
            csr_pearson_correlation(emphasized_matrix) if matrices_needed[1] else None,
            third_order_cumulant_matrix(emphasized_matrix) if matrices_needed[2] else None
        ]

        for mat, output_path, is_needed in zip(matrices, file_paths, matrices_needed):
            if is_needed and mat is not None:
                os.makedirs(os.path.dirname(output_path), exist_ok=True)
                with h5py.File(output_path, 'w') as file:
                    file.create_dataset('data', data=mat)
                print(f"Saved {output_path}")
            elif is_needed:
                print(f"Matrix not created or not needed for {output_path}")
            else:
                print(f"Output already exists and was skipped: {output_path}")

    return "Processed successfully"

def process_chromosome(chromosome, input_dir, output_dirs, max_distance):
    """Process all Hi-C data files for a given chromosome in parallel."""
    print(f"Processing chromosome: {chromosome}")
    files = glob.glob(os.path.join(input_dir, '*.txt'))
    if not files:
        print(f"No files found in {input_dir}")
        return

    with ProcessPoolExecutor(max_workers=multiprocessing.cpu_count()) as executor:
        futures = [executor.submit(process_file, file, output_dirs, max_distance, chromosome) for file in files]
        for future in as_completed(futures):
            print(f"File processed: {future.result()}")  # Ensure each future completes                

# Configuration and execution
resolution = 1_000_000
max_distance = int(10_000_000 / resolution) + 1
base_input_dir = '../../projects/single_cell_files/'
base_output_raw_correlation_dir = '../../projects/single_cell_files/hicluster_1Mb_correlation_dir/'
base_output_emphasized_correlation_dir = '../../projects/single_cell_files/hicluster_1Mb_emphasized_correlation_dir/'
base_output_emphasized_cumulant_dir = '../../projects/single_cell_files/hicluster_1Mb_emphasized_cumulant_dir/'
output_dirs = [base_output_raw_correlation_dir, base_output_emphasized_correlation_dir, base_output_emphasized_cumulant_dir]

# Create directories once before processing
for dir_path in output_dirs:
    os.makedirs(dir_path, exist_ok=True)

# Call to process all chromosomes
chromosomes = [f'chr{i}' for i in range(1, 23)]
for chromosome in chromosomes:
    input_dir = os.path.join(base_input_dir, chromosome)
    process_chromosome(chromosome, input_dir, output_dirs, max_distance)


Processing chromosome: chr1
Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc42.TAGCTT_chr1.txtProcessing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc32.CGATGT_chr1.txtProcessing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc14.ACTTGA_chr1.txtProcessing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc25.GCCAAT_chr1.txtProcessing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc16.CGATGT_chr1.txtProcessing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc6.TAGCTT_chr1.txtProcessing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc12.CGATGT_chr1.txt
Processing fi

Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_correlation_dir/chr1/sc36.GCCAAT_chr1.hdf5Saved /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_cumulant_dir/chr1/sc24.CGATGT_chr1.hdf5

Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_correlation_dir/chr1/sc36.GCCAAT_chr1.hdf5Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc25.ACTTGA_chr1.txt

Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_correlation_dir/chr1/sc18.GCCAAT_chr1.hdf5File processed: Processed successfully

Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_correlation_dir/chr1/sc1

Saved /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_cumulant_dir/chr1/sc28.CGATGT_chr1.hdf5
Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc21.GCCAAT_chr1.txt
Saved /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_cumulant_dir/chr1/sc30.GCCAAT_chr1.hdf5
File processed: Processed successfully
Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc14.GCCAAT_chr1.txt
File processed: Processed successfully
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_correlation_dir/chr1/sc9.GCCAAT_chr1.hdf5
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_correlation_dir/chr1/sc9.GCCAAT_chr1.hdf5
Saved /home

Saved /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_cumulant_dir/chr1/sc22.ACTTGA_chr1.hdf5
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_correlation_dir/chr1/sc17.ACTTGA_chr1.hdf5Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc44.GCCAAT_chr1.txt

Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_correlation_dir/chr1/sc17.ACTTGA_chr1.hdf5
File processed: Processed successfully
Saved /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_cumulant_dir/chr1/sc17.ACTTGA_chr1.hdf5
Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc22.GCCAAT_chr1.txt
File processed: Processed successfully
Output al

Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_correlation_dir/chr1/sc40.TAGCTT_chr1.hdf5
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_correlation_dir/chr1/sc40.TAGCTT_chr1.hdf5
Saved /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_cumulant_dir/chr1/sc40.TAGCTT_chr1.hdf5
Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr1/sc39.TAGCTT_chr1.txt
File processed: Processed successfully
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_correlation_dir/chr1/sc4.CGATGT_chr1.hdf5
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_correlation_dir/chr1/sc4.

Saved /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_cumulant_dir/chr1/sc29.CGATGT_chr1.hdf5
File processed: Processed successfully
File processed: Processed successfully
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_correlation_dir/chr1/sc48.CGATGT_chr1.hdf5
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_correlation_dir/chr1/sc48.CGATGT_chr1.hdf5
Saved /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_cumulant_dir/chr1/sc48.CGATGT_chr1.hdf5
File processed: Processed successfully
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_correlation_dir/chr1/sc22.GCCAAT_chr1.hdf5
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic

Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_correlation_dir/chr1/sc44.CGATGT_chr1.hdf5
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_correlation_dir/chr1/sc44.CGATGT_chr1.hdf5
Saved /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_cumulant_dir/chr1/sc44.CGATGT_chr1.hdf5
File processed: Processed successfully
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_correlation_dir/chr1/sc41.TAGCTT_chr1.hdf5
Output already exists and was skipped: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_correlation_dir/chr1/sc41.TAGCTT_chr1.hdf5
Saved /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_emphasized_cumulant_dir/ch

Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr2/sc47.ACTTGA_chr2.txt
Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr2/sc31.ACTTGA_chr2.txt
Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr2/sc11.TAGCTT_chr2.txt
Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr2/sc3.TAGCTT_chr2.txt
Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr2/sc8.ACTTGA_chr2.txt
Processing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr2/sc6.TAGCTT_chr2.txtProcessing file: /home/dwk681/workspace/multi_omics_hic_clustering/scNOMeHiC_20210127/1Mb/hicluster_1Mb_raw_dir/chr2/sc28.TAGCTT_chr2.txt

Processing file: /home/dwk681/workspa

TypeError: Object dtype dtype('O') has no native HDF5 equivalent