In [1]:
import numpy as np
import numba
import os

def yellow(text : object):
    """Return ``str(text)`` wrapped in the ANSI escape codes for yellow foreground."""
    prefix, reset = '\033[33m', '\033[0m'
    return f"{prefix}{text!s}{reset}"

def green(text : object):
    """Return ``str(text)`` wrapped in the ANSI escape codes for green foreground."""
    prefix, reset = '\033[32m', '\033[0m'
    return f"{prefix}{text!s}{reset}"

def blue(text : object):
    """Return ``str(text)`` wrapped in the ANSI escape codes for blue foreground."""
    prefix, reset = '\033[34m', '\033[0m'
    return f"{prefix}{text!s}{reset}"

def red(text : object):
    """Return ``str(text)`` wrapped in the ANSI escape codes for red foreground."""
    prefix, reset = '\033[31m', '\033[0m'
    return f"{prefix}{text!s}{reset}"

def read_genome_file(file_name :str) -> dict[str, int]:
    """
    Reads a genome file and returns a dictionary mapping each name to its length.

    Every non-blank line must consist of a name and an integer length separated
    by a single tab. Blank lines (for example a trailing empty line at the end
    of the file) are skipped instead of aborting the whole read.

    Parameters:
        file_name: path of the tab-separated genome file.

    Returns:
        A dict whose keys are the (whitespace-stripped) names and whose values
        are the lengths as ints. If a name occurs more than once, the last
        occurrence wins.

    Raises:
        ValueError: if a non-blank line does not have exactly two tab-separated
            fields, or if the length is not an integer.
    """
    genome_dict = {}
    with open(file_name, 'r') as f:
        for line in f:
            line = line.strip()
            # An empty line would split into [''] and break the two-name unpack below.
            if not line:
                continue
            chromo, length = line.split('\t')
            genome_dict[chromo.strip()] = int(length)
    return genome_dict

class DataSplitter:
    """
    Splits the bin-pair table of one chromosome into the pairs that lie inside
    a hub and the pairs that do not.

    The hubs are read once, at construction time, from a BED file; only the
    intervals whose chromosome column equals ``chromo`` are kept.
    """

    # Hubs contains the bed file as a list of intervals like this:
    # [start, end]
    # [start, end]
    # ...
    # [start, end]
    # The array is always 2-D with shape (n_hubs, 2), also when n_hubs is 0.
    hubs : np.ndarray

    # The parent chromosome
    chromo: str

    # The bin size: a bin starting at `p` spans [p, p + resolution], and
    # `p // resolution` is used as its index into the matrix.
    resolution: int

    def __parse_bed_as_array(self, bed_path : str) -> np.ndarray:
        """
        Parses the bed file and returns an array of intervals.

        Blank lines and lines belonging to another chromosome are skipped.
        Only the first three tab-separated columns (chromosome, start, end)
        are used; any further columns are ignored.

        Returns:
            An int64 array of shape (n, 2) holding one [start, end] row per
            kept interval. The shape is (0, 2) when nothing matched, so the
            compiled pair check always receives a 2-D array.
        """
        intervals = []
        with open(bed_path, "r") as bed_file:
            for line in bed_file:
                line = line.strip()
                if line == "":
                    continue

                # Split the line into a list of values
                chr_, start, end, *rest = line.split("\t")

                # Check if the chromosome is the same as the parent chromosome
                if chr_ != self.chromo:
                    continue

                # Add the interval to the list, converted to int
                intervals.append((int(start), int(end)))

        # np.array([]) alone would be a 1-D float array; indexing its elements
        # as hub[0] / hub[1] inside the njit function cannot be typed, so a
        # chromosome without hubs would crash on the first pair check.
        return np.array(intervals, dtype=np.int64).reshape(-1, 2)

    def __init__(self, chromo : str, hubs_path : str, resolution : int):
        """
        Parameters:
            chromo: name of the chromosome, compared verbatim against the
                first column of the BED file.
            hubs_path: path of the BED file holding the hubs.
            resolution: the bin size, in the same unit as the BED coordinates.
        """
        self.chromo = chromo
        self.resolution = resolution
        self.hubs = self.__parse_bed_as_array(hubs_path)

    @staticmethod
    @numba.njit
    def check_single_pair_nb( hubs : np.ndarray, pair : tuple[int, int], resolution : int):
        """
        Checks whether the given pair is a valid pair (belongs inside the hubs).

        A pair is valid when a single hub contains both of its bins, where the
        bin starting at `p` is taken to span [p, p + resolution]. Two bins
        that sit in two different hubs do not make a valid pair.

        Parameters:
            hubs: array of [start, end] rows.
            pair: the start positions of the two bins.
            resolution: the bin size.

        Returns:
            True if some hub contains both bins, False otherwise (including
            when there are no hubs at all).
        """
        for hub in hubs:
            contained = True
            for p in pair:

                start = p
                end = p + resolution

                # The bin sticks out of this hub on either side.
                if start < hub[0] or end > hub[1]:
                    contained = False
                    break

            # If the pair is inside the hub, return True
            if contained:
                return True

        return False

    def check_single_pair(self, pair : tuple[int, int]):
        """
        Checks the pair against the hubs and the resolution of this splitter.
        See `check_single_pair_nb` for the exact containment rule.
        """
        return DataSplitter.check_single_pair_nb(self.hubs, pair, self.resolution)

    def split_data(self, tsv_path : str, matrix_path : str, out_folder : str):
        """
        Splits the matrix data into two files:
        - one containing the pairs that are inside the hubs
        - one containing the pairs that are outside the hubs

        The files are written to `out_folder` as `<chromo>_inside.tsv` and
        `<chromo>_outside.tsv`, each starting with a header line. Every
        non-blank line of the input table is copied to one of them with the
        matrix value of its bin pair appended as an extra column. A summary
        with both counts is printed at the end.

        Parameters:
            tsv_path: tab-separated table whose first two columns are the
                integer start positions of the two bins of a pair.
            matrix_path: `.npy` file holding a matrix indexed by bin number
                (start position // resolution).
            out_folder: destination folder, with or without a trailing path
                separator; it is created if it does not exist.

        Raises:
            ValueError: if one of the first two columns is not an integer.
            IndexError: if a bin index falls outside the matrix.
        """

        # the file paths. os.path.join accepts the folder with or without a trailing separator.
        inside_file_path = os.path.join(out_folder, f"{self.chromo}_inside.tsv")
        outside_file_path = os.path.join(out_folder, f"{self.chromo}_outside.tsv")

        # create the output folder if it doesn't exist ("" means the current directory).
        if out_folder:
            os.makedirs(out_folder, exist_ok=True)

        # The inside and outside counts
        inside_count = 0
        outside_count = 0

        # The matrix holding the value appended to every pair.
        matrix = np.load(matrix_path)

        # Opening the files.
        with open(inside_file_path, "w") as inside_file, open(outside_file_path, "w") as outside_file, open(tsv_path, "r") as tsv_file:
            # Add a header to the inside and outside files
            header = "bin1\tbin2\tzscore\tpvalue\n"
            inside_file.write(header)
            outside_file.write(header)

            # Looping over the lines.
            for line in tsv_file:
                line = line.strip()
                if line == "":
                    continue

                # Split the line into a list of values
                start1, start2, *rest = line.split("\t")

                # Convert the two start positions to int
                start1 = int(start1)
                start2 = int(start2)

                # get the index coefficients
                i = start1 // self.resolution
                j = start2 // self.resolution

                # retrieve the p-value from the matrix
                pvalue = matrix[i, j]

                line = line + "\t" + str(pvalue)

                # Check if the pair is inside the hubs
                if self.check_single_pair((start1, start2)):
                    inside_file.write(line + "\n")
                    inside_count += 1
                else:
                    outside_file.write(line + "\n")
                    outside_count += 1
        print(f"Split the data between {green(inside_count)} inside and {red(outside_count)} outside pairs.")

In [9]:
# The chromosomes to process: chr1 through chr22, in numeric order.
CHROMOSOMES = [f"chr{number}" for number in range(1, 23)]

In [21]:
def join_all_files(out_folder : str):
    """
    Joins the per-chromosome files of the given folder into two files.

    Every `chr*_inside.tsv` file is appended to `inside.tsv` and every
    `chr*_outside.tsv` file to `outside.tsv`, both created in `out_folder`.
    The first line (the header) of each input file is dropped, so the joined
    files carry no header. Files are processed in chromosome order: numbered
    chromosomes first, by number, then any other label alphabetically.

    After joining, the per-chromosome files that were read are deleted. Any
    other file in the folder is neither read nor deleted; this includes an
    `inside.tsv` / `outside.tsv` left by a previous run, which is simply
    overwritten.

    Parameters:
        out_folder: folder holding the per-chromosome files, with or without
            a trailing path separator.
    """
    inside_suffix = "_inside.tsv"
    outside_suffix = "_outside.tsv"

    def chromosome_order(file_name : str):
        # "chr10_inside.tsv" -> "10". Numbered chromosomes sort numerically and
        # before every non-numeric label (e.g. "X"); the file name breaks ties
        # so the order does not depend on the directory listing.
        label = file_name.split("_")[0].replace("chr", "")
        if label.isdecimal():
            return (0, int(label), file_name)
        return (1, 0, file_name)

    # Only the per-chromosome split files take part: calling int() on any other
    # name would crash the sort, and the cleanup below must not delete them.
    files = sorted(
        (
            name for name in os.listdir(out_folder)
            if name.startswith("chr") and name.endswith((inside_suffix, outside_suffix))
        ),
        key=chromosome_order,
    )

    inside_path = os.path.join(out_folder, "inside.tsv")
    outside_path = os.path.join(out_folder, "outside.tsv")

    with open(inside_path, "w") as inside_file, open(outside_path, "w") as outside_file:
        for file in files:
            target = inside_file if file.endswith(inside_suffix) else outside_file
            with open(os.path.join(out_folder, file), "r") as f:
                # skip the header; the default keeps an empty file from raising StopIteration
                next(f, None)
                for line in f:
                    target.write(line)

            print(f"Joined {file}")

    # Remove the per-chromosome files now that their content has been copied
    for file in files:
        os.remove(os.path.join(out_folder, file))

    print(f"Successfully joined all the files in {out_folder}")

In [12]:
HUBS_PATH = "../results/hubs/HMEC.CAGE.bed"

# Split the 100 kb table of every chromosome into hub / non-hub pairs, one chromosome at a time.
for chromo in CHROMOSOMES:
    print(f"Working on: {yellow(chromo)}")

    # A fresh splitter per chromosome: it only keeps the hubs of that chromosome.
    splitter = DataSplitter(chromo, HUBS_PATH, resolution=100_000)

    # Write <chromo>_inside.tsv and <chromo>_outside.tsv into the output folder.
    splitter.split_data(
        tsv_path=f"../results/zscores/100kb/{chromo}.txt",
        matrix_path=f"../results/ctcf_enrichment/100kb/{chromo}.npy",
        out_folder="../results/hubs_data/100kb/",
    )

    print(f"Finished: {green(chromo)}")

Working on: [33mchr1[0m
Split the data between [32m48684[0m inside and [31m1262020[0m outside pairs.
Finished: [32mchr1[0m
Working on: [33mchr2[0m
Split the data between [32m248515[0m inside and [31m1171190[0m outside pairs.
Finished: [32mchr2[0m
Working on: [33mchr3[0m
Split the data between [32m5511[0m inside and [31m1078860[0m outside pairs.
Finished: [32mchr3[0m
Working on: [33mchr4[0m
Split the data between [32m81913[0m inside and [31m864145[0m outside pairs.
Finished: [32mchr4[0m
Working on: [33mchr5[0m
Split the data between [32m13887[0m inside and [31m843104[0m outside pairs.
Finished: [32mchr5[0m
Working on: [33mchr6[0m
Split the data between [32m29203[0m inside and [31m811640[0m outside pairs.
Finished: [32mchr6[0m
Working on: [33mchr7[0m
Split the data between [32m12472[0m inside and [31m676068[0m outside pairs.
Finished: [32mchr7[0m
Working on: [33mchr8[0m
Split the data between [32m9231[0m inside and [31m619178[

In [22]:
join_all_files("../results/hubs_data/100kb/")

Joined chr1_inside.tsv
Joined chr1_outside.tsv
Joined chr2_inside.tsv
Joined chr2_outside.tsv
Joined chr3_inside.tsv
Joined chr3_outside.tsv
Joined chr4_inside.tsv
Joined chr4_outside.tsv
Joined chr5_inside.tsv
Joined chr5_outside.tsv
Joined chr6_inside.tsv
Joined chr6_outside.tsv
Joined chr7_inside.tsv
Joined chr7_outside.tsv
Joined chr8_inside.tsv
Joined chr8_outside.tsv
Joined chr9_inside.tsv
Joined chr9_outside.tsv
Joined chr10_inside.tsv
Joined chr10_outside.tsv
Joined chr11_inside.tsv
Joined chr11_outside.tsv
Joined chr12_inside.tsv
Joined chr12_outside.tsv
Joined chr13_inside.tsv
Joined chr13_outside.tsv
Joined chr14_inside.tsv
Joined chr14_outside.tsv
Joined chr15_inside.tsv
Joined chr15_outside.tsv
Joined chr16_inside.tsv
Joined chr16_outside.tsv
Joined chr17_inside.tsv
Joined chr17_outside.tsv
Joined chr18_inside.tsv
Joined chr18_outside.tsv
Joined chr19_inside.tsv
Joined chr19_outside.tsv
Joined chr20_inside.tsv
Joined chr20_outside.tsv
Joined chr21_inside.tsv
Joined chr21_o