In [56]:
import os,json
from collections.abc import Iterator

import numpy as np
import pandas as pd
import pyranges as pr 

from src.clus_files_io import parse_clus_file
from src.cluster_description import ClustersDescription
from src.utils import RESOLUTIONS, extract_bins

# The Clusters Enriched in Both CAGE and CTCF

In [57]:
# Path to the .CLUS file describing the clusters under analysis
# (described here as the clusters enriched in both CAGE and CTCF).
# NOTE(review): the file is named "ctcf.CLUS" — confirm it holds the clusters enriched in both marks.
CLUSTERS_IN_ANALYSIS_PATH = "../analysis_results/ctcf.CLUS"

# Parse the clusters; the result is passed to import_clusters as its include_dict further down.
clusters_in_analysis = parse_clus_file(CLUSTERS_IN_ANALYSIS_PATH)

# Reading the Feature Types File
Read the file that maps each feature to its type, then convert the features to genomic ranges.

In [58]:
# Tab-separated table assigning a type to each peak (FANTOM5 CAGE peaks, judging by the file name).
# NOTE(review): this path starts with "./" while the other inputs of this notebook start with "../" —
# confirm which working directory the notebook is expected to run from.
FEATURE_TYPE_PATH = "./data/FANTOM5_cage_peak_type_tbl.tsv"

def parse_peak_type(path : str) -> dict[str, str]:
    """
    Parse the tab-separated peak type table.

    The first line of the file is treated as a header and skipped. Lines starting
    with "#" and blank lines are ignored. For every remaining line the first column
    is used as the feature id and the second column as the feature type.

    Parameters
    ----------
    path : str
        Path to the peak type table.

    Returns
    -------
    dict[str, str]
        Feature id -> feature type, e.g.::

            {
                "peak_1": "tss",
                "peak_2": "tss",
                "peak_3": "enhancer",
                "peak_4": "tss",
                ...
            }

    Raises
    ------
    IndexError
        If a non-blank, non-comment line has fewer than two tab-separated columns.
    """
    peak_type = {}
    with open(path, "r") as f:
        # Consume the first line so the column header is not parsed as a record.
        next(f, None)

        # Stream the records instead of loading the whole file in memory.
        for line in f:
            if line.startswith("#"):
                continue
            stripped = line.strip()
            # Blank lines (typically a trailing newline at the end of the file) carry no record.
            if not stripped:
                continue
            fields = stripped.split("\t")
            peak_type[fields[0]] = fields[1]
    return peak_type

In [59]:
# Parse the feature -> type table, then expose it as a one-column dataframe
# indexed by feature id.
peak_types = parse_peak_type(FEATURE_TYPE_PATH)
peak_types_df = pd.DataFrame.from_dict(peak_types, orient="index").set_axis(["peak_type"], axis=1)

In [60]:
def extract_chr_start_end(x : str) -> tuple[str, int, int, str]:
    """
    Extract the chromosome, start and end of a feature from its identifier.

    The identifier is expected to look like "<chromosome>:<start>..<end>" or
    "<chromosome>:<start>-<end>"; anything after the first "," of the coordinate
    part (for example ",+") is ignored.

    Parameters
    ----------
    x : str
        The feature identifier.

    Returns
    -------
    tuple[str, int, int, str]
        (chromosome, start, end, x) — the original identifier is returned as the
        last element so it can be kept as the name of the range.

    Raises
    ------
    IndexError
        If the identifier contains no ":".
    ValueError
        If the coordinate part does not split into exactly two integers.
    """
    # Split once and reuse the pieces.
    pieces = x.split(":")
    chromo = pieces[0]
    remainder = pieces[1].split(",")[0]
    # Both "start..end" and "start-end" coordinate notations are accepted.
    separator = ".." if ".." in remainder else "-"
    start, end = remainder.split(separator)
    return chromo, int(start), int(end), x

# Parse every feature id (still the index at this point) into its coordinates.
parsed_peaks = [extract_chr_start_end(peak_id) for peak_id in peak_types_df.index]
chromos, peak_starts, peak_ends, names = zip(*parsed_peaks)

# Add the columns directly under the capitalised names used for the ranges.
peak_types_df["Chromosome"] = chromos
peak_types_df["Start"] = peak_starts
peak_types_df["End"] = peak_ends
peak_types_df["Name"] = names

# The feature id now lives in the "Name" column, so the index is dropped.
peak_types_df.reset_index(drop=True, inplace=True)

# Keep only the features typed as enhancers.
ENHANCERS = peak_types_df.loc[peak_types_df["peak_type"].eq("enhancer")]

In [61]:
# Wrap the enhancer table in a PyRanges object so it can be overlapped with the clusters.
ENHANCERS_RANGES = pr.PyRanges(ENHANCERS)
# Print the mean enhancer length as a quick sanity check on the parsed coordinates.
print("Average Enhancer Length: {:.2f}".format(ENHANCERS_RANGES.lengths().mean()))

Average Enhancer Length: 281.14


# Reading the clusters which we are interested in
"Cluster out" denotes the non-enriched clusters, i.e. the clusters that are not enriched in CTCF and CAGE.

"Cluster in" denotes the enriched clusters, i.e. the clusters that are enriched in CTCF and CAGE.

In [62]:
def import_clusters(cluster_folder : str, include_dict : dict[str, str] = None, exclude_dict : dict[str, str] = None) -> Iterator[tuple[str, int, int, str]]:
    """
    Lazily yield one genomic interval per bin of every selected cluster.

    Every file of `cluster_folder` is read as JSON. The chromosome is the part of
    the file name before the first "_", and the clusters are read from the
    "cl_member" key of the file, which maps a cluster id to its bins.

    Parameters
    ----------
    cluster_folder : str
        Folder containing the cluster JSON files.
    include_dict : dict, optional
        Maps a chromosome to the cluster ids to keep. When given (and non-empty),
        files of chromosomes missing from it are skipped without being read, and
        only the listed clusters are yielded.
        NOTE(review): the values are used with `in`, so they are presumably
        collections of cluster ids rather than plain strings — confirm against parse_clus_file.
    exclude_dict : dict, optional
        Maps a chromosome to the cluster ids to drop.

    Yields
    ------
    tuple[str, int, int, str]
        (chromosome, start, end, cluster_id), where `start` is a bin returned by
        extract_bins and `end` is `start` plus the resolution looked up in
        RESOLUTIONS from the part of the cluster id before the first "_".
    """
    # Sorted so the order of the yielded intervals does not depend on the filesystem.
    for file in sorted(os.listdir(cluster_folder)):
        chromo = file.split("_")[0]

        if include_dict and chromo not in include_dict:
            continue

        # Context manager so the handle is closed as soon as the file is parsed.
        with open(os.path.join(cluster_folder, file), "r") as handle:
            clusters = json.load(handle)["cl_member"]

        # The selections only depend on the chromosome: look them up once per file.
        included = include_dict.get(chromo, []) if include_dict else None
        excluded = exclude_dict.get(chromo, []) if exclude_dict else None

        for cluster_id, bins in clusters.items():
            # Deal with inclusion
            if included is not None and cluster_id not in included:
                continue

            # Deal with exclusion
            if excluded is not None and cluster_id in excluded:
                continue

            # Looked up only for the clusters that are actually yielded.
            resolution = int(RESOLUTIONS[cluster_id.split("_")[0]])

            for bin_start in extract_bins(bins):
                yield chromo, bin_start, bin_start + resolution, cluster_id


# Folder with the cluster JSON files read by import_clusters (HMEC, judging by the folder name).
CLUSTER_FOLDER = "../data/clusters/HMEC/"

In [63]:
# Load the chromosome, start, end and cluster id of every bin of the clusters in analysis.
cluster_records = list(import_clusters(CLUSTER_FOLDER, include_dict=clusters_in_analysis))

# Fail with an explicit message instead of an opaque unpacking error when nothing was selected.
if not cluster_records:
    raise ValueError("No cluster bins were loaded from {}".format(CLUSTER_FOLDER))

chromosomes, starts, ends, names = zip(*cluster_records)

# NOTE(review): the cluster id column is called "name" here while the enhancer table uses "Name".
FOREGROUND = pr.PyRanges(pd.DataFrame({"Chromosome":chromosomes, "Start":starts, "End":ends, "name":names}))

In [64]:
RESULTS_PATH = "../results/"

# Make sure the output folder exists so the cell also runs on a fresh checkout.
os.makedirs(RESULTS_PATH, exist_ok=True)

# Saving all the enhancers as the background (headerless, tab-separated BED: chromosome, start, end).
ENHANCERS[["Chromosome", "Start", "End"]].to_csv(os.path.join(RESULTS_PATH, "background.bed"), sep="\t", index=False, header=False)

# Overlapping the enhancers with the clusters and saving the result as the foreground BED.
enhancer_in_clusters = ENHANCERS_RANGES.overlap(FOREGROUND)
enhancer_in_clusters.df[["Chromosome", "Start", "End"]].to_csv(os.path.join(RESULTS_PATH, "foreground.bed"), sep="\t", index=False, header=False)

# Preparing for checking enrichment with unibind.
To prepare the data for use with UniBind, we need to convert the coordinates of both the foreground and the background from hg19 to hg38.

In [65]:
import subprocess

def convert_to_hg38(path : str):
    """
    Convert the coordinates of a file to hg38, in place.

    Runs `Rscript ../analysis/scripts/hg19_to_hg38.R <path> <path>`: the absolute
    path of the file is passed as both input and output, so the file is
    overwritten. The output of the script (stdout and stderr merged) is printed
    line by line while the script is running.

    Parameters
    ----------
    path : str
        Path of the file to convert; it is made absolute before being passed on.

    Raises
    ------
    FileNotFoundError
        If the `Rscript` executable cannot be found.
    subprocess.CalledProcessError
        If the script exits with a non-zero status.
    """
    # make the path absolute
    path = os.path.abspath(path)

    # Argument list rather than a shell string: no quoting problems with unusual paths.
    # The output file is the same as the input file (inplace).
    cmd = ["Rscript", "../analysis/scripts/hg19_to_hg38.R", path, path]

    # The context manager closes the pipe and waits for the process to terminate.
    with subprocess.Popen(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        encoding='utf-8',
        errors='replace'
    ) as process:
        # Iterating over the pipe yields each line as soon as it is available.
        for realtime_output in process.stdout:
            print(realtime_output.strip(), flush=True)

    # A failed conversion must not go unnoticed: the file would be left unconverted.
    if process.returncode != 0:
        raise subprocess.CalledProcessError(process.returncode, cmd)

# Convert both result files to hg38, announcing each one before it is processed.
for message, bed_name in (
    ("-- Converting the foreground", "foreground.bed"),
    ("-- Converting the background", "background.bed"),
):
    print(message)
    convert_to_hg38(os.path.join(RESULTS_PATH, bed_name))

-- Converting the foreground
Size Before Conversion:  2281
Coordinates translated, new size:  2290
-- Converting the background
Size Before Conversion:  65423
Coordinates translated, new size:  65585
