In [21]:
# Location of the FANTOM5 CAGE peak-type table (TSV) that is passed to parse_fantom_type.
# NOTE(review): every other data path in this notebook starts with "../data" — confirm
# that "./data" is intended here.
FANTOM_TYPE_PATH = "./data/FANTOM5_cage_peak_type_tbl.tsv"

# Parse the FANTOM5 peak-type table found at the given path
def parse_fantom_type(path : str) -> dict[str, str]:
    """
    Parse a FANTOM5 CAGE peak-type table (e.g. FANTOM5_cage_peak_type_tbl.tsv).

    The first line is treated as a header and skipped. For every remaining
    non-blank line, the first tab-separated column is used as the entry name
    and the second column as its peak type; surrounding whitespace is stripped
    from both values.

    Args:
        path: Path to the tab-separated table.

    Returns:
        Dictionary mapping entry name -> peak type. If an entry name occurs
        more than once, the last occurrence wins. An empty or header-only
        file yields an empty dictionary.

    Raises:
        IndexError: If a non-blank data line has fewer than two
            tab-separated columns.
    """
    results : dict[str, str] = {}

    with open(path, "r") as f:
        # Skip the header; the default keeps an empty file from raising StopIteration.
        next(f, None)
        for line in f:
            # Blank lines (e.g. trailing newlines at the end of the file) carry
            # no entry and would otherwise fail on the second-column lookup.
            if not line.strip():
                continue
            fields = line.split("\t")
            results[fields[0].strip()] = fields[1].strip()

    return results

In [27]:
def histogram(sets: list[str]) -> dict[str, int]:
    """
    Count how often each distinct label occurs in ``sets``.

    Args:
        sets: Sequence of labels (may contain repeats, may be empty).

    Returns:
        Dictionary mapping label -> number of occurrences, with keys in
        order of first appearance.
    """
    counts = {}
    for label in sets:
        # Start from 0 the first time a label is seen
        counts[label] = counts.get(label, 0) + 1
    return counts

In [12]:
from src.cluster_description import Cluster, ClustersDescription
import pyranges as pr

In [22]:
# Entry name -> peak type lookup built from the FANTOM5 table
fantom_types = parse_fantom_type(FANTOM_TYPE_PATH)

# Load the clusters description for one chromosome and select a single cluster by its key
chromo = "chr16"
clusters = ClustersDescription(f"../data/clusters/HMEC/{chromo}_spec_res.json", chromo)
cluster = clusters["500kb_35_595_56500000_90000000"]

In [23]:
# Features: read the HMEC CAGE features BED file with pyranges
features_path = "../data/features/HMEC/CAGE/features.bed"
features = pr.read_bed(features_path)

In [24]:
# Counting the overlaps between the selected cluster and the features.
# NOTE(review): find_overlaps comes from src.cluster_description; its exact
# return type is not visible here (later cells read a `Name` column from it).
overlaps = cluster.find_overlaps(chromo, features)

In [31]:
# Classifying the entries: look up the FANTOM5 peak type of every overlapping feature
names = overlaps.Name.values.flatten()

# Names that are missing from the FANTOM5 table are dropped
values = [fantom_types[name] for name in names if name in fantom_types]

In [34]:
# Text file read by read_enriched_both: lines starting with "chr" open a
# chromosome section, and the lines that follow are collected under it.
ENRICHED_BOTH_PATH = "../data/enriched_clusters_for_all_chromosomes.txt"

def read_enriched_both(path: str) -> dict[str, list[str]]:
    """
    Read a per-chromosome listing of cluster names.

    The file is expected to consist of chromosome header lines (any line
    that starts with "chr" once surrounding whitespace is removed), each
    followed by one cluster name per line. Blank lines are ignored.

    Args:
        path: Path to the text file.

    Returns:
        Dictionary mapping chromosome name -> list of cluster names, in file
        order. A chromosome header that appears again restarts that
        chromosome's list.

    Raises:
        ValueError: If a cluster name appears before any chromosome header.
    """
    results: dict[str, list[str]] = {}
    chromosome = None
    with open(path, "r") as f:
        for line in f:
            entry = line.strip()
            # Blank lines (e.g. a trailing newline) must not become empty cluster names
            if not entry:
                continue
            if entry.startswith("chr"):
                chromosome = entry
                results[chromosome] = []
                continue
            if chromosome is None:
                # Fail with a clear message instead of an opaque KeyError on None
                raise ValueError(
                    f"Cluster entry {entry!r} appears before any chromosome header in {path!r}"
                )
            results[chromosome].append(entry)
    return results

In [38]:
# Chromosome name -> list of cluster names, as parsed from ENRICHED_BOTH_PATH
enriched_both = read_enriched_both(ENRICHED_BOTH_PATH)

In [41]:
# For every listed cluster, tally the FANTOM5 peak types of the features
# that overlap it and print the count plus the per-type histogram.
for chromo, _clusters in enriched_both.items():
    clusters = ClustersDescription(f"../data/clusters/HMEC/{chromo}_spec_res.json", chromo)
    for cluster_name in _clusters:
        # Resolve the cluster name to its cluster object
        cluster = clusters[cluster_name]
        overlaps = cluster.find_overlaps(chromo, features)
        names = overlaps.Name.values.flatten()

        # Names that are missing from the FANTOM5 table are dropped
        values = [fantom_types[name] for name in names if name in fantom_types]

        print(chromo, len(values))
        print(histogram(values))

chr16 60
{'tss': 60}
chr16 23
{'tss': 23}
chr16 538
{'tss': 538}
chr16 185
{'tss': 185}
chr16 101
{'tss': 101}
chr16 29
{'tss': 29}
chr16 898
{'tss': 898}
chr19 251
{'tss': 251}
chr19 668
{'tss': 668}
chr19 114
{'tss': 114}
chr19 456
{'tss': 456}
chr19 3603
{'tss': 3603}
chr19 43
{'tss': 43}
chr19 935
{'tss': 935}
chr19 762
{'tss': 762}
chr19 131
{'tss': 131}
chr19 1166
{'tss': 1166}
chr19 1928
{'tss': 1928}
chr19 72
{'tss': 72}
chr19 1675
{'tss': 1675}
