In [31]:
from collections import Counter

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import pyranges as pr

# Numeric size denoted by each resolution label (presumably in base pairs), stored as floats.
# Used as the sort key when ordering per-resolution counts, so entries run from smallest to largest.
RESOLUTIONS = {
    "5kb": 5_000.0,
    "10kb": 10_000.0,
    "50kb": 50_000.0,
    "100kb": 100_000.0,
    "500kb": 500_000.0,
    "1Mb": 1_000_000.0,
}

def __chr_sort_part(x : str):
    p = x.replace("chr", "")
    if p.isdigit():
        return int(p)
    else:
        return p

def sort_by_chromosome(x : dict | list):
    if type(x) == dict:
        # Return a sorted dict.
        return {k: v for k, v in sorted(x.items(), key=lambda item: __chr_sort_part(item[0]))}
    elif type(x) == list:
        return sorted(x, key=lambda x: __chr_sort_part(x))
    else:
        raise TypeError(f"The type of the argument is not a dict or a list. It is: {type(x)}")

In [10]:
### Read the input files.
# Relative path to the BED file of clusters; it is resolved against the
# current working directory, so the notebook must be run from its own folder.
CLUSTER_PATH = "../../analysis_results/CAGE_enriched.bed"
# Parse the BED file with pyranges; `clusters` is the object every later cell reads from.
clusters = pr.read_bed(CLUSTER_PATH)

In [11]:
### Count the distinct cluster names per resolution label.
# Distinct values of the Name column.
names = set(clusters.Name.values.flatten().tolist())
# Resolution label of each distinct name: the text before the first "_".
# (Presumably names look like "5kb_..." - TODO confirm against the BED file.)
resolutions = [x.split("_")[0] for x in names]
# Tally the labels in a single pass instead of calling list.count once per label.
resolutions_dict = dict(Counter(resolutions))

In [13]:
### Order the resolution counts by the numeric value of each label.
# Labels are ranked by their entry in RESOLUTIONS rather than alphabetically;
# a label that is missing from RESOLUTIONS raises KeyError here.
resolutions_dict = dict(
    sorted(
        resolutions_dict.items(),
        key=lambda label_and_count: RESOLUTIONS[label_and_count[0]],
    )
)
resolutions_dict

{'5kb': 2627, '10kb': 1112, '50kb': 633, '100kb': 127, '500kb': 114, '1Mb': 19}

In [18]:
### Plot the resolution histogram.
# One bar per resolution label, in the order stored in resolutions_dict;
# each bar is annotated with its count. `go` is imported in the top import cell.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=list(resolutions_dict.keys()),
        y=list(resolutions_dict.values()),
        text=list(resolutions_dict.values()),
    )
)
fig.update_layout(
    title_text="Resolution histogram",
    xaxis_title="Resolution",
    yaxis_title="Number of clusters",
)
fig.show()

In [34]:
### Count the clusters on each chromosome.
# One entry per row of `clusters`, holding that row's chromosome name.
chromosomes = clusters.Chromosome.values.tolist()
# Tally in a single pass instead of calling list.count once per chromosome.
# The Counter is converted to a plain dict before sorting by chromosome name.
chromosome_counts = sort_by_chromosome(dict(Counter(chromosomes)))

In [35]:
### Plot a histogram of the chromosome counts.
# One bar per chromosome, in the order stored in chromosome_counts;
# each bar is annotated with its count.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=list(chromosome_counts),
        y=list(chromosome_counts.values()),
        text=list(chromosome_counts.values()),
    )
)
fig.update_layout(
    title_text="Chromosome histogram",
    xaxis_title="Chromosome",
    yaxis_title="Number of clusters",
)
fig.show()