In [26]:
from src.cluster_description import Cluster, ClustersDescription
import pyranges as pr

GMT_PATH = "./data/c5.go.bp.v7.5.1.entrez.gmt"

# Parses the GMT file returining a dict containing for each 
def parse_gmt(path: str) -> dict[tuple[str, str]]:
    # Create a dictionary to store the results
    results = {}

    # Open the GMT file
    with open(path, "r") as f:
        # For each line in the file
        for line in f:
            # Split the line into a list of words
            words = line.split("\t")

            # The motif name
            gene_set = words[0]
            link = words[1]
            genes = words[2:]
            # strip each gene
            genes = [gene.strip() for gene in genes]

            # Add the gene set to the dictionary
            results[gene_set] = [link, genes]

    # Return the results
    return results

gmt = parse_gmt(GMT_PATH)
gmt

{'GOBP_MITOCHONDRIAL_GENOME_MAINTENANCE': ['http://www.gsea-msigdb.org/gsea/msigdb/cards/GOBP_MITOCHONDRIAL_GENOME_MAINTENANCE',
  ['10000',
   '10891',
   '11232',
   '142',
   '1763',
   '1890',
   '201163',
   '201973',
   '2021',
   '219736',
   '291',
   '3980',
   '4205',
   '4358',
   '4976',
   '55186',
   '7156',
   '7157',
   '80119',
   '83667',
   '84275',
   '92667',
   '9361']],
 'GOBP_REPRODUCTION': ['http://www.gsea-msigdb.org/gsea/msigdb/cards/GOBP_REPRODUCTION',
  ['100',
   '10007',
   '100125288',
   '100129278',
   '100130958',
   '100131137',
   '100137049',
   '10017',
   '10018',
   '100289087',
   '10046',
   '10049',
   '100506013',
   '100507650',
   '10051',
   '10096',
   '10097',
   '100996631',
   '10111',
   '10116',
   '10134',
   '10149',
   '10155',
   '1017',
   '10179',
   '10184',
   '101927581',
   '101928601',
   '10265',
   '1027',
   '1028',
   '10343',
   '10361',
   '10370',
   '10371',
   '10388',
   '10403',
   '10406',
   '10407',
   '1040

In [5]:
# Reading the clusters.
chromo = "chr16"
clusters = ClustersDescription(f"../data/clusters/HMEC/{chromo}_spec_res.json", chromo)
cluster = clusters["500kb_16_120_56500000_75000000"]

# Reading the features.
features = pr.read_bed("../data/features/HMEC/CAGE/features.bed")

# Couting the overlaps
overlaps = cluster.find_overlaps(chromo, features)

In [10]:
FANTOM_PATH = "./data/FANTOM5_CAGE_peak_entrez_gene_tbl.tsv"

# read the table from the path
def parse_fantom_data(path: str) -> dict[str, list[str]]:
    # Create a dictionary to store the results
    results: dict[str, list[str]] = {}

    header_found = False

    # Open the file
    with open(path, "r") as f:
        # For each line in the file
        for line in f:
            # Go to the next line if it starts with a #
            if line.startswith("#"):
                continue

            # deal with the header
            if not header_found:
                header_found = True
                continue

            # Split the line into a list of words
            words = line.split("\t")
            cage_id = words[0].strip()
            entrez_id = words[1].strip()

            if entrez_id not in results:
                results[cage_id] = []

            results[cage_id].append(entrez_id)

    # Return the results
    return results

In [11]:
cage_to_entrez = parse_fantom_data(FANTOM_PATH)

In [15]:
cage_to_entrez

{'chr10:100013403..100013414,-': ['NA'],
 'chr10:100027943..100027958,-': ['84171'],
 'chr10:100076685..100076699,+': ['NA'],
 'chr10:100150910..100150935,-': ['NA'],
 'chr10:100150951..100150962,-': ['NA'],
 'chr10:100150986..100150988,+': ['NA'],
 'chr10:100174900..100174956,-': ['84795'],
 'chr10:100174957..100174982,-': ['84795'],
 'chr10:100180637..100180657,-': ['NA'],
 'chr10:100195025..100195029,-': ['NA'],
 'chr10:100204220..100204230,-': ['NA'],
 'chr10:100206067..100206107,+': ['NA'],
 'chr10:100206469..100206475,-': ['NA'],
 'chr10:100206528..100206535,-': ['NA'],
 'chr10:100206642..100206717,-': ['3257'],
 'chr10:100226702..100226710,-': ['NA'],
 'chr10:100719113..100719115,-': ['NA'],
 'chr10:100847096..100847097,+': ['NA'],
 'chr10:100981666..100981684,-': ['NA'],
 'chr10:100992367..100992392,-': ['NA'],
 'chr10:100992596..100992612,-': ['NA'],
 'chr10:100992617..100992648,-': ['NA'],
 'chr10:100993894..100993906,-': ['NA'],
 'chr10:100995440..100995474,-': ['60495'],
 '

In [29]:
# The names of the overlapping ranges
names = overlaps.Name.values.flatten()
print("Total number of overlapping genes:", len(names))

# The entrez ids of the overlapping ranges
entrez_ids = [cage_to_entrez.get(name, [None])[0] for name in names]

# Count the None entrez ids and remove them
print(entrez_ids.count(None), "Cage Peaks not registered in the table.")
entrez_ids = [entrez_id for entrez_id in entrez_ids if entrez_id is not None]

# count the NA entrez ids and remove them
print(entrez_ids.count("NA"), "Cage Peaks with no associated entrez id.")
entrez_ids = [entrez_id for entrez_id in entrez_ids if entrez_id != "NA"]

# Count the remaining entrez ids
print(len(entrez_ids), "Cage Peaks with associated entrez id.")

Total number of overlapping genes: 624
86 Cage Peaks not registered in the table.
211 Cage Peaks with no associated entrez id.
327 Cage Peaks with associated entrez id.


In [51]:
# Remove the duplicates
entrez_ids = list(set(entrez_ids))

In [52]:
# Get the ontology group of each entrez id
def get_ontology_group(entrez_id: str, gmt: dict[str, list[str]]) -> str:
    for gene_set, (link, genes) in gmt.items():
        if entrez_id in genes:
            return gene_set
    return None

# Do so for each entrez id in the overlap
sets = []
for entrez_id in entrez_ids:
    group = get_ontology_group(entrez_id[0], gmt)
    if group is not None:
        sets.append(group)

In [53]:
def histogram(sets: list[str]) -> dict[str, int]:
    results = {}
    for set_ in sets:
        if set_ not in results:
            results[set_] = 0
        results[set_] += 1
    return results

In [54]:
histogram(sets)

{'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 11, 'GOBP_REPRODUCTION': 18}

In [57]:
class GeneSetEnrichment:
    gmt: dict[str, list[str]]
    cage_to_entrez: dict[str, list[str]]
    features: pr.PyRanges

    def __init__(self, gmt_path: str, cage_to_entrez_path: str, features_path: str):
        self.gmt = parse_gmt(gmt_path)
        self.cage_to_entrez = parse_fantom_data(cage_to_entrez_path)
        self.features = pr.read_bed(features_path)

    def get_gene_set(self, cluster: Cluster, chromosome: str) -> str:
        # The features overlapping with this cluster
        overlaps = cluster.find_overlaps(chromosome, self.features)

        # The names of the overlapping features
        names = overlaps.Name.values.flatten()

        # The entrez ids of the overlapping features
        entrez_ids = [self.cage_to_entrez[name][0] for name in names if name in self.cage_to_entrez]

        # Remove the NAs
        entrez_ids = [entrez_id for entrez_id in entrez_ids if entrez_id != "NA"]

        # Remove the duplicates
        entrez_ids = list(set(entrez_ids))

        # The ontology groups of the overlapping genes
        sets = []
        for entrez_id in entrez_ids:
            group = get_ontology_group(entrez_id[0], self.gmt)
            if group is not None:
                sets.append(group)

        # Return the histogram of the ontology groups
        return histogram(sets)

In [58]:
geneset_enrichment = GeneSetEnrichment(GMT_PATH, FANTOM_PATH, "../data/features/HMEC/CAGE/features.bed")
chromo = "chr16"
clusters = ClustersDescription(f"../data/clusters/HMEC/{chromo}_spec_res.json", chromo)
cluster = clusters["500kb_16_120_56500000_75000000"]
geneset_enrichment.get_gene_set(cluster, chromo)

{'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 11, 'GOBP_REPRODUCTION': 18}

In [59]:
ENRICHED_BOTH_PATH = "../data/enriched_clusters_for_all_chromosomes.txt"

def read_enriched_both(path: str) -> dict[str: list[str]]:
    results: dict[str: list[str]] = {}
    chromosome = None
    with open(path, "r") as f:
        for line in f:
            if line.startswith("chr"):
                chromosome = line.strip()
                results[chromosome] = []
                continue
            results[chromosome].append(line.strip())
    return results

In [44]:
geneset_enrichment = GeneSetEnrichment(GMT_PATH, FANTOM_PATH, "../data/features/HMEC/CAGE/features.bed")
enriched_both = read_enriched_both(ENRICHED_BOTH_PATH)

for chromo, _clusters in enriched_both.items():
    clusters = ClustersDescription(f"../data/clusters/HMEC/{chromo}_spec_res.json", chromo)
    for cluster in _clusters:
        print(chromo, cluster)
        enrichment = geneset_enrichment.get_gene_set(clusters[cluster], chromo)
        print(enrichment)


chr16 100kb_15_105_84200000_85600000
{'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 7, 'GOBP_REPRODUCTION': 11}
chr16 10kb_37_666_85300000_85670000
{'GOBP_REPRODUCTION': 5}
chr16 500kb_16_120_56500000_75000000
{'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 37, 'GOBP_REPRODUCTION': 35}
chr16 500kb_12_66_81000000_86500000
{'GOBP_REPRODUCTION': 22, 'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 9}
chr16 100kb_28_378_84200000_86900000
{'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 7, 'GOBP_REPRODUCTION': 18}
chr16 50kb_13_78_85050000_85650000
{'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 2, 'GOBP_REPRODUCTION': 5}
chr16 500kb_35_595_56500000_90000000
{'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 59, 'GOBP_REPRODUCTION': 77}
chr19 100kb_17_136_13000000_14600000
{'GOBP_REPRODUCTION': 6, 'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 26}
chr19 500kb_13_78_13000000_19000000
{'GOBP_REPRODUCTION': 57, 'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 49}
chr19 50kb_15_105_13950000_14650000
{'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 22, 'GOBP_REPRODUCTION': 1}
chr19 500kb_7_2

In [46]:
gmt_counts= {k : len(v[1]) for k, v in gmt.items()}

In [47]:
# Plot the histogram of the gtm using plotly
import plotly.express as px
import plotly.graph_objects as go
#Set to be rendering on the browser
import plotly.io as pio
pio.renderers.default = "browser"

fig = go.Figure(data=go.Bar(x=list(gmt_counts.keys()), y=list(gmt_counts.values())))
fig.show()

In [68]:
import tqdm

In [69]:
# Getting the background
background = pr.read_bed(f"../data/features/HMEC/CAGE/features.bed")

# Getting the representation 
feature_names = background.Name.values.flatten()

# Getting the entrez ids and removing None and NAs
entrez_ids = [cage_to_entrez.get(name, [None])[0] for name in feature_names]
entrez_ids = [entrez_id for entrez_id in entrez_ids if entrez_id is not None and entrez_id != "NA"]

background_sets = []

for entrez_id in tqdm.tqdm(entrez_ids):
    group = get_ontology_group(entrez_id[0], gmt)
    if group is not None:
        background_sets.append(group)


100%|██████████| 46014/46014 [07:06<00:00, 107.99it/s]


In [71]:
histogram(background_sets)

{'GOBP_REPRODUCTION': 7915, 'GOBP_XENOBIOTIC_METABOLIC_PROCESS': 3352}

In [78]:
gmt_inverse = {}

for gene_set, (link, genes) in gmt.items():
    for gene in genes:
        if gene not in gmt_inverse:
            gmt_inverse[gene] = []
        gmt_inverse[gene].append(gene_set)

background_sets = {}
for entrez_id in entrez_ids:
    groups = gmt_inverse.get(entrez_id, [])
    
    for group in groups:
        if group not in background_sets:
            background_sets[group] = 0
        background_sets[group] += 1

In [79]:
# plot the histogram of the background
fig = go.Figure(data=go.Bar(x=list(background_sets.keys()), y=list(background_sets.values())))
fig.show()