In [2]:
from src.cluster_description import Cluster, ClustersDescription
import pyranges as pr
import numpy as np

# Input files, given relative to the notebook's working directory.
# NOTE(review): these use "./data" while later cells read from "../data" and
# "../results" — confirm which working directory is intended.
# Gene-set file passed to parse_gmt.
_GMT_PATH = "./data/c5.go.bp.v7.5.1.entrez.gmt"
# Tab-separated table passed to parse_fantom_data.
_FANTOM_PATH = "./data/FANTOM5_CAGE_peak_entrez_gene_tbl.tsv"

# Parses the GMT file (one gene set per line: set name, link, then the gene entrez ids).
# Returns a dictionary mapping each gene entrez id to the ontology groups it belongs to.
def parse_gmt(path: str) -> dict[str, list[str]]:
    """Parse a GMT file and return the gene -> gene-set mapping.

    Every usable line holds tab-separated fields: the gene-set name, a link,
    and then the ids of the genes in that set.

    Args:
        path: Path to the GMT file.

    Returns:
        A dict mapping each gene id to the list of gene-set names that
        contain it, in the order the sets first appear in the file.
    """
    # First pass: gene-set name -> gene ids. A name that appears twice keeps
    # only its last definition.
    gene_sets: dict[str, list[str]] = {}

    with open(path, "r") as f:
        for line in f:
            words = line.split("\t")

            # Blank or truncated lines have no link column; skip them instead
            # of failing with an IndexError.
            if len(words) < 2:
                continue

            gene_set = words[0]
            # Strip each gene and drop empty fields (e.g. from a trailing tab)
            # so that no bogus "" gene id ends up in the mapping.
            stripped = (gene.strip() for gene in words[2:])
            gene_sets[gene_set] = [gene for gene in stripped if gene]

    # Second pass: invert the mapping.
    gmt_inverse: dict[str, list[str]] = {}
    for gene_set, genes in gene_sets.items():
        for gene in genes:
            gmt_inverse.setdefault(gene, []).append(gene_set)

    return gmt_inverse

# read the table from the path
def parse_fantom_data(path: str) -> dict[str, str]:
    """Read the tab-separated peak table and map column 0 to column 1.

    Lines starting with "#" are ignored, and the first remaining non-blank
    line is treated as the header and discarded. Every later line contributes
    one entry: first column (peak id) -> second column (entrez id), both
    stripped of surrounding whitespace.

    Args:
        path: Path to the tab-separated file.

    Returns:
        A dict mapping each peak id to the entrez id string found next to it.
    """
    results: dict[str, str] = {}
    header_found = False

    with open(path, "r") as f:
        for line in f:
            # Comment lines and blank lines carry no data. Skipping blanks
            # here also keeps a blank line from being mistaken for the header.
            if line.startswith("#") or not line.strip():
                continue

            # The first real line is the column header.
            if not header_found:
                header_found = True
                continue

            words = line.split("\t")
            # A row without a second column cannot be mapped; skip it rather
            # than raising an IndexError.
            if len(words) < 2:
                continue

            results[words[0].strip()] = words[1].strip()

    return results

# Maps the given entry id to the gene entrez id
def overlaps_to_entrez(overlaps: list[str], peak_to_entrez : dict[str, str]) -> list[str]:
    """Translate feature names into entrez ids.

    Names that are missing from ``peak_to_entrez`` and names mapped to the
    literal "NA" are dropped. Input order is preserved and repeated ids are
    kept (no de-duplication happens here).
    """
    looked_up = (peak_to_entrez.get(name) for name in overlaps)
    return [entrez for entrez in looked_up if entrez not in ("NA", None)]

# Return the ontology group to which each entrez id belongs to.
def map_entrez_to_ontology(
    entrez_ids: list[str],
    gene_ontology: dict[str, list[str]],
) -> tuple[dict[str, int], list[str]]:
    """Count how many of the given entrez ids fall into each ontology group.

    Args:
        entrez_ids: Entrez ids to look up; repeated ids are counted each time.
        gene_ontology: Mapping from entrez id to the ontology groups it
            belongs to.

    Returns:
        A ``(ontology_hist, unknown_entrez_ids)`` tuple: ``ontology_hist``
        maps each group name to its number of hits, and
        ``unknown_entrez_ids`` lists the ids absent from ``gene_ontology``
        in input order.
    """
    ontology_hist: dict[str, int] = {}
    unknown_entrez_ids: list[str] = []

    for entrez_id in entrez_ids:
        if entrez_id not in gene_ontology:
            unknown_entrez_ids.append(entrez_id)
            continue

        # One hit for every group this gene belongs to.
        for ontology_group in gene_ontology[entrez_id]:
            ontology_hist[ontology_group] = ontology_hist.get(ontology_group, 0) + 1

    return ontology_hist, unknown_entrez_ids


def get_groups_for_features(
    features: list[str],
    peak_to_entrez: dict[str, str],
    gene_ontology: dict[str, list[str]],
    remove_duplicates: bool = False,
) -> dict[str, int]:
    """Count ontology-group hits for a collection of feature names.

    Args:
        features: Feature (peak) names to look up.
        peak_to_entrez: Mapping from feature name to entrez id.
        gene_ontology: Mapping from entrez id to the ontology groups it
            belongs to.
        remove_duplicates: When True, each entrez id is counted once no
            matter how many features map to it.

    Returns:
        A dict mapping each ontology group to its number of hits. Entrez ids
        with no entry in ``gene_ontology`` are ignored.
    """
    # Get the entrez ids
    entrez_ids = overlaps_to_entrez(features, peak_to_entrez)

    # Removing duplicates. dict.fromkeys keeps first-seen order, so the key
    # order of the result no longer depends on string hashing as it did with
    # set().
    if remove_duplicates:
        entrez_ids = list(dict.fromkeys(entrez_ids))

    # Get the ontology; the ids without any ontology entry are not needed here.
    groups, _ = map_entrez_to_ontology(entrez_ids, gene_ontology)

    return groups

# Reading the data.
# peak_to_entrez: peak id -> entrez id, as read from the FANTOM table.
peak_to_entrez = parse_fantom_data(_FANTOM_PATH)
# gene_ontology: entrez id -> names of the gene sets that contain it.
gene_ontology = parse_gmt(_GMT_PATH)

### Getting the Background Data.

In [57]:
# Getting the background: all the features in the BED file.
# (Plain string: the original f-string had no placeholders.)
background = pr.read_bed("../data/features/HMEC/CAGE/features.bed")

# Getting the representation: ontology group -> number of hits over the
# whole background.
feature_names = background.Name.values.flatten()
expected_groups = get_groups_for_features(feature_names, peak_to_entrez, gene_ontology)

### Reading the Observed Data

In [58]:
# Read the information about the clusters enriched in both.
# Tab-separated file parsed by parse_enrichment_file below.
ENRICHED_IN_BOTH_PATH = "../results/intersected.tsv"

def parse_enrichment_file(path : str) -> dict[str, list[str]]:
    """Read the enrichment table: one chromosome per line with its clusters.

    Each line is tab-separated: the first field is the chromosome and the
    remaining fields are cluster names. All fields are stripped of
    surrounding whitespace.

    Args:
        path: Path to the tab-separated file.

    Returns:
        A dict mapping each chromosome to its list of cluster names. If a
        chromosome appears on several lines, the last line wins.
    """
    results: dict[str, list[str]] = {}

    with open(path, "r") as f:
        for line in f:
            words = [w.strip() for w in line.split("\t")]
            chromosome = words[0]

            # A blank line (or one with an empty first field) names no
            # chromosome; recording it under "" would only break the later
            # per-chromosome file lookup.
            if not chromosome:
                continue

            # Drop empty fields, e.g. the one produced by a trailing tab.
            results[chromosome] = [w for w in words[1:] if w]

    return results

# Chromosome -> names of the clusters listed for it in the enrichment file.
enriched_clusters = parse_enrichment_file(ENRICHED_IN_BOTH_PATH)

In [59]:
# Reading the features.
features = pr.read_bed("../data/features/HMEC/CAGE/features.bed")

# Ontology group -> number of hits, summed over every enriched cluster.
observed_groups = {}

for chromo, clusters_names in enriched_clusters.items():
    # Load the cluster descriptions for this chromosome.
    clusters = ClustersDescription(f"../data/clusters/HMEC/{chromo}_spec_res.json", chromo)

    for cluster_name in clusters_names:
        # Resolve the name to the actual cluster object.
        cluster = clusters[cluster_name]

        # Names of the features overlapping this cluster.
        overlaps = cluster.find_overlaps(chromo, features)
        overlaps_names = overlaps.Name.values.flatten()

        # Each cluster is tallied on its own, so a feature returned for
        # several clusters is counted once per cluster.
        groups = get_groups_for_features(overlaps_names, peak_to_entrez, gene_ontology)

        # Fold this cluster's counts into the running totals.
        for group, count in groups.items():
            observed_groups[group] = observed_groups.get(group, 0) + count

In [71]:
# performing hypergeometric test
from scipy.stats import hypergeom
from tqdm import tqdm

# NOTE(review): scipy's hypergeom.sf(k, M, n, N) takes M = population size,
# n = number of success states in the population and N = number of draws.
# The fourth argument passed below is `population - expected` (the number of
# population members outside the group), not a number of draws. The number
# of sampled features is not tracked in the visible cells, so the computation
# is left unchanged here — confirm the intended parameterization before
# trusting the p-values.
population_size = len(background)  # loop-invariant, so computed once

# group -> [pvalue, observed, expected]
pvalues = {}

for group, expected in tqdm(expected_groups.items()):
    # Groups never seen in the enriched clusters count as zero observations.
    observed = observed_groups.get(group, 0)
    # sf(observed - 1) is P(X >= observed).
    pvalue = hypergeom.sf(observed - 1, population_size, expected, population_size - expected)
    pvalues[group] = [pvalue, observed, expected]

100%|██████████| 7645/7645 [03:03<00:00, 41.61it/s] 


In [74]:
# (group, [pvalue, observed, expected]) pairs ordered by ascending p-value.
# sorted() returns a list of the dict's items, not a dict.
sorted_pvalues: list[tuple[str, list]] = sorted(pvalues.items(), key=lambda x: x[1][0])

In [79]:
# Transform the sorted p-values into a dataframe.
import pandas as pd

# Build every row first and construct the frame once: growing a DataFrame
# row by row with .loc re-allocates on each insert and leaves object-typed
# columns.
pvalues_df = pd.DataFrame(
    [[group, *stats] for group, stats in sorted_pvalues],
    columns=["group", "pvalue", "observed", "expected"],
)

In [80]:
# Display the results table (rows are in ascending p-value order).
pvalues_df

Unnamed: 0,group,pvalue,observed,expected
0,GOBP_PEPTIDYL_ARGININE_METHYLATION_TO_ASYMMETR...,0.0,20,19
1,GOBP_CONSTITUTIVE_SECRETORY_PATHWAY,0.0,18,16
2,GOBP_MENAQUINONE_METABOLIC_PROCESS,0.0,32,11
3,GOBP_VITAMIN_K_METABOLIC_PROCESS,0.0,32,22
4,GOBP_PROTECTION_FROM_NON_HOMOLOGOUS_END_JOININ...,0.0,36,30
...,...,...,...,...
7640,GOBP_VESTIBULOCOCHLEAR_NERVE_FORMATION,1.0,0,4
7641,GOBP_POSITIVE_REGULATION_OF_CIRCADIAN_SLEEP_WA...,1.0,0,1
7642,GOBP_PLASMA_CELL_DIFFERENTIATION,1.0,0,21
7643,GOBP_GASTRIC_MOTILITY,1.0,0,1
