# File input and parsing.

In [6]:
from collections import Counter
from collections.abc import Iterable

import pyranges as pr

from src.cluster_description import ClustersDescription
from src.clus_files_io import parse_clus_file

# Path of the tab-separated table giving the peak type of each feature id.
# NOTE(review): this path starts with "./" whereas the other inputs read in this
# notebook are addressed with "../" — confirm it resolves from the working directory.
FEATURE_TYPE_PATH = "./data/FANTOM5_cage_peak_type_tbl.tsv"

### Parses the file containing the peak types for each feature and returns a dictionary with the feature id as key and the feature type as value
### Like this:
### {
###    "peak_1": "tss",
###    "peak_2": "tss",
###    "peak_3": "enhancer",
###    "peak_4": "tss",
###     ...
### }
def parse_peak_type(path : str) -> dict[str, str]:
    """
    Parse the peak type table.

    The file is read as tab-separated text. The first line is always treated
    as a header and discarded. Lines starting with "#" and blank (or
    whitespace-only) lines are ignored. For every remaining line the first
    column is used as the feature id and the second column as its peak type;
    any further columns are ignored. If an id occurs more than once, the last
    occurrence wins.

    Parameters
    ----------
    path : str
        Path of the peak type table.

    Returns
    -------
    dict[str, str]
        Mapping from feature id to peak type.

    Raises
    ------
    IndexError
        If a non-blank, non-comment line has fewer than two columns.
    """
    peak_type = {}
    with open(path, "r") as f:
        # Consume the first line to skip the header; the default keeps an
        # empty file from raising StopIteration.
        next(f, None)

        # Loop over the data lines, streaming them instead of loading the
        # whole file in memory.
        for line in f:
            if line.startswith("#"):
                continue

            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing newline at the end of the file)
                # carry no record; indexing their columns would fail.
                continue

            fields = stripped.split("\t")
            peak_type[fields[0]] = fields[1]
    return peak_type

In [7]:
# Load the feature id -> peak type lookup table from FEATURE_TYPE_PATH.
peak_types = parse_peak_type(FEATURE_TYPE_PATH)

In [8]:
# Path of the file listing the clusters enriched in both CAGE and CTCF.
CLUSTERS_IN_ANALYSIS_PATH = "../analysis_results/intersected.CLUS"

# Read the clusters in analysis. The result is consumed further down through
# `.items()` as (chromosome, names of the clusters on that chromosome) pairs.
clusters_in_analysis = parse_clus_file(CLUSTERS_IN_ANALYSIS_PATH)

In [9]:
def count_peak_types(feature_names : Iterable[str], peak_types : dict[str, str]) -> dict[str, int]:
    """
    Count the number of peaks of each type.

    Parameters
    ----------
    feature_names : Iterable[str]
        Names of the features to count (any iterable of names, e.g. a list or
        an array). Every occurrence of a name is counted, so duplicates in the
        input contribute more than once.
    peak_types : dict[str, str]
        Mapping from feature name to peak type.

    Returns
    -------
    dict[str, int]
        Number of features per peak type, keyed in order of first appearance.
        Features missing from `peak_types`, or mapped to an empty string, are
        not counted.
    """
    # Unknown features are looked up as "" so that they can be dropped together
    # with the features whose type is explicitly empty.
    looked_up_types = (peak_types.get(feature_name, "") for feature_name in feature_names)
    counts = Counter(peak_type for peak_type in looked_up_types if peak_type != "")

    # Callers expect a plain dict, not a Counter.
    return dict(counts)

In [10]:
# Reading the features.
features = pr.read_bed("../data/features/HMEC/CAGE/features.bed")

# Names of the features overlapping at least one cluster in analysis.
# A set is used so that a feature overlapping several clusters is kept only
# once, without rebuilding a de-duplicated list after every cluster.
overlapping_feature_names = set()

# For each chromosome and the names of its clusters in analysis
for chromo, clusters_names in clusters_in_analysis.items():
    # Reading the description of the clusters of this chromosome.
    clusters = ClustersDescription(f"../data/clusters/HMEC/{chromo}_spec_res.json", chromo)

    # Looping over the cluster names (strings)
    for cluster_name in clusters_names:

        # Getting the actual cluster object
        cluster = clusters[cluster_name]

        # Finding the features overlapping this cluster and recording their names
        overlaps = cluster.find_overlaps(chromo, features)
        overlapping_feature_names.update(overlaps.Name.values.flatten())

# the de-duplicated list of overlapping features (features overlapping with any cluster)
overlapping_features = list(overlapping_feature_names)

# counting the peak types
peak_type_counts = count_peak_types(overlapping_features, peak_types)

In [11]:
# Display the number of cluster-overlapping features per peak type.
peak_type_counts

{'tss': 4501, 'enhancer': 582}

In [12]:
# Ratio of TSS peaks to enhancer peaks among the cluster-overlapping features.
# NOTE(review): the output recorded with this cell (0.1293...) equals 582/4501,
# i.e. enhancer/tss for the counts displayed by the previous cell, not the
# tss/enhancer ratio computed here. The output looks stale — re-run the
# notebook, or confirm which direction of the ratio is intended.
peak_type_counts["tss"]/peak_type_counts["enhancer"]

0.1293045989780049

In [13]:
# Background: peak type counts over every feature read from the BED file,
# followed by the same tss/enhancer ratio as for the overlapping features.
# NOTE(review): the recorded output (0.2328...) is below 1; if TSS peaks also
# outnumber enhancers in the background, that value would be enhancer/tss rather
# than the tss/enhancer computed here — re-run to confirm.
background_peak_type_counts = count_peak_types(features.Name.values.flatten(), peak_types)
background_peak_type_counts["tss"]/background_peak_type_counts["enhancer"]

0.23283343383760258

In [14]:
# Build the 2x2 contingency table: rows are the peak types (enhancer, tss),
# columns are the cluster-overlapping features and the full feature set.
# NOTE(review): the second column is counted over every feature in `features`,
# which presumably includes the overlapping features of the first column, so
# the two columns are not disjoint samples — confirm this is acceptable for
# chi2_contingency, or use the non-overlapping features as the second column.
# NOTE(review): this import sits mid-notebook; consider moving it to the import cell.
from scipy.stats import chi2_contingency
contingency_table = [[peak_type_counts["enhancer"], background_peak_type_counts["enhancer"]],
                        [peak_type_counts["tss"], background_peak_type_counts["tss"]]]

# Perform the chi-squared test on the contingency table.
chi2, p, dof, expected = chi2_contingency(contingency_table)

In [18]:
# Report the outcome of the chi-squared test as a small plain-text block,
# one item per line, framed by two separator lines.
print(
    "Chi-squared test for enrichment of CAGE peaks in FANTOM5",
    "-------------------------------------------------------",
    f"Chi-squared statistic: {chi2}",
    f"p-value: {p}",
    f"dof: {dof}",
    "-------------------------------------------------------",
    sep="\n",
)

Chi-squared test for enrichment of CAGE peaks in FANTOM5
-------------------------------------------------------
Chi-squared statistic: 176.32891322392476
p-value: 3.0691088464711664e-40
dof: 1
-------------------------------------------------------
