# General Classes for Parsing.
The following are the classes which are used to parse and filter the result tables produced by the analysis.

In [3]:
import os, json
import pandas as pd
import enum 
import statsmodels.stats.multitest as sm
from src.utils import sort_by_chromosome, RESOLUTIONS
from src .cluster_description import Cluster, ClustersDescription

@enum.unique
class ValueKeys(enum.Enum):
    """Names of the columns found in the results files."""

    CHROMOSOME = "chromosome"
    NAME = "name"
    P_VALUE = "p_value"
    Q_VALUE = "q_value"

@enum.unique
class FDRMode(enum.Enum):
    """Scope of the FDR correction.

    PER_CHROMOSOME corrects each chromosome separately; WHOLE_GENOME
    corrects the whole genome at the same time.
    """

    PER_CHROMOSOME = "per_chromosome"
    WHOLE_GENOME = "whole_genome"

@enum.unique
class PlotLibrary(enum.Enum):
    """Plotting library to use when drawing figures.

    Only PLOTLY is handled by ResultsTable.plot_chromosome_counts at the
    moment; the other members raise NotImplementedError there.
    """

    SEABORN = "seaborn"
    MATPLOTLIB = "matplotlib"
    PLOTLY = "plotly"

@enum.unique
class BedSerializationMode(enum.Enum):
    """How to serialize the BED file representing the clusters.

    MULTIPLE_CLUSTERS: a single track contains multiple clusters.
    SINGLE_CLUSTER: a single track contains a single cluster, so the file
    contains multiple tracks.
    """

    MULTIPLE_CLUSTERS = "multiple_clusters"
    SINGLE_CLUSTER = "single_cluster"

class ResultsTable:
    """A single results table: a DataFrame plus the path it was read from.

    Every filtering/correction method returns a new ResultsTable and leaves
    the receiver untouched, so calls can be chained.
    """

    # Path of the file the table was loaded from (carried over to derived tables).
    path : str
    # The table itself; the methods below read the "name", "chromosome",
    # "p_value" and "q_value" columns.
    df : pd.DataFrame

    def __init__(self, path, df):
        self.path = path
        self.df = df

    @staticmethod
    def from_file(path) -> "ResultsTable":
        """Read a tab-separated results file and add a "resolution" column.

        The resolution of each row is looked up in RESOLUTIONS using the part
        of the "name" column that precedes the first underscore.

        Raises:
            KeyError: if a name prefix is not a key of RESOLUTIONS.
        """
        df = pd.read_csv(path, sep="\t")
        # add a column for the resolution
        df["resolution"] = df["name"].apply(lambda x: RESOLUTIONS[x.split("_")[0]])
        return ResultsTable(path, df)

    # Utilities for querying the results and filtering them.
    def where(self, key : str, *, less_than=None, greater_than=None, is_in=None,equals=None, not_equals=None, ) -> "ResultsTable":
        """Return a new table keeping only the rows whose `key` column passes every given condition.

        Conditions left as None are ignored; the supplied ones are combined
        with a logical AND. With no condition at all a copy of the table is
        returned.

        Args:
            key: name of the column to test.
            less_than: keep rows where the column is strictly smaller.
            greater_than: keep rows where the column is strictly greater.
            is_in: keep rows whose value is contained in this collection.
            equals: keep rows equal to this value.
            not_equals: keep rows different from this value.
        """
        _df = self.df.copy()
        if less_than is not None:
            _df = _df[_df[key] < less_than]
        if greater_than is not None:
            _df = _df[_df[key] > greater_than]
        if is_in is not None:
            _df = _df[_df[key].isin(is_in)]
        if equals is not None:
            _df = _df[_df[key] == equals]
        if not_equals is not None:
            _df = _df[_df[key] != not_equals]
        return ResultsTable(self.path, _df)

    @property
    def names(self):
        """The "name" column as a list."""
        return self.df["name"].tolist()

    @property
    def size(self):
        """Number of rows in the table."""
        return len(self.df)

    ### Applies FDR
    def apply_fdr(self, mode : FDRMode) -> "ResultsTable":
        """Return a new table whose "q_value" column holds FDR-corrected p-values.

        Args:
            mode: FDRMode.PER_CHROMOSOME corrects the p-values of each
                chromosome independently; FDRMode.WHOLE_GENOME corrects all
                rows together.

        Raises:
            ValueError: if `mode` is not one of the two supported modes.
        """
        _df = self.df.copy()

        if mode == FDRMode.PER_CHROMOSOME:
            # Correct each chromosome on its own, writing the q-values back in place.
            for chromosome in _df["chromosome"].unique():
                on_chromosome = _df["chromosome"] == chromosome
                # fdrcorrection returns (rejected, corrected p-values); only the latter is kept.
                _df.loc[on_chromosome, "q_value"] = sm.fdrcorrection(_df.loc[on_chromosome, "p_value"], alpha=0.05)[1]
        elif mode == FDRMode.WHOLE_GENOME:
            _df["q_value"] = sm.fdrcorrection(_df["p_value"], alpha=0.05)[1]
        else:
            # Previously an unknown mode fell through and returned None.
            raise ValueError(f"Unsupported FDR mode: {mode!r}")

        return ResultsTable(self.path, _df)

    ### plots the dataframe:
    def plot_chromosome_counts(self, key : str, plot_library : PlotLibrary, *, title=None, xlabel=None, ylabel=None, legend=True, **kwargs):
        """Build a bar chart with the number of non-null `key` entries per chromosome.

        Args:
            key: column counted within each chromosome group.
            plot_library: plotting library; only PlotLibrary.PLOTLY is implemented.
            title, xlabel, ylabel: figure title and axis titles.
            legend, **kwargs: accepted but currently unused.

        Returns:
            The plotly Figure.

        Raises:
            NotImplementedError: for any library other than PLOTLY.
        """
        if plot_library != PlotLibrary.PLOTLY:
            raise NotImplementedError("Plotting library not implemented")

        # Imported lazily so plotly is only required when a plot is requested.
        import plotly.graph_objs as go

        # plots an histogram of the entries for each chromosome
        counts = self.df.groupby("chromosome")[key].count().to_dict()
        counts = sort_by_chromosome(counts)

        fig = go.Figure()
        fig.add_trace(go.Bar(x=list(counts.keys()), y=list(counts.values()), name="Counts"))
        fig.update_layout(title=title, xaxis_title=xlabel, yaxis_title=ylabel, legend_title_text="Chromosome")
        return fig

    ### Writes the bed to a file.
    def write_bed(self, path  :str, clusters : ClustersDescription, mode : BedSerializationMode = BedSerializationMode.MULTIPLE_CLUSTERS):
        """Write the bins of every cluster listed in the table to `path`.

        For each row the cluster is looked up in `clusters` by name; every
        bin becomes one line whose start is the bin and whose end is the bin
        plus the cluster resolution.

        Args:
            path: output file.
            clusters: description of the clusters of a single chromosome.
            mode: MULTIPLE_CLUSTERS writes one headerless table with the
                columns chromosome, start, end, name. Any other value writes
                one "track" header line per cluster followed by its
                chromosome, start, end lines.

        Raises:
            ValueError: if a row belongs to a chromosome other than
                `clusters.chromosome` (filter the table beforehand).
        """
        if mode == BedSerializationMode.MULTIPLE_CLUSTERS:
            chromosomes = []
            starts = []
            ends = []
            names = []

            for row in self.df.itertuples():
                chromo = row.chromosome
                # Explicit raise instead of assert so the check survives `python -O`.
                if chromo != clusters.chromosome:
                    raise ValueError("Chromosome mismatch, consider filtering before")

                cluster = clusters[row.name]
                bins = cluster.bins
                _ends = [s + int(cluster.resolution) for s in bins]

                chromosomes.extend([chromo] * len(bins))
                starts.extend(bins)
                ends.extend(_ends)
                names.extend([row.name] * len(bins))

            __df = pd.DataFrame({"chromosome": chromosomes, "start": starts, "end": ends, "name": names})
            __df.to_csv(path, sep="\t", index=False, header=False)

        else:
            with open(path, "w") as f:
                for row in self.df.itertuples():
                    chromo = row.chromosome
                    if chromo != clusters.chromosome:
                        raise ValueError("Chromosome mismatch, consider filtering before")

                    cluster = clusters[row.name]
                    bins = cluster.bins
                    resolution = int(cluster.resolution)

                    # Write the track name and the description, which contains the name of the cluster and the resolution
                    f.write(f"track name={row.name} description=\"{row.name}_{cluster.resolution}\"\n")

                    for s in bins:
                        f.write(f"{chromo}\t{s}\t{s + resolution}\n")

    ### Intersect 2 results tables.
    def intersect(self, other : "ResultsTable") -> "ResultsTable":
        """Return the rows whose ("name", "chromosome") pair appears in both tables.

        The tables are merged on those two columns; the result keeps this
        table's path.
        """
        return ResultsTable(self.path, self.df.merge(other.df, on=["name", "chromosome"]))

    ### Writes the table to a CLUS file like this:
    ### chr1   cluster1_name  cluster2_name cluster3_name ...
    ### chr2   cluster1_name  cluster2_name cluster3_name ...
    def write_clus_file(self, path : str):
        """Write one tab-separated line per chromosome: the chromosome followed by its cluster names.

        Chromosomes are written in order of first appearance in the table.
        """
        with open(path, "w") as f:
            for chromosome in self.df["chromosome"].unique():
                _df = self.df.loc[self.df["chromosome"] == chromosome]
                # Joined outside the f-string: backslashes are not allowed
                # inside f-string expressions before Python 3.12.
                joined_names = "\t".join(_df["name"].tolist())
                f.write(f"{chromosome}\t{joined_names}\n")


    # for jupyter notebooks
    def __repr__(self):
        return self.df.to_string()

    def _repr_html_(self):
        return self.df.to_html()
            

# Parsing the results of the analysis
The results of the analysis are parsed from the output of the pipeline (TSV files). FDR correction is then applied, each table is filtered by q-value (q < 0.01), and the CTCF and CAGE results are intersected.

In [5]:
# Input locations and the chromosome whose cluster description is loaded
results_folder = "../results/"
CTCF_results_file = "HMEC.CTCF.tsv"
chromo = "chr16"

# Load the CTCF and CAGE result tables
ctcf_results = ResultsTable.from_file(os.path.join(results_folder, CTCF_results_file))
cage_results = ResultsTable.from_file(os.path.join(results_folder, "HMEC.CAGE.tsv"))

# FDR-correct each table (CTCF per chromosome, CAGE genome-wide) and keep q < 0.01
ctcf_results = (
    ctcf_results
    .apply_fdr(FDRMode.PER_CHROMOSOME)
    .where(ValueKeys.Q_VALUE.value, less_than=0.01)
)
cage_results = (
    cage_results
    .apply_fdr(FDRMode.WHOLE_GENOME)
    .where(ValueKeys.Q_VALUE.value, less_than=0.01)
)

# Import the cluster description from the clusters file (it contains information about the bins)
desc = ClustersDescription(f"../data/clusters/HMEC/{chromo}_spec_res.json", chromo)

# Intersect the two filtered tables
intersected = ctcf_results.intersect(cage_results)

In [6]:
# Write the intersected table as a CLUS file (one line per chromosome).
intersected.write_clus_file(
    os.path.join("../analysis_results", "intersected.CLUS")
)

# Loading and FDR-correcting the results for the CTCF enrichment

In [5]:
# Input locations
results_folder = "../results/"
CTCF_results_file = "HMEC.CTCF.tsv"

# Load the CTCF result table
ctcf_results = ResultsTable.from_file(os.path.join(results_folder, CTCF_results_file))

# FDR-correct per chromosome and keep q < 0.01
ctcf_results = (
    ctcf_results
    .apply_fdr(FDRMode.PER_CHROMOSOME)
    .where(ValueKeys.Q_VALUE.value, less_than=0.01)
)

# Write the filtered table as a CLUS file (one line per chromosome).
ctcf_results.write_clus_file(
    os.path.join("../analysis_results", "ctcf.CLUS")
)