In [1]:
import pandas as pd
import pysam
import numpy as np
import akita_utils
import h5py
from scipy import stats

from akita_utils.utils import ut_dense
from akita_utils.format_io import h5_to_df

import matplotlib.pyplot as plt
import seaborn as sns

## Getting all the scores in one place

In [2]:
def _model_index_from_path(h5_path):
    """Return the trailing integer of the last "_"-separated token of the file's parent directory name."""
    path_parts = h5_path.split("/")
    if len(path_parts) < 2:
        raise ValueError(
            f"cannot infer the model index: {h5_path!r} has no parent directory"
        )

    # e.g. "dots_all_motifs_m12" -> "m12" -> "12"
    dir_suffix = path_parts[-2].split("_")[-1]
    digits = dir_suffix[len(dir_suffix.rstrip("0123456789")):]
    if not digits:
        raise ValueError(
            f"cannot infer the model index: directory {path_parts[-2]!r} "
            "does not end with '_m{model_index}'"
        )
    return int(digits)


def add_scores(stats, h5_path, df_to_extend=None, model_index=None):
    """
    Read the requested statistics from an HDF5 file and return them as model-specific columns.

    The statistics are loaded with `h5_to_df` and renamed so that the experiment type and the
    model index are encoded in the column name:
    - if `stats` is exactly ["SCD"], the file is treated as a boundary experiment and the
      column is named "boSCD_m{model_index}";
    - otherwise the file is treated as a dot experiment: "SCD" becomes "dotSCD_m{model_index}"
      and every other statistic becomes "{stat}_m{model_index}".

    Parameters:
    - stats (list of str): Names of statistics to be extracted from the HDF5 file.
      Note: inside this function the name shadows the module-level `scipy.stats` import.
    - h5_path (str): Path to the HDF5 file containing the statistics data.
    - df_to_extend (pd.DataFrame, optional): DataFrame to which the renamed columns are added.
      It is modified in place and returned. If it is not a DataFrame (e.g. None), the freshly
      loaded DataFrame is returned instead. Default is None.
    - model_index (int, optional): Index of the model used for the experiment. If None, it is
      parsed from the name of the directory containing the file, which is assumed to end with
      "_m{model_index}". Default is None.

    Returns:
    pd.DataFrame: `df_to_extend` with the new columns, or the newly loaded DataFrame.

    Raises:
    ValueError: if `model_index` is None and it cannot be parsed from `h5_path`.
    """
    if model_index is None:
        model_index = _model_index_from_path(h5_path)

    # Creating column names based on experiment type and model index
    if len(stats) == 1 and stats[0] == "SCD":
        # it's boundary experiment
        column_names = [f"boSCD_m{model_index}"]
    else:
        # it's dot experiment; names are derived from the `stats` argument itself,
        # not from any notebook-level variable
        column_names = [
            f"dotSCD_m{model_index}" if stat == "SCD" else f"{stat}_m{model_index}"
            for stat in stats
        ]

    # Mapping statistics to column names
    column_mapper = dict(zip(stats, column_names))

    # Extract statistics from the HDF5 file and rename columns
    df = h5_to_df(h5_path, stats)
    df = df.rename(columns=column_mapper)

    # Add extracted statistics to the input DataFrame or return the new DataFrame
    if isinstance(df_to_extend, pd.DataFrame):
        for column_name in column_names:
            # column assignment aligns the new values on the DataFrame index
            df_to_extend[column_name] = df[column_name]
        return df_to_extend
    return df

In [3]:
# Locations of the per-model score files, keyed by the model index (as a string).
# Each entry holds the HDF5 output of the dot experiment and of the boundary experiment.
summary_dict = {
    "1": {
        "dot_path": "/scratch2/smaruj/all_dots_vs_boundaries/dots_all_motifs_m1/STATS_OUT.h5",
        "boundary_path": "/scratch2/smaruj/all_dots_vs_boundaries/boundaries_all_motifs_m1/STATS_OUT.h5",
    },
    "2": {
        "dot_path": "/scratch2/smaruj/all_dots_vs_boundaries/dots_all_motifs_m2/STATS_OUT.h5",
        "boundary_path": "/scratch2/smaruj/all_dots_vs_boundaries/boundaries_all_motifs_m2/STATS_OUT.h5",
    },
}

In [4]:
# Statistics read from the dot-experiment files.
dot_stats = [
    "SCD",
    "dot-score",
    "cross-score",
    "x-score",
]
# Only SCD is read from the boundary-experiment files.
boundary_stats = ["SCD"]

In [5]:
# Collect dot and boundary scores of every model in `summary_dict` into one DataFrame.
# The first call starts from scratch (df_to_extend=None); every later call adds its
# columns to the same DataFrame. Models are processed in the dictionary's order,
# dot scores first, then boundary scores.
df = None
for model_key, model_paths in summary_dict.items():
    df = add_scores(dot_stats, model_paths["dot_path"], df_to_extend=df)
    df = add_scores(boundary_stats, model_paths["boundary_path"], df_to_extend=df)

In [None]:
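# Optional: uncomment to save the combined per-background scores as a TSV.
# NOTE: this path starts at "./ctcf_tsv/", while the other cells read/write under "./../ctcf_tsv/" — confirm the intended directory.
# df.to_csv("./ctcf_tsv/filtered_base_mouse_ctcf_scored.tsv", sep = "\t", index=False)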

## Averaging over background sequences

In [6]:
# Table that receives the background-averaged scores, one column per score and model.
# assumes one row per CTCF site, in the same order as the sites within each background of `df` — TODO confirm
summary_df = pd.read_table(
    "./../ctcf_tsv/filtered_base_mouse_ctcf.tsv",
    sep="\t",
)
nr_sites = summary_df.shape[0]

# Distinct background identifiers, in order of first appearance in `df`.
background_indices = list(df["background_index"].unique())

# Score-name prefixes; the full column name is "{prefix}_m{model}".
col_names = [
    "boSCD",
    "dotSCD",
    "dot-score",
    "cross-score",
    "x-score",
]

In [7]:
# Average every score column over the background sequences and store the result in `summary_df`.
#
# The rows belonging to each background do not depend on the model or the score,
# so their positions are located once instead of re-filtering `df` for every column.
background_values = df["background_index"].to_numpy()
rows_by_background = [
    np.flatnonzero(background_values == background_i)
    for background_i in background_indices
]

# Every background must contribute exactly one value per site; otherwise the
# element-wise sum below would fail or, for a single-row background, silently broadcast.
for background_i, background_rows in zip(background_indices, rows_by_background):
    if len(background_rows) != nr_sites:
        raise ValueError(
            f"background {background_i} has {len(background_rows)} rows, expected {nr_sites}"
        )

for model in summary_dict:
    print("Averaging over backgrounds for model", model)

    for score in col_names:
        score_column = f"{score}_m{model}"
        print("\t-", score_column)

        # Sum the score over backgrounds, position by position (rows are matched by
        # their order within each background), then divide by the number of backgrounds.
        score_values = df[score_column].to_numpy()
        summary_score_array = np.zeros((nr_sites, ))
        for background_rows in rows_by_background:
            summary_score_array += score_values[background_rows]

        summary_df[score_column] = summary_score_array / len(background_indices)

Averaging over backgrounds for model 1
	- boSCD_m1
	- dotSCD_m1
	- dot-score_m1
	- cross-score_m1
	- x-score_m1
Averaging over backgrounds for model 2
	- boSCD_m2
	- dotSCD_m2
	- dot-score_m2
	- cross-score_m2
	- x-score_m2


In [9]:
# summary_df

In [None]:
# Optional: uncomment to save the background-averaged scores as a TSV.
# summary_df.to_csv("./../ctcf_tsv/filtered_base_mouse_ctcf_scored_and_averaged.tsv", sep = "\t", index=False)