In [1]:
import pandas as pd
import pysam
import numpy as np
import akita_utils
import h5py
from scipy import stats

from akita_utils.utils import ut_dense
from akita_utils.format_io import h5_to_df

import matplotlib.pyplot as plt
import seaborn as sns

## Getting all the scores in one place

In [2]:
def add_scores(stats, h5_path, df_to_extend=None, model_index=None):
    """Read scores from an HDF5 file and tag their columns with the model index.

    The statistics are loaded with ``h5_to_df(h5_path, stats)`` and each
    requested column is renamed to ``<name>_m<model_index>``. Because "SCD" is
    produced by both experiment types, it is renamed to ``boSCD`` when it is
    the only requested statistic (treated as a boundary experiment) and to
    ``dotSCD`` otherwise (treated as a dot experiment).

    Parameters
    ----------
    stats : list of str
        Names of the statistics to load. Column names are derived from this
        argument. NOTE: inside this function the name shadows the
        module-level ``from scipy import stats`` import.
    h5_path : str
        "/"-separated path to the HDF5 file. When ``model_index`` is not
        given, the name of the file's parent directory must end with the
        model index (e.g. ``.../dots_all_motifs_m1/STATS_OUT.h5``).
    df_to_extend : pandas.DataFrame, optional
        If given, the renamed score columns are written into this frame
        IN PLACE and the same object is returned.
    model_index : int, optional
        Model index used in the column names; parsed from ``h5_path`` if None.

    Returns
    -------
    pandas.DataFrame
        ``df_to_extend`` with the new columns added, or the renamed frame
        returned by ``h5_to_df`` when ``df_to_extend`` is not a DataFrame.

    Raises
    ------
    ValueError
        If the model index cannot be parsed from ``h5_path``, or if the loaded
        scores and ``df_to_extend`` have a different number of rows.
    """
    if model_index is None:
        # It is assumed that the name of the directory ends with "_m{model_index}".
        # Use the whole run of trailing digits so that indices >= 10 are not
        # truncated to their last digit.
        dir_name = h5_path.split("/")[-2]
        digits = dir_name[len(dir_name.rstrip("0123456789")):]
        if not digits:
            raise ValueError(
                f"Cannot infer the model index from directory name {dir_name!r}; "
                "pass model_index explicitly."
            )
        model_index = int(digits)

    # A lone "SCD" marks a boundary experiment; anything else is a dot experiment.
    is_boundary_experiment = len(stats) == 1 and stats[0] == "SCD"
    scd_prefix = "bo" if is_boundary_experiment else "dot"

    # Names are built from the `stats` argument (not from any notebook global),
    # so the mapper always matches the columns that were actually requested.
    column_names = [
        f"{scd_prefix}SCD_m{model_index}" if stat == "SCD" else f"{stat}_m{model_index}"
        for stat in stats
    ]
    column_mapper = dict(zip(stats, column_names))

    df = h5_to_df(h5_path, stats).rename(columns=column_mapper)

    if isinstance(df_to_extend, pd.DataFrame):
        # Column assignment aligns on the index; a row-count mismatch would
        # silently yield NaNs or dropped rows, so fail loudly instead.
        if len(df) != len(df_to_extend):
            raise ValueError(
                f"Row count mismatch: {h5_path} has {len(df)} rows, "
                f"df_to_extend has {len(df_to_extend)}."
            )
        for column_name in column_names:
            df_to_extend[column_name] = df[column_name]
        return df_to_extend

    return df

In [3]:
# Locations of the STATS_OUT.h5 score files, keyed first by model index
# (as a string) and then by experiment type.
summary_dict = {}
summary_dict["1"] = dict(
    dot_path="/scratch2/smaruj/all_dots_vs_boundaries/dots_all_motifs_m1/STATS_OUT.h5",
    boundary_path="/scratch2/smaruj/all_dots_vs_boundaries/boundaries_all_motifs_m1/STATS_OUT.h5",
)
summary_dict["2"] = dict(
    dot_path="/scratch2/smaruj/all_dots_vs_boundaries/dots_all_motifs_m2/STATS_OUT.h5",
    boundary_path="/scratch2/smaruj/all_dots_vs_boundaries/boundaries_all_motifs_m2/STATS_OUT.h5",
)

In [4]:
# Statistics requested from the dot-experiment files.
dot_stats = [
    "SCD",
    "dot-score",
    "cross-score",
    "x-score",
]

# Statistics requested from the boundary-experiment files; a lone "SCD" is
# what add_scores uses to recognise a boundary experiment.
boundary_stats = ["SCD"]

In [5]:
# Collect the dot and boundary scores of every model listed in summary_dict
# into a single frame. Iterating over summary_dict (instead of spelling out
# models "1" and "2") keeps this cell in sync with the averaging loop further
# down, which also iterates over summary_dict.
df = None  # the first add_scores call creates the frame; later calls extend it
for model_paths in summary_dict.values():
    df = add_scores(dot_stats, model_paths["dot_path"], df_to_extend=df)
    df = add_scores(boundary_stats, model_paths["boundary_path"], df_to_extend=df)

In [None]:
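# Optional export of the per-background (not yet averaged) scores; left disabled.
# NOTE(review): this path starts at "./ctcf_tsv/", whereas the other cells use
# "./../ctcf_tsv/" - confirm the intended location before re-enabling.
# df.to_csv("./ctcf_tsv/filtered_base_mouse_ctcf_scored.tsv", sep = "\t", index=False)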

## Averaging over background sequences

In [7]:
# Score columns (without the "_m{model}" suffix) that get averaged.
col_names = [
    "boSCD",
    "dotSCD",
    "dot-score",
    "cross-score",
    "x-score",
]

# Table that receives the background-averaged score columns.
summary_df = pd.read_csv("./../ctcf_tsv/filtered_base_mouse_ctcf.tsv", sep="\t")
nr_sites = summary_df.shape[0]

# Distinct background sequences present in the scored frame.
background_indices = list(df["background_index"].unique())

In [8]:
# Average every score column over the background sequences.
# The rows of each background are matched to summary_df purely by position,
# so each background must contribute exactly nr_sites rows, in summary_df order.

# The row selection of a background does not depend on model or score, so it
# is computed once here instead of once per score column.
background_masks = [df["background_index"] == background_i for background_i in background_indices]

for model in summary_dict:
    print("Averaging over backgrounds for model", model)

    for score in col_names:
        score_column = f"{score}_m{model}"
        print("\t-", score_column)

        # float64 accumulator, one slot per site
        summary_score_array = np.zeros((nr_sites, ))

        for background_i, background_mask in zip(background_indices, background_masks):
            background_scores = df.loc[background_mask, score_column].to_numpy()
            # Without this check a one-row background would be broadcast
            # silently across all sites.
            if background_scores.shape != summary_score_array.shape:
                raise ValueError(
                    f"Background {background_i} has {background_scores.shape[0]} rows "
                    f"for {score_column}, expected {nr_sites}."
                )
            summary_score_array += background_scores

        summary_df[score_column] = summary_score_array / len(background_indices)

Averaging over backgrounds for model 1
	- boSCD_m1
	- dotSCD_m1
	- dot-score_m1
	- cross-score_m1
	- x-score_m1
Averaging over backgrounds for model 2
	- boSCD_m2
	- dotSCD_m2
	- dot-score_m2
	- cross-score_m2
	- x-score_m2


In [10]:
# Display the site table with the averaged score columns appended
# (pandas abbreviates the rendering of large frames).
summary_df

Unnamed: 0,boundary_index,chrom,boundary_end,index,num_ctcf,span,boundary_start,strand,start,end,...,boSCD_m1,dotSCD_m1,dot-score_m1,cross-score_m1,x-score_m1,boSCD_m2,dotSCD_m2,dot-score_m2,cross-score_m2,x-score_m2
0,0,chr1,4410000,0,2,4403267-4403286,4400000,-,4403267,4403286,...,1.837402,2.558350,0.141705,-0.000067,-0.000417,2.757031,2.637695,0.092927,-0.000032,-0.000762
1,1,chr1,4780000,4,10,4770055-4770074,4770000,+,4770055,4770074,...,57.490625,56.521875,10.389062,0.614551,0.839331,45.271875,49.806250,9.035352,0.557190,0.744580
2,1,chr1,4780000,5,10,4770180-4770199,4770000,-,4770180,4770199,...,0.183185,0.171674,0.160630,0.000159,-0.000174,0.169409,0.148956,0.122577,0.000388,0.000477
3,1,chr1,4780000,6,10,4770867-4770886,4770000,+,4770867,4770886,...,0.220569,0.207544,0.160944,0.000183,-0.000135,0.165839,0.167371,0.122313,0.000371,0.000481
4,1,chr1,4780000,7,10,4773435-4773454,4770000,+,4773435,4773454,...,0.426611,0.456836,0.158552,0.000076,-0.000258,0.195990,0.303076,0.118950,0.000174,0.000198
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7555,4472,chr19,59390000,26376,10,59388426-59388445,59380000,+,59388426,59388445,...,0.310925,0.230750,0.161167,0.000260,-0.000022,0.275305,0.280493,0.124078,0.000524,0.000648
7556,4472,chr19,59390000,26377,10,59388676-59388695,59380000,-,59388676,59388695,...,0.570312,0.512500,0.158622,0.000161,-0.000126,0.318506,0.485669,0.117799,0.000205,0.000126
7557,4472,chr19,59390000,26378,10,59389390-59389409,59380000,+,59389390,59389409,...,43.628125,43.987500,5.834082,0.360199,0.480334,35.562500,40.651562,5.365820,0.347778,0.455627
7558,4473,chr19,59780000,26381,3,59770157-59770176,59770000,-,59770157,59770176,...,0.272974,0.292395,0.160394,0.000164,-0.000118,0.248724,0.175800,0.121094,0.000380,0.000473


In [11]:
# Write the site table with the averaged scores as a tab-separated file,
# without the DataFrame index.
summary_df.to_csv(
    "./../ctcf_tsv/filtered_base_mouse_ctcf_scored_and_averaged.tsv",
    sep="\t",
    index=False,
)