In [1]:
import collections
import numpy as np
import os
import pandas as pd
import utils

In [2]:
# --- Run configuration -------------------------------------------------------
# Dataset to process, chromosomes to load, and whether outputs are written
dataset = "test_data"
chromosomes = ["chr22"]
save_files = True

# CM-calling method; it is used to build input/output paths and file names
method = "phm"
method_name = "phm"
# Threshold forwarded to utils.get_cm_tracks_content when CM tracks are read
pp_threshold = 0.8

# Set use_tad_ab_info = True to also use TAD and A/B compartment information.
# The annotation data has to be prepared in the expected format beforehand
# (see Jupyter notebooks in Chromatin_modules/genome_annotations/TADs_AB_compartments)
use_tad_ab_info = False

In [3]:
# Root directory of the project; every path below is derived from it
core_path = "/data/pushkare/Chromatin_modules"

# --- Input data paths --------------------------------------------------------
path_to_cm_peaks = os.path.join(core_path, "2.peaks_in_CMs")
# Directory holding the overlaps between CM peaks and not-CM peaks of `method`
overlapping_peaks_path = os.path.join(
    path_to_cm_peaks,
    "peak_overlaps",
    "_".join(["overlap_of", method, "and_not", method, "peaks"]),
)
count_mtx_path = os.path.join(core_path, "test_data")
# The TAD / A-B compartment directory is only defined when it was requested
if use_tad_ab_info:
    tads_ab_path = os.path.join(core_path, "genome_annotations", "TADs_AB_compartments")

# --- Output path: input data for the simulation ------------------------------
output_data_path = os.path.join(core_path, "3.simulate_CMs", "input", method)

In [4]:
# Single-peak table, read from the simulation input directory (no header row)
single_peak_df = pd.read_csv(
    os.path.join(output_data_path, dataset + "_single_peak_ids.txt"),
    names=["chromosome", "peak_start", "peak_end", "mark", "peak_id"],
    header=None,
    sep="\t",
)

# Prepare data for CM peaks
cm_peaks_df = utils.read_bed(
    path_to_file_dir=os.path.join(path_to_cm_peaks, "peak_files"),
    file_name="_".join([dataset, method, "all_peaks.bed"]),
    columns=["chr", "peak_start", "peak_end", "peak_id", "cm_id", "strand"],
    header=None,
    sep="\t",
    add_chr=True,
    add_chr_col="chr",
)
# CM ids are rewritten with "~" in place of "_"
cm_peaks_df.loc[:, "cm_id"] = cm_peaks_df["cm_id"].str.replace("_", "~")

# Number of peaks per CM
n_peaks_per_cm_dict = cm_peaks_df.groupby("cm_id").size().to_dict()

cm_peak_coordinates_by_chr_dict = utils.get_peak_coord_in_cms_by_chr(
    cm_peaks_df=cm_peaks_df,
    cm_id_col="cm_id",
    chr_col="chr",
    peak_start_col="peak_start",
    peak_end_col="peak_end",
)
cm_peaks_dict = utils.get_peak_ids_by_cm(
    peak_df=cm_peaks_df,
    peak_id_col="peak_id",
    cm_id_col="cm_id",
)

# For every CM, store the fraction of its peaks whose id prefix (the text
# before the first ":") is "H3K27ac"; CMs without such peaks get 0.0
mark_counts_dict = {}
for ref_cm_id, cm_peaks in cm_peaks_dict.items():
    n_peaks_by_mark = collections.Counter(
        ref_peak.split(":")[0] for ref_peak in cm_peaks
    )
    mark_fractions = {
        mark: n_peaks / n_peaks_per_cm_dict[ref_cm_id]
        for mark, n_peaks in n_peaks_by_mark.items()
    }
    mark_counts_dict[ref_cm_id.replace("_", "~")] = mark_fractions.get("H3K27ac", 0.0)

# Prepare data for not CM peaks
not_cm_peaks_df = utils.read_bed(
    path_to_file_dir=os.path.join(path_to_cm_peaks, "peak_files"),
    file_name="_".join([dataset, "not", method, "peaks.bed"]),
    sep="\t",
    add_chr=True,
    add_chr_col="chr",
    columns=["chr", "peak_start", "peak_end", "peak_id", "peak_id_dup", "strand"],
    header=None,
)

# Exclude not CM peaks that overlap CM peaks.
# Column names of the overlap table. Judging by the names, columns *_1
# describe the CM peak and columns *_2 the not-CM peak of each overlapping
# pair -- TODO confirm against the script that writes this file.
peak_overlap_columns = [
    "chr_1",
    "start_1",
    "end_1",
    "pid_1",
    "cm_id_1",
    "strand_1",
    "chr_2",
    "start_2",
    "end_2",
    "pid_2",
    "pid_2_dup",
    "strand_2",
    "ovrlp_len",
]
peak_overlap_file = os.path.join(
    overlapping_peaks_path,
    "_".join([dataset, method, "not", method, "peaks_overlap.bed"]),
)
try:
    peaks_to_exclude_df = pd.read_csv(peak_overlap_file, sep="\t", header=None)
    # Assigning the names afterwards (instead of names=...) keeps the strict
    # check that the file has exactly the expected number of columns
    peaks_to_exclude_df.columns = peak_overlap_columns
except pd.errors.EmptyDataError:
    # An empty overlap file means that no peaks overlap: nothing to exclude.
    # A missing file still raises FileNotFoundError.
    peaks_to_exclude_df = pd.DataFrame(columns=peak_overlap_columns)

# Ids of all peaks (from both sides of a pair) that take part in an overlap
peaks_overlapping_cm_peaks = set(peaks_to_exclude_df.loc[:, "pid_1"]).union(
    set(peaks_to_exclude_df.loc[:, "pid_2"])
)
# Exclude not CM H3K27ac peaks that overlap not CM peaks.
# H3K27ac peaks are the ones dropped because they are broader and
# can therefore overlap more H3K4me1 peaks.
# NOTE: the .npy file stores a pickled Python object (hence allow_pickle=True
# and .item()); only load files produced by this pipeline.
k27ac_mapping_file_name = "_".join(
    [dataset, "overlapping_not", method, "peaks_k27ac_to_k4me1_mapping.npy"]
)
k27ac_peaks_ovrlp_others = np.load(
    os.path.join(
        path_to_cm_peaks,
        "peak_overlaps",
        "overlapping_marks_peak_mapping",
        method,
        k27ac_mapping_file_name,
    ),
    allow_pickle=True,
).item()

# Everything that must not be used as a not-CM peak: peaks taking part in a
# CM/not-CM overlap, the CM peaks themselves, and the keys of the mapping above
all_possible_peaks_to_exclude = list(
    peaks_overlapping_cm_peaks
    | set(cm_peaks_df["peak_id"])
    | set(k27ac_peaks_ovrlp_others)
)

not_cm_is_excluded = not_cm_peaks_df["peak_id"].isin(all_possible_peaks_to_exclude)
not_cm_peaks_df = not_cm_peaks_df.loc[~not_cm_is_excluded, :].copy()

# Keep single peaks that are not excluded and are still present among the
# filtered not-CM peaks
single_peak_is_excluded = single_peak_df["peak_id"].isin(all_possible_peaks_to_exclude)
single_peak_is_not_cm = single_peak_df["peak_id"].isin(not_cm_peaks_df["peak_id"])
single_peak_df = single_peak_df.loc[
    ~single_peak_is_excluded & single_peak_is_not_cm, :
]

not_cm_peak_coordinates_by_chr_dict = utils.get_peak_coord_by_chr(
    peak_df=not_cm_peaks_df,
    peak_id_col="peak_id",
    chr_col="chr",
    peak_start_col="peak_start",
    peak_end_col="peak_end",
)

# Read one count matrix per histone mark with identical settings.
# NOTE: the file names are fixed to chr22 and do not follow `chromosomes`.
count_mtx_k4me1, count_mtx_k27ac = [
    utils.read_count_mtx(
        count_mtx_path,
        mtx_file_name,
        mark,
        chromosomes=chromosomes,
        add_marks=True,
        add_chr=True,
    )
    for mtx_file_name, mark in [
        ("H3K4me1_chr22.bed", "H3K4me1"),
        ("H3K27ac_chr22.bed", "H3K27ac"),
    ]
]

# Merge count matrices per mark into one common matrix
count_mtx = pd.concat([count_mtx_k4me1, count_mtx_k27ac])

# Split the merged matrix by chromosome. Per chromosome, rows are indexed by
# "pid", the first five remaining columns are dropped (presumably annotation
# columns -- TODO confirm) and the rest is cast to float.
count_mtx_dict = {}
for chromosome, count_mtx_chr in count_mtx.groupby("chr"):
    counts_by_pid = count_mtx_chr.set_index("pid")
    count_mtx_dict[chromosome] = counts_by_pid.iloc[:, 5:].astype(float)
# Write the per-chromosome count matrices and the filtered single-peak table
if save_files:
    # exist_ok=True creates the directory only when needed, without the
    # separate (and race-prone) os.path.exists() check
    os.makedirs(os.path.join(output_data_path, dataset), exist_ok=True)
    # The dictionary of DataFrames is stored as a pickled object in a .npy file
    np.save(
        os.path.join(output_data_path, dataset, "peak_count_matrix_dict_by_chr.npy"),
        count_mtx_dict,
        allow_pickle=True,
    )
    # Tab-separated, without header and index
    single_peak_df.to_csv(
        os.path.join(output_data_path, dataset, "single_peak_df_for_simulation.txt"),
        sep="\t",
        index=False,
        header=False,
    )

# The two groups of peaks used below: CM peaks and (filtered) not-CM peaks
peak_dfs_dict = {
    "cm_peaks": cm_peaks_df,
    "not_cm_peaks": not_cm_peaks_df,
}

# Only the first element returned by the helper is used
tracks_df, _ = utils.get_cm_tracks_content(
    os.path.join(core_path, "1.mapped_CMs"),
    method,
    dataset,
    pp_threshold=pp_threshold,
)
# Name the 13 columns of the tracks table
tracks_df.columns = [
    "chr",
    "start",
    "end",
    "cm_id",
    "number",
    "strain",
    "start_duplicate",
    "end_duplicate",
    "numbers",
    "cm_size",
    "peak_length",
    "peak_starts",
    "is_totem",
]
# CM length is the distance between the CM end and start coordinates
tracks_df.loc[:, "cm_length"] = tracks_df["end"].sub(tracks_df["start"])
# Use the same "~"-separated CM ids as in the CM peak table
tracks_df.loc[:, "cm_id"] = tracks_df["cm_id"].str.replace("_", "~")

# Prepare dictionaries with peak coordinates for the two groups of peaks:
# CM peaks and not-CM peaks. Both helpers get the same coordinate columns.
peak_coordinate_columns = dict(
    chr_col="chr",
    peak_start_col="peak_start",
    peak_end_col="peak_end",
)
# Peak coordinates by CM and chromosome
cm_peak_coordinates_by_chr = utils.get_peak_coord_in_cms_by_chr(
    cm_peaks_df=peak_dfs_dict["cm_peaks"],
    cm_id_col="cm_id",
    **peak_coordinate_columns,
)
# Peak coordinates by chromosome for not-CM peaks
not_cm_peak_coordinates_by_chr = utils.get_peak_coord_by_chr(
    peak_df=peak_dfs_dict["not_cm_peaks"],
    peak_id_col="peak_id",
    **peak_coordinate_columns,
)
if save_files:
    # Both dictionaries are stored as pickled objects in .npy files
    for npy_file_name, coordinates_dict in (
        ("cm_peak_coordinates_dict_by_chr.npy", cm_peak_coordinates_by_chr),
        ("not_cm_peak_coordinates_dict_by_chr.npy", not_cm_peak_coordinates_by_chr),
    ):
        np.save(
            os.path.join(output_data_path, dataset, npy_file_name),
            coordinates_dict,
            allow_pickle=True,
        )

def _mean_pairwise_peak_distance(peak_starts, peak_lengths):
    """Return the mean gap between all pairs of peaks of one CM.

    For every pair of peaks (i, j) with i before j in the given order, the
    gap is ``start_j - end_i`` with ``end_i = start_i + length_i``.
    The starts are used exactly as given: adding a common offset (such as the
    CM start) shifts starts and ends equally and cancels out in every gap.

    Parameters
    ----------
    peak_starts : str
        Comma-separated integer peak starts (value of the "peak_starts" column).
    peak_lengths : str
        Comma-separated integer peak lengths (value of the "peak_length"
        column), in the same order as ``peak_starts``.

    Returns
    -------
    float
        Mean of all pairwise gaps; NaN when there are fewer than two peaks,
        i.e. when no pair exists.

    Raises
    ------
    ValueError
        If a value is not an integer or the two lists cannot be combined.
    """
    starts = np.array([int(p_s) for p_s in peak_starts.split(",")])
    ends = starts + np.array([int(p_len) for p_len in peak_lengths.split(",")])
    if len(starts) < 2:
        # No pairs: return NaN explicitly instead of np.mean([]) + RuntimeWarning
        return np.nan
    # Index pairs (i, j) with i < j, i.e. every peak against all later peaks
    first_idx, second_idx = np.triu_indices(len(starts), k=1)
    return float(np.mean(starts[second_idx] - ends[first_idx]))


# Per-CM parameters used by the simulation
cm_param_df = tracks_df.loc[
    :,
    [
        "chr",
        "start",
        "end",
        "cm_id",
        "cm_size",
        "cm_length",
        "peak_length",
        "peak_starts",
    ],
].copy()

# Mean pairwise distance between the peaks of each CM. Values are assigned
# by position, so rows are filled correctly even if the index is not unique.
cm_param_df["mean_d"] = np.array(
    [
        _mean_pairwise_peak_distance(peak_starts, peak_lengths)
        for peak_starts, peak_lengths in zip(
            cm_param_df["peak_starts"], cm_param_df["peak_length"]
        )
    ],
    dtype=float,
)
# Fraction of H3K27ac peaks per CM; CMs missing from mark_counts_dict get NaN
cm_param_df["H3K27ac_fraction"] = (
    cm_param_df["cm_id"].map(mark_counts_dict).astype(float)
)

if save_files:
    # Tab-separated table with a header row and without the index
    cm_param_df.to_csv(
        os.path.join(output_data_path, dataset, "cm_parameter_df.bed"),
        sep="\t",
        header=True,
        index=False,
    )