In [1]:
import numpy as np
import os
import pandas as pd

In [2]:
# Run configuration shared by the cells below.
# Project root: inputs are read from "<core_path>/1.mapped_CMs" and outputs
# are written under "<core_path>/3.simulate_CMs".
core_path = "/data/pushkare/Chromatin_modules"
# Method name; used as a sub-directory in both the input and output paths.
method = "phm"
# Prefix of the output file name ("<dataset>_single_peak_ids.txt").
dataset = "test_data"
# Chromosome directory names to process; the main loop runs once per entry.
chromosomes = ["chr22"]
# Cutoff for the pp.gz hypothesis columns; it is applied as np.log(threshold).
threshold = 0.8

In [3]:
# Output directory for this method: <core_path>/3.simulate_CMs/input/<method>.
path_to_output_data = os.path.join(core_path, "3.simulate_CMs", "input", method)
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create
# race of testing os.path.exists() first.
os.makedirs(path_to_output_data, exist_ok=True)

In [4]:
# For every chromosome, select the peaks that pass `threshold` in at least one
# of the H_11, H_12, linkage or pleiotropy columns of phm_output/pp.gz, look up
# their coordinates, and write all of them to a single tab-separated file.
# NOTE(review): causality_1 / causality_2 are not part of the selection.
PP_COLUMNS = [
    "peak_1",
    "peak_2",
    "H_0",
    "H_11",
    "H_12",
    "linkage",
    "pleiotropy",
    "causality_1",
    "causality_2",
]
OUTPUT_COLUMNS = [0, 1, 2, 3, "full_pid"]

# Loop-invariant: pp.gz values are compared against the log of the cutoff
# (presumably they are log posterior probabilities — confirm).
log_threshold = np.log(threshold)

single_peaks_list = []
for chromosome in chromosomes:
    chromosome_dir = os.path.join(core_path, "1.mapped_CMs", method, chromosome)

    pp_hypotheses = pd.read_csv(
        os.path.join(chromosome_dir, "phm_output", "pp.gz"),
        sep="\t",
        header=None,
    )
    pp_hypotheses.columns = PP_COLUMNS
    pp_hypotheses = pp_hypotheses.set_index(["peak_1", "peak_2"])
    # Element-wise mask: every value at or below the cutoff becomes NaN.
    pp_hypotheses_filt = pp_hypotheses[pp_hypotheses > log_threshold]

    # H_11 contributes the first peak of the pair, H_12 the second one
    # (presumably "QTL on peak 1 only" / "QTL on peak 2 only" — confirm).
    h11_peaks = set(
        pp_hypotheses_filt["H_11"].dropna().index.get_level_values("peak_1")
    )
    h12_peaks = set(
        pp_hypotheses_filt["H_12"].dropna().index.get_level_values("peak_2")
    )

    # Linkage / pleiotropy contribute both peaks of the pair. Taking the two
    # index levels directly also works when no pair passes the cutoff, where
    # set.union(*[]) would raise a TypeError.
    linkage_pleiotropy_idx = (
        pp_hypotheses_filt.loc[:, ["linkage", "pleiotropy"]].dropna(how="all").index
    )
    linkage_pleiotropy_peaks = set(
        linkage_pleiotropy_idx.get_level_values("peak_1")
    ).union(linkage_pleiotropy_idx.get_level_values("peak_2"))

    single_peaks_with_qtl = h11_peaks | h12_peaks | linkage_pleiotropy_peaks

    peak_ids = pd.read_csv(
        os.path.join(chromosome_dir, chromosome + "_peak_coordinates.bed.gz"),
        sep="\t",
        header=None,
    )
    # Peaks are matched to pp.gz by their 1-based row number in the BED file.
    peak_ids["peak_id"] = np.arange(1, peak_ids.shape[0] + 1)
    # .copy() so the column assignments below act on an independent frame
    # instead of a slice (avoids SettingWithCopyWarning).
    peak_ids = peak_ids.loc[peak_ids["peak_id"].isin(single_peaks_with_qtl), :].copy()

    if peak_ids.empty:
        # Nothing passed the cutoff on this chromosome; peak_ids.iloc[0, 0]
        # below would raise IndexError on an empty frame.
        continue

    # The chromosome naming style is decided from the first row only.
    if "chr" not in str(peak_ids.iloc[0, 0]):
        # Assign a new column (not an in-place .iloc write) so a numeric
        # chromosome column can turn into strings cleanly.
        peak_ids[0] = "chr" + peak_ids[0].astype(str)

    # full_pid = "<column 3>:<chromosome>:<start>:<end>"
    peak_ids["full_pid"] = (
        peak_ids[3]
        + ":"
        + peak_ids[0]
        + ":"
        + peak_ids[1].astype(str)
        + ":"
        + peak_ids[2].astype(str)
    )

    single_peaks_list.append(peak_ids.loc[:, OUTPUT_COLUMNS])

if single_peaks_list:
    single_peaks_df = pd.concat(single_peaks_list)
else:
    # pd.concat([]) raises ValueError; fall back to an empty table so the
    # output file is still produced when no peak was selected.
    single_peaks_df = pd.DataFrame(columns=OUTPUT_COLUMNS)

single_peaks_df.to_csv(
    os.path.join(path_to_output_data, dataset + "_single_peak_ids.txt"),
    sep="\t",
    index=False,
    header=False,
)