In [1]:
import os
import pandas as pd

In [2]:
# Notebook configuration: which dataset folder to read and which
# chromosomes to keep when building the peak BED files.
dataset = "test_data"
add_marks = True
# "chr1" .. "chr22"
chromosomes = [f"chr{number}" for number in range(1, 23)]
# Input tables live one directory above the notebook.
count_mtx_path = f"../{dataset}"

In [4]:
def _ensure_chr_prefix(values):
    """Return ``values`` as strings, prepending "chr" to entries lacking it."""
    text = values.astype(str)
    # Decide per row rather than from the first row only, so files that mix
    # prefixed and unprefixed entries are still normalised consistently.
    return text.where(text.str.contains("chr", regex=False), "chr" + text)


def prepare_count_mtx(
    count_mtx_path,
    file_name,
    mark,
    add_marks=True,
    save=False,
    output_folder=None,
    keep_chromosomes=None,
):
    """Turn a tab-separated peak table into a 5-column BED-style table.

    The columns "#Chr", "start", "end", "pid" and "gid" are read from
    ``os.path.join(count_mtx_path, file_name)``. Chromosome, pid and gid
    values that do not contain "chr" get it prepended, the pid is optionally
    prefixed with ``"<mark>:"``, a constant "+" strand column is added, rows
    on chromosomes outside ``keep_chromosomes`` are dropped, and the result
    is ordered by chromosome name and start before being (optionally)
    written out as ``chr, start, end, pid, strand``.

    Parameters
    ----------
    count_mtx_path : str
        Directory containing the input table.
    file_name : str
        Name of the tab-separated input file inside ``count_mtx_path``.
    mark : str
        Label used as the pid prefix (when ``add_marks``) and in the output
        file name ``<mark>_all_peaks.bed``.
    add_marks : bool, default True
        If true, pids become ``"<mark>:<pid>"``.
    save : bool, default False
        If true, write the table to ``output_folder`` as tab-separated text
        without header or index.
    output_folder : str, optional
        Destination directory; created if missing. Required when ``save``.
    keep_chromosomes : collection of str, optional
        Chromosome names to retain. When omitted, the notebook-level
        ``chromosomes`` list is used.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If ``save`` is true but ``output_folder`` is not given.
    """
    if save and output_folder is None:
        # Fail before doing any work instead of erroring inside os.path later.
        raise ValueError("output_folder is required when save=True")
    if keep_chromosomes is None:
        # Backward-compatible default: the list defined in the config cell.
        keep_chromosomes = chromosomes

    peaks = pd.read_csv(
        os.path.join(count_mtx_path, file_name),
        sep="\t",
        usecols=["#Chr", "start", "end", "pid", "gid"],
    ).rename(columns={"#Chr": "chr"})

    # Whole-column replacement (not ``.loc[:, col] = ...``) so a numeric
    # chromosome column can become text without an in-place dtype upcast.
    for column in ("chr", "pid", "gid"):
        peaks[column] = _ensure_chr_prefix(peaks[column])

    if add_marks:
        peaks["pid"] = mark + ":" + peaks["pid"]
    peaks["strand"] = "+"

    peaks = peaks.loc[peaks["chr"].isin(keep_chromosomes)]

    # The sort key is the chromosome name without "chr", compared as text,
    # so "10" sorts before "2". This ordering is kept from the original
    # implementation.
    # NOTE(review): confirm downstream tools expect lexicographic rather than
    # numeric chromosome order.
    sort_key = peaks["chr"].str.replace("chr", "", regex=False)
    peaks = peaks.assign(chr_str=sort_key).sort_values(["chr_str", "start"])[
        ["chr", "start", "end", "pid", "strand"]
    ]

    if save:
        os.makedirs(output_folder, exist_ok=True)
        peaks.to_csv(
            os.path.join(output_folder, mark + "_all_peaks.bed"),
            sep="\t",
            header=False,
            index=False,
        )

In [5]:
# Write one "<mark>_all_peaks.bed" file per histone mark into
# ./peak_bed_files, reading "<mark>_chr22.bed" from the dataset folder.
for histone_mark in ("H3K4me1", "H3K27ac"):
    prepare_count_mtx(
        count_mtx_path=count_mtx_path,
        file_name=histone_mark + "_chr22.bed",
        mark=histone_mark,
        add_marks=True,
        save=True,
        output_folder="./peak_bed_files",
    )

In [6]:
# Load the per-mark BED files back in; they have no header row, so the
# columns are labelled 0..4.
k27ac = pd.read_csv("./peak_bed_files/H3K27ac_all_peaks.bed", sep="\t", header=None)
k4me1 = pd.read_csv("./peak_bed_files/H3K4me1_all_peaks.bed", sep="\t", header=None)

In [9]:
# Stack the H3K27ac peaks on top of the H3K4me1 peaks and write them to one
# combined BED file. ignore_index=True gives the combined frame a fresh
# 0..n-1 index instead of repeating each input's row labels.
# NOTE(review): rows are not re-sorted after concatenation — confirm that
# consumers of the combined file do not need a globally sorted BED.
all_peaks = pd.concat([k27ac, k4me1], ignore_index=True)
all_peaks.to_csv(
    "./peak_bed_files/k27ac_k4me1_peaks.bed",
    sep="\t",
    header=False,
    index=False,
)

In [11]:
# Sanity check: concatenation must neither drop nor duplicate rows.
assert len(all_peaks) == len(k27ac) + len(k4me1)