In [1]:
import cooler
import itertools
import matplotlib.pyplot as plt
import multiprocessing as mp
import numpy as np
import os
import pandas as pd
import random
import scipy
import scipy.stats as stats
import sys
import tqdm.auto as tqdm

# Make the project-local modules (HiCM, utils) importable by adding their
# source directory to the module search path.
hicm_src_dir = os.path.abspath(
    os.path.join("/data/pushkare/Chromatin_modules/4.quantify_3D_interactions", "src")
)
sys.path.append(hicm_src_dir)
from HiCM import HiCM
import utils

# Seed Python's built-in `random` module so random-based steps are repeatable.
random.seed(42)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# ---- Run configuration -------------------------------------------------------
save_files = True
resolutions = [500, 1000]  # resolutions (in bp) to process
interaction_type = "HiC"  # supported values: "HiC" or "MicroC"
dataset = "test_data"
hic_data = "GM12878"  # inserted into the 1 kb Hi-C file name below

# ---- Input locations ---------------------------------------------------------
core_path = "/data/pushkare/Chromatin_modules"  ## change path
path_to_cm_peaks = os.path.join(core_path, "2.peaks_in_CMs")

count_mtx_path = os.path.join(core_path, "test_data")
chr_sizes_path = os.path.join(core_path, "genome_annotations", "chr_sizes_hg19.txt")

path_to_3d_data = os.path.join(
    "/data/pushkare/computational_paper",
    "08.3D_interactions_of_CREs_in_chromatin_modules",
    "3D_chromatin_data",
    interaction_type,
)

# ---- .cool file per resolution label -----------------------------------------
if interaction_type == "HiC":
    cool_dict = {
        # Plain string concatenation (not os.path.join): the 500 bp file is
        # looked up in the sibling directory "<path_to_3d_data>_500bp".
        "500bp": path_to_3d_data + "_500bp/LCL_mega_42B_500bp_30_cool.cool",
        "1kb": os.path.join(
            path_to_3d_data,
            "Rao2014-" + hic_data + "-MboI-allreps-filtered.1kb.cool",
        ),
    }
elif interaction_type == "MicroC":
    cool_dict = {
        "500bp": os.path.join(path_to_3d_data, "microc_800m_500bp.cool"),
        "1kb": os.path.join(path_to_3d_data, "microc_800m_1kb.cool"),
    }
else:
    # Fail here instead of leaving `cool_dict` undefined, which would only
    # surface later as a NameError in the cell that uses it.
    raise ValueError(
        "Unsupported interaction_type: {!r} (expected 'HiC' or 'MicroC')".format(
            interaction_type
        )
    )

# ---- Output location ---------------------------------------------------------
output_path = (
    "/data/pushkare/computational_paper/06.peak_based_3D_interactions_and_correlations"
)

In [5]:
# Load the per-mark peak tables, normalise chromosome names and peak ids by
# removing the "chr" substring, and group the combined peaks by chromosome.

# Chromosome names (without "chr") to keep in the combined table.
chromosomes = ["22"]
count_matrices = {
    "H3K4me1": os.path.join(count_mtx_path, "H3K4me1_chr22.bed"),
    "H3K27ac": os.path.join(count_mtx_path, "H3K27ac_chr22.bed"),
}

mark_dfs_lst = []
# The loop variable is deliberately NOT named `count_mtx_path`: reusing that
# name would overwrite the configured directory and break a re-run of this cell.
for mark, mark_count_path in count_matrices.items():
    df = pd.read_csv(
        mark_count_path,
        sep="\t",
    )
    # Build a new frame from the four needed columns rather than assigning a
    # different dtype through `df.loc[:, col] = ...`, whose behaviour pandas
    # has deprecated.
    df = pd.DataFrame(
        {
            "chr": df["#Chr"].astype(str).str.replace("chr", "", regex=False),
            "start": df["start"],
            "end": df["end"],
            "pid": df["pid"].str.replace("chr", "", regex=False),
        }
    )
    mark_dfs_lst.append(df)

# Stack the peaks of all marks and keep only the requested chromosomes.
full_mark_df = pd.concat(mark_dfs_lst, axis=0)
full_mark_df = full_mark_df.loc[full_mark_df["chr"].isin(chromosomes), :]

# Chromosome name -> DataFrame of the peaks on that chromosome.
chr_peak_dict = {
    chromosome: chr_df for chromosome, chr_df in full_mark_df.groupby("chr")
}

  df.loc[:, "#Chr"] = df.loc[:, "#Chr"].astype(str)
  df.loc[:, "#Chr"] = df.loc[:, "#Chr"].astype(str)


### Extract average Hi-C interaction frequencies for each pair of peaks within the target regions

In [7]:
# Query regions: one row per region, defined column by column.
target_regions = pd.DataFrame(
    {
        "chr_id": ["chr22"],
        "start": [23349145],
        "end": [23692745],
        "pid": ["chr22_23349145_23692745_BCR"],  # B cell receptor
        "cm_id": ["crd76"],
        "rs_id": [""],
        "LD_rs_ids": [""],
    },
    columns=["chr_id", "start", "end", "pid", "cm_id", "rs_id", "LD_rs_ids"],
)

In [8]:
# For every resolution, query the contact matrix for each pair of peaks lying
# inside each target region and append one tab-separated line per pair to
# "<dataset>_peak_based_average_<interaction_type>_interactions_<resolution>bp.txt".
# Line layout: chr1, start1, end1, pid1, chr2, start2, end2, pid2, value.

# Key under which each supported resolution (bp) is stored in `cool_dict`.
resolution_labels = {500: "500bp", 1000: "1kb", 5000: "5kb"}

results_dir = os.path.join(output_path, "3D_correlations", dataset)
os.makedirs(results_dir, exist_ok=True)

for resolution in resolutions:
    res_str = resolution_labels.get(resolution)
    if res_str is None:
        # Without this check an unknown resolution would silently reuse the
        # label left over from the previous iteration (or raise NameError).
        raise ValueError(
            "Unsupported resolution: {!r} (expected one of {})".format(
                resolution, sorted(resolution_labels)
            )
        )

    cool_file = cool_dict.get(res_str)
    if cool_file is None:
        raise KeyError(
            "No .cool file configured for resolution {!r}".format(res_str)
        )
    cool_mtx = cooler.Cooler(cool_file)
    hiCM = HiCM(cool_mtx=cool_mtx, resolution=resolution, chr_sizes_path=chr_sizes_path)

    results_file = os.path.join(
        results_dir,
        "_".join(
            [
                dataset,
                "peak_based_average",
                interaction_type,
                "interactions",
                str(resolution) + "bp.txt",
            ]
        ),
    )
    # NOTE(review): append mode means re-running this cell adds duplicate rows
    # to an existing file -- confirm this is intended.
    # The context manager closes the file even if a query below raises.
    with open(results_file, "a") as myfile:
        # Only the first three columns (chromosome, start, end) are used.
        for chromosome, start, end, *_ in tqdm.tqdm(target_regions.values):
            chr_df = chr_peak_dict.get(chromosome.replace("chr", ""))
            if chr_df is None:
                # No peaks were loaded for this chromosome, so there are no
                # peak pairs to report for this region.
                continue

            # Peaks fully contained in the query region.
            peaks_in_query = chr_df.loc[
                (chr_df["start"] >= start) & (chr_df["end"] <= end), "pid"
            ].to_list()

            # Parse each peak id ("<chr>:<start>:<end>") once, up front,
            # instead of re-parsing it for every pair it takes part in.
            parsed_peaks = []
            for pid in peaks_in_query:
                pid_fields = pid.split(":")
                parsed_peaks.append(
                    (
                        pid,
                        pid_fields[0].replace("chr", ""),
                        int(pid_fields[1]),
                        int(pid_fields[2]),
                    )
                )

            # combinations() yields every unordered pair (i < j) in list order.
            for first_peak, second_peak in itertools.combinations(parsed_peaks, 2):
                peak1, chromosome1, peak1_start, peak1_end = first_peak
                peak2, chromosome2, peak2_start, peak2_end = second_peak
                if peak1 == peak2:
                    # Identical ids can occur when the same peak id appears
                    # more than once in the combined table; skip self-pairs.
                    continue

                # The two intervals are passed sorted; the query uses the
                # chromosome of the first peak.
                avg_frequencies = hiCM.get_peak_interactions(
                    chromosome1,
                    sorted([[peak1_start, peak1_end], [peak2_start, peak2_end]]),
                    drop_zeros=True,
                )
                myfile.write(
                    "\t".join(
                        [
                            str(chromosome1),
                            str(peak1_start),
                            str(peak1_end),
                            peak1,
                            str(chromosome2),
                            str(peak2_start),
                            str(peak2_end),
                            peak2,
                            str(avg_frequencies) + "\n",
                        ]
                    )
                )

  0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 1/1 [09:20<00:00, 560.20s/it]
100%|██████████| 1/1 [04:26<00:00, 266.26s/it]


In [9]:
# Print a completion marker for the notebook run.
print("DONE!")

DONE!
