In [1]:
import numpy as np
import pandas as pd
import os

In [2]:
# Root folder that holds the TAD / A-B compartment annotation files.
core_path = (
    "/data/pushkare/Chromatin_modules/genome_annotations/TADs_AB_compartments"
)
# Sub-folder of the root from which the input tables are read.
data_path = os.path.join(core_path, "NCBI_data")

In [3]:
# Load the Arrowhead domain list (TADs) and the subcompartment annotation.
# Chromosome columns are read as strings so that the prefix handling below
# does not depend on pandas' dtype inference.
tads = pd.read_csv(
    os.path.join(
        data_path, "GSE63525_GM12878_primary+replicate_Arrowhead_domainlist.txt.gz"
    ),
    sep="\t",
    dtype={"chr1": str},
)
ab_comp = pd.read_csv(
    os.path.join(data_path, "GSE63525_GM12878_subcompartments.bed.gz"),
    sep="\t",
    header=None,
    dtype={0: str},
)
# Only the first row is inspected to decide whether the "chr" prefix is missing.
if "chr" not in str(ab_comp.iloc[0, 0]):
    ab_comp[0] = "chr" + ab_comp[0].astype(str)

# Keep the chromosome / start / end columns, ordered by chromosome and start.
tads = tads.loc[:, ["chr1", "x1", "x2"]].sort_values(["chr1", "x1"])
# IDs follow the sorted row order. They are built as strings in one step:
# creating an integer column first and then overwriting it with strings via
# `.loc[:, ...]` is a dtype-changing in-place set that newer pandas warns about.
tads["tad_id"] = ["tad_" + str(i) for i in range(len(tads))]
# Positional lookup: after sorting, index label 0 is not necessarily the first row.
if "chr" not in str(tads["chr1"].iloc[0]):
    tads["chr1"] = "chr" + tads["chr1"].astype(str)

In [4]:
# Write both tables as headerless, tab-separated BED files into `core_path`
# (chromosome names as currently held in the frames).
tads_with_chr_bed = os.path.join(core_path, "GSE63525_GM12878_TADs_with_chr.bed")
tads.to_csv(tads_with_chr_bed, sep="\t", index=False, header=False)

# Only the first four subcompartment columns are exported; rows with a
# missing value in any of them are left out.
ab_with_chr_bed = os.path.join(
    core_path, "GSE63525_GM12878_AB_compartments_with_chr.bed"
)
ab_first_four = ab_comp.loc[:, [0, 1, 2, 3]]
ab_first_four.dropna().to_csv(ab_with_chr_bed, sep="\t", index=False, header=False)

In [5]:
# Write copies of both tables with "chr" removed from the chromosome names.
# The chr-less versions are derived into new frames instead of overwriting
# `tads` / `ab_comp`, so the cell that writes the "*_with_chr.bed" files can be
# re-run at any time without silently exporting chr-less data.
# `regex=False` makes the literal replacement explicit (the default of
# `str.replace` differs between pandas versions).
tads_no_chr = tads.assign(chr1=tads["chr1"].str.replace("chr", "", regex=False))
tads_no_chr.to_csv(
    os.path.join(core_path, "GSE63525_GM12878_TADs.bed"),
    sep="\t",
    index=False,
    header=False,
)

# Only the first four subcompartment columns are exported; rows with a
# missing value in any of them are dropped.
ab_comp_no_chr = ab_comp.loc[:, [0, 1, 2, 3]].copy()
ab_comp_no_chr[0] = ab_comp_no_chr[0].str.replace("chr", "", regex=False)
ab_comp_no_chr.dropna().to_csv(
    os.path.join(core_path, "GSE63525_GM12878_AB_compartments.bed"),
    sep="\t",
    index=False,
    header=False,
)

### Overlap peaks with AB compartments and TADs

In [8]:
%%bash
# Intersect, for every method, the chromatin-module (CM) peak file and the
# complementary "not_<method>" peak file with the A/B compartment and TAD
# annotations ("*_with_chr.bed" files located in ${core_path}).
#
# Abort on the first failing command or unset variable, so a failed bedtools
# run cannot go unnoticed and leave an empty overlap file behind.
set -euo pipefail

core_path="/data/pushkare/Chromatin_modules/genome_annotations/TADs_AB_compartments"
cm_peaks_path="/data/pushkare/Chromatin_modules/2.peaks_in_CMs/peak_files"

methods=( "vcmtools" "clomics" "phm" )
dataset="test_data"

ab_bed="${core_path}/GSE63525_GM12878_AB_compartments_with_chr.bed"
tad_bed="${core_path}/GSE63525_GM12878_TADs_with_chr.bed"

# intersect_to <a.bed> <b.bed> <out.bed>
# Writes every A/B overlap (bedtools -wo output) to <out.bed>.
intersect_to() {
    bedtools intersect -wo -a "$1" -b "$2" > "$3"
}

mkdir -p "${core_path}/peak_overlaps/"

for method in "${methods[@]}"
do
    out_dir="${core_path}/peak_overlaps/${method}"
    mkdir -p "${out_dir}"

    cm_peaks="${cm_peaks_path}/${dataset}_${method}_all_peaks.bed"
    not_cm_peaks="${cm_peaks_path}/${dataset}_not_${method}_peaks.bed"

    ## Overlap CM / non-CM peaks with A/B compartments
    intersect_to "${cm_peaks}" "${ab_bed}" \
        "${out_dir}/${dataset}_AB_cm_peaks_overlap.bed"
    intersect_to "${not_cm_peaks}" "${ab_bed}" \
        "${out_dir}/${dataset}_AB_not_cm_peaks_overlap.bed"

    ## Overlap CM / non-CM peaks with TADs
    intersect_to "${cm_peaks}" "${tad_bed}" \
        "${out_dir}/${dataset}_TADs_cm_peaks_overlap.bed"
    intersect_to "${not_cm_peaks}" "${tad_bed}" \
        "${out_dir}/${dataset}_TADs_not_cm_peaks_overlap.bed"
done

### Overlap CMs with AB compartments and TADs

In [7]:
%%bash
# Intersect the chromatin-module (CM) track file of every method with the
# A/B compartment and TAD annotations located in ${core_path}.
#
# Abort on the first failing command or unset variable, so a failed bedtools
# run cannot go unnoticed and leave an empty overlap file behind.
set -euo pipefail

core_path="/data/pushkare/Chromatin_modules/genome_annotations/TADs_AB_compartments"
core_cm_path="/data/pushkare/Chromatin_modules/1.mapped_CMs"

methods=( "vcmtools" "clomics" "phm" )
dataset="test_data"

pp_threshold="0.8"

for method in "${methods[@]}"
do
    out_dir="${core_path}/cm_overlaps/${method}"
    mkdir -p "${out_dir}"

    # Select the method's track file and the annotation flavour paired with it:
    # vcmtools is intersected with the "_with_chr" annotation files, phm and
    # every other method with the files that carry no "chr" prefix.
    case "${method}" in
        phm)
            tracks_bed="${core_cm_path}/${method}/${dataset}_phm_tracks_content/${dataset}_${pp_threshold}_merged_phm_all_chr.tracks.bed"
            chr_suffix=""
            ;;
        vcmtools)
            # The dataset folder follows ${dataset} instead of a hard-coded name.
            tracks_bed="${core_cm_path}/${method}/VCMs/0.5Mb/${dataset}/${dataset}_all_VCMs_corrected_pvalue_0.001.tracks.bed"
            chr_suffix="_with_chr"
            ;;
        *)
            tracks_bed="${core_cm_path}/${method}/n_peaks_200/bg_threshold_3/${dataset}/${dataset}_Clomics_CM.tracks.bed"
            chr_suffix=""
            ;;
    esac
    AB_file="${core_path}/GSE63525_GM12878_AB_compartments${chr_suffix}.bed"
    TAD_file="${core_path}/GSE63525_GM12878_TADs${chr_suffix}.bed"

    ## Overlap CM tracks with A/B compartments
    bedtools intersect -wo \
        -a "${tracks_bed}" \
        -b "${AB_file}" \
        > "${out_dir}/${dataset}_AB_cm_tracks_overlap.bed"

    ## Overlap CM tracks with TADs
    bedtools intersect -wo \
        -a "${tracks_bed}" \
        -b "${TAD_file}" \
        > "${out_dir}/${dataset}_TADs_cm_tracks_overlap.bed"
done

In [11]:
# Emit a completion message.
print("DONE!", end="\n")

DONE!
