In [1]:
# Preview the first few lines of a compressed ChIP-seq peaks file (.peaks.txt.gz), NCOR or GPS2.

import pandas as pd
import gzip

# Path to the peaks file (change as needed)
# peaks_file = "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak file/GSM865745_NCOR_siGPS2.peaks.txt.gz"
#peaks_file = '/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak_file/GSM865744_NCOR_siCTL.peaks.txt.gz'
peaks_file = '/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak_file/GSM865742_GPS2_minusTNFA.peaks.txt.gz'

# Number of leading lines to show in the preview.
N_PREVIEW_LINES = 5

# Decompress on the fly (text mode) and print at most N_PREVIEW_LINES lines.
# Iterating the file instead of calling readline() a fixed number of times
# stops cleanly at EOF, so a short or empty file no longer prints blank lines.
with gzip.open(peaks_file, "rt") as f:
    for _, line in zip(range(N_PREVIEW_LINES), f):
        print(line.strip())


chr1	121186505	121186905
chr6	58886177	58886577
chr7	61606407	61606807
chr18	98102	98502
chr17	38756264	38756664


In [4]:
# This cell converts a compressed NCOR/GPS2 peaks file (.peaks.txt.gz)
# into a BED file containing only the first three columns (chr, start, end).
# It reads `peaks_file`, which is defined in the preview cell, so run that cell first.

import gzip

# Output BED file name; earlier datasets are kept commented out for reference.
# peaks_bed = "NCOR_siGPS2_hg18.bed"
# peaks_bed = "NCOR_siCTL_hg18.bed"
peaks_bed = "GPS2_minusTNFA_hg18.bed"

with gzip.open(peaks_file, "rt") as src, open(peaks_bed, "w") as bed_out:
    for raw in src:
        record = raw.strip()
        # Rows that are blank, or that begin with "#", carry no peak coordinates.
        if not record or raw.startswith("#"):
            continue
        # Keep only chr, start, end (whitespace-delimited input, tab-delimited output).
        fields = record.split()[:3]
        bed_out.write("\t".join(fields) + "\n")

print(f"BED file saved: {peaks_bed}")


BED file saved: GPS2_minusTNFA_hg18.bed


In [6]:
# This script loads an NCOR/GPS2 peaks BED file (hg38) using pybedtools and 
# prints the total number of peak regions.

import pybedtools

# Earlier inputs, kept commented out for reference:
# peaks_bed_hg38 = pybedtools.BedTool("/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak file/NCOR_siGPS2_hg38.bed")
# peaks_bed_hg38 = pybedtools.BedTool("/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak_file/NCOR_siCTL_hg38.bed")

# NOTE(review): this hg38 BED is not written by any cell visible here; presumably
# it is the hg18 BED lifted over outside the notebook — confirm.
hg38_peaks_path = "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak_file/GPS2_minusTNFA_hg38.bed"
peaks_bed_hg38 = pybedtools.BedTool(hg38_peaks_path)

# Report how many peak intervals were loaded.
n_peak_regions = len(peaks_bed_hg38)
print(f"loaded peaks, number of regions: {n_peak_regions}")



loaded peaks, number of regions: 33437


In [3]:
# This script extracts DNA-end sites associated with mtRNA (chrM) from iMARGI .pairs.gz files,
# merges nearby sites (±250 bp), and saves them as BED files for each sample.

import gzip
import pybedtools
import os

# iMARGI .pairs.gz input per sample (sample label -> path).
pairs_files = {
    "Day0": "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/output/final_SRR9900120.pairs.gz",
    "Day3": "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/output/final_SRR9900121.pairs.gz",
    "Day7": "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/output/final_SRR9900122.pairs.gz"
}

outdir = "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/analysis"
os.makedirs(outdir, exist_ok=True)

# RNA-end chromosome labels that are treated as mitochondrial (mtRNA).
MT_CHROM_NAMES = ("chrM", "chrMT", "MT")
# Distance handed to `merge(d=...)`: DNA-end sites whose gap is at most this many
# bp are collapsed into one region (roughly, ±250 bp windows around two sites meet).
MERGE_DISTANCE_BP = 500

for name, pairs_file in pairs_files.items():
    dna_sites = []
    with gzip.open(pairs_file, "rt") as f:
        for line in f:
            # Skip header lines and blank lines (a blank line would otherwise
            # fail on the column lookups below).
            if line.startswith("#") or not line.strip():
                continue
            cols = line.strip().split("\t")
            # Column layout used here: cols[1] = RNA-end chromosome,
            # cols[3] / cols[4] = DNA-end chromosome / position.
            rna_chr = cols[1]

            # Filter for RNA-end that is mitochondrial (mtRNA). Filtering first
            # avoids parsing the position of rows that are discarded anyway.
            if rna_chr not in MT_CHROM_NAMES:
                continue

            dna_chr, dna_pos = cols[3], int(cols[4])
            # .pairs positions are 1-based, whereas BED is 0-based half-open:
            # the single base at 1-based position p is the BED interval [p-1, p).
            # Clamp at 0 so an unexpected position of 0 cannot yield a negative start.
            bed_start = max(dna_pos - 1, 0)
            dna_sites.append([dna_chr, bed_start, bed_start + 1])

    if len(dna_sites) == 0:
        print(f"{name}: No mtRNA-binding read pairs found!")
        continue

    # Sort, then merge nearby DNA-end sites into regions.
    dna_bed = pybedtools.BedTool(dna_sites).sort().merge(d=MERGE_DISTANCE_BP)

    outfile = os.path.join(outdir, f"mtRNA_DNA_sites_{name}.bed")
    dna_bed.saveas(outfile)
    print(f"{name}: Successfully generated {outfile}, number of regions: {len(dna_bed)}")


Day0: Successfully generated /projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/analysis/mtRNA_DNA_sites_Day0.bed, number of regions: 294752
Day3: Successfully generated /projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/analysis/mtRNA_DNA_sites_Day3.bed, number of regions: 380812
Day7: Successfully generated /projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/analysis/mtRNA_DNA_sites_Day7.bed, number of regions: 368771
