In [1]:
# ─────────────────────────────────────────────────────────────────────────────
# Notebook: T293.ipynb
# Purpose:
#   Cell 1) Preview the first few lines of a compressed NCOR/GPS2 peaks file (.peaks.txt.gz).
#   Cell 2) Convert a .peaks.txt.gz file into a BED with only chr/start/end.
#   Cell 3) Load an hg38 BED of peaks with pybedtools and print the region count.
#   Cell 4) Extract mtRNA-associated DNA-end sites from iMARGI .pairs.gz, merge nearby sites (±250 bp), and save per-sample BEDs.
#   Cell 5) Load the GRCh38 GTF annotation with pyranges, add a "chr" prefix to chromosome names, and preview its columns and first rows.

# This cell previews the first few lines of a compressed NCOR/GPS2 peaks file (.peaks.txt.gz).

import pandas as pd
import gzip

# Path to the peaks file (change as needed)
# peaks_file = "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak file/GSM865745_NCOR_siGPS2.peaks.txt.gz"
#peaks_file = '/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak_file/GSM865744_NCOR_siCTL.peaks.txt.gz'
peaks_file = '/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak_file/GSM865742_GPS2_minusTNFA.peaks.txt.gz'

# Local import so this cell stays runnable on its own.
from itertools import islice

# Number of leading lines shown in the preview.
N_PREVIEW_LINES = 5

# Decompress on the fly and print the first N_PREVIEW_LINES lines.
# islice stops at end-of-file, so a file shorter than N_PREVIEW_LINES does not
# print spurious blank lines (repeated readline() calls return "" after EOF).
with gzip.open(peaks_file, "rt") as f:
    for line in islice(f, N_PREVIEW_LINES):
        print(line.strip())


chr1	121186505	121186905
chr6	58886177	58886577
chr7	61606407	61606807
chr18	98102	98502
chr17	38756264	38756664


In [4]:
# This cell converts a compressed NCOR/GPS2 peaks file (.peaks.txt.gz)
# into a BED file containing only the first three columns (chr, start, end).

import gzip

# Output BED path (relative to the notebook's working directory); pick one.
# peaks_bed = "NCOR_siGPS2_hg18.bed"
# peaks_bed = "NCOR_siCTL_hg18.bed"
peaks_bed = "GPS2_minusTNFA_hg18.bed"

# NOTE: `peaks_file` is defined in the preview cell; run that cell first.
n_written = 0
n_skipped = 0
with gzip.open(peaks_file, "rt") as f, open(peaks_bed, "w") as out:
    for line in f:
        # Skip comment/header lines and empty lines
        if line.startswith("#") or not line.strip():
            continue
        cols = line.split()
        # A BED record needs chrom plus integer start/end. Rows that are too
        # short, or whose coordinates are not plain integers (e.g. an
        # un-commented header row), would produce a malformed BED, so they are
        # skipped and counted instead of being written.
        if len(cols) < 3 or not (cols[1].isdigit() and cols[2].isdigit()):
            n_skipped += 1
            continue
        # Write chr, start, end as BED
        out.write("\t".join(cols[:3]) + "\n")
        n_written += 1

print(f"BED file saved: {peaks_bed}")
if n_skipped:
    print(f"Skipped {n_skipped} malformed line(s); wrote {n_written} regions")


BED file saved: GPS2_minusTNFA_hg18.bed


In [6]:
# This script loads an NCOR/GPS2 peaks BED file (hg38) using pybedtools and 
# prints the total number of peak regions.

import pybedtools

# Alternative hg38 peak BEDs (swap into peaks_bed_hg38_path as needed):
#   "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak file/NCOR_siGPS2_hg38.bed"
#   "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak_file/NCOR_siCTL_hg38.bed"
# NOTE(review): presumably these are liftOver results of the hg18 BEDs written
# by the conversion cell; that step is not shown in this notebook -- confirm.
peaks_bed_hg38_path = "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/T263/Peak_file/GPS2_minusTNFA_hg38.bed"

# Wrap the BED file in a BedTool and report how many intervals it holds.
peaks_bed_hg38 = pybedtools.BedTool(peaks_bed_hg38_path)
n_regions = len(peaks_bed_hg38)
print(f"loaded peaks, number of regions: {n_regions}")



loaded peaks, number of regions: 33437


In [3]:
#Extract and merge mtRNA-associated DNA-end sites from iMARGI
import gzip
import pybedtools
import os

# iMARGI .pairs.gz files for each timepoint
pairs_files = {
    "Day0": "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/output/final_SRR9900120.pairs.gz",
    "Day3": "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/output/final_SRR9900121.pairs.gz",
    "Day7": "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/output/final_SRR9900122.pairs.gz"
}

# Directory to save merged BEDs
outdir = "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/analysis"
os.makedirs(outdir, exist_ok=True)

# RNA-end chromosome labels treated as mitochondrial.
MT_CHROM_NAMES = frozenset({"chrM", "chrMT", "MT"})

# Maximum gap (bp) between DNA-end sites that are still merged into one region
# (i.e. each site extended by ±250 bp).
MERGE_DISTANCE = 500


def _iter_mt_dna_sites(pairs_path):
    """Yield 1-bp BED intervals for the DNA end of mitochondrial-RNA pairs.

    Reads a gzip-compressed, tab-separated .pairs file and, for every record
    whose RNA-end chromosome (column 2) is in MT_CHROM_NAMES, yields
    ``[dna_chrom, start, end]`` built from the DNA-end chromosome (column 4)
    and position (column 5).

    The .pairs format stores 1-based positions while BED is 0-based and
    half-open, so position ``p`` becomes ``[p - 1, p)``.

    Lines starting with "#" and lines with fewer than five columns are
    skipped. A non-integer DNA-end position on a mitochondrial record raises
    ValueError.
    """
    with gzip.open(pairs_path, "rt") as handle:
        for line in handle:
            if line.startswith("#"):
                continue  # skip headers
            cols = line.strip().split("\t")
            if len(cols) < 5:
                continue  # blank or truncated record
            # Filter first so the position is only parsed for kept records.
            if cols[1] not in MT_CHROM_NAMES:
                continue
            dna_pos = int(cols[4])
            # Convert 1-based position to a 0-based start; clamp so a
            # position of 0 cannot produce a negative coordinate.
            start = max(dna_pos - 1, 0)
            yield [cols[3], start, start + 1]


for name, pairs_file in pairs_files.items():
    dna_sites = list(_iter_mt_dna_sites(pairs_file))

    if not dna_sites:
        print(f"{name}: No mtRNA-binding read pairs found!")
        continue

    # Sort, then merge sites separated by at most MERGE_DISTANCE bp
    dna_bed = pybedtools.BedTool(dna_sites).sort().merge(d=MERGE_DISTANCE)

    # Save merged intervals as BED
    outfile = os.path.join(outdir, f"mtRNA_DNA_sites_{name}.bed")
    dna_bed.saveas(outfile)
    print(f"{name}: Generated {outfile}, regions = {len(dna_bed)}")

Day0: Successfully generated /projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/analysis/mtRNA_DNA_sites_Day0.bed, number of regions: 294752
Day3: Successfully generated /projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/analysis/mtRNA_DNA_sites_Day3.bed, number of regions: 380812
Day7: Successfully generated /projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/analysis/mtRNA_DNA_sites_Day7.bed, number of regions: 368771


In [1]:
import pyranges as pr

# GRCh38 gene annotation (release 84, per the file name), loaded as a PyRanges object.
gtf_path = "/projectnb/perissilab/Xinyu/GPS2_CHIPseq/iMargi/ref/Homo_sapiens.GRCh38.84.gtf.gz"
gtf = pr.read_gtf(gtf_path)


def _with_chr_prefix(chrom_name):
    """Return the chromosome name with a leading "chr", adding it if absent."""
    if chrom_name.startswith("chr"):
        return chrom_name
    return "chr" + chrom_name


# Make every chromosome name carry the "chr" prefix.
gtf.Chromosome = gtf.Chromosome.apply(_with_chr_prefix)

# Look at all column names and the first few rows
print(gtf.df.columns)
gtf.df.head(10)


  import pkg_resources


Index(['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score', 'Strand',
       'Frame', 'gene_id', 'gene_version', 'gene_name', 'gene_source',
       'gene_biotype', 'havana_gene', 'havana_gene_version', 'transcript_id',
       'transcript_version', 'transcript_name', 'transcript_source',
       'transcript_biotype', 'havana_transcript', 'havana_transcript_version',
       'tag', 'transcript_support_level', 'exon_number', 'exon_id',
       'exon_version', 'ccds_id', 'protein_id', 'protein_version'],
      dtype='object')


Unnamed: 0,Chromosome,Source,Feature,Start,End,Score,Strand,Frame,gene_id,gene_version,...,havana_transcript,havana_transcript_version,tag,transcript_support_level,exon_number,exon_id,exon_version,ccds_id,protein_id,protein_version
0,chr1,havana,gene,11868,14409,.,+,.,ENSG00000223972,5,...,,,,,,,,,,
1,chr1,havana,transcript,11868,14409,.,+,.,ENSG00000223972,5,...,OTTHUMT00000362751,1.0,basic,1.0,,,,,,
2,chr1,havana,exon,11868,12227,.,+,.,ENSG00000223972,5,...,OTTHUMT00000362751,1.0,basic,1.0,1.0,ENSE00002234944,1.0,,,
3,chr1,havana,exon,12612,12721,.,+,.,ENSG00000223972,5,...,OTTHUMT00000362751,1.0,basic,1.0,2.0,ENSE00003582793,1.0,,,
4,chr1,havana,exon,13220,14409,.,+,.,ENSG00000223972,5,...,OTTHUMT00000362751,1.0,basic,1.0,3.0,ENSE00002312635,1.0,,,
5,chr1,havana,transcript,12009,13670,.,+,.,ENSG00000223972,5,...,OTTHUMT00000002844,2.0,basic,,,,,,,
6,chr1,havana,exon,12009,12057,.,+,.,ENSG00000223972,5,...,OTTHUMT00000002844,2.0,basic,,1.0,ENSE00001948541,1.0,,,
7,chr1,havana,exon,12178,12227,.,+,.,ENSG00000223972,5,...,OTTHUMT00000002844,2.0,basic,,2.0,ENSE00001671638,2.0,,,
8,chr1,havana,exon,12612,12697,.,+,.,ENSG00000223972,5,...,OTTHUMT00000002844,2.0,basic,,3.0,ENSE00001758273,2.0,,,
9,chr1,havana,exon,12974,13052,.,+,.,ENSG00000223972,5,...,OTTHUMT00000002844,2.0,basic,,4.0,ENSE00001799933,2.0,,,
