In [21]:
import gzip
import pysam

In [None]:
# Genomic windows that the cells below stitch together, in this order, into
# the synthetic "BCR_ABL" contig. Each entry is [chrom, start, end, strand];
# start/end are handed unchanged to pysam's fetch() calls.
# NOTE(review): presumably the BCR (chr22) and ABL1 (chr9) loci on GRCh38 — confirm.
regions = [
    ["chr22", 23_176_000, 23_320_000, "+"],
    ["chr9", 130_710_000, 130_890_000, "+"],
]

In [19]:
# make fasta
# Concatenate the windows listed in `regions` (in order) into one synthetic
# contig named BCR_ABL and write it as a FASTA file wrapped at 60 columns.

fasta = "/home/chenzonggui/species/homo_sapiens/GRCh38.p13/GRCh38.canonical.genome.fa"

line_width = 60

# NOTE: `strand` is unpacked but not applied — every window is copied exactly
# as fetched. A "-" region would need reverse-complementing here (and matching
# coordinate flips in the annotation cells).
parts = []
with pysam.FastaFile(fasta) as f:
    for chrom, start, end, strand in regions:
        parts.append(f.fetch(chrom, start, end))
seq = "".join(parts)

with open("results/bcr_abl_fusion/reference.fasta", "w+") as fw:
    fw.write(">BCR_ABL\n")
    # `i` already advances in steps of `line_width`, so it IS the slice start.
    # (Multiplying it by the width again skipped most of the sequence and
    # emitted empty lines.) Slicing past the end is safe in Python, so the
    # last, possibly shorter, line needs no special handling.
    for i in range(0, len(seq), line_width):
        fw.write("%s\n" % seq[i:i + line_width])

In [16]:
# build minimap2 index
# Requires results/bcr_abl_fusion/reference.fasta, which the "make fasta" cell writes.
# 1) Append the synthetic BCR_ABL contig to the GRCh38 genome FASTA.
! cat /home/chenzonggui/species/homo_sapiens/GRCh38.p13/GRCh38.canonical.genome.fa results/bcr_abl_fusion/reference.fasta > results/bcr_abl_fusion/grch38_bcr_abl.fasta
# 2) Index the BCR_ABL contig alone with the `splice` preset (4 threads); -d names the .mmi file to write.
! minimap2 -t 4 -x splice -d results/bcr_abl_fusion/bcr_abl_fusion.mm2.splice.mmi results/bcr_abl_fusion/reference.fasta
# 3) Index the combined genome + BCR_ABL FASTA with the same preset (20 threads).
! minimap2 -t 20 -x splice -d results/bcr_abl_fusion/grch38_bcr_abl_fusion.mm2.splice.mmi results/bcr_abl_fusion/grch38_bcr_abl.fasta

[M::mm_idx_gen::0.021*2.40] collected minimizers
[M::mm_idx_gen::0.042*2.99] sorted minimizers
[M::main::0.051*2.65] loaded/built the index for 1 target sequence(s)
[M::mm_idx_stat] kmer size: 15; skip: 5; is_hpc: 0; #seq: 1
[M::mm_idx_stat::0.053*2.59] distinct minimizers: 99693 (97.15% are singletons); average occurrences: 1.119; average spacing: 2.905; total length: 324000
[M::main] Version: 2.26-r1175
[M::main] CMD: minimap2 -t 4 -x splice -d results/bcr_abl_fusion/bcr_abl_fusion.mm2.splice.mmi results/bcr_abl_fusion/reference.fasta
[M::main] Real time: 0.073 sec; CPU: 0.141 sec; Peak RSS: 0.057 GB


In [22]:
# make bed
# Write a BED annotation for the combined reference: every record of the
# original annotation is copied unchanged, followed by the records that
# overlap the windows in `regions`, re-expressed on the BCR_ABL contig.
# The appended records are not re-sorted.

bed = "/home/chenzonggui/species/homo_sapiens/GRCh38.p13/gencode.v39.annotation.transcripts.bed.gz"

rows = []
offset = 0  # where the current window begins inside the BCR_ABL contig
with pysam.TabixFile(bed) as f:
    for chrom, start, end, strand in regions:
        length = end - start
        for line in f.fetch(chrom, start, end):
            row = line.split("\t")
            row[0] = "BCR_ABL"
            # Shift the absolute-coordinate columns; in the standard BED layout
            # these are chromStart, chromEnd, thickStart and thickEnd
            # (presumably a BED12 file — confirm). Other columns are left as is.
            for idx in [1, 2, 6, 7]:
                local = int(row[idx]) - start
                # BED end coordinates are exclusive, so a record that ends
                # exactly at the window end yields `length`, which is valid.
                # Records reaching outside the window still fail here.
                assert 0 <= local <= length, (
                    "column %d of %r falls outside %s:%d-%d"
                    % (idx, row[3], chrom, start, end)
                )
                row[idx] = local + offset
            rows.append(row)
        offset += length

with open("results/bcr_abl_fusion/transcripts.bed", "w+") as fw:
    # Original genome-wide records first, copied verbatim...
    with gzip.open(bed, "rt") as f:
        for line in f:
            fw.write(line)
    # ...then the re-mapped BCR_ABL records.
    for row in rows:
        fw.write("\t".join(map(str, row)) + "\n")

In [15]:
# make gtf
# Write a GTF annotation for the BCR_ABL contig only: every record that
# overlaps a window in `regions` is re-expressed in contig coordinates.

gtf = "/home/chenzonggui/species/homo_sapiens/GRCh38.p13/gencode.v39.annotation.sorted.gtf.gz"

rows = []
offset = 0  # where the current window begins inside the BCR_ABL contig
with pysam.TabixFile(gtf) as f:
    for chrom, start, end, strand in regions:
        length = end - start
        for line in f.fetch(chrom, start, end):
            row = line.split("\t")
            row[0] = "BCR_ABL"
            # Columns 3 and 4 are the GTF start/end, both 1-based inclusive.
            # Subtracting the 0-based window start therefore leaves a 1-based
            # position inside the window, whose valid range is [1, length]:
            # 0 would be an invalid GTF coordinate, while `length` is simply
            # the last base of the window.
            for idx in [3, 4]:
                local = int(row[idx]) - start
                assert 1 <= local <= length, (
                    "%s %s:%s-%s falls outside %s:%d-%d"
                    % (row[2], chrom, row[3], row[4], chrom, start, end)
                )
                row[idx] = local + offset
            rows.append(row)
        offset += length

with open("results/bcr_abl_fusion/transcripts.gtf", "w+") as fw:
    for row in rows:
        fw.write("\t".join(map(str, row)) + "\n")