In [1]:
import os
import glob
import gzip
from collections import defaultdict
import multiprocessing as mp
import numpy as np
import pandas as pd
import pysam
import pyBigWig
from pyBioInfo.Range import GRange
from pyBioInfo.IO.File import BedFile
from pyBioInfo.Utils import ShiftLoader

# Scan extreme coverage regions

In [2]:
def worker(f_bw, chrom, step, min_coverage, max_coverage):
    """Collect windows of one chromosome whose mean coverage is out of range.

    The chromosome is walked in consecutive windows of `step` bases. Missing
    values (NaN) are counted as zero coverage. A window is reported when its
    mean coverage is below `min_coverage` or above `max_coverage`; reported
    windows that touch each other are merged into a single region.

    Args:
        f_bw: Path of the bigWig file holding the coverage track.
        chrom: Name of the chromosome to scan.
        step: Window size in bases.
        min_coverage: Lower bound of the accepted mean coverage (inclusive).
        max_coverage: Upper bound of the accepted mean coverage (inclusive).

    Returns:
        A list of `[chrom, start, end]` lists in ascending position order.
    """
    flagged = []
    # Coverage is fetched in batches of 1000 windows rather than per window.
    batch_size = step * 1000
    with pyBigWig.open(f_bw) as bw:
        chrom_size = bw.chroms()[chrom]
        for batch_start in range(0, chrom_size, batch_size):
            batch_end = min(batch_start + batch_size, chrom_size)
            batch_values = bw.values(chrom, batch_start, batch_end)
            for win_start in range(batch_start, batch_end, step):
                win_end = min(win_start + step, batch_end)
                lo = win_start - batch_start
                hi = win_end - batch_start
                mean_cov = np.mean(np.nan_to_num(batch_values[lo:hi]))
                if not (mean_cov < min_coverage or mean_cov > max_coverage):
                    continue
                # Grow the previous region when this window is adjacent to it.
                if flagged and win_start <= flagged[-1][2]:
                    flagged[-1][2] = win_end
                else:
                    flagged.append([chrom, win_start, win_end])
    return flagged

def scan_extreme_coverage_regions(f_bw, f_bed, threads=1, min_coverage=15, max_coverage=80, step=100):
    """Write the regions of a bigWig track whose mean coverage is out of range.

    One `worker` task is submitted per chromosome listed in the bigWig file.
    The per-chromosome results are concatenated, sorted and written to `f_bed`
    as tab-separated `chrom`, `start`, `end` lines.

    Args:
        f_bw: Path of the input bigWig coverage file.
        f_bed: Path of the output BED file (overwritten if it exists).
        threads: Number of worker processes.
        min_coverage: Windows with a mean coverage below this are reported.
        max_coverage: Windows with a mean coverage above this are reported.
        step: Window size in bases handed to `worker`.
    """
    with pyBigWig.open(f_bw) as f:
        chroms = list(f.chroms())

    regions = []
    # The context manager tears the pool down even when a task raises, so no
    # worker processes are left behind after a failed scan.
    with mp.Pool(threads) as pool:
        pending = [
            pool.apply_async(worker, (f_bw, chrom, step, min_coverage, max_coverage))
            for chrom in chroms
        ]
        # Results are collected inside the `with` block: leaving it terminates
        # the pool, which must not happen before every task has finished.
        for result in pending:
            regions.extend(result.get())
    regions.sort()

    with open(f_bed, "w") as fw:
        for r in regions:
            fw.write("\t".join(map(str, r)) + "\n")

In [3]:
# PacBio: report windows with a mean coverage below 10 or above 80.
f_bw = "results/bw/PacBio.full.bw"
f_bed = "results/sv/regions/PacBio.full.coverage_lt_10_gt_80.bed"
scan_extreme_coverage_regions(f_bw, f_bed, threads=24, min_coverage=10, max_coverage=80)

In [4]:
# PacBio: report windows with a mean coverage below 15 or above 80.
f_bw = "results/bw/PacBio.full.bw"
f_bed = "results/sv/regions/PacBio.full.coverage_lt_15_gt_80.bed"
scan_extreme_coverage_regions(f_bw, f_bed, threads=24, min_coverage=15, max_coverage=80)

In [5]:
# Ultralong: report windows with a mean coverage below 10 or above 80.
f_bw = "results/bw/Ultralong.full.bw"
f_bed = "results/sv/regions/Ultralong.full.coverage_lt_10_gt_80.bed"
scan_extreme_coverage_regions(f_bw, f_bed, threads=24, min_coverage=10, max_coverage=80)

In [6]:
# Ultralong: report windows with a mean coverage below 15 or above 80.
f_bw = "results/bw/Ultralong.full.bw"
f_bed = "results/sv/regions/Ultralong.full.coverage_lt_15_gt_80.bed"
scan_extreme_coverage_regions(f_bw, f_bed, threads=24, min_coverage=15, max_coverage=80)

# Scan dense SV regions

In [8]:
def collapse_regions(regions):
    """Merge neighbouring regions that overlap or touch.

    Only consecutive entries are compared, so `regions` is expected to be
    sorted by chromosome and start position already.

    Args:
        regions: Sequence of `[chrom, start, end]` items.

    Returns:
        A new list of `[chrom, start, end]` lists. Neither `regions` nor the
        items inside it are modified.
    """
    merged = []
    for region in regions:
        # Same chromosome and the previous end reaches this start: extend.
        if merged and merged[-1][0] == region[0] and merged[-1][2] >= region[1]:
            merged[-1][2] = max(merged[-1][2], region[2])
        else:
            # Copy the item so that later extensions never touch the input.
            merged.append(list(region))
    return merged

def scan_dense_sv_regions(f_vcf, f_bed, slop=1000):
    """Write the regions in which deletions/insertions lie close together.

    Only records with SVTYPE `DEL` or `INS` whose first FILTER entry is `PASS`
    are considered; records without any FILTER entry are skipped. Each
    retained SV is padded by `slop` bases on both sides (start clamped at 0).
    The padded interval is reported when the loader returns at least one
    other retained SV for it. Reported intervals are sorted, collapsed with
    `collapse_regions` and written to `f_bed` as tab-separated lines.

    Args:
        f_vcf: Path of the input VCF file.
        f_bed: Path of the output BED file (overwritten if it exists).
        slop: Number of bases added on each side of an SV.
    """
    svs = []
    with pysam.VariantFile(f_vcf) as f:
        for record in f:
            if record.info["SVTYPE"] not in ("DEL", "INS"):
                continue
            # FILTER can be unset, in which case there is no first entry to
            # inspect; treat such records as not passing.
            filters = list(record.filter)
            if not filters or filters[0] != "PASS":
                continue
            r = GRange(chrom=record.contig, start=record.start, end=record.stop, name=record.id)
            r.length = abs(record.info["SVLEN"])
            r.record = record
            svs.append(r)
    svs.sort()
    regions = []
    loader = ShiftLoader(svs)
    for sv in svs:
        chrom, start, end = sv.chrom, max(0, sv.start - slop), sv.end + slop
        hit = False
        # The iterator is consumed in full rather than stopping at the first
        # hit — presumably ShiftLoader advances its position while iterating;
        # TODO confirm before short-circuiting.
        for sv2 in loader.fetch(chrom=chrom, start=start, end=end):
            if sv2 is not sv:
                hit = True
        if hit:
            regions.append([chrom, start, end])
    regions.sort()
    regions = collapse_regions(regions)
    with open(f_bed, "w") as fw:
        for r in regions:
            fw.write("\t".join(map(str, r)) + "\n")

In [9]:
# PacBio: dense SV regions, each SV padded by 1 kb on both sides.
f_vcf = "results/sv/cutesv/PacBio.full.vcf.gz"
f_bed = "results/sv/regions/PacBio.full.dense_sv_1kb.bed"
scan_dense_sv_regions(f_vcf, f_bed, slop=1000)

In [10]:
# Ultralong: dense SV regions, each SV padded by 1 kb on both sides.
f_vcf = "results/sv/cutesv/Ultralong.full.vcf.gz"
f_bed = "results/sv/regions/Ultralong.full.dense_sv_1kb.bed"
scan_dense_sv_regions(f_vcf, f_bed, slop=1000)

# Scan regions around overly long SVs

In [11]:
def collapse_regions(regions):
    """Merge neighbouring regions that overlap or touch.

    Only consecutive entries are compared, so `regions` is expected to be
    sorted by chromosome and start position already.

    Args:
        regions: Sequence of `[chrom, start, end]` items.

    Returns:
        A new list of `[chrom, start, end]` lists. Neither `regions` nor the
        items inside it are modified.
    """
    merged = []
    for region in regions:
        # Same chromosome and the previous end reaches this start: extend.
        if merged and merged[-1][0] == region[0] and merged[-1][2] >= region[1]:
            merged[-1][2] = max(merged[-1][2], region[2])
        else:
            # Copy the item so that later extensions never touch the input.
            merged.append(list(region))
    return merged

def scan_long_sv_regions(f_vcf, f_bed, max_length=10000, slop=1000):
    """Write the padded regions of deletions/insertions longer than `max_length`.

    Only records with SVTYPE `DEL` or `INS` whose first FILTER entry is `PASS`
    are considered; records without any FILTER entry are skipped. A record is
    reported when the absolute value of its SVLEN exceeds `max_length`. Its
    interval is padded by `slop` bases on both sides (start clamped at 0).
    Reported intervals are sorted, collapsed with `collapse_regions` and
    written to `f_bed` as tab-separated lines.

    Args:
        f_vcf: Path of the input VCF file.
        f_bed: Path of the output BED file (overwritten if it exists).
        max_length: SVs strictly longer than this are reported.
        slop: Number of bases added on each side of a reported SV.
    """
    regions = []
    with pysam.VariantFile(f_vcf) as f:
        for record in f:
            if record.info["SVTYPE"] not in ("DEL", "INS"):
                continue
            # FILTER can be unset, in which case there is no first entry to
            # inspect; treat such records as not passing.
            filters = list(record.filter)
            if not filters or filters[0] != "PASS":
                continue
            if abs(record.info["SVLEN"]) <= max_length:
                continue
            start = max(0, record.start - slop)
            end = record.stop + slop
            regions.append([record.contig, start, end])
    regions.sort()
    regions = collapse_regions(regions)
    with open(f_bed, "w") as fw:
        for r in regions:
            fw.write("\t".join(map(str, r)) + "\n")

In [12]:
# PacBio: SVs longer than 10 kb, padded by 1 kb on both sides.
f_vcf = "results/sv/cutesv/PacBio.full.vcf.gz"
f_bed = "results/sv/regions/PacBio.full.long_sv_10kb_slop_1kb.bed"
scan_long_sv_regions(f_vcf=f_vcf, f_bed=f_bed, max_length=10_000, slop=1_000)

In [13]:
# Ultralong: SVs longer than 10 kb, padded by 1 kb on both sides.
f_vcf = "results/sv/cutesv/Ultralong.full.vcf.gz"
f_bed = "results/sv/regions/Ultralong.full.long_sv_10kb_slop_1kb.bed"
scan_long_sv_regions(f_vcf=f_vcf, f_bed=f_bed, max_length=10_000, slop=1_000)

# Merge blacklist regions

In [14]:
def collapse_regions(regions):
    """Merge neighbouring regions that overlap or touch.

    Only consecutive entries are compared, so `regions` is expected to be
    sorted by chromosome and start position already.

    Args:
        regions: Sequence of `[chrom, start, end]` items.

    Returns:
        A new list of `[chrom, start, end]` lists. Neither `regions` nor the
        items inside it are modified.
    """
    merged = []
    for region in regions:
        # Same chromosome and the previous end reaches this start: extend.
        if merged and merged[-1][0] == region[0] and merged[-1][2] >= region[1]:
            merged[-1][2] = max(merged[-1][2], region[2])
        else:
            # Copy the item so that later extensions never touch the input.
            merged.append(list(region))
    return merged

def load_bed_regions(path, slop=0, min_length=0):
    """Read `[chrom, start, end]` records from a plain or gzipped BED file.

    Blank lines and lines starting with "#" are skipped, and only the first
    three tab-separated columns of a line are used.

    Args:
        path: Path of the BED file; a ".gz" suffix selects gzip decompression.
        slop: Bases added on each side of a region (start clamped at 0).
        min_length: Regions shorter than this (before padding) are dropped.

    Returns:
        A list of `[chrom, start, end]` lists in file order.
    """
    opener = gzip.open if path.endswith(".gz") else open
    loaded = []
    with opener(path, "rt") as f:
        for line in f:
            if line.startswith("#") or not line.strip():
                continue
            chrom, start, end = line.rstrip("\n").split("\t")[:3]
            start, end = int(start), int(end)
            if end - start < min_length:
                continue
            if slop:
                start, end = max(0, start - slop), end + slop
            loaded.append([chrom, start, end])
    return loaded


# Every input of the blacklist as (path, slop, min_length).
# The coverage regions are padded by 1 kb here; the SV region files are used
# as written. Stratification intervals shorter than 200 bp are ignored.
BLACKLIST_SOURCES = [
    ("results/sv/regions/PacBio.full.coverage_lt_10_gt_80.bed", 1000, 0),
    ("results/sv/regions/Ultralong.full.coverage_lt_10_gt_80.bed", 1000, 0),
    ("results/sv/regions/PacBio.full.dense_sv_1kb.bed", 0, 0),
    ("results/sv/regions/Ultralong.full.dense_sv_1kb.bed", 0, 0),
    ("results/sv/regions/PacBio.full.long_sv_10kb_slop_1kb.bed", 0, 0),
    ("results/sv/regions/Ultralong.full.long_sv_10kb_slop_1kb.bed", 0, 0),
    ("/lustre/grp/tfclab/chenzg/repositories/genome-stratifications/v3.1-genome-stratifications-GRCh38/OtherDifficult/GRCh38_allOtherDifficultregions.bed.gz", 0, 200),
    ("/lustre/grp/tfclab/chenzg/repositories/genome-stratifications/v3.1-genome-stratifications-GRCh38/LowComplexity/GRCh38_AllTandemRepeatsandHomopolymers_slop5.bed.gz", 0, 200),
]

regions = []
for source in BLACKLIST_SOURCES:
    regions.extend(load_bed_regions(*source))

# collapse_regions only compares neighbours, so the input has to be sorted.
regions.sort()
print("All regions:", len(regions))

# Collapsing keeps the sorted order, so no second sort is needed afterwards.
regions = collapse_regions(regions)
print("Collapsed regions:", len(regions))

All regions: 94182
Collapsed regions: 78245


In [15]:
# Write the merged blacklist, one tab-separated region per line.
with open("results/sv/regions/benchmark_sv_blacklist.bed", "w+") as fw:
    for r in regions:
        print(*r, sep="\t", file=fw)

In [16]:
# Print the MD5 checksum of the written blacklist file.
! md5sum results/sv/regions/benchmark_sv_blacklist.bed

c65553bcf55a33a8a83634fd3e63f582  results/sv/regions/benchmark_sv_blacklist.bed
