In [1]:
import pickle
from collections import defaultdict
import numpy as np
import pandas as pd
import pysam
from pyBioInfo.Range import GRange
from pyBioInfo.IO.File import BedFile
from pyBioInfo.Utils import SegmentTools, ShiftLoader

# Prepare phased SVs for validation

In [3]:
def load_het_snps(f_vcf):
    """Load heterozygous SNPs of the first sample of a VCF as sorted GRange objects.

    A record is kept only when its genotype is a fully called diploid genotype
    whose two alleles are different single-character alleles. The SNP name
    encodes the two alleles: "A|G" when the PS field equals "PATMAT" or "0",
    and "A/G" in every other case (including a missing PS field).

    Parameters
    ----------
    f_vcf : str
        Path of a variant file that pysam.VariantFile can open.

    Returns
    -------
    list
        GRange objects (chrom, start, end, name), sorted in place with list.sort().
    """
    snps = []
    with pysam.VariantFile(f_vcf) as f:
        sample = list(f.header.samples)[0]
        for record in f:
            call = record.samples[sample]
            gt = call["GT"]
            # No-calls ("./."), half-calls ("./1") and non-diploid genotypes
            # cannot be used to pick two alleles; skip them instead of crashing.
            if gt is None or len(gt) != 2 or None in gt:
                continue
            a1, a2 = record.alleles[gt[0]], record.alleles[gt[1]]
            if len(a1) > 1 or len(a2) > 1 or a1 == a2:
                continue
            # A record without a PS value is treated like any other PS value
            # that is not "PATMAT"/"0": it gets the "/" style name.
            try:
                ps = call["PS"]
            except KeyError:
                ps = None
            if ps == "PATMAT" or ps == "0":
                name = "%s|%s" % (a1, a2)
            else:
                name = "%s/%s" % (a1, a2)
            snp = GRange(chrom=record.chrom, start=record.start, end=record.stop, name=name)
            snps.append(snp)
    snps.sort()
    return snps
# Heterozygous SNP sets: snps1 comes from the NanoStrand-seq phasing results,
# snps2 from the GIAB HG001 call set.
F_VCF_NANOSTRANDSEQ = "../../4_NanoStrandSeq_Phasing/results/HG001_Cell_350/round2/snvs.vcf.gz"
F_VCF_GIAB = "/lustre/grp/tfclab/chenzg/repositories/GRCh38_HG001_SNP_Indel/GRCh38_HG001_SNP_Indel.GIAB_v4.2.1_and_v3.3.2.vcf.gz"
snps1 = load_het_snps(F_VCF_NANOSTRANDSEQ)
snps2 = load_het_snps(F_VCF_GIAB)
print(len(snps1), len(snps2))

[W::hts_idx_load3] The index file is older than the data file: ../../4_NanoStrandSeq_Phasing/results/HG001_Cell_350/round2/snvs.vcf.gz.tbi


2024999 2080839


In [4]:
def load_het_svs(f_tsv):
    """Read a tab-separated SV table and return the retained SVs as sorted GRange objects.

    The HP1 coordinate columns (Chrom_HP1, Start_HP1, End_HP1, Name_HP1) define
    each GRange. Rows whose GenoType is "." or "1|1" are dropped; every kept
    object additionally carries `gt` (GenoType) and `length` (Length_HP1).

    Parameters
    ----------
    f_tsv : str
        Path of the tab-separated table.

    Returns
    -------
    list
        Sorted list of GRange objects.
    """
    columns = ["Chrom_HP1", "Start_HP1", "End_HP1", "Name_HP1", "Length_HP1", "GenoType"]
    excluded_genotypes = (".", "1|1")
    table = pd.read_csv(f_tsv, sep="\t")
    kept = []
    for row in table[columns].values:
        chrom, start, end, name, length, genotype = row
        sv = GRange(chrom=chrom, start=start, end=end, name=name)
        if genotype not in excluded_genotypes:
            sv.gt = genotype
            sv.length = length
            kept.append(sv)
    kept.sort()
    return kept

# SVs kept by load_het_svs (rows with GenoType "." or "1|1" are dropped).
F_PHASED_SVS = "results/quant_phased_svs.HG001_Cell_350.tsv"
svs = load_het_svs(F_PHASED_SVS)
print(len(svs))

6010


In [5]:
def get_closest_snp(sv, loader, max_distance=100000, min_distance=10):
    """Find the SNP closest to an SV among those returned by the loader.

    SNPs are requested from `loader.fetch` in a window extending `max_distance`
    on both sides of the SV. SNPs that touch or overlap the SV, and SNPs closer
    than `min_distance`, are ignored. On ties the first SNP returned by the
    loader wins.

    Parameters
    ----------
    sv : object
        Must expose `chrom`, `start` and `end`.
    loader : object
        Must expose `fetch(chrom=..., start=..., end=...)` yielding objects with
        `start` and `end`.
    max_distance : int
        Half-width added to each side of the SV to build the fetch window.
    min_distance : int
        SNPs whose distance to the SV is smaller than this are skipped
        (default 10, the previously hard-coded value).

    Returns
    -------
    tuple
        (closest_distance, closest_snp), or (None, None) when no SNP qualifies.
    """
    closest_distance = None
    closest_snp = None
    # Never query a negative coordinate for SVs close to the chromosome start.
    window_start = max(0, sv.start - max_distance)
    window_end = sv.end + max_distance
    for snp in loader.fetch(chrom=sv.chrom, start=window_start, end=window_end):
        if snp.end < sv.start:
            distance = sv.start - snp.end
        elif snp.start > sv.end:
            distance = snp.start - sv.end
        else:
            # SNP touches or overlaps the SV itself.
            continue
        if distance < min_distance:
            continue
        if closest_snp is None or distance < closest_distance:
            closest_distance = distance
            closest_snp = snp
    return closest_distance, closest_snp

In [13]:
# Select SVs whose closest NanoStrand-seq het SNP is at least MIN_SNP_DISTANCE
# away and record, for each of them, the region spanning SV + SNP together with
# the reference sequence flanking that region on both sides.
MIN_SNP_DISTANCE = 10000  # minimum SV-to-SNP distance for a candidate
FLANK_GAP = 100           # gap left between the SV/SNP region and each flank
FLANK_LENGTH = 5000       # length of each flanking sequence

n1 = 0
n2 = 0
loader1 = ShiftLoader(snps1) # NanoStrand-seq
loader2 = ShiftLoader(snps2) # GIAB
rows = []
with pysam.FastaFile("/lustre/grp/tfclab/chenzg/species/homo_sapiens/GRCh38.p13/GRCh38.canonical.genome.fa") as fasta:
    for sv in svs:
        closest_distance1, closest_snp1 = get_closest_snp(sv, loader1)
        # The GIAB lookup only feeds the commented-out GIAB_* columns below.
        closest_distance2, closest_snp2 = get_closest_snp(sv, loader2)
        if closest_distance1 is None or closest_distance1 < MIN_SNP_DISTANCE:
            continue
        # Both counters are kept for the printed summary; the former second
        # condition (distance >= 0) was always true once the first one passed.
        n1 += 1
        n2 += 1
        chrom = sv.chrom
        start = min(sv.start, closest_snp1.start)
        end = max(sv.end, closest_snp1.end)
        # Clamp the flanks to the contig so that fetch() never receives a
        # negative or out-of-range coordinate and the recorded coordinates
        # always describe the sequence that was actually fetched.
        chrom_length = fasta.get_reference_length(chrom)
        start1 = max(0, start - FLANK_GAP - FLANK_LENGTH)
        end1 = max(0, start - FLANK_GAP)
        start2 = min(chrom_length, end + FLANK_GAP)
        end2 = min(chrom_length, end + FLANK_GAP + FLANK_LENGTH)
        seq1 = fasta.fetch(chrom, start1, end1) if end1 > start1 else ""
        seq2 = fasta.fetch(chrom, start2, end2) if end2 > start2 else ""
        s = dict()
        s["Chrom"] = sv.chrom
        s["RangeStart"] = start
        s["RangeEnd"] = end
        s["Location"] = "%s:%s-%s" % (chrom, start, end)
        s["SV_To_hetSNP"] = closest_distance1
        s["SV_Start"] = sv.start
        s["SV_End"] = sv.end
        s["SV_Length"] = sv.length
        s["SV_Name"] = sv.name
        s["SV_GT"] = sv.gt
        s["SNP_Start"] = closest_snp1.start
        s["SNP_Name"] = closest_snp1.name
        #s["GIAB_SNP_Start"] = closest_snp2.start
        #s["GIAB_SNP_Name"] = closest_snp2.name
        s["UpstreamStart"] = start1
        s["UpstreamEnd"] = end1
        s["UpstreamSequence"] = seq1
        s["DownstreamStart"] = start2
        s["DownstreamEnd"] = end2
        s["DownstreamSequence"] = seq2
        rows.append(s)
print(n1, n2)

61 61


In [14]:
# Write the candidate table (one row per selected SV) to CSV without the index.
dat = pd.DataFrame(data=rows)
dat.to_csv(path_or_buf="results/candidate_sv_for_validate.csv", index=False)

# Validate phased SVs by long reads

In [4]:
def get_deletion_blocks(segment):
    """Return the reference intervals deleted in an alignment.

    The CIGAR is walked from `segment.reference_start`; every D operation
    contributes one `[start, end]` pair in reference coordinates.

    Parameters
    ----------
    segment : object
        Must expose `reference_start` and `cigartuples` (pairs of
        (operation, length)); `cigartuples` may be None or empty.

    Returns
    -------
    list
        `[start, end]` lists in CIGAR order; empty when there is no CIGAR.

    Raises
    ------
    AssertionError
        If an operation other than M, I, D, N, S, H, P, = or X is found.
    """
    # Operations that advance the reference position without being a deletion.
    advancing = (pysam.CMATCH, pysam.CEQUAL, pysam.CDIFF, pysam.CREF_SKIP)
    # Operations that do not consume any reference base.
    ignored = (pysam.CINS, pysam.CSOFT_CLIP, pysam.CHARD_CLIP, pysam.CPAD)
    position = segment.reference_start
    blocks = [] # [start, end]
    for flag, count in segment.cigartuples or ():
        if flag in advancing:
            position += count
        elif flag == pysam.CDEL:
            blocks.append([position, position + count])
            position += count
        elif flag in ignored:
            continue
        else:
            # Explicit raise so the check survives `python -O`.
            raise AssertionError("unexpected CIGAR operation: %s (length %s)" % (flag, count))
    return blocks


def get_insertion_blocks(segment):
    """Return the insertions of an alignment as reference position and length.

    The CIGAR is walked from `segment.reference_start`; every I operation
    contributes one `[position, length]` pair, where `position` is the
    reference coordinate reached when the insertion is encountered.

    Parameters
    ----------
    segment : object
        Must expose `reference_start` and `cigartuples` (pairs of
        (operation, length)); `cigartuples` may be None or empty.

    Returns
    -------
    list
        `[position, length]` lists in CIGAR order; empty when there is no CIGAR.

    Raises
    ------
    AssertionError
        If an operation other than M, I, D, N, S, H, P, = or X is found.
    """
    # Operations that consume reference bases.
    advancing = (pysam.CMATCH, pysam.CDEL, pysam.CEQUAL, pysam.CDIFF, pysam.CREF_SKIP)
    # Operations that consume neither reference bases nor count as insertions.
    ignored = (pysam.CSOFT_CLIP, pysam.CHARD_CLIP, pysam.CPAD)
    position = segment.reference_start
    ins = [] # [start, length]
    for flag, count in segment.cigartuples or ():
        if flag in advancing:
            position += count
        elif flag == pysam.CINS:
            ins.append([position, count])
        elif flag in ignored:
            continue
        else:
            # Explicit raise so the check survives `python -O`; the offending
            # operation is reported in the message instead of a debug print.
            raise AssertionError("unexpected CIGAR operation: %s (length %s)" % (flag, count))
    return ins


def _snp_window_is_clean(events, snp_start, window_start, window_end):
    """Tell whether no event other than a mismatch at the SNP hits the window.

    Events are consumed in the order given; scanning stops at the first event
    whose position is at or beyond `window_end`. An event with "-" in slot 1 is
    handled as an insertion, one with "-" in slot 2 as a deletion whose length
    is the length of slot 1, anything else as a mismatch.
    """
    for event in events:
        position = event[0]
        if event[1] == "-": # insertion
            dirty = window_start <= position < window_end
        elif event[2] == "-": # deletion
            deletion_end = position + len(event[1])
            dirty = max(position, window_start) < min(deletion_end, window_end)
        else: # mismatch
            dirty = window_start <= position < window_end and position != snp_start
        if dirty:
            return False
        if position >= window_end:
            return True
    return True


def load_segments(f_bam, chrom, start, end, snp_start, slop=300, perfect_width=20, max_reads=200):
    """Collect alignments that span a region and match cleanly around a SNP.

    An alignment fetched from `chrom:start-end` is kept when it
      * starts at or before `start - slop` and ends at or after `end + slop`,
      * has mapping quality of at least 60, and
      * has no insertion, deletion or mismatch within `perfect_width` bases on
        either side of `snp_start`, a mismatch exactly at `snp_start` excepted.

    Collection stops as soon as `max_reads` alignments have been gathered.

    Returns
    -------
    list
        The retained alignments, in fetch order.
    """
    window_start = snp_start - perfect_width
    window_end = snp_start + 1 + perfect_width
    leftmost_allowed = start - slop
    rightmost_required = end + slop
    kept = []
    with pysam.AlignmentFile(f_bam) as bam:
        for read in bam.fetch(chrom, start, end):
            # The read has to cover the whole region plus the slop on both sides.
            if read.reference_start > leftmost_allowed or read.reference_end < rightmost_required:
                continue
            if read.mapping_quality < 60:
                continue
            if not _snp_window_is_clean(SegmentTools.get_events(read), snp_start, window_start, window_end):
                continue
            kept.append(read)
            if len(kept) >= max_reads:
                break
    return kept

In [5]:
# For every candidate SV, count in each long-read data set how many spanning
# reads carry each (base at the SNP, SV support) combination. Reads of the
# "TangONT" data set are additionally written to results/<SV_Name>.bam with an
# XV tag holding "<base>_<support>".
MIN_LENGTH_RATIO = 0.7  # observed/expected indel length ratio required to support the SV

dat = pd.read_csv("results/candidate_sv_for_validate.csv")
f_bams = {
    "PacBio": "../../3_NanoStrandSeq_PseudoBulk/results/bams/PacBio.full.bam",
    "Ultralong": "../../3_NanoStrandSeq_PseudoBulk/results/bams/Ultralong.full.bam",
    "TangONT": "../../9_Validate_SV_by_bulk_ONT/results/GRCh38_HG001_ONT_PCR_SVs.bam"
}
array = []
for i in range(len(dat)):
    d = dict(dat.iloc[i])
    print(i, d["SV_Name"])
    chrom = d["Chrom"]
    start = d["RangeStart"]
    end = d["RangeEnd"]
    sv_start = d["SV_Start"]
    sv_end = d["SV_End"]
    sv_len = d["SV_Length"]
    sv_type = d["SV_Name"].split(".")[1]
    snp_start = d["SNP_Start"]
    for name, f_bam in f_bams.items():
        segments = load_segments(f_bam, chrom, start, end, snp_start, slop=300, perfect_width=20, max_reads=200)
        fw = None
        if name == "TangONT":
            # The template is only needed to copy the header, so it is closed
            # again right after the output file has been created.
            with pysam.AlignmentFile(f_bam) as template:
                fw = pysam.AlignmentFile("results/%s.bam" % d["SV_Name"], "wb", template=template)
        counter = defaultdict(int)
        try:
            for s in segments:
                if sv_type == "INS":
                    # Total inserted length within sv_len of the SV position.
                    pmin, pmax = sv_start - sv_len, sv_start + sv_len
                    n = 0
                    for x, length in get_insertion_blocks(s):
                        if pmin < x < pmax:
                            n += length
                elif sv_type == "DEL":
                    # Total length of the deletions overlapping the SV interval.
                    n = 0
                    for x, y in get_deletion_blocks(s):
                        if max(x, sv_start) < min(y, sv_end):
                            n += (y - x)
                else:
                    raise AssertionError("unsupported SV type: %s" % sv_type)
                # The read supports the SV when observed and expected lengths
                # agree within MIN_LENGTH_RATIO; otherwise it counts as reference.
                if min(sv_len, n) >= max(sv_len, n) * MIN_LENGTH_RATIO:
                    support = sv_type
                else:
                    support = "REF"
                base = SegmentTools.get_query_base(segment=s, position=snp_start)
                counter[(base, support)] += 1
                if fw is not None:
                    s.set_tag("XV", "%s_%s" % (base, support))
                    fw.write(s)
        finally:
            # Close the output BAM even when a read fails to be processed.
            if fw is not None:
                fw.close()
        d[name] = counter
    array.append(d)

0 cuteSV.DEL.1463
1 cuteSV.INS.1390
2 cuteSV.INS.1671
3 cuteSV.DEL.1250
4 cuteSV.INS.431
5 cuteSV.DEL.2166
6 cuteSV.DEL.10
7 cuteSV.DEL.1018
8 cuteSV.INS.870
9 cuteSV.INS.444
10 cuteSV.DEL.544
11 cuteSV.INS.368
12 cuteSV.INS.294
13 cuteSV.DEL.1482
14 cuteSV.DEL.1014
15 cuteSV.DEL.1015
16 cuteSV.DEL.725
17 cuteSV.DEL.2258
18 cuteSV.DEL.2259
19 cuteSV.DEL.3095
20 cuteSV.DEL.509
21 cuteSV.DEL.1752
22 cuteSV.INS.616
23 cuteSV.DEL.3057
24 cuteSV.DEL.1727
25 cuteSV.DEL.2528
26 cuteSV.DEL.1300


In [8]:
# Store the per-SV validation records (dicts holding the read counters) as a pickle.
with open("results/validate_results.pkl", mode="wb") as handle:
    pickle.dump(array, handle)

In [9]:
# Print the read counts of every SV with at least 100 counted TangONT reads.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open("results/validate_results.pkl", "rb") as f:
    array1 = pickle.load(f)
for i in range(len(array1)):
    d = array1[i]
    if sum(d["TangONT"].values()) < 100:
        continue
    print("-" * 80)
    print(i, d["SV_Name"], d["SV_GT"], d["SNP_Name"])
    print(d["Chrom"])
    print("SV:", d["SV_Start"], d["SV_End"], d["SV_Length"])
    print("SNP:", d["SNP_Start"])
    print("Distance:", d["SV_To_hetSNP"])
    # Counters are listed from the most to the least frequent combination.
    print("PacBio:", sorted(d["PacBio"].items(), key=lambda item: item[1], reverse=True))
    print("ONT-UL:", sorted(d["Ultralong"].items(), key=lambda item: item[1], reverse=True))
    print("TargetPCR", sorted(d["TangONT"].items(), key=lambda item: item[1], reverse=True))

--------------------------------------------------------------------------------
0 cuteSV.DEL.1463 1|0 A|G
chr1
SV: 58300577 58300628 50
SNP: 58323583
Distance: 22955
PacBio: []
ONT-UL: []
TargetPCR [(('G', 'REF'), 95), (('A', 'REF'), 87), (('A', 'DEL'), 15), (('G', 'DEL'), 3)]
--------------------------------------------------------------------------------
1 cuteSV.INS.1390 0|1 A|G
chr1
SV: 188863707 188863708 320
SNP: 188889328
Distance: 25620
PacBio: []
ONT-UL: [(('G', 'INS'), 1)]
TargetPCR [(('A', 'REF'), 96), (('G', 'INS'), 70), (('G', 'REF'), 17), (('A', 'INS'), 17)]
--------------------------------------------------------------------------------
2 cuteSV.INS.1671 1|0 A|G
chr1
SV: 219475626 219475627 59
SNP: 219465411
Distance: 10214
PacBio: []
ONT-UL: [(('G', 'REF'), 3)]
TargetPCR [(('G', 'REF'), 121), (('G', 'INS'), 79)]
--------------------------------------------------------------------------------
3 cuteSV.DEL.1250 0|1 A|G
chr10
SV: 49127172 49127232 59
SNP: 49141670
Distanc