In [7]:
import sys
import json
import pandas as pd
from collections import Counter, defaultdict
import pysam
from pyBioInfo.Range import GRange
from pyBioInfo.Utils import ShiftLoader

In [8]:
def load_svs(path):
    """Read deletion and insertion records from a VCF into GRange objects.

    Records on chrY and records whose SVTYPE is neither "DEL" nor "INS" are
    skipped. Each returned GRange spans record.start..record.stop, is named
    after the record id, and carries three extra attributes:

    * ``record`` - the originating pysam record
    * ``svtype`` - the SVTYPE INFO value
    * ``svlen``  - the absolute value of the SVLEN INFO value

    Parameters
    ----------
    path : str
        Path of the VCF file opened with ``pysam.VariantFile``.

    Returns
    -------
    list
        GRange objects in the order the records appear in the file.
    """
    wanted_types = ("DEL", "INS")
    collected = []
    with pysam.VariantFile(path) as vcf:
        for rec in vcf:
            if rec.contig == "chrY":
                continue
            kind = rec.info["SVTYPE"]
            if kind not in wanted_types:
                continue
            # NOTE: a FILTER == "PASS" check used to sit here; it is disabled,
            # so records are kept regardless of their FILTER column.
            length = abs(rec.info["SVLEN"])
            interval = GRange(chrom=rec.contig, start=rec.start, end=rec.stop, name=rec.id)
            interval.record = rec
            interval.svtype = kind
            interval.svlen = length
            collected.append(interval)
    return collected

def filter_sv_by_regions(svs, regions):
    """Return the SVs for which the region loader yields no region.

    Parameters
    ----------
    svs : iterable
        SV intervals, queried one by one through ``ShiftLoader.fetch(obj=sv)``.
    regions : iterable
        Regions handed to ``ShiftLoader``.

    Returns
    -------
    list
        The SVs, in input order, whose fetch produced zero regions.
    """
    region_loader = ShiftLoader(regions)
    survivors = []
    for candidate in svs:
        # Materialise the whole fetch result so the loader is driven exactly
        # as before, then keep the SV only when nothing came back.
        overlapping = list(region_loader.fetch(obj=candidate))
        if not overlapping:
            survivors.append(candidate)
    return survivors

def get_recall(svs_ref, svs_que, svtype, max_distance=1000, min_size_ratio=0.7):
    """Count how many reference SVs of one type are matched by a query SV.

    Both inputs are first restricted to ``svtype``. A reference SV is a hit
    when the query loader returns, for the reference interval padded by
    ``max_distance`` on each side, at least one SV whose length satisfies
    ``min(len_a, len_b) >= max(len_a, len_b) * min_size_ratio``. Only the
    first matching query SV is used.

    Side effects
    ------------
    * Prints a separator, the type with both set sizes, the genotype counts
      of the reference set, and the genotype counts of the hits.
    * When the matching query SV has an ``nss_gt`` attribute, it is copied
      onto the reference SV as ``new_gt``.

    Parameters
    ----------
    svs_ref, svs_que : iterable
        SV objects exposing ``svtype``, ``svlen``, ``gt``, ``chrom``,
        ``start`` and ``end``.
    svtype : str
        SV type to evaluate (the notebook passes "DEL" or "INS").
    max_distance : int, optional
        Padding added to both ends of the reference interval (default 1000).
    min_size_ratio : float, optional
        Minimum shorter/longer length ratio for a match (default 0.7).

    Returns
    -------
    dict
        Keys "Reference", "Query", "Reference_Hit" and "Reference_Recall".
        "Reference_Recall" is 0.0 when no reference SV of this type exists,
        instead of raising ZeroDivisionError.
    """
    svs_ref = [sv for sv in svs_ref if sv.svtype == svtype]
    svs_que = [sv for sv in svs_que if sv.svtype == svtype]
    print("-" * 80)
    print(svtype, len(svs_ref), len(svs_que), sep="\t")
    print(Counter([sv.gt for sv in svs_ref]))

    n_hit = 0
    loader = ShiftLoader(svs_que)
    counter = defaultdict(int)
    for sv in svs_ref:
        window = loader.fetch(chrom=sv.chrom, start=sv.start - max_distance, end=sv.end + max_distance)
        for sv2 in window:
            # Both lists are already restricted to `svtype`, so only the
            # size similarity needs checking here.
            if min(sv.svlen, sv2.svlen) < max(sv.svlen, sv2.svlen) * min_size_ratio:
                continue
            # Best effort: only query SVs annotated with `nss_gt` pass it on.
            if hasattr(sv2, "nss_gt"):
                sv.new_gt = sv2.nss_gt
            counter[sv.gt] += 1
            n_hit += 1
            break  # the first matching query SV decides the hit
    print(counter)

    data = dict()
    data["Reference"] = len(svs_ref)
    data["Query"] = len(svs_que)
    data["Reference_Hit"] = n_hit
    # An empty reference set has no defined recall; report 0.0 rather than crash.
    data["Reference_Recall"] = n_hit / len(svs_ref) if svs_ref else 0.0
    return data

In [9]:
# Input files. The "1" files feed the reference set (svs_ref) and the "2"
# files feed the query set (svs_que); each VCF has a matching quantification
# table that is read further down in this cell.
f_vcf1 = "../../3_NanoStrandSeq_PseudoBulk/results/sv/filtered/PacBio.full.vcf.gz"
f_quant1 = "../../3_NanoStrandSeq_PseudoBulk/results/sv/quantify_lite/PacBio.full.tsv"
f_vcf2 = "../../4_NanoStrandSeq_Phasing/results/HG001_Cell_350/sv/cutesv.filtered.vcf.gz"
f_quant2 = "../../4_NanoStrandSeq_Phasing/results/HG001_Cell_350/sv/quantify_merged.tsv"
# BED file of blacklist regions; SVs hitting any of them are dropped later.
f_bed = "../../3_NanoStrandSeq_PseudoBulk/results/sv/regions/benchmark_sv_blacklist.bed"
# Minimum "AgreeCell" value required of query SVs; only applied when > 1.
min_query_cell = 3

# load SVs (DEL/INS records outside chrY, see load_svs)

svs_ref = load_svs(f_vcf1)
svs_que = load_svs(f_vcf2)

# SV names

# Thresholds shared by the reference and the query quantification tables.
max_length = 10000  # keep rows with Length <= max_length
min_length = 50     # keep rows with Length >= min_length
min_reads = 5       # keep rows with AgreeRead >= min_reads
min_freq = 0.2      # keep rows with AgreeReadRatio >= min_freq


def load_quant_table(path, min_cells=None):
    """Read one quantification TSV and keep the rows passing every threshold.

    Adds an "AgreeReadRatio" column, AgreeRead / (AgreeRead + DisagreeRead),
    then keeps rows that satisfy all of: min_length <= Length <= max_length,
    Chrom != "chrY", AgreeRead >= min_reads, AgreeReadRatio >= min_freq and,
    when ``min_cells`` is given, AgreeCell >= min_cells.

    Parameters
    ----------
    path : str
        Tab-separated table with Name, Chrom, Length, AgreeRead and
        DisagreeRead columns (plus AgreeCell when ``min_cells`` is used).
    min_cells : int or None, optional
        Minimum AgreeCell value; None skips the check so tables without that
        column can be loaded.

    Returns
    -------
    pandas.DataFrame
        The surviving rows, original index and row order preserved.
    """
    table = pd.read_csv(path, sep="\t")
    table["AgreeReadRatio"] = table["AgreeRead"] / (table["AgreeRead"] + table["DisagreeRead"])
    keep = (
        (table["Length"] <= max_length)
        & (table["Length"] >= min_length)
        & (table["Chrom"] != "chrY")
        & (table["AgreeRead"] >= min_reads)
        # Reuse the column computed above instead of recomputing the ratio.
        & (table["AgreeReadRatio"] >= min_freq)
    )
    if min_cells is not None:
        keep = keep & (table["AgreeCell"] >= min_cells)
    return table[keep]


# Reference table: no per-cell support requirement.
dat_ref = load_quant_table(f_quant1)
ratios_ref = dict(zip(dat_ref["Name"], dat_ref["AgreeReadRatio"]))
names_ref = set(dat_ref["Name"])

# Query table: additionally require support from enough cells when configured.
dat_que = load_quant_table(f_quant2, min_cells=min_query_cell if min_query_cell > 1 else None)
ratios_que = dict(zip(dat_que["Name"], dat_que["AgreeReadRatio"]))
names_que = set(dat_que["Name"])

# `dat` previously ended up bound to the query table; keep that binding for
# any cell that still reads it.
dat = dat_que

# filter sv by names

# Keep only the SVs whose name survived the quantification thresholds.
svs_ref_1 = [sv for sv in svs_ref if sv.name in names_ref]
svs_que_1 = [sv for sv in svs_que if sv.name in names_que]
n_ref_kept = len(svs_ref_1)
n_que_kept = len(svs_que_1)
print("Number of filtered SVs (ref):", n_ref_kept)
print("Number of filtered SVs (que):", n_que_kept)

# blacklist regions

# Parse the blacklist BED file into sorted GRange objects.
regions = []
with open(f_bed) as f:
    for line in f:
        line = line.rstrip("\n")
        # Blank lines and BED header lines carry no interval; previously they
        # made the three-way unpacking below raise ValueError.
        if not line.strip() or line.startswith(("#", "track", "browser")):
            continue
        # Only the first three columns are needed; extra BED columns
        # (name, score, strand, ...) are ignored instead of breaking the parse.
        chrom, start, end = line.split("\t")[:3]
        start, end = int(start), int(end)
        regions.append(GRange(chrom=chrom, start=start, end=end))
regions.sort()

# filter sv by regions

# Drop every SV for which the blacklist loader returns a region.
svs_ref_2 = filter_sv_by_regions(svs_ref_1, regions)
svs_que_2 = filter_sv_by_regions(svs_que_1, regions)

# benchmark

ref, que = svs_ref_2, svs_que_2

# Genotype from the agreeing-read ratio: >= 0.8 is called "1/1", else "0/1".
for sv in ref:
    sv.gt = "1/1" if ratios_ref[sv.name] >= 0.8 else "0/1"

for sv in que:
    sv.gt = "1/1" if ratios_que[sv.name] >= 0.8 else "0/1"

[W::hts_idx_load3] The index file is older than the data file: ../../4_NanoStrandSeq_Phasing/results/HG001_Cell_350/sv/cutesv.filtered.vcf.gz.tbi


Number of filtered SVs (ref): 19878
Number of filtered SVs (que): 21211


In [4]:
# Attach the genotype from the phased-SV table to the query SVs by name.
m = pd.read_csv("results/quant_phased_svs.HG001_Cell_350.tsv", sep="\t", header=0)

# Name_HP1 -> GenoType, skipping rows whose genotype is the "." placeholder.
nss_gt = {
    sv_name: genotype
    for sv_name, genotype in m[["Name_HP1", "GenoType"]].values
    if genotype != "."
}

# Query SVs without an entry simply get no `nss_gt` attribute.
for sv in que:
    if sv.name not in nss_gt:
        continue
    sv.nss_gt = nss_gt[sv.name]

In [5]:
data = dict()

# Same benchmark for deletions and insertions; each pass fills the
# "<Prefix>_Recall", "_Precision", "_F1" and "_Detail" keys.
for prefix, svtype in (("Del", "DEL"), ("Ins", "INS")):
    # Recall: share of reference SVs matched by a query SV.
    d1 = get_recall(ref, que, svtype)
    # Precision: the same matching with the roles swapped, i.e. the share of
    # query SVs matched by a reference SV.
    d2 = get_recall(que, ref, svtype)
    recall = d1["Reference_Recall"]
    precision = d2["Reference_Recall"]
    # F1 is undefined when both terms are 0; report 0.0 instead of raising
    # ZeroDivisionError.
    total = recall + precision
    f1 = 2 * recall * precision / total if total > 0 else 0.0
    data[prefix + "_Recall"] = recall
    data[prefix + "_Precision"] = precision
    data[prefix + "_F1"] = f1
    data[prefix + "_Detail"] = [d1, d2]

# output
# NOTE: disabled; `outfile` is not defined in the visible cells.
# with open(outfile, "w+") as fw:
#     json.dump(data, fw, indent=4)
data

--------------------------------------------------------------------------------
DEL	3160	3531
Counter({'0/1': 1846, '1/1': 1314})
defaultdict(<class 'int'>, {'1/1': 1296, '0/1': 1611})
--------------------------------------------------------------------------------
DEL	3531	3160
Counter({'0/1': 2154, '1/1': 1377})
defaultdict(<class 'int'>, {'1/1': 1251, '0/1': 1701})
--------------------------------------------------------------------------------
INS	5134	5112
Counter({'0/1': 2712, '1/1': 2422})
defaultdict(<class 'int'>, {'0/1': 2326, '1/1': 2094})
--------------------------------------------------------------------------------
INS	5112	5134
Counter({'0/1': 3112, '1/1': 2000})
defaultdict(<class 'int'>, {'0/1': 2601, '1/1': 1847})


{'Del_Recall': 0.9199367088607595,
 'Del_Precision': 0.8360237892948174,
 'Del_F1': 0.8759752557771202,
 'Del_Detail': [{'Reference': 3160,
   'Query': 3531,
   'Reference_Hit': 2907,
   'Reference_Recall': 0.9199367088607595},
  {'Reference': 3531,
   'Query': 3160,
   'Reference_Hit': 2952,
   'Reference_Recall': 0.8360237892948174}],
 'Ins_Recall': 0.8609271523178808,
 'Ins_Precision': 0.8701095461658842,
 'Ins_F1': 0.8654939949468944,
 'Ins_Detail': [{'Reference': 5134,
   'Query': 5112,
   'Reference_Hit': 4420,
   'Reference_Recall': 0.8609271523178808},
  {'Reference': 5112,
   'Query': 5134,
   'Reference_Hit': 4448,
   'Reference_Recall': 0.8701095461658842}]}

In [6]:
# Tally the `new_gt` values that get_recall copied onto reference SVs;
# SVs that never received one are skipped.
counter = defaultdict(int)
for sv in svs_ref:
    if not hasattr(sv, "new_gt"):
        continue
    counter[sv.new_gt] += 1
counter

defaultdict(int, {'1|0': 1623, '1|1': 2367, '0|1': 1577})