In [4]:
import os, glob, json, re, multiprocessing, gzip
import random
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn, pysam
from PyPDF2 import PdfFileMerger
from pyBioInfo.Range import GRange
from pyBioInfo.IO.File import BedFile, SegmentTools, Alignment, GffFile, FastaFile
from pyBioInfo.Utils import ShiftLoader
from sstools.utils import BaseMatrix2
from matplotlib_venn import *
# infos = pd.read_excel("../1_NanoStrandseq/NanoStrandseq_summary.xlsx")

# SV cases for testing

In [8]:
# Turn every non-header record of the round-1 VCF files into a GRange.
# A record whose REF is longer than its ALT is labelled "DEL.<size>" and spans
# the REF bases; every other record is labelled "INS.<size>" and is placed on
# the interval [POS, POS + 1).
svs = []
for vcf_path in glob.glob("results/assembly/round1/*.vcf"):
    with open(vcf_path) as handle:
        for record in handle:
            if record.startswith("#"):
                continue
            fields = record.strip("\n").split("\t")
            chrom, pos = fields[0], int(fields[1])
            ref_len, alt_len = len(fields[3]), len(fields[4])
            if ref_len > alt_len:
                end = pos + ref_len - 1
                name = "DEL.%d" % (ref_len - alt_len)
            else:
                end = pos + 1
                name = "INS.%d" % (alt_len - ref_len)
            svs.append(GRange(chrom=chrom, start=pos, end=end, name=name, strand="+"))

In [9]:
# Dump every parsed SV as one BED line.
with open("results/assembly/cuteSV.bed", "w+") as bed_out:
    bed_out.writelines(sv.format("BED") + "\n" for sv in svs)

In [10]:
# Group the SVs by chromosome name (chrom -> list of SVs, in input order).
data = defaultdict(list)
for record in svs:
    per_chrom = data[record.chrom]
    per_chrom.append(record)

In [11]:
def calculate(sv, f):
    """Count reads that support or contradict one SV call.

    Parameters
    ----------
    sv
        SV record exposing ``chrom``, ``start``, ``end``, ``name`` and
        ``len()``. A ``name`` starting with ``"DEL"`` is handled as a
        deletion; any other name is handled as an insertion whose expected
        size is the integer after the first ``"."``.
    f
        Object whose ``fetch(chrom, start, end)`` yields alignment segments
        (a ``pysam.AlignmentFile`` in this notebook).

    Returns
    -------
    tuple
        ``(N_overlap, N_agree, N_disagree)``: the number of fetched segments,
        the number whose summed indel size lies strictly between 0.5x and
        1.5x the SV size, and the number of evaluated segments that do not.
        Segments that are skipped (see inline comments) count only towards
        ``N_overlap``.
    """
    # Assumption (confirm against SegmentTools.parse_cigar): for each parsed
    # CIGAR entry, item 0 is the operation letter, item 2 the (start, end) on
    # the read and item 3 the (start, end) on the reference.
    reads = []
    for segment in f.fetch(sv.chrom, sv.start, sv.end):
        parsed = SegmentTools.parse_cigar(segment)
        aln = Alignment(segment)
        aln.deletions = [op[3] for op in parsed if op[0] == "D"]
        aln.insertions = [[op[3][0], op[2][1] - op[2][0]] for op in parsed if op[0] == "I"]
        reads.append(aln)

    n_agree = 0
    n_disagree = 0

    if sv.name.startswith("DEL"):
        for aln in reads:
            # Ignore reads sharing fewer than 30 bp with the deleted interval.
            shared = min(aln.end, sv.end) - max(aln.start, sv.start)
            if shared < 30:
                continue
            # A read that does not extend past both breakpoints is a disagreement.
            if not (aln.start < sv.start and aln.end > sv.end):
                n_disagree += 1
                continue
            deleted = sum(
                y - x
                for x, y in aln.deletions
                if max(x, sv.start) < min(y, sv.end)
            )
            if len(sv) * 0.5 < deleted < len(sv) * 1.5:
                n_agree += 1
            else:
                n_disagree += 1
    else:
        # Insertions are looked for within 30 bp of the call.
        window_lo, window_hi = sv.start - 30, sv.end + 30
        expected = int(sv.name.split(".")[1])
        for aln in reads:
            # Skip reads with less than 50 bp of flank on either side.
            if min(sv.start - aln.start, aln.end - sv.end) < 50:
                continue
            inserted = sum(
                size
                for position, size in aln.insertions
                if window_lo < position < window_hi
            )
            if expected * 0.5 < inserted < expected * 1.5:
                n_agree += 1
            else:
                n_disagree += 1

    return len(reads), n_agree, n_disagree

# Keep SVs whose read support is haplotype-specific: both haplotype BAMs must
# have more than 10 evaluated reads (agree + disagree), and the agreeing
# fraction must be above 0.8 in one haplotype and below 0.2 in the other.
selected = []
for chrom, svs1 in sorted(data.items()):
    print(chrom)

    path1 = "results/assembly/round1/%s.hp1.sorted.bam" % chrom
    path2 = "results/assembly/round1/%s.hp2.sorted.bam" % chrom

    # Context managers close both BAM handles even if calculate() raises.
    with pysam.AlignmentFile(path1) as f1, pysam.AlignmentFile(path2) as f2:
        for sv in svs1:
            vs1 = calculate(sv, f1)
            vs2 = calculate(sv, f2)
            t1 = vs1[1] + vs1[2]
            t2 = vs2[1] + vs2[2]
            # Check depth before dividing: computing the ratios first ran
            # 0/0 for SVs without evaluated reads and raised RuntimeWarnings.
            if t1 <= 10 or t2 <= 10:
                continue
            p1 = np.divide(vs1[1], t1)
            p2 = np.divide(vs2[1], t2)
            if (p1 > 0.8 and p2 < 0.2) or (p1 < 0.2 and p2 > 0.8):
                sv.p1 = p1
                sv.p2 = p2
                selected.append(sv)

chr1




chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr20
chr21
chr22
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrX


In [12]:
# Number of SVs that passed the haplotype-specific support filter.
len(selected)

1652

In [15]:
# Read every record of the het BED file into memory.
hets = []
with BedFile("results/assembly/round1/all.hets.bed.gz") as bed_in:
    for het_record in bed_in:
        hets.append(het_record)

In [16]:
# Attach to each selected SV the het records found within 500 bp of it
# (window start clamped at 0). SVs are visited in sorted order.
loader = ShiftLoader(hets)
for sv in sorted(selected):
    window_start = max(0, sv.start - 500)
    window_end = sv.end + 500
    sv.hets = [
        het
        for het in loader.fetch(chrom=sv.chrom, start=window_start, end=window_end)
    ]

In [17]:
# Histogram: number of nearby het records -> number of selected SVs.
counter = Counter(len(sv.hets) for sv in selected)

In [18]:
# Show the chr22 SVs that have exactly one nearby het record.
for obj in selected:
    if obj.chrom == "chr22" and len(obj.hets) == 1:
        print(obj)

GRange: 41744475-41744797 [strand: +, block count: 1]
GRange: 47260818-47260889 [strand: +, block count: 1]
GRange: 47400075-47400157 [strand: +, block count: 1]
GRange: 19062800-19062801 [strand: +, block count: 1]
GRange: 28380049-28380050 [strand: +, block count: 1]
GRange: 35249171-35250145 [strand: +, block count: 1]
GRange: 43562651-43562728 [strand: +, block count: 1]
GRange: 48497566-48497798 [strand: +, block count: 1]
GRange: 50075125-50075188 [strand: +, block count: 1]


In [19]:
# Selected SVs with at least one het record in the +/-500 bp window
# (total minus those with zero).
len(selected) - counter[0]

1193

In [20]:
# Write the selected SVs that have at least one nearby het record, in sorted order.
with open("results/assembly/SVs_with_hetSNVs_in_500bp.bed", "w+") as bed_out:
    for sv in sorted(selected):
        if len(sv.hets) == 0:
            continue
        bed_out.write(sv.format("BED") + "\n")

In [21]:
# Load every RepeatMasker annotation record into memory.
records = []
with GffFile("/home/chenzonggui/species/homo_sapiens/RepeatMasker/hg38.fa.out.sorted.gff.gz") as gff_in:
    for gff_record in gff_in:
        records.append(gff_record)

In [24]:
# Sort the repeat records in place using their own ordering
# (presumably needed before handing them to ShiftLoader - confirm).
records.sort()

In [25]:
# Peek at the attributes of the first record; the "Name" value is what gets
# parsed when the TSV is written.
records[0].attributes

OrderedDict([('ID', '1_Simple_repeat_(TAACCC)n'),
             ('Name', '1_Simple_repeat_(TAACCC)n'),
             ('PercDiv', '1.3'),
             ('PercDel', '0.6'),
             ('PercIns', '1.7'),
             ('QueryLeft', '248945954'),
             ('RepeatBegin', '1'),
             ('RepeatEnd', '463'),
             ('RepeatLeft', '0')])

In [26]:
# Attach to each selected SV the repeat records returned for it by the loader.
loader = ShiftLoader(records)
for sv in sorted(selected):
    sv.repeats = [hit for hit in loader.fetch(obj=sv)]

In [56]:
# Export one TSV row per selected SV that has a nearby het record: the SV, its
# closest het position, the overlapping repeat names, and the reference
# sequence of two flanking windows (100-1000 bp outside the span that covers
# both the SV and that het position).
fasta = FastaFile("/home/chenzonggui/species/homo_sapiens/GRCh38.p13/GRCh38.canonical.genome.fa")
try:
    # The with-block guarantees the TSV is flushed and closed.
    with open("results/assembly/SVs_with_hetSNVs_in_500bp.tsv", "w+") as fw:
        fw.write("Chrom\tStart\tEnd\tType\tLength\tProximalHetSNV\tSNVDistance\tRepeatMasker\tUpStart\tUpEnd\tUpSequence\tDownStart\tDownEnd\tDownSequence\n")
        for obj in sorted(selected):
            if len(obj.hets) == 0:
                continue

            # Repeat names look like "<prefix>_<label>"; keep what follows the first "_".
            repeat_names = []
            for record in obj.repeats:
                s1 = record.attributes["Name"]
                i1 = s1.find("_")
                assert i1 != -1
                repeat_names.append(s1[i1 + 1:])
            repeat_field = ",".join(repeat_names)

            # Distance from each het to the SV interval. A het inside the
            # interval (or on a boundary) gets 0; previously that case left
            # `distance` unset or carrying the previous het's value.
            candidates = []
            for het in obj.hets:
                if het.start < obj.start:
                    distance = obj.start - het.start
                elif het.start > obj.end:
                    distance = het.start - obj.end
                else:
                    distance = 0
                candidates.append((distance, het.start))
            # Closest het; ties are broken by the smaller position.
            distance, position = min(candidates)

            start = min(obj.start, position)
            end = max(obj.end, position)

            start1, end1 = start - 1000, start - 100
            start2, end2 = end + 100, end + 1000
            seq1 = fasta.fetch(chrom=obj.chrom, start=start1, end=end1)
            seq2 = fasta.fetch(chrom=obj.chrom, start=start2, end=end2)

            sv_type, sv_size = obj.name.split(".")[0], int(obj.name.split(".")[1])
            line = "\t".join(map(str, [obj.chrom, obj.start, obj.end,
                                       sv_type, sv_size,
                                       position, distance,
                                       repeat_field,
                                       start1, end1, seq1,
                                       start2, end2, seq2]))
            fw.write(line + "\n")
finally:
    # Release the FASTA handle even if a fetch or write fails.
    fasta.close()