In [26]:
from collections import defaultdict, Counter
import numpy as np
import pandas as pd
import pysam
from pyBioInfo.Range import GRange
from pyBioInfo.IO.File import BedFile
from pyBioInfo.Utils import ShiftLoader

# Haplotype-resolved SNV distribution

In [11]:
# Chromosome sizes: every line of the table is "<name>\t<length>".
genome_sizes_path = "/home/chenzonggui/species/mus_musculus/GRCm38.p6/GRCm38.canonical.genome.sizes"
chrom_lengths = {}
with open(genome_sizes_path) as handle:
    for row in handle:
        name, size = row.strip("\n").split("\t")
        chrom_lengths[name] = int(size)

# Chromosomes reported downstream, in output order: chr1..chr19, then chrX and chrY.
chroms = ["chr%s" % n for n in range(1, 20)]
chroms.extend(["chrX", "chrY"])

In [13]:
def load_snvs(path):
    """Collect SNV start positions per haplotype from a phased VCF.

    Only the first sample listed in the VCF header is read, and only records
    whose ``PS`` value for that sample equals ``"PATMAT"`` are kept.

    Parameters
    ----------
    path : str
        Path handed to ``pysam.VariantFile``.

    Returns
    -------
    tuple of (defaultdict, defaultdict)
        ``(data1, data2)``. Each maps a chromosome name to the list of
        ``record.start`` values (0-based in pysam) at which the first
        (``data1``) or second (``data2``) GT allele is a called,
        non-reference allele. Chromosomes without such records have no key;
        because these are ``defaultdict(list)`` objects, looking one up
        yields an empty list.

    Raises
    ------
    IndexError
        If a retained record has fewer than two alleles in its GT.
    """
    data1 = defaultdict(list)
    data2 = defaultdict(list)
    with pysam.VariantFile(path) as f:
        sample = list(f.header.samples)[0]
        for record in f:
            call = record.samples[sample]
            if call["PS"] != "PATMAT":
                continue
            gt = call["GT"]
            # pysam reports a missing allele (".") as None. A bare `!= 0`
            # test would count such no-calls as variants, so None is
            # excluded explicitly.
            if gt[0] is not None and gt[0] != 0:
                data1[record.chrom].append(record.start)
            if gt[1] is not None and gt[1] != 0:
                data2[record.chrom].append(record.start)
    return data1, data2

# Phased SNV calls (VCF) from the Mouse_Cell_200 round-2 results.
path = "../../A1_NanoStrandseqAssembly/results/Mouse_Cell_200/round2/snvs.vcf.gz"
# snvs1 / snvs2: chromosome -> SNV start positions for the first / second GT allele.
snvs1, snvs2 = load_snvs(path)

In [18]:
# Bin size (bp) used to count SNVs along each chromosome.
width = 10000000
out_template = "/home/chenzonggui/software/circos-0.69-9/mouse_snvs_distribution_hp%d.txt"
# One output file per haplotype (hp1 / hp2); each line is
# "<mm-prefixed chromosome>\t<bin start>\t<bin end>\t<count>".
for hp, snvs_by_chrom in enumerate((snvs1, snvs2)):
    with open(out_template % (hp + 1), "w+") as fw:
        for chrom in chroms:
            positions = snvs_by_chrom[chrom]
            length = chrom_lengths[chrom]
            # Number of bins: full bins plus one partial bin when there is a remainder.
            full_bins, remainder = divmod(length, width)
            nbin = full_bins + (1 if remainder > 0 else 0)
            ys = np.zeros(nbin)
            for pos in positions:
                ys[int(pos / width)] += 1
            # Chromosome label in the output: "chr" prefix replaced by "mm".
            label = "mm" + chrom[3:]
            for i, c in enumerate(ys):
                bin_start = i * width
                # The last bin is clipped to the chromosome length.
                bin_end = min(bin_start + width, length)
                # Counts of the second haplotype are written negated.
                value = c if hp == 0 else c * -1
                fields = [label, bin_start, bin_end, value]
                fw.write("\t".join(map(str, fields)) + "\n")

# Haplotype-resolved SV distribution

In [2]:
# Re-reads the genome-size table (same file as in the SNV section above);
# every line is "<name>\t<length>".
genome_sizes_path = "/home/chenzonggui/species/mus_musculus/GRCm38.p6/GRCm38.canonical.genome.sizes"
chrom_lengths = {}
with open(genome_sizes_path) as handle:
    for row in handle:
        name, size = row.strip("\n").split("\t")
        chrom_lengths[name] = int(size)

# Chromosomes reported downstream, in output order: chr1..chr19, then chrX and chrY.
chroms = ["chr%s" % n for n in range(1, 20)]
chroms.extend(["chrX", "chrY"])

In [4]:
path1 = "../../A1_NanoStrandseqAssembly/results/Mouse_Cell_200/inversions/inversions.bed.gz"


def _classify_inversion(crick, watson):
    """Label an inversion from its two integer counts.

    With fewer than 20 counts in total the label is "Unknown". Otherwise the
    watson share ``watson / (crick + watson)`` decides: ``>= 0.9`` gives
    "HOM", ``[0.4, 0.6)`` gives "HET", anything else stays "Unknown".
    """
    total = crick + watson
    if total < 20:
        return "Unknown"
    watson_share = watson / total
    if watson_share >= 0.9:
        return "HOM"
    if 0.4 <= watson_share < 0.6:
        return "HET"
    return "Unknown"


inversions = []
with BedFile(path1) as f:
    for inv in f:
        # The first two ";"-separated fields of the BED name are parsed as
        # integers (presumably Crick- and Watson-strand read counts).
        crick, watson = (int(field) for field in inv.name.split(";")[:2])
        inv.type = _classify_inversion(crick, watson)
        inversions.append(inv)
len(inversions)

2116

In [24]:
# Phased SV table; rows whose GenoType is "." are dropped.
dat = (
    pd.read_csv("data/quant_phased_svs.Mouse_Cell_200.tsv", sep="\t")
    .loc[lambda df: df["GenoType"] != "."]
)

In [43]:
# One GRange per SV row, built from the HP1 coordinates, with the genotype
# string stored as the range name.
sv_columns = ["Chrom_HP1", "Start_HP1", "End_HP1", "GenoType"]
svs = [
    GRange(chrom=chrom, start=start, end=end, name=gt)
    for chrom, start, end, gt in dat[sv_columns].values
]
svs.sort()
print(len(svs))

# (chromosome, haplotype index) -> list of [start, end] of SVs carried by that haplotype.
data = defaultdict(list)

loader = ShiftLoader(inversions)
for sv in svs:
    # Inversions returned by the loader for this SV (presumably the overlapping ones).
    hits = list(loader.fetch(obj=sv))
    if len(hits) > 1:
        # More than one inversion hit: the SV is skipped.
        continue
    if len(hits) == 1:
        # Only a "HOM" inversion is accepted; "HET" and "Unknown" hits drop the SV.
        if hits[0].type != "HOM":
            continue
        # Reversing the genotype string swaps its first and last characters,
        # i.e. the two per-haplotype alleles read below.
        sv.name = sv.name[::-1]

    # Characters 0 and 2 of the genotype string hold the alleles of
    # haplotype 0 and haplotype 1 (presumably a "0|1"-style string).
    for hp, allele_pos in enumerate((0, 2)):
        if sv.name[allele_pos] == "1":
            data[(sv.chrom, hp)].append([sv.start, sv.end])

24950


In [44]:
# Bin size (bp) used to count SVs along each chromosome.
width = 10000000
out_template = "/home/chenzonggui/software/circos-0.69-9/mouse_svs_distribution_hp%d.txt"
# One output file per haplotype (hp1 / hp2); each line is
# "<mm-prefixed chromosome>\t<bin start>\t<bin end>\t<count>".
for hp in (0, 1):
    with open(out_template % (hp + 1), "w+") as fw:
        for chrom in chroms:
            length = chrom_lengths[chrom]
            # Number of bins: full bins plus one partial bin when there is a remainder.
            full_bins, remainder = divmod(length, width)
            nbin = full_bins + (1 if remainder > 0 else 0)
            counts = np.zeros(nbin)
            # Each SV is assigned to the bin containing its start coordinate.
            for interval in data[(chrom, hp)]:
                counts[int(interval[0] / width)] += 1
            # Chromosome label in the output: "chr" prefix replaced by "mm".
            label = "mm" + chrom[3:]
            for i, c in enumerate(counts):
                bin_start = i * width
                # The last bin is clipped to the chromosome length.
                bin_end = min(bin_start + width, length)
                # Counts of the second haplotype are written negated.
                value = c if hp == 0 else c * -1
                fields = [label, bin_start, bin_end, value]
                fw.write("\t".join(map(str, fields)) + "\n")