In [3]:
import re
from collections import Counter, defaultdict
import pysam
import multiprocessing as mp

In [49]:
def parse_cigar_string(s):
    array = []
    while len(s) > 0:
        ret = re.search("^[0-9]+[MIDSH]", s)
        i1, i2, = ret.span()
        s1 = s[:i2]
        k, v = s1[-1], int(s1[:-1])
        s = s[i2:]
        array.append([k ,v])
    return array

def get_read_length(cigars):
    v = 0
    for cigar in cigars:
        if cigar[0] in ["M", "S", "H", "I"]:
            v += cigar[1]
    return v

def get_clips(cigars):
    clip1 = 0
    clip2 = 0
    if len(cigars) >= 2:
        if cigars[0][0] in ["S", "H"]:
            clip1 = cigars[0][1]
        if cigars[-1][0] in ["S", "H"]:
            clip2 = cigars[-1][1]
    return clip1, clip2

def get_mapped_index(cigars, strand, length):
    clip1, clip2 = get_clips(cigars)
    if strand == "-":
        clip1, clip2 = clip2, clip1
    return [clip1, length - clip2] 

def get_mapped_region(cigars, start):
    end = start
    for cigar in cigars:
        if cigar[0] in ["M", "D", "N"]:
            end += cigar[1]
    return start, end
    
def parse_supplementary_alignment(s):
    d = dict()
    chrom, start, strand, cigarstring, mapq, nm = s.split(",")
    start, mapq, nm = int(start), int(mapq), int(nm)
    cigars = parse_cigar_string(cigarstring)
    length = get_read_length(cigars)
    start, end = get_mapped_region(cigars, start)
    i1, i2 = get_mapped_index(cigars, strand, length)
    d["Chrom"] = chrom
    d["Start"] = start
    d["End"] = end
    d["Strand"] = strand
    d["Length"] = length
    d["CigarString"] = cigarstring
    d["Cigars"] = cigars
    d["ReadStart"] = i1
    d["ReadEnd"] = i2
    d["MapQ"] = mapq
    d["NM"] = nm
    return d

def worker(f_bam, chrom, start, end):
    print(chrom, start, end, sep="\t")
    counter = defaultdict(int)
    with pysam.AlignmentFile(f_bam) as f:
        for s in f.fetch(chrom, start, end):
            if s.reference_start < start:
                continue
            hits = []

            # primary
            cigars = parse_cigar_string(s.cigarstring)
            length = get_read_length(cigars)
            strand = "+" if s.is_forward else "-"
            i1, i2 = get_mapped_index(cigars, strand, length)
            hit1 = dict()
            hit1["Chrom"] = s.reference_name
            hit1["Start"] = s.reference_start
            hit1["End"] = s.reference_end
            hit1["Strand"] = strand
            hit1["Length"] = length
            hit1["CigarString"] = s.cigarstring
            hit1["Cigars"] = cigars
            hit1["ReadStart"] = i1
            hit1["ReadEnd"] = i2
            hit1["MapQ"] = s.mapping_quality
            hit1["NM"] = s.get_tag("NM")
            hits.append(hit1)
            
            min_clip = 200
            clip1, clip2 = get_clips(hit1["Cigars"])
            if clip1 < min_clip and clip2 < min_clip:
                continue
            if clip1 >= min_clip and clip2 >= min_clip:
                continue
            
            if True:
                if s.has_tag("SA"):
                    for sa in s.get_tag("SA").split(";"):
                        if sa == "":
                            continue
                        hit2 = parse_supplementary_alignment(sa)
                        if hit2["Chrom"] == s.reference_name:
                            hits.append(hit2)
            if len(hits) != 2:
                continue
                
            hits = list(sorted(hits, key=lambda item: item["Start"]))
            
            
            hit1, hit2 = hits
            clip1, clip2 = get_clips(hit1["Cigars"])
            clip3, clip4 = get_clips(hit2["Cigars"])
            if clip1 >= min_clip and clip2 < min_clip and clip3 >= min_clip and clip4 < min_clip:
                counter[hit1["Start"]] += 1
                counter[hit2["Start"]] += 1
            elif clip1 < min_clip and clip2 >= min_clip and clip3 < min_clip and clip4 >= min_clip:
                counter[hit1["End"]] += 1
                counter[hit2["End"]] += 1
            elif clip1 >= min_clip and clip2 < min_clip and clip3 < min_clip and clip4 >= min_clip:
                counter[hit1["Start"]] += 1
                counter[hit2["End"]] += 1
            elif clip1 < min_clip and clip2 >= min_clip and clip3 >= min_clip and clip4 < min_clip:
                counter[hit1["End"]] += 1
                counter[hit2["Start"]] += 1
    return [chrom, counter]

In [50]:
f_bam = "../../A1_NanoStrandseqAssembly/results/HG001_Cell_350/prepare/all_cells.all_chroms.bam"
chrom, counter = worker(f_bam, "chr16", 21434238, 21652017)
items = list(sorted(counter.items()))
items

chr16	21434238	21652017


[(21504240, 1),
 (21505629, 1),
 (21508971, 1),
 (21509009, 1),
 (21554955, 1),
 (21555017, 1),
 (21573589, 4),
 (21583076, 1),
 (21583103, 1),
 (21583118, 1),
 (21583121, 21),
 (21583122, 1),
 (22630990, 4),
 (22699414, 2),
 (22699421, 1),
 (22699429, 2),
 (22699431, 20)]

In [52]:
f_bam = "../../A1_NanoStrandseqAssembly/results/HG001_Cell_350/prepare/all_cells.all_chroms.bam"
results = []
pool = mp.Pool(24)
with pysam.AlignmentFile(f_bam) as f:
    chroms = list(f.references)
    for chrom in chroms:
        length = f.get_reference_length(chrom)
        step = 10000000
        for start in range(0, length, step):
            end = min(start + step, length)
            args = (f_bam, chrom, start, end)
            r = pool.apply_async(worker, args)
            results.append(r)
pool.close()
pool.join()

chr1chr1chr1chr1chr1chr1chr1chr1chr1chr1	chr1										300000000100000002000000050000000chr16000000090000000400000008000000010000000070000000												400000001000000030000000600000001000000002000000050000000110000000900000007000000080000000110000000







	


120000000
chr1	chr1chr1chr1chr1chr1	chr1chr1	200000000190000000		220000000	140000000210000000200000000				
		210000000220000000120000000230000000130000000chr1

	chr1chr1	170000000	140000000		160000000180000000130000000	

	
chr1180000000	170000000	
150000000	150000000
	160000000	240000000230000000


190000000
chr1	240000000	248956422
chr2	0	10000000
chr2	10000000	20000000
chr2	20000000	30000000
chr2	30000000	40000000
chr2	40000000	50000000
chr2	50000000	60000000
chr2	60000000	70000000
chr2	70000000	80000000
chr2	80000000	90000000
chr2	90000000	100000000
chr2	100000000	110000000
chr2	110000000	120000000
chr2	120000000	130000000
chr2	130000000	140000000
chr2	140000000	150000000
chr2	150000000	160000000
chr2	160000000	170000000
ch

Process ForkPoolWorker-166:
Process ForkPoolWorker-161:
Process ForkPoolWorker-158:
Process ForkPoolWorker-156:
Process ForkPoolWorker-152:
Process ForkPoolWorker-157:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Process ForkPoolWorker-164:
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", 

  File "<ipython-input-49-fe6785cb94b0>", line 73, in worker
    cigars = parse_cigar_string(s.cigarstring)
  File "<ipython-input-49-fe6785cb94b0>", line 5, in parse_cigar_string
    i1, i2, = ret.span()
  File "<ipython-input-49-fe6785cb94b0>", line 4, in parse_cigar_string
    ret = re.search("^[0-9]+[MIDSH]", s)
  File "<ipython-input-49-fe6785cb94b0>", line 73, in worker
    cigars = parse_cigar_string(s.cigarstring)
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/re.py", line 289, in _compile
    p, loc = _cache[type(pattern), pattern, flags]
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "<ipython-input-49-fe6785cb94b0>", line 83, in worker
    hit1["CigarString"] = s.cigarstring
Traceback (most recent call last):
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
Traceback (most

KeyboardInterrupt: 

  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/pool.py", line 119, in worker
    result = (True, func(*args, **kwds))
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
Process ForkPoolWorker-160:
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 258, in _bootstrap
    self.run()
  File "<ipython-input-49-fe6785cb94b0>", line 6, in parse_cigar_string
    s1 = s[:i2]
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/process.py", line 93, in run
    self._target(*self._args, **self._kwargs)
KeyboardInterrupt
Traceback (most recent call last):
  File "/home/chenzonggui/miniconda3/envs/py36/lib/python3.6/multiprocessing/pool.py", line 119, i