BAM: allow to retrieve matrices outside diagonal

3DGenomes · Nov 25, 2016 · 845b20f · 845b20f
1 parent b3a8839
commit 845b20f
Show file tree

Hide file tree

Showing 3 changed files with 164 additions and 72 deletions.
diff --git a/_pytadbit/_version.py b/_pytadbit/_version.py
@@ -1 +1 @@
-__version__ = "3DAROC_2016.16"
+__version__ = "3DAROC_2016.17"
diff --git a/scripts/matrix_from_BAM.py b/scripts/matrix_from_BAM.py
@@ -42,7 +42,7 @@ def print_progress(procs):
                         '%s/%s' % (len(procs),len(procs)))
 
 
-def read_bam_frag(inbam, filter_exclude, sections, 
+def read_bam_frag(inbam, filter_exclude, sections1, sections2,
                   resolution, outdir, region, start, end, half=False):
     bamfile = pysam.AlignmentFile(inbam, 'rb')
     refs = bamfile.references
@@ -57,11 +57,9 @@ def read_bam_frag(inbam, filter_exclude, sections,
             pos1 = r.reference_start + 1
             crm2 = refs[r.mrnm]
             pos2 = r.mpos + 1
-            print r
-            print crm1, pos1, crm2, pos2
             try:
-                pos1 = sections[(crm1, pos1 / resolution)]
-                pos2 = sections[(crm2, pos2 / resolution)]
+                pos1 = sections1[(crm1, pos1 / resolution)]
+                pos2 = sections2[(crm2, pos2 / resolution)]
             except KeyError:
                 continue  # not in the subset matrix we want
             try:
@@ -84,7 +82,8 @@ def read_bam_frag(inbam, filter_exclude, sections,
 
 
 def read_bam(inbam, filter_exclude, resolution, biases, ncpus=8,
-             region=None, start=None, end=None, outdir='.'):
+             region1=None, start1=None, end1=None,
+             region2=None, start2=None, end2=None, outdir='.'):
 
     bamfile = pysam.AlignmentFile(inbam, 'rb')
     sections = OrderedDict(zip(bamfile.references,
@@ -101,38 +100,39 @@ def read_bam(inbam, filter_exclude, resolution, biases, ncpus=8,
 
     start_bin = 0
     end_bin   = len(bins) + 1
-    if region:
-        regions = [region]
-        start_bin = [i for i, b in enumerate(bins) if b[0] == region][0]
-        end_bin   = [i for i, b in enumerate(bins[start_bin:], start_bin) if b[0] == region][-1]
+    if region1:
+        regions = [region1]
+        start_bin = [i for i, b in enumerate(bins) if b[0] == region1][0]
+        end_bin   = [i for i, b in enumerate(bins[start_bin:], start_bin)
+                     if b[0] == region1][-1]
     else:
         regions = bamfile.references
         total = len(bins)
-        if start or end:
+        if start1 or end1:
             raise Exception('ERROR: Cannot use start/end1 without region')
 
-    if start:
-        start_bin = section_pos[region][0] + start / resolution
+    if start1:
+        start_bin = section_pos[region1][0] + start1 / resolution
     else:
-        start = 0
-    if end:
-        end_bin = section_pos[region][0] + end / resolution
+        start1 = 0
+    if end1:
+        end_bin = section_pos[region1][0] + end1 / resolution
     else:
         end = len(bins)
 
     total = end_bin - start_bin + 1
-    regs = []
-    begs = []
-    ends = []
+    regs  = []
+    begs  = []
+    ends  = []
     njobs = min(total, 100) + 1
     nbins = total / njobs + 1
     for i in xrange(start_bin, end_bin, nbins):
         if i + nbins > end_bin:  # make sure that we stop at the right place
             nbins = end_bin - i
         try:
-            (crm1, beg1), (crm2, end2) = bins[i], bins[i + nbins - 1]
+            (crm1, beg1), (crm2, fin2) = bins[i], bins[i + nbins - 1]
         except IndexError:
-            (crm1, beg1), (crm2, end2) = bins[i], bins[-1]
+            (crm1, beg1), (crm2, fin2) = bins[i], bins[-1]
         if crm1 != crm2:
             end1 = sections[crm1]
             beg2 = 0
@@ -141,11 +141,11 @@ def read_bam(inbam, filter_exclude, resolution, biases, ncpus=8,
             begs.append(beg1 * resolution)
             begs.append(beg2 * resolution)
             ends.append(end1 * resolution + resolution)  # last nt included
-            ends.append(end2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
+            ends.append(fin2 * resolution + resolution - 1)  # last nt not included (overlap with next window)
         else:
             regs.append(crm1)
             begs.append(beg1 * resolution)
-            ends.append(end2 * resolution + resolution - 1)            
+            ends.append(fin2 * resolution + resolution - 1)            
     ends[-1] += 1  # last nucleotide included
 
     # reduce dictionaries
@@ -159,69 +159,125 @@ def read_bam(inbam, filter_exclude, resolution, biases, ncpus=8,
             start = 0
             end   = section_pos[crm][1] - section_pos[crm][0] + 1
         bins.extend([(crm, i) for i in xrange(start, end)])
+    bins_dict1 = dict([(j, i) for i, j in enumerate(bins)])
+    if region2:
+        bins = []
+        beg_crm = section_pos[region2][0]
+        if start2:
+            start_bin2 = section_pos[region2][0] + start2 / resolution
+            end_bin2   = section_pos[region2][0] + end2   / resolution
+        else:
+            start_bin2 = 0
+            end_bin2   = section_pos[region2][0]
+        start = start_bin2 - beg_crm
+        end   = end_bin2   - beg_crm
+        bins = [(region2, i) for i in xrange(start, end)]
+        bins_dict2 = dict([(j, i) for i, j in enumerate(bins)])
+    else:
+        bins_dict2 = bins_dict1
     pool = mu.Pool(ncpus)
-    bins_dict = dict([(j, i) for i, j in enumerate(bins)])
     ## RUN!
     printime('\n  - Parsing BAM (%d chunks)' % (len(regs)))
     procs = []
     for i, (region, b, e) in enumerate(zip(regs, begs, ends)):
         if ncpus == 1:
-            read_bam_frag(inbam, filter_exclude, bins_dict, 
+            read_bam_frag(inbam, filter_exclude,
+                          bins_dict1, bins_dict2,
                           resolution, outdir, region, b, e,)
         else:
             procs.append(pool.apply_async(
-                read_bam_frag, args=(inbam, filter_exclude, bins_dict, 
+                read_bam_frag, args=(inbam, filter_exclude,
+                                     bins_dict1, bins_dict2,
                                      resolution, outdir, region, b, e,)))
     pool.close()
     print_progress(procs)
     pool.join()
 
     printime('  - Writing matrices')
-    bias  = dict((k - start_bin, v)
-                 for k, v in biases.get('biases', {}).iteritems()
-                 if start_bin <= k <= end_bin)
+    bias1  = dict((k - start_bin, v)
+                  for k, v in biases.get('biases', {}).iteritems()
+                  if start_bin <= k <= end_bin)
+    if region2:
+        bias2  = dict((k - start_bin2, v)
+                      for k, v in biases.get('biases', {}).iteritems()
+                      if start_bin2 <= k <= end_bin2)
+    else:
+        bias2 = bias1
     decay = biases.get('decay' , {})
-    bads  = dict((k - start_bin, v)
-                 for k, v in biases.get('badcol', {}).iteritems()
-                 if start_bin <= k <= end_bin)
+    bads1  = dict((k - start_bin, v)
+                  for k, v in biases.get('badcol', {}).iteritems()
+                  if start_bin <= k <= end_bin)
+    if region2:
+        bads2  = dict((k - start_bin2, v)
+                      for k, v in biases.get('badcol', {}).iteritems()
+                      if start_bin2 <= k <= end_bin2)
+    else:
+        bads2 = bads1
     # hic_data = HiC_data((), len(bins_dict), sections,
     #                     bins_dict, resolution=resolution)
     if len(regions) == 1:
-        name = '%s:%d-%d' % (region, start, end)
+        if region2:
+            name = '%s:%d-%d_%s:%d-%d' % (region1, start1 / resolution, end1 / resolution,
+                                          region2, start2 / resolution, end2 / resolution)
+        else:
+            name = '%s:%d-%d' % (region1, start1 / resolution, end1 / resolution)
     else:
         name = 'full'
     out_raw = open(os.path.join(outdir, 'matrix_raw_%s_%s.abc' % (
         name, nicer(resolution).replace(' ', ''))), 'w')
     out_raw.write('# %s resolution:%d\n' % (name, resolution))
-    out_raw.write('# BADS %s\n' % (','.join([str(b) for b in bads])))
+    if region2:
+        out_raw.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
+        out_raw.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
+    else:
+        out_raw.write('# BADS %s\n' % (','.join([str(b) for b in bads1])))
     if biases:
         out_nrm = open(os.path.join(outdir, 'matrix_nrm_%s_%s.abc' % (
             name, nicer(resolution).replace(' ', ''))), 'w')
         out_nrm.write('# %s resolution:%d\n' % (name, resolution))
-        out_nrm.write('# BADS %s\n' % (','.join([str(b) for b in bads])))
+        if region2:
+            out_nrm.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
+            out_nrm.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
+        else:
+            out_nrm.write('# BADS %s\n' % (','.join([str(b) for b in bads1])))
         out_dec = open(os.path.join(outdir, 'matrix_dec_%s_%s.abc' % (
             name, nicer(resolution).replace(' ', ''))), 'w')
         out_dec.write('# %s resolution:%d\n' % (
             name, resolution))
-        out_dec.write('# BADS %s\n' % (','.join([str(b) for b in bads])))
+        if region2:
+            out_dec.write('# BADROWS %s\n' % (','.join([str(b) for b in bads1])))
+            out_dec.write('# BADCOLS %s\n' % (','.join([str(b) for b in bads2])))
+        else:
+            out_dec.write('# BADS %s\n' % (','.join([str(b) for b in bads1])))
 
     def write2matrix(a, b, c):
         out_raw.write('%d\t%d\t%d\n' % (a, b, c))
     def write2matrices(a, b, c):
         out_raw.write('%d\t%d\t%d\n' % (a, b, c))
-        out_nrm.write('%d\t%d\t%f\n' % (a, b, c / (bias[a] * bias[b])))
-        out_dec.write('%d\t%d\t%f\n' % (a, b, c / (bias[a] * bias[b] * decay[abs(a-b)])))
+        out_nrm.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b])))
+        out_dec.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b] *
+                                                   decay[abs(a-b)])))
+    def write2matrices_2reg(a, b, c):
+        out_raw.write('%d\t%d\t%d\n' % (a, b, c))
+        out_nrm.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b])))
+        out_dec.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b] *
+                                                   decay[abs((a + start_bin) -
+                                                             (b + start_bin2))])))
     def write2matrices_err(a, b, c):
         out_raw.write('%d\t%d\t%d\n' % (a, b, c))
-        out_nrm.write('%d\t%d\t%f\n' % (a, b, c / (bias[a] * bias[b])))
+        out_nrm.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b])))
         try:
-            out_dec.write('%d\t%d\t%f\n' % (a, b, c / (bias[a] * bias[b] * decay[abs(a-b)])))
+            out_dec.write('%d\t%d\t%f\n' % (a, b, c / (bias1[a] * bias2[b] *
+                                                       decay[abs(a-b)])))
         except KeyError:  # different chromsomes
             out_dec.write('%d\t%d\t%s\n' % (a, b, 'nan'))
 
     if biases:
         if len(regions) == 1:
-            write = write2matrices
+            if region2:
+                write = write2matrices_2reg
+            else:
+                write = write2matrices
         else:
             write = write2matrices_err
     else:
@@ -239,7 +295,7 @@ def write2matrices_err(a, b, c):
         fname = os.path.join(outdir, 'tmp_%s:%d-%d.pickle' % (region, start, end))
         dico = load(open(fname))
         for (j, k), v in dico.iteritems():
-            if j in bads or k in bads:
+            if j in bads1 or k in bads2:
                 continue
             write(j, k, v)
         os.system('rm -f %s' % (fname))
@@ -262,41 +318,71 @@ def main():
     else:
         biases     = {}
     outdir         = opts.outdir
-    coord          = opts.coord
+    coord1         = opts.coord1
+    coord2         = opts.coord2
 
-    if biases['resolution'] != resolution:
+    if biases and biases['resolution'] != resolution:
         raise Exception('ERROR: different resolution in bias file (you want %d,'
                         ' there is %d).\n' % (resolution, biases['resolution']))
-
-    if not coord:
-        region = None
-        start = None
-        end   = None
+    if coord2 and not coord1:
+        coord1, coord2 = coord2, coord1
+
+    if not coord1:
+        region1 = None
+        start1  = None
+        end1    = None
+        region2 = None
+        start2  = None
+        end2    = None
     else:
         try:
-            crm, pos   = coord.split(':')
-            region     = crm
-            start, end = pos.split('-')
-            start      = int(start)
-            end        = int(end)
+            crm1, pos1   = coord1.split(':')
+            start1, end1 = pos1.split('-')
+            region1 = crm1
+            start1  = int(start1)
+            end1    = int(end1)
         except ValueError:
-            region = coord
-            start  = None
-            end    = None
+            region1 = coord1
+            start1  = None
+            end1    = None
+        if coord2:
+            try:
+                crm2, pos2   = coord2.split(':')
+                start2, end2 = pos2.split('-')
+                region2 = crm2
+                start2  = int(start2)
+                end2    = int(end2)
+            except ValueError:
+                region2 = coord2
+                start2  = None
+                end2    = None
+        else:
+            region2 = None
+            start2  = None
+            end2    = None
 
-
     mkdir(outdir)
-    if region:
-        sys.stdout.write('\nExtraction of %s' % (region))
-        if start:
-            sys.stdout.write(':%s-%s\n' % (start, end))
-        else:
-            sys.stdout.write(' (full chromosome)\n')
+    if region1:
+        if region1:
+            sys.stdout.write('\nExtraction of %s' % (region1))
+            if start1:
+                sys.stdout.write(':%s-%s' % (start1, end1))
+            else:
+                sys.stdout.write(' (full chromosome)')
+            if region2:
+                sys.stdout.write(' intersection with %s' % (region2))
+                if start2:
+                    sys.stdout.write(':%s-%s\n' % (start2, end2))
+                else:
+                    sys.stdout.write(' (full chromosome)\n')
+            else:
+                sys.stdout.write('\n')
     else:
         sys.stdout.write('\nExtraction of full genome\n')
 
     read_bam(inbam, filter_exclude, resolution, biases,
-             region=region, start=start, end=end,
+             region1=region1, start1=start1, end1=end1,
+             region2=region2, start2=start2, end2=end2,
              ncpus=ncpus, outdir=outdir)
 
     printime('\nDone.')
@@ -310,14 +396,19 @@ def get_options():
     parser.add_argument('-o', '--outdir', dest='outdir', metavar='',
                         required=True, default=False, help='output directory.')
     parser.add_argument('-r', '--resolution', dest='reso', type=int, metavar='',
-                        required=True, help='''wanted resolution form th generated matrix''')
-    parser.add_argument('-c', '--coord', dest='coord',  metavar='', 
-                        default=None, help='''Coordinate of the region to 
-                        retrieve. By default all genome, arguments can be 
-                        either one chromosome name, or the_shape coordinate in 
-                        the form "chr3:110000000-120000000"''')
+                        required=True, help='''wanted resolution form the 
+                        generated matrix''')
     parser.add_argument('-b', '--biases', dest='biases', metavar='',
                         help='''path to pickle file with array of biases''')
+    parser.add_argument('-c', '--coord', dest='coord1',  metavar='', 
+                        default=None, help='''Coordinate of the region to 
+                        retrieve. By default all genome, arguments can be 
+                        either one chromosome name, or the coordinate in 
+                        the form: "-c chr3:110000000-120000000"''')
+    parser.add_argument('-c2', '--coord2', dest='coord2',  metavar='', 
+                        default=None, help='''Coordinate of a second region to 
+                        retrieve the matrix in the intersection with the first
+                        region.''')
     parser.add_argument('-C', '--cpus', dest='cpus', metavar='', type=int,
                         default=8, help='''[%(default)s] number of cpus to be 
                         used for parsing the HiC-BAM file''')

diff --git a/scripts/normalize_from_BAM.py b/scripts/normalize_from_BAM.py
@@ -227,6 +227,7 @@ def read_bam(inbam, filter_exclude, resolution, min_count=2500,
                 sumdec[k] += v
             except KeyError:
                 sumdec[k]  = v
+    # divide by the average of the diagonals and multiply by two, because 1 diag per half square
     target = dict([(d, sumdec[d] / float((total - d) * factor * 2))
                    for d in sumdec])
     diasum = dict([(d, diasum[d] * target[d]) for d in diasum])