Modified hisat2 perl script to handle path names including space, and…

… further elaborated HISAT2's alignment summary
DaehwanKimLab · Jun 1, 2017 · da647a7 · da647a7
1 parent ee8f209
commit da647a7
Show file tree

Hide file tree

Showing 6 changed files with 92 additions and 86 deletions.
diff --git a/aln_sink.h b/aln_sink.h
@@ -1503,15 +1503,15 @@ void AlnSink<index_t>::printAlSumm(
     uint64_t tot_al = (met.nconcord_uni + met.nconcord_rep) * 2 + (met.ndiscord) * 2 + met.nunp_0_uni + met.nunp_0_rep + met.nunp_uni + met.nunp_rep;
     assert_leq(tot_al, tot_al_cand);
     if(newSummary) {
-        out << "Summary stats:" << endl;
+        out << "HISAT2 summary stats:" << endl;
         if(totpair > 0) {
+            uint64_t ncondiscord_0 = met.nconcord_0 - met.ndiscord;
             out << "\tTotal pairs: " << totpair << endl;
-            out << "\t\tAligned concordantly 0 time: " << met.nconcord_0 << " ("; printPct(out, met.nconcord_0, met.npaired); out << ")" << endl;
+            out << "\t\tAligned concordantly or discordantly 0 time: " << ncondiscord_0 << " ("; printPct(out, ncondiscord_0, met.npaired); out << ")" << endl;
             out << "\t\tAligned concordantly 1 time: " << met.nconcord_uni1 << " ("; printPct(out, met.nconcord_uni1, met.npaired); out << ")" << endl;
             out << "\t\tAligned concordantly >1 times: " << met.nconcord_uni2 << " ("; printPct(out, met.nconcord_uni2, met.npaired); out << ")" << endl;
-            out << "\t\tAligned discordantly 1 time: " << met.ndiscord << " ("; printPct(out, met.ndiscord, met.nconcord_0); out << ")" << endl;
+            out << "\t\tAligned discordantly 1 time: " << met.ndiscord << " ("; printPct(out, met.ndiscord, met.npaired); out << ")" << endl;
 
-            uint64_t ncondiscord_0 = met.nconcord_0 - met.ndiscord;
             out << "\tTotal unpaired reads: " << ncondiscord_0 * 2 << endl;
             out << "\t\tAligned 0 time: " << met.nunp_0_0 << " ("; printPct(out, met.nunp_0_0, ncondiscord_0 * 2); out << ")" << endl;
             out << "\t\tAligned 1 time: " << met.nunp_0_uni1 << " ("; printPct(out, met.nunp_0_uni1, ncondiscord_0 * 2); out << ")" << endl;

diff --git a/diff_sample.h b/diff_sample.h
@@ -172,8 +172,8 @@ void calcExhaustiveDC(T i, bool verbose = false, bool sanityCheck = false) {
 					assert_lt(d2, v);
 					assert_gt(d1, 0);
 					assert_gt(d2, 0);
-					if(!diffs[d1]) diffCnt++; diffs[d1] = true;
-					if(!diffs[d2]) diffCnt++; diffs[d2] = true;
+                    if(!diffs[d1]) { diffCnt++; diffs[d1] = true; }
+                    if(!diffs[d2]) { diffCnt++; diffs[d2] = true; }
 				}
 			}
 			// Do we observe all possible differences (except 0)

diff --git a/doc/sidebar.inc.shtml b/doc/sidebar.inc.shtml
@@ -371,7 +371,8 @@
 
 <h2>Related Tools</h2>
 <div class="box">
- <ul>
+  <ul>
+    <li><a href="http://www.ccb.jhu.edu/hisat-genotype">HISAT-genotype</a>: Next-generation analysis of human genomes</li>
    <li><a href="http://www.ccb.jhu.edu/software/hisat">HISAT</a>: Fast and sensitive spliced alignment</li>
    <li><a href="http://bowtie-bio.sourceforge.net/bowtie2">Bowtie2</a>: Ultrafast read alignment</li>
    <li><a href="http://www.ccb.jhu.edu/software/tophat">TopHat2</a>: Spliced read mapper for RNA-Seq</li>

diff --git a/evaluation/simulation/calculate_read_cost.py b/evaluation/simulation/calculate_read_cost.py
@@ -312,7 +312,7 @@ def is_small_exon_junction_read(cigars, min_exon_len = 23):
 
 """
 """
-def extract_single(infilename, outfilename, chr_dic, aligner, debug_dic):
+def extract_single(infilename, outfilename, chr_dic, aligner, version, debug_dic):
     infile = open(infilename)
     outfile = open(outfilename, "w")
     prev_read_id = ""
@@ -338,8 +338,7 @@ def extract_single(infilename, outfilename, chr_dic, aligner, debug_dic):
         if read_id != prev_read_id:
             num_reads += 1
 
-        flag = int(flag)
-        pos = int(pos)
+        flag, pos, mapQ = int(flag), int(pos), int(mapQ)
         if flag & 0x4 != 0:
             prev_read_id = read_id
             continue
@@ -360,7 +359,9 @@ def extract_single(infilename, outfilename, chr_dic, aligner, debug_dic):
             if aligner == "hisat2":
                 if prev_read_id == read_id:
                     assert prev_NH == NH
-
+                if NH == 1 or mapQ == 60:
+                    assert NH == 1 and mapQ == 60
+
         if read_id != prev_read_id:
             num_aligned_reads += 1
             if aligner == "hisat2" and \
@@ -435,7 +436,7 @@ def extract_single(infilename, outfilename, chr_dic, aligner, debug_dic):
         hisat2_reads, hisat2_0aligned_reads, hisat2_ualigned_reads, hisat2_maligned_reads = 0, 0, 0, 0
         for line in open(infilename + ".summary"):
             line = line.strip()
-            if line.startswith("Summary") or \
+            if line.startswith("HISAT2 summary") or \
                line.startswith("Overall"):
                 continue
             category, num = line.split(':')
@@ -459,7 +460,7 @@ def extract_single(infilename, outfilename, chr_dic, aligner, debug_dic):
 
 """
 """
-def extract_pair(infilename, outfilename, chr_dic, aligner, debug_dic):
+def extract_pair(infilename, outfilename, chr_dic, aligner, version, debug_dic):
     read_dic = {}
     pair_reported = set()
 
@@ -497,6 +498,7 @@ def extract_pair(infilename, outfilename, chr_dic, aligner, debug_dic):
         canonical_pos1, canonical_pos2 = int(pos1), int(pos2)
         left_read = (flag & 0x40 != 0)
         pos1 = canonical_pos1
+        mapQ = int(mapQ)
         if flag & 0x4 != 0:
             prev_read_id, is_prev_read_left = read_id, left_read
             continue
@@ -522,20 +524,23 @@ def extract_pair(infilename, outfilename, chr_dic, aligner, debug_dic):
                     assert prev_NH1 == 0 or prev_NH1 == NH
                 else:
                     assert prev_NH2 == 0 or prev_NH2 == NH
+            if NH == 1 or mapQ == 60:
+                assert NH == 1 and mapQ == 60
 
         unpaired = (flag & 0x8 != 0) or (YT in ["UU", "UP"])
         if unpaired:
             if left_read not in pair_list:
                 pair_list.add(left_read)
                 num_aligned_reads += 1
                 if aligner == "hisat2" and NH == 1:
-                    num_ualigned_reads += 1                
+                    num_ualigned_reads += 1
+                    assert mapQ == 60
         else:
             if read_id != prev_read_id:
                 if concordant:
                     num_conc_aligned_pairs += 1
                     if aligner == "hisat2" and NH == 1:
-                        num_conc_ualigned_pairs += 1
+                        num_conc_ualigned_pairs += 1                        
                 else:
                     if aligner == "hisat2":
                         assert YT == "DP"
@@ -646,27 +651,26 @@ def extract_pair(infilename, outfilename, chr_dic, aligner, debug_dic):
 
         for line in open(infilename + ".summary"):
             line = line.strip()
-            if line.startswith("Summary") or \
+            if line.startswith("HISAT2 summary") or \
                line.startswith("Overall"):
                 continue
             category, num = line.split(':')
             num = num.strip()
             num = int(num.split(' ')[0])
             if category.startswith("Total pairs"):
                 hisat2_pairs = num
-            elif category.startswith("Aligned concordantly 0 time"):
-                hisat2_conc_0aligned_pairs = num
+            elif category.startswith("Aligned concordantly or discordantly 0 time"):
+                hisat2_0aligned_pairs = num
             elif category.startswith("Aligned concordantly 1 time"):
                 hisat2_conc_ualigned_pairs = num
             elif category.startswith("Aligned concordantly >1 time"):
                 hisat2_conc_maligned_pairs = num
-                assert hisat2_pairs == hisat2_conc_0aligned_pairs + hisat2_conc_ualigned_pairs + hisat2_conc_maligned_pairs
             elif category.startswith("Aligned discordantly"):
                 hisat2_disc_aligned_pairs = num
-                assert hisat2_disc_aligned_pairs <= hisat2_conc_0aligned_pairs
+                assert hisat2_pairs == hisat2_0aligned_pairs + hisat2_conc_ualigned_pairs + hisat2_conc_maligned_pairs + hisat2_disc_aligned_pairs
             elif category.startswith("Total unpaired reads"):
                 hisat2_reads = num
-                assert hisat2_reads == (hisat2_conc_0aligned_pairs - hisat2_disc_aligned_pairs) * 2
+                assert hisat2_reads == hisat2_0aligned_pairs * 2
             elif category.startswith("Aligned 0 time"):
                 hisat2_0aligned_reads = num
             elif category.startswith("Aligned 1 time"):
@@ -1334,7 +1338,7 @@ def calculate_read_cost(verbose):
 
     aligners = [
         # ["hisat", "", "", ""],
-        # ["hisat2", "", "", "205"],
+        ["hisat2", "", "", "205"],
         # ["hisat2", "", "snp", "203b"],
         # ["hisat2", "", "snp", "205"],
         ["hisat2", "", "", ""],
@@ -1811,9 +1815,9 @@ def get_aligner_cmd(RNA, aligner, type, index_type, version, read1_fname, read2_
                 if not os.path.exists(out_fname2):
                     debug_dic = {}
                     if paired:
-                        extract_pair(out_fname, out_fname2, chr_dic, aligner, debug_dic)
+                        extract_pair(out_fname, out_fname2, chr_dic, aligner, version, debug_dic)
                     else:
-                        extract_single(out_fname, out_fname2, chr_dic, aligner, debug_dic)
+                        extract_single(out_fname, out_fname2, chr_dic, aligner, version, debug_dic)
 
                 for readtype2 in readtypes:
                     if not two_step and readtype != readtype2: