Skip to content

Commit

Permalink
solved conflict
Browse files Browse the repository at this point in the history
  • Loading branch information
Luca Venturini authored and Luca Venturini committed Jan 29, 2016
2 parents 44c178a + c12b8c6 commit 6d47725
Show file tree
Hide file tree
Showing 13 changed files with 650 additions and 78 deletions.
11 changes: 10 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,13 @@ build/*
shanghai.egg-info/
dist/*
Mikado.egg-info/*
.idea/*
sample_data/configuration2.yaml
.idea/codeStyleSettings.xml
.idea/dictionaries
.idea/encodings.xml
.idea/inspectionProfiles
.idea/Mikado.iml
.idea/misc.xml
.idea/modules.xml
.idea/vcs.xml
.idea/workspace.xml
6 changes: 3 additions & 3 deletions mikado_lib/configuration/configuration_blueprint.json
Original file line number Diff line number Diff line change
Expand Up @@ -278,14 +278,14 @@
"n",
"O",
"e",
"K",
"o",
"h"
"h",
"J",
"C"
]
},
"default": [
"j",
"n",
"O",
"h"
]
Expand Down
1 change: 1 addition & 0 deletions mikado_lib/scales/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,4 @@ def calc_f1(recall, precision):
from . import accountant
# noinspection PyPep8
from . import assigner
from . import compare
90 changes: 63 additions & 27 deletions mikado_lib/scales/assigner.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,7 +227,7 @@ def __check_for_fusions(self, prediction, matches):
self.logger.debug("Match for %s: %s",
match,
[_.id for _ in gene_matches])
gid_groups.append(tuple(gene.id for gene in gene_matches))
gid_groups.append(tuple(sorted([gene.id for gene in gene_matches])))
temp_results = []
# This will result in calculating the matches twice unfortunately

Expand Down Expand Up @@ -295,11 +295,12 @@ def __check_for_fusions(self, prediction, matches):

# Create the fusion best result
values = []
fused_group = tuple(_.ref_gene[0] for _ in best_fusion_results)
fused_group = tuple(sorted([_.ref_gene[0] for _ in best_fusion_results]))
self.logger.debug("Fused group: %s; GID_groups: %s",
fused_group,
gid_groups)

# TODO: this is a very stupid way of checking, it must be improved
if fused_group not in gid_groups:
for key in ResultStorer.__slots__:
if key in ["gid", "tid", "distance"]:
Expand Down Expand Up @@ -454,6 +455,7 @@ def finish(self):
self.done, "s" if self.done > 1 else "")
self.refmap_printer()
self.stat_calculator.print_stats()
self.tmap_out.close()

def calc_compare(self, prediction: Transcript, reference: Transcript) -> ResultStorer:
"""Thin layer around the calc_compare class method.
Expand Down Expand Up @@ -490,7 +492,7 @@ def compare(cls, prediction: Transcript, reference: Transcript) -> (ResultStorer
Available ccodes (from Cufflinks documentation):
- = Complete intron chain match
- c Contained
- c Contained (perfect junction and nucleotide precision, imperfect recall)
- j Potentially novel isoform (fragment): at least one splice junction is shared
with a reference transcript
- e Single exon transfrag overlapping a reference exon and at least
Expand All @@ -512,7 +514,11 @@ def compare(cls, prediction: Transcript, reference: Transcript) -> (ResultStorer
- _ Complete match, for monoexonic transcripts
(nucleotide F1>=95% - i.e. min(precision,recall)>=90.4%
- m Exon overlap between two monoexonic transcripts
- n Potentially novel isoform, where all the known junctions
- n Potential extension of the reference - we have added new splice junctions
*outside* the boundaries of the transcript itself
- C Contained transcript with overextensions on either side
(perfect junction recall, imperfect nucleotide specificity)
- J Potentially novel isoform, where all the known junctions
have been confirmed and we have added others as well *externally*
- I *multiexonic* transcript falling completely inside a known transcript
- h the transcript is multiexonic and extends a monoexonic reference transcript
Expand Down Expand Up @@ -600,36 +606,66 @@ def compare(cls, prediction: Transcript, reference: Transcript) -> (ResultStorer
if ccode is None:
if min(prediction.exon_num, reference.exon_num) > 1:
if junction_recall == 1 and junction_precision < 1:
new_splices = set.difference(set(prediction.splices),
set(reference.splices))

# Check if this is an extension
new_splices = set(prediction.splices) - set(reference.splices)
# If one of the new splices is internal to the intron chain, it's a j
if any(min(reference.splices) <
splice <
max(reference.splices) for splice in new_splices):
ccode = "j"
else:
# we have recovered all the junctions AND
# added some reference junctions of our own
ccode = "n"
elif junction_recall > 0 and 0 < junction_precision < 1:
if one_intron_confirmed is True:
ccode = "j"
if any(reference.start < splice < reference.end for splice in new_splices):
# if nucl_recall < 1:
ccode = "J"
else:
ccode = "n"
elif 0 < junction_recall < 1 and 0 < junction_precision < 1:
# if one_intron_confirmed is True:
ccode = "j"
# else:
# ccode = "o"
elif junction_precision == 1 and junction_recall < 1:
if nucl_precision == 1:
assert nucl_recall < 1
ccode = "c"
else:
ccode = "o"
elif junction_precision == 1:
ccode = "c"
if nucl_precision < 1:
for intron in reference.introns:
if intron in prediction.introns:
continue
if intron[1] < prediction.start:
continue
elif intron[0] > prediction.end:
continue
if prediction.start < intron[0] and intron[1] < prediction.end:
ccode = "j"
break
missed_introns = reference.introns - prediction.introns
start_in = any([True for intron in missed_introns if
(prediction.start < intron[0] and
intron[1] < min(prediction.splices))])
end_in = any([True for intron in missed_introns if
(prediction.end > intron[1] and
intron[0] > max(prediction.splices))])

if start_in or end_in:
ccode = "j"
else:
ccode = "C"

# end_in = any(prediction.end in intron for intron in missed_introns)
#
#
#
#
#
# ref_only_introns = sorted(list(reference.introns - prediction.introns))
# pred_introns = sorted(prediction.introns)
# min_pred_intron = pred_introns[0][0]
# max_pred_intron = pred_introns[-1][1]
# assert ref_only_introns != set(), (prediction, reference)
# left = sorted(
# [_[0] for _ in ref_only_introns if _[1] < min_pred_intron])
# if left:
# left = left[-1]
# right = sorted(
# [_[1] for _ in ref_only_introns if _[0] > max_pred_intron])
# if right:
# right = right[0]
# if ((left and left > prediction.start) or
# (right and right < prediction.end)):
# ccode = "j" # Retained intron
# else:
# ccode = "C"
elif junction_recall == 0 and junction_precision == 0:
if nucl_f1 > 0:
ccode = "o"
Expand Down
2 changes: 1 addition & 1 deletion mikado_lib/scales/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,7 +228,7 @@ def parse_prediction(args, genes, positions, queue_logger):

# Finish everything, including printing refmap and stats
assigner_instance.finish()

args.prediction.close()

def compare(args):
"""
Expand Down
3 changes: 1 addition & 2 deletions mikado_lib/subprograms/prepare.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,7 +245,6 @@ def setup(args):
if args.output_dir is not None:
args.json_conf["prepare"]["output_dir"] = getattr(args,
"output_dir")

if not os.path.exists(args.json_conf["prepare"]["output_dir"]):
try:
os.makedirs(args.json_conf["prepare"]["output_dir"])
Expand Down Expand Up @@ -561,7 +560,7 @@ def positive(string):
parser.add_argument("--single", action="store_true", default=False,
help="Disable multi-threading. Useful for debugging.")
parser.add_argument("-od", "--output-dir", dest="output_dir",
type=str, default=None,
type=str, default=".",
help="Output directory. Default: current working directory")
parser.add_argument("-o", "--out", default=None,
help="Output file. Default: mikado_prepared.fasta.")
Expand Down
Loading

0 comments on commit 6d47725

Please sign in to comment.