Merge pull request #170 from EI-CoreBioinformatics/issue-166

This commit should fix some of the performance issues found in Mikado…
EI-CoreBioinformatics · Apr 30, 2019 · f90a7a4 · f90a7a4
2 parents ca4a84c + c5d2a3e
commit f90a7a4
Show file tree

Hide file tree

Showing 5 changed files with 67 additions and 54 deletions.
diff --git a/Mikado/scales/assigner.py b/Mikado/scales/assigner.py
@@ -103,6 +103,7 @@ def __init__(self,
             self.args.verbose = False
             self.lenient = False
             self.use_prediction_alias = False
+            self.__report_fusions = True
         else:
             self.args = args
             if not hasattr(args, "out"):
@@ -119,6 +120,7 @@ def __init__(self,
                 self.args.verbose = False
             self.lenient = getattr(args, "lenient", False)
             self.use_prediction_alias = getattr(args, "use_prediction_alias", False)
+            self.__report_fusions = getattr(args, "report_fusions", True)
 
         # noinspection PyUnresolvedReferences
         # pylint: disable=no-member
@@ -444,46 +446,48 @@ def __check_for_fusions(self, prediction, matches, fuzzymatch=None):
             if len(local_best) > 0:
                 best.add((match, sorted(local_best, key=self.get_f1, reverse=True)[0]))
 
-        if len(best) > 1:  # We have a fusion
-            # Now retrieve the results according to their order on the genome
-            # Keep only the result, not their position
-            best = [_[1] for _ in sorted(best, key=lambda res: (res[0][0], res[0][1]))]
-            chrom = prediction.chrom
-            start = min([prediction.start] + [self.genes[_.ref_gene[0]][_.ref_id[0]].start for _ in best])
-            end = max([prediction.end] + [self.genes[_.ref_gene[0]][_.ref_id[0]].end for _ in best])
-            location = "{}:{}..{}".format(chrom, start, end)
-
-            best_result = []
-            for result in best:
-                new_result = []
-                for key in ResultStorer.__slots__:
-                    if key == "location":
-                        new_result.append(location)
-                    elif key == "ccode":
-                        tup = tuple(["f"] + [getattr(result, key)[0]])
-                        new_result.append(tup)
-                    else:
-                        new_result.append(getattr(result, key))
-                new_result = ResultStorer(*new_result)
-                best_result.append(new_result)
-
-            for match, genes in fused.items():
-                for gene in iter(_ for _ in genes if _ not in dubious):
-                    for position, result in enumerate(new_matches[gene]):
-                        if result.j_f1[0] > 0 or result.n_recall[0] > 10:
-                            result.ccode = ("f", result.ccode[0])
-                            new_matches[gene][position] = result
-
-        elif len(best) == 1:
-            best_result = best.pop()[1]
-        else:  # We have to retrieve the best result from the dubious
+        if len(best) == 0:
             self.logger.debug("Filtered out all results for %s, selecting the best dubious one",
                               prediction.id)
             # I have filtered out all the results,
             # because I only match partially the reference genes
             best_result = sorted([new_matches[gene][0] for gene in dubious],
                                  key=self.get_f1, reverse=True)[0]
 
+        else:
+            best = [_[1] for _ in sorted(best, key=lambda res: (res[0][0], res[0][1]))]
+            if not (len(best) > 1 and self.__report_fusions is True):
+                best_result = best.pop()
+            else:  # We have a fusion
+                # Now retrieve the results according to their order on the genome
+                # Keep only the result, not their position
+
+                chrom = prediction.chrom
+                start = min([prediction.start] + [self.genes[_.ref_gene[0]][_.ref_id[0]].start for _ in best])
+                end = max([prediction.end] + [self.genes[_.ref_gene[0]][_.ref_id[0]].end for _ in best])
+                location = "{}:{}..{}".format(chrom, start, end)
+
+                best_result = []
+                for result in best:
+                    new_result = []
+                    for key in ResultStorer.__slots__:
+                        if key == "location":
+                            new_result.append(location)
+                        elif key == "ccode":
+                            tup = tuple(["f"] + [getattr(result, key)[0]])
+                            new_result.append(tup)
+                        else:
+                            new_result.append(getattr(result, key))
+                    new_result = ResultStorer(*new_result)
+                    best_result.append(new_result)
+
+                for match, genes in fused.items():
+                    for gene in iter(_ for _ in genes if _ not in dubious):
+                        for position, result in enumerate(new_matches[gene]):
+                            if result.j_f1[0] > 0 or result.n_recall[0] > 10:
+                                result.ccode = ("f", result.ccode[0])
+                                new_matches[gene][position] = result
+
         # Finally add the results
         for gene in new_matches:
             results.extend(new_matches[gene])

diff --git a/Mikado/scales/compare.py b/Mikado/scales/compare.py
@@ -15,7 +15,6 @@
 import sys
 from logging import handlers as log_handlers
 from ..transcripts.transcript import Transcript
-from .accountant import Accountant
 from .assigner import Assigner
 from .resultstorer import ResultStorer
 from ..exceptions import CorruptIndex
@@ -30,13 +29,10 @@
 import multiprocessing as mp
 import tempfile
 from ..exceptions import InvalidTranscript
-from time import sleep
 import json
 import gzip
 import itertools
-from ..utilities import NumpyEncoder
 import functools
-from threading import Thread
 import msgpack
 
 
@@ -556,6 +552,21 @@ def check_index(args, queue_logger):
         res = cursor.execute("PRAGMA integrity_check;").fetchone()
         if res[0] != "ok":
             raise CorruptIndex("Corrupt database, integrity value: {}".format(res[0]))
+        gid, obj = cursor.execute("SELECT * from genes").fetchone()
+        try:
+            obj = msgpack.loads(obj, raw=False)
+        except TypeError:
+            try:
+                obj = json.loads(obj)
+            except (ValueError, TypeError, json.decoder.JSONDecodeError):
+                raise CorruptIndex("Corrupt index")
+            raise CorruptIndex("Old index, deleting and rebuilding")
+
+        gene = Gene(None)
+        try:
+            gene.load_dict(obj)
+        except:
+            raise CorruptIndex("Invalid value for genes, indicating a corrupt index. Deleting and rebuilding.")
 
     except sqlite3.DatabaseError:
         raise CorruptIndex("Invalid database file")

diff --git a/Mikado/scales/gene_dict.py b/Mikado/scales/gene_dict.py
@@ -12,10 +12,15 @@ def __init__(self, dbname: str, logger=None):
         self.__dbname = dbname
         self.__db = sqlite3.connect("file:{}?mode=ro".format(self.__dbname), uri=True)
         self.__cursor = self.__db.cursor()
+        self.__cache = dict()
+
         self.logger = logger
 
     def __getitem__(self, item):
 
+        if item in self.__cache:
+            return self.__cache[item]
+
         failed = 0
         while True:
             try:
@@ -29,7 +34,9 @@ def __getitem__(self, item):
 
         if res:
             try:
-                return self.__load_gene(res[0][0])
+                gene = self.__load_gene(res[0][0])
+                self.__cache[item] = gene
+                return gene
             except IndexError:
                 raise IndexError(res)
         else:
@@ -49,7 +56,7 @@ def __load_gene(self, jdict):
     def load_all(self):
 
         for row in self.__cursor.execute("SELECT gid, json from genes"):
-            gid, gene = row[0], self.__load_gene(row[1])
+            gid, gene = row[0], self.__load_gene(row[0])
             yield (gid, gene)
 
     def __iter__(self):

diff --git a/Mikado/scales/resultstorer.py b/Mikado/scales/resultstorer.py
@@ -72,22 +72,9 @@ def _asdict(self):
         result_dict = dict().fromkeys(self.__slots__)
         for attr in self.__slots__:
             result_dict[attr] = getattr(self, attr)
+            if isinstance(result_dict[attr], (tuple, list)):
+                result_dict[attr] = ",".join(["{}".format(_) for _ in result_dict[attr]])
         return result_dict
-
-        # for attr in self.__slots__[:3]:
-        #     try:
-        #         result_dict[attr] = ",".join(list(getattr(self, attr)))
-        #     except TypeError as exc:
-        #         raise TypeError("{0}; {1}".format(exc, getattr(self, attr)))
-        # for attr in self.__slots__[3:6]:
-        #     result_dict[attr] = getattr(self, attr)
-        # for attr in (self.__slots__[6],):  # prediction exons
-        #     result_dict[attr] = ",".join("{0}".format(x) for x in getattr(self, attr))
-        # for attr in self.__slots__[7:-2]:
-        #     result_dict[attr] = ",".join("{0:,.2f}".format(x) for x in getattr(self, attr))
-        # result_dict["distance"] = self.distance   # [0]  # Last attribute
-        # result_dict["location"] = self.location   # [0]
-        # return result_dict
 
     def as_dict(self):
         """

diff --git a/Mikado/subprograms/compare.py b/Mikado/subprograms/compare.py
@@ -75,6 +75,10 @@ def get_procs(arg):
                         help="""If set, exonic statistics will be calculated leniently
 in the TMAP as well - ie they will consider an exon as match even if only
 the internal junction has been recovered.""")
+    parser.add_argument("-nF", "--do-not-report-fusions", dest="report_fusions", action="store_false",
+                        default=True,
+                        help="Flag. If invoked, Mikado compare will not report fusions in the input. Useful \
+when the reference transcripts have not been clustered properly into genes (e.g. a Mikado prepare run).")
     parser.add_argument("-eu", "--exclude-utr", dest="exclude_utr",
                         default=False, action="store_true",
                         help="""Flag. If set, reference and prediction transcripts