Skip to content

Commit

Permalink
Merge pull request #170 from EI-CoreBioinformatics/issue-166
Browse files Browse the repository at this point in the history
This commit should fix some of the performance issues found in Mikado…
  • Loading branch information
lucventurini committed Apr 30, 2019
2 parents ca4a84c + c5d2a3e commit f90a7a4
Show file tree
Hide file tree
Showing 5 changed files with 67 additions and 54 deletions.
70 changes: 37 additions & 33 deletions Mikado/scales/assigner.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,6 +103,7 @@ def __init__(self,
self.args.verbose = False
self.lenient = False
self.use_prediction_alias = False
self.__report_fusions = True
else:
self.args = args
if not hasattr(args, "out"):
Expand All @@ -119,6 +120,7 @@ def __init__(self,
self.args.verbose = False
self.lenient = getattr(args, "lenient", False)
self.use_prediction_alias = getattr(args, "use_prediction_alias", False)
self.__report_fusions = getattr(args, "report_fusions", True)

# noinspection PyUnresolvedReferences
# pylint: disable=no-member
Expand Down Expand Up @@ -444,46 +446,48 @@ def __check_for_fusions(self, prediction, matches, fuzzymatch=None):
if len(local_best) > 0:
best.add((match, sorted(local_best, key=self.get_f1, reverse=True)[0]))

if len(best) > 1: # We have a fusion
# Now retrieve the results according to their order on the genome
# Keep only the result, not their position
best = [_[1] for _ in sorted(best, key=lambda res: (res[0][0], res[0][1]))]
chrom = prediction.chrom
start = min([prediction.start] + [self.genes[_.ref_gene[0]][_.ref_id[0]].start for _ in best])
end = max([prediction.end] + [self.genes[_.ref_gene[0]][_.ref_id[0]].end for _ in best])
location = "{}:{}..{}".format(chrom, start, end)

best_result = []
for result in best:
new_result = []
for key in ResultStorer.__slots__:
if key == "location":
new_result.append(location)
elif key == "ccode":
tup = tuple(["f"] + [getattr(result, key)[0]])
new_result.append(tup)
else:
new_result.append(getattr(result, key))
new_result = ResultStorer(*new_result)
best_result.append(new_result)

for match, genes in fused.items():
for gene in iter(_ for _ in genes if _ not in dubious):
for position, result in enumerate(new_matches[gene]):
if result.j_f1[0] > 0 or result.n_recall[0] > 10:
result.ccode = ("f", result.ccode[0])
new_matches[gene][position] = result

elif len(best) == 1:
best_result = best.pop()[1]
else: # We have to retrieve the best result from the dubious
if len(best) == 0:
self.logger.debug("Filtered out all results for %s, selecting the best dubious one",
prediction.id)
# I have filtered out all the results,
# because I only match partially the reference genes
best_result = sorted([new_matches[gene][0] for gene in dubious],
key=self.get_f1, reverse=True)[0]

else:
best = [_[1] for _ in sorted(best, key=lambda res: (res[0][0], res[0][1]))]
if not (len(best) > 1 and self.__report_fusions is True):
best_result = best.pop()
else: # We have a fusion
# Now retrieve the results according to their order on the genome
# Keep only the result, not their position

chrom = prediction.chrom
start = min([prediction.start] + [self.genes[_.ref_gene[0]][_.ref_id[0]].start for _ in best])
end = max([prediction.end] + [self.genes[_.ref_gene[0]][_.ref_id[0]].end for _ in best])
location = "{}:{}..{}".format(chrom, start, end)

best_result = []
for result in best:
new_result = []
for key in ResultStorer.__slots__:
if key == "location":
new_result.append(location)
elif key == "ccode":
tup = tuple(["f"] + [getattr(result, key)[0]])
new_result.append(tup)
else:
new_result.append(getattr(result, key))
new_result = ResultStorer(*new_result)
best_result.append(new_result)

for match, genes in fused.items():
for gene in iter(_ for _ in genes if _ not in dubious):
for position, result in enumerate(new_matches[gene]):
if result.j_f1[0] > 0 or result.n_recall[0] > 10:
result.ccode = ("f", result.ccode[0])
new_matches[gene][position] = result

# Finally add the results
for gene in new_matches:
results.extend(new_matches[gene])
Expand Down
19 changes: 15 additions & 4 deletions Mikado/scales/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
import sys
from logging import handlers as log_handlers
from ..transcripts.transcript import Transcript
from .accountant import Accountant
from .assigner import Assigner
from .resultstorer import ResultStorer
from ..exceptions import CorruptIndex
Expand All @@ -30,13 +29,10 @@
import multiprocessing as mp
import tempfile
from ..exceptions import InvalidTranscript
from time import sleep
import json
import gzip
import itertools
from ..utilities import NumpyEncoder
import functools
from threading import Thread
import msgpack


Expand Down Expand Up @@ -556,6 +552,21 @@ def check_index(args, queue_logger):
res = cursor.execute("PRAGMA integrity_check;").fetchone()
if res[0] != "ok":
raise CorruptIndex("Corrupt database, integrity value: {}".format(res[0]))
gid, obj = cursor.execute("SELECT * from genes").fetchone()
try:
obj = msgpack.loads(obj, raw=False)
except TypeError:
try:
obj = json.loads(obj)
except (ValueError, TypeError, json.decoder.JSONDecodeError):
raise CorruptIndex("Corrupt index")
raise CorruptIndex("Old index, deleting and rebuilding")

gene = Gene(None)
try:
gene.load_dict(obj)
except:
raise CorruptIndex("Invalid value for genes, indicating a corrupt index. Deleting and rebuilding.")

except sqlite3.DatabaseError:
raise CorruptIndex("Invalid database file")
Expand Down
11 changes: 9 additions & 2 deletions Mikado/scales/gene_dict.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,15 @@ def __init__(self, dbname: str, logger=None):
self.__dbname = dbname
self.__db = sqlite3.connect("file:{}?mode=ro".format(self.__dbname), uri=True)
self.__cursor = self.__db.cursor()
self.__cache = dict()

self.logger = logger

def __getitem__(self, item):

if item in self.__cache:
return self.__cache[item]

failed = 0
while True:
try:
Expand All @@ -29,7 +34,9 @@ def __getitem__(self, item):

if res:
try:
return self.__load_gene(res[0][0])
gene = self.__load_gene(res[0][0])
self.__cache[item] = gene
return gene
except IndexError:
raise IndexError(res)
else:
Expand All @@ -49,7 +56,7 @@ def __load_gene(self, jdict):
def load_all(self):

for row in self.__cursor.execute("SELECT gid, json from genes"):
gid, gene = row[0], self.__load_gene(row[1])
gid, gene = row[0], self.__load_gene(row[0])
yield (gid, gene)

def __iter__(self):
Expand Down
17 changes: 2 additions & 15 deletions Mikado/scales/resultstorer.py
Original file line number Diff line number Diff line change
Expand Up @@ -72,22 +72,9 @@ def _asdict(self):
result_dict = dict().fromkeys(self.__slots__)
for attr in self.__slots__:
result_dict[attr] = getattr(self, attr)
if isinstance(result_dict[attr], (tuple, list)):
result_dict[attr] = ",".join(["{}".format(_) for _ in result_dict[attr]])
return result_dict

# for attr in self.__slots__[:3]:
# try:
# result_dict[attr] = ",".join(list(getattr(self, attr)))
# except TypeError as exc:
# raise TypeError("{0}; {1}".format(exc, getattr(self, attr)))
# for attr in self.__slots__[3:6]:
# result_dict[attr] = getattr(self, attr)
# for attr in (self.__slots__[6],): # prediction exons
# result_dict[attr] = ",".join("{0}".format(x) for x in getattr(self, attr))
# for attr in self.__slots__[7:-2]:
# result_dict[attr] = ",".join("{0:,.2f}".format(x) for x in getattr(self, attr))
# result_dict["distance"] = self.distance # [0] # Last attribute
# result_dict["location"] = self.location # [0]
# return result_dict

def as_dict(self):
"""
Expand Down
4 changes: 4 additions & 0 deletions Mikado/subprograms/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,10 @@ def get_procs(arg):
help="""If set, exonic statistics will be calculated leniently
in the TMAP as well - ie they will consider an exon as match even if only
the internal junction has been recovered.""")
parser.add_argument("-nF", "--do-not-report-fusions", dest="report_fusions", action="store_false",
default=True,
help="Flag. If invoked, Mikado compare will not report fusions in the input. Useful \
when the reference transcripts have not been clustered properly into genes (e.g. a Mikado prepare run).")
parser.add_argument("-eu", "--exclude-utr", dest="exclude_utr",
default=False, action="store_true",
help="""Flag. If set, reference and prediction transcripts
Expand Down

0 comments on commit f90a7a4

Please sign in to comment.