Skip to content

Commit

Permalink
Merge branch 'master' of https://github.com/lucventurini/mikado
Browse files Browse the repository at this point in the history
  • Loading branch information
lucventurini committed Aug 3, 2018
2 parents a1dbc9e + 4ea1f1c commit d77bf08
Show file tree
Hide file tree
Showing 11 changed files with 136 additions and 22 deletions.
8 changes: 8 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,11 @@
# Version 1.2.4

Enhancement release. Following version 1.2.3, now Mikado can accept BED12 files as input for convert, compare and stats (see #122). This is becoming necessary as many long-reads alignment tools are preferentially outputting (or can be easily converted to) this format.

# Version 1.2.3

Mainly this is a bug fix release. It has a key advancement though, as now Mikado can accept BED12 files as input assemblies. This makes it compatible with Minimap2 PAF > BED12 system.

# Version 1.2.2

Minor bugfixes:
Expand Down
2 changes: 1 addition & 1 deletion Mikado/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
__author__ = 'Luca Venturini'
__license__ = 'GPL3'
__copyright__ = 'Copyright 2015-2016 Luca Venturini'
__version__ = "1.2.3"
__version__ = "1.2.4"

__all__ = ["configuration",
"exceptions",
Expand Down
3 changes: 2 additions & 1 deletion Mikado/loci/reference_gene.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,7 +269,8 @@ def load_dict(self, state, exclude_utr=False, protein_coding=False):
self.transcripts[tid] = transcript

self.chrom = intern(self.chrom)
self.source = intern(self.source)
if self.source:
self.source = intern(self.source)
self.id = intern(self.id)

return
Expand Down
30 changes: 30 additions & 0 deletions Mikado/parsers/bed12.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,7 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
self.__has_start = False
self.__has_stop = False
self.__transcriptomic = False
self.__parent = None
self.transcriptomic = transcriptomic
if phase:
self.phase = phase # This will check the validity of the phase itself
Expand Down Expand Up @@ -209,6 +210,35 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
if self.invalid and self.coding:
self.coding = False

@property
def is_transcript(self):
"""BED12 files are always transcripts for Mikado."""

return True

@property
def is_gene(self):
"""BED12 files are never "genes" according to Mikado"""
return False

@property
def source(self):
return "bed12"

@property
def gene(self):
return self.parent[0] if self.parent else None

@property
def parent(self):
return self.__parent

@parent.setter
def parent(self, parent):
if parent is not None and not isinstance(parent, str):
raise TypeError(type(parent))
self.__parent = [parent]

def __set_values_from_fields(self):

"""
Expand Down
25 changes: 21 additions & 4 deletions Mikado/scales/compare.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from ..exceptions import CorruptIndex
from ..loci.reference_gene import Gene
from ..parsers.GFF import GFF3
from ..parsers.bed12 import Bed12Parser
from ..parsers import to_gff
from ..utilities.log_utils import create_default_logger, formatter
import magic
Expand Down Expand Up @@ -156,7 +157,17 @@ def prepare_reference(args, queue_logger, ref_gff=False) -> (dict, collections.d

for row in args.reference:
# Assume we are going to use GTF for the moment
if row.is_transcript is True or (ref_gff is True and row.feature == "match"):
if row.header is True:
continue
elif args.reference.__annot_type__ == Bed12Parser.__annot_type__:
transcript = Transcript(row, logger=queue_logger)
transcript.parent = gid = row.id
transcript2gene[row.id] = gid
if gid not in genes:
genes[gid] = Gene(transcript, gid=gid, logger=queue_logger)
genes[gid].add(transcript)

elif row.is_transcript is True or (ref_gff is True and row.feature == "match"):
queue_logger.debug("Transcript\n%s", str(row))
transcript = Transcript(row, logger=queue_logger)
if row.feature == "match":
Expand Down Expand Up @@ -233,12 +244,15 @@ def parse_prediction(args, genes, positions, queue_logger):
args.prediction = to_gff(args.reference.name)
ref_gff = isinstance(args.prediction, GFF3)
__found_with_orf = set()
is_bed12 = isinstance(args.prediction, Bed12Parser)

for row in args.prediction:
if row.header is True:
continue
# queue_logger.debug("Row:\n{0:>20}".format(str(row)))
if row.is_transcript is True or row.feature == "match":
elif (row.is_transcript is True or
args.prediction.__annot_type__ == Bed12Parser.__annot_type__ or
row.feature == "match"):
queue_logger.debug("Transcript row:\n%s", str(row))
if transcript is not None:
if re.search(r"\.orf[0-9]+$", transcript.id):
Expand All @@ -251,6 +265,9 @@ def parse_prediction(args, genes, positions, queue_logger):
else:
assigner_instance.get_best(transcript)
transcript = Transcript(row, logger=queue_logger)
if is_bed12:
transcript.parent = transcript.id

elif row.is_exon is True:
# Case 1: we are talking about cDNA_match and GFF
if ref_gff is True and "match" not in row.feature:
Expand Down Expand Up @@ -549,8 +566,8 @@ def compare(args):
if genes is None:
queue_logger.info("Starting parsing the reference")
exclude_utr, protein_coding = args.exclude_utr, args.protein_coding
if args.no_save_index is False:
args.exclude_utr, args.protein_coding = False, False
# if args.no_save_index is False:
# args.exclude_utr, args.protein_coding = False, False
genes, positions = prepare_reference(args,
queue_logger,
ref_gff=ref_gff)
Expand Down
11 changes: 8 additions & 3 deletions Mikado/subprograms/util/convert.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ def launch(args):
current = None
if parser.__annot_type__ == "gtf":
out_format = "gff3"
else:
elif parser.__annot_type__ in ("bed12", "gff3"):
out_format = "gtf"
else:
raise TypeError("Invalid annotation type: {}".format(parser.__annot_type__))

if (args.out_format is not None and
args.out_format != out_format):
Expand All @@ -34,11 +36,14 @@ def launch(args):
if current is None or ((line.is_gene or line.is_transcript) and line.gene != current.id):
if current:
print(current.format(out_format), file=args.out)
if "superlocus" in line.feature: # Hack for Mikado files

if hasattr(line, "feature") and "superlocus" in line.feature: # Hack for Mikado files
continue
elif line.is_gene is True:
current = Gene(line)
else:
if line.parent is None:
line.parent = "{}.gene".format(line.id) # Hack for BED12 files
current = Gene(Transcript(line))
elif parser.__annot_type__ == "gtf" and line.is_exon is True and (
current is None or current.id != line.gene):
Expand All @@ -63,7 +68,7 @@ def convert_parser():
"""

parser = argparse.ArgumentParser(
"Utility to covert from GTF to GFF3 and vice versa.")
"Utility to covert across GTF, GFF3 and BED12.")
parser.add_argument("-of", "--out-format", dest="out_format",
choices=["bed12", "gtf", "gff3"], default=None)
parser.add_argument("gf", type=argparse.FileType())
Expand Down
27 changes: 17 additions & 10 deletions Mikado/subprograms/util/stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from ...exceptions import InvalidCDS
from .. import to_gff
from ...loci import Transcript, Gene
from ...parsers import GFF
from ...parsers.GFF import GFF3
import numpy
from collections import namedtuple, Counter
from ...utilities.log_utils import create_default_logger
Expand Down Expand Up @@ -181,10 +181,8 @@ def __init__(self, parsed_args):
"""Constructor function"""

self.gff = parsed_args.gff
if isinstance(self.gff, GFF.GFF3):
self.is_gff = True
else:
self.is_gff = False

self.atype = self.gff.__annot_type__

self.__logger = create_default_logger("calculator")
if parsed_args.verbose is True:
Expand Down Expand Up @@ -236,12 +234,20 @@ def parse_input(self):
current_gene = None

for record in self.gff:
if record.is_gene is True:
if self.atype == "bed12":
self.__store_gene(current_gene)
if not record.parent:
record.parent = "{}.gene".format(record.id)
current_gene = Gene(record.copy(), gid=record.parent[0], only_coding=self.only_coding,
logger=self.__logger)
transcript2gene[record.id] = record.parent[0]
current_gene.transcripts[record.id] = TranscriptComputer(record, logger=self.__logger)
elif record.is_gene is True:
self.__store_gene(current_gene)
current_gene = Gene(record,
only_coding=self.only_coding,
logger=self.__logger)
elif record.is_transcript is True or (self.is_gff is True and record.feature == "match"):
elif record.is_transcript is True or (self.atype == GFF3.__annot_type__ and record.feature == "match"):
if record.parent is None and record.feature != "match":
raise TypeError("No parent found for:\n{0}".format(str(record)))
elif record.feature == "match":
Expand All @@ -264,10 +270,11 @@ def parse_input(self):
transcript2gene[record.id] = gid
current_gene.transcripts[record.id] = TranscriptComputer(record,
logger=self.__logger)

elif record.is_derived is True and record.is_exon is False:
derived_features.add(record.id)
elif record.is_exon is True:
if self.is_gff is False:
if self.atype != GFF3.__annot_type__:
if "transcript_id" not in record.attributes:
# Probably truncated record!
record.header = True
Expand All @@ -292,7 +299,7 @@ def parse_input(self):
logger=self.__logger)
else:
current_gene.transcripts[record.transcript].add_exon(record)
elif self.is_gff is True and "cDNA_match" in record.feature:
elif self.atype == GFF3.__annot_type__ and "cDNA_match" in record.feature:
# Here we assume that we only have "cDNA_match" lines, with no parents
record.parent = record.id
if current_gene is None or record.id != current_gene.id:
Expand All @@ -313,7 +320,7 @@ def parse_input(self):
else:
current_gene.transcripts[record.id].add_exon(record)

elif self.is_gff is True:
elif self.atype == GFF3.__annot_type__:
current_gene.add_exon(record)

elif record.header is True:
Expand Down
9 changes: 6 additions & 3 deletions Mikado/tests/test_system_calls.py
Original file line number Diff line number Diff line change
Expand Up @@ -202,7 +202,8 @@ def test_index(self):
files = ["trinity.gtf",
"trinity.gff3",
"trinity.cDNA_match.gff3",
"trinity.match_matchpart.gff3"]
"trinity.match_matchpart.gff3",
"trinity.bed12"]
# files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files]

namespace = Namespace(default=False)
Expand Down Expand Up @@ -238,7 +239,8 @@ def test_compare_trinity(self):
files = ["trinity.gtf",
"trinity.gff3",
"trinity.cDNA_match.gff3",
"trinity.match_matchpart.gff3"]
"trinity.match_matchpart.gff3",
"trinity.bed12"]
files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files]

namespace = Namespace(default=False)
Expand Down Expand Up @@ -292,7 +294,8 @@ def test_stat(self):
files = ["trinity.gtf",
"trinity.gff3",
"trinity.cDNA_match.gff3",
"trinity.match_matchpart.gff3"]
"trinity.match_matchpart.gff3",
"trinity.bed12"]
files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files]

std_lines = []
Expand Down
38 changes: 38 additions & 0 deletions Mikado/tests/trinity.bed12
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
Chr5 26578495 26579563 ID=c73_g1_i1.mrna1.160;coding=False 95.0 - 26578495 26578496 0 2 23,263 0,805
Chr5 26581224 26583874 ID=c11_g1_i2.mrna1.111;coding=False 99.0 - 26581224 26581225 0 2 304,540 0,2110
Chr5 26583215 26583874 ID=c11_g1_i1.mrna1.350;coding=False 99.0 - 26583215 26583216 0 1 659 0
Chr5 26584172 26584464 ID=c3_g1_i2.mrna1.181;coding=False 100.0 - 26584172 26584173 0 2 106,107 0,185
Chr5 26584172 26584464 ID=c3_g1_i1.mrna1.716;coding=False 99.0 + 26584172 26584173 0 1 292 0
Chr5 26584796 26595528 ID=c58_g1_i3.mrna1.19;coding=False 100.0 + 26584796 26584797 0 11 83,54,545,313,105,213,63,119,59,46,118 0,423,548,1185,1623,1841,2137,2287,2490,2630,10614
Chr5 26585506 26586850 ID=c58_g1_i2.mrna1.35;coding=False 99.0 + 26585506 26585507 0 5 383,121,78,105,213 0,475,710,913,1131
Chr5 26586850 26595457 ID=c58_g1_i1.mrna1.190;coding=False 100.0 + 26586850 26586851 0 5 146,119,59,46,47 0,233,436,576,8560
Chr5 26586850 26595457 ID=c58_g1_i7.mrna1.1;coding=False 99.0 + 26586850 26586851 0 4 352,59,46,47 0,436,576,8560
Chr5 26587454 26587912 ID=c108_g1_i1.mrna1.104;coding=False 99.0 + 26587454 26587455 0 1 458 0
Chr5 26588407 26594772 ID=c58_g1_i11.mrna1;coding=False 100.0 + 26588407 26588408 0 6 218,84,782,133,72,455 0,788,978,1853,5764,5910
Chr5 26588407 26598230 ID=c58_g1_i5.mrna1.3;coding=False 100.0 + 26588407 26588408 0 12 218,84,782,133,72,99,213,63,119,59,346,2519 0,788,978,1853,2087,2233,2472,2766,2916,3112,3273,7304
Chr5 26588407 26601049 ID=c58_g1_i10.mrna1;coding=False 100.0 + 26588407 26588408 0 9 218,84,782,133,72,99,204,18,45 0,788,978,1853,5764,5910,6161,6452,12597
Chr5 26588602 26589218 ID=c68_g1_i1.mrna1.173;coding=False 100.0 + 26588602 26588603 0 2 252,23 0,593
Chr5 26591173 26595528 ID=c58_g1_i11.mrna2;coding=False 100.0 + 26591173 26591174 0 5 63,119,59,90,73 0,150,346,507,4282
Chr5 26591323 26591981 ID=c58_g1_i8.mrna2;coding=False 100.0 + 26591323 26591324 0 4 119,59,165,40 0,196,357,618
Chr5 26591323 26591981 ID=c58_g1_i10.mrna2;coding=False 100.0 + 26591323 26591324 0 4 119,59,165,40 0,196,357,618
Chr5 26591980 26594589 ID=c58_g1_i12.mrna1;coding=False 100.0 + 26591980 26591981 0 6 22,838,388,133,72,272 0,547,1468,1949,2191,2337
Chr5 26591980 26595084 ID=c58_g1_i9.mrna1;coding=False 100.0 + 26591980 26591981 0 8 22,838,388,133,72,99,354,82 0,547,1468,1949,2191,2337,2588,3022
Chr5 26591980 26598230 ID=c58_g1_i4.mrna1.6;coding=False 100.0 + 26591980 26591981 0 11 22,838,388,133,72,99,204,63,119,59,2865 0,547,1468,1949,2191,2337,2588,2879,3022,3229,3385
Chr5 26591980 26598230 ID=c58_g1_i6.mrna1.3;coding=False 100.0 + 26591980 26591981 0 10 22,1309,133,72,99,204,63,119,59,2865 0,547,1949,2191,2337,2588,2879,3022,3229,3385
Chr5 26591980 26601049 ID=c58_g1_i8.mrna1;coding=False 100.0 + 26591980 26591981 0 9 22,838,388,133,72,99,204,18,45 0,547,1468,1949,2191,2337,2588,2879,9024
Chr5 26592042 26592421 ID=c113_g1_i1.mrna1.94;coding=False 96.0 + 26592042 26592043 0 1 379 0
Chr5 26599400 26600244 ID=c77_g1_i1.mrna1.153;coding=False 99.0 + 26599400 26599401 0 3 254,287,94 0,366,750
Chr5 26600324 26600815 ID=c109_g1_i1.mrna1.102;coding=False 98.0 + 26600324 26600325 0 3 70,120,120 0,172,371
Chr5 26602732 26604736 ID=c60_g1_i1.mrna1.189;coding=False 98.0 - 26602732 26602733 0 2 906,1010 0,994
Chr5 26604727 26604970 ID=c115_g1_i1.mrna1.88;coding=False 99.0 - 26604727 26604728 0 1 243 0
Chr5 26604952 26605502 ID=c152_g1_i1.mrna1.66;coding=False 99.0 + 26604952 26604953 0 1 550 0
Chr5 26605480 26605955 ID=c21_g1_i1.mrna1.302;coding=False 99.0 + 26605480 26605481 0 1 475 0
Chr5 26605931 26606539 ID=c41_g1_i1.mrna1.224;coding=False 99.0 + 26605931 26605932 0 1 608 0
Chr5 26609248 26610543 ID=c6_g1_i1.mrna1.412;coding=False 99.0 + 26609248 26609249 0 1 1295 0
Chr5 26611511 26611831 ID=c74_g1_i1.mrna1.154;coding=False 99.0 - 26611511 26611512 0 1 320 0
Chr5 26611809 26612012 ID=c71_g1_i1.mrna1.167;coding=False 100.0 - 26611809 26611810 0 1 203 0
Chr5 26612156 26612362 ID=c114_g1_i1.mrna1.89;coding=False 99.0 - 26612156 26612157 0 1 206 0
Chr5 26612357 26613923 ID=c137_g1_i1.mrna1.77;coding=False 98.0 - 26612357 26612358 0 3 370,349,24 0,488,1542
Chr5 26612910 26614294 ID=c37_g1_i2.mrna1.66;coding=False 100.0 - 26612910 26612911 0 2 22,395 0,989
Chr5 26613845 26614294 ID=c37_g1_i1.mrna1.234;coding=False 96.0 + 26613845 26613846 0 1 449 0
Chr5 26614713 26614982 ID=c120_g1_i1.mrna1.89;coding=False 98.0 + 26614713 26614714 0 1 269 0
2 changes: 2 additions & 0 deletions Mikado/transcripts/transcript.py
Original file line number Diff line number Diff line change
Expand Up @@ -364,6 +364,8 @@ def __initialize_with_bed12(self, transcript_row: BED12):
exon_starts = np.array([_ + self.start for _ in transcript_row.block_starts])
exon_ends = exon_starts + np.array(transcript_row.block_sizes) - 1
self.add_exons(list(zip(list(exon_starts), list(exon_ends))))
self.parent = getattr(transcript_row, "parent", transcript_row.id)
self.source = getattr(transcript_row, "source", None)
# Now we have to calculate the CDS
cds = []
if transcript_row.coding is True:
Expand Down
3 changes: 3 additions & 0 deletions util/bam2gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,10 +29,13 @@ def main():
# X 8 sequence mismatch

for record in args.bam:
record.cigar = [(key, val) if key not in (7, 8) else (0, val) for key, val in record.cigar]

try:
start, end = record.reference_start, record.get_blocks()[-1][1]
except IndexError:
continue

current = start

exons = []
Expand Down

0 comments on commit d77bf08

Please sign in to comment.