Merge branch 'master' of https://github.com/lucventurini/mikado

EI-CoreBioinformatics · Aug 3, 2018 · d77bf08 · d77bf08
2 parents a1dbc9e + 4ea1f1c
commit d77bf08
Show file tree

Hide file tree

Showing 11 changed files with 136 additions and 22 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,11 @@
+# Version 1.2.4
+
+Enhancement release. Following version 1.2.3, now Mikado can accept BED12 files as input for convert, compare and stats (see #122). This is becoming necessary as many long-reads alignment tools are preferentially outputting (or can be easily converted to) this format.
+
+# Version 1.2.3
+
+Mainly this is a bug fix release. It has a key advancement though, as now Mikado can accept BED12 files as input assemblies. This makes it compatible with Minimap2 PAF > BED12 system.
+
 # Version 1.2.2
 
 Minor bugfixes:

diff --git a/Mikado/__init__.py b/Mikado/__init__.py
@@ -10,7 +10,7 @@
 __author__ = 'Luca Venturini'
 __license__ = 'GPL3'
 __copyright__ = 'Copyright 2015-2016 Luca Venturini'
-__version__ = "1.2.3"
+__version__ = "1.2.4"
 
 __all__ = ["configuration",
            "exceptions",

diff --git a/Mikado/loci/reference_gene.py b/Mikado/loci/reference_gene.py
@@ -269,7 +269,8 @@ def load_dict(self, state, exclude_utr=False, protein_coding=False):
             self.transcripts[tid] = transcript
 
         self.chrom = intern(self.chrom)
-        self.source = intern(self.source)
+        if self.source:
+            self.source = intern(self.source)
         self.id = intern(self.id)
 
         return

diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py
@@ -119,6 +119,7 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
         self.__has_start = False
         self.__has_stop = False
         self.__transcriptomic = False
+        self.__parent = None
         self.transcriptomic = transcriptomic
         if phase:
             self.phase = phase  # This will check the validity of the phase itself
@@ -209,6 +210,35 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
         if self.invalid and self.coding:
             self.coding = False
 
+    @property
+    def is_transcript(self):
+        """BED12 files are always transcripts for Mikado."""
+
+        return True
+
+    @property
+    def is_gene(self):
+        """BED12 files are never "genes" according to Mikado"""
+        return False
+
+    @property
+    def source(self):
+        return "bed12"
+
+    @property
+    def gene(self):
+        return self.parent[0] if self.parent else None
+
+    @property
+    def parent(self):
+        return self.__parent
+
+    @parent.setter
+    def parent(self, parent):
+        if parent is not None and not isinstance(parent, str):
+            raise TypeError(type(parent))
+        self.__parent = [parent]
+
     def __set_values_from_fields(self):
 
         """

diff --git a/Mikado/scales/compare.py b/Mikado/scales/compare.py
@@ -21,6 +21,7 @@
 from ..exceptions import CorruptIndex
 from ..loci.reference_gene import Gene
 from ..parsers.GFF import GFF3
+from ..parsers.bed12 import Bed12Parser
 from ..parsers import to_gff
 from ..utilities.log_utils import create_default_logger, formatter
 import magic
@@ -156,7 +157,17 @@ def prepare_reference(args, queue_logger, ref_gff=False) -> (dict, collections.d
 
     for row in args.reference:
         # Assume we are going to use GTF for the moment
-        if row.is_transcript is True or (ref_gff is True and row.feature == "match"):
+        if row.header is True:
+            continue
+        elif args.reference.__annot_type__ == Bed12Parser.__annot_type__:
+            transcript = Transcript(row, logger=queue_logger)
+            transcript.parent = gid = row.id
+            transcript2gene[row.id] = gid
+            if gid not in genes:
+                genes[gid] = Gene(transcript, gid=gid, logger=queue_logger)
+            genes[gid].add(transcript)
+
+        elif row.is_transcript is True or (ref_gff is True and row.feature == "match"):
             queue_logger.debug("Transcript\n%s", str(row))
             transcript = Transcript(row, logger=queue_logger)
             if row.feature == "match":
@@ -233,12 +244,15 @@ def parse_prediction(args, genes, positions, queue_logger):
         args.prediction = to_gff(args.reference.name)
     ref_gff = isinstance(args.prediction, GFF3)
     __found_with_orf = set()
+    is_bed12 = isinstance(args.prediction, Bed12Parser)
 
     for row in args.prediction:
         if row.header is True:
             continue
         #         queue_logger.debug("Row:\n{0:>20}".format(str(row)))
-        if row.is_transcript is True or row.feature == "match":
+        elif (row.is_transcript is True or
+                      args.prediction.__annot_type__ == Bed12Parser.__annot_type__ or
+                      row.feature == "match"):
             queue_logger.debug("Transcript row:\n%s", str(row))
             if transcript is not None:
                 if re.search(r"\.orf[0-9]+$", transcript.id):
@@ -251,6 +265,9 @@ def parse_prediction(args, genes, positions, queue_logger):
                 else:
                     assigner_instance.get_best(transcript)
             transcript = Transcript(row, logger=queue_logger)
+            if is_bed12:
+                transcript.parent = transcript.id
+
         elif row.is_exon is True:
             # Case 1: we are talking about cDNA_match and GFF
             if ref_gff is True and "match" not in row.feature:
@@ -549,8 +566,8 @@ def compare(args):
         if genes is None:
             queue_logger.info("Starting parsing the reference")
             exclude_utr, protein_coding = args.exclude_utr, args.protein_coding
-            if args.no_save_index is False:
-                args.exclude_utr, args.protein_coding = False, False
+            # if args.no_save_index is False:
+            #     args.exclude_utr, args.protein_coding = False, False
             genes, positions = prepare_reference(args,
                                                  queue_logger,
                                                  ref_gff=ref_gff)

diff --git a/Mikado/subprograms/util/convert.py b/Mikado/subprograms/util/convert.py
@@ -14,8 +14,10 @@ def launch(args):
     current = None
     if parser.__annot_type__ == "gtf":
         out_format = "gff3"
-    else:
+    elif parser.__annot_type__ in ("bed12", "gff3"):
         out_format = "gtf"
+    else:
+        raise TypeError("Invalid annotation type: {}".format(parser.__annot_type__))
 
     if (args.out_format is not None and
                 args.out_format != out_format):
@@ -34,11 +36,14 @@ def launch(args):
         if current is None or ((line.is_gene or line.is_transcript) and line.gene != current.id):
             if current:
                 print(current.format(out_format), file=args.out)
-            if "superlocus" in line.feature:  # Hack for Mikado files
+
+            if hasattr(line, "feature") and "superlocus" in line.feature:  # Hack for Mikado files
                 continue
             elif line.is_gene is True:
                 current = Gene(line)
             else:
+                if line.parent is None:
+                    line.parent = "{}.gene".format(line.id)  # Hack for BED12 files
                 current = Gene(Transcript(line))
         elif parser.__annot_type__ == "gtf" and line.is_exon is True and (
                         current is None or current.id != line.gene):
@@ -63,7 +68,7 @@ def convert_parser():
     """
 
     parser = argparse.ArgumentParser(
-        "Utility to covert from GTF to GFF3 and vice versa.")
+        "Utility to covert across GTF, GFF3 and BED12.")
     parser.add_argument("-of", "--out-format", dest="out_format",
                         choices=["bed12", "gtf", "gff3"], default=None)
     parser.add_argument("gf", type=argparse.FileType())

diff --git a/Mikado/subprograms/util/stats.py b/Mikado/subprograms/util/stats.py
@@ -10,7 +10,7 @@
 from ...exceptions import InvalidCDS
 from .. import to_gff
 from ...loci import Transcript, Gene
-from ...parsers import GFF
+from ...parsers.GFF import GFF3
 import numpy
 from collections import namedtuple, Counter
 from ...utilities.log_utils import create_default_logger
@@ -181,10 +181,8 @@ def __init__(self, parsed_args):
         """Constructor function"""
 
         self.gff = parsed_args.gff
-        if isinstance(self.gff, GFF.GFF3):
-            self.is_gff = True
-        else:
-            self.is_gff = False
+
+        self.atype = self.gff.__annot_type__
 
         self.__logger = create_default_logger("calculator")
         if parsed_args.verbose is True:
@@ -236,12 +234,20 @@ def parse_input(self):
         current_gene = None
 
         for record in self.gff:
-            if record.is_gene is True:
+            if self.atype == "bed12":
+                self.__store_gene(current_gene)
+                if not record.parent:
+                    record.parent = "{}.gene".format(record.id)
+                current_gene = Gene(record.copy(), gid=record.parent[0], only_coding=self.only_coding,
+                                    logger=self.__logger)
+                transcript2gene[record.id] = record.parent[0]
+                current_gene.transcripts[record.id] = TranscriptComputer(record, logger=self.__logger)
+            elif record.is_gene is True:
                 self.__store_gene(current_gene)
                 current_gene = Gene(record,
                                     only_coding=self.only_coding,
                                     logger=self.__logger)
-            elif record.is_transcript is True or (self.is_gff is True and record.feature == "match"):
+            elif record.is_transcript is True or (self.atype == GFF3.__annot_type__ and record.feature == "match"):
                 if record.parent is None and record.feature != "match":
                     raise TypeError("No parent found for:\n{0}".format(str(record)))
                 elif record.feature == "match":
@@ -264,10 +270,11 @@ def parse_input(self):
                 transcript2gene[record.id] = gid
                 current_gene.transcripts[record.id] = TranscriptComputer(record,
                                                                          logger=self.__logger)
+
             elif record.is_derived is True and record.is_exon is False:
                 derived_features.add(record.id)
             elif record.is_exon is True:
-                if self.is_gff is False:
+                if self.atype != GFF3.__annot_type__:
                     if "transcript_id" not in record.attributes:
                         # Probably truncated record!
                         record.header = True
@@ -292,7 +299,7 @@ def parse_input(self):
                                                                                          logger=self.__logger)
                     else:
                         current_gene.transcripts[record.transcript].add_exon(record)
-                elif self.is_gff is True and "cDNA_match" in record.feature:
+                elif self.atype == GFF3.__annot_type__ and "cDNA_match" in record.feature:
                     # Here we assume that we only have "cDNA_match" lines, with no parents
                     record.parent = record.id
                     if current_gene is None or record.id != current_gene.id:
@@ -313,7 +320,7 @@ def parse_input(self):
                     else:
                         current_gene.transcripts[record.id].add_exon(record)
 
-                elif self.is_gff is True:
+                elif self.atype == GFF3.__annot_type__:
                     current_gene.add_exon(record)
 
             elif record.header is True:

diff --git a/Mikado/tests/test_system_calls.py b/Mikado/tests/test_system_calls.py
@@ -202,7 +202,8 @@ def test_index(self):
         files = ["trinity.gtf",
                  "trinity.gff3",
                  "trinity.cDNA_match.gff3",
-                 "trinity.match_matchpart.gff3"]
+                 "trinity.match_matchpart.gff3",
+                 "trinity.bed12"]
         # files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files]
 
         namespace = Namespace(default=False)
@@ -238,7 +239,8 @@ def test_compare_trinity(self):
         files = ["trinity.gtf",
                  "trinity.gff3",
                  "trinity.cDNA_match.gff3",
-                 "trinity.match_matchpart.gff3"]
+                 "trinity.match_matchpart.gff3",
+                 "trinity.bed12"]
         files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files]
 
         namespace = Namespace(default=False)
@@ -292,7 +294,8 @@ def test_stat(self):
         files = ["trinity.gtf",
                  "trinity.gff3",
                  "trinity.cDNA_match.gff3",
-                 "trinity.match_matchpart.gff3"]
+                 "trinity.match_matchpart.gff3",
+                 "trinity.bed12"]
         files = [pkg_resources.resource_filename("Mikado.tests", filename) for filename in files]
 
         std_lines = []

diff --git a/Mikado/tests/trinity.bed12 b/Mikado/tests/trinity.bed12
@@ -0,0 +1,38 @@
+Chr5	26578495	26579563	ID=c73_g1_i1.mrna1.160;coding=False	95.0	-	26578495	26578496	0	2	23,263	0,805
+Chr5	26581224	26583874	ID=c11_g1_i2.mrna1.111;coding=False	99.0	-	26581224	26581225	0	2	304,540	0,2110
+Chr5	26583215	26583874	ID=c11_g1_i1.mrna1.350;coding=False	99.0	-	26583215	26583216	0	1	659	0
+Chr5	26584172	26584464	ID=c3_g1_i2.mrna1.181;coding=False	100.0	-	26584172	26584173	0	2	106,107	0,185
+Chr5	26584172	26584464	ID=c3_g1_i1.mrna1.716;coding=False	99.0	+	26584172	26584173	0	1	292	0
+Chr5	26584796	26595528	ID=c58_g1_i3.mrna1.19;coding=False	100.0	+	26584796	26584797	0	11	83,54,545,313,105,213,63,119,59,46,118	0,423,548,1185,1623,1841,2137,2287,2490,2630,10614
+Chr5	26585506	26586850	ID=c58_g1_i2.mrna1.35;coding=False	99.0	+	26585506	26585507	0	5	383,121,78,105,213	0,475,710,913,1131
+Chr5	26586850	26595457	ID=c58_g1_i1.mrna1.190;coding=False	100.0	+	26586850	26586851	0	5	146,119,59,46,47	0,233,436,576,8560
+Chr5	26586850	26595457	ID=c58_g1_i7.mrna1.1;coding=False	99.0	+	26586850	26586851	0	4	352,59,46,47	0,436,576,8560
+Chr5	26587454	26587912	ID=c108_g1_i1.mrna1.104;coding=False	99.0	+	26587454	26587455	0	1	458	0
+Chr5	26588407	26594772	ID=c58_g1_i11.mrna1;coding=False	100.0	+	26588407	26588408	0	6	218,84,782,133,72,455	0,788,978,1853,5764,5910
+Chr5	26588407	26598230	ID=c58_g1_i5.mrna1.3;coding=False	100.0	+	26588407	26588408	0	12	218,84,782,133,72,99,213,63,119,59,346,2519	0,788,978,1853,2087,2233,2472,2766,2916,3112,3273,7304
+Chr5	26588407	26601049	ID=c58_g1_i10.mrna1;coding=False	100.0	+	26588407	26588408	0	9	218,84,782,133,72,99,204,18,45	0,788,978,1853,5764,5910,6161,6452,12597
+Chr5	26588602	26589218	ID=c68_g1_i1.mrna1.173;coding=False	100.0	+	26588602	26588603	0	2	252,23	0,593
+Chr5	26591173	26595528	ID=c58_g1_i11.mrna2;coding=False	100.0	+	26591173	26591174	0	5	63,119,59,90,73	0,150,346,507,4282
+Chr5	26591323	26591981	ID=c58_g1_i8.mrna2;coding=False	100.0	+	26591323	26591324	0	4	119,59,165,40	0,196,357,618
+Chr5	26591323	26591981	ID=c58_g1_i10.mrna2;coding=False	100.0	+	26591323	26591324	0	4	119,59,165,40	0,196,357,618
+Chr5	26591980	26594589	ID=c58_g1_i12.mrna1;coding=False	100.0	+	26591980	26591981	0	6	22,838,388,133,72,272	0,547,1468,1949,2191,2337
+Chr5	26591980	26595084	ID=c58_g1_i9.mrna1;coding=False	100.0	+	26591980	26591981	0	8	22,838,388,133,72,99,354,82	0,547,1468,1949,2191,2337,2588,3022
+Chr5	26591980	26598230	ID=c58_g1_i4.mrna1.6;coding=False	100.0	+	26591980	26591981	0	11	22,838,388,133,72,99,204,63,119,59,2865	0,547,1468,1949,2191,2337,2588,2879,3022,3229,3385
+Chr5	26591980	26598230	ID=c58_g1_i6.mrna1.3;coding=False	100.0	+	26591980	26591981	0	10	22,1309,133,72,99,204,63,119,59,2865	0,547,1949,2191,2337,2588,2879,3022,3229,3385
+Chr5	26591980	26601049	ID=c58_g1_i8.mrna1;coding=False	100.0	+	26591980	26591981	0	9	22,838,388,133,72,99,204,18,45	0,547,1468,1949,2191,2337,2588,2879,9024
+Chr5	26592042	26592421	ID=c113_g1_i1.mrna1.94;coding=False	96.0	+	26592042	26592043	0	1	379	0
+Chr5	26599400	26600244	ID=c77_g1_i1.mrna1.153;coding=False	99.0	+	26599400	26599401	0	3	254,287,94	0,366,750
+Chr5	26600324	26600815	ID=c109_g1_i1.mrna1.102;coding=False	98.0	+	26600324	26600325	0	3	70,120,120	0,172,371
+Chr5	26602732	26604736	ID=c60_g1_i1.mrna1.189;coding=False	98.0	-	26602732	26602733	0	2	906,1010	0,994
+Chr5	26604727	26604970	ID=c115_g1_i1.mrna1.88;coding=False	99.0	-	26604727	26604728	0	1	243	0
+Chr5	26604952	26605502	ID=c152_g1_i1.mrna1.66;coding=False	99.0	+	26604952	26604953	0	1	550	0
+Chr5	26605480	26605955	ID=c21_g1_i1.mrna1.302;coding=False	99.0	+	26605480	26605481	0	1	475	0
+Chr5	26605931	26606539	ID=c41_g1_i1.mrna1.224;coding=False	99.0	+	26605931	26605932	0	1	608	0
+Chr5	26609248	26610543	ID=c6_g1_i1.mrna1.412;coding=False	99.0	+	26609248	26609249	0	1	1295	0
+Chr5	26611511	26611831	ID=c74_g1_i1.mrna1.154;coding=False	99.0	-	26611511	26611512	0	1	320	0
+Chr5	26611809	26612012	ID=c71_g1_i1.mrna1.167;coding=False	100.0	-	26611809	26611810	0	1	203	0
+Chr5	26612156	26612362	ID=c114_g1_i1.mrna1.89;coding=False	99.0	-	26612156	26612157	0	1	206	0
+Chr5	26612357	26613923	ID=c137_g1_i1.mrna1.77;coding=False	98.0	-	26612357	26612358	0	3	370,349,24	0,488,1542
+Chr5	26612910	26614294	ID=c37_g1_i2.mrna1.66;coding=False	100.0	-	26612910	26612911	0	2	22,395	0,989
+Chr5	26613845	26614294	ID=c37_g1_i1.mrna1.234;coding=False	96.0	+	26613845	26613846	0	1	449	0
+Chr5	26614713	26614982	ID=c120_g1_i1.mrna1.89;coding=False	98.0	+	26614713	26614714	0	1	269	0
diff --git a/Mikado/transcripts/transcript.py b/Mikado/transcripts/transcript.py
@@ -364,6 +364,8 @@ def __initialize_with_bed12(self, transcript_row: BED12):
         exon_starts = np.array([_ + self.start for _ in transcript_row.block_starts])
         exon_ends = exon_starts + np.array(transcript_row.block_sizes) - 1
         self.add_exons(list(zip(list(exon_starts), list(exon_ends))))
+        self.parent = getattr(transcript_row, "parent", transcript_row.id)
+        self.source = getattr(transcript_row, "source", None)
         # Now we have to calculate the CDS
         cds = []
         if transcript_row.coding is True:

diff --git a/util/bam2gtf.py b/util/bam2gtf.py
@@ -29,10 +29,13 @@ def main():
     # X 8 sequence mismatch
 
     for record in args.bam:
+        record.cigar = [(key, val) if key not in (7, 8) else (0, val) for key, val in record.cigar]
+
         try:
             start, end = record.reference_start, record.get_blocks()[-1][1]
         except IndexError:
             continue
+
         current = start
 
         exons = []