Merged issue-270, fix #270

EI-CoreBioinformatics · Oct 7, 2020 · 9b04246
1 parent ca90031
commit 9b04246

Showing 18 changed files with 145 additions and 132 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -5,6 +5,7 @@ python:
   # - "3.5"
   - "3.6"
   - "3.7.3"
+  - "3.8"
 # Setup anaconda, see https://gist.github.com/dan-blanchard/7045057
 before_install:
   - wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh -O miniconda.sh

diff --git a/Mikado/configuration/configuration_blueprint.json b/Mikado/configuration/configuration_blueprint.json
@@ -250,7 +250,7 @@
       "required": ["files", "strand_specific"],
       "properties":
       {
-        "keep_redundant": {
+        "exclude_redundant": {
           "type": "boolean", "default": false
         },
         "minimum_cdna_length": {
@@ -290,7 +290,7 @@
             "labels": {"type": "array", "default": []},
             "strand_specific_assemblies": {"type": "array", "default": []},
             "reference": {"type": "array", "default": []},
-            "keep_redundant": {"type": "array", "default": []},
+            "exclude_redundant": {"type": "array", "default": []},
             "source_score":{
               "type": "object",
               "default": {},

diff --git a/Mikado/loci/locus.py b/Mikado/loci/locus.py
@@ -266,7 +266,8 @@ def __launch_padding(self):
         # The "templates" are the transcripts that we used to expand the others.
         templates = self.pad_transcripts()
         # First off, let us update the transcripts.
-        for tid in self.transcripts:
+        tid_keys = list(self.transcripts.keys())
+        for tid in tid_keys:
             self.logger.debug("Swapping %s", tid)
             self._swap_transcript(backup[tid], self.transcripts[tid])
 

diff --git a/Mikado/parsers/bed12.py b/Mikado/parsers/bed12.py
@@ -62,8 +62,8 @@ def get_tables(table, to_stop=False, gap=None):
     if gap is not None:
         forward_table[gap * 3] = "*"
 
-    if table.nucleotide_alphabet.letters is not None:
-        valid_letters = set(table.nucleotide_alphabet.letters.upper())
+    if table.nucleotide_alphabet is not None:
+        valid_letters = set(table.nucleotide_alphabet.upper())
     else:
         # Assume the worst case, ambiguous DNA or RNA:
         valid_letters = backup_valid_letters
@@ -298,8 +298,8 @@ def __init__(self, *args: Union[str, list, tuple, GffLine],
         self.score = 0
         self.strand = None
         self.rgb = ''
-        self.__block_sizes = np.zeros(1, dtype=np.int_)
-        self.__block_starts = np.zeros(1, dtype=np.int_)
+        self.__block_sizes = np.zeros(1, dtype=np.int64)
+        self.__block_starts = np.zeros(1, dtype=np.int64)
         self.__block_count = 1
         self.__invalid = None
         self.invalid_reason = None
@@ -1167,14 +1167,14 @@ def block_sizes(self):
     @block_sizes.setter
     def block_sizes(self, sizes):
         sizes = np.array(sizes)
-        if not issubclass(sizes.dtype.type, np.int_):
+        if not issubclass(sizes.dtype.type, np.int64):
             raise TypeError("Block sizes should be integers!")
         self.__block_sizes = sizes
         del self.invalid
 
     @block_sizes.deleter
     def block_sizes(self):
-        self.__block_sizes = np.zeros(1, dtype=np.int_)
+        self.__block_sizes = np.zeros(1, dtype=np.int64)
         del self.invalid
 
     @property
@@ -1184,7 +1184,7 @@ def block_starts(self):
     @block_starts.setter
     def block_starts(self, starts):
         starts = np.array(starts)
-        if not issubclass(starts.dtype.type, np.int_):
+        if not issubclass(starts.dtype.type, np.int64):
             raise TypeError("Block sizes should be integers! Dtype: {}; array: {}".format(
                 starts.dtype, starts
             ))
@@ -1193,7 +1193,7 @@ def block_starts(self, starts):
 
     @block_starts.deleter
     def block_starts(self):
-        self.__block_starts = np.zeros(1, dtype=np.int_)
+        self.__block_starts = np.zeros(1, dtype=np.int64)
         del self.invalid
 
     @property
@@ -1364,7 +1364,7 @@ def to_transcriptomic(self, sequence=None, fasta_index=None, start_adjustment=Fa
             bsizes = np.flip(self.block_sizes)
             tStart, tEnd = self.block_sizes.sum() - tEnd, self.block_sizes.sum() - tStart
 
-        bstarts = np.concatenate([np.zeros(1, dtype=np.int_), bsizes[:-1].cumsum()])
+        bstarts = np.concatenate([np.zeros(1, dtype=np.int64), bsizes[:-1].cumsum()])
         # bstarts = [0]
         # for bs in bsizes[:-1]:
         #     bstarts.append(bs + bstarts[-1])

diff --git a/Mikado/preparation/annotation_parser.py b/Mikado/preparation/annotation_parser.py
@@ -76,7 +76,7 @@ def run(self):
         while True:
             results = self.submission_queue.get()
             try:
-                label, handle, strand_specific, is_reference, keep_redundant, shelf_name = results
+                label, handle, strand_specific, is_reference, exclude_redundant, shelf_name = results
             except ValueError as exc:
                 raise ValueError("{}.\tValues: {}".format(exc, ", ".join([str(_) for _ in results])))
             if handle == "EXIT":
@@ -100,7 +100,7 @@ def run(self):
                                             max_intron=self.max_intron,
                                             strip_cds=self.__strip_cds,
                                             is_reference=is_reference,
-                                            keep_redundant=keep_redundant,
+                                            exclude_redundant=exclude_redundant,
                                             strand_specific=strand_specific)
                 elif gff_handle.__annot_type__ == "gtf":
                     new_ids = load_from_gtf(shelf_name,
@@ -112,7 +112,7 @@ def run(self):
                                             max_intron=self.max_intron,
                                             is_reference=is_reference,
                                             strip_cds=self.__strip_cds,
-                                            keep_redundant=keep_redundant,
+                                            exclude_redundant=exclude_redundant,
                                             strand_specific=strand_specific)
                 elif gff_handle.__annot_type__ == "bed12":
                     new_ids = load_from_bed12(shelf_name,
@@ -124,7 +124,7 @@ def run(self):
                                               min_length=self.min_length,
                                               max_intron=self.max_intron,
                                               strip_cds=self.__strip_cds,
-                                              keep_redundant=keep_redundant,
+                                              exclude_redundant=exclude_redundant,
                                               strand_specific=strand_specific)
                 else:
                     raise ValueError("Invalid file type: {}".format(gff_handle.name))
@@ -397,7 +397,7 @@ def load_from_gff(shelf_name,
                   min_length=0,
                   max_intron=3*10**5,
                   is_reference=False,
-                  keep_redundant=False,
+                  exclude_redundant=False,
                   strip_cds=False,
                   strand_specific=False):
     """
@@ -420,6 +420,8 @@ def load_from_gff(shelf_name,
     :type strand_specific: bool
     :param is_reference: boolean. If set to True, the transcript will always be retained.
     :type is_reference: bool
+    :param exclude_redundant: boolean. If set to True, fully redundant transcripts will be removed.
+    :type exclude_redundant: bool
     :return:
     """
 
@@ -475,7 +477,7 @@ def load_from_gff(shelf_name,
 
             exon_lines[row.id]["strand_specific"] = strand_specific
             exon_lines[row.id]["is_reference"] = is_reference
-            exon_lines[row.id]["keep_redundant"] = keep_redundant
+            exon_lines[row.id]["exclude_redundant"] = exclude_redundant
             continue
         elif row.is_exon is True:
             if not row.is_cds or (row.is_cds is True and strip_cds is False):
@@ -520,7 +522,7 @@ def load_from_gff(shelf_name,
                         exon_lines[tid]["parent"] = transcript2genes[tid]
                         exon_lines[tid]["strand_specific"] = strand_specific
                         exon_lines[tid]["is_reference"] = is_reference
-                        exon_lines[tid]["keep_redundant"] = keep_redundant
+                        exon_lines[tid]["exclude_redundant"] = exclude_redundant
                     elif tid not in exon_lines and tid not in transcript2genes:
                         continue
                     else:
@@ -555,7 +557,7 @@ def load_from_gtf(shelf_name,
                   min_length=0,
                   max_intron=3*10**5,
                   is_reference=False,
-                  keep_redundant=False,
+                  exclude_redundant=False,
                   strip_cds=False,
                   strand_specific=False):
     """
@@ -578,6 +580,8 @@ def load_from_gtf(shelf_name,
     :type strand_specific: bool
     :param is_reference: boolean. If set to True, the transcript will always be retained.
     :type is_reference: bool
+    :param exclude_redundant: boolean. If set to True, the transcript will be marked for potential redundancy removal.
+    :type exclude_redundant: bool
     :return:
     """
 
@@ -618,7 +622,7 @@ def load_from_gtf(shelf_name,
             exon_lines[row.transcript]["parent"] = "{}.gene".format(row.id)
             exon_lines[row.transcript]["strand_specific"] = strand_specific
             exon_lines[row.transcript]["is_reference"] = is_reference
-            exon_lines[row.transcript]["keep_redundant"] = keep_redundant
+            exon_lines[row.transcript]["exclude_redundant"] = exclude_redundant
             if "exon_number" in exon_lines[row.transcript]["attributes"]:
                 del exon_lines[row.transcript]["attributes"]["exon_number"]
             continue
@@ -645,7 +649,7 @@ def load_from_gtf(shelf_name,
             exon_lines[row.transcript]["parent"] = "{}.gene".format(row.transcript)
             exon_lines[row.transcript]["strand_specific"] = strand_specific
             exon_lines[row.transcript]["is_reference"] = is_reference
-            exon_lines[row.transcript]["keep_redundant"] = keep_redundant
+            exon_lines[row.transcript]["exclude_redundant"] = exclude_redundant
         else:
             if row.transcript in to_ignore:
                 continue
@@ -677,7 +681,7 @@ def load_from_bed12(shelf_name,
                     min_length=0,
                     max_intron=3*10**5,
                     is_reference=False,
-                    keep_redundant=False,
+                    exclude_redundant=False,
                     strip_cds=False,
                     strand_specific=False):
     """
@@ -700,6 +704,8 @@ def load_from_bed12(shelf_name,
     :type strand_specific: bool
     :param is_reference: boolean. If set to True, the transcript will always be retained.
     :type is_reference: bool
+    :param exclude_redundant: boolean. If set to True, the transcript will be marked for potential redundancy removal.
+    :type exclude_redundant: bool
     :return:
     """
 
@@ -739,7 +745,7 @@ def load_from_bed12(shelf_name,
             exon_lines[transcript.id]["parent"] = "{}.gene".format(transcript.id)
             exon_lines[transcript.id]["strand_specific"] = strand_specific
             exon_lines[transcript.id]["is_reference"] = is_reference
-            exon_lines[transcript.id]["keep_redundant"] = keep_redundant
+            exon_lines[transcript.id]["exclude_redundant"] = exclude_redundant
             exon_lines[transcript.id]["features"]["exon"] = [
                 (exon[0], exon[1]) for exon in transcript.exons
             ]