In [24]:
from __future__ import print_function
from __future__ import division

import subprocess
import os
import sys
import pandas as pd
from itertools import groupby
import click
import os.path as op
import logging
import shutil
import gzip
import pysam
from Bio import SeqIO



# SAG functions
def _is_target_rrna(feature):
    '''Decide whether a feature is an SSU/LSU rRNA that should be masked.

    Only features typed "rRNA" or "RNA" are considered.  The 'gene' and
    'product' qualifiers are searched case-insensitively for the markers
    16S, 23S, "Subunit Ribosomal RNA" and "suRNA".
    '''
    if feature.type not in ("rRNA", "RNA"):
        return False
    # Keywords are stored upper-case because the qualifier text is upper-cased
    # before the comparison; mixed-case keywords could never match.
    keywords = ("16S", "23S", "SUBUNIT RIBOSOMAL RNA", "SURNA")
    for qualifier in ("gene", "product"):
        if qualifier in feature.qualifiers:
            text = str(feature.qualifiers[qualifier]).upper()
            if any(keyword in text for keyword in keywords):
                return True
    return False


def _mask_intervals(sequence, intervals):
    '''Return `sequence` with every half-open (start, end) interval set to 'N'.

    Intervals are 0-based and end-exclusive.  They may be unsorted or
    overlapping; the returned string always has the same length as the input.
    '''
    pieces = []
    cursor = 0  # index of the first base not yet written out
    for start, end in sorted(intervals):
        start = max(start, cursor)        # clip overlap with a previous interval
        end = min(end, len(sequence))     # never run past the contig end
        if end <= start:
            continue
        pieces.append(sequence[cursor:start])
        pieces.append("N" * (end - start))
        cursor = end
    pieces.append(sequence[cursor:])
    return "".join(pieces)


def mask_sag(input_gb, out_fasta):
    '''output masked contigs with SSU/LSU rRNA genes changed to 'N's

    Every record is written to the output, 80 bases per line; records without
    a matching rRNA feature are written unmasked.  Masked contigs keep their
    original length, so coordinates stay valid.

    Args:
        input_gb (str): path to annotated input genbank formatted genome
        out_fasta (str): where to write the output fasta to
    Returns:
        out_fasta (str): path of the fasta file that was written
    '''
    rrna_count = 0
    # plain "r": the "U" flag was deprecated and is rejected by Python >= 3.11
    with open(input_gb, "r") as input_handle, open(out_fasta, "w") as oh:
        for r in SeqIO.parse(input_handle, "genbank"):
            print(">", r.name, sep="", file=oh)

            intervals = []
            for f in r.features:
                if _is_target_rrna(f):
                    # feature locations are 0-based, end-exclusive: usable
                    # directly as Python slice bounds
                    intervals.append((int(f.location.start), int(f.location.end)))
                    print('rRNA gene found on contig %s' % r.name)
                    rrna_count += 1

            masked = _mask_intervals(str(r.seq), intervals)
            for i in range(0, len(masked), 80):
                print(masked[i:i+80], file=oh)
    print('%s rRNA genes found in %s' % (rrna_count, op.basename(input_gb)))
    return out_fasta

In [4]:
# Create the output directory only when it is missing so the cell can be re-run.
if not os.path.isdir("../data/test_outfasta"):
    os.mkdir("../data/test_outfasta")

In [5]:
# Mask the JGI-annotated SAG; the returned output path is displayed by the cell.
mask_sag(
    input_gb="../data/gbks/AAA288-L16_JGI.gbk",
    out_fasta="../data/test_outfasta/AAA288-L16_JGI.fasta",
)

rRNA gene found on contig A288L16DRAFT_2505696051.7
rRNA gene found on contig A288L16DRAFT_2505696051.7
2 rRNA genes found in AAA288-L16_JGI.gbk




'../data/test_outfasta/AAA288-L16_JGI.fasta'

In [26]:
# Mask the RAST-annotated SAG; the returned output path is displayed by the cell.
mask_sag(
    input_gb="../data/gbks/AB-679-I05_RAST.gbk",
    out_fasta="../data/test_outfasta/AB-679-I05_RAST.fasta",
)

rRNA gene found on contig AB-679-I05_NODE_5_length_29376_cov_49.753_ID_9
rRNA gene found on contig AB-679-I05_NODE_5_length_29376_cov_49.753_ID_9
2 rRNA genes found in AB-679-I05_RAST.gbk


'../data/test_outfasta/AB-679-I05_RAST.fasta'

There should be two rRNA genes (one for the SSU and one for the LSU), plus one 5S rRNA gene...

In [27]:
# Mask the GenBank reference genome; the returned output path is displayed by the cell.
mask_sag(
    input_gb="../data/gbks/Nitrospina_gracilis_Genbank.gbff",
    out_fasta="../data/test_outfasta/Nitrospina_gracilis_Genbank.fasta",
)

rRNA gene found on contig NZ_HG422173
rRNA gene found on contig NZ_HG422173
2 rRNA genes found in Nitrospina_gracilis_Genbank.gbff


'../data/test_outfasta/Nitrospina_gracilis_Genbank.fasta'

In [23]:
# Exploratory scan: count the rRNA/RNA features of one genbank file that are
# recognised as SSU/LSU rRNA, printing the 'product' qualifier for hits that
# were recognised through it.
input_gb = "../data/gbks/AB-679-I05_RAST.gbk"
# Upper-case on purpose: the 'gene' qualifier text is upper-cased before the
# comparison, so mixed-case keywords could never match.
rrna_keywords = ("16S", "23S", "SUBUNIT RIBOSOMAL RNA", "SURNA")
# plain "r": the "U" flag was deprecated and is rejected by Python >= 3.11
with open(input_gb, "r") as input_handle:
    rrna_count = 0
    for r in SeqIO.parse(input_handle, "genbank"):
        for f in r.features:
            if f.type == "rRNA" or f.type == "RNA":
                if ('gene' in f.qualifiers and
                        any(keyword in str(f.qualifiers['gene']).upper()
                            for keyword in rrna_keywords)):
                    print('rRNA gene found on contig %s' % r.name)
                    rrna_count += 1
                elif 'product' in f.qualifiers and 'Subunit Ribosomal RNA' in str(f.qualifiers['product']):
                    # show the raw qualifier to see how this annotation names the gene
                    print(str(f.qualifiers['product']))
                    print('rRNA gene found on contig %s' % r.name)
                    rrna_count += 1
    print(rrna_count)

['Small Subunit Ribosomal RNA; ssuRNA; SSU rRNA']
rRNA gene found on contig AB-679-I05_NODE_5_length_29376_cov_49.753_ID_9
['Large Subunit Ribosomal RNA; lsuRNA; LSU rRNA']
rRNA gene found on contig AB-679-I05_NODE_5_length_29376_cov_49.753_ID_9
2
