In [1]:
import sys

# Local checkout locations of the emeraldbgc package and its modules.
# NOTE(review): these are absolute, machine-specific paths; adjust them (or
# install the package) before running this notebook on another machine.
_EMERALD_PATHS = [
    '/Users/fragoso/modules/',
    '/Users/fragoso/modules/emeraldbgc',
    '/Users/fragoso/modules/emeraldbgc/modules',
]

# Only add the entries that are still missing, so that re-running this cell
# does not keep growing sys.path with duplicates.
sys.path.extend([p for p in _EMERALD_PATHS if p not in sys.path])

In [2]:


import json
import logging
import os
import re
import pickle
from itertools import groupby

import numpy as np
import tensorflow as tf
from joblib import load

# from emeraldbgc import __version__, _params
# Inline copy of the package-level parameter table used by the classes below.
_params = {
    # defineLooseClusters: CDS probabilities below this value are set to 0 in
    # the loose cluster mask, all others to 1.
    "thBase": 0.8733,
    # defineLooseClusters: same thresholding, used for the "border" mask.
    "thBorder": 0.98,
    # rmLessThan: runs of 1s with length <= rmless are removed from the mask.
    "rmless": 1,
    # fillGap: runs of 0s with length <= fill are turned into 1s.
    "fill": 3,
    # predictType: greed level (as a string) -> minimum class probability a
    # cluster needs in order to be kept; used when no explicit score is given.
    "greed": {"0": 0.58, "1": 0.41, "2": 0.345, "3": 0},
    # buildMatrices / predictAnn: window length; matrices are padded to a
    # multiple of it and sequences are reshaped into rows of this length.
    "shape": 200,
    # Preprocess.runInterproscan: analyses passed to interproscan.sh via -appl.
    "ip_an": ["Pfam", "TIGRFAM", "PRINTS", "ProSitePatterns", "Gene3D"]
}

# Logger shared by every class defined in this notebook.
log = logging.getLogger(f"EMERALD.{__name__}")


class AnnotationFilesToEmerald:
    """Transform external tool's files into EMERALD STR"""

    def __init__(self):

        self.entriesDct = {}
        self.contigsDct = {}
        self.annDct = {}
        self.typeDct = {}
        self.annResults = {}
        post_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "..",
            "models",
            "post_filters.pickle",
        )
        with open(post_file, "rb") as h:
            self.vocab, self.typeModel, self.mbdoms = pickle.load(h)

    def transformIPS(self, ipsFile):

        if not os.path.isfile(ipsFile):
            log.exception(f"{ipsFile} file not found")

        with open(ipsFile, "r") as h:

            lines = h.readlines()
            fmt = "gff" if lines[0][:5] == "##gff" else "tsv"

            for l in lines:
        
                if fmt == "tsv":

                    spl = l.split("\t")
                    self.entriesDct.setdefault(spl[0], []).append(
                        spl[-2] if spl[-2] != '-' else spl[4]
                    )
                if fmt == "gff":
                    spl = l.split("\t")
                    if len(spl) > 3 and spl[2]=='protein_match':
                        self.entriesDct.setdefault(spl[0],[]).append(
                                re.split("InterPro:|\"",spl[-1])[-2] if 'InterPro' in spl[-1]     else re.split("Name=|;",spl[-1])[-2]
                        )

    def transformEmeraldHmm(self, hmmFile):

        log.info(f"processing {hmmFile}")
        if not os.path.isfile(hmmFile):
            log.exception(f"{hmmFile} file not found")

        with open(hmmFile, "r") as h:

            for l in h:

                if l[0] == "#":
                    continue

                spl = l.split()
                self.entriesDct.setdefault(spl[3], []).append(spl[0])

    def transformCDSpredToCDScontigs(self, cdsPredFile, f):

        if not os.path.isfile(cdsPredFile):
            log.exception(f"{cdsPredFile} file not found")

        with open(cdsPredFile, "r") as h:

            if f == "fna":

                for l in h:

                    if l[0] != ">":
                        continue

                    spl = l.split()
                    start, end = int(spl[2]), int(spl[4])

                    self.contigsDct.setdefault(
                        "_".join(spl[0].split("_")[:-1])[1:], []
                    ).append((spl[0][1:], (start, end)))

            elif f == "gbk":

                from Bio import SeqIO

                recs = list(SeqIO.parse(open(cdsPredFile, "r"), "gb"))

                for rec in recs:
                    for f in rec.features:
                        if f.type == "CDS":


                            start, end = int(f.location.start) + 1, int(f.location.end)

                            self.contigsDct.setdefault(rec.id, []).append(
                                (
                                    f.qualifiers["protein_id"][0]
                                    if "protein_id" in f.qualifiers
                                    else f.qualifiers["locus_tag"][0],
                                    (start, end),
                                )
                            )

    def buildMatrices(self):

        for contig in self.contigsDct:

            cdss = self.contigsDct[contig]

            samps = (
                len(cdss)
                if not _params["shape"]
                else ((len(cdss) // _params["shape"]) + 1) * _params["shape"]
            )

            self.annDct[contig] = np.zeros((samps, len(self.vocab)))

            for ix, cds in enumerate(cdss):

                name = cds[0]
                if name not in self.entriesDct:
                    continue

                modiAnn = [
                    self.vocab[x] for x in self.entriesDct[name] if x in self.vocab
                ]
                self.annDct[contig][ix][modiAnn] = 1


    def predictAnn(self, colapseFunc=max):
        """Score every CDS of every contig with the Keras model.

        The model is loaded from ``../models/emerald.h5`` relative to the
        directory of ``__file__``. For each matrix in ``self.annDct`` the
        per-CDS result is stored in ``self.annResults[contig]``.

        Args:
            colapseFunc: callable applied to the list of predictions that
                belong to the same matrix row (see ``partialReStrMat``);
                defaults to ``max``.
        """

        log.info("Predict BGC probability w/ TensorFlow")
        model_BGC_file = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), "..", "models", "emerald.h5"
        )

        # NOTE(review): "robustLoss" is mapped to an empty dict, presumably only
        # so that deserialising the saved model does not fail on the custom
        # loss name -- confirm this is sufficient for inference.
        model = tf.keras.models.load_model(
            model_BGC_file, custom_objects={"robustLoss": {}}
        )

        for contig in self.annDct:
            mat = self.annDct[contig]
            # xva_: flat sequence of vocabulary indices, vaIx: matrix row of each
            xva_, vaIx = self.transformMat(mat)
            # vaS: length of the part of the sequence that fills whole windows
            vaS = (xva_.shape[0] // _params["shape"]) * _params["shape"]
            xva1 = xva_[:vaS].reshape(vaS // _params["shape"], _params["shape"])
            # Append the remainder as one extra window, right-padded with 0.
            # NOTE(review): 0 doubles as the padding value; presumably index 0
            # is not a real vocabulary entry -- confirm.
            xva = np.append(
                xva1,
                [list(xva_[vaS:]) + [0] * (_params["shape"] - (vaIx.shape[0] - vaS))],
                axis=0,
            )
            # self.predict_ is overwritten on every iteration, so after the loop
            # it only holds the raw prediction of the last contig.
            self.predict_ = model.predict(xva)
            # assumes the model returns exactly one value per input position
            predict = self.predict_.reshape(vaS + _params["shape"])
            # collapse the per-position values to one value per matrix row ...
            predict_str_, nix = self.partialReStrMat(vaIx, predict, func=colapseFunc)
            # ... and spread them over all rows of the original matrix
            self.annResults[contig] = self.projectRes(predict_str_, nix, mat.shape[0])

    def transformMat(self, mat):

        scuash = np.array(list(map(lambda x: np.where(x == 1)[0], mat)))
        nli, nix = [], []
        for ix, x in enumerate(scuash):
            if x.shape[0] == 0:
                continue
            nli.extend(x)
            nix.extend([ix] * x.shape[0])
        sups = [
            (k, list(zip(*g))[1][0])
            for k, g in groupby(zip(nli, nix), key=lambda x: x[0])
        ]
        if len(nli) == 0:
            return np.array([0]), np.array([0])
        a, b = list(zip(*sups))
        a, b = np.array(a), np.array(b)
        return a, b

    def projectRes(self, res, ixes, lenOri):

        nres_ = np.zeros(lenOri)
        nres_[ixes] = res
        prev = 0
        rprev = 0
        for ix, r in zip(ixes, res):
            if ix - 1 == prev:
                prev = ix
                rprev = r
                continue
            for ixx, s in list(enumerate(np.linspace(rprev, r, ix - prev + 1)))[1:-1]:
                nres_[prev + ixx] = s
            prev = ix

        return nres_

    def partialReStrMat(self, Ix, res, func):

        nres, nix = [], []
        for k, g in groupby(zip(res, Ix), key=lambda x: x[1]):

            gg = list(list(zip(*g))[0])
            nix.append(k)
            nres.append(func(gg))
        return np.array(nres), np.array(nix)

    def defineLooseClusters(self):

        log.info("Define Clusters")
        self.bridged, self.looseClst, self.borderClst, self.typesClst = {}, {}, {}, {}

        for contig in self.annResults:
            self.looseClst[contig] = np.array(
                self.rmLessThan(
                    self.fillGap(
                        np.where(self.annResults[contig] < _params["thBase"], 0, 1),
                        _params["fill"],
                    ),
                    _params["rmless"],
                )
            )
            self.borderClst[contig] = np.where(
                self.annResults[contig] < _params["thBorder"], 0, 1
            )
            self.typesClst[contig] = np.empty(
                len(self.annResults[contig]), dtype=object
            )

    def rmLessThan(self, contig, n):
        newContig = []
        for k, g in groupby(contig):
            if k == 0:
                newContig.extend(list(g))
            else:
                p = list(g)
                if len(p) <= n:
                    newContig.extend([0 for x in p])
                else:
                    newContig.extend(p)
        return newContig

    def fillGap(self, contig, gap):
        newContig = []
        for k, g in groupby(contig):
            gg = list(g)
            if k == 0 and len(gg) <= gap:
                newContig.extend([1 for x in gg])
            else:
                newContig.extend(gg)
        return newContig

    def predictType(self, score=None, g=None):

        log.info("Predict BGC classes")

        score = _params["greed"][str(g)] if score == None else score
        log.info(f"Positive class model threshold: {score}")
        locations, matrix, typeLi = [], [], []
        claa = [
            "Alkaloid",
            "NRP",
            "Polyketide",
            "RiPP",
            "Saccharide",
            "Terpene",
            "Other",
        ]

        for contig in self.contigsDct:
            for k, g in groupby(enumerate(self.looseClst[contig]), key=lambda x: x[1]):
                if k == 0:
                    continue
                gg = list(list(zip(*g))[0])
                tmat_ = np.sum(self.annDct[contig][gg], axis=0)
                tmat = np.where(tmat_ > 0, 1, 0)
                locations.append((contig, gg))
                matrix.append(tmat)
        if len(matrix) > 0:
            pred = np.empty((len(matrix), len(claa)))
            for nc, cla in enumerate(claa):
                pred[:, nc] = self.typeModel[cla].predict_proba(matrix)[:, 1]
        else:
            pred = []

        for ix, p in enumerate(pred):

            contig, gg = locations[ix]

            if not (np.max(p[[0, 1, 2, 3, 4, 5, 6]]) >= score):
                self.looseClst[contig][gg] = 0
                self.borderClst[contig][gg] = 0

            else:
                nearest_ = self.near_classifer(set(np.where(matrix[ix] == 1)[0]))
                nearest = "nearest_MiBIG={};nearest_MiBIG_class={};nearest_MiBIG_jaccardDistance={:.3f}".format(
                    nearest_[0], nearest_[1], nearest_[2]
                )
                self.typesClst[contig][gg] = nearest

    def near_classifer(self, doms):
        mb_ord = [(b, c, self.jacardDistance(doms, mbd)) for b, c, mbd in self.mbdoms]
        nearest = sorted(mb_ord, key=lambda x: x[2])[0]
        return nearest

    def jacardDistance(self, a, b):
        try:
            return 1 - ((len(set(a) & set(b))) / len(set(a) | set(b)))
        except:
            return 1

In [3]:

import logging
import os
import sys
import subprocess

# from emeraldbgc import _params
# log = logging.getLogger(f"EMERALD.{__name__}")

from distutils.spawn import find_executable


class Preprocess:
    """External tools needed for emerald bgc detection"""

    def __init__(self, seq_file, ip_file, meta, cpus, outdir):

        self.seq_file = seq_file
        self.ip_file = ip_file
        self.meta = meta
        self.cpus = int(cpus)
        self.outdir = outdir if outdir else "temp"

    def runProdigal(self):
        """Predict genes with Prodigal and return the protein FASTA path.

        Prodigal is run on ``self.seq_file`` writing proteins to
        ``<outdir>/<seq_file basename>.prodigal.faa``; ``-p meta`` is added
        when ``self.meta == "True"``. If Prodigal reports that the sequence is
        too short for normal mode, it is run again with ``-m -p meta``.
        Finally every ``*`` is removed from the protein file.

        Returns:
            Absolute path of the protein FASTA file.

        Raises:
            FileNotFoundError: if ``prodigal`` is not on ``PATH``, if
                ``self.seq_file`` does not exist, or if Prodigal did not
                produce the protein file.
        """
        log.info("Prodigal gene prediction...")

        # log.exception is only meaningful inside an ``except`` block, and
        # carrying on would fail a few lines later anyway: fail here instead.
        if not find_executable("prodigal"):
            log.error("Prodigal is not installed or in PATH")
            raise FileNotFoundError("Prodigal is not installed or in PATH")

        if not os.path.isfile(self.seq_file):
            log.error(f"{self.seq_file} file not found")
            raise FileNotFoundError(f"{self.seq_file} file not found")

        # the default output directory ("temp") is not guaranteed to exist
        os.makedirs(self.outdir, exist_ok=True)
        outFaa = os.path.join(
            self.outdir, "{}.prodigal.faa".format(os.path.basename(self.seq_file))
        )
        base_cmd = ["prodigal", "-i", self.seq_file, "-a", outFaa]
        cmd = base_cmd + (["-p", "meta"] if self.meta == "True" else [])

        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        errs = proc.stderr.decode("utf8", errors="replace")

        if "Error:  Sequence must be 20000 characters" in errs:

            log.info(
                "Sequence must be 20000 characters when running Prodigal in normal mode. Trying -p meta"
            )

            # NOTE(review): the retry also passes "-m", which the first
            # attempt does not -- confirm this difference is intended.
            cmd = base_cmd + ["-m", "-p", "meta"]
            proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            errs = proc.stderr.decode("utf8", errors="replace")

        log.info(errs)
        if proc.returncode != 0:
            log.error(f"prodigal exited with status {proc.returncode}")

        # Remove asterisks from the protein FASTA
        log.info("Removing asterix from prodigal faa")
        with open(outFaa, "r") as h:
            noAstFaa = [x.replace("*", "") for x in h]
        with open(outFaa, "w") as h:
            h.writelines(noAstFaa)

        return os.path.abspath(outFaa)

    def gbkToProdigal(self):
        """Write the translated CDSs of a GenBank file as a protein FASTA.

        Every ``CDS`` feature that has a ``translation`` qualifier is written
        to ``<outdir>/<seq_file basename>.prodigal.faa``, identified by its
        ``protein_id`` or, failing that, its ``locus_tag``. Records without
        any such feature are reported in the log.

        Returns:
            Absolute path of the protein FASTA file.

        Raises:
            KeyError: if a translated CDS has neither ``protein_id`` nor
                ``locus_tag``.
        """
        log.info("write gbk as faa")
        from Bio import SeqIO

        # read everything first, closing the input handle deterministically
        with open(self.seq_file, "r") as gbk:
            recs = list(SeqIO.parse(gbk, "gb"))

        # the default output directory ("temp") is not guaranteed to exist
        os.makedirs(self.outdir, exist_ok=True)
        outFaa = os.path.join(
            self.outdir, "{}.prodigal.faa".format(os.path.basename(self.seq_file))
        )

        with open(outFaa, "w") as h:

            for rec in recs:
                ct = 0
                for f in rec.features:

                    if f.type != "CDS" or "translation" not in f.qualifiers:
                        continue

                    ct += 1
                    pid = (
                        f.qualifiers["protein_id"][0]
                        if "protein_id" in f.qualifiers
                        else f.qualifiers["locus_tag"][0]
                    )
                    seq = f.qualifiers["translation"][0]
                    h.write(f">{pid}\n{seq}\n")

                if ct == 0:
                    log.info("{} CDS found with translation in {}".format(ct, rec.name))

        return os.path.abspath(outFaa)

    def check_fmt(self):
        """ Evaluate if input format is FNA or GBK"""
        with open(self.seq_file) as h:
            if h.readline()[0] == ">":
                self.fmt = "fna"
                log.info("FASTA sequence file detected")
            else:
                self.fmt = "gbk"
                log.info(f"FASTA sequence file NOT detected; trying GBK")

    def process_sequence(self):
        """ CDS prediction on sequence file"""
        if 'linux' not in sys.platform and self.ip_file == None:
            log.info("internal run of InterProScan only available for Linux OS. Make sure to use --ip-file option")
            raise ValueError('Non Linux OS must be run with --ip-file option')

        self.check_fmt()
        if self.fmt == "fna":
            self.outFaa = self.runProdigal()
        elif self.fmt == "gbk":
            self.outFaa = self.gbkToProdigal()
        else:
            log.info("missing sequence file format")

        ih_f = self.runHmmScan()
        ip_f = self.runInterproscan() if self.ip_file == None else self.ip_file
        return self.outFaa, ip_f, ih_f

    def runHmmScan(self):
        """Annotate ``self.outFaa`` with the EMERALD HMM library using hmmscan.

        The library is ``../models/hmm_lib/emerald.hmm`` relative to the
        directory of ``__file__``. hmmscan is run with ``--cut_ga`` and writes
        its ``--domtblout`` table to
        ``<outdir>/<outFaa basename>.emerald.tsv``; ``--cpu`` is added when
        ``self.cpus`` is non-zero. A non-zero exit status is logged.

        Returns:
            Absolute path of the domain table.

        Raises:
            FileNotFoundError: if ``hmmscan`` is not on ``PATH`` or if the
                protein file or the HMM library is missing.
        """
        log.info("emerald functional annotation...")
        hmmLib = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            "..",
            "models",
            "hmm_lib",
            "emerald.hmm",
        )

        # log.exception is only meaningful inside an ``except`` block, and
        # carrying on only postpones the failure: stop here instead.
        if not find_executable("hmmscan"):
            log.error("hmmscan is not installed or in PATH")
            raise FileNotFoundError("hmmscan is not installed or in PATH")

        for required in (self.outFaa, hmmLib):
            if not os.path.isfile(required):
                log.error(f"{required} file not found")
                raise FileNotFoundError(f"{required} file not found")

        outTsv = os.path.join(
            self.outdir, "{}.emerald.tsv".format(os.path.basename(self.outFaa))
        )

        cmd = (
            ["hmmscan", "--domtblout", outTsv, "--cut_ga"]
            + (["--cpu", str(self.cpus)] if self.cpus else [])
            + ([hmmLib, self.outFaa])
        )
        log.info(" ".join(cmd))

        # Neither Popen.communicate nor subprocess.run (without check=True)
        # raises CalledProcessError, so the exit status is inspected explicitly.
        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        log.info(proc.stderr.decode("utf8", errors="replace"))
        if proc.returncode != 0:
            log.error(f"hmmscan exited with status {proc.returncode}")

        return os.path.abspath(outTsv)

    def runInterproscan(self):
        """Annotate ``self.outFaa`` with InterProScan.

        ``interproscan.sh`` is run with TSV output written to
        ``<outdir>/<outFaa basename>.ip.tsv``. The analyses listed in
        ``_params["ip_an"]`` are selected with ``-appl`` and ``-cpu`` is added
        when ``self.cpus`` is non-zero. A non-zero exit status is logged.

        Returns:
            Absolute path of the InterProScan TSV file.

        Raises:
            FileNotFoundError: if ``interproscan.sh`` is not on ``PATH`` or if
                the protein file is missing.
        """
        log.info("InterProScan")

        # log.exception is only meaningful inside an ``except`` block, and
        # carrying on only postpones the failure: stop here instead.
        if not find_executable('interproscan.sh'):
            log.error("interproscan.sh not found, only available for Linux OS")
            raise FileNotFoundError(
                "interproscan.sh not found, only available for Linux OS"
            )

        if not os.path.isfile(self.outFaa):
            # the message used to print the literal text "self.outFaa"
            log.error(f"{self.outFaa} file not found")
            raise FileNotFoundError(f"{self.outFaa} file not found")

        outTsv = os.path.join(
            self.outdir, "{}.ip.tsv".format(os.path.basename(self.outFaa))
        )
        cmd = (
            ["interproscan.sh", "-i", self.outFaa, "-o", outTsv, "-f", "TSV"]
            + (["-appl", ",".join(_params["ip_an"])] if _params["ip_an"] else [])
            + (["-cpu", str(self.cpus)] if self.cpus else [])
        )
        log.info(" ".join(cmd))

        proc = subprocess.run(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        log.info(proc.stdout.decode("utf8", errors="replace"))
        log.info(proc.stderr.decode("utf8", errors="replace"))
        if proc.returncode != 0:
            log.error(f"interproscan.sh exited with status {proc.returncode}")

        return os.path.abspath(outTsv)

In [4]:

import json
import logging
import os
import pickle
from itertools import groupby

import numpy as np
import tensorflow as tf
from joblib import load

from emeraldbgc import __version__

log = logging.getLogger(f"EMERALD.{__name__}")


class Outputs:

    """ write results in file """

    def __init__(self, obj, minimal_output, refined_borders, outfile):

        self.annotation = obj
        self.minimal = minimal_output
        self.ref_b = refined_borders
        self.outfile = outfile
        self.gff3 = []
        self.clusts = []
        self.writeGff3()

    def writeGff3(self):

        log.info("build GFF3")
        log.info(self.outfile)
        type_code = {
            0: "Alkaloid",
            1: "NRP",
            2: "Polyketide",
            3: "RiPP",
            4: "Saccharide",
            5: "Terpene",
            6: "Other",
        }

        for contig in self.annotation.looseClst:

            if self.minimal != "True":
                for ix, f in enumerate(self.annotation.contigsDct[contig]):

                    ID, (start, end) = f
                    emrldProb = "{:.3f}".format(self.annotation.annResults[contig][ix])
                    self.gff3.append(
                        f"{contig}\tEMERALDv{__version__}\tCDS\t{start}\t{end}\t.\t.\t.\tID={ID};emerald_probability={emrldProb}"
                    )

            ct = 1
            for k, g in groupby(
                enumerate(
                    self.annotation.looseClst[contig][
                        : len(self.annotation.contigsDct[contig])
                    ]
                ),
                key=lambda x: x[1],
            ):

                if k == 0:
                    continue

                ID = f"{contig}_emrld_{ct}"
                ct += 1

                gg = list(list(zip(*g))[0])

                start, end = (
                    self.annotation.contigsDct[contig][gg[0]][1][0],
                    self.annotation.contigsDct[contig][gg[-1]][1][1],
                )

                edge = "{}{}".format(
                    1 if gg[0] == 0 else 0,
                    1 if gg[-1] == len(self.annotation.contigsDct[contig]) - 1 else 0,
                )

                typs = self.annotation.typesClst[contig][gg[0]]
                # typs = "nearest_MiBIG={};nearest_MiBIG_class={};nearest_MiBIG_jacardDistance={}".format(typs[0],typs[1],typs[2])
                self.gff3.append(
                    f"{contig}\tEMERALDv{__version__}\tCLUSTER\t{start}\t{end}\t.\t.\t.\tID={ID};{typs};partial={edge}"
                )

                if self.ref_b == "True":

                    ct2 = 1
                    for k2, g2 in groupby(
                        zip(gg, self.annotation.borderClst[contig][gg]),
                        key=lambda x: x[1],
                    ):
                        if k2 == 0:
                            continue
                        gg2 = list(list(zip(*g2))[0])
                        start, end = (
                            self.annotation.contigsDct[contig][gg2[0]][1][0],
                            self.annotation.contigsDct[contig][gg2[-1]][1][1],
                        )
                        edge = "{}{}".format(
                            1 if gg[0] == 0 else 0,
                            1
                            if gg[-1] == len(self.annotation.contigsDct[contig]) - 1
                            else 0,
                        )

                        self.gff3.append(
                            f"{contig}\tEMERALDv{__version__}\tCLUSTER_border\t{start}\t{end}\t.\t.\t.\tID={ID}_{ct2};{typs};partial={edge}"
                        )
                        ct2 += 1

        log.info(f"Writing output to file {self.outfile}")
        with open(self.outfile, "w") as h:

            h.write("##gff-version 3\n")

            key = lambda x: (
                x.split("\t")[0],
                int(x.split("\t")[3]),
                "Z" if x.split("\t")[2] == "CDS" else x.split("\t")[2],
            )
            for l in sorted(self.gff3, key=key):

                h.write(f"{l}\n")

In [5]:
# Display the module search path to check the entries added in the first cell.
sys.path

['/Users/fragoso/modules/emeraldbgc/emeraldbgc/modules',
 '/Users/fragoso/miniconda3/lib/python38.zip',
 '/Users/fragoso/miniconda3/lib/python3.8',
 '/Users/fragoso/miniconda3/lib/python3.8/lib-dynload',
 '',
 '/Users/fragoso/miniconda3/lib/python3.8/site-packages',
 '/Users/fragoso/miniconda3/lib/python3.8/site-packages/IPython/extensions',
 '/Users/fragoso/.ipython',
 '/Users/fragoso/modules/',
 '/Users/fragoso/modules/emeraldbgc',
 '/Users/fragoso/modules/emeraldbgc/modules']

In [6]:
cd /Users/fragoso/modules/emeraldbgc

/Users/fragoso/modules/emeraldbgc


In [7]:
import argparse
import glob
import logging
import os
import sys

# from emeraldbgc import __version__

# sys.path.append(os.path.dirname(os.path.abspath('.')))

# from modules.BGCdetection import AnnotationFilesToEmerald
# from modules.Preproc import Preprocess
# from modules.WriteOutput import Outputs

In [11]:
# Stand-in for the command line: the optional --ip-file followed by the
# positional SEQUENCE_FILE, as understood by the argument parser built below.
# NOTE(review): absolute, machine-specific paths -- adjust before re-running
# this notebook elsewhere.
args = [
    '--ip-file',
    '/Users/fragoso/Downloads/ws/bench/files/comp/CM000950.fna.prodigal.faa.ip.tsv',
    '/Users/fragoso/Downloads/ws/bench/files/comp/CM000950.gb'
]
# The classes above locate their model files relative to __file__ (they join
# its directory with "..", "models"). A notebook has no __file__, so point it
# at a dummy file inside the package's modules directory.
__file__ = '/Users/fragoso/modules/emeraldbgc/emeraldbgc/modules/dax.txt'

In [12]:
# def main(args=None):

# Command line interface of EMERALD. Boolean switches are passed as the
# strings "True" / "False" and compared as strings by the classes above.
parser = argparse.ArgumentParser(description="EMERALD. SMBGC detection tool")
parser.add_argument(
    "seq_file",
    type=str,
    help="input nucleotide sequence file. FASTA or GBK. mandatory",
    metavar="SEQUENCE_FILE",
)
parser.add_argument(
    "-v",
    "--version",
    action="version",
    help="Show the version number and exit.",
    version=f"EMERALD {__version__}",
)
parser.add_argument(
    "--ip-file",
    dest="ip_file",
    default=None,
    type=str,
    # both formats are accepted by AnnotationFilesToEmerald.transformIPS
    help="Optional, preprocessed InterProScan output file (TSV or GFF3). Its protein ids must match the CDS ids of SEQUENCE_FILE (for a GBK: \"protein_id\", or \"locus_tag\" when there is none). Mandatory on non-Linux systems",
    metavar="FILE",
)
parser.add_argument(
    "--greed",
    dest="greed",
    default=2,
    type=int,
    # reject unknown levels here instead of failing later with a KeyError
    choices=[0, 1, 2, 3],
    help="Level of greediness. 0,1,2,3 [default 2]",
    metavar="INT",
)
parser.add_argument(
    "--score",
    dest="score",
    default=None,
    type=float,
    help="validation filter threshold. overrides --greed",
    metavar="FLOAT",
)
parser.add_argument(
    "--meta",
    dest="meta",
    default="True",
    type=str,
    help="prodigal option meta [default True]",
    metavar="True|False",
)
parser.add_argument(
    "--outdir",
    default=os.getcwd(),
    dest="outdir",
    type=str,
    help="parent directory of the results; they are written to DIRECTORY/SEQUENCE_FILE.emerald [default $PWD]",
    metavar="DIRECTORY",
)
parser.add_argument(
    "--outfile",
    dest="outfile",
    type=str,
    help="output file [default OUTDIR/SEQUENCE_FILE.emerald/SEQUENCE_FILE.emerald.full.gff]",
    metavar="FILE",
)
parser.add_argument(
    "--minimal",
    dest="minimal_out",
    default="True",
    type=str,
    help="minimal output in a gff3 file [default True]",
    metavar="True|False",
)
parser.add_argument(
    "--refined",
    dest="ref_b",
    default="False",
    type=str,
    help="annotate high probability borders [default False]",
    metavar="True|False",
)
parser.add_argument(
    "--cpu",
    dest="cpu",
    default=1,
    type=int,
    help="cpus for INTERPROSCAN and HMMSCAN",
    metavar="INT",
)

# Parse into a new name: rebinding ``args`` would replace the argument list
# with a Namespace and make this cell fail when it is run a second time.
cli_args = parser.parse_args(args)

basef = cli_args.seq_file
base = os.path.basename(basef)

# every run gets its own "<SEQUENCE_FILE>.emerald" directory inside --outdir
outdir = os.path.join(cli_args.outdir, f"{base}.emerald")

os.makedirs(outdir, exist_ok=True)

logging.basicConfig(
    filename=f"{outdir}/emerald.log",
    level=logging.DEBUG,
    format="%(asctime)s %(levelname)s %(name)s %(message)s",
)
print(f"LOG_FILE: {outdir}/emerald.log")
log = logging.getLogger("EMERALD")
log.info(
    f"""
******
EMERALD v{__version__}
******"""
)
log.info(f"outdir: {outdir}")

log.info("preprocessing files")
preprocess = Preprocess(
    os.path.abspath(cli_args.seq_file),
    cli_args.ip_file,
    cli_args.meta,
    cli_args.cpu,
    outdir,
)

prodigal_file, ips_file, hmm_file = preprocess.process_sequence()

log.info("EMERALD process")
annotate = AnnotationFilesToEmerald()

log.info("transform interpro file")
annotate.transformIPS(ips_file)

log.info("transform inhouse hmm file")
annotate.transformEmeraldHmm(hmm_file)

log.info("transform proteins file")
# FASTA input: CDS coordinates come from the Prodigal protein file;
# GenBank input: they are read from the sequence file itself.
annotate.transformCDSpredToCDScontigs(
    prodigal_file if preprocess.fmt == "fna" else cli_args.seq_file,
    preprocess.fmt,
)

log.info("transform dicts to np matrices")
annotate.buildMatrices()

log.info("predict bgc regions")
annotate.predictAnn()

log.info("define clusters")
annotate.defineLooseClusters()

log.info("post-processing filters and type classification")
annotate.predictType(score=cli_args.score, g=cli_args.greed)

log.info("write output file file")
# constructing Outputs writes the GFF3 file
outp = Outputs(
    annotate,
    cli_args.minimal_out,
    cli_args.ref_b,
    cli_args.outfile if cli_args.outfile else f"{outdir}/{base}.emerald.full.gff",
)

log.info("EMERALD succesful")
print("EMERALD succesful")


# if __name__ == "__main__":
#     main(sys.argv[1:])

LOG_FILE: /Users/fragoso/modules/emeraldbgc/CM000950.gb.emerald/emerald.log


  scuash = np.array(list(map(lambda x: np.where(x == 1)[0], mat)))


EMERALD succesful


In [18]:
# Path of the GFF3 file written by the pipeline cell when --outfile is not given.
f"{outdir}/{base}.emerald.full.gff"

'/Users/fragoso/modules/emeraldbgc/CM000950.gb.emerald/CM000950.gb.emerald.full.gff'

In [19]:
!grep "TER\s" /Users/fragoso/modules/emeraldbgc/CM000950.gb.emerald/CM000950.gb.emerald.full.gff

CM000950	EMERALDv0+unknown	CLUSTER	235657	280316	.	.	.	ID=CM000950_emrld_1;nearest_MiBIG=BGC0000804;nearest_MiBIG_class=Saccharide;nearest_MiBIG_jaccardDistance=0.810;partial=00
CM000950	EMERALDv0+unknown	CLUSTER	651667	716832	.	.	.	ID=CM000950_emrld_2;nearest_MiBIG=BGC0001767;nearest_MiBIG_class=NRP;nearest_MiBIG_jaccardDistance=0.564;partial=00
CM000950	EMERALDv0+unknown	CLUSTER	747262	777103	.	.	.	ID=CM000950_emrld_3;nearest_MiBIG=BGC0000663;nearest_MiBIG_class=Terpene;nearest_MiBIG_jaccardDistance=0.678;partial=00
CM000950	EMERALDv0+unknown	CLUSTER	917061	974020	.	.	.	ID=CM000950_emrld_4;nearest_MiBIG=BGC0001496;nearest_MiBIG_class=RiPP;nearest_MiBIG_jaccardDistance=0.833;partial=00
CM000950	EMERALDv0+unknown	CLUSTER	1451219	1472608	.	.	.	ID=CM000950_emrld_5;nearest_MiBIG=BGC0001496;nearest_MiBIG_class=RiPP;nearest_MiBIG_jaccardDistance=0.806;partial=00
CM000950	EMERALDv0+unknown	CLUSTER	1783748	1853946	.	.	.	ID=CM000950_emrld_6;nearest_MiBIG=BGC0001062;nearest_MiBIG_class=Polyketi

In [20]:

!grep "TER\s" /Users/fragoso/modules/emeraldbgc/emeraldbgc/modules/CM000950.fna.prodigal.faa.emrld/CM000950.fna.prodigal.faa.emerald.full.gff2

CM000950	EMERALDv0.1	CLUSTER	235657	280316	.	.	.	ID=CM000950_emrld_1;Class_Probability=Alkaloid:0.020,NRP:0.080,Polyketide:0.400,RiPP:0.070,Saccharide:0.120,Terpene:0.210,Other:0.240;Edge=False
CM000950	EMERALDv0.1	CLUSTER	651667	716832	.	.	.	ID=CM000950_emrld_2;Class_Probability=Alkaloid:0.410,NRP:0.880,Polyketide:0.720,RiPP:0.490,Saccharide:0.450,Terpene:0.270,Other:0.570;Edge=False
CM000950	EMERALDv0.1	CLUSTER	747262	777103	.	.	.	ID=CM000950_emrld_3;Class_Probability=Alkaloid:0.030,NRP:0.050,Polyketide:0.050,RiPP:0.030,Saccharide:0.260,Terpene:0.440,Other:0.120;Edge=False
CM000950	EMERALDv0.1	CLUSTER	917061	974020	.	.	.	ID=CM000950_emrld_4;Class_Probability=Alkaloid:0.070,NRP:0.110,Polyketide:0.200,RiPP:0.420,Saccharide:0.050,Terpene:0.020,Other:0.170;Edge=False
CM000950	EMERALDv0.1	CLUSTER	1451219	1472608	.	.	.	ID=CM000950_emrld_5;Class_Probability=Alkaloid:0.000,NRP:0.010,Polyketide:0.000,RiPP:0.400,Saccharide:0.000,Terpene:0.000,Other:0.020;Edge=False
CM000950	EMERALDv0.1	CLUSTER

In [15]:
from itertools import groupby

In [17]:
c = 'CM000950'

# Print the index of the first and of the last CDS of every predicted cluster
# (run of non-zero flags) on contig ``c``.
for k, g in groupby(enumerate(annotate.looseClst[c]), key=lambda z: z[1]):
    if k != 0:
        positions = [ix for ix, _ in g]
        print(positions[0], positions[-1])

218 250
587 629
657 684
810 867
1280 1297
1553 1605
1643 1690
1757 1777
2081 2086
2552 2570
3169 3198
3316 3345
3453 3469
3943 3990
4139 4161
4439 4452
4537 4538
4594 4608
4702 4712
4745 4755
5315 5340
5655 5672
5884 5926
6527 6548
6672 6791
6800 6918


In [14]:

# Per-CDS cluster flags of contig CM000950 (1 = inside a predicted cluster).
annotate.looseClst['CM000950']

array([0, 0, 0, ..., 0, 0, 0])