From a6c0c43cadff7f70073dab6f30a2e5048b5297f0 Mon Sep 17 00:00:00 2001 From: Eric Nawrocki Date: Tue, 28 Jun 2016 10:27:54 -0400 Subject: [PATCH] Userguide is ready for 1.1.2 release. --- Bugs/BUGTRAX | 6 ++ LICENSE | 2 +- documentation/manpages/cmscan.man | 10 +- documentation/manpages/cmsearch.man | 2 +- documentation/userguide/Makefile.in | 9 +- documentation/userguide/ack.tex | 5 +- documentation/userguide/cmbuild.tex | 4 +- documentation/userguide/copyright.tex | 0 documentation/userguide/copyright.tex.in | 18 ---- documentation/userguide/formats.tex | 3 - documentation/userguide/install.tex | 23 ++-- documentation/userguide/introduction.tex | 6 +- documentation/userguide/main.tex | 2 +- documentation/userguide/more.tex | 18 ++-- documentation/userguide/tabular.tex | 2 +- documentation/userguide/titlepage.tex.in | 8 +- documentation/userguide/tutorial.tex | 177 ++++++++++++++++--------------- tutorial/tRNA5.c.cm | 14 +-- 18 files changed, 146 insertions(+), 163 deletions(-) create mode 100644 documentation/userguide/copyright.tex delete mode 100644 documentation/userguide/copyright.tex.in diff --git a/Bugs/BUGTRAX b/Bugs/BUGTRAX index e664c9e7..5fad5845 100644 --- a/Bugs/BUGTRAX +++ b/Bugs/BUGTRAX @@ -1342,6 +1342,9 @@ an optimally accurate parsetree, if it's not a legal local begin state, then don't allow a local begin. // +#### +#### 1.1.1 release: 23 July 2014 +#### ID i44 TITLE cmalign --mapstr allows broken basepairs @@ -1374,3 +1377,6 @@ checks that both a broken non-pseudoknotted basepair and a pseudoknotted broken basepair are both correctly removed. // +#### +#### 1.1.2 release: June 2016 +#### diff --git a/LICENSE b/LICENSE index 9cc2a8cc..5f6b2993 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ Infernal - inference of RNA secondary structural alignments @INFERNAL_COPYRIGHT@ -Copyright (C) 1991-2013 Sean R. Eddy +Copyright (C) 1991-2016 Sean R. Eddy Copyright (C) 2005-2016 Eric P. Nawrocki Copyright (C) 2005-2011 Diana L.
Kolbe Copyright (C) 2004 Zasha Weinberg diff --git a/documentation/manpages/cmscan.man b/documentation/manpages/cmscan.man index 05c9d805..cde06298 100644 --- a/documentation/manpages/cmscan.man +++ b/documentation/manpages/cmscan.man @@ -68,8 +68,8 @@ option saves output in a simple tabular format that is concise and easier to parse. The .BI --fmt " 2" option modifies the format of the tabular output by adding several -fields, including markup of overlapping hits, as described in the -Infernal user guide. +fields, including markup of overlapping hits, as described in section +6 of the Infernal user guide. The .B -o option allows redirecting the main output, including throwing it away @@ -176,7 +176,7 @@ instead of the default stdout. .BI --tblout " " Save a simple tabular (space-delimited) file summarizing the hits found, with one data line per hit. -The format of this file is described in the Infernal user guide. +The format of this file is described in section 6 of the Infernal user guide. .TP .BI --fmt " " @@ -196,7 +196,7 @@ With .BI --fmt " 2" nine additional fields are added to the tabular output file, most of which pertain to the annotation of overlapping hits. -See the Infernal user guide for a description of both formats. +See section 6 of the Infernal user guide for a description of both formats. .TP .B --acc @@ -726,7 +726,7 @@ and .B --tblout because clan annotation is only output in format 2 of the tabular output file. -See the Infernal user guide for specifications on the format of the +See section 9 of the Infernal user guide for specifications on the format of the clan input file .I . diff --git a/documentation/manpages/cmsearch.man b/documentation/manpages/cmsearch.man index 59b50629..04bfebae 100644 --- a/documentation/manpages/cmsearch.man +++ b/documentation/manpages/cmsearch.man @@ -167,7 +167,7 @@ to the file .BI --tblout " " Save a simple tabular (space-delimited) file summarizing the hits found, with one data line per hit.
The format of this file is -described in the Infernal user guide. +described in section 6 of the Infernal user guide. .TP .B --acc diff --git a/documentation/userguide/Makefile.in b/documentation/userguide/Makefile.in index 76001000..e26adf78 100644 --- a/documentation/userguide/Makefile.in +++ b/documentation/userguide/Makefile.in @@ -41,6 +41,7 @@ MANPAGES = \ TEXFILES =\ cmbuild.tex\ + copyright.tex\ filter.tex\ format_prior.tex\ formats.tex\ @@ -57,7 +58,7 @@ TEXFILES =\ pdf: Userguide.pdf -Userguide.pdf: symlinks.stamp titlepage.tex copyright.tex manpages.tex +Userguide.pdf: symlinks.stamp titlepage.tex manpages.tex @for prog in pdflatex bibtex; do \ command -v $$prog >/dev/null 2>&1 || { echo >&2 "The $$prog program is required to build the Userguide, but it's not installed. Aborting."; exit 1; } \ done @@ -83,10 +84,6 @@ titlepage.tex: ${srcdir}/titlepage.tex.in @cp -f ${srcdir}/titlepage.tex.in titlepage.tex @${SEDITION} INFERNAL_VERSION ${INFERNAL_VERSION} INFERNAL_DATE ${INFERNAL_DATE} titlepage.tex -copyright.tex: ${srcdir}/copyright.tex.in - @cp -f ${srcdir}/copyright.tex.in copyright.tex - @${SEDITION} INFERNAL_COPYRIGHT ${INFERNAL_COPYRIGHT} copyright.tex - # manpages: convert man pages to LaTeX chapter in User Guide. 
# uses PolyglotMan 3.2 "rman", and rmanprocess.pl script in easel's devkit manpages.tex: ${MANPAGES} @@ -114,7 +111,7 @@ distclean: clean done ;\ fi -rm -f symlinks.stamp - -rm -f titlepage.tex copyright.tex manpages.tex + -rm -f titlepage.tex manpages.tex -rm -f Userguide.pdf -rm -f Makefile diff --git a/documentation/userguide/ack.tex b/documentation/userguide/ack.tex index 3fde3a53..4a6cacb5 100644 --- a/documentation/userguide/ack.tex +++ b/documentation/userguide/ack.tex @@ -34,9 +34,8 @@ \section{Acknowledgements} Infernal testing requires \emph{a lot} of compute power, and we are extremely fortunate to have access to a highly reliable and -state-of-the-art computing cluster, thanks to Goran Ceric, Rob Lines, -Peter Bukowinski, Ken Carlile, Patrick Yeboah, and others here at -Janelia. +state-of-the-art computing cluster, thanks to Jesse Becker, Ron +Patterson and others at NCBI. Infernal is primarily developed on GNU/Linux and Apple Macintosh machines, but is tested on a variety of hardware. Over the years, diff --git a/documentation/userguide/cmbuild.tex b/documentation/userguide/cmbuild.tex index af540187..a1cb86f6 100644 --- a/documentation/userguide/cmbuild.tex +++ b/documentation/userguide/cmbuild.tex @@ -524,9 +524,7 @@ \subsubsection{Architecture construction} an insertion. Importantly though this frequency is determined using the relative weights from the sequence weighting step, instead of absolute gaps (e.g. a residue in a sequence with weight $0.8$ will count -as $0.8$ residues)\footnote{This behavior is new in Infernal 1.1, in all -previous versions of Infernal, absolute weights, not relative weights -were used at this step.}. + as $0.8$ residues). The threshold defaults to 0.5. It can be changed to another number \otext{} (from 0 to 1.0) by the \prog{--symfrac } option. 
The diff --git a/documentation/userguide/copyright.tex b/documentation/userguide/copyright.tex new file mode 100644 index 00000000..e69de29b diff --git a/documentation/userguide/copyright.tex.in b/documentation/userguide/copyright.tex.in deleted file mode 100644 index fbeaa3e2..00000000 --- a/documentation/userguide/copyright.tex.in +++ /dev/null @@ -1,18 +0,0 @@ -\vspace*{\fill} -\begin{flushleft} -@INFERNAL_COPYRIGHT@\vspace{5mm} - -\vspace{5mm} -Permission is granted to make and distribute verbatim copies of this -manual provided the copyright notice and this permission notice are -retained on all copies.\vspace{5mm} - -\vspace{5mm} Infernal is licensed and freely distributed under the GNU -General Public License version 3 (GPLv3). For a copy of the License, -see \url{http://www.gnu.org/licenses/}. - -\vspace{5mm} -\end{flushleft} - - - diff --git a/documentation/userguide/formats.tex b/documentation/userguide/formats.tex index bc0efb05..cc57be46 100644 --- a/documentation/userguide/formats.tex +++ b/documentation/userguide/formats.tex @@ -129,9 +129,6 @@ \subsection{Infernal CM files} The CM format is described in more detail below, followed by a description of the HMMER3 HMM format for the CM's mandatory filter HMM filter. -%The HMM format is similar but is not described in detail here. See the -%HMMER user guide for more information -%(\url{http://hmmer.janelia.org}{http://hmmer.janelia.org}). 
\subsubsection{CM header section} diff --git a/documentation/userguide/install.tex b/documentation/userguide/install.tex index ed5cdeae..82d187e9 100644 --- a/documentation/userguide/install.tex +++ b/documentation/userguide/install.tex @@ -4,14 +4,14 @@ \section{Installation} \subsection{Quick installation instructions} -Download \prog{infernal-1.1.1.tar.gz} from \url{http://infernal.janelia.org/}, or -directly from -\url{ftp://selab.janelia.org/pub/software/infernal/infernal-1.1.1.tar.gz}; +Download \prog{infernal-1.1.2.tar.gz} from \url{http://eddylab.org/infernal/}, or +directly from \\ +\url{eddylab.org/infernal/infernal-1.1.2.tar.gz}; unpack it, configure, and make: -\user{wget ftp://selab.janelia.org/pub/software/infernal/infernal-1.1.1.tar.gz}\\ -\user{tar xf infernal-1.1.1.tar.gz}\\ -\user{cd infernal-1.1.1}\\ +\user{wget eddylab.org/infernal/infernal-1.1.2.tar.gz}\\ +\user{tar xf infernal-1.1.2.tar.gz}\\ +\user{cd infernal-1.1.2}\\ \user{./configure}\\ \user{make} @@ -77,9 +77,11 @@ \subsection{System requirements} \paragraph{Compiler:} The source code is C conforming to POSIX and ANSI C99 standards. It should compile with any ANSI C99 compliant compiler, -including the GNU C compiler \prog{gcc}. We test the code using both -the \prog{gcc} and \prog{icc} compilers. We find that \prog{icc} -produces somewhat faster code at present. +including the GNU C compiler \prog{gcc}. +% as of 1.1.2, I don't test on icc anymore: +%We test the code using both +%the \prog{gcc} and \prog{icc} compilers. We find that \prog{icc} +%produces somewhat faster code at present. \paragraph{Libraries and other installation requirements:} Infernal includes two software libraries, HMMER and Easel, which it will automatically @@ -134,8 +136,7 @@ \subsection{MPI parallelization for clusters is optional} The \prog{cmalign}, \prog{cmcalibrate}, \prog{cmsearch} and \prog{cmscan} programs also support MPI (Message Passing Interface) parallelization on clusters. 
To use MPI, you first need to have an -MPI library installed, such as OpenMPI (\url{www.open-mpi.org}). We -use Intel MPI at Janelia. +MPI library installed, such as OpenMPI (\url{www.open-mpi.org}). MPI support is not enabled by default, and it is not compiled into the precompiled binaries that we supply with Infernal. To enable MPI support diff --git a/documentation/userguide/introduction.tex b/documentation/userguide/introduction.tex index 2f90f850..2ebf62ce 100644 --- a/documentation/userguide/introduction.tex +++ b/documentation/userguide/introduction.tex @@ -8,7 +8,7 @@ \section{Introduction} structurally annotated multiple sequence alignment of an RNA family with a position-specific scoring system for substitutions, insertions, and deletions. Positions in the profile that are basepaired in the -consensus secondary structure of the alignment are modeled as +consensus secondary structure of the alignment are modeled as dependent on one another, allowing Infernal's scoring system to consider the secondary structure, in addition to the primary sequence, of the family being modeled. Infernal profiles are probabilistic @@ -239,7 +239,7 @@ \subsection{How to learn more about CMs and profile HMMs} book chapters \citep{Eddy06b,NawrockiEddy09}, and a few doctoral theses \citep{Klein03,Nawrocki09b,Kolbe10} related to CMs\footnote{Eddy lab publications are available from -\url{http://selab.janelia.org/publications.html}}. The book +\url{http://eddylab.org/publications.html}}. The book \emph{Biological Sequence Analysis: Probabilistic Models of Proteins and Nucleic Acids} \citep{Durbin98} has several chapters devoted to HMMs and CMs. Profile HMM filtering for CMs was introduced by Weinberg @@ -269,7 +269,7 @@ \subsection{How to learn more about CMs and profile HMMs} searches, EP Nawrocki and SR Eddy. Bioinformatics, 29:2933-2935, 2013.) is the most appropriate paper to cite.
If you’re writing for an enlightened (url-friendly) journal, you may want to cite the webpage -\url{infernal.janelia.org} because it is kept up-to-date. +\url{http://eddylab.org/infernal/} because it is kept up-to-date. \end{srefaq} diff --git a/documentation/userguide/main.tex b/documentation/userguide/main.tex index c57c3d0b..ef73a24a 100644 --- a/documentation/userguide/main.tex +++ b/documentation/userguide/main.tex @@ -47,7 +47,7 @@ \newpage \input{tabular} -% Changes in options between 1.0 and 1.1 are omitted from the 1.2 user guide. +% Changes in options between 1.0 and 1.1 are omitted from the 1.1.2 user guide. %\newpage %\input{diffoptions} diff --git a/documentation/userguide/more.tex b/documentation/userguide/more.tex index 7533bc5d..2d88894a 100644 --- a/documentation/userguide/more.tex +++ b/documentation/userguide/more.tex @@ -12,10 +12,10 @@ \subsection{How do I cite Infernal?} Bioinformatics, 29:2933-2935, 2013. The most appropriate citation is to the web site, -\url{infernal.janelia.org}. You should also cite what version of the -software you used. We archive all old versions, so anyone should be -able to obtain the version you used, when exact reproducibility of an -analysis is an issue. +\url{http://eddylab.org/infernal/}. You should also cite what version +of the software you used. We archive all old versions, so anyone +should be able to obtain the version you used, when exact +reproducibility of an analysis is an issue. The version number is in the header of most output files. To see it quickly, do something like \prog{cmscan -h} to get a help page, and @@ -23,17 +23,17 @@ \subsection{How do I cite Infernal?} \begin{sreoutput} # cmscan :: search sequence(s) against a CM database -# INFERNAL 1.1.1 (July 2014) -# Copyright (C) 2014 Howard Hughes Medical Institute. -# Freely distributed under the GNU General Public License (GPLv3). +# INFERNAL 1.1.2 (June 2016) +# Copyright (C) 2016 Howard Hughes Medical Institute. 
+# Freely distributed under a BSD open source license. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \end{sreoutput} -So (from the second line there) this is from Infernal 1.1.1. +So (from the second line there) this is from Infernal 1.1.2. \subsection{How do I report a bug?} -Email us, at \url{infernal@janelia.hhmi.org}. +Email us, at \url{sean@eddylab.org}. Before we can see what needs fixing, we almost always need to reproduce a bug on one of our machines. This means we want to have a diff --git a/documentation/userguide/tabular.tex b/documentation/userguide/tabular.tex index 1e703387..ff7e99a3 100644 --- a/documentation/userguide/tabular.tex +++ b/documentation/userguide/tabular.tex @@ -9,7 +9,7 @@ \subsection{Target hits tables} different formats of target hits table, which are both described below. By default, both \prog{cmsearch} and \prog{cmscan} produce the target hits table in \emph{format 1}. Format 1 is the only format that -was used by Infernal versions 1.1rc1 through 1.1.1. As of version 1.2, +was used by Infernal versions 1.1rc1 through 1.1.1. As of version 1.1.2, with \prog{cmscan}, the \ccode{--fmt 2} option can be used in combination with \ccode{--tblout} to produce a target hits table in the alternative \emph{format 2}. 
Both formats 1 and 2 target hits diff --git a/documentation/userguide/titlepage.tex.in b/documentation/userguide/titlepage.tex.in index 18d0a081..658b0a00 100644 --- a/documentation/userguide/titlepage.tex.in +++ b/documentation/userguide/titlepage.tex.in @@ -18,11 +18,9 @@ Version @INFERNAL_VERSION@; @INFERNAL_DATE@ \\ \vspace*{\fill} Eric Nawrocki and Sean Eddy\\ -for the INFERNAL Development Team\\ -Janelia Farm Research Campus\\ -19700 Helix Drive\\ -Ashburn VA 20147 USA\\ -\url{http://eddylab.org/} \\ +for the INFERNAL development team\\ +\url{github.com/EddyRivasLab/infernal/} +\url{http://eddylab.org/infernal} \end{center} \vspace*{\fill} diff --git a/documentation/userguide/tutorial.tex b/documentation/userguide/tutorial.tex index 5f9d62af..7d30a72c 100644 --- a/documentation/userguide/tutorial.tex +++ b/documentation/userguide/tutorial.tex @@ -1,6 +1,9 @@ % EPN, Mon Oct 21 12:57:38 2013 +% EPN, Mon Jun 27 12:32:24 2016 [1.1.2 release] % Actual commands run on: -% login-eddy +% cbbdev13 +% $ uname -a +% Linux cbbdev13 2.6.32-573.18.1.el6.x86_64 #1 SMP Tue Feb 9 22:46:17 UTC 2016 x86_64 x86_64 x86_64 GNU/Linux \section{Tutorial} \label{section:tutorial} @@ -197,9 +200,9 @@ \subsubsection{Step 1: build a covariance model with cmbuild} \begin{sreoutput} # cmbuild :: covariance model construction from multiple sequence alignments -# INFERNAL 1.1.1 (July 2014) -# Copyright (C) 2014 Howard Hughes Medical Institute. -# Freely distributed under the GNU General Public License (GPLv3). +# INFERNAL 1.1.2 (June 2016) +# Copyright (C) 2016 Howard Hughes Medical Institute. +# Freely distributed under a BSD open source license. 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # CM file: tRNA5.cm # alignment file: tutorial/tRNA5.sto @@ -210,7 +213,7 @@ \subsubsection{Step 1: build a covariance model with cmbuild} # ------ -------------------- -------- -------- ------ ----- ---- ---- ----- ----- ----------- 1 tRNA5 5 3.73 74 72 21 2 0.783 0.489 # -# CPU time: 0.57u 0.00s 00:00:00.56 Elapsed: 00:00:00.57 +# CPU time: 0.29u 0.00s 00:00:00.28 Elapsed: 00:00:00.30 \end{sreoutput} If your input file had contained more than one alignment, you'd get @@ -282,35 +285,36 @@ \subsubsection{Step 2: calibrate the model with cmcalibrate} \begin{sreoutput} # cmcalibrate :: fit exponential tails for CM E-values -# INFERNAL 1.1.1 (July 2014) -# Copyright (C) 2014 Howard Hughes Medical Institute. -# Freely distributed under the GNU General Public License (GPLv3). +# INFERNAL 1.1.2 (June 2016) +# Copyright (C) 2016 Howard Hughes Medical Institute. +# Freely distributed under a BSD open source license. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # CM file: tRNA5.cm # forecast mode (no calibration): on # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # -# Forecasting running time for CM calibration(s) on 8 cpus: +# Forecasting running time for CM calibration(s) on 24 cpus: # # predicted # running time # model name (hr:min:sec) # -------------------- ------------ - tRNA5 00:06:26 + tRNA5 00:01:00 # -# CPU time: 0.27u 0.00s 00:00:00.27 Elapsed: 00:00:00.28 +# CPU time: 0.15u 0.00s 00:00:00.15 Elapsed: 00:00:00.16 [ok] \end{sreoutput} The header comes first, telling you what program you ran, on what file -and with what options. This calibration will use 8 CPUs, your output +and with what options. This calibration will use 24 CPUs, your output +% ^^ may vary depending on how many cores you have available on the machine you're using. 
(If you are planning to use MPI to parallelize the calibration (see the Installation section), you can specify the number of CPUs for the time estimate as \otext{} with the -\otext{--nforecast } option.) Using 8 CPUs, \prog{cmcalibrate} +\otext{--nforecast } option.) Using 24 CPUs, \prog{cmcalibrate} estimates the time required for calibration on the machine I'm using -at about seven minutes. +at about one minute. Feel free to perform the calibration yourself if you'd like (with the command \otext{cmcalibrate tRNA5.cm}). However, we've included the file @@ -337,13 +341,13 @@ \subsubsection{Step 3: search a sequence database with cmsearch} \begin{sreoutput} # cmsearch :: search CM(s) against a sequence database -# INFERNAL 1.1.1 (July 2014) -# Copyright (C) 2014 Howard Hughes Medical Institute. -# Freely distributed under the GNU General Public License (GPLv3). +# INFERNAL 1.1.2 (June 2016) +# Copyright (C) 2016 Howard Hughes Medical Institute. +# Freely distributed under a BSD open source license. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # query CM file: tRNA5.cm # target sequence database: tutorial/mrum-genome.fa -# number of worker threads: 8 +# number of worker threads: 24 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \end{sreoutput} @@ -367,8 +371,8 @@ \subsubsection{Step 3: search a sequence database with cmsearch} (12) ! 1.6e-15 61.7 0.0 NC_013790.1 2350984 2350911 - cm no 0.53 Methanobrevibacter ruminantium M1 chromosome, complete genome (13) ! 3.3e-15 60.7 0.0 NC_013790.1 2186090 2186019 - cm no 0.54 Methanobrevibacter ruminantium M1 chromosome, complete genome (14) ! 4.1e-15 60.4 0.0 NC_013790.1 2680159 2680233 + cm no 0.67 Methanobrevibacter ruminantium M1 chromosome, complete genome - (15) ! 7.9e-15 59.5 0.0 NC_013790.1 2749839 2749768 - cm no 0.53 Methanobrevibacter ruminantium M1 chromosome, complete genome - (16) ! 
7.9e-15 59.5 0.0 NC_013790.1 2749945 2749874 - cm no 0.53 Methanobrevibacter ruminantium M1 chromosome, complete genome + (15) ! 7.9e-15 59.5 0.0 NC_013790.1 2749945 2749874 - cm no 0.53 Methanobrevibacter ruminantium M1 chromosome, complete genome + (16) ! 7.9e-15 59.5 0.0 NC_013790.1 2749839 2749768 - cm no 0.53 Methanobrevibacter ruminantium M1 chromosome, complete genome (17) ! 9.8e-15 59.2 0.0 NC_013790.1 361676 361604 - cm no 0.51 Methanobrevibacter ruminantium M1 chromosome, complete genome (18) ! 1e-14 59.2 0.0 NC_013790.1 2585073 2584999 - cm no 0.60 Methanobrevibacter ruminantium M1 chromosome, complete genome (19) ! 1.1e-14 59.1 0.0 NC_013790.1 2130422 2130349 - cm no 0.59 Methanobrevibacter ruminantium M1 chromosome, complete genome @@ -664,18 +668,18 @@ \subsubsection{Step 3: search a sequence database with cmsearch} Query model(s): 1 (72 consensus positions) Target sequences: 1 (5874406 residues searched) Target sequences re-searched for truncated hits: 1 (360 residues re-searched) -Windows passing local HMM SSV filter: 11200 (0.2111); expected (0.35) +Windows passing local HMM SSV filter: 11205 (0.2116); expected (0.35) Windows passing local HMM Viterbi filter: (off) Windows passing local HMM Viterbi bias filter: (off) -Windows passing local HMM Forward filter: 137 (0.002691); expected (0.005) -Windows passing local HMM Forward bias filter: 134 (0.002621); expected (0.005) -Windows passing glocal HMM Forward filter: 87 (0.001923); expected (0.005) -Windows passing glocal HMM Forward bias filter: 87 (0.001923); expected (0.005) -Envelopes passing glocal HMM envelope defn filter: 100 (0.001342); expected (0.005) -Envelopes passing local CM CYK filter: 60 (0.0007631); expected (0.0001) +Windows passing local HMM Forward filter: 136 (0.002693); expected (0.005) +Windows passing local HMM Forward bias filter: 133 (0.002623); expected (0.005) +Windows passing glocal HMM Forward filter: 84 (0.001951); expected (0.005) +Windows passing glocal HMM Forward 
bias filter: 84 (0.001951); expected (0.005) +Envelopes passing glocal HMM envelope defn filter: 98 (0.001318); expected (0.005) +Envelopes passing local CM CYK filter: 60 (0.0007629); expected (0.0001) Total CM hits reported: 56 (0.0007205); includes 0 truncated hit(s) -# CPU time: 2.15u 0.03s 00:00:02.17 Elapsed: 00:00:00.89 +# CPU time: 2.01u 0.05s 00:00:02.05 Elapsed: 00:00:00.49 // [ok] \end{sreoutput} @@ -730,19 +734,19 @@ \subsubsection{Step 3: search a sequence database with cmsearch} % ^^^^ residues will pass the filter). Here, about 21\% of the database in % ^^^^ -11,200 separate windows got through the SSV filter. For a database of +11,205 separate windows got through the SSV filter. For a database of %^^^^^ this size, the local Viterbi filter is turned off. The local Forward filter is set to allow an expected 0.5\% of the database survive. Here about % ^^^^ -0.3\% survives in 137 windows. Next, each surviving window is checked +0.3\% survives in 136 windows. Next, each surviving window is checked %^^ to see if the target sequence is ``obviously'' so biased in its composition that it's unlikely to be a true homolog. This is called the ``bias filter''\footnote{There's also a bias filter step used in the local Viterbi filter stage, when it is used.} and applying a bit score correction to previous filter's score for each window and -recomputing the P-value. Three of the 137 windows fail to pass +recomputing the P-value. Three of the 136 windows fail to pass % ^^^^^ ^^^ the local Forward bias filter stage. Next, the Forward algorithm is used to score each window again, but this time with the HMM configured @@ -751,7 +755,7 @@ \subsubsection{Step 3: search a sequence database with cmsearch} difference between Infernal and HMMER3's (v3.0) pipeline. HMMER v3.0 only uses local HMM algorithms.} As with the local stage, an expected 0.5\% of the database is expected to survive. 
In this case, -87 of the 134 windows, comprising about 0.2\% of the database, +84 of the 133 windows, comprising about 0.2\% of the database, %^ ^^^ survive. The bias filter is run again, this time applying a correction to the glocal Forward scores. For this search, 0 windows are removed at @@ -762,13 +766,13 @@ \subsubsection{Step 3: search a sequence database with cmsearch} or more hit envelopes in each window, where each envelope contains one putative hit. Often residues at the beginning and ends of windows are determined to be nonhomologous and are not included in the -envelope. In this search, 100 envelopes are defined within the 87 +envelope. In this search, 98 envelopes are defined within the 84 % ^^^ ^^ windows. Note that the envelopes comprise only about 70\% of the % ^^^ -residues from the 87 windows, indicated by the drop of 0.1923\% to +residues from the 84 windows, indicated by the drop of 0.1951\% to % ^^^^^^^^ -0.1342\%. +0.1318\%. %%^^^^ After hit envelopes have been defined with the filter HMM, the two @@ -794,9 +798,9 @@ \subsubsection{Step 3: search a sequence database with cmsearch} % ^^ Finally, the running time of the search is reported, in CPU time and -elapsed time. This search took about 1 second (wall +elapsed time. This search took about half a second (wall % ^ -clock time) (running on eight cores). +clock time) (running on twenty-four cores). \subsubsection{Truncated RNA detection} @@ -942,13 +946,13 @@ \subsubsection{Step 3: search the CM database with cmscan} \begin{sreoutput} # cmscan :: search sequence(s) against a CM database -# INFERNAL 1.1.1 (July 2014) -# Copyright (C) 2014 Howard Hughes Medical Institute. -# Freely distributed under the GNU General Public License (GPLv3). +# INFERNAL 1.1.2 (June 2016) +# Copyright (C) 2016 Howard Hughes Medical Institute. +# Freely distributed under a BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -# query sequence file: ../../tutorial/metag-example.fa +# query sequence file: tutorial/metag-example.fa # target CM database: minifam.cm -# number of worker threads: 8 +# number of worker threads: 24 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - Query: AAGA01015927.1 [L=943] @@ -1022,7 +1026,7 @@ \subsubsection{Step 3: search the CM database with cmscan} Envelopes passing local CM CYK filter: 4 (0.05189); expected (0.0001) Total CM hits reported: 3 (0.03046); includes 0 truncated hit(s) -# CPU time: 0.21u 0.01s 00:00:00.22 Elapsed: 00:00:00.22 +# CPU time: 0.15u 0.02s 00:00:00.17 Elapsed: 00:00:00.12 // \end{widesreoutput} @@ -1159,7 +1163,7 @@ \subsection{Searching the Rfam CM database with a query sequence} with a single command. To complete this step of the tutorial you'll need to download the Rfam -12.1 CM file from here: +12.1 CM file from here: \\ \url{ftp://ftp.ebi.ac.uk/pub/databases/Rfam/12.1/Rfam.cm.gz} and gunzipping it, like this: @@ -1175,11 +1179,10 @@ \subsection{Searching the Rfam CM database with a query sequence} The next step is to run \prog{cmscan}. In order to reproduce how Rfam searches are performed \citep{Nawrocki15} several command line options are required. Each of these options is explained below. 
The -full command is (split up into three lines so it fits on the page): +full command is (split up into two lines so it fits on the page): -\user{cmscan --rfam --cut\_ga --nohmmonly --fmt 2 --tblout \ } -\user{mrum-genome.tblout --clanin tutorial/Rfam.12.1.claninfo \ } -\user{Rfam.cm tutorial/mrum-genome.fa > mrum-genome.cmscan} +\indent\indent\small\verb+> cmscan --rfam --cut_ga --nohmmonly --tblout mrum-genome.tblout --fmt 2 \+\\ +\indent\indent\small\verb+> --clanin testsuite/Rfam.12.1.clanin Rfam.cm tutorial/mrum-genome.fa > mrum-genome.cmscan+\\ This command will take at least several minutes and possibly up to about 30 minutes depending on the number of cores and speed of your @@ -1188,33 +1191,34 @@ \subsection{Searching the Rfam CM database with a query sequence} The command line options used in the above command are as follows: \begin{sreitems}{\emprog{--nohmmonly}} -\item[\otext{--rfam}] Specify that the filter pipeline run in fast +\item[\otext{--rfam}] Specifies that the filter pipeline run in fast mode, with the same strict filters that are used for Rfam searches and for other sequence databases larger than 20 Gb (see section~\ref{section:pipeline}). % \item[\otext{--cut\_ga}] Specifies that the special Rfam \emph{GA} - (gathering) thresholds that are stored in the \prog{Rfam.cm} file to determine - which hits are reported. Each model has its own GA bit score - threshold, which were determined by Rfam curators as the bit score - at and above which all hits are believed to be homologous. These - determinations were made based on observed hit results against a - the large Rfamseq database used by Rfam \citep{Nawrocki15}. + (gathering) thresholds be used to determine which hits are + reported. These thresholds are stored in the \prog{Rfam.cm} file. + Each model has its own GA bit score threshold, which was determined + by Rfam curators as the bit score at and above which all hits are + believed to be true homologs to the model. 
These determinations were made based on + observed hit results against the large Rfamseq database used by + Rfam \citep{Nawrocki15}. % \item[\otext{--nohmmonly}] All models, even those with zero basepairs, - are run in CM mode. This ensures all GA cutoffs, which were + are run in CM mode (not HMM mode). This ensures all GA cutoffs, which were determined in CM mode for each model, are valid. % +\item[\otext{--tblout}] Specifies that a tabular output + file should be created, see section~\ref{section:tabular}. +% \item[\otext{--fmt 2}] The tabular output file will be in format 2, which includes annotation of overlapping hits. See page~\pageref{tabular-format2} for a complete description of this format. % -\item[\otext{--tblout}] Specifies that a tabular output - file should be created, see section~\ref{section:tabular}. -% \item[\otext{--clanin}] Clan information should be read - from the file \prog{tutorial/Rfam.12.1.claninfo}. This file lists + from the file \prog{testsuite/Rfam.12.1.clanin}. This file lists which models belong to the same clan. Clans are groups of models that are homologous and therefore it is expected that some hits to these models will overlap. For example, the LSU\_rRNA\_archaea and @@ -1224,7 +1228,7 @@ \subsection{Searching the Rfam CM database with a query sequence} When the \prog{cmscan} command finishes running, the file \prog{mrum-genome.cmscan} will contain the standard output of the -program which will be similar to what we saw in the earlier example of +program. This file will be similar to what we saw in the earlier example of \prog{cmscan}. The file \prog{mrum-genome.tblout} has also been created, which is a tabular representation of all hits, one line per hit. Take a look at this file.
The first two lines are comment lines @@ -1265,10 +1269,10 @@ \subsection{Searching the Rfam CM database with a query sequence} This tabular format includes the target model name, sequence name (in column 3, which is omitted above to save space), clan name, sequence coordinates, bit score, E-value and more. Because the \prog{--fmt 2} -option was used, this file includes information on which hit overlap +option was used, this file includes information on which hits overlap with other hits, starting at the column labelled ``olp'' and ending with ``wfrct2''. Hits with the ``*'' character in the ``olp'' column -do not overlap with any other hits. Those with ``\^'' do overlap with +do not overlap with any other hits. Those with ``\verb+^+'' do overlap with at least one other hit, but none of those overlapping hits have a better score (that occurs higher in the list). Those with ``='' also overlap with at least one other hit that does have a better score, the @@ -1279,8 +1283,8 @@ \subsection{Searching the Rfam CM database with a query sequence} model. These are the two copies of LSU rRNA in the \emph{Methanobrevibacter ruminantium} genome. Hits number 3 and 4 are to the \prog{LSU\_rRNA\_bacteria} model and overlap with hits 1 -and 2 nearly completely (hit 1 is from sequence positions 76282 to 765862 and -hit 3 is from sequence positions 76284 to 765862). This overlap is not +and 2 nearly completely (hit 1 is from sequence positions 762872 to 765862 and +hit 3 is from sequence positions 762874 to 765862). This overlap is not surprising because the bacterial and archaeal LSU rRNA models are very similar, and so are assigning high scores to the same subsequences. 
Further, hit 5 is to \prog{LSU\_rRNA\_eukarya} and also @@ -1288,13 +1292,13 @@ \subsection{Searching the Rfam CM database with a query sequence} to produce overlapping hits due to their homology, Rfam has grouped them into the same \emph{clan}, note the ``CL00112'' value in the ``clan name'' column for all three hits. This clan information was provided -in the \prog{rfam.12.1claninfo} input file we provided \prog{cmscan} -with the \prog{--clanin} option. +in the \prog{Rfam.12.1.claninfo} input file we provided to \prog{cmscan} +by using the \prog{--clanin} option. The ``olp'' column indicates that hit 1 is the highest scoring of the -three because it contains the ``\^'' character. Hits 3 and 5 both have +three overlapping hits because it contains the ``\verb+^+'' character. Hits 3 and 5 both have ``='' in the ``olp'' column indicating that there is another hit to -another model that overlaps these hits that has a better score. +another model which overlaps these hits and has a better score. If you were using these results to produce annotations for the \emph{Methanobrevibacter ruminantium} genome, you may want to ignore @@ -1304,13 +1308,13 @@ \subsection{Searching the Rfam CM database with a query sequence} additionally providing the \prog{--oskip} option to \prog{cmscan}. You can also modify the overlap annotation behavior with \prog{--oclan} option which restricts the annotation of overlaps to -hits for models within the same clan. Hits that overlap but are to -different models that are not in the same clan will not be marked as -overlaps, they will marked as ``*'' in the ``olp'' field. +hits for models within the same clan. Overlapping hits from models +that are not in the same clan will not be marked as +overlaps, instead they will be marked as ``*'' in the ``olp'' field.
\subsection{Creating multiple alignments with cmalign} The file \otext{tutorial/mrum-tRNAs10.fa} is a FASTA file containing -the 10 of the tRNA hits above the inclusion threshold (with an E-value +the 10 tRNA hits above the inclusion threshold (with an E-value less than $0.01$) found by \prog{cmsearch} in our search of \emph{M. ruminantium} genome\footnote{The \otext{-A } option to \prog{cmsearch} can be used to save a @@ -1326,7 +1330,7 @@ \subsection{Creating multiple alignments with cmalign} \begin{tinysreoutput} # STOCKHOLM 1.0 -#=GF AU Infernal 1.1.1 +#=GF AU Infernal 1.1.2 mrum-tRNA.1 GGAGCUAUAGCUCAAU..GGC..AGAGCGUUUGGCUGACAU........................................CCAAAAGGUUAUGGGUUCGAUUCCCUUUAGCCCCA #=GR mrum-tRNA.1 PP ****************..***..******************........................................*********************************** @@ -1456,7 +1460,7 @@ \subsubsection{cmalign assumes sequences may be truncated} \label{cmalign-cobalamin} \begin{sreoutput} # STOCKHOLM 1.0 -#=GF AU Infernal 1.1.1 +#=GF AU Infernal 1.1.2 Cobalamin.1 -------------------------------GUAGGCAAAAGGAAGAGGAAGgAUGGUGGAAAUCCUUCACGGGCCCGGCCA #=GR Cobalamin.1 PP ...............................44455566666899******989**************************** @@ -1530,7 +1534,7 @@ \subsection{Searching a sequence database for RNAs with unknown or no structure of a collection of homologous RNAs. Currently, Infernal itself does not have the capability of predicting structure, but it's predecessor COVE did with the \prog{covet} program, still available at -\url{ftp://selab.janelia.org/pub/software/cove/cove-2.4.4.tar.Z}. +\url{eddylab.org/software/cove/cove.tar.Z}. 
Infernal automatically detects when a model has zero basepairs and uses efficient profile HMM algorithms in \prog{cmsearch} and @@ -1556,9 +1560,9 @@ \subsection{Searching a sequence database for RNAs with unknown or no \begin{sreoutput} # cmbuild :: covariance model construction from multiple sequence alignments -# INFERNAL 1.1.1 (July 2014) -# Copyright (C) 2014 Howard Hughes Medical Institute. -# Freely distributed under the GNU General Public License (GPLv3). +# INFERNAL 1.1.2 (June 2016) +# Copyright (C) 2016 Howard Hughes Medical Institute. +# Freely distributed under a BSD open source license. # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # CM file: tRNA5-noss.cm # alignment file: tutorial/tRNA5.sto @@ -1570,7 +1574,7 @@ \subsection{Searching a sequence database for RNAs with unknown or no # ------ -------------------- -------- -------- ------ ----- ---- ---- ----- ----- ----------- 1 tRNA5 5 5.00 74 72 0 0 0.552 0.552 # -# CPU time: 0.18u 0.00s 00:00:00.18 Elapsed: 00:00:00.19 +# CPU time: 0.14u 0.00s 00:00:00.14 Elapsed: 00:00:00.14 \end{sreoutput} The output reports that this model has 0 basepairs (``bps'') (the @@ -1612,7 +1616,7 @@ \subsection{Searching a sequence database for RNAs with unknown or no (10) ! 6.4e-05 24.8 0.0 NC_013790.1 1873882 1873820 - hmm - 0.63 Methanobrevibacter ruminantium M1 chromosome, complete genome (11) ! 0.00014 23.7 0.0 NC_013790.1 360882 360824 - hmm - 0.51 Methanobrevibacter ruminantium M1 chromosome, complete genome (12) ! 0.00059 21.8 0.0 NC_013790.1 361910 361851 - hmm - 0.38 Methanobrevibacter ruminantium M1 chromosome, complete genome - (13) ! 0.00091 21.2 0.0 NC_013790.1 2350586 2350528 - hmm - 0.58 Methanobrevibacter ruminantium M1 chromosome, complete genome + (13) ! 0.00092 21.2 0.0 NC_013790.1 2350586 2350528 - hmm - 0.58 Methanobrevibacter ruminantium M1 chromosome, complete genome (14) ! 
0.0018 20.3 0.0 NC_013790.1 995341 995267 - hmm - 0.51 Methanobrevibacter ruminantium M1 chromosome, complete genome (15) ! 0.0026 19.7 0.0 NC_013790.1 97728 97788 + hmm - 0.49 Methanobrevibacter ruminantium M1 chromosome, complete genome (16) ! 0.0029 19.6 0.0 NC_013790.1 2186083 2186024 - hmm - 0.50 Methanobrevibacter ruminantium M1 chromosome, complete genome @@ -1622,7 +1626,7 @@ \subsection{Searching a sequence database for RNAs with unknown or no (20) ! 0.0074 18.3 0.0 NC_013790.1 361056 360994 - hmm - 0.40 Methanobrevibacter ruminantium M1 chromosome, complete genome ------ inclusion threshold ------ (21) ? 0.011 17.7 0.0 NC_013790.1 2151679 2151737 + hmm - 0.56 Methanobrevibacter ruminantium M1 chromosome, complete genome - (22) ? 0.019 17.1 0.0 NC_013790.1 2327123 2327043 - hmm - 0.62 Methanobrevibacter ruminantium M1 chromosome, complete genome + (22) ? 0.018 17.1 0.0 NC_013790.1 2327123 2327043 - hmm - 0.62 Methanobrevibacter ruminantium M1 chromosome, complete genome (23) ? 0.023 16.7 0.0 NC_013790.1 360973 360920 - hmm - 0.54 Methanobrevibacter ruminantium M1 chromosome, complete genome (24) ? 0.037 16.1 0.0 NC_013790.1 2350982 2350919 - hmm - 0.50 Methanobrevibacter ruminantium M1 chromosome, complete genome (25) ? 0.039 16.1 0.0 NC_013790.1 361671 361606 - hmm - 0.50 Methanobrevibacter ruminantium M1 chromosome, complete genome @@ -1818,9 +1822,9 @@ \subsection{Specifying and annotating match positions with cmbuild --hand} \begin{sreoutput} # cmbuild :: covariance model construction from multiple sequence alignments -# INFERNAL 1.1.1 (July 2014) -# Copyright (C) 2014 Howard Hughes Medical Institute. -# Freely distributed under the GNU General Public License (GPLv3). +# INFERNAL 1.1.2 (June 2016) +# Copyright (C) 2016 Howard Hughes Medical Institute. +# Freely distributed under a BSD open source license. 
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - # CM file: tRNA5-hand.cm # alignment file: ../../tutorial/tRNA5-hand.sto @@ -1832,7 +1836,7 @@ \subsection{Specifying and annotating match positions with cmbuild --hand} # ------ -------------------- -------- -------- ------ ----- ---- ---- ----- ----- ----------- 1 tRNA5-hand 5 3.59 74 74 21 2 0.763 0.476 # -# CPU time: 0.59u 0.00s 00:00:00.59 Elapsed: 00:00:00.61 +# CPU time: 0.31u 0.00s 00:00:00.31 Elapsed: 00:00:00.32 \end{sreoutput} The output reports that the model now has 74 match (consensus) @@ -1850,6 +1854,7 @@ \subsection{Specifying and annotating match positions with cmbuild --hand} search: \user{cmsearch tutorial/tRNA5-hand.c.cm tutorial/mrum-genome.fa} +% tutorial regression: trna-hand-mrum.cmsearch The results are very similar to the earlier search with the tRNA model built with default \prog{cmbuild} parameters (though not diff --git a/tutorial/tRNA5.c.cm b/tutorial/tRNA5.c.cm index 7e50cdd7..22c28b91 100644 --- a/tutorial/tRNA5.c.cm +++ b/tutorial/tRNA5.c.cm @@ -1,4 +1,4 @@ -INFERNAL1/a [1.1rc1 | June 2012] +INFERNAL1/a [1.1.1 | July 2014] NAME tRNA5 STATES 230 NODES 61 @@ -8,9 +8,9 @@ ALPH RNA RF no CONS yes MAP yes -DATE Tue Jun 12 17:22:37 2012 -COM [1] ./cmbuild tRNA5.cm ../tutorial/tRNA5.sto -COM [2] ./cmcalibrate ./tRNA5.cm +DATE Mon Jun 27 12:35:45 2016 +COM [1] ./src/cmbuild tRNA5.cm tutorial/tRNA5.sto +COM [2] cmcalibrate tRNA5.cm PBEGIN 0.05 PEND 0.05 WBETA 1e-07 @@ -321,7 +321,7 @@ CM [ END 41 ] - - - - - - E 229 228 3 -1 0 0 0 0 0 // -HMMER3/f [i1.1rc1 | June 2012] +HMMER3/f [i1.1.1 | July 2014] NAME tRNA5 LENG 72 MAXL 152 @@ -331,8 +331,8 @@ MM no CONS yes CS yes MAP yes -DATE Tue Jun 12 17:22:38 2012 -COM [1] ./cmbuild tRNA5.cm ../tutorial/tRNA5.sto +DATE Mon Jun 27 12:35:45 2016 +COM [1] ./src/cmbuild tRNA5.cm tutorial/tRNA5.sto NSEQ 5 EFFN 4.575195 CKSUM 3535941971