diff --git a/COPYRIGHT b/COPYRIGHT
new file mode 100644
index 00000000..5908a64b
--- /dev/null
+++ b/COPYRIGHT
@@ -0,0 +1,46 @@
+Infernal - inference of RNA secondary structural alignments
+@INFERNAL_COPYRIGHT@
+------------------------------------------------------------------
+
+Copyright (C) 2013 HHMI Janelia Farm Research Campus
+
+ Portions Copyright (C) 1991-2013 Sean R. Eddy
+ Portions Copyright (C) 2005-2013 Eric P. Nawrocki
+ Portions Copyright (C) 2005-2011 Diana L. Kolbe
+ Portions Copyright (C) 2004 Zasha Weinberg
+ Portions Copyright (C) 1990 Don G. Gilbert
+ Portions Copyright (C) 1995-2006 Washington University in St. Louis
+ Portions Copyright (C) 1992-1995 Medical Research Council, UK
+ Portions Copyright (C) 2004 University of Washington, Seattle
+ Portions Copyright (C) 1986,1993,1995 University of Toronto
+ Portions Copyright (C) 1989-2001 Free Software Foundation
+ Portions Copyright (C) 1991 Massachusetts Institute of Technology
+
+Infernal includes the HMMER software package, which has its own license and
+copyright information. See hmmer/COPYRIGHT and hmmer/LICENSE.
+
+Infernal uses the Easel software library, which has its own license and
+copyright information. See easel/COPYRIGHT and easel/LICENSE.
+
+Infernal is distributed under the terms of the GNU General Public
+License version 3 (GPLv3). See the file LICENSE for details.
+
+This program is free software: you can redistribute it and/or modify
+it under the terms of the GNU General Public License as published by
+the Free Software Foundation, either version 3 of the License, or (at
+your option) any later version.
+
+This program is distributed in the hope that it will be useful,
+but WITHOUT ANY WARRANTY; without even the implied warranty of
+MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+GNU General Public License for more details.
+
+A copy of the GNU General Public License is in the file LICENSE. You
+may also obtain a copy from .
+
+------------------------------------------------------------------
+The Infernal development team
+HHMI Janelia Farm Research Campus
+http://infernal.janelia.org/
+
+
diff --git a/Makefile.in b/Makefile.in
index e59c9929..9135dd89 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -153,10 +153,10 @@ check:
${QUIET_SUBDIR0}${HMMERDIR} ${QUIET_SUBDIR1} tests
${QUIET_SUBDIR0}src ${QUIET_SUBDIR1} tests
${QUIET_SUBDIR0}${ESLDIR} ${QUIET_SUBDIR1} check
- if test -d ${HMMERDIR}/easel; then cp ${ESLDIR}/miniapps/esl-afetch ${HMMERDIR}/easel/miniapps; fi
- if test -d ${HMMERDIR}/easel; then cp ${ESLDIR}/miniapps/esl-reformat ${HMMERDIR}/easel/miniapps; fi
- if test -d ${HMMERDIR}/easel; then cp ${ESLDIR}/miniapps/esl-sfetch ${HMMERDIR}/easel/miniapps; fi
- if test -d ${HMMERDIR}/easel; then cp ${ESLDIR}/miniapps/esl-shuffle ${HMMERDIR}/easel/miniapps; fi
+ if test -d ${HMMERDIR}/easel && ! test -e ${HMMERDIR}/easel/miniapps/esl-afetch; then cp ${ESLDIR}/miniapps/esl-afetch ${HMMERDIR}/easel/miniapps; fi
+ if test -d ${HMMERDIR}/easel && ! test -e ${HMMERDIR}/easel/miniapps/esl-reformat; then cp ${ESLDIR}/miniapps/esl-reformat ${HMMERDIR}/easel/miniapps; fi
+ if test -d ${HMMERDIR}/easel && ! test -e ${HMMERDIR}/easel/miniapps/esl-sfetch; then cp ${ESLDIR}/miniapps/esl-sfetch ${HMMERDIR}/easel/miniapps; fi
+ if test -d ${HMMERDIR}/easel && ! test -e ${HMMERDIR}/easel/miniapps/esl-sfetch; then cp ${ESLDIR}/miniapps/esl-shuffle ${HMMERDIR}/easel/miniapps; fi
if ! test -d ${HMMERDIR}/easel; then cd ${HMMERDIR}; @LN_S@ ../${ESLDIR} .; cd ..; fi
if ! test -d ${srcdir}/${HMMERDIR}/easel; then @LN_S@ ${srcdir}/${ESLDIR} ${srcdir}/${HMMERDIR}/easel; fi
${QUIET_SUBDIR0}${HMMERDIR}/testsuite ${QUIET_SUBDIR1} check
diff --git a/configure.ac b/configure.ac
index 72ddf29d..34e79da1 100644
--- a/configure.ac
+++ b/configure.ac
@@ -23,7 +23,7 @@
# w/ AC_CONFIG_HEADERS -- which means, easel.h.in
#
# SRE, Mon Oct 5 14:55:45 1998
-# SVN $Id$
+# SVN $Id: configure.ac 4708 2014-07-22 13:11:50Z nawrockie $
# xref autoconf macro archive: //www.gnu.org/software/ac-archive/
#
# GNU recommends the following order:
@@ -44,7 +44,7 @@
# Autoconf 2.61 has a bug in AC_FUNC_FSEEKO; don't use it.
AC_PREREQ(2.63)
-AC_INIT(Infernal, 1.1.1, nawrockie@janelia.hhmi.org, infernal)
+AC_INIT(Infernal, 1.1.1x, eric.nawrocki@nih.gov, infernal)
AC_MSG_NOTICE([Configuring Infernal for your system.])
# remember if the user is overriding CFLAGS
@@ -73,26 +73,26 @@ fi
#
################################################################
-INFERNAL_DATE="July 2014"
-INFERNAL_COPYRIGHT="Copyright (C) 2014 Howard Hughes Medical Institute."
-INFERNAL_LICENSE="Freely distributed under the GNU General Public License (GPLv3)."
+INFERNAL_DATE="May 2016"
+INFERNAL_COPYRIGHT="Copyright (C) 2016 Howard Hughes Medical Institute."
+INFERNAL_LICENSE="Freely distributed under a BSD open source license."
INFERNAL_VERSION=$PACKAGE_VERSION
INFERNAL_URL="http://infernal.janelia.org/"
INFERNAL_ESLDIR="easel"
INFERNAL_HMMERDIR="hmmer"
-INFERNAL_SADIR="hmmer/lib/libdivsufsort"
+INFERNAL_SADIR="hmmer/libdivsufsort"
-HMMER_DATE="July 2014"
-HMMER_COPYRIGHT="Copyright (C) 2014 Howard Hughes Medical Institute."
-HMMER_LICENSE="Freely distributed under the GNU General Public License (GPLv3)."
+HMMER_DATE="May 2016"
+HMMER_COPYRIGHT="Copyright (C) 2016 Howard Hughes Medical Institute."
+HMMER_LICENSE="Freely distributed under a BSD open source licence."
HMMER_VERSION=i$PACKAGE_VERSION
HMMER_URL="http://hmmer.org/"
HMMER_ESLDIR="../easel"
-HMMER_SADIR="lib/libdivsufsort"
+HMMER_SADIR="libdivsufsort"
EASEL_DATE="July 2014"
-EASEL_COPYRIGHT="Copyright (C) 2014 HHMI Janelia Farm Research Campus"
-EASEL_LICENSE="Freely distributed under the Janelia Software License."
+EASEL_COPYRIGHT="Copyright (C) 2016 Howard Hughes Medical Institute"
+EASEL_LICENSE="Freely distributed under a BSD open source license."
EASEL_VERSION="i$PACKAGE_VERSION"
EASEL_URL="http://bioeasel.org/"
@@ -697,37 +697,25 @@ AC_CONFIG_FILES([documentation/Makefile])
AC_CONFIG_FILES([documentation/manpages/Makefile])
AC_CONFIG_FILES([documentation/userguide/Makefile])
-# following from HMMER's configure.ac
-#################################################################
-AC_CONFIG_FILES([
- hmmer/documentation/Makefile
- hmmer/documentation/man/Makefile
- hmmer/documentation/userguide/Makefile
- hmmer/src/Makefile
- hmmer/src/Makefile-subdirs.mk
- hmmer/src/base/Makefile
- hmmer/src/build/Makefile
- hmmer/src/daemon/Makefile
- hmmer/src/dp_reference/Makefile
- hmmer/src/dp_sparse/Makefile
- hmmer/src/dp_vector/Makefile
- hmmer/src/experiments/Makefile
- hmmer/src/misc/Makefile
- hmmer/src/programs/Makefile
- hmmer/src/sandbox/Makefile
- hmmer/src/search/Makefile
- hmmer/src/utilities/Makefile
- hmmer/testsuite/Makefile
- hmmer/benchmarks/Makefile
- hmmer/benchmarks/profmark/Makefile
- hmmer/benchmarks/speed/Makefile
- hmmer/lib/libdivsufsort/Makefile
-])
-
-AC_CONFIG_HEADERS([
- hmmer/src/p7_config.h
- hmmer/lib/libdivsufsort/divsufsort.h
-])
+# HMMER files
+AC_CONFIG_FILES([hmmer/documentation/Makefile])
+AC_CONFIG_FILES([hmmer/documentation/man/Makefile])
+AC_CONFIG_FILES([hmmer/documentation/userguide/Makefile])
+AC_CONFIG_FILES([hmmer/src/Makefile hmmer/testsuite/Makefile hmmer/profmark/Makefile])
+AC_CONFIG_FILES([hmmer/src/impl_${impl_choice}/Makefile])
+AC_CONFIG_FILES([hmmer/libdivsufsort/Makefile])
+AC_CONFIG_HEADERS(hmmer/src/p7_config.h)
+AC_CONFIG_HEADERS(hmmer/libdivsufsort/divsufsort.h)
+
+# the following incantation establishes a symlink of
+# src/impl_{whatever} to src/impl in the *build* directory.
+# Testsuite sqc tests rely on it.
+# You can't use AC_CONFIG_LINKS, apparently, because it is only
+# designed to link files, not directories.
+# PORTABILITY: Beware: we're using a naked ln -sf here, which some systems may not support!
+AC_CONFIG_COMMANDS([hmmer/src/impl],
+ [cd hmmer/src; ln -sf impl_${impl_choice} impl],
+ [ac_top_build_prefix=${ac_top_build_prefix}; impl_choice=${impl_choice}])
# Finally, build the top-level HMMER makefile (note: this is *not* built by HMMER's configure)
AC_CONFIG_FILES([hmmer/Makefile])
diff --git a/documentation/userguide/cmbuild.tex b/documentation/userguide/cmbuild.tex
index 6bec73d5..af540187 100644
--- a/documentation/userguide/cmbuild.tex
+++ b/documentation/userguide/cmbuild.tex
@@ -595,3 +595,4 @@ \subsubsection{Saving the model}
\prog{-F} option causes the new model to overwrite an existing
cmfile.
+
diff --git a/documentation/userguide/formats.tex b/documentation/userguide/formats.tex
index dc4459a6..d7fa16b4 100644
--- a/documentation/userguide/formats.tex
+++ b/documentation/userguide/formats.tex
@@ -1331,3 +1331,4 @@ \subsection{Null model file format}
+
diff --git a/documentation/userguide/install.tex b/documentation/userguide/install.tex
index c1258b8b..ed5cdeae 100644
--- a/documentation/userguide/install.tex
+++ b/documentation/userguide/install.tex
@@ -5,7 +5,6 @@ \section{Installation}
\subsection{Quick installation instructions}
Download \prog{infernal-1.1.1.tar.gz} from \url{http://infernal.janelia.org/}, or
-
directly from
\url{ftp://selab.janelia.org/pub/software/infernal/infernal-1.1.1.tar.gz};
unpack it, configure, and make:
diff --git a/documentation/userguide/main.tex b/documentation/userguide/main.tex
index 0a88c130..7b765471 100644
--- a/documentation/userguide/main.tex
+++ b/documentation/userguide/main.tex
@@ -2,6 +2,7 @@
%
% SVN $Id$
+
\documentclass[10pt]{article}
%\usepackage{helvetic}
\usepackage{times}
diff --git a/documentation/userguide/more.tex b/documentation/userguide/more.tex
index f843fde6..b359667f 100644
--- a/documentation/userguide/more.tex
+++ b/documentation/userguide/more.tex
@@ -201,3 +201,4 @@ \subsubsection{Reading from a stdin pipe using - (dash) as a filename argument}
can't read from pipes.
+
diff --git a/documentation/userguide/pipeline.tex b/documentation/userguide/pipeline.tex
index 55cec580..d0d329a9 100644
--- a/documentation/userguide/pipeline.tex
+++ b/documentation/userguide/pipeline.tex
@@ -1437,3 +1437,4 @@ \subsection{HMM-only pipeline variant for models without structure}
turned off with the \ccode{--nohmmonly} option. When turned off, all
models will use the standard and truncated CM pipelines, even those
with no structure.
+
diff --git a/documentation/userguide/tutorial.tex b/documentation/userguide/tutorial.tex
index c1123f18..88da66f7 100644
--- a/documentation/userguide/tutorial.tex
+++ b/documentation/userguide/tutorial.tex
@@ -225,13 +225,13 @@ \subsubsection{Step 1: build a covariance model with cmbuild}
% ^
sequences were only counted as an ``effective'' total sequence number
(\otext{eff\_nseq}) of 3.73. The model includes 21 basepairs and 2
-% ^^^^ ^^ ^
+% ^^^^ ^^ ^
bifurcations. The model ended up with a relative entropy per position
(\otext{rel entropy, CM}; information content) of 0.783 bits. If the
-% ^^^^^
+% ^^^^^
secondary structure information of the model were ignored the relative
entropy per position (\otext{rel entropy, HMM}) would be 0.489 bits.
-% ^^^^^
+% ^^^^^
This output format is rudimentary. Infernal knows quite a bit more
information about what it's done to build this CM, but it's not
displaying it. You don't need to know more to be able to use the
@@ -319,7 +319,7 @@ \subsubsection{Step 2: calibrate the model with cmcalibrate}
model, copy over the \otext{tRNA5.cm} file you just made with the
calibrated version:
-\user{cp tRNA5.c.cm tRNA5.cm}\\
+\user{cp tutorial/tRNA5.c.cm tRNA5.cm}\\
\subsubsection{Step 3: search a sequence database with cmsearch}
@@ -329,7 +329,8 @@ \subsubsection{Step 3: search a sequence database with cmsearch}
\otext{NC\_13790.1}). We'll use this file as our database. To perform
the search:
-\user{cmsearch tRNA5.cm mrum-genome.fa}\\
+\user{cmsearch tRNA5.cm tutorial/mrum-genome.fa}\\
+% tutorial regression: tRNA5-search.out
As before, the first section is the header, telling you what program
your ran, on what, and with what options:
@@ -503,7 +504,6 @@ \subsubsection{Step 3: search a sequence database with cmsearch}
hit number 43. First, take a look at the first four lines for this
hit:
-% tutorial regression: tRNA5-search.out
\begin{sreoutput}
>> NC_013790.1 Methanobrevibacter ruminantium M1 chromosome, complete genome
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
@@ -574,18 +574,19 @@ \subsubsection{Step 3: search a sequence database with cmsearch}
more informative than the simple least-common-denominator format we
used in the input alignment file. It's designed to make it easier to
see the secondary structure by eye. The format is described in detail
-later (see WUSS format in section~\ref{section:formats}); for now, here's all you need to
-know. Basepairs in simple stem loops are annotated with \otext{<>}
-characters. Basepairs enclosing multifurcations (multiple stem loops)
-are annotated with \otext{()}, such as the tRNA acceptor stem in this
-example. In more complicated structures, \otext{[]} and \otext{\{\}}
-annotations also show up, to reflect deeper nestings of
-multifurcations. For single stranded residues, \otext{\_} characters
-mark hairpin loops; \otext{-} characters mark interior loops and
-bulges; \otext{,} characters mark single-stranded residues in
-multifurcation loops; and \otext{:} characters mark single stranded
-residues external to any secondary structure. Insertions relative to
-this consensus are annotated by a \otext{.} character.
+later (see WUSS format in section~\ref{section:formats}); for now,
+here's all you need to know. Basepairs in simple stem loops are
+annotated with \otext{<>} characters. Basepairs enclosing
+multifurcations (multiple stem loops) are annotated with \otext{()},
+such as the tRNA acceptor stem in this example. In more complicated
+structures, \otext{[]} and \otext{\{\}} annotations also show up, to
+reflect deeper nestings of multifurcations. For single stranded
+residues, \otext{\_} characters mark hairpin loops; \otext{-}
+characters mark interior loops and bulges; \otext{,} characters mark
+single-stranded residues in multifurcation loops; and \otext{:}
+characters mark single stranded residues external to any secondary
+structure. Insertions relative to this consensus are annotated by a
+\otext{.} character.
The line above the \otext{CS} line ends with \otext{NC} and marks
negative scoring non-canonical basepairs in the alignment with a
@@ -663,15 +664,15 @@ \subsubsection{Step 3: search a sequence database with cmsearch}
Query model(s): 1 (72 consensus positions)
Target sequences: 1 (5874406 residues searched)
Target sequences re-searched for truncated hits: 1 (360 residues re-searched)
-Windows passing local HMM SSV filter: 11197 (0.2108); expected (0.35)
+Windows passing local HMM SSV filter: 11200 (0.2111); expected (0.35)
Windows passing local HMM Viterbi filter: (off)
Windows passing local HMM Viterbi bias filter: (off)
-Windows passing local HMM Forward filter: 140 (0.002747); expected (0.005)
-Windows passing local HMM Forward bias filter: 139 (0.002728); expected (0.005)
-Windows passing glocal HMM Forward filter: 88 (0.001973); expected (0.005)
-Windows passing glocal HMM Forward bias filter: 88 (0.001973); expected (0.005)
-Envelopes passing glocal HMM envelope defn filter: 101 (0.001358); expected (0.005)
-Envelopes passing local CM CYK filter: 60 (0.0007629); expected (0.0001)
+Windows passing local HMM Forward filter: 137 (0.002691); expected (0.005)
+Windows passing local HMM Forward bias filter: 134 (0.002621); expected (0.005)
+Windows passing glocal HMM Forward filter: 87 (0.001923); expected (0.005)
+Windows passing glocal HMM Forward bias filter: 87 (0.001923); expected (0.005)
+Envelopes passing glocal HMM envelope defn filter: 100 (0.001342); expected (0.005)
+Envelopes passing local CM CYK filter: 60 (0.0007631); expected (0.0001)
Total CM hits reported: 56 (0.0007205); includes 0 truncated hit(s)
# CPU time: 2.15u 0.03s 00:00:02.17 Elapsed: 00:00:00.89
@@ -729,28 +730,29 @@ \subsubsection{Step 3: search a sequence database with cmsearch}
% ^^^^
residues will pass the filter). Here, about 21\% of the database in
% ^^^^
-11,197 separate windows got through the SSV filter. For a database of
+11,200 separate windows got through the SSV filter. For a database of
%^^^^^
this size, the local Viterbi filter is turned off. The local Forward filter
is set to allow an expected 0.5\% of the database survive. Here about
% ^^^^
-0.3\% survives in 140 windows. Next, each surviving window is checked
+0.3\% survives in 137 windows. Next, each surviving window is checked
%^^
to see if the target sequence is ``obviously'' so biased in its
composition that it's unlikely to be a true homolog. This is called
the ``bias filter''\footnote{There's also a bias filter step used in
the local Viterbi filter stage, when it is used.} and applying a bit
score correction to previous filter's score for each window and
-recomputing the P-value. Exactly 1 of the 140 windows fails to pass
-% ^ ^^^
-the local Forward bias filter stage Next, the Forward algorithm is
+recomputing the P-value. Three of the 137 windows fail to pass
+% ^^^^^ ^^^
+the local Forward bias filter stage. Next, the Forward algorithm is
used to score each window again, but this time with the HMM configured
in glocal mode requiring a full length alignment to the
model\footnote{The use of glocal Forward is another important
difference between Infernal and HMMER3's (v3.0) pipeline. HMMER v3.0
only uses local HMM algorithms.} As with the local stage, an
expected 0.5\% of the database is expected to survive. In this case,
-88 of the 139 windows, comprising about 0.2\% of the database,
+87 of the 134 windows, comprising about 0.2\% of the database,
+%^ ^^^
survive. The bias filter is run again, this time applying a correction
to the glocal Forward scores. For this search, 0 windows are removed at
this stage. The envelope definition stage is next. This stage is very
@@ -760,13 +762,13 @@ \subsubsection{Step 3: search a sequence database with cmsearch}
or more hit envelopes in each window, where each envelope contains one
putative hit. Often residues at the beginning and ends of windows are
determined to be nonhomologous and are not included in the
-envelope. In this search, 101 envelopes are defined within the 81
+envelope. In this search, 100 envelopes are defined within the 87
% ^^^ ^^
windows. Note that the envelopes comprise only about 70\% of the
% ^^^
-residues from the 81 windows, indicated by the drop of 0.1973\% to
+residues from the 87 windows, indicated by the drop of 0.1923\% to
% ^^^^^^^^
-0.1358\%.
+0.1342\%.
%%^^^^
After hit envelopes have been defined with the filter HMM, the two
@@ -789,6 +791,7 @@ \subsubsection{Step 3: search a sequence database with cmsearch}
Forward algorithm) is used to define final hit boundaries and
scores. Hits with scores above the reporting threshold were output, as
described above. In this search there were 56 such hits.
+% ^^
Finally, the running time of the search is reported, in CPU time and
elapsed time. This search took about 1 second (wall
@@ -830,7 +833,9 @@ \subsubsection{Truncated RNA detection}
beginning and end of each sequence which must be re-searched for
truncated hits. For our tRNA model the maximum expected length is
90, so exactly 360 residues were re-searched: the first and final 90
+%^ ^^^ ^^
residues on the top strand, and the first and final 90 residues on
+% ^^
the bottom strand.
There is one more aspect of truncated hit detection that is important
@@ -850,9 +855,10 @@ \subsection{Searching a CM database with a query sequence}
known/detectable RNAs in a given sequence. It takes a single query
sequence and a CM database as input. The CM database might be Rfam,
for example, or another collection of your choice. \prog{cmscan} is
-new to version 1.1 of Infernal. It is designed to be useful primarily
-for the analysis metagenomic datasets, in which one wants to know what
-families of RNAs are represented in the sequence dataset.
+new to version 1.1 of Infernal. It is designed to be useful for
+sequence datasets from RNA-Seq experiments or metagenomics surveys,
+for which one wants to know what families of RNAs are represented in
+the sequence dataset.
\subsubsection{Step 1: create an CM database flatfile}
@@ -928,6 +934,7 @@ \subsubsection{Step 3: search the CM database with cmscan}
To scan the sequences with our database:
\user{cmscan minifam.cm tutorial/metag-example.fa}
+%tutorial regression: metag-scan.out
The header and the first section of the output will look like:
@@ -941,7 +948,7 @@ \subsubsection{Step 3: search the CM database with cmscan}
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query sequence file: ../../tutorial/metag-example.fa
# target CM database: minifam.cm
-# number of worker threads: 2
+# number of worker threads: 8
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Query: AAGA01015927.1 [L=943]
@@ -950,8 +957,8 @@ \subsubsection{Step 3: search the CM database with cmscan}
rank E-value score bias modelname start end mdl trunc gc description
---- --------- ------ ----- --------- ------ ------ --- ----- ---- -----------
(1) ! 3.3e-19 77.3 0.0 5S_rRNA 59 174 + cm no 0.66 5S ribosomal RNA
- (2) ! 9.3e-19 62.4 0.0 tRNA5 229 302 + cm no 0.62
- (3) ! 6e-16 53.5 0.0 tRNA5 314 386 + cm no 0.59
+ (2) ! 9.3e-19 62.4 0.0 tRNA5 229 302 + cm no 0.62 -
+ (3) ! 6e-16 53.5 0.0 tRNA5 314 386 + cm no 0.59 -
\end{sreoutput}
\prog{cmscan} has identified three putative RNAs in the first query
@@ -1004,13 +1011,13 @@ \subsubsection{Step 3: search the CM database with cmscan}
Query sequence(s): 1 (1886 residues searched)
Query sequences re-searched for truncated hits: 1 (992.0 residues re-searched, avg per model)
Target model(s): 3 (382 consensus positions)
-Windows passing local HMM SSV filter: 16 (0.331); expected (0.35)
+Windows passing local HMM SSV filter: 16 (0.3323); expected (0.35)
Windows passing local HMM Viterbi filter: (off)
Windows passing local HMM Viterbi bias filter: (off)
-Windows passing local HMM Forward filter: 4 (0.09138); expected (0.02)
-Windows passing local HMM Forward bias filter: 4 (0.09138); expected (0.02)
-Windows passing glocal HMM Forward filter: 3 (0.09138); expected (0.02)
-Windows passing glocal HMM Forward bias filter: 3 (0.09138); expected (0.02)
+Windows passing local HMM Forward filter: 4 (0.09822); expected (0.02)
+Windows passing local HMM Forward bias filter: 4 (0.09822); expected (0.02)
+Windows passing glocal HMM Forward filter: 3 (0.09822); expected (0.02)
+Windows passing glocal HMM Forward bias filter: 3 (0.09822); expected (0.02)
Envelopes passing glocal HMM envelope defn filter: 4 (0.05189); expected (0.02)
Envelopes passing local CM CYK filter: 4 (0.05189); expected (0.0001)
Total CM hits reported: 3 (0.03046); includes 0 truncated hit(s)
@@ -1082,7 +1089,7 @@ \subsubsection{Truncated hit and local end alignment example}
\otext{trunc} column reads \otext{5'} indicating that Infernal
predicts the 5' end (beginning) of this Cobalamin riboswitch is
missing. (Note that this hit is on the bottom (reverse) strand so the
-Cobalamin hit is actually predicted to extend past the 3' end of the
+Cobalamin hit is actually predicted to extend past the end of the
input sequence (past residue 934), but on the opposite strand.) The 5'
end of the alignment indicates this with special annotation: the
\otext{<[31]*} in the model line indicates that the missing sequence
@@ -1139,13 +1146,6 @@ \subsubsection{Truncated hit and local end alignment example}
character.
\subsection{Creating multiple alignments with cmalign}
-% tutorial regression:
-% cmsearch -A mrum-tRNAs.stk tRNA5.cm mrum-genome.fa
-% esl-reformat --rename mrum-tRNA fasta mrum-tRNAs.stk > mrum-tRNAs.fa
-% then manually removed all but first 20 sequences,
-% and sequence descriptions, and converted lowercase to uppercase.
-% Saved a version of this file with mrum-tRNA.20 for an example of
-% local alignment.
The file \otext{tutorial/mrum-tRNAs10.fa} is a FASTA file containing
the 10 of the tRNA hits above the inclusion threshold (with an E-value
less than $0.01$) found by \prog{cmsearch} in our
@@ -1157,6 +1157,7 @@ \subsection{Creating multiple alignments with cmalign}
alignment:
\user{cmalign tRNA5.cm tutorial/mrum-tRNAs10.fa}
+% tutorial regression: mrum-tRNAs10.sto
The output of this is a Stockholm format multiple alignment file:
@@ -1287,6 +1288,7 @@ \subsubsection{cmalign assumes sequences may be truncated}
below so it will fit on the page):
\user{cmalign tutorial/Cobalamin.c.cm tutorial/Cobalamin.fa}
+% tutorial regression: cobalamin-align.sto
\label{cmalign-cobalamin}
\begin{sreoutput}
@@ -1379,6 +1381,7 @@ \subsection{Searching a sequence database for RNAs with unknown or no
To build a structureless tRNA model from this alignment, execute:
\user{cmbuild --noss tRNA5-noss.cm tutorial/tRNA5.sto}
+%tutorial regression: trna-noss-build.out
The \otext{--noss} option is required when the alignment file lacks
structure annotation. By forcing the use of \otext{--noss} we're
@@ -1423,11 +1426,11 @@ \subsection{Searching a sequence database for RNAs with unknown or no
Now, let's repeat the search of the \emph{M. ruminantium} genome but
using our structureless tRNA model:
-\user{cmsearch tRNA5-noss.cm mrum-genome.fa}\\
+\user{cmsearch tRNA5-noss.cm tutorial/mrum-genome.fa}\\
+%tutorial regression: trna-noss-search.out
-This search is faster than the first one (though you may not be able
-to tell), because only profile HMM algorithms are used this time
-around. Take a look at the list of hits:
+This search is faster than the first one because only profile HMM
+algorithms are used this time around. Take a look at the list of hits:
\newpage
@@ -1438,8 +1441,8 @@ \subsection{Searching a sequence database for RNAs with unknown or no
(2) ! 3e-07 32.2 0.0 NC_013790.1 762496 762559 + hmm - 0.64 Methanobrevibacter ruminantium M1 chromosome, complete genome
(3) ! 3e-07 32.2 0.0 NC_013790.1 2041698 2041635 - hmm - 0.64 Methanobrevibacter ruminantium M1 chromosome, complete genome
(4) ! 5e-06 28.3 0.0 NC_013790.1 2585264 2585203 - hmm - 0.56 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (5) ! 6.4e-06 28.0 0.0 NC_013790.1 735143 735198 + hmm - 0.59 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (6) ! 7.1e-06 27.9 0.0 NC_013790.1 2351247 2351188 - hmm - 0.57 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (5) ! 6.1e-06 28.1 0.0 NC_013790.1 735143 735198 + hmm - 0.59 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (6) ! 7e-06 27.9 0.0 NC_013790.1 2351247 2351188 - hmm - 0.57 Methanobrevibacter ruminantium M1 chromosome, complete genome
(7) ! 1.2e-05 27.2 0.0 NC_013790.1 662193 662251 + hmm - 0.64 Methanobrevibacter ruminantium M1 chromosome, complete genome
(8) ! 1.2e-05 27.1 0.0 NC_013790.1 2186009 2185951 - hmm - 0.51 Methanobrevibacter ruminantium M1 chromosome, complete genome
(9) ! 4.2e-05 25.4 0.0 NC_013790.1 2585183 2585118 - hmm - 0.56 Methanobrevibacter ruminantium M1 chromosome, complete genome
@@ -1455,25 +1458,24 @@ \subsection{Searching a sequence database for RNAs with unknown or no
(19) ! 0.0057 18.7 0.0 NC_013790.1 1160527 1160608 + hmm - 0.59 Methanobrevibacter ruminantium M1 chromosome, complete genome
(20) ! 0.0074 18.3 0.0 NC_013790.1 361056 360994 - hmm - 0.40 Methanobrevibacter ruminantium M1 chromosome, complete genome
------ inclusion threshold ------
- (21) ? 0.012 17.7 0.0 NC_013790.1 2151679 2151737 + hmm - 0.56 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (22) ? 0.012 17.7 0.0 NC_013790.1 917729 917784 + hmm - 0.61 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (23) ? 0.019 17.1 0.0 NC_013790.1 2327123 2327043 - hmm - 0.62 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (24) ? 0.023 16.7 0.0 NC_013790.1 360973 360920 - hmm - 0.54 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (25) ? 0.037 16.1 0.0 NC_013790.1 2350982 2350919 - hmm - 0.50 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (26) ? 0.039 16.1 0.0 NC_013790.1 361671 361606 - hmm - 0.50 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (27) ? 0.039 16.0 0.0 NC_013790.1 2680176 2680227 + hmm - 0.62 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (28) ? 0.062 15.4 0.0 NC_013790.1 2585067 2585000 - hmm - 0.59 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (29) ? 0.063 15.4 0.0 NC_013790.1 362793 362738 - hmm - 0.46 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (30) ? 0.063 15.4 0.0 NC_013790.1 1064778 1064857 + hmm - 0.62 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (31) ? 0.072 15.2 0.0 NC_013790.1 2749938 2749884 - hmm - 0.47 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (32) ? 0.073 15.2 0.0 NC_013790.1 2749832 2749778 - hmm - 0.47 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (33) ? 0.12 14.5 0.0 NC_013790.1 361729 361689 - hmm - 0.56 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (34) ? 1.3 11.2 0.0 NC_013790.1 361439 361393 - hmm - 0.49 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (35) ? 4.4 9.6 0.0 NC_013790.1 2583867 2583803 - hmm - 0.49 Methanobrevibacter ruminantium M1 chromosome, complete genome
- (36) ? 6 9.1 0.0 NC_013790.1 546054 546021 - hmm - 0.68 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (21) ? 0.011 17.7 0.0 NC_013790.1 2151679 2151737 + hmm - 0.56 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (22) ? 0.019 17.1 0.0 NC_013790.1 2327123 2327043 - hmm - 0.62 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (23) ? 0.023 16.7 0.0 NC_013790.1 360973 360920 - hmm - 0.54 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (24) ? 0.037 16.1 0.0 NC_013790.1 2350982 2350919 - hmm - 0.50 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (25) ? 0.039 16.1 0.0 NC_013790.1 361671 361606 - hmm - 0.50 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (26) ? 0.039 16.0 0.0 NC_013790.1 2680176 2680227 + hmm - 0.62 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (27) ? 0.062 15.4 0.0 NC_013790.1 2585067 2585000 - hmm - 0.59 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (28) ? 0.063 15.4 0.0 NC_013790.1 362793 362738 - hmm - 0.46 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (29) ? 0.063 15.4 0.0 NC_013790.1 1064778 1064857 + hmm - 0.62 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (30) ? 0.072 15.2 0.0 NC_013790.1 2749938 2749884 - hmm - 0.47 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (31) ? 0.073 15.2 0.0 NC_013790.1 2749832 2749778 - hmm - 0.47 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (32) ? 0.12 14.5 0.0 NC_013790.1 361729 361689 - hmm - 0.56 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (33) ? 1.3 11.2 0.0 NC_013790.1 361439 361393 - hmm - 0.49 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (34) ? 4.4 9.6 0.0 NC_013790.1 2583867 2583803 - hmm - 0.49 Methanobrevibacter ruminantium M1 chromosome, complete genome
+ (35) ? 5.9 9.2 0.0 NC_013790.1 546054 546021 - hmm - 0.68 Methanobrevibacter ruminantium M1 chromosome, complete genome
\end{sreoutput}
-Note that this time we only find 36 hits (20 with E-values less than
+Note that this time we only find 35 hits (20 with E-values less than
the inclusion threshold of 0.01) compared to 56 (with 54 less than
0.01) in the original search with the structure-based CM. The
increased sensitivity in the initial CM search exemplifies the
@@ -1495,16 +1497,16 @@ \subsection{Searching a sequence database for RNAs with unknown or no
at the first alignment:
\begin{sreoutput}
->> gi|288559258|ref|NC_013790.1| Methanobrevibacter ruminantium M1 chromosome, complete genome
+>> NC_013790.1 Methanobrevibacter ruminantium M1 chromosome, complete genome
rank E-value score bias mdl mdl from mdl to seq from seq to acc trunc gc
---- --------- ------ ----- --- -------- -------- ----------- ----------- ---- ----- ----
(1) ! 3.6e-09 38.2 0.0 hmm 5 67 .. 362022 361960 - .. 0.93 - 0.46
- ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: CS
- tRNA5.noss 5 auAUaGcgcAGUGGuAGcGCGgcagcCUgucaagguggAGGUCCuggGUUCGAUUCccaGUgu 67
- uAUaGc+cA+UGG AG+GCG +g CUg ca + AGGU uggGUUCGAUUCcc U+
- gi|288559258|ref|NC_013790.1| 362022 CUAUAGCUCAAUGGCAGAGCGUUUGGCUGACAUCCAAAAGGUUAUGGGUUCGAUUCCCUUUAG 361960
- 689******************************************************988876 PP
+ ::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::: CS
+ tRNA5 5 auAUaGcgcAGUGGuAGcGCGgcagcCUgucaagguggAGGUCCuggGUUCGAUUCccaGUgu 67
+ uAUaGc+cA+UGG AG+GCG +g CUg ca + AGGU uggGUUCGAUUCcc U+
+ NC_013790.1 362022 CUAUAGCUCAAUGGCAGAGCGUUUGGCUGACAUCCAAAAGGUUAUGGGUUCGAUUCCCUUUAG 361960
+ 689******************************************************988876 PP
\end{sreoutput}
First, note that the \otext{mdl} field reads \otext{hmm} instead of
@@ -1649,6 +1651,7 @@ \subsection{Specifying and annotating match positions with cmbuild --hand}
To build the hand-specified model from this alignment, do:
\user{cmbuild --hand tRNA5-hand.cm tutorial/tRNA5-hand.sto}
+%tutorial regression: trna-hand-build.out
\begin{sreoutput}
# cmbuild :: covariance model construction from multiple sequence alignments
diff --git a/rmark/rmark-create.c b/rmark/rmark-create.c
index 64571627..71942c0b 100644
--- a/rmark/rmark-create.c
+++ b/rmark/rmark-create.c
@@ -48,13 +48,13 @@
#include "esl_msa.h"
#include "esl_msafile.h"
#include "esl_msacluster.h"
+#include "esl_msaweight.h"
#include "esl_random.h"
#include "esl_randomseq.h"
#include "esl_sq.h"
#include "esl_sqio.h"
#include "esl_stack.h"
#include "esl_vectorops.h"
-#include "esl_msaweight.h"
static char banner[] = "construct a rmark benchmark profile training/test set";
static char usage1[] = "[options] ";
@@ -72,12 +72,10 @@ static ESL_OPTIONS options[] = {
{ "-F", eslARG_REAL, "0.70", NULL,"00", NULL,NULL,NULL, "full length of benchmark seqs prior to test seq embedding", 1 },
- { "-C", eslARG_INT, "1000",NULL,"n>0", NULL,NULL,"--iid", "length of seqs to extract/shuffle when making test seqs", 1 },
+ { "-C", eslARG_INT, "1000",NULL,"n>0", NULL,NULL,"--iid", "length of of seqs to extract and shuffle when making test seqs", 1 },
{ "-X", eslARG_REAL, "0.05", NULL,"00", NULL,NULL,NULL, "minimum number of training seqs per family", 1 },
{ "-E", eslARG_INT, "1", NULL,"n>0", NULL,NULL,NULL, "minimum number of test seqs per family", 1 },
- { "--maxtrain", eslARG_INT, FALSE, NULL, NULL, NULL, NULL, NULL, "maximum # of test domains taken per input MSA", 1 },
- { "--maxtest", eslARG_INT, FALSE, NULL, NULL, NULL, NULL, NULL, "maximum # of training domains taken per input MSA", 1 },
/* Options controlling negative segment randomization method */
{ "--iid", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-S", "generate random iid sequence for negatives", 2 },
@@ -113,9 +111,6 @@ struct cfg_s {
double idthresh2; /* fractional identity threshold for selecting test seqs */
int min_ntrain; /* minimum number of sequences in the training set */
int min_ntest; /* minimum number of sequences in the test set */
- int max_ntrain; /* maximum number of test domains per input alignment; 0=unlimited */
- int max_ntest; /* maximum number of test domains per input alignment; 0=unlimited */
-
FILE *out_msafp; /* output: training MSAs */
FILE *out_bmkfp; /* output: benchmark sequences */
@@ -139,8 +134,8 @@ struct cfg_s {
static int process_dbfile (struct cfg_s *cfg, char *dbfile, int dbfmt);
static int remove_fragments (struct cfg_s *cfg, ESL_MSA *msa, ESL_MSA **ret_filteredmsa, int *ret_nfrags);
static int separate_sets (struct cfg_s *cfg, ESL_MSA *msa, int **ret_i_am_train, int **ret_i_am_test);
-static int find_sets_greedily (struct cfg_s *cfg, ESL_MSA *msa, int do_xtest, int **ret_i_am_train, int **ret_i_am_test);
-static int find_sets_by_sampling(struct cfg_s *cfg, ESL_MSA *msa, int nsamples, int do_xtest, int **ret_i_am_train, int **ret_i_am_test);
+static int find_sets_greedily (struct cfg_s *cfg, ESL_MSA *msa, int do_maxtest, int **ret_i_am_train, int **ret_i_am_test);
+static int find_sets_by_sampling(struct cfg_s *cfg, ESL_MSA *msa, int nsamples, int do_maxtest, int **ret_i_am_train, int **ret_i_am_test);
static int synthesize_negatives_and_embed_positives(ESL_GETOPTS *go, struct cfg_s *cfg, ESL_SQ **posseqs, int npos);
static int set_random_segment (ESL_GETOPTS *go, struct cfg_s *cfg, FILE *logfp, ESL_DSQ *dsq, int L);
static void read_hmmfile(char *filename, ESL_HMM **ret_hmm);
@@ -193,10 +188,11 @@ main(int argc, char **argv)
ESL_MSA *origmsa = NULL; /* one multiple sequence alignment */
ESL_MSA *msa = NULL; /* MSA after frags are removed */
ESL_MSA *trainmsa= NULL; /* training set, aligned */
+ char *tmpstr = NULL; /* #=RF annotation line */
+ ESL_SQ *train_consensus = NULL;
ESL_MSA *tmpmsa= NULL; /* tmp aligned training/testing set, used if --tfile */
int *i_am_train = NULL; /* [0..msa->nseq-1]: 1 if train seq, 0 if not */
int *i_am_test = NULL; /* [0..msa->nseq-1]: 1 if test seq, 0 if not */
- int status; /* easel return code */
int nfrags; /* # of fragments removed */
int ntestseq; /* # of test sequences for cur fam */
int ntrainseq; /* # of train sequences for cur fam */
@@ -206,12 +202,11 @@ main(int argc, char **argv)
ESL_SQ **posseqs=NULL; /* all the test seqs, to be embedded */
int64_t poslen_total; /* total length of all positive seqs */
double avgid;
+ double pctid;
void *ptr;
int i, traini, testi;
- double pctid;
- ESL_SQ *train_consensus;
- char *tmpstr;
-
+ int status; /* easel return code */
+
/* Parse command line */
go = esl_getopts_Create(options);
if (esl_opt_ProcessCmdline(go, argc, argv) != eslOK) cmdline_failure(argv[0], "Failed to parse command line: %s\n", go->errbuf);
@@ -250,11 +245,7 @@ main(int argc, char **argv)
cfg.nneg = esl_opt_GetInteger(go, "-N");
cfg.negL = esl_opt_GetInteger(go, "-L");
cfg.negchunkL = esl_opt_GetInteger(go, "-C");
- cfg.max_ntest = (esl_opt_IsOn(go, "--maxtest") ? esl_opt_GetInteger(go, "--maxtest") : 0);
- cfg.max_ntrain = (esl_opt_IsOn(go, "--maxtrain") ? esl_opt_GetInteger(go, "--maxtrain") : 0);
- if (cfg.max_ntest>0 && cfg.max_ntest < cfg.min_ntest) esl_fatal("Conflict between -E and --maxtest");
- if (cfg.max_ntrain>0 && cfg.max_ntrain < cfg.min_ntrain) esl_fatal("Conflict between -R and --maxtrain");
/* Open the output files */
if (snprintf(outfile, 256, "%s.msa", basename) >= 256) esl_fatal("Failed to construct output MSA file name");
if ((cfg.out_msafp = fopen(outfile, "w")) == NULL) esl_fatal("Failed to open MSA output file %s\n", outfile);
@@ -318,61 +309,38 @@ main(int argc, char **argv)
* - more than cfg->idthresh2 similar to >=1 sequences in the test set
*/
if(! esl_opt_GetBoolean(go, "--skip")) {
- separate_sets (&cfg, msa, &i_am_train, &i_am_test);
- ntrainseq = esl_vec_ISum(i_am_train, msa->nseq);
- ntestseq = esl_vec_ISum(i_am_test, msa->nseq);
+ separate_sets (&cfg, msa, &i_am_train, &i_am_test);
+ ntrainseq = esl_vec_ISum(i_am_train, msa->nseq);
+ ntestseq = esl_vec_ISum(i_am_test, msa->nseq);
}
else { /* --skip enabled, we skipped test 1 */
- ntestseq = ntrainseq = 0;
+ ntestseq = ntrainseq = 0;
}
/* if --sub or --sample, check if we should look for subsets of
* the seqs in the msa that satisfy our thresholds */
if((esl_opt_IsOn(go, "--sub") || esl_opt_IsOn(go, "--sample")) &&
- ((ntestseq < cfg.min_ntest) || (ntrainseq < cfg.min_ntrain)))
- { /* Test 2: Is there a subset of the sequences in the msa
- * that would satisfy our thresholds (most similar
- * train/test pair < cfg->idthresh1, and most
- * similar test/test pair < cfg->idthresh2) and that
- * include a sufficient number of train/test seqs.
- *
- * We either use a greedy deterministic algorithm (by
- * default) to look for these subsets, or we use a sampling
- * algorithm (non-deterministic, enabled with --sample).
- */
- if(esl_opt_IsOn(go, "--sub")) find_sets_greedily (&cfg, msa, esl_opt_GetBoolean(go, "--xtest"), &i_am_train, &i_am_test);
- else find_sets_by_sampling(&cfg, msa, esl_opt_GetInteger(go, "--sample"), esl_opt_GetBoolean(go, "--xtest"), &i_am_train, &i_am_test);
- ntrainseq = esl_vec_ISum(i_am_train, msa->nseq);
- ntestseq = esl_vec_ISum(i_am_test, msa->nseq);
- /* we did find a satisfactory set (find_sets() checks that
- * we have a sufficient number of test/train seqs, but
- * check, to be sure */
- if ((ntestseq < cfg.min_ntest) || (ntrainseq < cfg.min_ntrain))
- esl_fatal("find_sets() returned insufficient train/test sets!");
- }
-
-
-
- //Filter both lists down as necessary.
- //Until reaching the right #, randomly find a TRUE entry in i_am_test, and set it to FALSE.
- while (cfg.max_ntest > 0 && ntestseq > cfg.max_ntest) {
- i = esl_rnd_Roll(cfg.r, msa->nseq);
- if (i_am_test[i] == TRUE) {
- i_am_test[i] = FALSE;
- ntestseq--;
- }
- }
-
- //Until reaching the right #, randomly find a TRUE entry in i_am_train, and set it to FALSE.
- while (cfg.max_ntrain > 0 && ntrainseq > cfg.max_ntrain) {
- i = esl_rnd_Roll(cfg.r, msa->nseq);
- if (i_am_train[i] == TRUE) {
- i_am_train[i] = FALSE;
- ntrainseq--;
- }
- }
-
-
+ ((ntestseq < cfg.min_ntest) || (ntrainseq < cfg.min_ntrain)))
+ { /* Test 2: Is there a subset of the sequences in the msa
+ * that would satisfy our thresholds (most similar
+ * train/test pair < cfg->idthresh1, and most
+ * similar test/test pair < cfg->idthresh2) and that
+ * include a sufficient number of train/test seqs.
+ *
+ * We either use a greedy deterministic algorithm (by
+ * default) to look for these subsets, or we use a sampling
+ * algorithm (non-deterministic, enabled with --sample).
+ */
+ if(esl_opt_IsOn(go, "--sub")) find_sets_greedily (&cfg, msa, esl_opt_GetBoolean(go, "--xtest"), &i_am_train, &i_am_test);
+ else find_sets_by_sampling(&cfg, msa, esl_opt_GetInteger(go, "--sample"), esl_opt_GetBoolean(go, "--xtest"), &i_am_train, &i_am_test);
+ ntrainseq = esl_vec_ISum(i_am_train, msa->nseq);
+ ntestseq = esl_vec_ISum(i_am_test, msa->nseq);
+ /* we did find a satisfactory set (find_sets() checks that
+ * we have a sufficient number of test/train seqs, but
+ * check, to be sure */
+ if ((ntestseq < cfg.min_ntest) || (ntrainseq < cfg.min_ntrain))
+ esl_fatal("find_sets() returned insufficient train/test sets!");
+ }
if ((ntestseq >= cfg.min_ntest) && (ntrainseq >= cfg.min_ntrain)) {
/* We have a valid train/test set, that either satisfied
@@ -498,10 +466,6 @@ main(int argc, char **argv)
esl_alphabet_Destroy(cfg.abc);
esl_msafile_Close(afp);
esl_getopts_Destroy(go);
-
- if (train_consensus != NULL) esl_sq_Destroy(train_consensus);
- if (tmpstr != NULL) free(tmpstr);
-
return 0;
ERROR:
@@ -875,7 +839,7 @@ find_sets_greedily(struct cfg_s *cfg, ESL_MSA *msa, int do_xtest, int **ret_i_am
* instead.
*/
static int
-find_sets_by_sampling(struct cfg_s *cfg, ESL_MSA *msa, int nsamples, int do_xtest, int **ret_i_am_train, int **ret_i_am_test)
+find_sets_by_sampling(struct cfg_s *cfg, ESL_MSA *msa, int nsamples, int do_maxtest, int **ret_i_am_train, int **ret_i_am_test)
{
ESL_MSA *test_msa = NULL;
int *i_am_test = NULL;
@@ -1007,8 +971,8 @@ find_sets_by_sampling(struct cfg_s *cfg, ESL_MSA *msa, int nsamples, int do_xtes
* OR maximum of |test| (enabled with --maxtest)
*/
- if(( do_xtest && (ntest > nbest_test)) ||
- (! do_xtest && ((ntrain+ntest) > (nbest_train+nbest_test)))) {
+ if(( do_maxtest && (ntest > nbest_test)) ||
+ (! do_maxtest && ((ntrain+ntest) > (nbest_train+nbest_test)))) {
esl_vec_ICopy(i_am_train, msa->nseq, i_am_best_train);
esl_vec_ICopy(i_am_test, msa->nseq, i_am_best_test);
nbest_train = ntrain;
diff --git a/src/cm.c b/src/cm.c
index ff186e1e..745e65fb 100644
--- a/src/cm.c
+++ b/src/cm.c
@@ -3507,7 +3507,7 @@ cm_p7_oprofile_CreateBlock(int count)
block->clan_idxA = NULL;
ESL_ALLOC(block->list, sizeof(P7_OPROFILE *) * count);
- ESL_ALLOC(block->msvdataA, sizeof(P7_MSVDATA *) * count);
+ ESL_ALLOC(block->msvdataA, sizeof(P7_SCOREDATA *)* count);
ESL_ALLOC(block->cm_offsetA, sizeof(off_t) * count);
ESL_ALLOC(block->cm_clenA, sizeof(int) * count);
ESL_ALLOC(block->cm_WA, sizeof(int) * count);
@@ -3571,7 +3571,7 @@ cm_p7_oprofile_DestroyBlock(CM_P7_OM_BLOCK *block)
}
if (block->msvdataA != NULL) {
for (i = 0; i < block->listSize; ++i) {
- if (block->msvdataA[i] != NULL) p7_hmm_MSVDataDestroy(block->msvdataA[i]);
+ if (block->msvdataA[i] != NULL) p7_hmm_ScoreDataDestroy(block->msvdataA[i]);
}
free(block->msvdataA);
}
diff --git a/src/cm_file.c b/src/cm_file.c
index 3eea17d3..bb420186 100644
--- a/src/cm_file.c
+++ b/src/cm_file.c
@@ -606,7 +606,7 @@ cm_file_WriteASCII(FILE *fp, int format, CM_t *cm)
if (format == -1) format = CM_FILE_1a;
- if (format == CM_FILE_1a) fprintf(fp, "INFERNAL1/a [%s | %s]\n", INFERNAL_VERSION, INFERNAL_DATE);
+ if (format == CM_FILE_1a) fprintf(fp, "INFERNAL1/a [%s | %s]\n", INFERNAL_VERSION, INFERNAL_DATE);
else ESL_EXCEPTION(eslEINVAL, "invalid CM file format code");
fprintf(fp, "NAME %s\n", cm->name);
@@ -645,18 +645,28 @@ cm_file_WriteASCII(FILE *fp, int format, CM_t *cm)
}
if (cm->flags & CMH_EXPTAIL_STATS)
{
- fprintf(fp, "ECMLC %.5f %10.5f %10.5f %10.0f %10d %.6f\n",
+ /* make sure our dbsize values can be cast to a long reliably,
+ * even on 32 bit systems (<= 2 Gb), they certainly should be,
+ * max value in cmcalibrate is 160Mb. (This is related to bug
+ * i31.)
+ */
+ if(cm->expA[EXP_CM_LC]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_LC");
+ if(cm->expA[EXP_CM_GC]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_GC");
+ if(cm->expA[EXP_CM_LI]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_LI");
+ if(cm->expA[EXP_CM_GI]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big) EXP_CM_GI");
+
+ fprintf(fp, "ECMLC %.5f %10.5f %10.5f %10ld %10d %.6f\n",
cm->expA[EXP_CM_LC]->lambda, cm->expA[EXP_CM_LC]->mu_extrap, cm->expA[EXP_CM_LC]->mu_orig,
- cm->expA[EXP_CM_LC]->dbsize, cm->expA[EXP_CM_LC]->nrandhits, cm->expA[EXP_CM_LC]->tailp);
- fprintf(fp, "ECMGC %.5f %10.5f %10.5f %10.0f %10d %.6f\n",
+ (long) (cm->expA[EXP_CM_LC]->dbsize + 0.5), cm->expA[EXP_CM_LC]->nrandhits, cm->expA[EXP_CM_LC]->tailp);
+ fprintf(fp, "ECMGC %.5f %10.5f %10.5f %10ld %10d %.6f\n",
cm->expA[EXP_CM_GC]->lambda, cm->expA[EXP_CM_GC]->mu_extrap, cm->expA[EXP_CM_GC]->mu_orig,
- cm->expA[EXP_CM_GC]->dbsize, cm->expA[EXP_CM_GC]->nrandhits, cm->expA[EXP_CM_GC]->tailp);
- fprintf(fp, "ECMLI %.5f %10.5f %10.5f %10.0f %10d %.6f\n",
+ (long) (cm->expA[EXP_CM_GC]->dbsize + 0.5), cm->expA[EXP_CM_GC]->nrandhits, cm->expA[EXP_CM_GC]->tailp);
+ fprintf(fp, "ECMLI %.5f %10.5f %10.5f %10ld %10d %.6f\n",
cm->expA[EXP_CM_LI]->lambda, cm->expA[EXP_CM_LI]->mu_extrap, cm->expA[EXP_CM_LI]->mu_orig,
- cm->expA[EXP_CM_LI]->dbsize, cm->expA[EXP_CM_LI]->nrandhits, cm->expA[EXP_CM_LI]->tailp);
- fprintf(fp, "ECMGI %.5f %10.5f %10.5f %10.0f %10d %.6f\n",
+ (long) (cm->expA[EXP_CM_LI]->dbsize + 0.5), cm->expA[EXP_CM_LI]->nrandhits, cm->expA[EXP_CM_LI]->tailp);
+ fprintf(fp, "ECMGI %.5f %10.5f %10.5f %10ld %10d %.6f\n",
cm->expA[EXP_CM_GI]->lambda, cm->expA[EXP_CM_GI]->mu_extrap, cm->expA[EXP_CM_GI]->mu_orig,
- cm->expA[EXP_CM_GI]->dbsize, cm->expA[EXP_CM_GI]->nrandhits, cm->expA[EXP_CM_GI]->tailp);
+ (long) (cm->expA[EXP_CM_GI]->dbsize + 0.5), cm->expA[EXP_CM_GI]->nrandhits, cm->expA[EXP_CM_GI]->tailp);
}
/* main model section */
@@ -855,11 +865,19 @@ cm_file_WriteBinary(FILE *fp, int format, CM_t *cm, off_t *opt_fp7_offset)
if (fwrite((char *) &(cm->fp7_evparam[CM_p7_GFLAMBDA]), sizeof(float), 1, fp) != 1) return eslFAIL;
}
if (cm->flags & CMH_EXPTAIL_STATS) {
+ long dbsize_long;
for(z = 0; z < EXP_NMODES; z++) {
+ /* make sure our dbsize values can be cast to a long reliably,
+ * even on 32 bit systems (<= 2 Gb), they certainly should be,
+ * max value in cmcalibrate is 160Mb. (This is related to bug
+ * i31.)
+ */
+ if(cm->expA[z]->dbsize > (2000. * 1000000.)) ESL_EXCEPTION(eslEINVAL, "invalid dbsize (too big)");
+ dbsize_long = (long) cm->expA[z]->dbsize + 0.5;
if (fwrite((char *) &(cm->expA[z]->lambda), sizeof(double), 1, fp) != 1) return eslFAIL;
if (fwrite((char *) &(cm->expA[z]->mu_extrap), sizeof(double), 1, fp) != 1) return eslFAIL;
if (fwrite((char *) &(cm->expA[z]->mu_orig), sizeof(double), 1, fp) != 1) return eslFAIL;
- if (fwrite((char *) &(cm->expA[z]->dbsize), sizeof(double), 1, fp) != 1) return eslFAIL;
+ if (fwrite((char *) &(dbsize_long), sizeof(long), 1, fp) != 1) return eslFAIL;
if (fwrite((char *) &(cm->expA[z]->nrandhits), sizeof(int), 1, fp) != 1) return eslFAIL;
if (fwrite((char *) &(cm->expA[z]->tailp), sizeof(double), 1, fp) != 1) return eslFAIL;
}
@@ -1454,7 +1472,7 @@ cm_p7_oprofile_ReadBlockMSV(CM_FILE *cmfp, int64_t idx0, ESL_ALPHABET **byp_abc,
}
/* create the MSV data */
for (i = 0; i < hmmBlock->count; ++i) {
- hmmBlock->msvdataA[i] = p7_hmm_MSVDataCreate(hmmBlock->list[i], FALSE);
+ hmmBlock->msvdataA[i] = p7_hmm_ScoreDataCreate(hmmBlock->list[i], FALSE);
}
/* initiate the clan idx data, caller will need to fill this if nec */
for (i = 0; i < hmmBlock->count; ++i) {
@@ -1772,7 +1790,7 @@ read_asc_1p1_cm(CM_FILE *cmfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_
cm->expA[exp_mode]->lambda = atof(tok1);
cm->expA[exp_mode]->mu_extrap = atof(tok2);
cm->expA[exp_mode]->mu_orig = atof(tok3);
- cm->expA[exp_mode]->dbsize = atof(tok4);
+ cm->expA[exp_mode]->dbsize = atof(tok4); /* store as double, even though it was written as a long */
cm->expA[exp_mode]->nrandhits = atoi(tok5);
cm->expA[exp_mode]->tailp = atof(tok6);
cm->expA[exp_mode]->is_valid = TRUE;
@@ -2232,13 +2250,15 @@ read_bin_1p1_cm(CM_FILE *cmfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_
if (! fread((char *) &tmp_fp7_gflambda, sizeof(float), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read additional P7 E-value stats");
}
if (cm->flags & CMH_EXPTAIL_STATS) {
+ long dbsize_long;
ESL_ALLOC(cm->expA, sizeof(ExpInfo_t *) * EXP_NMODES);
for(x = 0; x < EXP_NMODES; x++) {
cm->expA[x] = CreateExpInfo();
if (! fread((char *) &(cm->expA[x]->lambda), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
if (! fread((char *) &(cm->expA[x]->mu_extrap), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
if (! fread((char *) &(cm->expA[x]->mu_orig), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
- if (! fread((char *) &(cm->expA[x]->dbsize), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
+ if (! fread((char *) &(dbsize_long), sizeof(long), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
+ cm->expA[x]->dbsize = (double) dbsize_long;
if (! fread((char *) &(cm->expA[x]->nrandhits), sizeof(int), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
if (! fread((char *) &(cm->expA[x]->tailp), sizeof(double), 1, cmfp->f)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "failed to read CM E-value stats");
}
@@ -2551,7 +2571,7 @@ read_asc_1p0_cm(CM_FILE *cmfp, int read_fp7, ESL_ALPHABET **ret_abc, CM_t **opt_
if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No dbsize read on E-xx line");
if (! is_integer(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "dbsize is not integer on E-xx line");
- cm->expA[exp_mode]->dbsize = atof(tok);
+ cm->expA[exp_mode]->dbsize = atof(tok); /* read it as a double, even though it's written as a long */
if ((status = esl_strtok_adv(&s, " \t\n", &tok, &toklen, NULL)) != eslOK) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "No nrandhits read on E-xx line");
if (! is_integer(tok)) ESL_XFAIL(eslEFORMAT, cmfp->errbuf, "nrandhits is not integer on E-xx line");
diff --git a/src/cm_mx.c b/src/cm_mx.c
index 2e3e362f..0d71c0ee 100644
--- a/src/cm_mx.c
+++ b/src/cm_mx.c
@@ -1884,7 +1884,7 @@ cm_shadow_mx_Create(CM_t *cm)
/* level 3: matrix cell memory, when creating only allocate 1 cell per state, for j = 0, d = 0 */
ESL_ALLOC(mx->yshadow_mem, (sizeof(char) * (M-B) * (allocL) * (allocW)));
- ESL_ALLOC(mx->kshadow_mem, (sizeof(int) * (B) * (allocL) * (allocW)));
+ ESL_ALLOC(mx->kshadow_mem, sizeof(int) * ESL_MAX(1, B * allocL * allocW)); // avoid 0 malloc
b = 0;
for (v = 0; v < M; v++) {
@@ -2266,13 +2266,13 @@ cm_tr_shadow_mx_Create(CM_t *cm)
ESL_ALLOC(mx->Lyshadow_mem, (sizeof(char) * (M-B) * (allocL) * (allocW)));
ESL_ALLOC(mx->Ryshadow_mem, (sizeof(char) * (M-B) * (allocL) * (allocW)));
- ESL_ALLOC(mx->Jkshadow_mem, (sizeof(int) * (B) * (allocL) * (allocW)));
- ESL_ALLOC(mx->Lkshadow_mem, (sizeof(int) * (B) * (allocL) * (allocW)));
- ESL_ALLOC(mx->Rkshadow_mem, (sizeof(int) * (B) * (allocL) * (allocW)));
- ESL_ALLOC(mx->Tkshadow_mem, (sizeof(int) * (B) * (allocL) * (allocW)));
+ ESL_ALLOC(mx->Jkshadow_mem, sizeof(int) * ESL_MAX(1, B * allocL * allocW)); // avoid 0 malloc
+ ESL_ALLOC(mx->Lkshadow_mem, sizeof(int) * ESL_MAX(1, B * allocL * allocW));
+ ESL_ALLOC(mx->Rkshadow_mem, sizeof(int) * ESL_MAX(1, B * allocL * allocW));
+ ESL_ALLOC(mx->Tkshadow_mem, sizeof(int) * ESL_MAX(1, B * allocL * allocW));
- ESL_ALLOC(mx->Lkmode_mem, (sizeof(char) * (B) * (allocL) * (allocW)));
- ESL_ALLOC(mx->Rkmode_mem, (sizeof(char) * (B) * (allocL) * (allocW)));
+ ESL_ALLOC(mx->Lkmode_mem, sizeof(char) * ESL_MAX(1, B * allocL * allocW));
+ ESL_ALLOC(mx->Rkmode_mem, sizeof(char) * ESL_MAX(1, B * allocL * allocW));
b = 0;
for (v = 0; v < M; v++) {
@@ -2886,7 +2886,7 @@ cm_hb_shadow_mx_Create(CM_t *cm)
ESL_ALLOC(mx->nrowsA, sizeof(int) * M);
ESL_ALLOC(mx->yshadow_mem, (sizeof(char) * (M-nbifs)));
- ESL_ALLOC(mx->kshadow_mem, (sizeof(int) * (nbifs)));
+ ESL_ALLOC(mx->kshadow_mem, (sizeof(int) * ESL_MAX(1,nbifs))); // avoid 0 malloc
nb = 0;
for (v = 0; v < M; v++) {
@@ -3310,13 +3310,13 @@ cm_tr_hb_shadow_mx_Create(CM_t *cm)
ESL_ALLOC(mx->Lyshadow_mem, (sizeof(char) * (M-B) * (allocL) * (allocW)));
ESL_ALLOC(mx->Ryshadow_mem, (sizeof(char) * (M-B) * (allocL) * (allocW)));
- ESL_ALLOC(mx->Jkshadow_mem, (sizeof(int) * (B) * (allocL) * (allocW)));
- ESL_ALLOC(mx->Lkshadow_mem, (sizeof(int) * (B) * (allocL) * (allocW)));
- ESL_ALLOC(mx->Rkshadow_mem, (sizeof(int) * (B) * (allocL) * (allocW)));
- ESL_ALLOC(mx->Tkshadow_mem, (sizeof(int) * (B) * (allocL) * (allocW)));
+ ESL_ALLOC(mx->Jkshadow_mem, sizeof(int) * ESL_MAX(1, B * allocL * allocW));
+ ESL_ALLOC(mx->Lkshadow_mem, sizeof(int) * ESL_MAX(1, B * allocL * allocW));
+ ESL_ALLOC(mx->Rkshadow_mem, sizeof(int) * ESL_MAX(1, B * allocL * allocW));
+ ESL_ALLOC(mx->Tkshadow_mem, sizeof(int) * ESL_MAX(1, B * allocL * allocW));
- ESL_ALLOC(mx->Lkmode_mem, (sizeof(char) * (B) * (allocL) * (allocW)));
- ESL_ALLOC(mx->Rkmode_mem, (sizeof(char) * (B) * (allocL) * (allocW)));
+ ESL_ALLOC(mx->Lkmode_mem, sizeof(char) * ESL_MAX(1, B * allocL * allocW));
+ ESL_ALLOC(mx->Rkmode_mem, sizeof(char) * ESL_MAX(1, B * allocL * allocW));
ESL_ALLOC(mx->JnrowsA, sizeof(int) * M);
ESL_ALLOC(mx->LnrowsA, sizeof(int) * M);
@@ -5907,7 +5907,7 @@ cm_scan_mx_floatize(CM_t *cm, CM_SCAN_MX *smx, char *errbuf)
smx->ncells_alpha_begl = (smx->W+1);
smx->ncells_alpha_begl *= n_begl;
smx->ncells_alpha_begl *= (smx->W+1);
- ESL_ALLOC(smx->falpha_begl_mem, sizeof(float) * (smx->ncells_alpha_begl));
+ ESL_ALLOC(smx->falpha_begl_mem, sizeof(float) * ESL_MAX(1, smx->ncells_alpha_begl));
/* we used to define ncells_alpha_begl this way:
* smx->ncells_alpha_begl = (smx->W+1) * n_begl * (smx->W+1);
* but that overflows for large models (even though ncells_alpha_begl is an int64_t, I guess
@@ -6003,7 +6003,7 @@ cm_scan_mx_integerize(CM_t *cm, CM_SCAN_MX *smx, char *errbuf)
smx->ncells_alpha_begl = (smx->W+1);
smx->ncells_alpha_begl *= n_begl;
smx->ncells_alpha_begl *= (smx->W+1);
- ESL_ALLOC(smx->ialpha_begl_mem, sizeof(int) * (smx->ncells_alpha_begl));
+ ESL_ALLOC(smx->ialpha_begl_mem, sizeof(int) * ESL_MAX(1, smx->ncells_alpha_begl));
/* we used to define ncells_alpha_begl this way:
* smx->ncells_alpha_begl = (smx->W+1) * n_begl * (smx->W+1);
* but that overflows for large models (even though ncells_alpha_begl is an int64_t, I guess
diff --git a/src/cm_pipeline.c b/src/cm_pipeline.c
index 0d0b15e2..73002a44 100644
--- a/src/cm_pipeline.c
+++ b/src/cm_pipeline.c
@@ -26,7 +26,7 @@
#define DEBUG_NOW 0
-static int pli_p7_filter (CM_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, P7_MSVDATA *msvdata, const ESL_SQ *sq, int64_t **ret_ws, int64_t **ret_we, float **ret_wb, int *ret_nwin);
+static int pli_p7_filter (CM_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, P7_SCOREDATA *msvdata, const ESL_SQ *sq, int64_t **ret_ws, int64_t **ret_we, float **ret_wb, int *ret_nwin);
static int pli_p7_env_def (CM_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, const ESL_SQ *sq, int64_t *ws, int64_t *we, int nwin, P7_HMM **opt_hmm, P7_PROFILE **opt_gm,
P7_PROFILE **opt_Rgm, P7_PROFILE **opt_Lgm, P7_PROFILE **opt_Tgm, int64_t **ret_es, int64_t **ret_ee, float **ret_eb, int *ret_nenv);
static int pli_cyk_env_filter (CM_PIPELINE *pli, off_t cm_offset, const ESL_SQ *sq, int64_t *p7es, int64_t *p7ee, int np7env, CM_t **opt_cm, int64_t **ret_es, int64_t **ret_ee, int *ret_nenv);
@@ -1265,7 +1265,7 @@ cm_pipeline_Merge(CM_PIPELINE *p1, CM_PIPELINE *p2)
* Xref: J4/25.
*/
int
-cm_Pipeline(CM_PIPELINE *pli, off_t cm_offset, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, P7_MSVDATA *msvdata, ESL_SQ *sq, CM_TOPHITS *hitlist, int in_rc, P7_HMM **opt_hmm, P7_PROFILE **opt_gm, P7_PROFILE **opt_Rgm, P7_PROFILE **opt_Lgm, P7_PROFILE **opt_Tgm, CM_t **opt_cm)
+cm_Pipeline(CM_PIPELINE *pli, off_t cm_offset, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, P7_SCOREDATA *msvdata, ESL_SQ *sq, CM_TOPHITS *hitlist, int in_rc, P7_HMM **opt_hmm, P7_PROFILE **opt_gm, P7_PROFILE **opt_Rgm, P7_PROFILE **opt_Lgm, P7_PROFILE **opt_Tgm, CM_t **opt_cm)
{
int status;
int nwin = 0; /* number of windows surviving MSV & Vit & lFwd, filled by pli_p7_filter() */
@@ -2411,7 +2411,7 @@ cm_pli_AdjustNresForOverlaps(CM_PIPELINE *pli, int64_t noverlap, int in_rc)
* Xref: J4/25.
*/
int
-pli_p7_filter(CM_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, P7_MSVDATA *msvdata, const ESL_SQ *sq, int64_t **ret_ws, int64_t **ret_we, float **ret_wb, int *ret_nwin)
+pli_p7_filter(CM_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, P7_SCOREDATA *msvdata, const ESL_SQ *sq, int64_t **ret_ws, int64_t **ret_we, float **ret_wb, int *ret_nwin)
{
int status;
float mfsc, vfsc, fwdsc; /* filter scores */
@@ -2482,10 +2482,10 @@ pli_p7_filter(CM_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, P
nwin = 0;
/***************************************************/
- /* Filter 1: MSV, long target-variant, with p7 HMM */
+ /* Filter 1: SSV, long target-variant, with p7 HMM */
if(cur_do_msv) {
p7_hmmwindow_init(&wlist);
- status = p7_MSVFilter_longtarget(sq->dsq, sq->n, om, pli->oxf, msvdata, bg, cur_F1, &wlist);
+ status = p7_SSVFilter_longtarget(sq->dsq, sq->n, om, pli->oxf, msvdata, bg, cur_F1, &wlist);
if(wlist.count > 0) {
/* In scan mode, if at least one window passes the MSV filter, read the rest of the profile */
@@ -2496,15 +2496,15 @@ pli_p7_filter(CM_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, P
have_rest = TRUE;
}
if(msvdata->prefix_lengths == NULL && msvdata->suffix_lengths == NULL) {
- p7_hmm_MSVDataComputeRest(om, msvdata);
+ p7_hmm_ScoreDataComputeRest(om, msvdata);
/* only call *ComputeRest() if we haven't already (we may have
* already in a previous pipeline pass).
*/
}
- p7_pli_ExtendAndMergeWindows(om, msvdata, &wlist, sq->n, 0.0);
+ p7_pli_ExtendAndMergeWindows(om, msvdata, &wlist, 0.0);
}
- ESL_ALLOC(ws, sizeof(int64_t) * wlist.count);
- ESL_ALLOC(we, sizeof(int64_t) * wlist.count);
+ ESL_ALLOC(ws, sizeof(int64_t) * ESL_MAX(1, wlist.count)); // avoid 0 malloc
+ ESL_ALLOC(we, sizeof(int64_t) * ESL_MAX(1, wlist.count));
nwin = wlist.count;
for(i = 0; i < nwin; i++) {
ws[i] = wlist.windows[i].n;
@@ -2573,12 +2573,12 @@ pli_p7_filter(CM_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, P
/* allocate and initialize survAA, which will keep track of number of windows surviving each stage */
ESL_ALLOC(survAA, sizeof(int *) * Np7_SURV);
for (i = 0; i < Np7_SURV; i++) {
- ESL_ALLOC(survAA[i], sizeof(int) * nwin);
+ ESL_ALLOC(survAA[i], sizeof(int) * ESL_MAX(1, nwin)); // avoid 0 mallocs
esl_vec_ISet(survAA[i], nwin, FALSE);
}
- ESL_ALLOC(wp, sizeof(double) * nwin);
- ESL_ALLOC(wb, sizeof(float) * nwin);
+ ESL_ALLOC(wp, sizeof(double) * ESL_MAX(1, nwin)); // avoid 0 mallocs
+ ESL_ALLOC(wb, sizeof(float) * ESL_MAX(1, nwin));
for (i = 0; i < nwin; i++) { wp[i] = 1.0; }
for (i = 0; i < nwin; i++) { wb[i] = -999.0; }
@@ -3002,7 +3002,8 @@ pli_p7_env_def(CM_PIPELINE *pli, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam,
p7_ForwardParser(seq->dsq, wlen, om, pli->oxf, NULL);
p7_omx_GrowTo(pli->oxb, om->M, 0, wlen);
p7_BackwardParser(seq->dsq, wlen, om, pli->oxf, pli->oxb, NULL);
- status = p7_domaindef_ByPosteriorHeuristics (seq, om, pli->oxf, pli->oxb, pli->fwd, pli->bck, pli->ddef, NULL, bg, FALSE);
+ status = p7_domaindef_ByPosteriorHeuristics (seq, NULL, om, pli->oxf, pli->oxb, pli->fwd, pli->bck, pli->ddef, bg, /*long_target=*/FALSE,
+ /*bg_tmp=*/NULL, /*scores_arr=*/NULL, /*fwd_emissions_arr=*/NULL);
}
else {
/* We're defining envelopes in glocal mode, so we need to fill
@@ -3835,7 +3836,8 @@ pli_final_stage_hmmonly(CM_PIPELINE *pli, off_t cm_offset, P7_OPROFILE *om, P7_B
p7_ForwardParser(seq->dsq, wlen, om, pli->oxf, NULL);
p7_omx_GrowTo(pli->oxb, om->M, 0, wlen);
p7_BackwardParser(seq->dsq, wlen, om, pli->oxf, pli->oxb, NULL);
- status = p7_domaindef_ByPosteriorHeuristics (seq, om, pli->oxf, pli->oxb, pli->fwd, pli->bck, pli->ddef, NULL, bg, FALSE);
+ status = p7_domaindef_ByPosteriorHeuristics (seq, /*nt_seq=*/NULL, om, pli->oxf, pli->oxb, pli->fwd, pli->bck, pli->ddef, bg, /*long_target=*/FALSE,
+ /*bg_tmp=*/NULL, /*scores_arr=*/NULL, /*fwd_emissions_arr=*/NULL);
if (status != eslOK) ESL_FAIL(status, pli->errbuf, "envelope definition workflow failure"); /* eslERANGE can happen */
if (pli->ddef->nregions == 0) continue; /* score passed threshold but there's no discrete domains here */
diff --git a/src/cm_tophits.c b/src/cm_tophits.c
index 6f31c04d..6864d880 100644
--- a/src/cm_tophits.c
+++ b/src/cm_tophits.c
@@ -3357,14 +3357,14 @@ main(int argc, char **argv)
hit->srcL = L+1;
cm_tophits_SortForOverlapRemoval(h1);
- if((status = cm_tophits_RemoveOrMarkOverlaps(h1, errbuf)) != eslOK) cm_Fail(errbuf);
+ if((status = cm_tophits_RemoveOrMarkOverlaps(h1, /*do_clans=*/FALSE, errbuf)) != eslOK) cm_Fail(errbuf);
cm_tophits_SortByEvalue(h1);
if (strcmp(h1->hit[0]->name, "third") != 0) esl_fatal("sort 1 failed (top is %s = %f)", h1->hit[0]->name, h1->hit[0]->score);
if (strcmp(h1->hit[N+1]->name, "thirdtolast") != 0) esl_fatal("sort 1 failed (last is %s = %f)", h1->hit[N+1]->name, h1->hit[N+1]->score);
cm_tophits_Merge(h1, h2);
cm_tophits_SortForOverlapRemoval(h1);
- if((status = cm_tophits_RemoveOrMarkOverlaps(h1, errbuf)) != eslOK) cm_Fail(errbuf);
+ if((status = cm_tophits_RemoveOrMarkOverlaps(h1, /*do_clans=*/FALSE, errbuf)) != eslOK) cm_Fail(errbuf);
cm_tophits_SortByEvalue(h1);
if (strcmp(h1->hit[0]->name, "second") != 0) esl_fatal("sort 2 failed (top is %s = %f)", h1->hit[0]->name, h1->hit[0]->score);
if (strcmp(h1->hit[1]->name, "third") != 0) esl_fatal("sort 2 failed (second is %s = %f)", h1->hit[1]->name, h1->hit[1]->score);
@@ -3377,7 +3377,7 @@ main(int argc, char **argv)
cm_tophits_Merge(h1, h3);
cm_tophits_SortForOverlapRemoval(h1);
- if((status = cm_tophits_RemoveOrMarkOverlaps(h1, errbuf)) != eslOK) cm_Fail(errbuf);
+ if((status = cm_tophits_RemoveOrMarkOverlaps(h1, /*do_clans=*/FALSE, errbuf)) != eslOK) cm_Fail(errbuf);
cm_tophits_SortByEvalue(h1);
if (strcmp(h1->hit[0]->name, "first") != 0) esl_fatal("sort 3 failed (top is %s = %f)", h1->hit[0]->name, h1->hit[0]->score);
if (strcmp(h1->hit[1]->name, "second") != 0) esl_fatal("sort 3 failed (second is %s = %f)", h1->hit[1]->name, h1->hit[1]->score);
@@ -3395,7 +3395,7 @@ main(int argc, char **argv)
/* test markup, first sort for removal and make sure we don't mark up any overlaps of different models */
cm_tophits_Merge(h1, h4);
cm_tophits_SortForOverlapRemoval(h1);
- if((status = cm_tophits_RemoveOrMarkOverlaps(h1, errbuf)) != eslOK) cm_Fail(errbuf);
+ if((status = cm_tophits_RemoveOrMarkOverlaps(h1, /*do_clans=*/FALSE, errbuf)) != eslOK) cm_Fail(errbuf);
cm_tophits_SortByEvalue(h1);
if (strcmp(h1->hit[0]->name, "Bfirst") != 0) esl_fatal("sort 4 failed (first is %s = %f)", h1->hit[0]->name, h1->hit[0]->score);
if (strcmp(h1->hit[1]->name, "first") != 0) esl_fatal("sort 4 failed (second is %s = %f)", h1->hit[1]->name, h1->hit[1]->score);
@@ -3416,7 +3416,7 @@ main(int argc, char **argv)
/* second sort for markup and make sure we DO mark up overlaps of different models */
cm_tophits_SortForOverlapMarkup(h1);
- if((status = cm_tophits_RemoveOrMarkOverlaps(h1, errbuf)) != eslOK) cm_Fail(errbuf);
+ if((status = cm_tophits_RemoveOrMarkOverlaps(h1, /*do_clans=*/FALSE, errbuf)) != eslOK) cm_Fail(errbuf);
cm_tophits_SortByEvalue(h1);
if (strcmp(h1->hit[0]->name, "Bfirst") != 0) esl_fatal("sort 5 failed (first is %s = %f)", h1->hit[0]->name, h1->hit[0]->score);
if (strcmp(h1->hit[1]->name, "first") != 0) esl_fatal("sort 5 failed (second is %s = %f)", h1->hit[1]->name, h1->hit[1]->score);
@@ -3521,9 +3521,9 @@ main(int argc, char **argv)
}
cm_tophits_SortForOverlapRemoval(h5);
- if((status = cm_tophits_RemoveOrMarkOverlaps(h5, errbuf)) != eslOK) cm_Fail(errbuf);
+ if((status = cm_tophits_RemoveOrMarkOverlaps(h5, /*do_clans=*/FALSE, errbuf)) != eslOK) cm_Fail(errbuf);
cm_tophits_SortForOverlapMarkup(h5);
- if((status = cm_tophits_RemoveOrMarkOverlaps(h5, errbuf)) != eslOK) cm_Fail(errbuf);
+ if((status = cm_tophits_RemoveOrMarkOverlaps(h5, /*do_clans=*/FALSE, errbuf)) != eslOK) cm_Fail(errbuf);
cm_tophits_SortByEvalue(h5);
if (strcmp(h5->hit[0]->name, "hit1") != 0) esl_fatal("sort 6 failed pass %d (first is %s = %f)", p+1, h5->hit[0]->name, h5->hit[0]->score);
diff --git a/src/cmalign.c b/src/cmalign.c
index c1436ef9..122722fa 100644
--- a/src/cmalign.c
+++ b/src/cmalign.c
@@ -49,7 +49,7 @@
/* Max number of sequences per tmp alignment, if seq file exceeds
* either of these, final output alignment will be in 1 line/seq Pfam
- * format. Max number of residues is MAX_RESIDUE_COUNT from esl_sqio_ncbi.h
+ * format. Max number of residues is CM_MAX_RESIDUE_COUNT from infernal.h
* where it's currently defined as (1024 * 1024).
*/
#define CMALIGN_MAX_NSEQ 10000 /* 10k sequences, average parsetree is 25 bytes/position this means ~250Mb for all parsetrees */
@@ -479,15 +479,15 @@ serial_master(ESL_GETOPTS *go, struct cfg_s *cfg)
}
/* Our main loop will loop over reading a single large block
- * () of sequences, up to MAX_RESIDUE_COUNT
- * (1024*1024=1048576) residues, and up to CMALIGN_MAX_NSEQ
+ * () of sequences, up to CM_MAX_RESIDUE_COUNT
+ * (100000) residues, and up to CMALIGN_MAX_NSEQ
* sequences (10,000), but potentially less if we reach the end of
* the sequence file first.
*/
/* Read the first block */
sq_block = esl_sq_CreateDigitalBlock(CMALIGN_MAX_NSEQ, cfg->abc);
- sstatus = esl_sqio_ReadBlock(cfg->sqfp, sq_block, -1, FALSE); /* FALSE says: read complete sequences */
+ sstatus = esl_sqio_ReadBlock(cfg->sqfp, sq_block, CM_MAX_RESIDUE_COUNT, CMALIGN_MAX_NSEQ, FALSE); /* FALSE says: read complete sequences */
nxt_sq_block = sq_block; /* special case of first block read */
while(sstatus == eslOK) {
@@ -517,12 +517,12 @@ serial_master(ESL_GETOPTS *go, struct cfg_s *cfg)
* fine for a while.
*/
nxt_sq_block = esl_sq_CreateDigitalBlock(CMALIGN_MAX_NSEQ, cfg->abc);
- sstatus = esl_sqio_ReadBlock(cfg->sqfp, nxt_sq_block, -1, FALSE); /* FALSE says: read complete sequences */
+ sstatus = esl_sqio_ReadBlock(cfg->sqfp, nxt_sq_block, CM_MAX_RESIDUE_COUNT, CMALIGN_MAX_NSEQ, FALSE); /* FALSE says: read complete sequences */
if(sstatus == eslEOF) {
reached_eof = TRUE; /* nxt_sq_block will not have been filled */
esl_sq_DestroyBlock(nxt_sq_block);
}
- if((! reached_eof) && cfg->do_oneblock) esl_fatal("Error: the sequence file is too big (has > %d seqs or %d residues) for --ileaved or output\nformat other than Pfam. Use esl-reformat to reformat alignment later.", CMALIGN_MAX_NSEQ, MAX_RESIDUE_COUNT);
+ if((! reached_eof) && cfg->do_oneblock) esl_fatal("Error: the sequence file is too big (has > %d seqs or %d residues) for --ileaved or output\nformat other than Pfam. Use esl-reformat to reformat alignment later.", CMALIGN_MAX_NSEQ, CM_MAX_RESIDUE_COUNT);
/* align the sequences in the block */
#ifdef HMMER_THREADS
diff --git a/src/cmbuild.c b/src/cmbuild.c
index 0076d434..07d1057a 100644
--- a/src/cmbuild.c
+++ b/src/cmbuild.c
@@ -2027,7 +2027,11 @@ build_and_calibrate_p7_filter(const ESL_GETOPTS *go, const struct cfg_s *cfg, ch
for (cpos = 1; cpos <= cm->clen; cpos++) amsa->rf[cm->map[cpos]-1] = 'x'; /* note off by one */
cfg->fp7_bld->arch_strategy = p7_ARCH_HAND;
- if ((status = p7_Builder(cfg->fp7_bld, amsa, cfg->fp7_bg, &fhmm, NULL, NULL, NULL, NULL)) != eslOK) { strcpy(errbuf, cfg->fp7_bld->errbuf); return status; }
+ if ((status = p7_Builder(cfg->fp7_bld, amsa, cfg->fp7_bg, &fhmm,
+ /*opt_trarr=*/NULL, /*opt_gm=*/NULL, /*opt_om=*/NULL, /*opt_postmsa=*/NULL,
+ /*seqweights_w_fp=*/NULL, /*seqweights_e_fp=*/NULL)) != eslOK)
+ { strcpy(errbuf, cfg->fp7_bld->errbuf); return status; }
+
/* remove the RF annotation, it only exists because we created amsa->rf above */
if(fhmm->rf != NULL) {
free(fhmm->rf);
diff --git a/src/cmcalibrate.c b/src/cmcalibrate.c
index 2bf575db..58db9be6 100644
--- a/src/cmcalibrate.c
+++ b/src/cmcalibrate.c
@@ -75,7 +75,7 @@ typedef struct {
static ESL_OPTIONS options[] = {
/* name type default env range toggles reqs incomp help docgroup*/
{ "-h", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "show brief help on version and usage", 1 },
- { "-L", eslARG_REAL, "1.6", NULL, "0.01<=x<=100.", NULL, NULL, NULL, "set random seq length to search in Mb to ", 1 },
+ { "-L", eslARG_REAL, "1.6", NULL, "0.01<=x<=160.", NULL, NULL, NULL, "set random seq length to search in Mb to ", 1 },
/* Options for predicting running time and memory requirements */
{ "--forecast", eslARG_NONE, NULL, NULL, NULL, NULL, NULL, NULL, "don't do calibration, predict running time and exit", 2 },
{ "--nforecast", eslARG_INT, NULL, NULL, "n>0", NULL, "--forecast", NULL, "w/--forecast, predict time with processors (maybe for MPI)", 2 },
diff --git a/src/cmscan.c b/src/cmscan.c
index 32cfd237..2c30725a 100644
--- a/src/cmscan.c
+++ b/src/cmscan.c
@@ -74,7 +74,7 @@ typedef struct {
float *gfmuA; /* [0..cm_idx..nmodels-1] HMM glocal mu for CM */
float *gflambdaA; /* [0..cm_idx..nmodels-1] HMM glocal lambda for CM */
P7_OPROFILE **omA; /* [0..cm_idx..nmodels-1] optimized query profile HMM for CM */
- P7_MSVDATA **msvdataA; /* [0..cm_idx..nmodels-1] P7_MSVDATA for CM */
+ P7_SCOREDATA **msvdataA; /* [0..cm_idx..nmodels-1] P7_SCOREDATA for CM */
/* variables related to --clanin */
ESL_KEYHASH *clan_fam_kh; /* these are family names in a clan, members of same clan are contiguous */
int *clan_mapA; /* [0..i..nfam-1] = c; family index i in belongs to clan
@@ -790,7 +790,7 @@ serial_loop(THREAD_INFO *tinfo, READER_INFO *rinfo, CM_FILE *cmfp)
P7_PROFILE *Rgm = NULL; /* generic query profile HMM for env defn for 5' truncated hits */
P7_PROFILE *Lgm = NULL; /* generic query profile HMM for env defn for 3' truncated hits */
P7_PROFILE *Tgm = NULL; /* generic query profile HMM for env defn for 5' and 3'truncated hits */
- P7_MSVDATA *msvdata = NULL; /* MSV/SSV specific data structure */
+ P7_SCOREDATA *msvdata = NULL; /* MSV/SSV specific data structure */
int prv_ntophits; /* number of top hits before cm_Pipeline() call */
int cm_clen, cm_W, cm_nbp; /* consensus, window length and # bps for CM */
float gfmu, gflambda; /* glocal fwd mu, lambda for current hmm filter */
@@ -818,7 +818,7 @@ serial_loop(THREAD_INFO *tinfo, READER_INFO *rinfo, CM_FILE *cmfp)
}
if(status == eslOK) {
if(rinfo->msvdataA[cm_idx] == NULL) {
- rinfo->msvdataA[cm_idx] = p7_hmm_MSVDataCreate(rinfo->omA[cm_idx], FALSE);
+ rinfo->msvdataA[cm_idx] = p7_hmm_ScoreDataCreate(rinfo->omA[cm_idx], FALSE);
}
/* set pointers for convenience */
om = rinfo->omA[cm_idx];
@@ -1106,8 +1106,8 @@ pipeline_thread(void *arg)
cm_idx = block->idx0;
for (i = 0; i < block->count; ++i)
{
- P7_OPROFILE *om = block->list[i];
- P7_MSVDATA *msvdata = block->msvdataA[i];
+ P7_OPROFILE *om = block->list[i];
+ P7_SCOREDATA *msvdata = block->msvdataA[i];
cm_offset = block->cm_offsetA[i];
cm_clen = block->cm_clenA[i];
cm_W = block->cm_WA[i];
@@ -2752,7 +2752,7 @@ create_reader_info(int64_t nmodels, ESL_KEYHASH *clan_fam_kh, int *clan_mapA)
ESL_ALLOC(rinfo->gfmuA, sizeof(float) * nmodels);
ESL_ALLOC(rinfo->gflambdaA, sizeof(float) * nmodels);
ESL_ALLOC(rinfo->omA, sizeof(P7_OPROFILE *) * nmodels);
- ESL_ALLOC(rinfo->msvdataA, sizeof(P7_MSVDATA *) * nmodels);
+ ESL_ALLOC(rinfo->msvdataA, sizeof(P7_SCOREDATA *)* nmodels);
for(cm_idx = 0; cm_idx < nmodels; cm_idx++) {
rinfo->cm_offsetA[cm_idx] = 0;
rinfo->cm_clenA[cm_idx] = -1;
@@ -2813,7 +2813,7 @@ free_reader_info(READER_INFO *rinfo)
if(rinfo->msvdataA != NULL) {
for(cm_idx = 0; cm_idx < rinfo->nmodels; cm_idx++) {
- if(rinfo->msvdataA[cm_idx] != NULL) { p7_hmm_MSVDataDestroy(rinfo->msvdataA[cm_idx]); rinfo->msvdataA[cm_idx] = NULL; }
+ if(rinfo->msvdataA[cm_idx] != NULL) { p7_hmm_ScoreDataDestroy(rinfo->msvdataA[cm_idx]); rinfo->msvdataA[cm_idx] = NULL; }
}
free(rinfo->msvdataA);
rinfo->msvdataA = NULL;
diff --git a/src/cmsearch.c b/src/cmsearch.c
index 5b0ce65f..89c44adb 100644
--- a/src/cmsearch.c
+++ b/src/cmsearch.c
@@ -51,7 +51,7 @@ typedef struct {
P7_PROFILE *Rgm; /* generic query profile HMM for 5' truncated hits */
P7_PROFILE *Lgm; /* generic query profile HMM for 3' truncated hits */
P7_PROFILE *Tgm; /* generic query profile HMM for 5' and 3' truncated hits */
- P7_SCOREDATA *scoredata; /* MSV/SSV specific data structure */
+ P7_SCOREDATA *msvdata; /* MSV/SSV specific data structure */
float *p7_evparam; /* [0..CM_p7_NEVPARAM] E-value parameters */
float smxsize; /* max size (Mb) of allowable scan mx (only relevant if --nohmm or --max) */
} WORKER_INFO;
@@ -130,7 +130,7 @@ static ESL_OPTIONS options[] = {
{ "--Fmid", eslARG_REAL, "0.02", NULL, NULL, NULL,"--mid", NULL, "with --mid, set P-value threshold for HMM stages to ", 6 },
/* Other options */
/* name type default env range toggles reqs incomp help docgroup*/
- { "--notrunc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "--anytrunc,--onlytrunc--5trunc,--3trunc", "do not allow truncated hits at sequence termini", 7 },
+ { "--notrunc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "--anytrunc,--onlytrunc,--5trunc,--3trunc", "do not allow truncated hits at sequence termini", 7 },
{ "--anytrunc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, TRUNCOPTS, "allow full and truncated hits anywhere within sequences", 7 },
{ "--onlytrunc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, TRUNCOPTS, "allow only truncated hits, anywhere within sequences", 7 },
{ "--5trunc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, TRUNCOPTS, "allow truncated hits only at 5' ends of sequences", 7 },
@@ -151,11 +151,11 @@ static ESL_OPTIONS options[] = {
{ "--oclan", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "BOGUS OPTION, NEVER ALLOWED", 999 },
{ "--oskip", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "BOGUS OPTION, NEVER ALLOWED", 999 },
#ifdef HMMER_THREADS
- { "--cpu", eslARG_INT, NULL,"INFERNAL_NCPU","n>=0",NULL, NULL, CPUOPTS, "number of parallel CPU workers to use for multithreads", 7 },
+ { "--cpu", eslARG_INT, NULL,"INFERNAL_NCPU","n>=0",NULL, NULL, CPUOPTS, "number of parallel CPU workers to use for multithreads", 7 },
#endif
#ifdef HAVE_MPI
- { "--stall", eslARG_NONE, FALSE, NULL, NULL, NULL,"--mpi", NULL, "arrest after start: for debugging MPI under gdb", 7 },
- { "--mpi", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, MPIOPTS, "run as an MPI parallel program", 7 },
+ { "--stall", eslARG_NONE, FALSE, NULL, NULL, NULL,"--mpi", NULL, "arrest after start: for debugging MPI under gdb", 7 },
+ { "--mpi", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, MPIOPTS, "run as an MPI parallel program", 7 },
#endif
/* All options below are developer options, only shown if --devhelp invoked */
@@ -223,18 +223,19 @@ static ESL_OPTIONS options[] = {
{ "--timeF6", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, TIMINGOPTS, "abort after Stage 6 CYK; for timing expts", 106 },
/* Options for terminating after individual pipeline stages, currently only works for F3 */
{ "--trmF3", eslARG_NONE, FALSE, NULL, NULL, NULL,"--noali,--nohmmonly,--notrunc",TIMINGOPTS, "terminate after Stage 3 Fwd and output surviving windows", 107 },
+
/* Other expert options */
/* name type default env range toggles reqs incomp help docgroup*/
- { "--nogreedy", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "do not resolve hits with greedy algorithm, use optimal one", 107 },
- { "--cp9noel", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-g", "turn off local ends in cp9 HMMs", 107 },
- { "--cp9gloc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-g,--cp9noel", "configure cp9 HMM in glocal mode", 107 },
- { "--null2", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "turn on null 2 biased composition HMM score corrections", 107 },
- { "--maxtau", eslARG_REAL, "0.05", NULL,"0 when tightening HMM bands", 107 },
- { "--seed", eslARG_INT, "181", NULL, "n>=0", NULL, NULL, NULL, "set RNG seed to (if 0: one-time arbitrary seed)", 107 },
+ { "--nogreedy", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "do not resolve hits with greedy algorithm, use optimal one", 108 },
+ { "--cp9noel", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-g", "turn off local ends in cp9 HMMs", 108 },
+ { "--cp9gloc", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, "-g,--cp9noel", "configure cp9 HMM in glocal mode", 108 },
+ { "--null2", eslARG_NONE, FALSE, NULL, NULL, NULL, NULL, NULL, "turn on null 2 biased composition HMM score corrections", 108 },
+ { "--maxtau", eslARG_REAL, "0.05", NULL,"0 when tightening HMM bands", 108 },
+ { "--seed", eslARG_INT, "181", NULL, "n>=0", NULL, NULL, NULL, "set RNG seed to (if 0: one-time arbitrary seed)", 108 },
#ifdef HAVE_MPI
/* Searching only a subset of sequences in the target database, currently requires MPI b/c SSI is required */
- { "--sidx", eslARG_INT, NULL, NULL, "n>0", NULL,"--mpi", NULL, "start searching at sequence index in target db SSI index" , 107 },
- { "--eidx", eslARG_INT, NULL, NULL, "n>0", NULL,"--mpi", NULL, "stop searching at sequence index in target db SSI index", 107 },
+ { "--sidx", eslARG_INT, NULL, NULL, "n>0", NULL,"--mpi", NULL, "start searching at sequence index in target db SSI index" , 108 },
+ { "--eidx", eslARG_INT, NULL, NULL, "n>0", NULL,"--mpi", NULL, "stop searching at sequence index in target db SSI index", 108 },
#endif
{ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
};
@@ -806,7 +807,7 @@ serial_loop(WORKER_INFO *info, ESL_SQFILE *dbfp, int64_t *srcL)
if (info->pli->do_top) {
prv_pli_ntophits = info->th->N;
- if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->scoredata, dbsq, info->th, FALSE, /* FALSE: not in reverse complement */
+ if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->msvdata, dbsq, info->th, FALSE, /* FALSE: not in reverse complement */
NULL, &(info->gm), &(info->Rgm), &(info->Lgm), &(info->Tgm), &(info->cm))) != eslOK) cm_Fail("cm_pipeline() failed unexpected with status code %d\n%s\n", status, info->pli->errbuf);
cm_pipeline_Reuse(info->pli); /* prepare for next search */
@@ -821,7 +822,7 @@ serial_loop(WORKER_INFO *info, ESL_SQFILE *dbfp, int64_t *srcL)
if (info->pli->do_bot && dbsq->abc->complement != NULL) {
prv_pli_ntophits = info->th->N;
esl_sq_ReverseComplement(dbsq);
- if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->scoredata, dbsq, info->th, TRUE, /* TRUE: in reverse complement */
+ if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->msvdata, dbsq, info->th, TRUE, /* TRUE: in reverse complement */
NULL, &(info->gm), &(info->Rgm), &(info->Lgm), &(info->Tgm), &(info->cm))) != eslOK) cm_Fail("cm_pipeline() failed unexpected with status code %d\n%s\n", status, info->pli->errbuf);
cm_pipeline_Reuse(info->pli); /* prepare for next search */
@@ -893,7 +894,7 @@ thread_loop(WORKER_INFO *info, ESL_THREADS *obj, ESL_WORK_QUEUE *queue, ESL_SQFI
* overlap should be retained in the ReadWindow step. */
}
- sstatus = esl_sqio_ReadBlock(dbfp, block, CM_MAX_RESIDUE_COUNT, TRUE);
+ sstatus = esl_sqio_ReadBlock(dbfp, block, CM_MAX_RESIDUE_COUNT, /*max_sequences=*/-1, TRUE); // SRE: is it ok to pass -1 for max_seqs? That's a new arg to ReadBlock().
if (sstatus == eslOK) { /* we read a block */
if(! block->complete) {
@@ -997,7 +998,7 @@ pipeline_thread(void *arg)
if (info->pli->do_top) {
prv_pli_ntophits = info->th->N;
- if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->scoredata, dbsq, info->th, FALSE, /* FALSE: not in reverse complement */
+ if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->msvdata, dbsq, info->th, FALSE, /* FALSE: not in reverse complement */
NULL, &(info->gm), &(info->Rgm), &(info->Lgm), &(info->Tgm), &(info->cm))) != eslOK) cm_Fail("cm_pipeline() failed unexpected with status code %d\n%s\n", status, info->pli->errbuf);
cm_pipeline_Reuse(info->pli); /* prepare for next search */
@@ -1012,7 +1013,7 @@ pipeline_thread(void *arg)
if (info->pli->do_bot && dbsq->abc->complement != NULL) {
prv_pli_ntophits = info->th->N;
esl_sq_ReverseComplement(dbsq);
- if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->scoredata, dbsq, info->th, TRUE, /* TRUE: in reverse complement */
+ if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->msvdata, dbsq, info->th, TRUE, /* TRUE: in reverse complement */
NULL, &(info->gm), &(info->Rgm), &(info->Lgm), &(info->Tgm), &(info->cm))) != eslOK) cm_Fail("cm_pipeline() failed unexpected with status code %d\n%s\n", status, info->pli->errbuf);
cm_pipeline_Reuse(info->pli); /* prepare for next search */
@@ -1597,7 +1598,7 @@ mpi_worker(ESL_GETOPTS *go, struct cfg_s *cfg)
/* search top strand */
if (info->pli->do_top) {
prv_pli_ntophits = info->th->N;
- if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->scoredata, dbsq, info->th, FALSE, /* FALSE: not in reverse complement */
+ if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->msvdata, dbsq, info->th, FALSE, /* FALSE: not in reverse complement */
NULL, &(info->gm), &(info->Rgm), &(info->Lgm), &(info->Tgm), &(info->cm))) != eslOK)
mpi_failure("cm_Pipeline() failed unexpected with status code %d\n%s\n", status, info->pli->errbuf);
cm_pipeline_Reuse(info->pli); /* prepare for next search */
@@ -1611,7 +1612,7 @@ mpi_worker(ESL_GETOPTS *go, struct cfg_s *cfg)
if(info->pli->do_bot && dbsq->abc->complement != NULL) {
esl_sq_ReverseComplement(dbsq);
prv_pli_ntophits = info->th->N;
- if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->scoredata, dbsq, info->th, TRUE, /* TRUE: in reverse complement */
+ if((status = cm_Pipeline(info->pli, info->cm->offset, info->om, info->bg, info->p7_evparam, info->msvdata, dbsq, info->th, TRUE, /* TRUE: in reverse complement */
NULL, &(info->gm), &(info->Rgm), &(info->Lgm), &(info->Tgm), &(info->cm))) != eslOK)
mpi_failure("cm_Pipeline() failed unexpected with status code %d\n%s\n", status, info->pli->errbuf);
cm_pipeline_Reuse(info->pli); /* prepare for next search */
@@ -2038,6 +2039,7 @@ output_header(FILE *ofp, const ESL_GETOPTS *go, char *cmfile, char *seqfile, int
if (esl_opt_IsUsed(go, "--cyk")) fprintf(ofp, "# use CYK for final search stage on\n");
if (esl_opt_IsUsed(go, "--acyk")) fprintf(ofp, "# use CYK to align hits: on\n");
if (esl_opt_IsUsed(go, "--wcx")) fprintf(ofp, "# W set as * cm->clen: =%g\n", esl_opt_GetReal(go, "--wcx"));
+ if (esl_opt_IsUsed(go, "--onepass")) fprintf(ofp, "# using CM for best HMM pass only: on\n");
if (esl_opt_IsUsed(go, "--toponly")) fprintf(ofp, "# search top-strand only: on\n");
if (esl_opt_IsUsed(go, "--bottomonly")) fprintf(ofp, "# search bottom-strand only: on\n");
if (esl_opt_IsUsed(go, "--tformat")) fprintf(ofp, "# targ format asserted: %s\n", esl_opt_GetString(go, "--tformat"));
@@ -2286,16 +2288,16 @@ create_info(const ESL_GETOPTS *go)
WORKER_INFO *info = NULL;
ESL_ALLOC(info, sizeof(WORKER_INFO));
- info->pli = NULL;
- info->th = NULL;
- info->cm = NULL;
- info->gm = NULL;
- info->Rgm = NULL;
- info->Lgm = NULL;
- info->Tgm = NULL;
- info->om = NULL;
- info->bg = NULL;
- info->scoredata = NULL;
+ info->pli = NULL;
+ info->th = NULL;
+ info->cm = NULL;
+ info->gm = NULL;
+ info->Rgm = NULL;
+ info->Lgm = NULL;
+ info->Tgm = NULL;
+ info->om = NULL;
+ info->bg = NULL;
+ info->msvdata = NULL;
ESL_ALLOC(info->p7_evparam, sizeof(float) * CM_p7_NEVPARAM);
info->smxsize = esl_opt_GetReal(go, "--smxsize");
return info;
@@ -2328,11 +2330,11 @@ clone_info(ESL_GETOPTS *go, WORKER_INFO *src_info, WORKER_INFO *dest_infoA, int
if(src_info->Rgm != NULL) { if((dest_infoA[i].Rgm = p7_profile_Clone(src_info->Rgm)) == NULL) goto ERROR; }
if(src_info->Lgm != NULL) { if((dest_infoA[i].Lgm = p7_profile_Clone(src_info->Lgm)) == NULL) goto ERROR; }
if(src_info->Tgm != NULL) { if((dest_infoA[i].Tgm = p7_profile_Clone(src_info->Tgm)) == NULL) goto ERROR; }
- if((dest_infoA[i].om = p7_oprofile_Shadow(src_info->om)) == NULL) goto ERROR;
+ if((dest_infoA[i].om = p7_oprofile_Clone(src_info->om)) == NULL) goto ERROR;
if((dest_infoA[i].bg = p7_bg_Create(src_info->bg->abc)) == NULL) goto ERROR;
if(dest_infoA[i].p7_evparam == NULL) ESL_ALLOC(dest_infoA[i].p7_evparam, sizeof(float) * CM_p7_NEVPARAM);
esl_vec_FCopy(src_info->cm->fp7_evparam, CM_p7_NEVPARAM, dest_infoA[i].p7_evparam);
- dest_infoA[i].scoredata = p7_hmm_ScoreDataClone(src_info->scoredata, src_info->om->abc->Kp);
+ dest_infoA[i].msvdata = p7_hmm_ScoreDataClone(src_info->msvdata, src_info->om->abc->Kp);
}
return eslOK;
@@ -2360,7 +2362,7 @@ free_info(WORKER_INFO *info)
if(info->Tgm != NULL) p7_profile_Destroy(info->Tgm); info->Tgm = NULL;
if(info->bg != NULL) p7_bg_Destroy(info->bg); info->bg = NULL;
if(info->p7_evparam != NULL) free(info->p7_evparam); info->p7_evparam = NULL;
- if(info->scoredata != NULL) p7_hmm_ScoreDataDestroy(info->scoredata); info->scoredata = NULL;
+ if(info->msvdata != NULL) p7_hmm_ScoreDataDestroy(info->msvdata); info->msvdata = NULL;
return;
}
@@ -2469,8 +2471,8 @@ setup_hmm_filter(ESL_GETOPTS *go, WORKER_INFO *info)
/* copy E-value parameters */
esl_vec_FCopy(info->cm->fp7_evparam, CM_p7_NEVPARAM, info->p7_evparam);
- /* compute scoredata */
- info->scoredata = p7_hmm_ScoreDataCreate(info->om, FALSE);
+ /* compute msvdata */
+ info->msvdata = p7_hmm_ScoreDataCreate(info->om, FALSE);
return eslOK;
}
diff --git a/src/infernal.h b/src/infernal.h
index 0b47ddfd..ec4dff8d 100644
--- a/src/infernal.h
+++ b/src/infernal.h
@@ -2083,7 +2083,7 @@ typedef struct cm_file_s {
* algorithms will be called but _ANY is used as a suffix because
* HMM local alignment algorithms allow 5' and 3' truncation.
*
- * A wrinkle is that these indices used for DP truncated alignment
+ * A wrinkle is that these indices are used for DP truncated alignment
* functions called for 'cmalign' (either PLI_PASS_5P_AND_3P_FORCE or
* PLI_PASS_STD_ANY) even though those functions are not called as
* part of a search/scan pipeline. In this case, the pass index is
@@ -2209,18 +2209,21 @@ typedef struct cm_pipeline_s {
double maxtau; /* max tau when tightening bands */
int do_wcx; /* TRUE to set cm->W as cm->clen * wcx */
float wcx; /* set W as cm->clen * wcx, ignoring W from CM file */
+ int do_one_cmpass; /* TRUE to only use CM for best scoring HMM pass if envelope encompasses full sequence */
/* these are all currently hard-coded, in cm_pipeline_Create() */
float smult; /* 2.0; W multiplier for window splitting */
float wmult; /* 1.0; maxW will be max of wmult * cm->W and cmult * cm->clen */
float cmult; /* 1.25; maxW will be max of wmult * cm->W and cmult * cm->clen */
float mlmult; /* 0.10; om->max_length multiplier for MSV window defn */
/* flags for timing experiments */
- int do_time_F1; /* TRUE to abort after Stage 1 MSV */
- int do_time_F2; /* TRUE to abort after Stage 2 Vit */
- int do_time_F3; /* TRUE to abort after Stage 3 Fwd */
- int do_time_F4; /* TRUE to abort after Stage 4 glocal Fwd */
- int do_time_F5; /* TRUE to abort after Stage 5 env def */
- int do_time_F6; /* TRUE to abort after Stage 6 CYK */
+ int do_time_F1; /* TRUE to abort after Stage 1 MSV, for timing expts */
+ int do_time_F2; /* TRUE to abort after Stage 2 Vit, for timing expts */
+ int do_time_F3; /* TRUE to abort after Stage 3 Fwd, for timing expts */
+ int do_time_F4; /* TRUE to abort after Stage 4 glocal Fwd, for timing expts */
+ int do_time_F5; /* TRUE to abort after Stage 5 env def, for timing expts */
+ int do_time_F6; /* TRUE to abort after Stage 6 CYK, for timing expts */
+ /* flag for terminating after a stage and outputting surviving windows (currently only F3 is possible) */
+ int do_trm_F3; /* TRUE to abort after Stage 3 Fwd and output surviving windows */
/* Reporting threshold settings */
int by_E; /* TRUE to cut per-target report off by E */
@@ -2482,7 +2485,7 @@ typedef struct {
int64_t idx0; /* index of first profile in file >= 0 */
int listSize; /* maximum number elements in the list */
P7_OPROFILE **list; /* array of objects */
- P7_MSVDATA **msvdataA; /* array of objects */
+ P7_SCOREDATA **msvdataA; /* array of objects */
off_t *cm_offsetA; /* file offsets for CMs */
int *cm_clenA; /* consensus length of CMs */
int *cm_WA; /* window length of CMs */
@@ -2946,7 +2949,7 @@ extern int cm_pli_TargetIncludable (CM_PIPELINE *pli, float score, double
extern int cm_pli_NewModel (CM_PIPELINE *pli, int modmode, CM_t *cm, int cm_clen, int cm_W, int cm_nbp, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, int p7_max_length, int64_t cur_cm_idx, int cur_clan_idx, ESL_KEYHASH *glocal_kh);
extern int cm_pli_NewModelThresholds(CM_PIPELINE *pli, CM_t *cm);
extern int cm_pli_NewSeq (CM_PIPELINE *pli, const ESL_SQ *sq, int64_t cur_seq_idx);
-extern int cm_Pipeline (CM_PIPELINE *pli, off_t cm_offset, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, P7_SCOREDATA *scoredata, ESL_SQ *sq, CM_TOPHITS *hitlist, int in_rc, P7_HMM **opt_hmm, P7_PROFILE **opt_gm, P7_PROFILE **opt_Rgm, P7_PROFILE **opt_Lgm, P7_PROFILE **opt_Tgm, CM_t **opt_cm);
+extern int cm_Pipeline (CM_PIPELINE *pli, off_t cm_offset, P7_OPROFILE *om, P7_BG *bg, float *p7_evparam, P7_SCOREDATA *msvdata, ESL_SQ *sq, CM_TOPHITS *hitlist, int in_rc, P7_HMM **opt_hmm, P7_PROFILE **opt_gm, P7_PROFILE **opt_Rgm, P7_PROFILE **opt_Lgm, P7_PROFILE **opt_Tgm, CM_t **opt_cm);
extern int cm_pli_Statistics (FILE *ofp, CM_PIPELINE *pli, ESL_STOPWATCH *w);
extern int cm_pli_ZeroAccounting(CM_PLI_ACCT *pli_acct);
extern int cm_pli_PassEnforcesFirstRes(int pass_idx);
diff --git a/testsuite/bug-i44.pl b/testsuite/bug-i44.pl
old mode 100644
new mode 100755