Skip to content

Commit

Permalink
Merge pull request #359 from ComparativeGenomicsToolkit/minigraph
Browse files Browse the repository at this point in the history
Graphmap masking pipeline
  • Loading branch information
glennhickey committed Nov 23, 2020
2 parents 0b92c00 + 9176d91 commit aae8094
Show file tree
Hide file tree
Showing 19 changed files with 347 additions and 63 deletions.
2 changes: 1 addition & 1 deletion .gitlab-ci.yml
Expand Up @@ -20,8 +20,8 @@ test-job:
- virtualenv -p python3 venv
- source venv/bin/activate
- pip install -r toil-requirement.txt
- pip install -U .
- build-tools/downloadMiniTools
- pip install -U .
- make -j 8 evolver_test
artifacts:
# Let Gitlab see the junit report
Expand Down
2 changes: 1 addition & 1 deletion README.md
Expand Up @@ -13,7 +13,7 @@ Cactus uses many different algorithms and individual code contributions, princip
- Bob Harris for providing endless support for his [LastZ](https://github.com/lastz/lastz) pairwise, blast-like genome alignment tool.
- Sneha Goenka and Yatish Turakhia for the [GPU-accelerated version of LastZ](https://github.com/ComparativeGenomicsToolkit/SegAlign).
- Yan Gao et al. for [abPOA](https://github.com/yangao07/abPOA)
- Heng Li for [minigraph](https://github.com/lh3/minigraph), [minimap2](https://github.com/lh3/minimap2) and [gfatools](https://github.com/lh3/gfatools)
- Heng Li for [minigraph](https://github.com/lh3/minigraph), [minimap2](https://github.com/lh3/minimap2), [gfatools](https://github.com/lh3/gfatools) and [dna-brnn](https://github.com/lh3/dna-rnn)

## Setup

Expand Down
16 changes: 13 additions & 3 deletions bar/cactus_bar.c
Expand Up @@ -73,6 +73,8 @@ void usage() {

fprintf(stderr, "-P --partialOrderAlignmentWindow (int >= 0): Use partial order aligner instead of Pecan for multiple alignment subproblems, on blocks up to given length (0=disable POA).\n");

fprintf(stderr, "-m --maskFilter (int N >= 0) : Trim sequences to align at run of >=N masked bases. Only implemented for POA. [N==0 : disabled] (default=0)\n");

fprintf(stderr, "-h --help : Print this help screen\n");
}

Expand Down Expand Up @@ -115,8 +117,9 @@ int main(int argc, char *argv[]) {
int64_t minimumSizeToRescue = 1;
double minimumCoverageToRescue = 0.0;
// toggle from pecan to abpoa for multiple alignment, by setting to non-zero
// Note that poa uses about N^2, so maximum value is generally in 10s of kb
// Note that poa uses about N^2 memory, so maximum value is generally in 10s of kb
int64_t poaWindow = 0;
int64_t maskFilter = 0;

PairwiseAlignmentParameters *pairwiseAlignmentBandingParameters = pairwiseAlignmentBandingParameters_construct();

Expand Down Expand Up @@ -147,11 +150,12 @@ int main(int argc, char *argv[]) {
{"minimumCoverageToRescue", required_argument, 0, 'M'},
{ "minimumNumberOfSpecies", required_argument, 0, 'N' },
{"partialOrderAlignmentWindow", required_argument, 0, 'P'},
{"maskFilter", required_argument, 0, 'm'},
{ 0, 0, 0, 0 } };

int option_index = 0;

int key = getopt_long(argc, argv, "a:b:hi:j:kl:o:p:q:r:t:u:wy:A:B:D:E:FGI:J:K:L:M:N:P:", long_options, &option_index);
int key = getopt_long(argc, argv, "a:b:hi:j:kl:o:p:q:r:t:u:wy:A:B:D:E:FGI:J:K:L:M:N:P:m:", long_options, &option_index);

if (key == -1) {
break;
Expand Down Expand Up @@ -278,6 +282,12 @@ int main(int argc, char *argv[]) {
st_errAbort("Error parsing poaLength parameter");
}
break;
case 'm':
i = sscanf(optarg, "%" PRIi64 "", &maskFilter);
if (i != 1) {
st_errAbort("Error parsing poaLength parameter");
}
break;
default:
usage();
return 1;
Expand Down Expand Up @@ -400,7 +410,7 @@ int main(int argc, char *argv[]) {
*
* It does not use any precomputed alignments, if they are provided they will be ignored
*/
alignment_blocks = make_flower_alignment_poa(flower, maximumLength, poaWindow);
alignment_blocks = make_flower_alignment_poa(flower, maximumLength, poaWindow, maskFilter);
st_logInfo("Created the poa alignments: %" PRIi64 " poa alignment blocks\n", stList_length(alignment_blocks));
pinchIterator = stPinchIterator_constructFromAlignedBlocks(alignment_blocks);
}
Expand Down
47 changes: 44 additions & 3 deletions bar/impl/poaBarAligner.c
Expand Up @@ -537,6 +537,37 @@ char *get_adjacency_string(Cap *cap, int *length) {
}
}

/**
* Used to find where a run of masked (hard or soft) of at least mask_filter bases starts
* @param seq : The string
* @param seq_length : The length of the string
* @param length : The maximum length we want to search in
* @param reversed : If true, scan from the end of the string
* @param mask_filter : Cut a string as soon as we hit this many hard or softmasked bases (cut is before first masked base)
* @return length of the filtered string
*/
static int get_unmasked_length(char* seq, int64_t seq_length, int64_t length, bool reversed, int64_t mask_filter) {
if (mask_filter > 0) {
int64_t run_start = -1;
for (int64_t i = 0; i < length; ++i) {
char base = reversed ? seq[seq_length - 1 - i] : seq[i];
if (islower(base) || base == 'N') {
if (run_start == -1) {
// start masked run
run_start = i;
}
if (i + 1 - run_start >= mask_filter) {
// our run exceeds or equals mask_filter, cap before the first masked base
return (int)run_start;
}
} else {
run_start = -1;
}
}
}
return (int)length;
}

/**
* Used to get a prefix of a given adjacency sequence.
* @param seq_length
Expand All @@ -545,7 +576,7 @@ char *get_adjacency_string(Cap *cap, int *length) {
* @param max_seq_length
* @return
*/
char *get_adjacency_string_and_overlap(Cap *cap, int *length, int64_t *overlap, int64_t max_seq_length) {
char *get_adjacency_string_and_overlap(Cap *cap, int *length, int64_t *overlap, int64_t max_seq_length, int64_t mask_filter) {
// Get the complete adjacency string
int seq_length;
char *adjacency_string = get_adjacency_string(cap, &seq_length);
Expand All @@ -555,6 +586,16 @@ char *get_adjacency_string_and_overlap(Cap *cap, int *length, int64_t *overlap,
*length = seq_length > max_seq_length ? max_seq_length : seq_length;
assert(*length >= 0);

if (mask_filter > 0) {
// apply the mask filter on the forward strand
int unmasked_length = get_unmasked_length(adjacency_string, seq_length, *length, false, mask_filter);
if (unmasked_length < *length) {
// if anything got filtered, check the reverse strand and take the minimum, this way the same
// length is returned no matter which strand we're on
*length = get_unmasked_length(adjacency_string, seq_length, unmasked_length, true, mask_filter);
}
}

// Cleanup the string
adjacency_string[*length] = '\0'; // Terminate the string at the given length
char *c = stString_copy(adjacency_string);
Expand Down Expand Up @@ -716,7 +757,7 @@ void create_alignment_blocks(Msa *msa, Cap **row_indexes_to_caps, stList *alignm
assert(i == msa->column_no);
}

stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_t window_size) {
stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_t window_size, int64_t mask_filter) {
// Arrays of ends and connecting the strings necessary to build the POA alignment
int64_t end_no = flower_getEndNumber(flower); // The number of ends
int64_t end_lengths[end_no]; // The number of strings incident with each end
Expand Down Expand Up @@ -756,7 +797,7 @@ stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_
}
// Get the prefix of the adjacency string and its length and overlap with its reverse complement
end_strings[i][j] = get_adjacency_string_and_overlap(cap, &(end_string_lengths[i][j]),
&(overlaps[i][j]), max_seq_length);
&(overlaps[i][j]), max_seq_length, mask_filter);

// Populate the caps to end/row indices, and vice versa, data structures
indices_to_caps[i][j] = cap;
Expand Down
3 changes: 2 additions & 1 deletion bar/inc/poaBarAligner.h
Expand Up @@ -108,9 +108,10 @@ char *get_adjacency_string(Cap *cap, int *length);
* @param max_seq_length is the maximum length of the prefix of an unaligned sequence
* to attempt to align.
* @param window_size Sliding window size which limits length of poa sub-alignments. Memory usage is quardatic in this.
* @param mask_filter Trim input sequences if encountering this many consecutive soft of hard masked bases (0 = disabled)
* Returns a list of AlignmentBlock ojects
*/
stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_t window_size);
stList *make_flower_alignment_poa(Flower *flower, int64_t max_seq_length, int64_t window_size, int64_t mask_filter);

/**
* Create a pinch iterator for a list of alignment blocks.
Expand Down
4 changes: 2 additions & 2 deletions bar/tests/poaBarTest.c
Expand Up @@ -181,7 +181,7 @@ void test_make_flower_alignment_poa(CuTest *testCase) {
}
flower_destructEndIterator(endIterator);

stList *alignment_blocks = make_flower_alignment_poa(flower, 2, 1000000);
stList *alignment_blocks = make_flower_alignment_poa(flower, 2, 1000000, 5);

for(int64_t i=0; i<stList_length(alignment_blocks); i++) {
AlignmentBlock *b = stList_get(alignment_blocks, i);
Expand All @@ -194,7 +194,7 @@ void test_make_flower_alignment_poa(CuTest *testCase) {
void test_alignment_block_iterator(CuTest *testCase) {
setup(testCase);

stList *alignment_blocks = make_flower_alignment_poa(flower, 10000, 1000000);
stList *alignment_blocks = make_flower_alignment_poa(flower, 10000, 1000000, 5);

for(int64_t i=0; i<stList_length(alignment_blocks); i++) {
AlignmentBlock *b = stList_get(alignment_blocks, i);
Expand Down
24 changes: 23 additions & 1 deletion build-tools/downloadMiniTools
Expand Up @@ -13,6 +13,8 @@ set -beEu -o pipefail

miniBuildDir=$(realpath -m build-mini)
binDir=$(pwd)/bin
# just use cactusRootPath for now
dataDir=$(pwd)/src/cactus
CWD=$(pwd)

set -x
Expand Down Expand Up @@ -65,11 +67,27 @@ else
exit 1
fi

# dna-brnn
cd ${miniBuildDir}
git clone https://github.com/lh3/dna-nn.git
cd dna-nn
git checkout 2e6d242ae339457b985f50086e85194c3ce418b1
# hack in a static build
sed -i Makefile -e 's/CFLAGS=/CFLAGS+=/' -e 's/LIBS=/LIBS+=/'
CFLAGS="-static" LIBS="-static" make -j 4
if [ $(ldd dna-brnn | grep so | wc -l) -eq 0 ]
then
mv dna-brnn ${binDir}
cp models/attcc-alpha.knm ${dataDir}
else
exit 1
fi

# mzgaf2paf
cd ${miniBuildDir}
git clone https://github.com/glennhickey/mzgaf2paf.git
cd mzgaf2paf
git checkout b29f62a4a1ac1fb5abf658f55bd74607b50d2d67
git checkout c43c0b6121b95979ff484106a6c543180cdd89a2
CXXFLAGS="-static" make -j 4
if [ $(ldd mzgaf2paf | grep so | wc -l) -eq 0 ]
then
Expand All @@ -90,3 +108,7 @@ fi

cd ${CWD}
rm -rf ${miniBuildDir}

echo "--------------------------------------------------"
echo "(re)run pip install -U . to install dna-brnn model"
echo "--------------------------------------------------"
7 changes: 5 additions & 2 deletions preprocessor/Makefile
Expand Up @@ -6,7 +6,7 @@ CFLAGS += ${tokyoCabinetIncl} ${hiredisIncl}
all: all_libs all_progs
all_libs:
all_progs: all_libs
${MAKE} ${BINDIR}/cactus_analyseAssembly ${BINDIR}/cactus_batch_mergeChunks ${BINDIR}/cactus_makeAlphaNumericHeaders.py ${BINDIR}/cactus_filterSmallFastaSequences.py
${MAKE} ${BINDIR}/cactus_analyseAssembly ${BINDIR}/cactus_batch_mergeChunks ${BINDIR}/cactus_makeAlphaNumericHeaders.py ${BINDIR}/cactus_filterSmallFastaSequences.py ${BINDIR}/cactus_softmask2hardmask
cd lastzRepeatMasking && ${MAKE} all

${BINDIR}/cactus_filterSmallFastaSequences.py : cactus_filterSmallFastaSequences.py
Expand All @@ -23,7 +23,10 @@ ${BINDIR}/cactus_analyseAssembly : cactus_analyseAssembly.c ${LIBDEPENDS} ${LIBD
${BINDIR}/cactus_batch_mergeChunks : *.c ${LIBDIR}/cactusLib.a ${LIBDEPENDS}
${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/cactus_batch_mergeChunks cactus_batch_mergeChunks.c ${LIBDIR}/cactusLib.a ${LDLIBS}

${BINDIR}/cactus_softmask2hardmask : cactus_softmask2hardmask.c ${LIBDEPENDS} ${LIBDIR}/cactusLib.a
${CC} ${CPPFLAGS} ${CFLAGS} ${LDFLAGS} -o ${BINDIR}/cactus_softmask2hardmask cactus_softmask2hardmask.c ${LIBDIR}/cactusLib.a ${LDLIBS}

clean :
rm -f *.o
rm -f ${BINDIR}/cactus_analyseAssembly ${BINDIR}/cactus_batch_mergeChunks
rm -f ${BINDIR}/cactus_analyseAssembly ${BINDIR}/cactus_batch_mergeChunks ${BINDIR}/cactus_softmask2hardmask
cd lastzRepeatMasking && ${MAKE} clean
59 changes: 59 additions & 0 deletions preprocessor/cactus_softmask2hardmask.c
@@ -0,0 +1,59 @@
/*
* Copyright (C) 2009-2011 by Benedict Paten (benedictpaten@gmail.com)
*
* Released under the MIT license, see LICENSE.txt
*/

#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <time.h>
#include <getopt.h>
#include <inttypes.h>
#include <sys/types.h>
#include <sys/param.h>
#include <sys/stat.h>
#include <dirent.h>
#include <math.h>
#include <ctype.h>

#include "bioioC.h"
#include "cactus.h"

void usage() {
fprintf(stderr, "cactus_softmask2hardmask [fastaFile]\n");
}

static void hardmask(void* out_file, const char* name, const char* seq, int64_t length) {
char* uc_seq = (char*)seq;
for (int64_t i = 0; i < length; ++i) {
if (islower(uc_seq[i])) {
uc_seq[i] = 'N';
}
}
fastaWrite(uc_seq, (char*)name, (FILE*)out_file);
}

int main(int argc, char *argv[]) {
if(argc == 1) {
usage();
return 0;
}

for (int64_t j = 1; j < argc; j++) {
FILE *fileHandle;
if (strcmp(argv[j], "-") == 0) {
fileHandle = stdin;
} else {
fileHandle = fopen(argv[j], "r");
if (fileHandle == NULL) {
st_errnoAbort("Could not open input file %s", argv[j]);
}
}
fastaReadToFunction(fileHandle, stdout, hardmask);
}

return 0;
}
Expand Up @@ -25,6 +25,7 @@ def usage(s=None):
(default is to mask with lowercase)
--unmask remove any previous softmasking from sequence being masked (covert to upper case
before masking)
--minLength=<N> ignore intervals less than N
"""

if (s == None): exit (message)
Expand All @@ -41,6 +42,7 @@ def main():
maskChar = None
intervalsFile = None
unmask = False
minLength = None

for arg in argv[1:]:
if ("=" in arg):
Expand All @@ -63,6 +65,8 @@ def main():
if (len(maskChar) != 1): usage("--mask requires a single character")
elif (arg.startswith("--unmask")):
unmask = True
elif (arg.startswith("--minLength=")):
minLength = int(argVal)
elif (arg.startswith("--")):
usage("can't understand %s" % arg)
elif (intervalsFile == None):
Expand Down Expand Up @@ -104,7 +108,8 @@ def main():
continue

if (chrom not in chromToIntervals): chromToIntervals[chrom] = []
chromToIntervals[chrom] += [(start,end)]
if minLength is None or end-start >= minLength:
chromToIntervals[chrom] += [(start,end)]

f.close()

Expand Down
2 changes: 1 addition & 1 deletion setup.py
Expand Up @@ -26,7 +26,7 @@ def run(self):
packages = find_packages(where='src'),
include_package_data = True,
package_data = {
'cactus': ['*_config.xml']
'cactus': ['*_config.xml', '*.knm']
},
# We use the __file__ attribute so this package isn't zip_safe.
zip_safe = False,
Expand Down

0 comments on commit aae8094

Please sign in to comment.