From 1b73c7cefd9730c121cabfb39d48300789b4f889 Mon Sep 17 00:00:00 2001 From: Glenn Hickey Date: Thu, 16 Mar 2023 11:40:34 -0400 Subject: [PATCH] add temp N filter for maf reflines in hal2maf --- build-tools/downloadMafTools | 2 +- doc/progressive.md | 4 +++- src/cactus/maf/cactus_hal2maf.py | 9 ++++++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/build-tools/downloadMafTools b/build-tools/downloadMafTools index e6e83f101..ac30273b3 100755 --- a/build-tools/downloadMafTools +++ b/build-tools/downloadMafTools @@ -55,7 +55,7 @@ fi cd ${mafBuildDir} git clone https://github.com/ComparativeGenomicsToolkit/mafTools.git cd mafTools -git checkout 40cfa5b503a34b8b0b7799678237e2f13ae8bf36 +git checkout 0d2a253a528749bad2c6c0179bd15edd8d56adf6 find . -name "*.mk" | xargs sed -ie "s/-Werror//g" find . -name "Makefile*" | xargs sed -ie "s/-Werror//g" # hack in flags support diff --git a/doc/progressive.md b/doc/progressive.md index 9be688c98..b4202b44c 100644 --- a/doc/progressive.md +++ b/doc/progressive.md @@ -143,7 +143,9 @@ cactus-hal2chain ./js ./evolverMammals.hal chains-dir --refGenome simHuman_chr6 will create `./chains-dir` and populate it with a Chain alignment between simHuman and each other leaf genome in evolverMammals.hal. -By default, chains will be created using `halLiftover` [as in CAT](https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit/blob/fc1623da5df1309d2e2f0b9bb0363aaab84708f4/cat/chaining.py#L96-L98). An option `--useHalSynteny` is provided to use that tool instead. +By default, chains will be created using `halLiftover` [as in CAT](https://github.com/ComparativeGenomicsToolkit/Comparative-Annotation-Toolkit/blob/fc1623da5df1309d2e2f0b9bb0363aaab84708f4/cat/chaining.py#L96-L98). An option `--useHalSynteny` is provided to use that tool instead. + +See here for an all-vs-all script to make chains, including BigChain conversion: https://github.com/human-pangenomics/HPRC_Assembly_Hub/blob/main/chains/wdl/snakesonachain.wdl ### CAT diff --git a/src/cactus/maf/cactus_hal2maf.py b/src/cactus/maf/cactus_hal2maf.py index 636b2f4dc..b1d37c117 100644 --- a/src/cactus/maf/cactus_hal2maf.py +++ b/src/cactus/maf/cactus_hal2maf.py @@ -98,7 +98,12 @@ def main(): parser.add_argument("--keepGapCausingDupes", help="Turn off taffy norm -d filter that removes duplications that would induce gaps > maximumGapLength", action="store_true") - + + parser.add_argument("--maxRefNFrac", + help="(hopefully temporary) partial work around of a current bug that aligns through Ns by filtering out MAF blocks whose reference (first) line has a greater fraction of Ns than the given amount. Should be between 0.0 (filter everything) and 1.0 (filter nothing). [default=0.75]", + type=float, + default=0.75) + #Progressive Cactus Options parser.add_argument("--configFile", dest="configFile", help="Specify cactus configuration file", @@ -301,6 +306,8 @@ def taf_cmd(hal_path, chunk, chunk_num, options): cmd += ' | {} taffy norm -k -m {} -n {} {} -q {}{} 2> {}.tn.time'.format(time_cmd, options.maximumBlockLengthToMerge, options.maximumGapLength, '' if options.keepGapCausingDupes else '-d', options.fractionSharedRows, time_end, chunk_num) + if options.maxRefNFrac: + cmd += ' | mafFilter -m - -N {}'.format(options.maxRefNFrac) if options.dupeMode == 'single': cmd += ' | mafDuplicateFilter -m - -k' if chunk[1] != 0: