Permalink
Browse files

merge development

  • Loading branch information...
glennhickey committed Dec 5, 2013
2 parents 1eb1b9f + 7415d13 commit 4f814f9b3ea4acb74f84472d94ea5abc75bcfdcf
Showing with 8,289 additions and 1,486 deletions.
  1. +67 −0 CODING_STYLE.txt
  2. +1 −1 Makefile
  3. +69 −9 README.md
  4. BIN README.pdf
  5. +7 −9 alignability/halAlignability.cpp
  6. +1 −0 api/Makefile
  7. +21 −0 api/hdf5_impl/hdf5Alignment.cpp
  8. +4 −0 api/hdf5_impl/hdf5Alignment.h
  9. +5 −0 api/hdf5_impl/hdf5BottomSegment.cpp
  10. +1 −0 api/hdf5_impl/hdf5BottomSegment.h
  11. +2 −4 api/hdf5_impl/hdf5ExternalArray.cpp
  12. +278 −66 api/hdf5_impl/hdf5Genome.cpp
  13. +9 −2 api/hdf5_impl/hdf5Genome.h
  14. +1 −1 api/hdf5_impl/hdf5MetaData.cpp
  15. +74 −78 api/hdf5_impl/hdf5Sequence.cpp
  16. +12 −20 api/hdf5_impl/hdf5Sequence.h
  17. +4 −3 api/hdf5_impl/hdf5SequenceIterator.cpp
  18. +5 −0 api/hdf5_impl/hdf5TopSegment.cpp
  19. +1 −0 api/hdf5_impl/hdf5TopSegment.h
  20. +23 −11 api/hdf5_tests/hdf5SequenceTypeTest.cpp
  21. +25 −1 api/impl/defaultBottomSegmentIterator.cpp
  22. +3 −0 api/impl/defaultBottomSegmentIterator.h
  23. +32 −10 api/impl/defaultColumnIterator.cpp
  24. +3 −1 api/impl/defaultColumnIterator.h
  25. +25 −1 api/impl/defaultGappedBottomSegmentIterator.cpp
  26. +1 −0 api/impl/defaultGappedBottomSegmentIterator.h
  27. +23 −0 api/impl/defaultGappedTopSegmentIterator.cpp
  28. +1 −0 api/impl/defaultGappedTopSegmentIterator.h
  29. +58 −37 api/impl/defaultMappedSegment.cpp
  30. +2 −2 api/impl/defaultMappedSegment.h
  31. +23 −0 api/impl/defaultSegmentIterator.cpp
  32. +2 −1 api/impl/defaultSegmentIterator.h
  33. +20 −1 api/impl/defaultTopSegmentIterator.cpp
  34. +3 −0 api/impl/defaultTopSegmentIterator.h
  35. +48 −24 api/impl/halCommon.cpp
  36. +8 −1 api/inc/halAlignment.h
  37. +0 −34 api/inc/halBottomSegmentIterator.h
  38. +12 −21 api/inc/halColumnIterator.h
  39. +16 −0 api/inc/halCommon.h
  40. +6 −0 api/inc/halCountedPtr.h
  41. +2 −1 api/inc/halDefs.h
  42. +0 −12 api/inc/halGappedBottomSegmentIterator.h
  43. +0 −14 api/inc/halGappedTopSegmentIterator.h
  44. +5 −0 api/inc/halGenome.h
  45. +10 −0 api/inc/halSegment.h
  46. +1 −1 api/inc/halSegmentedSequence.h
  47. +2 −2 api/inc/halSequence.h
  48. +0 −31 api/inc/halTopSegmentIterator.h
  49. +5 −1 api/tests/halAlignmentTest.cpp
  50. +2 −2 api/tests/halMappedSegmentTest.cpp
  51. +11 −0 assemblyHub/Makefile
  52. 0 {mask → assemblyHub}/__init__.py
  53. +54 −0 assemblyHub/alignabilityTrack.py
  54. +99 −0 assemblyHub/assemblyHubCommon.py
  55. +266 −0 assemblyHub/bedTrack.py
  56. +101 −0 assemblyHub/conservationTrack.py
  57. +56 −0 assemblyHub/gcPercentTrack.py
  58. +347 −0 assemblyHub/hal2assemblyHub.py
  59. +147 −0 assemblyHub/prepareHubFiles.py
  60. +92 −0 assemblyHub/prepareLodFiles.py
  61. +57 −0 assemblyHub/rmskTrack.py
  62. +31 −0 assemblyHub/snakeTrack.py
  63. +154 −0 assemblyHub/wigTrack.py
  64. +18 −16 chain/Makefile
  65. +0 −383 chain/hal2assemblyHub.py
  66. +0 −99 chain/impl/halBlockInterpolate.cpp
  67. +241 −33 chain/impl/halBlockViz.cpp
  68. +0 −28 chain/inc/halBlockInterpolate.h
  69. +53 −5 chain/inc/halBlockViz.h
  70. +0 −131 chain/test/blockInterpolateTest.cpp
  71. +5 −2 chain/test/blockVizBed.c
  72. +170 −0 chain/test/blockVizBenchmark.py
  73. +115 −0 chain/test/blockVizMaf.c
  74. +15 −10 chain/test/blockVizTest.c
  75. +124 −0 chain/test/blockVizTime.c
  76. +50 −7 chain/test/halChainGetBlocksTest.cpp
  77. +6 −0 chain/test/halChainGetBlocksTest.h
  78. +20 −0 chain/test/timing.sh
  79. +19 −8 extract/Makefile
  80. 0 extract/__init__.py
  81. +428 −0 extract/impl/hal4dExtract.cpp
  82. +139 −0 extract/impl/hal4dExtractMain.cpp
  83. +17 −7 extract/{ → impl}/halAlignedExtract.cpp
  84. 0 extract/{ → impl}/halExtract.cpp
  85. 0 {mask → extract}/impl/halMaskExtractMain.cpp
  86. 0 {mask → extract}/impl/halMaskExtractor.cpp
  87. +57 −0 extract/inc/hal4dExtract.h
  88. 0 {mask → extract}/inc/halMaskExtractor.h
  89. +284 −0 extract/tests/hal4dExtractTest.cpp
  90. +32 −0 extract/tests/hal4dExtractTest.h
  91. +41 −1 include.mk
  92. +12 −7 liftover/Makefile
  93. +175 −10 liftover/impl/halBedLine.cpp
  94. +36 −12 liftover/impl/halBedScanner.cpp
  95. +67 −0 liftover/impl/halBlockLiftover.cpp
  96. +12 −1 {chain → liftover}/impl/halBlockMapper.cpp
  97. +281 −161 liftover/impl/halLiftover.cpp
  98. +29 −2 liftover/impl/halLiftoverMain.cpp
  99. +264 −0 liftover/impl/halWiggleLiftover.cpp
  100. +157 −0 liftover/impl/halWiggleLiftoverMain.cpp
  101. +64 −0 liftover/impl/halWiggleLoader.cpp
  102. +241 −0 liftover/impl/halWiggleScanner.cpp
  103. +30 −0 liftover/inc/halBedLine.h
  104. +12 −6 liftover/inc/halBedScanner.h
  105. +3 −0 liftover/inc/halBlockLiftover.h
  106. +2 −0 {chain → liftover}/inc/halBlockMapper.h
  107. +9 −2 liftover/inc/halLiftover.h
  108. +49 −0 liftover/inc/halTabFacet.h
  109. +77 −0 liftover/inc/halWiggleLiftover.h
  110. +47 −0 liftover/inc/halWiggleLoader.h
  111. +62 −0 liftover/inc/halWiggleScanner.h
  112. +199 −0 liftover/inc/halWiggleTiles.h
  113. +1 −0 lod/Makefile
  114. +2 −1 lod/halLodBenchmark.py
  115. +16 −7 lod/halLodInterpolate.py
  116. +34 −3 lod/impl/halLodExtract.cpp
  117. +11 −9 lod/impl/halLodExtractMain.cpp
  118. +18 −4 lod/impl/halLodGraph.cpp
  119. +50 −28 lod/impl/halLodManager.cpp
  120. +5 −2 lod/inc/halLodExtract.h
  121. +5 −1 lod/inc/halLodManager.h
  122. +4 −3 maf/Makefile
  123. +22 −16 maf/hal2mafMP.py
  124. +46 −52 maf/impl/hal2maf.cpp
  125. +119 −0 maf/impl/halMafBed.cpp
  126. +52 −0 maf/inc/halMafBed.h
  127. +0 −23 mask/Makefile
  128. +1 −0 mutations/Makefile
  129. +40 −0 phyloP/Makefile
  130. +18 −0 phyloP/README.txt
  131. +306 −0 phyloP/halPhyloPMP.py
  132. +260 −0 phyloP/halPhyloPTrain.py
  133. +156 −0 phyloP/halTreePhyloP.py
  134. +438 −0 phyloP/impl/halPhyloP.cpp
  135. +116 −0 phyloP/impl/halPhyloPBed.cpp
  136. +273 −0 phyloP/impl/halPhyloPMain.cpp
  137. +87 −0 phyloP/inc/halPhyloP.h
  138. +51 −0 phyloP/inc/halPhyloPBed.h
  139. BIN phyloP/test/blanchette.hal
  140. +10 −0 phyloP/test/blanchette.mod
  141. +3 −0 phyloP/test/test.sh
  142. +1 −0 stats/Makefile
  143. +18 −2 stats/halStats.py
  144. +126 −2 stats/impl/halStatsMain.cpp
View
@@ -0,0 +1,67 @@
+Please adhere to these style conventions for source code you will push (or send pull requests for) into the HAL repository. They should be pretty obvious by looking at any of the existing source code...
+
+
+indent: 2 spaces (no tab characters)
+
+curly braces: on their own lines, no indent. ex:
+
+if (x > y)
+{
+ return true;
+}
+
+class names, struct names and other types: first character capitalized
+
+function names and variable names: first character lower case (ex variableNameOne)
+
+member variables: begin with _
+
+multiword names: capitalize subsequent words (ie no underscore) (ex ClassNameOne, variableNameOne)
+
+inline functions: not defined in class definition. ex:
+
+class X
+{
+ void f();
+};
+
+inline void X::f()
+{
+
+}
+
+maximum line width: 80 characters
+
+never use "using namespace" in a header file
+
+--Glenn
+
+
+.emacs for the above:
+
+(add-to-list 'auto-mode-alist '("\\.h\\'" . c++-mode))
+
+(c-add-style "mycodingstyle"
+ '((c-comment-only-line-offset . 0)
+ (c-hanging-braces-alist . ((substatement-open beforeafter)))
+ (c-offsets-alist . ((topmost-intro . 0)
+ (topmost-intro-cont . 0)
+ (substatement . 3)
+ (substatement-open . 0)
+ (statement-case-open . 3)
+ (statement-cont . 3)
+ (access-label . -3)
+ (inclass . 3)
+ (inline-open . 3)
+ (innamespace . 0)
+ ))))
+
+;; c/c++ mode
+(add-hook 'c-mode-common-hook
+ '(lambda()
+ (c-set-style "mycodingstyle")
+ (setq tab-width 2)
+ (setq c-basic-offset tab-width)
+ (setq tab-width 8
+ ;; this will make sure spaces are used instead of tabs
+ indent-tabs-mode nil)))
View
@@ -1,5 +1,5 @@
# order is important, libraries first
-modules = api stats randgen validate mutations maf extract fasta alignability lod chain liftover mask analysis
+modules = api stats randgen validate mutations fasta alignability liftover lod maf chain extract analysis phyloP assemblyHub
.PHONY: all %.all clean %.clean doxy %.doxy
View
@@ -1,6 +1,6 @@
-Hierarchical Alignment (HAL) Format API (v1.4)
+Hierarchical Alignment (HAL) Format API (v2.1)
=====
-Copyright (C) 2012 by Glenn Hickey (hickey@soe.ucsc.edu)
+Copyright (C) 2012 - 2013 by Glenn Hickey (hickey@soe.ucsc.edu)
Released under the MIT license, see LICENSE.txt
HAL is a structure to efficiently store and index multiple genome alignments and ancestral reconstructions. HAL is a graph-based representation which provides several advantages over matrix/block-based formats such as MAF, including improved scalability and the ability to perform queries with respect to an arbitrary reference or subtree.
@@ -13,6 +13,14 @@ Glenn Hickey, Benedict Paten, Dent Earl, Daniel Zerbino, and David
Haussler. HAL: A Hierarchical Format for Storing and Analyzing
Multiple Genome Alignments. Bioinformatics. 2013. [Advance Online Access](http://bioinformatics.oxfordjournals.org/content/early/2013/03/16/bioinformatics.btt128.abstract)
+Code Contributors
+-----
+* Glenn Hickey (UCSC)
+* Joel Armstrong (UCSC)
+* Ngan Nguyen (UCSC)
+* Benedict Paten (UCSC)
+* Melissa Jane Hubisz (Cornell)
+
Installation
-----
@@ -26,6 +34,10 @@ From the parent directory of where you want HAL installed:
git clone git://github.com/glennhickey/hal.git
+#### Progressive Cactus Package
+
+Note that HAL can also be downloaded and installed (automatically along with all its dependencies) as part of the [Progressive Cactus installation package](https://github.com/glennhickey/progressiveCactus)
+
### Installing Dependencies
#### HDF5 1.8 with C++ API enabled
@@ -73,24 +85,54 @@ to reflect the directory where you installed sonLib
Define ENABLE_UDC before making, and specify the path of the Kent source tree using KENTSRC. When built with this enabled, all HAL files opened read-only will be accessed using UDC which supports both local files and URLs.
- `export ENABLE_UDC=1`
- `export KENTSRC=<path to top level of Kent source tree>`
+ export ENABLE_UDC=1
+ export KENTSRC=<path to top level of Kent source tree>
Those without the UCSC genome browser already installed locally will probably find it simpler to first mount URLs with [HTTPFS](http://httpfs.sourceforge.net/) before opening with HAL.
+#### Optional support of PhyloP evolutionary constraint annotation
+
+PhyloP is part of the [Phast Package](http://compgen.bscb.cornell.edu/phast/), and can be used to test for genomic positions that are under selective pressure. We are working on prototype support for running PhyloP on HAL files. In order to enable this support, Phast must be installed. We recommend downloading the latest source using Subversion.
+
+From the same parent directory where you downloaded HAL:
+
+* First install CLAPACK (Linux only)
+
+ `wget http://www.netlib.org/clapack/clapack.tgz`
+ `tar -xvzf clapack.tgz`
+ `mv CLAPACK-3.2.1 clapack`
+ `cd clapack`
+ `cp make.inc.example make.inc && make f2clib && make blaslib && make lib`
+ ``export CLAPACKPATH=`pwd` ``
+ `cd ..`
+
+* Install Phast (Mac or Linux)
+
+ `svn co http://compgen.bscb.cornell.edu/svnrepo/phast/trunk phast/`
+ `cd phast`
+ ``export PHAST=`pwd` ``
+ `cd src && make`
+ `cd ../..`
+
+* Before building HAL
+
+ `export ENABLE_PHYLOP=1`
+
+Special thanks to Melissa Jane Hubisz and Adam Siepel from Cornell University for their work on extending their tools to work with HAL.
+
### Building HAL
From the hal/ directory:
- `make`
+ make
Before using HAL, add it to your path:
- `export PATH=<path to hal>/bin:${PATH}`
+ export PATH=<path to hal>/bin:${PATH}
The parent directory of hal/ should be in your PYTHONPATH in order to use any of the Python functionality. This includes running `make test`
- `export PYTHONPATH=<parent of hal>:${PYTHONPATH}`
+ export PYTHONPATH=<parent of hal>:${PYTHONPATH}
HAL Tools
-----
@@ -233,14 +275,18 @@ Annotations in [BED](http://genome.ucsc.edu/FAQ/FAQformat.html#format1), ie tab-
SequenceName StartPosition LastPosition+1
-can be lifted over between genomes using `halLiftover`. halLiftover does a base-by-base mapping between any two sequences in the alignment (following paralogy relations as well).
+can be lifted over between genomes using `halLiftover`. halLiftover does a base-by-base mapping between any two sequences in the alignment (following paralogy relations as well). The output is written in BED (default) or PSL format.
halLiftover mammals.hal human human_annotation.bed dog dog_annotation.bed
will map all annotations in human_annotation.bed, which must refer to sequences in the human genome, to their corresponding locations in dog (if they exist), outputting the resulting annotations in dog_annotation.bed
halLiftover attempts to autodetect the BED version of the input. This can be overridden with the `--inBedVersion` option. Columns that are not described in the official BED specs can be optionally mapped as-is using the `--keepExtra` option.
+By default, halLiftover uses spaces and/or tabs to separate columns. To use only tabs (ie to allow spaces within names), use the `--tab` option.
+
+Annotations in [Wiggle](http://genome.ucsc.edu/goldenPath/help/wiggle.html) format can likewise be mapped using `halWiggleLiftover`
+
#### Alignability
The number of distinct genomes different bases of a set of target genomes align to can be computed using the `halAlignability` tool. The output is in `.wig` format.
@@ -269,8 +315,22 @@ Two bed files must be specified because the coordinates of inserted (and by conv
Point mutations can optionally be written using the `--snpFile <file>` option. The '--maxGap' and '--maxNFraction' options can specify the gap indel threshold and missing data threshold, respectively, as described above in the *halSummarizeMutations* section.
+### Constrained Element Prediction
+
+(Under development)
+
+PhyloP is part of the [Phast Package](http://compgen.bscb.cornell.edu/phast/), and can be used to test for genomic positions that are under selective pressure. We are working on prototype support for running PhyloP on HAL files.
+
+* Train a neutral model
+
+ See `halPhyloPTrain.py`
+
+* Detect constrained elements
+
+ See `halPhyloPMP.py`
+
+Special thanks to Melissa Jane Hubisz and Adam Siepel from Cornell University for their work on extending their tools to work with HAL.
-### Importing from other formats
Example of HAL Genome Representation
-----
View
Binary file not shown.
@@ -240,6 +240,10 @@ void printSequence(ostream& outStream, const Sequence* sequence,
hal_size_t start, hal_size_t length, hal_size_t step)
{
hal_size_t seqLen = sequence->getSequenceLength();
+ if (seqLen == 0)
+ {
+ return;
+ }
/** If the length is 0, we do from the start position until the end
* of the sequence */
if (length == 0)
@@ -259,12 +263,6 @@ void printSequence(ostream& outStream, const Sequence* sequence,
const Genome* genome = sequence->getGenome();
string sequenceName = sequence->getName();
string genomeName = genome->getName();
- /** A hack to take the genome name out of the chromosome name. Should
- * be largely unnecessary now that our naming conventions are better */
- if (sequenceName.find(genomeName + '.') == 0)
- {
- sequenceName = sequenceName.substr(genomeName.length() + 1);
- }
/** The ColumnIterator is fundamental structure used in this example to
* traverse the alignment. It essientially generates the multiple alignment
@@ -388,9 +386,9 @@ void printGenome(ostream& outStream,
start < seqStart + seqLen &&
runningLength < length)
{
- hal_size_t readStart = seqStart >= start ? 0 : seqStart - start;
- hal_size_t readLen = std::min(seqLen - start, length - runningLength);
-
+ hal_size_t readStart = seqStart >= start ? 0 : start - seqStart;
+ hal_size_t readLen = min(seqLen - readStart, length);
+ readLen = min(readLen, length - runningLength);
printSequence(outStream, sequence, targetSet, readStart, readLen, step);
runningLength += readLen;
}
View
@@ -19,6 +19,7 @@ doxy :
${libPath}/halLib.a : ${libSources} ${libHeaders} ${libInternalHeaders} ${basicLibsDependencies}
cp ${libHeaders} ${libPath}/
+ rm -f *.o
${cpp} ${cppflags} -I inc -I hdf5_impl -I impl -I ${libPath}/ -c ${libSources}
ar rc halLib.a *.o
ranlib halLib.a
@@ -402,6 +402,27 @@ string HDF5Alignment::getParentName(const string& name) const
return stTree_getLabel(parent);
}
+
+void HDF5Alignment::updateBranchLength(const string& parentName,
+ const string& childName,
+ double length)
+{
+ map<string, stTree*>::iterator findIt = _nodeMap.find(childName);
+ if (findIt == _nodeMap.end())
+ {
+ throw hal_exception(string("node ") + childName + " not found");
+ }
+ stTree* node = findIt->second;
+ stTree* parent = stTree_getParent(node);
+ if (parent == NULL || parentName != stTree_getLabel(parent))
+ {
+ throw hal_exception(string("edge ") + parentName + "--" + childName +
+ " not found");
+ }
+ stTree_setBranchLength(node, length);
+ _dirty = true;
+}
+
double HDF5Alignment::getBranchLength(const string& parentName,
const string& childName) const
{
@@ -55,6 +55,10 @@ class HDF5Alignment : public Alignment
std::string getParentName(const std::string& name) const;
+ void updateBranchLength(const std::string& parentName,
+ const std::string& childName,
+ double length);
+
double getBranchLength(const std::string& parentName,
const std::string& childName) const;
@@ -110,6 +110,11 @@ bool HDF5BottomSegment::isMissingData(double nThreshold) const
return false;
}
+void HDF5BottomSegment::print(std::ostream& os) const
+{
+ os << "HDF5 Bottom Segment";
+}
+
// HDF5 SPECIFIC
H5::CompType HDF5BottomSegment::dataType(hal_size_t numChildren)
{
@@ -62,6 +62,7 @@ class HDF5BottomSegment : public BottomSegment
const std::set<const Genome*>* genomesOnPath,
bool doDupes,
hal_size_t minLength) const;
+ void print(std::ostream& os) const;
// BOTTOM SEGMENT INTERFACE
hal_size_t getNumChildren() const;
@@ -128,14 +128,12 @@ void HDF5ExternalArray::load(CommonFG* file, const H5std_string& path,
// create the internal data buffer
_bufSize = _chunkSize > 1 ? _chunkSize : _size;
- _bufStart = 0;
_bufEnd = _bufStart + _bufSize - 1;
+ // set out of range to ensure page happens
+ _bufStart = _bufEnd + 1;
delete [] _buf;
_buf = new char[_bufSize * _dataSize];
- // fill buffer from disk
- page(0);
-
assert(_bufSize > 0 || _size == 0);
}
Oops, something went wrong.

0 comments on commit 4f814f9

Please sign in to comment.