Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
rob-p committed Feb 10, 2018
2 parents bb0d3aa + 77cc0da commit fdb7dcc
Show file tree
Hide file tree
Showing 105 changed files with 25,207 additions and 5,561 deletions.
27 changes: 23 additions & 4 deletions CMakeLists.txt
@@ -1,15 +1,34 @@
cmake_minimum_required(VERSION 3.0)
cmake_minimum_required(VERSION 3.1)

project(pufferfish)

set(RBF_CPP_FLAGS "-pthread -std=c++11 -W -Wall -Wextra -Wpointer-arith -Wunused -Wwrite-strings -openmp -Wno-unknown-pragmas")
set(OPT_FLAGS "-O3 -DNDEBUG -funroll-loops -mmmx -msse -msse2 -msse3 -msse4 -msse4.2 -march=native -fno-strict-aliasing")
list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake")

# We require C++11
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)

set(KSW_FLAGS "-march=native -DHAVE_KALLOC")
set(RBF_CPP_FLAGS "-g -mbmi2 -msse4 -pthread -std=c++11 -W -Wall -Wextra -Wpointer-arith -Wunused -Wwrite-strings -openmp -Wno-unknown-pragmas -Wno-unused-function")
set(WARN_ALL_THINGS "-fdiagnostics-color=always -Wall -Wcast-align -Wcast-qual -Wconversion -Wctor-dtor-privacy -Wdisabled-optimization -Wdouble-promotion -Wextra -Wformat=2 -Winit-self -Wlogical-op -Wmissing-declarations -Wmissing-include-dirs -Wno-sign-conversion -Wnoexcept -Wold-style-cast -Woverloaded-virtual -Wpedantic -Wredundant-decls -Wshadow -Wstrict-aliasing=1 -Wstrict-null-sentinel -Wstrict-overflow=5 -Wswitch-default -Wundef -Wno-unknown-pragmas -Wuseless-cast")

#set(WARN_ALL_THINGS "-fdiagnostics-color=always -Wall -Wcast-align -Wcast-qual -Wconversion -Wctor-dtor-privacy -Wdisabled-optimization -Wdouble-promotion -Wduplicated-branches -Wduplicated-cond -Wextra -Wformat=2 -Winit-self -Wlogical-op -Wmissing-declarations -Wmissing-include-dirs -Wno-sign-conversion -Wnoexcept -Wnull-dereference -Wold-style-cast -Woverloaded-virtual -Wpedantic -Wredundant-decls -Wrestrict -Wshadow -Wstrict-aliasing=1 -Wstrict-null-sentinel -Wstrict-overflow=5 -Wswitch-default -Wundef -Wno-unknown-pragmas -Wuseless-cast")


set(OPT_FLAGS "-O3 -ffast-math -fPIC -DNDEBUG -funroll-loops -mmmx -msse -msse2 -msse3 -msse4 -msse4.2 -march=native -fno-strict-aliasing")
#set(OPT_FLAGS "-fprofile-generate -O3 -DNDEBUG -funroll-loops -mmmx -msse -msse2 -msse3 -msse4 -msse4.2 -march=native -fno-strict-aliasing")
#set(OPT_FLAGS "-fprofile-use -fprofile-correction -O3 -DNDEBUG -funroll-loops -mmmx -msse -msse2 -msse3 -msse4 -msse4.2 -march=native -fno-strict-aliasing")
set(DEBUG_FLAGS "-pg -g -gstabs")

set(RBF_CPP_FLAGS "${RBF_CPP_FLAGS} ${OPT_FLAGS}")
set(RBF_CPP_FLAGS "${KSW_FLAGS} ${RBF_CPP_FLAGS} ${OPT_FLAGS}")

set(CMAKE_CXX_FLAGS ${RBF_CPP_FLAGS})

find_package(JeMalloc)
if(JEMALLOC_FOUND)
include_directories(SYSTEM ${JEMALLOC_INCLUDE_DIRS})
endif()

include_directories(include)
link_directories(lib)
add_subdirectory(src)
81 changes: 74 additions & 7 deletions README.md
@@ -1,4 +1,8 @@
# What is pufferfish?
# Index
* [What is pufferfish?](#whatis)
* [Using pufferfish?](#using)

# What is pufferfish? <a name="whatis"></a>

**short answer** : Pufferfish is a new time and memory-efficient data structure for indexing a compacted, colored de Bruijn graph (ccdBG). You can read more about pufferfish in our [pre-print on bioRxiv](https://www.biorxiv.org/content/early/2017/09/21/191874).

Expand All @@ -10,16 +14,79 @@ While existing hash-based indices based on the cdBG (and ccdBG) are very efficie
**about pufferfish development:**
Currently, Pufferfish is the software implementing this efficient ccdBG index, and allowing point (i.e., k-mer) queries. Pufferfish is under active development, but we want to be as open (and as useful to as many people) as possible early on. However, we are also in the process of building higher-level tools (e.g., read mappers and aligners) around this index, so stay tuned!


**branches:**
The **master** branch of pufferfish is _not_ necessarily stable, but it should, at any given time contain a working version of the index. That is, breaking changes should not be pushed to master. The **develop** branch of pufferfish is guaranteed to be neither stable nor working at any given point, but a best-faith effort will be made to not commit broken code to this branch. For feature branches, all bets are off.

For more details about pufferfish, please check out our [pre-print on bioRxiv](https://www.biorxiv.org/content/early/2017/09/21/191874), as well as the (updating) blog post [here](http://robpatro.com/blog/?p=494).

# Building Pufferfish <a name="building"></a>
**Dependency:**
Pufferfish depends on sdsl-lite, to install from the pufferfish root directory execute following,
>git clone https://github.com/simongog/sdsl-lite.git
Building pufferfish depends on `sdsl-lite`, which we explain how to install during the building process.

>cd sdsl-lite
To build the pufferfish do the following,

```
>git clone git@github.com:COMBINE-lab/pufferfish.git
> cd pufferfish
>git clone https://github.com/simongog/sdsl-lite.git
>cd sdsl-lite
>./install.sh ../
> cd ..
> mkdir build
> cd build
> cmake ../
> make
```

# Using Pufferfish <a name="using"></a>

**branches:**
The **master** branch of pufferfish is _not_ necessarily stable, but it should, at any given time contain a working version of the index. That is, breaking changes should not be pushed to master. The **develop** branch of pufferfish is guaranteed to be neither stable nor working at any given point, but a best-faith effort will be made to not commit broken code to this branch. For feature branches, all bets are off.
**External Dependency:**

For more details about pufferfish, please check out our [pre-print on bioRxiv](https://www.biorxiv.org/content/early/2017/09/21/191874), as well as the (updating) blog post [here](http://robpatro.com/blog/?p=494).
In Pufferfish index building pipeline we use [TwoPaCo](https://github.com/medvedevgroup/TwoPaCo) to build the compacted de Bruijn graph from the list of references.
Later, Pufferfish builds the index on top of this compacted de Bruijn graph.

So before running the whole pipeline of index building, make sure you have already installed TwoPaCo.

## Core Pipeline
Having a set of reference fasta files or a concatenated fasta file which contains all the references, one can build the pufferfish index setting the required arguments in `config.json` and running the command `bash index.sh` in pufferfish directory.

The commands in the `index.sh` file go through the pipeline of "*fixFasta -> TwoPaCo juntion finding -> TwoPaCo dump -> pufferize -> pufferfish index*" and perform the following steps:
1. **FixFasta**
```
<Pufferfish Directory>/build/src/fixFasta -i <input_fasta> -o <output_fixed_fasta>
```
2. **TwoPaCo Junction Finding**
```
<TwoPaCo Directory>/graphconstructor/twopaco -k <ksize> -t <numOfThreads> -f <filterSize> <fixed_fasta> --outfile <deBruijnGraph.bin> --tmpdir <tmp directory>
```
3. **TwoPaCo Dumping**
```
<TwoPaCo Directory>/graphdump/graphdump -k <ksize> -s <fixed_fasta> -f gfa1 <deBruijnGraph.bin> > <gfa_file>
```
4. **Pufferize**
```
<Pufferfish Directory>/build/src/pufferize -k <ksize> -g <gfa_file> -f <fixed_fasta> -o <pufferized_gfa_file>
```
5. **Build Pufferfish Index**
```
<Pufferfish Directory>/build/src/pufferfish index -k <ksize> -g <pufferized_gfa_file> -r <fixed_fasta> -o <pufferfish index directory>
```

Good news is you can run the whole pipeline by just setting the required arguments in file `config.json` and then run `bash index.sh` in the root directory of this repository, pufferfish.

## Using Pufferfish with BCALM2

You can use pufferfish with the unitig file provided by [BCALM2](https://github.com/GATB/bcalm). Once you have downloaded and built bcalm, you can run it on your reference sequence file to produce a list of compacted unitigs like so:

```
bcalm -abundance-min 1 -max-memory <mem_in_MB> -nb-cores <cores_to_use> -in reference.fa -out out_prefix -k <k>
```

This will generate, in addition to some other files, a file called `out_prefix.unitigs.fa`. These are, unfortunately, not quite in the format required by pufferfish yet (unitigs can span reference boundaries). To fix this, we must `pufferize` the file. We can do that as such:

```
bcalm_pufferizer -k <k> -r reference.fa -u out_prefix.unitigs.fa
```

This will create a file called `out_prefix.unitigs.fa.pufferized.gfa`, on which you can then build the pufferfish index.
13 changes: 13 additions & 0 deletions config.json
@@ -0,0 +1,13 @@
{
"ksize":"31",
"pufferfish":"/home/fatemeh/projects/pufferfish",
"twopaco":"/home/fatemeh/projects/TwoPaCo",
"input": {
"is_input_a_directory_to_fasta_files":false,
"input_fasta":"example/gencode.v25.pc_transcripts.fa"
},
"output_dir":"example/output",
"tmp_dir":"example/tmp",
"num_of_threads":16,
"twopaco_filter_size":"estimate"
}
49 changes: 26 additions & 23 deletions include/CanonicalKmer.hpp
Expand Up @@ -2,15 +2,18 @@
#define __CANONICAL_KMER_HPP__

#include "jellyfish/mer_dna.hpp"
#include "Kmer.hpp"

// NO_MATCH => two k-mers k1, and k2 are distinct such that k1 != k2 and rc(k1)
// != k2
// IDENTITY_MATCH => k1 = k2
// TWIN_MATCH => rc(k1) = k2
enum class KmerMatchType : uint8_t { NO_MATCH = 0, IDENTITY_MATCH, TWIN_MATCH };

using my_mer = jellyfish::mer_dna_ns::mer_base_static<uint64_t, 1>;

namespace kmers = combinelib::kmers;
using my_mer = kmers::Kmer<32,1>;//jellyfish::mer_dna_ns::mer_base_static<uint64_t, 1>;
//using my_mer2 = jellyfish::mer_dna_ns::mer_base_static<uint64_t, 1>;

/**
* This class wraps a pair of jellifish k-mers
* (i.e., mer_dna objects). It maintains both a
Expand All @@ -30,7 +33,7 @@ class CanonicalKmer {
CanonicalKmer(const CanonicalKmer& other) = default;
CanonicalKmer& operator=(CanonicalKmer& other) = default;

static inline void k(int kIn) { my_mer::k(kIn); }
static inline void k(int kIn) { my_mer::k(kIn);}
static inline int k() { return my_mer::k(); }

inline bool fromStr(const std::string& s) {
Expand All @@ -39,8 +42,8 @@ class CanonicalKmer {
return false;
}
for (size_t i = 0; i < k; ++i) {
fw_.shift_right(s[i]);
rc_.shift_left(my_mer::complement(s[i]));
fw_.prepend(s[i]);
rc_.append(kmers::complement(s[i]));
}
return true;
}
Expand All @@ -49,15 +52,15 @@ class CanonicalKmer {
auto k = my_mer::k();
// if (s.length() < k) { return false; }
for (size_t i = 0; i < k; ++i) {
fw_.shift_right(s[i]);
rc_.shift_left(my_mer::complement(s[i]));
fw_.prepend(s[i]);
rc_.append(kmers::complement(s[i]));
}
return true;
}

inline void fromNum(uint64_t w) {
fw_.word__(0) = w;
rc_ = fw_.get_reverse_complement();
rc_ = fw_.getRC();
}

inline void swap(){
Expand All @@ -69,30 +72,30 @@ class CanonicalKmer {

inline bool isFwCanonical() const { return fw_ < rc_; }

inline auto shiftFw(int c) -> decltype(this->fw_.shift_right(c)) {
rc_.shift_left(my_mer::complement(c));
return fw_.shift_right(c);
inline auto shiftFw(int c) -> decltype(this->fw_.prepend(c)) {
rc_.append(kmers::complement(c));
return fw_.prepend(c);
}

inline auto shiftBw(int c) -> decltype(this->fw_.shift_left(c)) {
rc_.shift_right(my_mer::complement(c));
return fw_.shift_left(c);
inline auto shiftBw(int c) -> decltype(this->fw_.append(c)) {
rc_.prepend(kmers::complement(c));
return fw_.append(c);
}

inline auto shiftFw(char c) -> decltype(this->fw_.shift_right(c)) {
int x = my_mer::code(c);
inline auto shiftFw(char c) -> decltype(kmers::charForCode(this->fw_.prepend(c))) {
int x = kmers::codeForChar(c);
if (x == -1)
return 'N';
rc_.shift_left(my_mer::complement(x));
return my_mer::rev_code(fw_.shift_right(x));
rc_.append(kmers::complement(x));
return kmers::charForCode(fw_.prepend(x));
}

inline auto shiftBw(char c) -> decltype(this->fw_.shift_left(c)) {
int x = my_mer::code(c);
inline auto shiftBw(char c) -> decltype(kmers::charForCode(this->fw_.append(c))) {
int x = kmers::codeForChar(c);
if (x == -1)
return 'N';
rc_.shift_right(my_mer::complement(x));
return my_mer::rev_code(fw_.shift_left(x));
rc_.prepend(kmers::complement(x));
return kmers::charForCode(fw_.append(x));
}

inline uint64_t getCanonicalWord() const {
Expand Down Expand Up @@ -124,7 +127,7 @@ class CanonicalKmer {
}

inline std::string to_str() const {
std::string s = fw_.to_str();
std::string s = fw_.toStr();
std::reverse(s.begin(), s.end());
return s;
}
Expand Down
28 changes: 25 additions & 3 deletions include/CanonicalKmerIterator.hpp
Expand Up @@ -10,6 +10,7 @@
#include <iterator>

namespace pufferfish {
namespace kmers = combinelib::kmers;
// class CanonicalKmerIterator : public std::iterator<std::input_iterator_tag,
// std::pair<CanonicalKmer, int>, int> {
class CanonicalKmerIterator
Expand Down Expand Up @@ -47,7 +48,7 @@ class CanonicalKmerIterator
// j is the last nucleotide in the k-mer we're building
for (; j < static_cast<int>(s_.length()); ++j) {
// get the code for the last nucleotide, save it as c
int c = my_mer::code(s_[j]);
int c = kmers::codeForChar(s_[j]);
// c is a valid code if != -1
if (c != -1) {
p_.first.shiftFw(c);
Expand All @@ -66,6 +67,7 @@ class CanonicalKmerIterator
}

public:
inline stx::string_view seq() { return s_; }
// use: ++iter;
// pre:
// post: *iter is now exhausted
Expand All @@ -76,7 +78,7 @@ class CanonicalKmerIterator
if (!invalid_) {
find_next(p_.second, lpos - 1);
/** --- implementation that doesn't skip non-{ACGT}
int c = my_mer::code(s_[lpos]);
int c = kmers::codeForChar(s_[lpos]);
if (c!=-1) { km_.shiftFw(c); } else { lastinvalid_ = pos_ + k_; }
++pos_;
*/
Expand All @@ -93,6 +95,20 @@ class CanonicalKmerIterator
return tmp;
}

// use: iter += constant int;
// pre:
// post: *iter is now exhausted
// OR *iter is the next valid pair of kmer and location after advancing
inline CanonicalKmerIterator& operator+=(int advance) {
//CanonicalKmerIterator tmp(*this) ;
while(advance > 0){
operator++() ;
advance-- ;
}
return *this;
}


// use: val = (a == b);
// pre:
// post: (val == true) if a and b are both exhausted
Expand Down Expand Up @@ -120,6 +136,12 @@ class CanonicalKmerIterator
// post: km will be (*iter).first, i will be (*iter).second
inline pointer operator->() { return &(operator*()); }

void jumpTo(int pos) {
lastinvalid_ = pos-1;
find_next(pos-1,(pos-1));
}


private:
/*
// use: find_next(i,j, last_valid);
Expand All @@ -134,7 +156,7 @@ bool valid{false};
// j is the last nucleotide in the k-mer we're building
while (j < s_.length()) {
// get the code for the last nucleotide, save it as c
int c = my_mer::code(s_[j]);
int c = kmers::codeForChar(s_[j]);
// c is a valid code if != -1
if (c != -1) {
km_.shiftFw(c);
Expand Down
14 changes: 14 additions & 0 deletions include/CommonTypes.hpp
@@ -0,0 +1,14 @@
#ifndef __PUFFERFISH_COMMON_TYPES_HPP__
#define __PUFFERFISH_COMMON_TYPES_HPP__

#include "core/range.hpp"

namespace pufferfish {
namespace common_types {
using ReferenceID = size_t;
template <typename T>
using IterRange = core::range<typename T::iterator>;
}
}

#endif // __PUFFERFISH_COMMON_TYPES_HPP__

0 comments on commit fdb7dcc

Please sign in to comment.