diff --git a/.github/workflows/docker-build.yml b/.github/workflows/docker-build.yml new file mode 100644 index 0000000..7579061 --- /dev/null +++ b/.github/workflows/docker-build.yml @@ -0,0 +1,62 @@ +name: Docker Build and Test + +on: + push: + branches: + - main + pull_request: + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - name: Delete huge unnecessary tools folder + run: rm -rf /opt/hostedtoolcache + + - name: Checkout repository + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Cache build directory + uses: actions/cache@v3 + with: + path: build + key: ${{ runner.os }}-build-${{ github.sha }} + restore-keys: | + ${{ runner.os }}-build- + + - name: Set lowercase repository owner + id: repo_owner + run: echo "::set-output name=owner::${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]' + + - name: Build Docker image + uses: docker/build-push-action@v4 + with: + context: . + load: true + tags: ghcr.io/${{ steps.repo_owner.outputs.owner }}/build_test:latest + cache-from: type=gha + cache-to: type=gha,mode=max + build-args: | + BUILDKIT_INLINE_CACHE=1 + + - name: Extract build directory from Docker image + run: | + container_id=$(docker create ghcr.io/${{ steps.repo_owner.outputs.owner }}/build_test:latest) + docker cp $container_id:/rawhash2/build ./build + docker rm $container_id + + - name: Run Docker container + run: docker run --rm ghcr.io/${{ steps.repo_owner.outputs.owner }}/build_test:latest -h diff --git a/.gitignore b/.gitignore index 182fe16..0bcb25d 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,9 @@ bin/ +build/ +run_dir/ +.venv/ +example_out/ +__pycache__/ test/data/d1_* test/data/d2_* @@ -72,7 +77,7 @@ test/eval/*.summary */.DS_Store test/*.time test/*.idx -src/*.o +#src/*.o extern/pod5* @@ -86,4 +91,4 @@ test/evaluation/read_mapping/*parameters/ test/evaluation/read_mapping/s_modify.py extern/tensorflow/tflite_build/ -test/evaluation/read_mapping/*.features \ No newline at end of file +test/evaluation/read_mapping/*.features diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..8c94148 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,4 @@ +cmake_minimum_required(VERSION 3.10) +project(RawHash2) + +add_subdirectory(src) diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c298825 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM gcc:latest + +RUN apt-get update && apt-get install -y \ + cmake make mold ccache \ + && rm -rf /var/lib/apt/lists/* + +WORKDIR /rawhash2 +COPY . /rawhash2 + +RUN mkdir -p build && cd build \ + && cmake .. \ + && make -j 3 + +ENTRYPOINT ["./build/bin/rawhash2"] + +LABEL Name=rawhash2 Version=0.0.1 diff --git a/Makefile b/Makefile deleted file mode 100644 index 56ee9df..0000000 --- a/Makefile +++ /dev/null @@ -1,29 +0,0 @@ -.PHONY: all subset clean help - -all:rawhash2 - -HOSTNAME := $(shell hostname) -ifeq ($(HOSTNAME),mikado1) -binary_name = rawhash_pybinding -else -binary_name = rawhash2 -endif - -help: ##Show help - +$(MAKE) -C src help - -rawhash2: - @if [ ! -e bin ] ; then mkdir -p ./bin/ ; fi - +$(MAKE) -C src -# mv ./src/rawhash2 ./bin/ - mv ./src/$(binary_name) ./bin/ - -subset: - @if [ ! -e bin ] ; then mkdir -p ./bin/ ; fi - +$(MAKE) -C src subset - mv ./src/rawhash2 ./bin/ - -clean: - rm -rf bin/ - +$(MAKE) clean -C ./src/ - diff --git a/README.md b/README.md index 1ffb2d6..b31bb44 100644 --- a/README.md +++ b/README.md @@ -32,46 +32,56 @@ RawHash performs real-time mapping of nanopore raw signals. When the prefix of r # Installation -* Clone the code from its GitHub repository (`--recursive` must be used): +* Clone the code from its GitHub repository and recursively initialize submodules: ```bash -git clone --recursive https://github.com/CMU-SAFARI/RawHash.git rawhash2 +git clone https://github.com/CMU-SAFARI/RawHash.git rawhash2 +cd rawhash2 && git submodule update --init --recursive ``` * Compile (Make sure you have a C++ compiler and GNU make): ```bash -cd rawhash2 && make +# if not doing a fresh clone, make sure that the submodules don't have anything built from previous makefile-based +# setup , i.e. delete extern directory, then initialize submodules as above +(mkdir -p build && cd build && cmake .. && make -j) +build/bin/rawhash2 ``` -If the compilation is successful, the path to the binary will be `bin/rawhash2`. +Troubleshooting: +- `makefile error 2`: rerun `make -j`, then the actual error is shown +- updating submodules: the current cmake setup may not correctly handle this, so the easiest solution is to delete the build directory + +If the compilation is successful, the default path to the binary will be `build/bin/rawhash2`. ## Compiling with HDF5, SLOW5, and POD5 -We are aware that some of the pre-compiled libraries (e.g., POD5) may not work in your system and you may need to compile these libraries from scratch. Additionally, it may be possible that you may not want to compile any of the HDF5, SLOW5, or POD5 libraries if you are not going to use them. RawHash2 provides a flexible Makefile to enable custom compilation of these libraries. +We are aware that some of the pre-compiled libraries (e.g., POD5) may not work in your system and you may need to compile these libraries from scratch. Additionally, it may be possible that you may not want to compile any of the HDF5, SLOW5, or POD5 libraries if you are not going to use them. RawHash2 provides several CMake options to enable custom compilation of these libraries. -* It is possible to provide your own include and lib directories for *any* of the HDF5, SLOW5, and POD5 libraries, if you do not want to use the source code or the pre-compiled binaries that come with RawHash2. To use your own include and lib directories you should pass them to `make` when compiling as follows: +It is possible to provide your own include and lib directories for *any* of the HDF5, SLOW5, and POD5 libraries, if you do not want to use the source code or the pre-compiled binaries that come with RawHash2. To use your own include and lib directories you should pass them to `cmake` when compiling as follows: ```bash -#Provide the path to all of the HDF5/SLOW5/POD5 include and lib directories during compilation -make HDF5_INCLUDE_DIR=/path/to/hdf5/include HDF5_LIB_DIR=/path/to/hdf5/lib \ - SLOW5_INCLUDE_DIR=/path/to/slow5/include SLOW5_LIB_DIR=/path/to/slow5/lib \ - POD5_INCLUDE_DIR=/path/to/pod5/include POD5_LIB_DIR=/path/to/pod5/lib +# Provide the path to all of the HDF5/SLOW5/POD5 include and lib directories during compilation +cmake -DHDF5_DIR=/path/to/hdf5 -DSLOW5_DIR=/path/to/slow5 -DPOD5_DIR=/path/to/pod5 .. -#Provide the path to only POD5 include and lib directories during compilation -make POD5_INCLUDE_DIR=/path/to/pod5/include POD5_LIB_DIR=/path/to/pod5/lib +# Provide the path to only POD5 include and lib directories during compilation +cmake -DPOD5_DIR=/path/to/pod5 ``` -* It is possible to disable compiling *any* of the HDF5, SLOW5, and POD5 libraries. To disable them, you can use the following variables +Note that the provided path should generally contain _both_ `include/` and `lib/` folders with the corresponding project's include and library files. + +It is possible to disable compiling *any* of the HDF5, SLOW5, and POD5 libraries. To disable them, you can use the following variables ```bash -#Disables compiling HDF5 -make NOHDF5=1 +# Disables compiling HDF5 +cmake -DNOHDF5=1 .. -#Disables compiling SLOW5 and POD5 -make NOSLOW5=1 NOPOD5=1 +# Disables compiling SLOW5 and POD5 +cmake -DNOSLOW5=1 -DNOPOD5=1 .. ``` +The variables and paths will be stored in CMake cache, meaning that you would need to run `cmake` again with explicitly provided new values to change them. + # Usage ## Getting help diff --git a/cmake/SetupCCacheMold.cmake b/cmake/SetupCCacheMold.cmake new file mode 100644 index 0000000..e3e3635 --- /dev/null +++ b/cmake/SetupCCacheMold.cmake @@ -0,0 +1,57 @@ +function(enable_ccache) + # export PATH="/usr/lib/ccache:$PATH" + find_program(CCACHE_EXE ccache) + if(CCACHE_EXE) + message(STATUS "found ccache at ${CCACHE_EXE}, using it") + set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_EXE}" CACHE STRING "C compiler launcher") + set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_EXE}" CACHE STRING "C++ compiler launcher") + else() + message(STATUS "ccache not found, not using it") + endif() +endfunction() + +# mold is a much faster linker than ld, also see for mold: https://github.com/heavyai/heavydb/blob/master/CMakeLists.txt +# or try: https://gitlab.kitware.com/cmake/cmake/-/merge_requests/8861, can now use CMAKE_LINKER_TYPE +macro(set_alternate_linker linker) + find_program(LINKER_EXECUTABLE ld.${linker} ${linker}) + if(LINKER_EXECUTABLE) + message(STATUS "Found linker ${linker}: ${LINKER_EXECUTABLE}") + if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" AND "${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 12.0.0) + add_link_options("-ld-path=${linker}") + elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU" AND "${CMAKE_CXX_COMPILER_VERSION}" VERSION_LESS 12.1.0 AND "${linker}" STREQUAL "mold") + # LINKER_EXECUTABLE will be a full path to ld.mold, so we replace the end of the path, resulting in the relative + # libexec/mold dir, and tell GCC to look there first for an override version of executables, in this case, ld + string(REPLACE "bin/ld.mold" "libexec/mold" PATH_TO_LIBEXEC_MOLD ${LINKER_EXECUTABLE}) + add_link_options("-B${PATH_TO_LIBEXEC_MOLD}") + else() + add_link_options("-fuse-ld=${linker}") + endif() + else() + message(FATAL_ERROR "Could not find linker ${linker}") + endif() +endmacro() + +# not working +# function(setup_ccache_mold) +# if(USE_CCACHE) +# find_program(CCACHE_PROGRAM ccache) +# if(CCACHE_PROGRAM) +# message(STATUS "ccache found: ${CCACHE_PROGRAM}") +# set(CMAKE_C_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") +# set(CMAKE_CXX_COMPILER_LAUNCHER "${CCACHE_PROGRAM}") +# else() +# message(WARNING "ccache not found, using default compiler") +# endif() +# endif() + +# if(USE_MOLD) +# find_program(MOLD_PROGRAM mold) +# # todo: does not seem to work, for a working configuration, see https://github.com/ratschlab/readuntil_fake/blob/refactor/cmake/utils.cmake +# if(MOLD_PROGRAM) +# message(STATUS "mold found: ${MOLD_PROGRAM}") +# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${MOLD_PROGRAM}") +# else() +# message(WARNING "mold not found, using default linker") +# endif() +# endif() +# endfunction() diff --git a/cmake/SetupHDF5.cmake b/cmake/SetupHDF5.cmake new file mode 100644 index 0000000..ca60beb --- /dev/null +++ b/cmake/SetupHDF5.cmake @@ -0,0 +1,32 @@ +include(${CMAKE_CURRENT_LIST_DIR}/Util.cmake) + +function(setup_hdf5 TARGET_NAME) + if(NOHDF5) + target_compile_definitions(${TARGET_NAME} PRIVATE NHDF5RH=1) + else() + # print HDF5_DIR + message(STATUS "EXTERNAL_PROJECTS_BUILD_DIR: ${EXTERNAL_PROJECTS_BUILD_DIR}") + message(STATUS "HDF5_DIR: ${HDF5_DIR}") + set(HDF5_SOURCE_DIR ${CMAKE_SOURCE_DIR}/extern/hdf5) + if(HDF5_COMPILE) + if(NOT HDF5_DIR) + override_cached(HDF5_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/hdf5) + endif() + set(HDF5_BUILD_DIR ${HDF5_DIR}/build) + ExternalProject_Add( + hdf5_build + SOURCE_DIR ${HDF5_SOURCE_DIR} + BINARY_DIR ${HDF5_BUILD_DIR} + CONFIGURE_COMMAND ${HDF5_SOURCE_DIR}/configure --enable-threadsafe --disable-hl --prefix=${HDF5_BUILD_DIR} + # INSTALL_DIR and DCMAKE_INSTALL_PREFIX are ignored by hdf5 + INSTALL_COMMAND make install prefix=${HDF5_DIR} + ) + add_dependencies(${TARGET_NAME} hdf5_build) + else() + if(NOT HDF5_DIR) + message(FATAL_ERROR "HDF5_COMPILE is OFF, but no dir provided") + endif() + endif() + link_imported_library(${TARGET_NAME} hdf5 ${HDF5_DIR}) + endif() +endfunction() diff --git a/cmake/SetupPOD5.cmake b/cmake/SetupPOD5.cmake new file mode 100644 index 0000000..008a4b2 --- /dev/null +++ b/cmake/SetupPOD5.cmake @@ -0,0 +1,175 @@ +include(${CMAKE_CURRENT_LIST_DIR}/Util.cmake) + +function(setup_zstd TARGET_NAME) +set(ZSTD_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/zstd) + ExternalProject_Add( + zstd_build + SOURCE_DIR ${CMAKE_SOURCE_DIR}/extern/zstd/build/cmake + BINARY_DIR ${ZSTD_DIR}/build + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZSTD_DIR} + ) + add_dependencies(${TARGET_NAME} zstd_build) + link_imported_library(${TARGET_NAME} zstd ${ZSTD_DIR}) +endfunction() + +function(setup_pod5 TARGET_NAME) + if(NOPOD5) + target_compile_definitions(${TARGET_NAME} PRIVATE NPOD5RH=1) + else() + setup_zstd(${TARGET_NAME}) + + set(POD5_VERSION "0.2.2") + set(POD5_URLDIR "pod5-${POD5_VERSION}-${CMAKE_SYSTEM_NAME}") + set(POD5_REPO "https://github.com/nanoporetech/pod5-file-format") + + resolve_pod5_url() + + if(POD5_DOWNLOAD) + if(NOT POD5_DIR) + override_cached(POD5_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}) + endif() + ExternalProject_Add( + pod5_download + SOURCE_DIR ${POD5_DIR} + URL ${POD5_URL} + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + # requires cmake >= 3.24 + # DOWNLOAD_EXTRACT_TIMESTAMP TRUE + ) + add_dependencies(${TARGET_NAME} pod5_download) + else() + if(NOT POD5_DIR) + message(FATAL_ERROR "POD5_DOWNLOAD is OFF, but no dir provided") + endif() + endif() + include_directories(${POD5_DIR}/include) + target_link_libraries(${TARGET_NAME} PRIVATE ${POD5_LIBRARIES} zstd) + endif() +endfunction() + +# POD5_URL and POD5_LIBRARIES are set at PARENT_SCOPE +function(resolve_pod5_url) + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + set(POD5_LIB "lib64") + if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm)") + if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0) + set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-gcc7-arm64.tar.gz") + set(POD5_LIB "lib") + else() + set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-arm64.tar.gz" PARENT_SCOPE) + endif() + else() + set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-x64.tar.gz" PARENT_SCOPE) + endif() + set(POD5_LIB_DIR "${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}/${POD5_LIB}") + set(POD5_LIBRARIES "${POD5_LIB_DIR}/libpod5_format.so" + "${POD5_LIB_DIR}/libarrow.so" + "${POD5_LIB_DIR}/libjemalloc_pic.so" PARENT_SCOPE) + elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-osx-11.0-arm64.tar.gz") + set(POD5_LIB_DIR "${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}/lib") + set(POD5_LIBRARIES "${POD5_LIB_DIR}/libpod5_format.so" + "${POD5_LIB_DIR}/libarrow.so" PARENT_SCOPE) + elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows_NT") + set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-win" PARENT_SCOPE) + endif() +endfunction() + + +# not working because of improper design, PARENT_SCOPE should not be used, rather define targets properly +# include(${CMAKE_CURRENT_LIST_DIR}/Util.cmake) + +# set(ZSTD_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/zstd) + +# function(setup_zstd) +# ExternalProject_Add( +# zstd_build +# SOURCE_DIR ${CMAKE_SOURCE_DIR}/extern/zstd/build/cmake +# BINARY_DIR ${ZSTD_DIR}/build +# CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ZSTD_DIR} +# ) +# endfunction() + +# function(add_zstd_to_target TARGET_NAME) +# add_dependencies(${TARGET_NAME} zstd_build) +# link_imported_library(${TARGET_NAME} zstd ${ZSTD_DIR}) +# target_link_libraries(${TARGET_NAME} PRIVATE zstd) +# endfunction() + +# function(setup_pod5) +# if(NOPOD5) +# return() +# endif() + +# setup_zstd() + +# set(POD5_VERSION "0.2.2") +# set(POD5_URLDIR "pod5-${POD5_VERSION}-${CMAKE_SYSTEM_NAME}") +# set(POD5_REPO "https://github.com/nanoporetech/pod5-file-format") + +# resolve_pod5_url() + +# if(POD5_DOWNLOAD) +# if(NOT POD5_DIR) +# override_cached(POD5_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}) +# endif() +# ExternalProject_Add( +# pod5_download +# SOURCE_DIR ${POD5_DIR} +# URL ${POD5_URL} +# CONFIGURE_COMMAND "" +# BUILD_COMMAND "" +# INSTALL_COMMAND "" +# # requires cmake >= 3.24 +# # DOWNLOAD_EXTRACT_TIMESTAMP TRUE +# ) +# else() +# if(NOT POD5_DIR) +# message(FATAL_ERROR "POD5_DOWNLOAD is OFF, but no dir provided") +# endif() +# endif() +# endfunction() + +# function(add_pod5_to_target TARGET_NAME) +# if(NOPOD5) +# target_compile_definitions(${TARGET_NAME} PRIVATE NPOD5RH=1) +# else() +# add_zstd_to_target(${TARGET_NAME}) + +# add_dependencies(${TARGET_NAME} pod5_download) +# include_directories(${POD5_DIR}/include) +# message(STATUS "Adding include dir ${POD5_DIR}/include, POD5 libraries: ${POD5_LIBRARIES}") +# target_link_libraries(${TARGET_NAME} PRIVATE ${POD5_LIBRARIES}) +# endif() +# endfunction() + +# # POD5_URL and POD5_LIBRARIES are set at PARENT_SCOPE +# function(resolve_pod5_url) +# if(CMAKE_SYSTEM_NAME STREQUAL "Linux") +# set(POD5_LIB "lib64") +# if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm)") +# if(CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.0) +# set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-gcc7-arm64.tar.gz") +# set(POD5_LIB "lib") +# else() +# set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-arm64.tar.gz" PARENT_SCOPE) +# endif() +# else() +# set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-linux-x64.tar.gz" PARENT_SCOPE) +# endif() +# set(POD5_LIB_DIR "${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}/${POD5_LIB}") +# set(POD5_LIBRARIES "${POD5_LIB_DIR}/libpod5_format.a" +# "${POD5_LIB_DIR}/libarrow.a" +# "${POD5_LIB_DIR}/libjemalloc_pic.a" PARENT_SCOPE) +# elseif(CMAKE_SYSTEM_NAME STREQUAL "Darwin") +# set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-osx-11.0-arm64.tar.gz") +# set(POD5_LIB_DIR "${EXTERNAL_PROJECTS_BUILD_DIR}/${POD5_URLDIR}/lib") +# set(POD5_LIBRARIES "${POD5_LIB_DIR}/libpod5_format.a" +# "${POD5_LIB_DIR}/libarrow.a" PARENT_SCOPE) +# elseif(CMAKE_SYSTEM_NAME STREQUAL "Windows_NT") +# set(POD5_URL "${POD5_REPO}/releases/download/${POD5_VERSION}/lib_pod5-${POD5_VERSION}-win" PARENT_SCOPE) +# # todo: not setting libraries! +# endif() +# endfunction() diff --git a/cmake/SetupRUClient.cmake b/cmake/SetupRUClient.cmake new file mode 100644 index 0000000..00d5d98 --- /dev/null +++ b/cmake/SetupRUClient.cmake @@ -0,0 +1,24 @@ +include(${CMAKE_CURRENT_LIST_DIR}/Util.cmake) + +function(setup_ruclient TARGET_NAME) + if(RUCLIENT_ENABLED) + set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 20) + target_compile_definitions(${TARGET_NAME} PRIVATE RUCLIENT_ENABLED) + target_sources(${TARGET_NAME} PRIVATE rawhash_ruclient.cpp) + if(NOT RUCLIENT_DIR) + override_cached(RUCLIENT_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/ruclient) + endif() + ExternalProject_Add( + ruclient_build + SOURCE_DIR ${CMAKE_SOURCE_DIR}/extern/readuntil_fake + BINARY_DIR ${RUCLIENT_DIR}/build + CMAKE_ARGS + -DCMAKE_INSTALL_PREFIX=${RUCLIENT_DIR} + ) + add_dependencies(${TARGET_NAME} ruclient_build) + include_directories(${RUCLIENT_DIR}/include) + message(STATUS "ruclient enabled") + else() + message(STATUS "ruclient disabled") + endif() +endfunction() diff --git a/cmake/SetupRawHashLikeTarget.cmake b/cmake/SetupRawHashLikeTarget.cmake new file mode 100644 index 0000000..9a43cf3 --- /dev/null +++ b/cmake/SetupRawHashLikeTarget.cmake @@ -0,0 +1,80 @@ +function(setup_rawhashlike_target TARGET_NAME) + # if(PYBINDING) + # message(FATAL_ERROR "Building with Python binding support is not implemented") + # # else() + # # # add_executable(${TARGET_NAME} main.cpp) + # # add_executable(${TARGET_NAME} rawhash_wrapper.cpp) + # endif() + # target_compile_options(${TARGET_NAME} PRIVATE -Wno-sign-compare) + target_compile_options(${TARGET_NAME} PRIVATE -w) # disable all warnings, dangerous! + + set_target_properties(${TARGET_NAME} PROPERTIES CXX_STANDARD 11) + set_target_properties(${TARGET_NAME} PROPERTIES C_STANDARD 11) + + find_package(Threads REQUIRED) + target_link_libraries(${TARGET_NAME} PRIVATE Threads::Threads m z dl) + target_compile_options(${TARGET_NAME} PRIVATE -Wall -fopenmp -march=native -O3) + target_compile_definitions(${TARGET_NAME} PRIVATE HAVE_KALLOC) + + if(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64") + target_compile_options(${TARGET_NAME} PRIVATE -D_FILE_OFFSET_BITS=64 -fsigned-char) + elseif(DEFINED ARM_NEON) + target_compile_options(${TARGET_NAME} PRIVATE -D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char) + endif() + + if(ENABLE_ASAN) + message(STATUS "AddressSanitizer enabled") + target_compile_options(${TARGET_NAME} PRIVATE -fsanitize=address) + target_link_libraries(${TARGET_NAME} PRIVATE -fsanitize=address) + endif() + + if(ENABLE_TSAN) + message(STATUS "ThreadSanitizer enabled") + target_compile_options(${TARGET_NAME} PRIVATE -fsanitize=thread) + target_link_libraries(${TARGET_NAME} PRIVATE -fsanitize=thread) + endif() + + target_sources(${TARGET_NAME} PRIVATE + bseq.c + dtw.cpp + kalloc.c + kthread.c + revent.c + rmap.cpp + roptions.c + rsketch.c + rutils.c + sequence_until.c + ) + # C files that rely on hdf5_tools.hpp + # Should be compiled as CXX for now + set(PSEUDO_C_SOURCES + hit.c + lchain.c + rindex.c + rseed.c + rsig.c + ) + foreach(source IN LISTS PSEUDO_C_SOURCES) + set_source_files_properties(${source} PROPERTIES LANGUAGE CXX) + endforeach() + target_sources(${TARGET_NAME} PRIVATE ${PSEUDO_C_SOURCES}) + + + if(CMAKE_BUILD_TYPE STREQUAL "Debug") + target_compile_options(${TARGET_NAME} PRIVATE + -g + -fsanitize=address + ) + endif() + + if(PROFILE) + target_compile_options(${TARGET_NAME} PRIVATE + -g + -fno-omit-frame-pointer + ) + target_compile_definitions(${TARGET_NAME} PRIVATE + PROFILERH=1 + ) + endif() +endfunction() diff --git a/cmake/SetupSLOW5.cmake b/cmake/SetupSLOW5.cmake new file mode 100644 index 0000000..cd9770b --- /dev/null +++ b/cmake/SetupSLOW5.cmake @@ -0,0 +1,31 @@ +include(${CMAKE_CURRENT_LIST_DIR}/Util.cmake) + +function(setup_slow5 TARGET_NAME) + if(NOSLOW5) + target_compile_definitions(${TARGET_NAME} PRIVATE NSLOW5RH=1) + else() + set(SLOW5_SOURCE_DIR ${CMAKE_SOURCE_DIR}/extern/slow5lib) + if(SLOW5_COMPILE) + if(NOT SLOW5_DIR) + override_cached(SLOW5_DIR ${EXTERNAL_PROJECTS_BUILD_DIR}/slow5lib) + endif() + message(STATUS "Compiling slow5 to ${SLOW5_DIR}") + ExternalProject_Add( + slow5_build + BINARY_DIR ${SLOW5_DIR} + SOURCE_DIR ${SLOW5_SOURCE_DIR} + INSTALL_COMMAND ${CMAKE_COMMAND} -E copy_directory ${SLOW5_SOURCE_DIR}/include ${SLOW5_DIR}/include + && ${CMAKE_COMMAND} -E make_directory ${SLOW5_DIR}/lib + && ${CMAKE_COMMAND} -E rename ${SLOW5_DIR}/libslow5.so ${SLOW5_DIR}/lib/libslow5.so + ) + message(STATUS "Current dir: ${CMAKE_CURRENT_BINARY_DIR}") + add_dependencies(${TARGET_NAME} slow5_build) + else() + if(NOT SLOW5_DIR) + message(FATAL_ERROR "SLOW5_COMPILE is OFF, but no dir provided") + endif() + endif() + message(STATUS "Using slow5 from ${SLOW5_DIR}") + link_imported_library(${TARGET_NAME} slow5 ${SLOW5_DIR}) + endif() +endfunction() diff --git a/cmake/SetupTFLite.cmake b/cmake/SetupTFLite.cmake new file mode 100644 index 0000000..97bb5ee --- /dev/null +++ b/cmake/SetupTFLite.cmake @@ -0,0 +1,8 @@ +include(${CMAKE_CURRENT_LIST_DIR}/Util.cmake) + +function(setup_tflite TARGET_NAME) + set(TF_SOURCE_DIR ${CMAKE_SOURCE_DIR}/extern/tensorflow) + add_subdirectory(${TF_SOURCE_DIR}/tensorflow/lite ${EXTERNAL_PROJECTS_BUILD_DIR}/tflite EXCLUDE_FROM_ALL) + include_directories(${TF_SOURCE_DIR}) + target_link_libraries(${TARGET_NAME} PRIVATE tensorflow-lite) +endfunction() diff --git a/cmake/Util.cmake b/cmake/Util.cmake new file mode 100644 index 0000000..f062898 --- /dev/null +++ b/cmake/Util.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) + +function(override_cached name value) + message(STATUS "Overriding ${name} to ${value}") + get_property(doc_string CACHE ${name} PROPERTY HELPSTRING) + get_property(type CACHE ${name} PROPERTY TYPE) + set(${name} ${value} CACHE ${type} ${doc_string} FORCE) +endfunction() + +function(link_imported_library TARGET_NAME LIB_NAME LIB_DIR) + add_library(${LIB_NAME} SHARED IMPORTED) + file(MAKE_DIRECTORY ${LIB_DIR}/include) + set_target_properties(${LIB_NAME} PROPERTIES + IMPORTED_LOCATION ${LIB_DIR}/lib/lib${LIB_NAME}.so + INTERFACE_INCLUDE_DIRECTORIES ${LIB_DIR}/include + ) + target_link_libraries(${TARGET_NAME} PRIVATE ${LIB_NAME}) +endfunction() + +function(check_directory_exists_and_non_empty DIR) + if(NOT EXISTS ${DIR}) + message(FATAL_ERROR "Directory ${DIR} does not exist.") + endif() + + file(GLOB DIRECTORY_CONTENTS "${DIR}/*") + if(DIRECTORY_CONTENTS STREQUAL "") + message(FATAL_ERROR "Directory ${DIR} is empty.") + endif() + + message(STATUS "Directory ${DIR} exists and is non-empty.") +endfunction() diff --git a/own_notes/training_quantized_pred.txt b/own_notes/training_quantized_pred.txt index 7e098ce..6921335 100644 --- a/own_notes/training_quantized_pred.txt +++ b/own_notes/training_quantized_pred.txt @@ -18,7 +18,7 @@ That is, the model predicts whether a chunk is in the correct position based on RI_STORE_SIG: whether to store the signal (generated from a fasta reference) -RI_I_REV_QUERY: whether the index is queried with the read and its reverse complement (through a RC prediction model); if not, need to store reference signal for both forward and reverse signal +RI_I_STORE_REV: whether the index is queried with the read and its reverse complement (through a revcomp prediction model); if not, need to store reference signal for both forward and reverse signal Suppose we only store the positive strand of the reference. If we query a read from the positive strand, we are fine. diff --git a/python_wrapper/example.ipynb b/python_wrapper/example.ipynb new file mode 100644 index 0000000..6726145 --- /dev/null +++ b/python_wrapper/example.ipynb @@ -0,0 +1,181 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# # create venv\n", + "# python3 -m venv .venv\n", + "# source .venv/bin/activate\n", + "# pip install cffi ipykernel\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "\n", + "from pathlib import Path\n", + "import cppyy\n", + "import os\n", + "\n", + "os.chdir(\"/home/mmordig/rawhash_project/rawhash2_new/build/extern\")\n", + "\n", + "header_file = Path(\"/home/mmordig/rawhash_project/rawhash2_new/src/rawhash_wrapper.hpp\")\n", + "library_file = Path(\"/home/mmordig/rawhash_project/rawhash2_new/build/src/librawhash2_wrapper.so\")\n", + "cppyy.include(str(header_file))\n", + "cppyy.add_library_path(\"/home/mmordig/rawhash_project/rawhash2_new/build/extern/hdf5/lib\")\n", + "cppyy.add_library_path(\"/home/mmordig/rawhash_project/rawhash2_new/build/src\")\n", + "cppyy.load_library(\"librawhash2_wrapper\")\n", + "\n", + "# cppyy.add_library_path(str(library_file.parent))\n", + "# cppyy.load_library(str(library_file.name))\n", + "# cppyy.load_library(str(library_file))" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "ename": "", + "evalue": "", + "output_type": "error", + "traceback": [ + "\u001b[1;31mThe Kernel crashed while executing code in the current cell or a previous cell. \n", + "\u001b[1;31mPlease review the code in the cell(s) to identify a possible cause of the failure. \n", + "\u001b[1;31mClick here for more info. \n", + "\u001b[1;31mView Jupyter log for further details." + ] + } + ], + "source": [ + "# cppppy.\n", + "# dir()\n", + "cppyy.gbl.RawHashMapper(5, [b\"hello\"])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pathlib import Path\n", + "from cffi import FFI\n", + "ffi = FFI()\n", + "\n", + "wrapper_path = Path(\"/home/mmordig/rawhash_project/rawhash2_new/src/rawhash_wrapper.hpp\")\n", + "ffi.cdef(wrapper_path.read_text())\n", + "\n", + "# lib = ffi.dlopen()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "from cffi import FFI\n", + "ffi = FFI()\n", + "ffi.cdef(\"\"\"\n", + " int printf(const char *format, ...); // copy-pasted from the man page\n", + "\"\"\")\n", + "C = ffi.dlopen(None) # loads the entire C namespace\n", + "arg = ffi.new(\"char[]\", b\"world\") # equivalent to C code: char arg[] = \"world\";\n", + "C.printf(b\"hi there, %s.\\n\", arg)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# # rawhash commands\n", + "# mkdir -p example_out\n", + "# rawhash2 \\\n", + "# -x sensitive -t 8 \\\n", + "# -p /home/mmordig/rawhash_project/rawhash2_new/extern/kmer_models/legacy/legacy_r9.4_180mv_450bps_6mer/template_median68pA.model \\\n", + "# -d example_out/ref.ind \\\n", + "# /home/mmordig/rawhash_project/rawhash2_new/test/data/d2_ecoli_r94/ref.fa\n", + "\n", + "# rawhash2_wrapper_example \\\n", + "# -x sensitive -t 8 \\\n", + "# -p /home/mmordig/rawhash_project/rawhash2_new/extern/kmer_models/legacy/legacy_r9.4_180mv_450bps_6mer/template_median68pA.model \\\n", + "# example_out/ref.ind\n", + "\n", + "# export PATH=/home/mmordig/rawhash_project/rawhash2_new/build/bin:$PATH\n", + "\n", + "rawhash2_usinglib \\\n", + " -x sensitive -t 8 \\\n", + " -p /home/mmordig/rawhash_project/rawhash2_new/extern/kmer_models/legacy/legacy_r9.4_180mv_450bps_6mer/template_median68pA.model \\\n", + " -d example_out/ref.ind \\\n", + " /home/mmordig/rawhash_project/rawhash2_new/test/data/d2_ecoli_r94/ref.fa" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.12" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/python_wrapper/example.py b/python_wrapper/example.py new file mode 100644 index 0000000..4ff13fe --- /dev/null +++ b/python_wrapper/example.py @@ -0,0 +1,56 @@ +#%% + +# Make sure to compile rawhash properly, both the CLI and the shared lib!!! Otherwise, the kernel may crash + +import sys +sys.path.extend(["/home/mmordig/rawhash_project/rawhash2_new/python_wrapper"]) +from rawhash_wrapper import * + +#%% +rawhash_args = [ + "-x", "sensitive", "-t", "8", "-p", + "/home/mmordig/rawhash_project/rawhash2_new/extern/kmer_models/legacy/legacy_r9.4_180mv_450bps_6mer/template_median68pA.model", +] +# rawhash_args += [ +# "/home/mmordig/rawhash_project/rawhash2_new/test/data/d2_ecoli_r94/ref.fa", +# "no-revcomp-query", +# ] +rawhash_args += [ + "/home/mmordig/rawhash_project/rawhash2_new/example_out/forwrev_rawhash2_sensitive.ind", +] +# forwrev_rawhash2_sensitive.paf +# 0006a106-77eb-4752-b405-38d1635718fa +# /home/mmordig/rawhash_project/rawhash2_new/example_out/forwonly_rawhash2_sensitive.ind +# /home/mmordig/rawhash_project/rawhash2_new/example_out/forwrev_rawhash2_sensitive.ind + + +#%% + +slow5_filename = "/home/mmordig/rawhash_project/rawhash2_new/test/data/d2_ecoli_r94/small_slow5_files/barcode02_r0barcode02b0_0.blow5" +# we know rawhash finds an alignment for this read from the file /home/mmordig/rawhash_project/rawhash2_new/example_out/forwrev_rawhash2_sensitive.paf +read_id = "0006a106-77eb-4752-b405-38d1635718fa" + +raw_signal = get_read_signal(slow5_filename, read_id) +print(f"Raw signal shape: {raw_signal.shape}") + +rawhash_lib_dir = Path("/home/mmordig/rawhash_project/rawhash2_new") +load_rawhash_wrapper_lib(rawhash_lib_dir) +mapper = get_rawhash_mapper(rawhash_args) +mapper.print_idx_info() +raw_signal = prepare_signal_for_rawhash(raw_signal) +# raw_signal = np.array([1.0, 2.0, 3.0, 4.0], dtype=np.float32) +print(f"Raw signal shape: {raw_signal.shape}") +alignments = get_rawhash_alignments(mapper, raw_signal) +#%% + +for (i, alignment) in enumerate(alignments): + print(f"Alignment {i}: {Alignment.from_cppyy(alignment)}") + +gt_paf_filename = "/home/mmordig/rawhash_project/rawhash2_new/example_out/forwrev_rawhash2_sensitive.paf" +with open(gt_paf_filename) as f: + for line in f: + rid, gt_alignment = parse_paf_line(line) + if rid == read_id: + print(f"GT Alignment for {rid}:\n {gt_alignment}") + break +#%% diff --git a/python_wrapper/map_reads.sh b/python_wrapper/map_reads.sh new file mode 100755 index 0000000..7cd8fbc --- /dev/null +++ b/python_wrapper/map_reads.sh @@ -0,0 +1,58 @@ +#!/usr/bin/env bash +set -eux + +export PATH=~/rawhash_project/rawhash2-env/bin:$PATH +export PATH=~/rawhash_project/extra_tools/ont-guppy/bin/":$PATH" +# export PATH="$PATH:/home/mmordig/rawhash_project/tools/cmake-3.28.0-rc5-linux-x86_64/bin" +export LD_LIBRARY_PATH="/home/mmordig/rawhash_project/rawhash2_new/build/src" +#:"${LD_LIBRARY_PATH+}" +export PATH=~/rawhash_project/rawhash2_new/build/bin:"$PATH" + +echo $LD_LIBRARY_PATH + +THREAD=32 + +# OUTDIR="./rawhash2/" +OUTDIR="/home/mmordig/rawhash_project/rawhash2_new/example_out" +#SIGNALS="../../../data/d2_ecoli_r94/fast5_files/" +#SIGNALS="../../../data/d2_ecoli_r94/small_fast5_dir/" +SIGNALS="/home/mmordig/rawhash_project/rawhash2_new/test/data/d2_ecoli_r94/small_slow5_files/" +# # basecalled reads +# READS= +REF="../../../data/d2_ecoli_r94/ref.fa" +# PORE="../../../../extern/kmer_models/legacy/legacy_r9.4_180mv_450bps_6mer/template_median68pA.model" +PORE="/home/mmordig/rawhash_project/rawhash2_new/extern/kmer_models/legacy/legacy_r9.4_180mv_450bps_6mer/template_median68pA.model" +PRESETX="sensitive" + +# map to positive and revcomp strand +PARAMS= +PREFIX="forwrev" + +# # only map to the positive strand +# PARAMS="--no-revcomp-query --rev-query" +# PREFIX="forwonly" + +mkdir -p ${OUTDIR} + + +# if complaining about shared lib, need to recompile target +# index +rawhash2 -x ${PRESETX} -t ${THREAD} -p "${PORE}" -d "${OUTDIR}/${PREFIX}_rawhash2_${PRESETX}.ind" ${PARAMS} ${REF} +# map +rawhash2 -x ${PRESETX} -t ${THREAD} -o "${OUTDIR}/${PREFIX}_rawhash2_${PRESETX}.paf" ${PARAMS} "${OUTDIR}/${PREFIX}_rawhash2_${PRESETX}.ind" ${SIGNALS} + + +# awk '$5 == "-" {print $1}' /home/mmordig/rawhash_project/rawhash2_new/test/evaluation/read_mapping/d2_ecoli_r94/true_mappings.paf > revstrand_readids.txt +# slow5tools skim --rid /home/mmordig/rawhash_project/rawhash2_new/test/data/d2_ecoli_r94/small_slow5_files/barcode02_r0barcode02b0_0.blow5 > small_read_ids.txt +# # intersect both files +# awk 'NR==FNR{a[$1];next}($1 in a)' revstrand_readids.txt small_read_ids.txt > revstrand_readids_intersect.txt + +## diff -y -W 250 forwonly_rawhash2_sensitive.paf forwrev_rawhash2_sensitive.paf | less -S +## sort revstrand_readids_intersect.txt | uniq -d # check no duplicates +# slow5tools get /home/mmordig/rawhash_project/rawhash2_new/test/data/d2_ecoli_r94/small_slow5_files/barcode02_r0barcode02b0_0.blow5 -t $(nproc) --list revstrand_readids_intersect.txt -o revcomp_reads.blow5 +# awk 'NR==FNR{a[$1];next}($1 in a)' revstrand_readids_intersect.txt \ +# /home/mmordig/rawhash_project/rawhash2_new/test/evaluation/read_mapping/d2_ecoli_r94/true_mappings.paf \ +# > revcomp_true_mappings.paf + +# bash ../../../scripts/run_minimap2.sh . ../../../data/d2_ecoli_r94/reads.fasta ../../../data/d2_ecoli_r94/ref.fa ${THREAD} +# minimap2 -x map-ont -t ${THREAD} -o "${OUTDIR}/true_mappings.paf" ${REF} ${READS} \ No newline at end of file diff --git a/python_wrapper/rawhash_mapper.py b/python_wrapper/rawhash_mapper.py new file mode 100644 index 0000000..e69de29 diff --git a/python_wrapper/rawhash_wrapper.py b/python_wrapper/rawhash_wrapper.py new file mode 100644 index 0000000..632c6c1 --- /dev/null +++ b/python_wrapper/rawhash_wrapper.py @@ -0,0 +1,72 @@ +""" +Import this file if you want to use rawhash2 in Python. +""" + +from pathlib import Path +from typing import List, TypeVar +import cppyy +import cppyy.ll +import contextlib +import pyslow5 +import numpy as np + +def get_read_signal(slow5_filename, read_id: str): + with contextlib.closing(pyslow5.Open(slow5_filename, "r")) as s5: + return s5.get_read(read_id, pA=True)["signal"] + +def prepare_signal_for_rawhash(raw_signal: np.ndarray[float]): + """ + Remove outliers, convert to numpy + raw_signal should be floats (with offset, range, digitisation scaling) + """ + raw_signal = raw_signal[(raw_signal > 30.) & (raw_signal < 200.)] + return raw_signal + +class Alignment: + def __init__(self, contig, ref_start, ref_end, is_pos_strand): + self.contig = contig + self.ref_start = ref_start + self.ref_end = ref_end + self.is_pos_strand = is_pos_strand + @staticmethod + def from_cppyy(alignment): + return Alignment( + contig=alignment.contig, + ref_start=alignment.ref_start, + ref_end=alignment.ref_end, + is_pos_strand=alignment.is_pos_strand, + ) + def __repr__(self): + return f"Alignment(contig: {self.contig}, start: {self.ref_start}, end: {self.ref_end}, pos_strand: {self.is_pos_strand})" + +def load_rawhash_wrapper_lib(rawhash_lib_dir: Path): + rawhash_lib_dir = Path(rawhash_lib_dir) + header_file = rawhash_lib_dir / "src/rawhash_wrapper.hpp" + # library_file = rawhash_lib_dir / "build/src/librawhash2_wrapper.so" + cppyy.include(str(header_file)) + cppyy.add_library_path("/home/mmordig/rawhash_project/rawhash2_new/build/src") # for shared libs + cppyy.load_library("librawhash2_wrapper") + +# define type RawHashMapper + +RawHashMapper = TypeVar("RawHashMapper") +def get_rawhash_mapper(rawhash_args: List[str]) -> RawHashMapper: + args = ["my_dummy_program"] + rawhash_args + mapper = cppyy.gbl.RawHashMapper(len(args), args) + return mapper + +def get_rawhash_alignments(mapper: RawHashMapper, raw_signal: np.ndarray[float]) -> List[Alignment]: + raw_signal = np.ascontiguousarray(raw_signal, dtype=np.float32) + alignments = mapper.map(cppyy.ll.cast["float*"](raw_signal.ctypes.data), len(raw_signal)) + # return alignments + return [Alignment.from_cppyy(alignment) for alignment in alignments] + +def parse_paf_line(line): + """returns read_id, alignment""" + fields = line.strip().split("\t") + return fields[0], Alignment( + contig=fields[5], + ref_start=int(fields[7]), + ref_end=int(fields[8]), + is_pos_strand=fields[4] == "+", + ) \ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt new file mode 100644 index 0000000..1377d81 --- /dev/null +++ b/src/CMakeLists.txt @@ -0,0 +1,95 @@ +cmake_minimum_required(VERSION 3.10) +project(RawHash2Src) + +# option(USE_CCACHE "Use ccache to speed up rebuilds" ON) +# option(USE_MOLD "Use mold linker for faster linking" ON) + +# option(PYBINDING "Build with Python bindings" OFF) # WIP + +# for debugging +option(ENABLE_ASAN "Enable AddressSanitizer" OFF) +option(ENABLE_TSAN "Enable ThreadSanitizer" OFF) +option(PROFILE "Enable profiling support" OFF) + +# whether to compile or instead assume it is prebuilt +option(POD5_DOWNLOAD "Whether to build POD5" ON) # todo: rename to POD5_COMPILE +option(HDF5_COMPILE "Compile HDF5" ON) +option(SLOW5_COMPILE "Compile slow5lib" ON) + +# features to enable/disable in RawHash +# option(NOPOD5 "Disable POD5 in build" OFF) # todo: figure out how to enable shared libs +option(NOPOD5 "Disable POD5 in build" ON) +option(NOHDF5 "Disable HDF5 in build" OFF) +option(NOSLOW5 "Disable SLOW5 in build" OFF) +option(RUCLIENT_ENABLED "Enable ReadUntil client" OFF) + +set(POD5_DIR "" CACHE PATH "Path to POD5 directory (already built or where it should be built)") +set(HDF5_DIR "" CACHE PATH "Path to HDF5 directory (already built or where it should be built)") +set(SLOW5_DIR "" CACHE PATH "Path to SLOW5 directory (already built or where it should be built)") +set(RUCLIENT_DIR "" CACHE PATH "Path to ReadUntil directory (already built or where it should be built)") + +set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin CACHE PATH "Directory where to put binaries") + +set(EXTERNAL_PROJECTS_BUILD_DIR ${CMAKE_BINARY_DIR}/extern) # default build directory for external projects +file(MAKE_DIRECTORY ${EXTERNAL_PROJECTS_BUILD_DIR}) +message(STATUS "External projects build directory: ${EXTERNAL_PROJECTS_BUILD_DIR}") + +include(../cmake/SetupCCacheMold.cmake) +include(../cmake/SetupRawHashLikeTarget.cmake) +include(../cmake/SetupRUClient.cmake) +include(../cmake/SetupPOD5.cmake) +include(../cmake/SetupHDF5.cmake) +include(../cmake/SetupSLOW5.cmake) +include(../cmake/SetupTFLite.cmake) + +set(CMAKE_VERBOSE_MAKEFILE TRUE CACHE BOOL "verbose makefile" FORCE) # print compilation commands + +enable_ccache() +set_alternate_linker(mold) + +# todo: currently can only compile rawhash2 or the below, not both +# this happens because functions like setup_hdf5 define the hdf5 target and the relationship to the target name +# rather than doing it separately +set(build_cli ON) + +if (build_cli) + message(STATUS "Building CLI") + + set(TARGET_NAME rawhash2) + add_executable(${TARGET_NAME} main.cpp) + setup_rawhashlike_target(${TARGET_NAME}) + setup_pod5(${TARGET_NAME}) + setup_ruclient(${TARGET_NAME}) + setup_hdf5(${TARGET_NAME}) + setup_slow5(${TARGET_NAME}) + setup_tflite(${TARGET_NAME}) + +else() + message(STATUS "Building shared library") + + # set(TARGET_NAME rawhash2_wrapper_example) + # add_executable(${TARGET_NAME} wrapper_example.cpp rawhash_wrapper.cpp rawhash_wrapper.hpp) + set(TARGET_NAME rawhash2_wrapper) + add_library(${TARGET_NAME} SHARED rawhash_wrapper.cpp rawhash_wrapper.hpp) + setup_rawhashlike_target(${TARGET_NAME}) + setup_pod5(${TARGET_NAME}) + setup_ruclient(${TARGET_NAME}) + setup_hdf5(${TARGET_NAME}) + setup_slow5(${TARGET_NAME}) + setup_tflite(${TARGET_NAME}) + + set(TARGET_NAME rawhash2_usinglib) + add_executable(${TARGET_NAME} wrapper_example.cpp) + target_link_libraries(${TARGET_NAME} PRIVATE rawhash2_wrapper) + # setup_rawhashlike_target(${TARGET_NAME}) + # # setup_pod5(${TARGET_NAME}) + # target_compile_definitions(${TARGET_NAME} PRIVATE NPOD5RH=1) # todo: workaround + # setup_ruclient(${TARGET_NAME}) + # setup_hdf5(${TARGET_NAME}) + # setup_slow5(${TARGET_NAME}) + # setup_tflite(${TARGET_NAME}) +endif() + + +# setup_pod5() +# add_pod5_to_target(${TARGET_NAME}) diff --git a/src/Makefile b/src/Makefile deleted file mode 100644 index 9c15fb6..0000000 --- a/src/Makefile +++ /dev/null @@ -1,355 +0,0 @@ -DEBUG_OPTS=-O3 -fsanitize=address -CPP_STANDARD=c++11 -# DEBUG=0 - -# get hostname -HOSTNAME := $(shell hostname) -ifeq ($(HOSTNAME),mikado) -# Max' specific config - -# todo: remove this section again -# disable debug for index creation because there is a segfault (address sanitizer) -DEBUG=1 #todo -# HDF5_COMPILE=0 -HDF5_INCLUDE_DIR=/home/mmordig/rawhash_project/rawhash2_new/src/../extern/hdf5//build/include -HDF5_LIB_DIR=/home/mmordig/rawhash_project/rawhash2_new/src/../extern/hdf5//build/lib -POD5_DOWNLOAD=0 -# NOHDF5=1 -NOPOD5=1 -# also disable fsanitize in debug (cflags etc) below! -asan=1 -tsan= -RUCLIENT_ENABLED=1 - -asan= # disable because grpc does not support it well, see cmakelists in readuntil_client -# DEBUG_OPTS=-O0 # -fsanitize=address # not supported for grpc - -# PYBINDING=1 -else ifeq ($(HOSTNAME),mikado1) -# for Oleksandr -DEBUG=1 #todo -# HDF5_COMPILE=0 -HDF5_INCLUDE_DIR=/home/mmordig/rawhash_project/rawhash2_new/src/../extern/hdf5//build/include -HDF5_LIB_DIR=/home/mmordig/rawhash_project/rawhash2_new/src/../extern/hdf5//build/lib -POD5_DOWNLOAD=0 -# NOHDF5=1 -NOPOD5=1 -endif - -# copied from makefile_readuntil, todo: hardcoded paths -ifdef RUCLIENT_ENABLED - $(info ruclient enabled) - CPP_STANDARD=c++20 - # todo4: add -Werror -DDEBUG - LIBS += -L /home/mmordig/rawhash_project/readuntil_fake/build/ - LIBS += /home/mmordig/rawhash_project/readuntil_fake/build/libont_device_client_LIB.a /home/mmordig/rawhash_project/readuntil_fake/build/libgrpc_utils_LIB.a /home/mmordig/rawhash_project/readuntil_fake/build/libutils_LIB.a /home/mmordig/rawhash_project/readuntil_fake/build/libru_method_LIB.a /home/mmordig/rawhash_project/readuntil_fake/build/libdata_client_LIB.a /home/mmordig/rawhash_project/readuntil_fake/build/libgrpc_utils_LIB.a /home/mmordig/rawhash_project/readuntil_fake/build/libreaduntil_grpc_proto.a /home/mmordig/.local/lib/libgrpc++_reflection.a /home/mmordig/.local/lib/libgrpc++.a /home/mmordig/.local/lib/libgrpc.a /home/mmordig/.local/lib/libre2.a /home/mmordig/.local/lib/libupb_json_lib.a /home/mmordig/.local/lib/libupb_textformat_lib.a /home/mmordig/.local/lib/libupb_collections_lib.a /home/mmordig/.local/lib/libupb.a /home/mmordig/.local/lib/libutf8_range_lib.a /home/mmordig/.local/lib/libz.a /home/mmordig/.local/lib/libcares.a /home/mmordig/.local/lib/libgpr.a /home/mmordig/.local/lib/libabsl_random_distributions.a /home/mmordig/.local/lib/libabsl_random_seed_sequences.a /home/mmordig/.local/lib/libabsl_random_internal_pool_urbg.a /home/mmordig/.local/lib/libabsl_random_internal_randen.a /home/mmordig/.local/lib/libabsl_random_internal_randen_hwaes.a /home/mmordig/.local/lib/libabsl_random_internal_randen_hwaes_impl.a /home/mmordig/.local/lib/libabsl_random_internal_randen_slow.a /home/mmordig/.local/lib/libabsl_random_internal_platform.a /home/mmordig/.local/lib/libabsl_random_internal_seed_material.a /home/mmordig/.local/lib/libabsl_random_seed_gen_exception.a /home/mmordig/.local/lib/libssl.a /home/mmordig/.local/lib/libcrypto.a /home/mmordig/.local/lib/libaddress_sorting.a -ldl -lm -lrt /home/mmordig/.local/lib/libprotobuf.a /home/mmordig/.local/lib/libabsl_log_internal_check_op.a /home/mmordig/.local/lib/libabsl_leak_check.a /home/mmordig/.local/lib/libabsl_die_if_null.a /home/mmordig/.local/lib/libabsl_log_internal_conditions.a /home/mmordig/.local/lib/libabsl_log_internal_message.a /home/mmordig/.local/lib/libabsl_log_internal_nullguard.a /home/mmordig/.local/lib/libabsl_examine_stack.a /home/mmordig/.local/lib/libabsl_log_internal_format.a /home/mmordig/.local/lib/libabsl_log_internal_proto.a /home/mmordig/.local/lib/libabsl_log_internal_log_sink_set.a /home/mmordig/.local/lib/libabsl_log_sink.a /home/mmordig/.local/lib/libabsl_log_entry.a /home/mmordig/.local/lib/libabsl_flags.a /home/mmordig/.local/lib/libabsl_flags_internal.a /home/mmordig/.local/lib/libabsl_flags_marshalling.a /home/mmordig/.local/lib/libabsl_flags_reflection.a /home/mmordig/.local/lib/libabsl_flags_config.a /home/mmordig/.local/lib/libabsl_flags_program_name.a /home/mmordig/.local/lib/libabsl_flags_private_handle_accessor.a /home/mmordig/.local/lib/libabsl_flags_commandlineflag.a /home/mmordig/.local/lib/libabsl_flags_commandlineflag_internal.a /home/mmordig/.local/lib/libabsl_log_initialize.a /home/mmordig/.local/lib/libabsl_log_globals.a /home/mmordig/.local/lib/libabsl_log_internal_globals.a /home/mmordig/.local/lib/libabsl_raw_hash_set.a /home/mmordig/.local/lib/libabsl_hash.a /home/mmordig/.local/lib/libabsl_city.a /home/mmordig/.local/lib/libabsl_low_level_hash.a /home/mmordig/.local/lib/libabsl_hashtablez_sampler.a /home/mmordig/.local/lib/libabsl_statusor.a /home/mmordig/.local/lib/libabsl_status.a /home/mmordig/.local/lib/libabsl_cord.a /home/mmordig/.local/lib/libabsl_cordz_info.a /home/mmordig/.local/lib/libabsl_cord_internal.a /home/mmordig/.local/lib/libabsl_cordz_functions.a /home/mmordig/.local/lib/libabsl_exponential_biased.a /home/mmordig/.local/lib/libabsl_cordz_handle.a /home/mmordig/.local/lib/libabsl_crc_cord_state.a /home/mmordig/.local/lib/libabsl_crc32c.a /home/mmordig/.local/lib/libabsl_crc_internal.a /home/mmordig/.local/lib/libabsl_crc_cpu_detect.a /home/mmordig/.local/lib/libabsl_bad_optional_access.a /home/mmordig/.local/lib/libabsl_str_format_internal.a /home/mmordig/.local/lib/libabsl_strerror.a /home/mmordig/.local/lib/libabsl_synchronization.a /home/mmordig/.local/lib/libabsl_stacktrace.a /home/mmordig/.local/lib/libabsl_symbolize.a /home/mmordig/.local/lib/libabsl_debugging_internal.a /home/mmordig/.local/lib/libabsl_demangle_internal.a /home/mmordig/.local/lib/libabsl_graphcycles_internal.a /home/mmordig/.local/lib/libabsl_kernel_timeout_internal.a /home/mmordig/.local/lib/libabsl_malloc_internal.a /home/mmordig/.local/lib/libabsl_time.a /home/mmordig/.local/lib/libabsl_civil_time.a /home/mmordig/.local/lib/libabsl_time_zone.a /home/mmordig/.local/lib/libabsl_bad_variant_access.a /home/mmordig/.local/lib/libutf8_validity.a /home/mmordig/.local/lib/libabsl_strings.a /home/mmordig/.local/lib/libabsl_int128.a /home/mmordig/.local/lib/libabsl_string_view.a /home/mmordig/.local/lib/libabsl_throw_delegate.a /home/mmordig/.local/lib/libabsl_strings_internal.a /home/mmordig/.local/lib/libabsl_base.a /home/mmordig/.local/lib/libabsl_raw_logging_internal.a /home/mmordig/.local/lib/libabsl_log_severity.a /home/mmordig/.local/lib/libabsl_spinlock_wait.a -lrt /home/mmordig/rawhash_project/readuntil_fake/build/libchannel_data_LIB.a /home/mmordig/rawhash_project/readuntil_fake/build/libcurrent_read_data_LIB.a /home/mmordig/rawhash_project/readuntil_fake/build/libchannel_stats_LIB.a /home/mmordig/rawhash_project/readuntil_fake/build/libutils_LIB.a /home/mmordig/rawhash_project/readuntil_fake/build/libthreadsafe_queue_LIB.a /home/mmordig/rawhash_project/readuntil_fake/build/external/spdlog/libspdlogd.a - INCLUDES += -I/home/mmordig/rawhash_project/readuntil_fake/external/spdlog/include \ - -I/home/mmordig/rawhash_project/readuntil_fake/include/ - CFLAGS += -DRUCLIENT_ENABLED - CPPFLAGS += -DRUCLIENT_ENABLED -else - $(info ruclient disabled) -endif - -#pass DEBUG=1 to make to enable debugging -ifdef DEBUG - CFLAGS += -Wall ${DEBUG_OPTS} -g -Wc++-compat -fopenmp -march=native -pthread -DHAVE_KALLOC - CPPFLAGS += --std=${CPP_STANDARD} -Wall ${DEBUG_OPTS} -g -fopenmp -march=native -pthread -DHAVE_KALLOC -else - CFLAGS += -Wall -O3 -Wc++-compat -fopenmp -march=native -pthread -DHAVE_KALLOC - CPPFLAGS += -std=${CPP_STANDARD} -Wall -O3 -fopenmp -march=native -pthread -DHAVE_KALLOC -endif - -#pass PROFILE=1 to make to enable profiling -ifdef PROFILE - CFLAGS+=-g -fno-omit-frame-pointer -march=native -DPROFILERH=1 - CPPFLAGS+=-g -fno-omit-frame-pointer -march=native -DPROFILERH=1 -endif - -OBJS= kthread.o kalloc.o bseq.o roptions.o sequence_until.o rutils.o rsig.o revent.o rsketch.o rindex.o lchain.o rseed.o rmap.o dtw.o hit.o -ifdef PYBINDING - $(info building pybinding) - OBJS += rawhash_mapper.o - PROG=rawhash_pybinding -else - OBJS += main.o - PROG=rawhash2 -endif - -ifdef RUCLIENT_ENABLED - OBJS += rawhash_ruclient.o -endif - -# check if ccache is available -CCACHE := $(shell command -v ccache 2> /dev/null) -ifeq ($(CCACHE),) -# do not use tabs on first line, otherwise "make" thinks it is a recipe - $(info ccache not found, using default compiler) -else - $(info ccache found, using ccache) - CC = ccache gcc - CXX = ccache g++ -endif - -# check if mold linker is available -MOLD_PATH := $(shell command -v mold 2> /dev/null) -ifeq (,$(MOLD_PATH)) - $(info mold not found, using default linker) -else - $(info mold found, using mold) - CPPFLAGS += -B/usr/libexec/mold # for faster linking, todo: fix path -endif - -CXX_COMPILER_VERSION ?= $(shell $(CXX) -dumpversion) -SYSTEM_PROCESSOR ?= $(shell uname -m) -SYSTEM_NAME ?= $(shell uname -s) - -WORKDIR = $(shell pwd)/../extern - -POD5_VERSION = 0.3.10 -POD5_URLDIR = pod5-$(POD5_VERSION)-$(SYSTEM_NAME) -POD5_REPO = https://github.com/nanoporetech/pod5-file-format -ifdef POD5_INCLUDE_DIR - POD5_DOWNLOAD = 0 -endif -POD5_INCLUDE_DIR ?= $(WORKDIR)/$(POD5_URLDIR)/include - -ifeq ($(SYSTEM_NAME),Linux) - POD5_LIB ?= lib64 - ifeq ($(shell echo $(SYSTEM_PROCESSOR) | grep -E '^(aarch64|arm)'),) - POD5_URL = $(POD5_REPO)/releases/download/$(POD5_VERSION)/lib_pod5-$(POD5_VERSION)-linux-x64.tar.gz - else ifeq ($(shell expr $(CXX_COMPILER_VERSION) \< 9.0),1) - POD5_URLDIR = pod5-$(POD5_VERSION)-$(SYSTEM_NAME) - POD5_URL = $(POD5_REPO)/releases/download/$(POD5_VERSION)/lib_pod5-$(POD5_VERSION)-linux-gcc7-arm64.tar.gz - POD5_LIB ?= lib - else - POD5_URLDIR = pod5-$(POD5_VERSION)-$(SYSTEM_NAME) - POD5_URL = $(POD5_REPO)/releases/download/$(POD5_VERSION)/lib_pod5-$(POD5_VERSION)-linux-arm64.tar.gz - endif - POD5_LIB_DIR ?= $(WORKDIR)/$(POD5_URLDIR)/$(POD5_LIB) - POD5_LIBRARIES ?= $(POD5_LIB_DIR)/libpod5_format.a \ - $(POD5_LIB_DIR)/libarrow.a \ - $(POD5_LIB_DIR)/libjemalloc_pic.a -endif - -ifeq ($(SYSTEM_NAME),Darwin) - POD5_URL = $(POD5_REPO)/releases/download/$(POD5_VERSION)/lib_pod5-$(POD5_VERSION)-osx-11.0-arm64.tar.gz - POD5_LIB_DIR ?= $(WORKDIR)/$(POD5_URLDIR)/lib - POD5_LIBRARIES ?= $(POD5_LIB_DIR)/libpod5_format.a \ - $(POD5_LIB_DIR)/libarrow.a -endif - -ifeq ($(SYSTEM_NAME),Windows_NT) - POD5_URL = $(POD5_REPO)/releases/download/$(POD5_VERSION)/lib_pod5-$(POD5_VERSION)-win -endif - -HDF5_DIR ?= ${WORKDIR}/hdf5/ -HDF5_BUILD_DIR ?= ${HDF5_DIR}/build -ifdef HDF5_INCLUDE_DIR - HDF5_COMPILE = 0 -endif -HDF5_INCLUDE_DIR ?= ${HDF5_BUILD_DIR}/include -HDF5_LIB_DIR ?= ${HDF5_BUILD_DIR}/lib -HDF5_LIB ?= hdf5 - -SLOW5_DIR ?= ${WORKDIR}/slow5lib/ -ifdef SLOW5_INCLUDE_DIR - SLOW5_COMPILE = 0 -endif -SLOW5_INCLUDE_DIR ?= ${SLOW5_DIR}/include -SLOW5_LIB_DIR ?= ${SLOW5_DIR}/lib - -LIBS+=-L${WORKDIR}/tensorflow/tflite_build/ -# INCLUDES+=-I${WORKDIR}/gsl/build/include -# LIBS+=${WORKDIR}/gsl/build/libgsl.a -# LIBS+=${WORKDIR}/gsl/build/libgslcblas.a - -#pass NOPOD5=1 to make to disable compiling with POD5 -ifeq ($(NOPOD5),1) - CFLAGS+=-DNPOD5RH=1 - CPPFLAGS+=-DNPOD5RH=1 -else - INCLUDES+=-I${POD5_INCLUDE_DIR} - LIBS+=-L${WORKDIR}/zstd/lib/ ${POD5_LIBRARIES} -lzstd -endif - -#pass NOHDF5=1 to make to disable compiling with HDF5 -ifeq ($(NOHDF5),1) - CFLAGS+=-DNHDF5RH=1 - CPPFLAGS+=-DNHDF5RH=1 -else - INCLUDES+=-I${HDF5_INCLUDE_DIR} - LIBS+=${HDF5_LIB_DIR}/lib${HDF5_LIB}.a -endif - -#pass NOSLOW5=1 to make to disable compiling with SLOW5 -ifeq ($(NOSLOW5),1) - CFLAGS+=-DNSLOW5RH=1 - CPPFLAGS+=-DNSLOW5RH=1 -else - INCLUDES+=-I${SLOW5_INCLUDE_DIR} - LIBS+=${SLOW5_LIB_DIR}/libslow5.a -endif - -INCLUDES+=-I${WORKDIR}/tensorflow/ -LIBS+=-lm -lz -ldl -ltensorflowlite_c - -ifneq ($(aarch64),) - arm_neon=1 -endif - -ifneq ($(arm_neon),) # if arm_neon is defined -ifeq ($(aarch64),) #if aarch64 is not defined - CFLAGS+=-D_FILE_OFFSET_BITS=64 -mfpu=neon -fsigned-char -else #if aarch64 is defined - CFLAGS+=-D_FILE_OFFSET_BITS=64 -fsigned-char -endif -endif - -ifneq ($(asan),) - $(info AddressSanitizer enabled) - CFLAGS+=-fsanitize=address - LIBS+=-fsanitize=address -endif - -ifneq ($(tsan),) - $(info ThreadSanitizer enabled) - CFLAGS+=-fsanitize=thread - LIBS+=-fsanitize=thread -endif - -.PHONY: all subset clean help - -all: zstd hdf5 slow5 pod5 tensorflow check_hdf5 check_slow5 check_pod5 check_tensorflow $(PROG) ## Build RawHash with HDF5, SLOW5, and POD5 -subset: check_zstd check_pod5 check_slow5 check_hdf5 check_tensorflow $(PROG) ## Build RawHash without recompiling HDF5, SLOW5, and POD5 - -help: ##Show help - @echo "Set PROFILE=1 to enable profiling" - @echo "Set DEBUG=1 to enable debugging" - @echo - @echo "Set NOHDF5=1 to disable compiling with HDF5 (note that at least one of HDF5, SLOW5, and POD5 must be enabled)" - @echo "Set NOSLOW5=1 to disable compiling with SLOW5 (note that at least one of HDF5, SLOW5, and POD5 must be enabled)" - @echo "Set NOPOD5=1 to disable compiling with POD5 (note that at least one of HDF5, SLOW5, and POD5 must be enabled)" - @echo - @echo "use \"make subset\" to prevent compiling HDF5, SLOW5, and POD5 libraries from scratch if they are already built" - @echo "use \"make clean\" to remove all generated files" - -check_zstd: - @if [ "$(NOPOD5)" != "1" ]; then \ - [ -f "${WORKDIR}/zstd/lib/libzstd.so" ] || { echo "ZSTD library not found" >&2; exit 1; }; \ - fi - -check_hdf5: - @if [ "$(NOHDF5)" != "1" ]; then \ - [ -f "${HDF5_INCLUDE_DIR}/H5pubconf.h" ] || { echo "HDF5 headers not found" >&2; exit 1; }; \ - [ -f "${HDF5_LIB_DIR}/lib${HDF5_LIB}.so" ] || [ -f "${HDF5_LIB_DIR}/lib${HDF5_LIB}.a" ] || { echo "HDF5 library not found" >&2; exit 1; }; \ - fi - -check_slow5: - @if [ "$(NOSLOW5)" != "1" ]; then \ - [ -f "${SLOW5_INCLUDE_DIR}/slow5/slow5.h" ] || { echo "SLOW5 headers not found" >&2; exit 1; }; \ - [ -f "${SLOW5_LIB_DIR}/libslow5.so" ] || [ -f "${SLOW5_LIB_DIR}/libslow5.a" ] || { echo "SLOW5 library not found" >&2; exit 1; }; \ - fi - -check_pod5: - @if [ "$(NOPOD5)" != "1" ]; then \ - [ -f "${POD5_INCLUDE_DIR}/pod5_format/c_api.h" ] || { echo "POD5 headers not found" >&2; exit 1; }; \ - [ -f "$(POD5_LIB_DIR)/libpod5_format.a" ] || { echo "POD5 library not found" >&2; exit 1; }; \ - fi - -check_tensorflow: - @[ -e "${WORKDIR}/tensorflow/tflite_build/libtensorflowlite_c.so" ] || { echo "Tensorflow library not found" >&2; exit 1; } - -zstd: - @if [ "$(NOPOD5)" != "1" ]; then \ - cd ${WORKDIR}/zstd && make -j; \ - fi - -hdf5: - @if [ "$(NOHDF5)" != "1" ] && [ "$(HDF5_COMPILE)" != "0" ]; then \ - cd ${HDF5_DIR}; \ - mkdir -p build; \ - ./configure --enable-threadsafe --disable-hl --prefix="${HDF5_BUILD_DIR}"; \ - make -j; \ - make install; \ - fi - -slow5: - @if [ "$(NOSLOW5)" != "1" ] && [ "$(SLOW5_COMPILE)" != "0" ]; then \ - make -C ${SLOW5_DIR}; \ - fi - -pod5: check_zstd - @if [ "$(NOPOD5)" != "1" ] && [ "$(POD5_DOWNLOAD)" != "0" ]; then \ - cd $(WORKDIR) && mkdir -p $(POD5_URLDIR); cd $(POD5_URLDIR); wget -qO- $(POD5_URL) | tar -xzv; \ - fi - -tensorflow: - cd ${WORKDIR}/tensorflow && mkdir -p tflite_build && cd tflite_build/ && cmake ../tensorflow/lite/c && cmake --build . -j; \ - -# ifeq ($(NOHDF5),1) -# FASTDEPCOM=${CC} -# FASTDEPCOM_FLAGS=$(CFLAGS) -# else -# FASTDEPCOM=${CXX} -# FASTDEPCOM_FLAGS=$(CPPFLAGS) -# endif -# always use CPP, does not work otherwise -FASTDEPCOM=${CXX} -FASTDEPCOM_FLAGS=$(CPPFLAGS) - -# $(PROG): $(OBJS) -# ${FASTDEPCOM} $(FASTDEPCOM_FLAGS) $(OBJS) -o $(PROG) $(LIBS) - -$(PROG): $(OBJS) - ${CXX} $(CPPFLAGS) $(OBJS) -o $(PROG) $(LIBS) -lstdc++ -Wl,-rpath,${WORKDIR}/tensorflow/tflite_build/ - -# rsig.o: rsig.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# rindex.o: rindex.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# main.o: main.cpp -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# rmap.o: rmap.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# lchain.o: lchain.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# rseed.o: rseed.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -# hit.o: hit.c -# $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -.SUFFIXES: -.SUFFIXES:.cpp .o - -%.o: %.cpp - ${CXX} -c $(CPPFLAGS) $(INCLUDES) $< -o $@ - -.SUFFIXES: -.SUFFIXES:.c .o - -# %.o: %.c -# ${CC} -c $(CFLAGS) $(INCLUDES) $< -o $@ - -%.o: %.c - $(FASTDEPCOM) -c $(FASTDEPCOM_FLAGS) $(INCLUDES) $< -o $@ - -clean: ## Remove all generated files - rm -fr *.o $(PROG) *~ - -rsketch.o: rutils.h rh_kvec.h -lchain.o: kalloc.h rutils.h rseed.h rsketch.h chain.h krmq.h -hit.o: chain.h kalloc.h khash.h -rsig.o: hdf5_tools.hpp rh_kvec.h -rseed.o: rsketch.h kalloc.h rutils.h rindex.h -hit.o: rmap.h kalloc.h khash.h -rmap.o: rindex.h rsig.h kthread.h rh_kvec.h rutils.h rsketch.h revent.h sequence_until.h dtw.h -revent.o: roptions.h kalloc.h -rindex.o: roptions.h rutils.h rsketch.h rsig.h bseq.h khash.h rh_kvec.h kthread.h -main:o rawhash.h ketopt.h rutils.h diff --git a/src/bseq.h b/src/bseq.h index c0bdc63..55f7b16 100644 --- a/src/bseq.h +++ b/src/bseq.h @@ -14,7 +14,7 @@ typedef struct mm_bseq_file_s mm_bseq_file_t; typedef struct { int l_seq, rid; char *name, *seq, *qual, *comment; -} mm_bseq1_t; +} mm_bseq1_t; // reference sequence of length l_seq (basepairs), reference id rid, name, sequence, quality, and comment mm_bseq_file_t *mm_bseq_open(const char *fn); void mm_bseq_close(mm_bseq_file_t *fp); diff --git a/src/cli_parsing.cpp b/src/cli_parsing.cpp index 0dc7313..b5f3068 100644 --- a/src/cli_parsing.cpp +++ b/src/cli_parsing.cpp @@ -80,6 +80,7 @@ static ko_longopt_t long_options[] = { { (char*)"out-quantize", ko_no_argument, 368 }, { (char*)"no-event-detection", ko_no_argument, 369 }, { (char*)"version", ko_no_argument, 370 }, + { (char*)"no-revcomp-query", ko_no_argument, 371 }, { 0, 0, 0 } }; @@ -388,7 +389,7 @@ CLIParsedArgs parse_args(int argc, char *argv[]) { opt.chain_gap_scale = 1.2f; } - else if (c == 362) {ipt.flag |= RI_I_REV_QUERY;}// --rev-query + else if (c == 362) {ipt.flag |= RI_I_REV_QUERY;}// --rev-query, better name: --idx-dont-store-rev, or negate else if (c == 363) {opt.model_map_path = o.arg;}// --map-model else if (c == 364) {ipt.fine_min = atof(o.arg);}// --fine-min else if (c == 365) {ipt.fine_max = atof(o.arg);}// --fine-max @@ -397,6 +398,7 @@ CLIParsedArgs parse_args(int argc, char *argv[]) { else if (c == 368) {ipt.flag |= RI_I_OUT_QUANTIZE; ipt.flag |= RI_I_SIG_TARGET;}// --out-quantize else if (c == 369) {ipt.flag |= RI_I_NO_EVENT_DETECTION;}// --no-event-detection else if (c == 370) {puts(RH_VERSION); exit(EXIT_SUCCESS);}// --version + else if (c == 371) {opt.flag |= RI_M_DONT_QUERY_REVCOMP;}// --no-revcomp-query, only has an effect when "--rev-query" is used, to avoid querying the index with the revcomp else if (c == 'V') {puts(RH_VERSION); exit(EXIT_SUCCESS);} } diff --git a/src/hdf5_tools.hpp b/src/hdf5_tools.hpp index d9fb715..a8622e7 100644 --- a/src/hdf5_tools.hpp +++ b/src/hdf5_tools.hpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include diff --git a/src/main.cpp b/src/main.cpp index 816c63c..c2a61a5 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -46,7 +46,6 @@ int main(int argc, char *argv[]) ketopt_t& o = parsed_args.o; ri_idx_reader_t *idx_rdr; - ri_idx_t *ri; idx_rdr = ri_idx_reader_open(argv[o.ind], &ipt, idx_out_filename); if (idx_rdr == 0) { fprintf(stderr, "[ERROR] failed to open file '%s': %s\n", argv[o.ind], strerror(errno)); @@ -59,17 +58,18 @@ int main(int argc, char *argv[]) exit(EXIT_FAILURE); } + // load the pore model ri_pore_t pore; pore.pore_vals = NULL; pore.pore_inds = NULL; pore.max_val = -5000.0; pore.min_val = 5000.0; - if(!(ipt.flag&RI_I_OUT_QUANTIZE)){ - if(!idx_rdr->is_idx && fpore == 0){ + if(!(ipt.flag&RI_I_OUT_QUANTIZE)) { + if(!idx_rdr->is_idx && fpore == 0) { fprintf(stderr, "[ERROR] missing input: please specify a pore model file with -p when generating the index from a sequence file\n"); ri_idx_reader_close(idx_rdr); exit(EXIT_FAILURE); - }else if(!idx_rdr->is_idx && fpore){ + } else if(!idx_rdr->is_idx && fpore) { load_pore(fpore, ipt.k, ipt.lev_col, &pore); if(!pore.pore_vals){ fprintf(stderr, "[ERROR] cannot parse the k-mer pore model file. Please see the example k-mer model files provided in the RawHash repository.\n"); @@ -81,6 +81,7 @@ int main(int argc, char *argv[]) #ifdef RUCLIENT_ENABLED // todo1: remove + // try to read the port from a predefined file int& port = ru_server_port; if (port == 0) { std::ifstream port_file("/home/mmordig/rawhash_project/ru_python/example_run/server_run/ont_device_server_port.txt"); @@ -97,21 +98,23 @@ int main(int argc, char *argv[]) log("Using port " + std::to_string(port)); #endif + // position o.ind is the index file, argc is 1-based + ri_idx_t *ri; + bool num_query_files = argc - (o.ind + 1); if (ru_server_port == -1) { // offline processing while ((ri = ri_idx_reader_read(idx_rdr, &pore, n_threads)) != 0) { - int ret; if (ri_verbose >= 3) fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", __func__, ri_realtime() - ri_realtime0, ri_cputime() / (ri_realtime() - ri_realtime0), ri->n_seq); - if (argc != o.ind + 1) ri_mapopt_update(&opt, ri); // only update the index if it is used for querying later, todo4: can be moved down to before "ret = 0" + if (num_query_files > 0) ri_mapopt_update(&opt, ri); // only update the index if it is used for querying later, todo4: can be moved down to before "ret = 0" if (ri_verbose >= 3) ri_idx_stat(ri); - if (argc - (o.ind + 1) == 0) { + if (num_query_files == 0) { fprintf(stderr, "[INFO] no files to query index on, just created the index\n"); ri_idx_destroy(ri); continue; // no query files, just creating the index } - ret = 0; + int ret = 0; // if (!(opt.flag & MM_F_FRAG_MODE)) { //TODO: enable frag mode directly from options // for (i = o.ind + 1; i < argc; ++i) { // ret = ri_map_file(ri, argv[i], &opt, n_threads); @@ -119,7 +122,7 @@ int main(int argc, char *argv[]) // } // } // else { //TODO: enable frag mode directly from options - ret = ri_map_file_frag(ri, argc - (o.ind + 1), (const char**)&argv[o.ind + 1], &opt, n_threads); + ret = ri_map_file_frag(ri, num_query_files, (const char**)&argv[o.ind + 1], &opt, n_threads); // } ri_idx_destroy(ri); if (ret < 0) { diff --git a/src/rawhash_ruclient.cpp b/src/rawhash_ruclient.cpp index d30783f..f538e58 100644 --- a/src/rawhash_ruclient.cpp +++ b/src/rawhash_ruclient.cpp @@ -140,72 +140,4 @@ void RawHashDecisionMaker::mark_final_decision(ReadIdentifier const& read_ident, channel_read_mapper.free_mappings(); } -// todo1: remove -// // map reads coming from readuntil -// void map_reads_from_ru() { -// // see map_worker_pipeline for what is freed and how (memory allocator) -// // reuse buffer and reg for each read -// ri_tbuf_t* b = ri_tbuf_init(); -// ri_reg1_t* reg = (ri_reg1_t*)malloc(sizeof(ri_reg1_t)); - -// todo: cannot reuse reg0 due to write out - -// todo: write_out_mappings_and_free(reg0, ri); -// } - -// map reads from a single channel to a single reference -// class ChannelDataMapper { -// public: -// ChannelDataMapper(const ri_idx_t *ri, const ri_mapopt_t *opt): ri(ri), opt(opt) { -// b = ri_tbuf_init(); -// } - -// // called by thread, tries to map current read in channel -// void map_reads() { - -// { -// ri_reg1_t* reg0 = (ri_reg1_t*)malloc(sizeof(ri_reg1_t)); // todo: calloc and push to queue -// // start new read -// init_reg1_t(reg0); - -// uint32_t c_count = 0; -// double mean_sum = 0, std_dev_sum = 0; -// uint32_t n_events_sum = 0; -// double t = ri_realtime(); - -// // todo -// uint32_t l_chunk = 10; -// ri_sig_t* sig = NULL; -// uint32_t qlen = 100; - -// // process chunks - -// double mapping_time = ri_realtime() - t; -// #ifdef PROFILERH -// ri_maptime += mapping_time; -// #endif - -// try_mapping_if_none_found(reg0, opt); -// compute_tag_and_mapping_info(c_count, c_count * l_chunk, reg0, opt, mapping_time, qlen, sig->name, ri); -// free_most_of_ri_reg1_t(b->km, reg0); -// km_destroy_and_recreate(&(b->km)); - -// // todo: write to queue instead which writes out reads -// write_out_mappings_and_free(reg0, ri); - -// // wait until read has ended -// } - -// } - -// ~ChannelDataMapper() { -// // free buffer and reg -// ri_tbuf_destroy(b); -// } -// private: -// ri_tbuf_t* b = NULL; // thread-local buffer -// const ri_idx_t *ri; // reference index -// const ri_mapopt_t *opt; // mapping options -// }; - #endif \ No newline at end of file diff --git a/src/rawhash_ruclient.hpp b/src/rawhash_ruclient.hpp index 48c0dd4..a0473d5 100644 --- a/src/rawhash_ruclient.hpp +++ b/src/rawhash_ruclient.hpp @@ -15,12 +15,12 @@ struct SingleChannelCalibration { double offset; }; -struct Alignment { - char* contig; - uint32_t ref_start; - uint32_t ref_end; - bool pos_strand; -}; +// struct Alignment { +// char* contig; +// uint32_t ref_start; +// uint32_t ref_end; +// bool pos_strand; +// }; typedef DecisionMaker::ChunkType ChunkType; diff --git a/src/rawhash_wrapper.cpp b/src/rawhash_wrapper.cpp new file mode 100644 index 0000000..75b2305 --- /dev/null +++ b/src/rawhash_wrapper.cpp @@ -0,0 +1,174 @@ + +#include +#include + +#include "rawhash_wrapper.hpp" +#include "rawhash.h" +#include "rmap.h" +#include "cli_parsing.cpp" + +RawHashMapper::RawHashMapper(int argc, char *argv[]) { + std::cout << "RawHashMapper constructor" << std::endl; + + // copied from main() + liftrlimit(); + ri_realtime0 = ri_realtime(); + + CLIParsedArgs parsed_args = parse_args(argc, argv); + ri_mapopt_t& opt = parsed_args.opt; + ri_idxopt_t& ipt = parsed_args.ipt; + int& n_threads = parsed_args.n_threads; + // int n_parts; + char*& idx_out_filename = parsed_args.idx_out_filename; + char*& fpore = parsed_args.fpore; + // FILE*& fp_help = parsed_args.fp_help; + int& ru_server_port = parsed_args.ru_server_port; + ketopt_t& o = parsed_args.o; + + ri_idx_reader_t *idx_rdr; + idx_rdr = ri_idx_reader_open(argv[o.ind], &ipt, idx_out_filename); + if (idx_rdr == 0) { + fprintf(stderr, "[ERROR] failed to open file '%s': %s\n", argv[o.ind], strerror(errno)); + exit(EXIT_FAILURE); + } + + if (!idx_rdr->is_idx && idx_out_filename == 0 && argc - o.ind < 2 && !(ipt.flag&RI_I_OUT_QUANTIZE)) { + fprintf(stderr, "[ERROR] missing input: please specify a query FAST5/SLOW5/POD5 file(s) to map or option -d to store the index in a file before running the mapping\n"); + ri_idx_reader_close(idx_rdr); + exit(EXIT_FAILURE); + } + + // load the pore model + ri_pore_t pore; + pore.pore_vals = NULL; + pore.pore_inds = NULL; + pore.max_val = -5000.0; + pore.min_val = 5000.0; + if(!(ipt.flag&RI_I_OUT_QUANTIZE)) { + if(!idx_rdr->is_idx && fpore == 0) { + fprintf(stderr, "[ERROR] missing input: please specify a pore model file with -p when generating the index from a sequence file\n"); + ri_idx_reader_close(idx_rdr); + exit(EXIT_FAILURE); + } else if(!idx_rdr->is_idx && fpore) { + load_pore(fpore, ipt.k, ipt.lev_col, &pore); + if(!pore.pore_vals){ + fprintf(stderr, "[ERROR] cannot parse the k-mer pore model file. Please see the example k-mer model files provided in the RawHash repository.\n"); + ri_idx_reader_close(idx_rdr); + exit(EXIT_FAILURE); + } + } + } + + + ri_idx_t* ri = ri_idx_reader_read(idx_rdr, &pore, n_threads); + if (ri == 0) { + fprintf(stderr, "[ERROR] failed to read the index\n"); + ri_idx_reader_close(idx_rdr); + exit(EXIT_FAILURE); + } + + if (ri_verbose >= 3) + fprintf(stderr, "[M::%s::%.3f*%.2f] loaded/built the index for %d target sequence(s)\n", + __func__, ri_realtime() - ri_realtime0, ri_cputime() / (ri_realtime() - ri_realtime0), ri->n_seq); + + ri_mapopt_update(&opt, ri); + if (ri_verbose >= 3) ri_idx_stat(ri); + + _ri = ri; + _opt = new ri_mapopt_t; + *((ri_mapopt_t*)_opt) = opt; // copy constructor +} + +RawHashMapper::~RawHashMapper() { + fprintf(stderr, "[M::%s] closing the index\n", __func__); + ri_idx_destroy((ri_idx_t*)_ri); + delete _opt; +} + +void RawHashMapper::print_idx_info() const { + fprintf(stderr, "[M::%s] index info\n", __func__); // todo + ri_idx_stat((ri_idx_t*)_ri); +} + +std::vector RawHashMapper::map(float* raw_signal, int signal_length) { + // print + // fprintf(stderr, "[M::%s] mapping\n", __func__); + + if (ri_verbose >= 3) { + for (int i = 0; (i < signal_length) && (i <= 10); i++) { + fprintf(stderr, "%f ", raw_signal[i]); + } + fprintf(stderr, "..."); + } + + ri_idx_t* ri = (ri_idx_t*)_ri; + pipeline_mt pipeline = { + // .n_processed = 0, + // .n_threads = 1, + // .n_fp = 0, + // .cur_fp = 0, + // .n_f = 0, + // .cur_f = 0, + // .mini_batch_size = 0, + .opt = (ri_mapopt_t*)_opt, + // .f = NULL, + // .fp = NULL, + .ri = ri, + // .fn = NULL, + // .su_nreads = 0, + // .su_nestimations = 0, + // .ab_count = 0, + // .su_cur = 0, + // .su_estimations = NULL, + // .su_c_estimations = NULL, + // .su_stop = 0, + .map_model = NULL + }; + ri_tbuf_t *buf = ri_tbuf_init(); + char* read_name = "read123"; + ri_sig_t signal = { + .rid = 123, + .l_sig = signal_length, + .name = read_name, + + .offset = 0, // todo + .sig = raw_signal, + }; + ri_sig_t* signal_ptr = &signal; // since &&signal not working since &signal is temporary + ri_reg1_t reg0_noptr; // will be initialized + ri_reg1_t* reg0 = ®0_noptr; + step_mt data = { + .p = &pipeline, + .n_sig = 1, + .sig = (ri_sig_t**)(&signal_ptr), + .reg = (ri_reg1_t**)(®0), + .buf = &buf + }; + map_worker_for(&data, 0, 0); + + // write out + if (ri_verbose >= 3) { + write_out_mappings_to_stdout(reg0, ri); + } + std::vector alignments; + for (int m = 0; m < reg0->n_maps; ++m) { + if (reg0->maps[m].ref_id < ri->n_seq) { + // mapped + alignments.push_back(Alignment { + .contig = (ri->flag & RI_I_SIG_TARGET) ? ri->sig[reg0->maps[m].ref_id].name : ri->seq[reg0->maps[m].ref_id].name, + .ref_start = reg0->maps[m].fragment_start_position, + .ref_end = reg0->maps[m].fragment_start_position + reg0->maps[m].fragment_length, + .is_pos_strand = not reg0->maps[m].rev + }); + } + } + // dummy alignment + // alignments.push_back(Alignment { + // .contig = "fakealignment_chr123", .ref_start = 123, .ref_end = 456, .is_pos_strand = true + // }); + free_mappings_ri_reg1_t(reg0); + + ri_tbuf_destroy(buf); + + return alignments; +} \ No newline at end of file diff --git a/src/rawhash_wrapper.hpp b/src/rawhash_wrapper.hpp new file mode 100644 index 0000000..4827e58 --- /dev/null +++ b/src/rawhash_wrapper.hpp @@ -0,0 +1,53 @@ +#pragma once + +#include +#include + +struct Alignment { + const char* contig; + uint32_t ref_start; + uint32_t ref_end; // exclusive + bool is_pos_strand; +}; + +class RawHashMapper { + /** + * @brief Wrapper that can be used from other code (as an alternative to the CLI) + * + * This API is inefficient (no threading) and only for experimenting + * with RawHash from Python without having to load the index repeatedly. + * + * Also mixes malloc/free and new/delete + */ +public: + /** + * @brief Use the same arguments as the CLI + * + * Including the leading program name (which will be ignored) + * Note that the option "-d index" (to dump the index) is ignored, so the index should be + * dumped with the rawhash CLI, then it can be used for mapping with this function. + * + * @param argc + * @param argv + */ + RawHashMapper(int argc, char *argv[]); + ~RawHashMapper(); + + /* + * Map the raw read, returning the alignments + * + * Same as ri_map_file_frag -> map_worker_pipeline -> map_worker_for, + * except it reads from memory rather than the file + * and does not perform pipeling or threading + */ + std::vector map(float* raw_signal, int signal_length); + + /** + * Get info about the index + */ + void print_idx_info() const; + +private: + void *_ri; // ri_idx_t* + void* _opt; // ri_mapopt_t* +}; \ No newline at end of file diff --git a/src/rindex.c b/src/rindex.c index 88eac04..13b68b0 100644 --- a/src/rindex.c +++ b/src/rindex.c @@ -311,7 +311,7 @@ static void *worker_sig_pipeline(void *shared, int step, void *in) if(p->ri->flag&RI_I_OUT_QUANTIZE){ if(s->a.a) ri_kfree(0, s->a.a); - free(s); s = 0; + free(s); s = 0; // so pipeline will stop at this stage } return s; } else if (step == 2) { // dispatch sketch to buckets diff --git a/src/rindex.h b/src/rindex.h index 66480c0..5354ce0 100644 --- a/src/rindex.h +++ b/src/rindex.h @@ -17,7 +17,7 @@ typedef struct ri_idx_bucket_s { int32_t n; // size of the _p_ array uint64_t *p; // position array for seeds appearing >1 times void *h; // hash table indexing _p_ and seeds appearing once -} ri_idx_bucket_t; +} ri_idx_bucket_t; // bucket of hash table typedef struct ri_idx_seq_s{ char *name; // name of the db sequence @@ -52,9 +52,9 @@ typedef struct ri_idx_s{ ri_sig_t *sig; // uint32_t *S; - float **F; //forward + float **F; //forward signal uint32_t *f_l_sig; //length of the signals (forward) - float **R; //reverse + float **R; //reverse signal uint32_t *r_l_sig; //length of the signals (reverse) } ri_idx_t; diff --git a/src/rmap.cpp b/src/rmap.cpp index e545d90..1ffab51 100644 --- a/src/rmap.cpp +++ b/src/rmap.cpp @@ -247,7 +247,7 @@ void align_chain(mm_reg1_t *chain, mm128_t* anchors, const ri_idx_t *ri, const f if(opt->flag & RI_M_DTW_LOG_SCORES){ char str[256]; sprintf(str, "chaining_score=%d alignment_score=%f\n", chain->score, chain->alignment_score); - fprintf(stderr, str); + fprintf(stderr, "%s", str); } } @@ -322,7 +322,7 @@ void ri_map_frag(const ri_idx_t *ri, mm128_v riv = {0,0,0}; ri_sketch(b->km, events, 0, 0, n_events, ri->diff, ri->w, ri->e, ri->n, ri->q, ri->k, ri->fine_min, ri->fine_max, ri->fine_range, &riv, 0); - if(ri->flag&RI_I_REV_QUERY){ + if (!(opt->flag&RI_M_DONT_QUERY_REVCOMP)) { uint32_t span = ri->k+ri->e-1; mm128_v riv_rev = {0,0,0}; mm128_t rev_anchor = {0,0}; @@ -707,7 +707,7 @@ bool continue_mapping_with_new_chunk(const ri_idx_t *ri, // reference index // map a single read to a reference, called in parallel -static void map_worker_for(void *_data, +void map_worker_for(void *_data, long i, int tid) // kt_for() callback { diff --git a/src/rmap.h b/src/rmap.h index 29d2b8d..dee8d9f 100644 --- a/src/rmap.h +++ b/src/rmap.h @@ -151,6 +151,11 @@ bool continue_mapping_with_new_chunk(const ri_idx_t *ri, // reference index double* std_dev_sum, // see mean_sum arg uint32_t* n_events_sum // number of events in all seen chunks (not including current chunk) ); + +// map a single read to a reference, called in parallel +void map_worker_for(void *_data, + long i, + int tid); #ifdef __cplusplus } #endif diff --git a/src/roptions.h b/src/roptions.h index ca51693..97823c5 100644 --- a/src/roptions.h +++ b/src/roptions.h @@ -5,15 +5,15 @@ #ifndef ROPTIONS_H #define ROPTIONS_H -#define RI_I_NAIVE 0x1 -#define RI_I_MIN 0x2 -#define RI_I_BLEND 0x4 -#define RI_I_SYNCMER 0x8 -#define RI_I_STORE_SIG 0x10 -#define RI_I_SIG_TARGET 0x20 -#define RI_I_REV_QUERY 0x40 -#define RI_I_OUT_QUANTIZE 0x80 -#define RI_I_NO_EVENT_DETECTION 0x100 +#define RI_I_NAIVE 0x1 // unused +#define RI_I_MIN 0x2 // unused +#define RI_I_BLEND 0x4 // unused +#define RI_I_SYNCMER 0x8 // unused +#define RI_I_STORE_SIG 0x10 // whether to store the signal (generated from a fasta reference) +#define RI_I_SIG_TARGET 0x20 // whether the index consists of raw signals rather than nucleotides +#define RI_I_REV_QUERY 0x40 // whether the index should NOT store the reverse strand, this assumes that we query the index with the revcomp of the read as well +#define RI_I_OUT_QUANTIZE 0x80 // quantize raw signals (index not needed) +#define RI_I_NO_EVENT_DETECTION 0x100 // whether the signal are raw signals, or events (which will be normalized only, removing outliers) #define RI_M_SEQUENCEUNTIL 0x1 #define RI_M_RMQ 0x2 @@ -31,6 +31,7 @@ #define RI_M_LOG_NUM_ANCHORS 0x1000 //Overlapping related #define RI_M_ALL_CHAINS 0x2000 +#define RI_M_DONT_QUERY_REVCOMP 0x4000 // whether to transform the read to its revcomp and query index (defaults to RI_I_REV_QUERY), //Characterization related #define RI_M_OUT_ALL_CHAINS 0x4000 diff --git a/src/rseed.c b/src/rseed.c index 1fea056..024f6a1 100644 --- a/src/rseed.c +++ b/src/rseed.c @@ -7,8 +7,6 @@ void ri_seed_select(int32_t n, ri_seed_t *a, int len, int max_occ, int max_max_occ, int dist) { // for high-occ minimizers, choose up to max_high_occ in each high-occ streak - extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*); - extern void ks_heapmake_uint64_t(size_t n, uint64_t*); int32_t i, last0, m; uint64_t b[MAX_MAX_HIGH_OCC]; // this is to avoid a heap allocation @@ -135,7 +133,7 @@ ri_seed_t *ri_collect_matches(void *km, } // } - fprintf(stderr, "n_seed_hits: %u n_flt: %u ratio: %f\n", n_seed_hits, n_flt, (float)n_flt/n_seed_hits); + // fprintf(stderr, "n_seed_hits: %u n_flt: %u ratio: %f\n", n_seed_hits, n_flt, (float)n_flt/n_seed_hits); for (i = 0, n_seed_m = 0, *rep_len = 0, *n_seed_pos = 0; i < n_seed_m0; ++i) { // for (i = 0, n_seed_m = 0, *n_seed_pos = 0; i < n_seed_m0; ++i) { ri_seed_t *q = &seed_hits[i]; diff --git a/src/rsig.c b/src/rsig.c index df626b6..f0cbe33 100644 --- a/src/rsig.c +++ b/src/rsig.c @@ -207,22 +207,31 @@ static inline ri_sig_file_t *ri_sig_open_slow5(const char *fn){ } #endif +bool str_ends_with(const char *str, const char *suffix) { + if( !str || !suffix ) return 0; + size_t str_len = strlen(str); + size_t suffix_len = strlen(suffix); + if(suffix_len > str_len) return 0; + return 0 == strncmp(str + str_len - suffix_len, suffix, suffix_len); +} + ri_sig_file_t *ri_sig_open(const char *fn){ - if (strstr(fn, ".fast5")) { + if (str_ends_with(fn, ".fast5")) { #ifndef NHDF5RH return ri_sig_open_fast5(fn); #endif - } else if (strstr(fn, ".pod5") || strstr(fn, ".pod")) { + } else if (str_ends_with(fn, ".pod5") || str_ends_with(fn, ".pod")) { #ifndef NPOD5RH return ri_sig_open_pod5(fn); #endif - } else if (strstr(fn, ".slow5") || strstr(fn, ".blow5")) { + } else if (str_ends_with(fn, ".slow5") || str_ends_with(fn, ".blow5")) { #ifndef NSLOW5RH return ri_sig_open_slow5(fn); #endif + } else { + fprintf(stderr, "ERROR: Unknown file format: %s\n", fn); + return 0; } - - return 0; } void ri_sig_close(ri_sig_file_t *fp) @@ -299,8 +308,8 @@ int is_dir(const char *A) void find_sfiles(const char *A, ri_char_v *fnames) { if (!is_dir(A)) { - // todo4: adapt this since slow5 creates .slow5.idx in the same directory, so should be "endswith" - if (strstr(A, ".fast5") || strstr(A, ".pod5") || strstr(A, ".pod") || strstr(A, ".slow5") || strstr(A, ".blow5")) { + if (str_ends_with(A, ".fast5") || str_ends_with(A, ".pod5") || str_ends_with(A, ".pod") + || str_ends_with(A, ".slow5") || str_ends_with(A, ".blow5")) { char** cur_fname; rh_kv_pushp(char*, 0, *fnames, &cur_fname); (*cur_fname) = strdup(A); @@ -318,7 +327,9 @@ void find_sfiles(const char *A, ri_char_v *fnames) if (strcmp(ent->d_name, ".") && strcmp(ent->d_name, "..")) find_sfiles(tmp, fnames); } else { - if (strstr(ent->d_name, ".fast5") || strstr(ent->d_name, ".pod5") || strstr(ent->d_name, ".pod") || strstr(ent->d_name, ".slow5") || strstr(ent->d_name, ".blow5")) { + if (str_ends_with(ent->d_name, ".fast5") || str_ends_with(ent->d_name, ".pod5") + || str_ends_with(ent->d_name, ".pod") || str_ends_with(ent->d_name, ".slow5") + || str_ends_with(ent->d_name, ".blow5")) { char** cur_fname; rh_kv_pushp(char*, 0, *fnames, &cur_fname); (*cur_fname) = strdup(tmp); diff --git a/src/rutils.h b/src/rutils.h index 6c0fbf9..d9e8e42 100644 --- a/src/rutils.h +++ b/src/rutils.h @@ -38,6 +38,10 @@ void liftrlimit(void); void load_pore(const char* fpore, const short k, const short lev_col, ri_pore_t* pore); +// Explicit definition for rseed.c +extern void ks_heapdown_uint64_t(size_t i, size_t n, uint64_t*); +extern void ks_heapmake_uint64_t(size_t n, uint64_t*); + #ifdef __cplusplus } #endif diff --git a/src/wrapper_example.cpp b/src/wrapper_example.cpp new file mode 100644 index 0000000..bcac9b0 --- /dev/null +++ b/src/wrapper_example.cpp @@ -0,0 +1,7 @@ +#include "rawhash_wrapper.hpp" + +int main(int argc, char *argv[]) { + RawHashMapper mapper(argc, argv); + mapper.print_idx_info(); + return 0; +} \ No newline at end of file