Skip to content

Commit

Permalink
Benchmark results for dataset 1
Browse files Browse the repository at this point in the history
  • Loading branch information
GZHoffie committed Jan 26, 2023
1 parent 180dc79 commit 49527ed
Show file tree
Hide file tree
Showing 17 changed files with 179 additions and 82 deletions.
17 changes: 17 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,8 +46,25 @@ Or we may build the full version of BucketMap, which further uses Smith-Waterman
cmake --build . --target bucketmap_align
```

and the binary files named `bucketmap` and `bucketmap_align` will be built under the `./build` directory.

## Usage

Using BucketMap is simple. To build the index files, run:

```bash
<path_to_bucketmap_binary>/bucketmap -x -i <index_name>
```

which will output `<index_name>.qgram` and `<index_name>.bucket_id` under the current directory.

To map the reads in a FASTQ file, run the following command (from the directory containing the index files, if you built them using the above command):

```bash
<path_to_bucketmap_binary>/bucketmap -i <index_name> -q <fastq_file_path> -o <output_sam_file_path>
```

which will output the `.sam` file in the desired location. You may also run the second command directly, without building the index beforehand; it will then build the index and do the mapping in one go. To perform pairwise alignment and output the CIGAR strings in the SAM file, simply replace `bucketmap` with `bucketmap_align`.



Expand Down
2 changes: 1 addition & 1 deletion bucket_map/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ target_compile_definitions(bucketmap PRIVATE ${BM_DEFINITIONS})
target_link_libraries(bucketmap PUBLIC seqan3::seqan3 sharg::sharg)

add_executable(bucketmap_align main.cpp ${SHARED_FILES})
target_compile_definitions(bucketmap_align PRIVATE ${BM_DEFINITIONS})
target_compile_definitions(bucketmap_align PRIVATE BM_ALIGN ${BM_DEFINITIONS})
target_link_libraries(bucketmap_align PUBLIC seqan3::seqan3 sharg::sharg)

#get_property(defs TARGET bucketmap_align PROPERTY COMPILE_DEFINITIONS)
Expand Down
2 changes: 1 addition & 1 deletion bucket_map/benchmark/benchmark.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ mkdir -p "${BENCHMARK_PATH}/log"
mkdir -p "${BENCHMARK_PATH}/output"

# Run index benchmarking
./benchmark_index.sh ${GENOME_FILE} ${BENCHMARK_PATH} ${INDICATOR}
#./benchmark_index.sh ${GENOME_FILE} ${BENCHMARK_PATH} ${INDICATOR}

# Run map benchmarking
./benchmark_map.sh ${QUERY_FILE} ${BENCHMARK_PATH} ${INDICATOR}
17 changes: 10 additions & 7 deletions bucket_map/benchmark/benchmark_index.sh
Original file line number Diff line number Diff line change
Expand Up @@ -8,18 +8,21 @@ INDEX_INDICATOR=$3
cd "${BENCHMARK_PATH}/index"

# run the indexing method of bowtie2
#echo "Indexing using bowtie2"
#/usr/bin/time -o "${BENCHMARK_PATH}/log/bowtie2_index.time" -v bowtie2-build ${FASTA_PATH} "${INDEX_INDICATOR}_bowtie" &> "${BENCHMARK_PATH}/log/bowtie2_index.log"
echo "Indexing using bowtie2"
/usr/bin/time -o "${BENCHMARK_PATH}/log/bowtie2_index.time" -v bowtie2-build ${FASTA_PATH} "${INDEX_INDICATOR}_bowtie" &> "${BENCHMARK_PATH}/log/bowtie2_index.log"

# run indexing for bwa
#echo "Indexing using bwa"
#/usr/bin/time -o "${BENCHMARK_PATH}/log/bwa_index.time" -v bwa index -p "${INDEX_INDICATOR}_bwa" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/bwa_index.log"
echo "Indexing using bwa"
/usr/bin/time -o "${BENCHMARK_PATH}/log/bwa_index.time" -v bwa index -p "${INDEX_INDICATOR}_bwa" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/bwa_index.log"

# run indexing for subread
#echo "Indexing using subread"
#/usr/bin/time -o "${BENCHMARK_PATH}/log/subread_index.time" -v subread-buildindex -o "${INDEX_INDICATOR}_subread" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/subread_index.log"
echo "Indexing using subread"
/usr/bin/time -o "${BENCHMARK_PATH}/log/subread_index.time" -v subread-buildindex -o "${INDEX_INDICATOR}_subread" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/subread_index.log"

# run indexing for minimap2
echo "Indexing using minimap2"
/usr/bin/time -o "${BENCHMARK_PATH}/log/minimap2_index.time" -v minimap2 -d "${INDEX_INDICATOR}_minimap.mmi" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/minimap2_index.log"
/usr/bin/time -o "${BENCHMARK_PATH}/log/minimap2_index.time" -v minimap2 -t 1 -d "${INDEX_INDICATOR}_minimap.mmi" ${FASTA_PATH} &> "${BENCHMARK_PATH}/log/minimap2_index.log"

# run indexing for BucketMap
echo "Indexing using BucketMap"
/usr/bin/time -o "${BENCHMARK_PATH}/log/bucketmap_index.time" -v bucketmap -x -i "${INDEX_INDICATOR}_bucketmap" &> "${BENCHMARK_PATH}/log/bucketmap_index.log"
11 changes: 9 additions & 2 deletions bucket_map/benchmark/benchmark_map.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,12 @@ cd "${BENCHMARK_PATH}/index"
#/usr/bin/time -o "${BENCHMARK_PATH}/log/subread_map.time" -v subread-align --SAMoutput -i "${INDEX_INDICATOR}_subread" -r ${FASTQ_PATH} -t 1 -o "${BENCHMARK_PATH}/output/subread_map.sam" &> "${BENCHMARK_PATH}/log/subread_map.log"

# run minimap2
echo "Mapping using minimap2"
/usr/bin/time -o "${BENCHMARK_PATH}/log/minimap2_map.time" -v minimap2 -t 1 -a "${INDEX_INDICATOR}_minimap.mmi" ${FASTQ_PATH} > "${BENCHMARK_PATH}/output/minimap2_map.sam" 2> "${BENCHMARK_PATH}/log/minimap2_map.log"
#echo "Mapping using minimap2"
#/usr/bin/time -o "${BENCHMARK_PATH}/log/minimap2_map.time" -v minimap2 -a "${INDEX_INDICATOR}_minimap.mmi" ${FASTQ_PATH} > "${BENCHMARK_PATH}/output/minimap2_map.sam" 2> "${BENCHMARK_PATH}/log/minimap2_map.log"

# run bucketmap
echo "Mapping using BucketMap"
/usr/bin/time -o "${BENCHMARK_PATH}/log/bucketmap_map.time" -v bucketmap --version-check 0 -i "${INDEX_INDICATOR}_bucketmap" -q ${FASTQ_PATH} -o "${BENCHMARK_PATH}/output/bucketmap_map.sam" &> "${BENCHMARK_PATH}/log/bucketmap_map.log"

echo "Mapping using BucketMap_align"
/usr/bin/time -o "${BENCHMARK_PATH}/log/bucketmap_align_map.time" -v bucketmap_align --version-check 0 -i "${INDEX_INDICATOR}_bucketmap" -q ${FASTQ_PATH} -o "${BENCHMARK_PATH}/output/bucketmap_align_map.sam" &> "${BENCHMARK_PATH}/log/bucketmap_align_map.log"
12 changes: 12 additions & 0 deletions bucket_map/benchmark/log/bucketmap_align_map.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[INFO] Allowing Smith-Waterman for alignment verifications.
[INFO] Initializing indexer and mapper with bucket length: 65536, and number of buckets: 26507.
[INFO] Set q-gram shape to be: [1,1,1,0,1,1,1,0,1,0,1,1] with number of effective characters: 9
[ERROR] The specified file already exists in directory: "/home/zhenhao/bucket-map/bucket_map/benchmark/index/egu_bucketmap.qgram".
[BENCHMARK] Number of Q-grams with distinguishability >= 0.499981: 252427 (96.2933%).
[BENCHMARK] Elapsed time for loading index files: 15.571 s.
[INFO] Successfully loaded "/home/zhenhao/bucket-map/bucket_map/benchmark/index/egu_bucketmap.qgram".
[BENCHMARK] Elapsed time for bucket mapping: 62.939 s (62.939 μs/seq).
[BENCHMARK] Total time used for building k-mer index for each bucket: 202.868 s.
[BENCHMARK] Total time used for finding exact location of the sequences: 6.124 s (6.124 μs/seq).
[BENCHMARK] Total mapped locations: 1141838 (1.14184 per sequence).
[BENCHMARK] Total time used for alignment verification and output: 69.636 s (60.9859 μs per pairwise alignment).
23 changes: 23 additions & 0 deletions bucket_map/benchmark/log/bucketmap_align_map.time
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
Command being timed: "bucketmap_align --version-check 0 -i egu_bucketmap -q /mnt/d/genome/test/sim_illumina_1M.fastq -o /home/zhenhao/bucket-map/bucket_map/benchmark/output/bucketmap_align_map.sam"
User time (seconds): 385.71
System time (seconds): 2.07
Percent of CPU this job got: 99%
Elapsed (wall clock) time (h:mm:ss or m:ss): 6:30.88
Average shared text size (kbytes): 0
Average unshared data size (kbytes): 0
Average stack size (kbytes): 0
Average total size (kbytes): 0
Maximum resident set size (kbytes): 865640
Average resident set size (kbytes): 0
Major (requiring I/O) page faults: 0
Minor (reclaiming a frame) page faults: 137679
Voluntary context switches: 14549
Involuntary context switches: 297
Swaps: 0
File system inputs: 0
File system outputs: 1429848
Socket messages sent: 0
Socket messages received: 0
Signals delivered: 0
Page size (bytes): 4096
Exit status: 0
7 changes: 7 additions & 0 deletions bucket_map/benchmark/log/bucketmap_index.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[INFO] Not using Smith-Waterman for alignment verifications.
[INFO] Initializing indexer and mapper with bucket length: 65536, and number of buckets: 26507.
[INFO] Set q-gram shape to be: [1,1,1,0,1,1,1,0,1,0,1,1] with number of effective characters: 9
[INFO] The bucket q-gram index is stored in: "/home/zhenhao/bucket-map/bucket_map/benchmark/index/egu_bucketmap.qgram".
[INFO] The number of buckets: 26507.
[INFO] The bucket ids are stored in: "/home/zhenhao/bucket-map/bucket_map/benchmark/index/egu_bucketmap.bucket_id".
[BENCHMARK] Elapsed time for creating and storing index files: 163.497 s.
23 changes: 23 additions & 0 deletions bucket_map/benchmark/log/bucketmap_index.time
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
Command being timed: "bucketmap -x -i egu_bucketmap"
User time (seconds): 154.56
System time (seconds): 5.16
Percent of CPU this job got: 97%
Elapsed (wall clock) time (h:mm:ss or m:ss): 2:43.55
Average shared text size (kbytes): 0
Average unshared data size (kbytes): 0
Average stack size (kbytes): 0
Average total size (kbytes): 0
Maximum resident set size (kbytes): 1151240
Average resident set size (kbytes): 0
Major (requiring I/O) page faults: 41
Minor (reclaiming a frame) page faults: 10903
Voluntary context switches: 7192
Involuntary context switches: 50
Swaps: 0
File system inputs: 9384
File system outputs: 1697112
Socket messages sent: 0
Socket messages received: 0
Signals delivered: 0
Page size (bytes): 4096
Exit status: 0
22 changes: 12 additions & 10 deletions bucket_map/benchmark/log/bucketmap_map.log
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
[INFO] Set q-gram shape to be: [1,1,1,0,1,1,1,0,1,0,1,1] with number of effective characters: 9
[ERROR] The specified file already exists in directory: "/mnt/d/genome/index/index.qgram".
[BENCHMARK] Number of Q-grams with distinguishability >= 0.499981: 252427 (96.2933%).
[BENCHMARK] Elapsed time for loading index files: 25.364 s.
[INFO] Successfully loaded "/mnt/d/genome/index/index.qgram".
[BENCHMARK] Elapsed time for bucket mapping: 71.395 s (71.395 μs/seq).
[BENCHMARK] Total time used for building k-mer index for each bucket: 229.2 s.
[BENCHMARK] Total time used for finding exact location of the sequences: 6.315 s (6.315 μs/seq).
[BENCHMARK] Total mapped locations: 1145380 (1.14538 per sequence).
[BENCHMARK] Total time used for pairwise alignment and output: 72.53 s (63.324 μs per pairwise alignment).
[INFO] Not using Smith-Waterman for alignment verifications.
[INFO] Initializing indexer and mapper with bucket length: 65536, and number of buckets: 26507.
[INFO] Set q-gram shape to be: [1,1,1,0,1,1,1,0,1,0,1,1] with number of effective characters: 9
[ERROR] The specified file already exists in directory: "/home/zhenhao/bucket-map/bucket_map/benchmark/index/egu_bucketmap.qgram".
[BENCHMARK] Number of Q-grams with distinguishability >= 0.499981: 252427 (96.2933%).
[BENCHMARK] Elapsed time for loading index files: 15.735 s.
[INFO] Successfully loaded "/home/zhenhao/bucket-map/bucket_map/benchmark/index/egu_bucketmap.qgram".
[BENCHMARK] Elapsed time for bucket mapping: 61.281 s (61.281 μs/seq).
[BENCHMARK] Total time used for building k-mer index for each bucket: 190.608 s.
[BENCHMARK] Total time used for finding exact location of the sequences: 5.96 s (5.96 μs/seq).
[BENCHMARK] Total mapped locations: 1145380 (1.14538 per sequence).
[BENCHMARK] Total time used for alignment verification and output: 3.187 s (2.78248 μs per pairwise alignment).
46 changes: 23 additions & 23 deletions bucket_map/benchmark/log/bucketmap_map.time
Original file line number Diff line number Diff line change
@@ -1,23 +1,23 @@
Command being timed: "/home/zhenhao/bucket-map/build/bucket_map_final"
User time (seconds): 426.78
System time (seconds): 2.90
Percent of CPU this job got: 97%
Elapsed (wall clock) time (h:mm:ss or m:ss): 7:20.25
Average shared text size (kbytes): 0
Average unshared data size (kbytes): 0
Average stack size (kbytes): 0
Average total size (kbytes): 0
Maximum resident set size (kbytes): 865568
Average resident set size (kbytes): 0
Major (requiring I/O) page faults: 0
Minor (reclaiming a frame) page faults: 138474
Voluntary context switches: 120462
Involuntary context switches: 447
Swaps: 0
File system inputs: 0
File system outputs: 1434408
Socket messages sent: 0
Socket messages received: 0
Signals delivered: 0
Page size (bytes): 4096
Exit status: 0
Command being timed: "bucketmap --version-check 0 -i egu_bucketmap -q /mnt/d/genome/test/sim_illumina_1M.fastq -o /home/zhenhao/bucket-map/bucket_map/benchmark/output/bucketmap_map.sam"
User time (seconds): 305.03
System time (seconds): 1.19
Percent of CPU this job got: 97%
Elapsed (wall clock) time (h:mm:ss or m:ss): 5:12.65
Average shared text size (kbytes): 0
Average unshared data size (kbytes): 0
Average stack size (kbytes): 0
Average total size (kbytes): 0
Maximum resident set size (kbytes): 865604
Average resident set size (kbytes): 0
Major (requiring I/O) page faults: 0
Minor (reclaiming a frame) page faults: 137770
Voluntary context switches: 15141
Involuntary context switches: 110
Swaps: 0
File system inputs: 1219184
File system outputs: 1425232
Socket messages sent: 0
Socket messages received: 0
Signals delivered: 0
Page size (bytes): 4096
Exit status: 0
12 changes: 6 additions & 6 deletions bucket_map/benchmark/log/minimap2_index.log
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
[M::mm_idx_gen::23.637*1.74] collected minimizers
[M::mm_idx_gen::30.015*2.01] sorted minimizers
[M::main::35.365*1.83] loaded/built the index for 932 target sequence(s)
[M::mm_idx_gen::54.069*0.80] collected minimizers
[M::mm_idx_gen::73.719*0.85] sorted minimizers
[M::main::79.385*0.85] loaded/built the index for 932 target sequence(s)
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 932
[M::mm_idx_stat::36.031*1.82] distinct minimizers: 61720831 (52.90% are singletons); average occurrences: 5.186; average spacing: 5.315; total length: 1701312507
[M::mm_idx_stat::80.097*0.85] distinct minimizers: 61720831 (52.90% are singletons); average occurrences: 5.186; average spacing: 5.315; total length: 1701312507
[M::main] Version: 2.24-r1150-dirty
[M::main] CMD: minimap2 -d egu_minimap.mmi /mnt/d/genome/Egu.v3.genome_f.fasta
[M::main] Real time: 36.089 sec; CPU: 65.473 sec; Peak RSS: 6.657 GB
[M::main] CMD: minimap2 -t 1 -d egu_minimap.mmi /mnt/d/genome/Egu.v3.genome_f.fasta
[M::main] Real time: 80.327 sec; CPU: 68.192 sec; Peak RSS: 6.550 GB
24 changes: 12 additions & 12 deletions bucket_map/benchmark/log/minimap2_index.time
Original file line number Diff line number Diff line change
@@ -1,21 +1,21 @@
Command being timed: "minimap2 -d egu_minimap.mmi /mnt/d/genome/Egu.v3.genome_f.fasta"
User time (seconds): 55.55
System time (seconds): 10.08
Percent of CPU this job got: 180%
Elapsed (wall clock) time (h:mm:ss or m:ss): 0:36.30
Command being timed: "minimap2 -t 1 -d egu_minimap.mmi /mnt/d/genome/Egu.v3.genome_f.fasta"
User time (seconds): 54.45
System time (seconds): 13.74
Percent of CPU this job got: 84%
Elapsed (wall clock) time (h:mm:ss or m:ss): 1:20.36
Average shared text size (kbytes): 0
Average unshared data size (kbytes): 0
Average stack size (kbytes): 0
Average total size (kbytes): 0
Maximum resident set size (kbytes): 6980160
Maximum resident set size (kbytes): 6868580
Average resident set size (kbytes): 0
Major (requiring I/O) page faults: 9
Minor (reclaiming a frame) page faults: 2268315
Voluntary context switches: 108702
Involuntary context switches: 152
Major (requiring I/O) page faults: 1121
Minor (reclaiming a frame) page faults: 3067356
Voluntary context switches: 107057
Involuntary context switches: 175
Swaps: 0
File system inputs: 1568
File system outputs: 8081824
File system inputs: 22416
File system outputs: 8081832
Socket messages sent: 0
Socket messages received: 0
Signals delivered: 0
Expand Down
10 changes: 7 additions & 3 deletions bucket_map/benchmark/log/minimap2_map.log
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
[M::main::10.965*0.56] loaded/built the index for 932 target sequence(s)
[M::mm_mapopt_update::12.059*0.60] mid_occ = 1428
[M::main::8.939*0.77] loaded/built the index for 932 target sequence(s)
[M::mm_mapopt_update::11.852*0.71] mid_occ = 1428
[M::mm_idx_stat] kmer size: 15; skip: 10; is_hpc: 0; #seq: 932
[M::mm_idx_stat::12.750*0.62] distinct minimizers: 61720831 (52.90% are singletons); average occurrences: 5.186; average spacing: 5.315; total length: 1701312507
[M::mm_idx_stat::12.569*0.73] distinct minimizers: 61720831 (52.90% are singletons); average occurrences: 5.186; average spacing: 5.315; total length: 1701312507
[M::worker_pipeline::526.566*2.80] mapped 1000000 sequences
[M::main] Version: 2.24-r1150-dirty
[M::main] CMD: minimap2 -a egu_minimap.mmi /mnt/d/genome/test/sim_illumina_1M.fastq
[M::main] Real time: 526.711 sec; CPU: 1475.605 sec; Peak RSS: 5.867 GB
25 changes: 12 additions & 13 deletions bucket_map/benchmark/log/minimap2_map.time
Original file line number Diff line number Diff line change
@@ -1,22 +1,21 @@
Command terminated by signal 2
Command being timed: "minimap2 -t 1 -a egu_minimap.mmi /mnt/d/genome/test/sim_illumina_1M.fastq"
User time (seconds): 1297.26
System time (seconds): 6.20
Percent of CPU this job got: 99%
Elapsed (wall clock) time (h:mm:ss or m:ss): 21:54.75
Command being timed: "minimap2 -a egu_minimap.mmi /mnt/d/genome/test/sim_illumina_1M.fastq"
User time (seconds): 1461.92
System time (seconds): 13.80
Percent of CPU this job got: 280%
Elapsed (wall clock) time (h:mm:ss or m:ss): 8:46.86
Average shared text size (kbytes): 0
Average unshared data size (kbytes): 0
Average stack size (kbytes): 0
Average total size (kbytes): 0
Maximum resident set size (kbytes): 6294596
Maximum resident set size (kbytes): 6151476
Average resident set size (kbytes): 0
Major (requiring I/O) page faults: 13
Minor (reclaiming a frame) page faults: 2974017
Voluntary context switches: 52485
Involuntary context switches: 1207
Major (requiring I/O) page faults: 225912
Minor (reclaiming a frame) page faults: 3258999
Voluntary context switches: 279159
Involuntary context switches: 1892
Swaps: 0
File system inputs: 8085024
File system outputs: 64
File system inputs: 11306232
File system outputs: 1504968
Socket messages sent: 0
Socket messages received: 0
Signals delivered: 0
Expand Down
4 changes: 2 additions & 2 deletions bucket_map/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ struct cmd_arguments
float mapper_distinguishability_threshold = 0.5;

// locator related arguments
float locator_allowed_seed_miss_rate = 0.1;
float locator_allowed_seed_miss_rate = 0.2;
float locator_allowed_indel_rate = 0.015;
float locator_sample_size = 10;
unsigned int locator_quality_threshold = 40;
Expand All @@ -48,7 +48,7 @@ void initialise_parser(sharg::parser & parser, cmd_arguments & args)
sharg::config{.short_id = 'q',
.long_id = "query-file",
.description = "The path to the FASTQ query file.",
.validator = sharg::output_file_validator{"fq", "fastq"}});
.validator = sharg::input_file_validator{{"fq", "fastq"}}});

parser.add_option(args.index_indicator,
sharg::config{.short_id = 'i',
Expand Down
4 changes: 2 additions & 2 deletions bucket_map/mapper_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ int main() {
bucket_hash_indexer<26507> ind(bucket_length, read_length, bucket_shape, locate_shape);
q_gram_mapper<26507> map(bucket_length, read_length, bucket_shape, 30, 6, 0.5);

ind.index(genome_file, data_path / "index");
map.load(data_path / "index");
ind.index(genome_file, data_path, "index");
map.load(data_path, "index");

//short_read_simulator sim(bucket_length, read_length, 0.002, 0.00025, 0.00025);
//sim.read(genome_file);
Expand Down

0 comments on commit 49527ed

Please sign in to comment.