Skip to content

Commit

Permalink
Adding the SLOW5 support
Browse files Browse the repository at this point in the history
  • Loading branch information
canfirtina committed Jun 22, 2023
1 parent c5484bd commit 7ebc604
Show file tree
Hide file tree
Showing 3 changed files with 103 additions and 16 deletions.
15 changes: 8 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ Below figure shows the overview of the steps that RawHash takes to find matching

To efficiently identify similarities between a reference genome and reads, RawHash has two steps, similar to regular read mapping tools, 1) indexing and 2) mapping. The indexing step generates hash values from the expected signal representation of a reference genome and stores them in a hash table. In the mapping step, RawHash generates the hash values from raw signals and queries the hash table generated in the indexing step to find seed matches. To map the raw signal to a reference genome, RawHash performs chaining over the seed matches.

RawHash can be used to map reads from **FAST5 or POD5** files to a reference genome in sequence format. We will provide the support for using SLOW5 files.
RawHash can be used to map reads from **FAST5, POD5, SLOW5, or BLOW5** files to a reference genome in sequence format.

RawHash performs real-time mapping of nanopore raw signals. When the prefix of reads in FAST5 or POD5 file can be mapped to a reference genome, RawHash will stop mapping and provide the mapping information in PAF format. We follow the similar PAF template used in [UNCALLED](https://github.com/skovaka/UNCALLED) and [Sigmap](https://github.com/haowenz/sigmap) to report the mapping information.

Expand Down Expand Up @@ -151,8 +151,7 @@ Please follow the instructions in the [README](test/README.md) file in [test](./

# Upcoming Features

* Support for reading SLOW5 Files.
* Direct integration with the Read Until API.
* Direct integration with MinKNOW.
* Ability to specify even/odd channels to eject the reads only from these specified channels.
* Please create issues if you want to see more features that can make RawHash easily integratable with nanopore sequencers for any use case.

Expand All @@ -163,11 +162,13 @@ To cite RawHash, you can use the following BibTeX:
```bibtex
@article{firtina_rawhash_2023,
title = {{RawHash}: {Enabling} {Fast} and {Accurate} {Real}-{Time} {Analysis} of {Raw} {Nanopore} {Signals} for {Large} {Genomes}},
doi = {10.1101/2023.01.22.525080},
journal = {bioRxiv},
doi = {10.1093/bioinformatics/btad272},
journal = {Bioinformatics},
author = {Firtina, Can and Ghiasi, Nika Mansouri and Lindegger, Joel and Singh, Gagandeep and Cavlak, Meryem Banu and Mao, Haiyu and Mutlu, Onur},
month = jan,
year = {2023},
pages = {2023.01.22.525080},
}
```

# Acknowledgement

RawHash uses [klib](https://github.com/attractivechaos/klib), some code snippets from [Minimap2](https://github.com/lh3/minimap2) (e.g., pipelining, hash table usage) and [Sigmap](https://github.com/haowenz/sigmap) (e.g., chaining).
100 changes: 91 additions & 9 deletions src/rsig.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,11 @@ static inline ri_sig_file_t *ri_sig_open_fast5(const char *fn)
#ifndef NPOD5RH
fp->pp = NULL;
#endif

#ifndef NSLOW5RH
fp->sp = NULL;
#endif

fp->fp = fast5_file;

bool is_single = false;
Expand Down Expand Up @@ -125,13 +130,16 @@ static inline ri_sig_file_t *ri_sig_open_pod5(const char *fn){
}

fp = (ri_sig_file_t*)calloc(1, sizeof(ri_sig_file_t));
#ifndef NPOD5RH
fp->pp = pod5_file;
#endif

#ifndef NHDF5RH
fp->fp = NULL;
#endif

#ifndef NSLOW5RH
fp->sp = NULL;
#endif

fp->num_read = batch_count; //num_read is the number of batches for pod5
fp->cur_read = 0; //cur_read is cur_batch for pod5

Expand Down Expand Up @@ -164,8 +172,31 @@ static inline ri_sig_file_t *ri_sig_open_pod5(const char *fn){

#ifndef NSLOW5RH
static inline ri_sig_file_t *ri_sig_open_slow5(const char *fn){
//TODO: complete here
return 0;

ri_sig_file_t *fp;

//open the SLOW5 file
slow5_file_t *sp = slow5_open(fn, "r");
if(sp==NULL){
fprintf(stderr, "ERROR: Failed to open file %s\n", fn);
return 0;
}

fp = (ri_sig_file_t*)calloc(1, sizeof(ri_sig_file_t));

#ifndef NHDF5RH
fp->fp = NULL;
#endif

#ifndef NPOD5RH
fp->pp = NULL;
#endif

fp->sp = sp;
fp->num_read = 1;
fp->cur_read = 0;

return fp;
}
#endif

Expand All @@ -180,7 +211,7 @@ ri_sig_file_t *ri_sig_open(const char *fn){
#endif
} else if (strstr(fn, ".slow5") || strstr(fn, ".blow5")) {
#ifndef NSLOW5RH
// TODO: complete here
return ri_sig_open_slow5(fn);
#endif
}

Expand All @@ -206,6 +237,13 @@ void ri_sig_close(ri_sig_file_t *fp)
pod5_close_and_free_reader(fp->pp);
}
#endif

#ifndef NSLOW5RH
if(fp->sp){
slow5_close(fp->sp);
}
#endif

free(fp);
}

Expand Down Expand Up @@ -246,13 +284,12 @@ int is_dir(const char *A)
return S_ISDIR(st.st_mode);
}

//Recursively find all files that ends with "fast5" or "pod5" under input directory const char *A
//Recursively find all files that ends with "fast5", "pod5", or "s/blow5" under input directory const char *A
//Generated by GitHub Copilot
void find_sfiles(const char *A, ri_char_v *fnames)
{
if (!is_dir(A)) {
//TODO: Add slow5 here later
if (strstr(A, ".fast5") || strstr(A, ".pod5")) {
if (strstr(A, ".fast5") || strstr(A, ".pod5") || strstr(A, ".slow5") || strstr(A, ".blow5")) {
char** cur_fname;
rh_kv_pushp(char*, 0, *fnames, &cur_fname);
(*cur_fname) = strdup(A);
Expand All @@ -270,7 +307,7 @@ void find_sfiles(const char *A, ri_char_v *fnames)
if (strcmp(ent->d_name, ".") && strcmp(ent->d_name, ".."))
find_sfiles(tmp, fnames);
} else {
if (strstr(ent->d_name, ".fast5") || strstr(ent->d_name, ".pod5")) {
if (strstr(ent->d_name, ".fast5") || strstr(ent->d_name, ".pod5") || strstr(ent->d_name, ".slow5") || strstr(ent->d_name, ".blow5")) {
char** cur_fname;
rh_kv_pushp(char*, 0, *fnames, &cur_fname);
(*cur_fname) = strdup(tmp);
Expand Down Expand Up @@ -441,6 +478,48 @@ static inline void ri_read_sig_pod5(ri_sig_file_t* fp, ri_sig_t* s){
}
#endif

#ifndef NSLOW5RH
static inline void ri_read_sig_slow5(ri_sig_file_t* fp, ri_sig_t* s){

if(fp->cur_read >= fp->num_read) return;

slow5_file_t *sp = fp->sp;
slow5_rec_t *rec = NULL;
int ret = 0;
if ((ret = slow5_get_next(&rec, sp)) < 0) {
fp->cur_read = fp->num_read;
return;
}

s->name = strdup(rec->read_id);
float *sigF = (float*)malloc(rec->len_raw_signal * sizeof(float));
uint32_t l_sig = 0;
float pa = 0.0f;
float scale = rec->range/rec->digitisation;

for(int i = 0; i < rec->len_raw_signal; ++i){
pa = (rec->raw_signal[i]+rec->offset)*scale;
if (pa > 30.0f && pa < 200.0f) {
sigF[l_sig++] = pa;
}
}

fp->cur_read++;

if(ret != SLOW5_ERR_EOF){ //check if proper end of file has been reached
fp->num_read++;
}else{
fp->cur_read = fp->num_read;
}

s->sig = (float*)calloc(l_sig, sizeof(float));
s->l_sig = l_sig;
memcpy(s->sig, sigF, l_sig*sizeof(float));
free(sigF);
slow5_rec_free(rec);
}
#endif

void ri_read_sig(ri_sig_file_t* fp, ri_sig_t* s){

assert(fp->cur_read < fp->num_read);
Expand All @@ -451,4 +530,7 @@ void ri_read_sig(ri_sig_file_t* fp, ri_sig_t* s){
#ifndef NPOD5RH
if(fp->pp) ri_read_sig_pod5(fp, s);
#endif
#ifndef NSLOW5RH
if(fp->sp) ri_read_sig_slow5(fp, s);
#endif
}
4 changes: 4 additions & 0 deletions src/rsig.h
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@ typedef struct ri_sig_file_s {

Pod5FileReader_t* pp; //POD5 file pointer
#endif

#ifndef NSLOW5RH
slow5_file_t* sp; //SLOW5 file pointer
#endif
} ri_sig_file_t;

/**
Expand Down

0 comments on commit 7ebc604

Please sign in to comment.