In [None]:
import pyfastx
from fqfa.fastq.fastq import parse_fastq_reads

# Benchmark 1: list of reads

This code creates a list containing all the reads in the file.
Note that the data structures for the reads are quite different, with two being package-specific objects and one being a tuple.

## pyfastx with index

Much of the time in the first example is likely spent building the ``.fxi`` index file.
This file enables direct access into the FASTQ file, which we are not using here.
The index is quite large, much larger than the reads in this case:

```
334M    BRCA1_input_sample.fq
 48M    BRCA1_input_sample.fq.bz2
511M    BRCA1_input_sample.fq.fxi
 68M    BRCA1_input_sample.fq.gz
513M    BRCA1_input_sample.fq.gz.fxi
```

In [None]:
%time reads = [x for x in pyfastx.Fastq("BRCA1_input_sample.fq")]
for x in reads[:5]:
    print(repr(x))
del reads

## pyfastx without index

This is by far the fastest for just reading data from the file, but it doesn't perform any extra computation or quality value conversion.

In [None]:
%time reads = [x for x in pyfastx.Fastq("BRCA1_input_sample.fq", build_index=False)]
for x in reads[:5]:
    print(x)
del reads

## fqfa

Unlike pyfastx, fqfa takes an open file handle rather than a file name.
In these examples, this is addressed using a context created by a ``with`` statement.

In [None]:
with open("BRCA1_input_sample.fq") as handle:
    %time reads = [x for x in parse_fastq_reads(handle)]
for x in reads[:5]:
    print(x)
del reads

# Benchmark 2: summarized quality statistics

This code calculates the average (mean) quality of each read in the file, then reports the median of those per-read averages.

In [None]:
from statistics import mean, median

## pyfastx with index

pyfastx provides integer quality values as part of its FASTQ read data structure.

In [None]:
%time read_quals = [mean(x.quali) for x in pyfastx.Fastq("BRCA1_input_sample.fq")]
print(f"Median average quality is {median(read_quals)}")
del read_quals

## pyfastx without index

The timing here is quite a bit closer to the others, since the conversion and calculation have not already been performed as part of processing the input file.

In [None]:
%time read_quals = [mean([ord(c) - 33 for c in x[2]]) for x in pyfastx.Fastq("BRCA1_input_sample.fq", build_index=False)]
print(f"Median average quality is {median(read_quals)}")
del read_quals

## fqfa

This code uses the ``average_quality()`` method implemented by the FastqRead class.

In [None]:
with open("BRCA1_input_sample.fq") as handle:
    %time read_quals = [x.average_quality() for x in parse_fastq_reads(handle)]
print(f"Median average quality is {median(read_quals)}")
del read_quals

# Benchmark 3: filtering reads on quality

This code creates a list of reads for which all bases are at least Q20.
Usage in this section closely follows Benchmark 2, but performance is quite a bit faster following recent performance improvements in pyfastx.

## pyfastx with index

In [None]:
%time filt_reads = [x for x in pyfastx.Fastq("BRCA1_input_sample.fq") if min(x.quali) >= 20]
print(f"Kept {len(filt_reads)} reads after applying filter.")
del filt_reads

## pyfastx without index

In [None]:
%time filt_reads = [x for x in pyfastx.Fastq("BRCA1_input_sample.fq", build_index=False) if min([ord(c) - 33 for c in x[2]]) >= 20]
print(f"Kept {len(filt_reads)} reads after applying filter.")
del filt_reads

## fqfa

This code uses the ``min_quality()`` method implemented by the FastqRead class.

In [None]:
with open("BRCA1_input_sample.fq") as handle:
    %time filt_reads = [x for x in parse_fastq_reads(handle) if x.min_quality() >= 20]
print(f"Kept {len(filt_reads)} reads after applying filter.")
del filt_reads