In [153]:
import numpy as np
from tqdm import tqdm
#!pip install fastq, minifasta
import fastq as fq
from time import time
#!pip install mpu
import mpu

In [177]:
class Illumina:
    """Toy simulator of noisy short reads drawn from a random reference sequence.

    A random reference over ``A/C/G/T`` is generated at construction time.
    Every simulated read is a random contiguous piece of that reference in
    which each base is observed ``read_times`` times under a per-base error
    rate; the majority call becomes the reported base and the fraction of
    disagreeing observations becomes its Phred quality.
    """

    def __init__(
        self,
        seq_length=50000,
        sub_seq_mean=250,
        sub_seq_std=30,
        read_times=100,
        max_N=0.6,
        seed=None,
    ) -> None:
        """Create the simulator and its random reference sequence.

        Args:
            seq_length: Length of the random reference sequence (>= 1).
            sub_seq_mean: Mean of the normal distribution read lengths are drawn from.
            sub_seq_std: Standard deviation of that normal distribution.
            read_times: Number of repeated observations per base (>= 1).
            max_N: Upper bound of the per-base error probability, which is
                drawn uniformly from ``[0, max_N)``; must lie in ``[0, 1]``.
            seed: Optional seed. When ``None`` (default) the global
                ``np.random`` state is used, exactly as before; otherwise a
                private ``np.random.RandomState(seed)`` makes the generated
                sequences reproducible.

        Raises:
            ValueError: If ``seq_length`` or ``read_times`` is below 1, or
                ``max_N`` is outside ``[0, 1]``.
        """
        # These values previously failed later, deep inside NumPy calls;
        # reject them here with a readable message instead.
        if seq_length < 1:
            raise ValueError("seq_length must be at least 1")
        if read_times < 1:
            raise ValueError("read_times must be at least 1")
        if not 0.0 <= max_N <= 1.0:
            raise ValueError("max_N must lie in [0, 1]")

        # The np.random module and RandomState expose the same
        # choice/normal/randint/rand functions, so either can serve as the source.
        self._rng = np.random if seed is None else np.random.RandomState(seed)
        self.alp = ['A', 'C', 'G', 'T']
        self.seq_length = seq_length
        self.sub_seq_mean = sub_seq_mean
        self.sub_seq_std = sub_seq_std
        self.max_N = max_N
        self.read_times = read_times
        self.seq = self._rng.choice(self.alp, size=seq_length)

    def get_reads(self, total_reads=50000):
        """Return ``total_reads`` ``(ground_truth, read)`` pairs, showing a tqdm progress bar."""
        return [self.get_one_read() for _ in tqdm(range(total_reads))]

    def get_one_read(self):
        """Simulate a single read.

        Returns:
            A ``(ground_truth, read)`` tuple of dicts sharing the same
            ``'head'``. ``ground_truth['body']`` is the error-free
            subsequence; ``read['body']`` holds the called bases and
            ``read['qstr']`` the per-base qualities encoded as Phred+33
            characters.
        """
        sub_seq = self.gen_subseq()
        calls = [self.read_nucleotide(s) for s in sub_seq]
        true_body = ''.join(sub_seq)
        # The id mixes the true sequence with the wall-clock time, so it is
        # not reproducible even when a seed is given.
        seq_id = hex(abs(hash(true_body + str(time()))))[2:]
        head = 'SEQ_ID:' + seq_id
        read = {
            'head': head,
            'body': ''.join(nuc for nuc, _ in calls),
            # int() guards against round() handing back a NumPy float.
            'qstr': ''.join(chr(int(round(qual)) + 33) for _, qual in calls)}
        ground_truth = {
            'head': head,
            'body': true_body}
        return ground_truth, read

    def gen_subseq(self):
        """Pick a random contiguous piece of the reference sequence.

        The length is drawn from ``N(sub_seq_mean, sub_seq_std)`` and clamped
        to at least 1; the start is uniform over the reference. Pieces that
        would run past the end of the reference are truncated there.
        """
        drawn = int(self._rng.normal(loc=self.sub_seq_mean, scale=self.sub_seq_std))
        # A non-positive length would yield an empty read or, through a
        # negative stop index, a slice unrelated to the requested size.
        size = max(1, drawn)
        start = self._rng.randint(self.seq_length)
        # Slicing clips at the end of the array, so no explicit bound check is needed.
        return self.seq[start:start + size]

    def read_nucleotide(self, s):
        """Observe base ``s`` ``read_times`` times and return ``(called_base, quality)``.

        An error probability ``N`` is drawn uniformly from ``[0, max_N)``;
        each observation yields ``s`` with probability ``1 - N`` and each
        other base with probability ``N / 3``. The most frequent observation
        is the call, and the quality is ``-10 * log10(p)`` where ``p`` is the
        share of observations disagreeing with the call (floored at ``1e-4``,
        i.e. quality is capped at 40).
        """
        N = self._rng.rand() * self.max_N
        prob = np.full(4, N / 3)
        prob[self.alp.index(s)] = 1.0 - N
        nucs = self._rng.choice(self.alp, p=prob, size=self.read_times)
        un, counts = np.unique(nucs, return_counts=True)
        p = (self.read_times - counts.max()) / self.read_times
        nuc = un[counts.argmax()]
        qual = -10 * np.log10(max(p, 1e-4))
        return nuc, qual

    def to_fastq(self, path_to_file, total_reads=100):
        """Simulate ``total_reads`` reads and write them to ``path_to_file`` via ``fq.write``.

        Returns:
            A ``(truth, reads)`` tuple of dicts, both keyed by read header:
            ``truth`` maps to the error-free subsequence, ``reads`` to the
            called (noisy) sequence.
        """
        pairs = self.get_reads(total_reads)
        fos = [fq.fastq_object(**read) for _, read in pairs]
        fq.write(fos, file_path=path_to_file)
        truth_by_head = {truth['head']: truth['body'] for truth, _ in pairs}
        reads_by_head = {read['head']: read['body'] for _, read in pairs}
        return truth_by_head, reads_by_head
        

In [178]:
# Build a simulator with default settings, write five simulated reads to a
# FASTQ file, and keep both returned mappings (read header -> sequence).
ill = Illumina()
truth, reads = ill.to_fastq(path_to_file='reads.fastq', total_reads=5)

100%|██████████| 5/5 [00:00<00:00, 41.67it/s]


In [None]:
# Persist the ground-truth mapping (read header -> error-free subsequence)
# returned by to_fastq so it can be reloaded later.
# NOTE(review): presumably mpu picks the on-disk format from the file extension — confirm.
mpu.io.write('truth.pickle', truth)

In [164]:
# Reload the ground truth from disk. This rebinds `truth`, replacing whatever
# value is currently held in memory.
# NOTE(review): assuming this file is deserialized as a pickle (the extension
# suggests so), only load files you created yourself — unpickling untrusted
# data can execute arbitrary code.
truth = mpu.io.read('truth.pickle')

In [179]:
# Simulate one more read, wrap its noisy half (index 1 of the returned
# (ground_truth, read) tuple) in a fastq object, and display its `info`
# summary (base counts and quality statistics).
fq.fastq_object(**ill.get_one_read()[1]).info

{'a_num': 73,
 'g_num': 62,
 't_num': 68,
 'c_num': 47,
 'gc_content': 0.436,
 'at_content': 0.5640000000000001,
 'qual': 6.2,
 'qual_median': 5.0,
 'qual_variance': 21.012048192771083,
 'qual_min': 2,
 'qual_max': 40}

Запускаем Trimmomatic со следующими параметрами:

java -jar trimmomatic-0.35.jar SE -phred33 input.fq.gz output.fq.gz ILLUMINACLIP:TruSeq3-SE:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36

При таких параметрах Trimmomatic отбрасывает все риды.

Причина: среднее качество наших ридов около 6 (при средней вероятности ошибки около 30%), а `SLIDINGWINDOW:4:15` требует, чтобы среднее качество в окне было не ниже 15.