In [64]:
import numpy as np
import fastq as fq
from time import time

In [55]:
# Generate a random sequence and a FASTQ file containing every substring of it that has length `reads_length`

def gen_test_fastq(seq_length: int, reads_length: int, path_to_file: str,
                   seed=None) -> str:
    """
    Generate a random DNA sequence and write its fixed-length reads to a FASTQ file.

    A sequence of ``seq_length`` bases is drawn uniformly from A/C/G/T. Every
    substring of exactly ``reads_length`` bases becomes one read; the reads are
    written in random order, each with a constant quality score of 40 encoded
    with an ASCII offset of 33.

    Args:
        seq_length: number of bases in the generated sequence.
        reads_length: number of bases in every read; must be positive.
        path_to_file: destination passed to ``fq.write`` as ``file_path``.
        seed: optional seed for a private ``numpy.random.RandomState``. When
            ``None`` (the default) NumPy's global random state is used, so an
            outer ``np.random.seed(...)`` call still controls the result.

    Returns:
        The generated sequence as a string.

    Raises:
        ValueError: if ``reads_length`` is not positive.

    Note:
        Read identifiers are derived from ``hash()`` and the current time, so
        they differ between runs even when ``seed`` is given; only the sequence
        and the order of the reads are reproducible.
    """
    if reads_length <= 0:
        raise ValueError('reads_length must be positive, got {!r}'.format(reads_length))

    # Both the module and a RandomState instance expose choice() and permutation().
    rng = np.random if seed is None else np.random.RandomState(seed)

    seq = ''.join(rng.choice(['A', 'C', 'G', 'T'], size=seq_length))

    # The quality string is identical for every read, so build it once.
    phred_quality = 40
    phred_offset = 33
    qstr = chr(phred_quality + phred_offset) * reads_length

    reads = []
    for start_idx in rng.permutation(seq_length):
        # Skip start positions too close to the end to yield a full-length read.
        if start_idx + reads_length > seq_length:
            continue
        body = seq[start_idx: start_idx + reads_length]
        seq_id = hex(abs(hash(body + str(time()))))[2:]
        reads.append(fq.fastq_object(head='SEQ_ID:' + seq_id, body=body, qstr=qstr))

    fq.write(reads, file_path=path_to_file)
    return seq


# Generate a 1000-base random sequence, write its 150-base reads to
# 'test_reads.fastq', and keep the sequence itself in `initial_seq`.
initial_seq = gen_test_fastq(1000, 150, 'test_reads.fastq')

In [52]:
class DeBruijn:
    """
    De Bruijn graph built from the reads of a FASTQ file.

    Attributes:
        alp: the nucleotide alphabet ``['A', 'C', 'G', 'T']``.
        fos: the records returned by ``fq.read``; each record is expected to
            expose its sequence as ``.body``.
        k: length of the k-mers that define the edges.
        g: adjacency dict mapping a (k-1)-mer to the list of (k-1)-mers that
            follow it. Nodes without outgoing edges do not appear as keys.
        can_be_starting_node: sorted list of nodes that have no incoming edge.
    """

    def __init__(self, path_to_fastq_file: str, k: int) -> None:
        """
        Read the FASTQ file and build the graph.

        Args:
            path_to_fastq_file: path passed to ``fq.read``.
            k: k-mer length; must be at least 2 so that nodes are non-empty.

        Raises:
            ValueError: if ``k`` is smaller than 2.
        """
        # Validate before touching the file: k < 2 would give empty-string nodes.
        if k < 2:
            raise ValueError('k must be at least 2, got {!r}'.format(k))
        self.alp = ['A', 'C', 'G', 'T']
        self.fos = fq.read(path_to_fastq_file)
        self.k = k
        self.build()

    def get_kmers(self) -> dict:
        """
        Count every k-mer occurring in the reads.

        Returns:
            A dict mapping each k-mer to its number of occurrences over all
            reads, in order of first appearance. Reads shorter than ``k``
            contribute nothing.
        """
        k = self.k
        kmers = dict()
        for item in self.fos:
            seq = item.body
            # The last full k-mer starts at len(seq) - k.
            for i in range(len(seq) - k + 1):
                kmer = seq[i: i + k]
                kmers[kmer] = kmers.get(kmer, 0) + 1
        return kmers

    def build(self) -> None:
        """
        Build the graph.

        Each distinct k-mer adds one edge from its (k-1)-prefix to its
        (k-1)-suffix. The k-mer counts are not stored in the graph.
        NOTE(review): this iterates ``self.fos`` again; if ``fq.read`` returns
        a one-shot iterator, a second call would see no reads - confirm.
        """
        self.g = dict()
        has_incoming = set()
        for kmer in self.get_kmers():
            prefix, suffix = kmer[:-1], kmer[1:]
            self.g.setdefault(prefix, []).append(suffix)
            has_incoming.add(suffix)
        # Every node is a prefix or a suffix, and suffixes are excluded anyway,
        # so the start candidates are the prefixes that are never a suffix.
        # Sorting makes the result independent of set iteration order.
        self.can_be_starting_node = sorted(set(self.g) - has_incoming)

    def compress(self) -> None:
        """
        Compress the graph.

        Not implemented yet: this is currently a no-op.
        """
        pass


In [62]:
# Build the graph from the generated reads with k = 10 (so nodes are 9-mers)
# and display its adjacency dict.
gr = DeBruijn('test_reads.fastq', 10)
gr.g

{'GGCCATGTA': ['GCCATGTAA'],
 'GCCATGTAA': ['CCATGTAAA'],
 'CCATGTAAA': ['CATGTAAAG'],
 'CATGTAAAG': ['ATGTAAAGC'],
 'ATGTAAAGC': ['TGTAAAGCC'],
 'TGTAAAGCC': ['GTAAAGCCC'],
 'GTAAAGCCC': ['TAAAGCCCA'],
 'TAAAGCCCA': ['AAAGCCCAA'],
 'AAAGCCCAA': ['AAGCCCAAA'],
 'AAGCCCAAA': ['AGCCCAAAC'],
 'AGCCCAAAC': ['GCCCAAACT'],
 'GCCCAAACT': ['CCCAAACTA'],
 'CCCAAACTA': ['CCAAACTAT'],
 'CCAAACTAT': ['CAAACTATG'],
 'CAAACTATG': ['AAACTATGT'],
 'AAACTATGT': ['AACTATGTA'],
 'AACTATGTA': ['ACTATGTAA'],
 'ACTATGTAA': ['CTATGTAAT'],
 'CTATGTAAT': ['TATGTAATT'],
 'TATGTAATT': ['ATGTAATTA'],
 'ATGTAATTA': ['TGTAATTAT'],
 'TGTAATTAT': ['GTAATTATA'],
 'GTAATTATA': ['TAATTATAT'],
 'TAATTATAT': ['AATTATATG'],
 'AATTATATG': ['ATTATATGT'],
 'ATTATATGT': ['TTATATGTC'],
 'TTATATGTC': ['TATATGTCT'],
 'TATATGTCT': ['ATATGTCTC'],
 'ATATGTCTC': ['TATGTCTCC'],
 'TATGTCTCC': ['ATGTCTCCT'],
 'ATGTCTCCT': ['TGTCTCCTT'],
 'TGTCTCCTT': ['GTCTCCTTC'],
 'GTCTCCTTC': ['TCTCCTTCG'],
 'TCTCCTTCG': ['CTCCTTCGG'],
 'CTCCTTCGG': 

In [63]:
# Show the nodes that are never the target of an edge, i.e. the candidate start nodes.
gr.can_be_starting_node

['TTACCGAAG']