### 1

In [12]:
import random
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqIO import write

def generate_synthetic_data(sequence_length=10000, read_length=250, num_reads=150, output_file="synthetic.fastq"):
    sequence = ''.join(random.choices('ACGT', k=sequence_length))
    reads = [sequence[i:i+read_length] for i in range(0, sequence_length - read_length + 1, read_length // 2)]
    
    seq_records = []
    for i, read in enumerate(reads):
        record = SeqRecord(Seq(read), id=f"read_{i}", description="", letter_annotations={"phred_quality": [40]*read_length})
        seq_records.append(record)
    
    with open(output_file, "w") as output_handle:
        write(seq_records, output_handle, "fastq")

generate_synthetic_data()

In [13]:
from Bio import SeqIO
from collections import defaultdict

fastq_file = "synthetic.fastq"

reads = []
for record in SeqIO.parse(fastq_file, "fastq"):
    reads.append(str(record.seq))


In [14]:
#################

In [15]:
from collections import defaultdict, Counter

class DeBruijnGraph:
    def __init__(self, k):
        self.k = k
        self.graph = defaultdict(list)
        self.kmer_counts = Counter()
        
    def add_read(self, read):
        for i in range(len(read) - self.k + 1):
            kmer = read[i:i+self.k]
            self.kmer_counts[kmer] += 1
            if i + self.k < len(read):
                next_kmer = read[i+1:i+self.k+1]
                self.graph[kmer].append(next_kmer)
                
    def get_edges(self):
        edges = []
        for node in self.graph:
            for neighbor in self.graph[node]:
                edges.append((node, neighbor))
        return edges

    def get_coverage(self, kmer):
        return self.kmer_counts[kmer]

def build_debruijn_graph_from_fastq(Realization, k):
    # graph = DeBruijnGraph(k)
    graph = Realization(k)
    
    for read in reads:
        graph.add_read(read)
    
    return graph

def print_graph(graph, edges):
    print(f"{len(edges)} Edges", "**"*10, sep="\n")
    for i, edge in enumerate(edges, 1):
        coverage = graph.get_coverage(edge[0])
        print(f"Edge {i}: {edge[0]} -> {edge[1]}, Coverage: {coverage}")

In [16]:
k = 21
graph = build_debruijn_graph_from_fastq(DeBruijnGraph, k)
edges = graph.get_edges()

print_graph(graph, edges)

18091 Edges
********************
Edge 1: CTCACGGTATCAAGATAAAAG -> TCACGGTATCAAGATAAAAGC, Coverage: 1
Edge 2: TCACGGTATCAAGATAAAAGC -> CACGGTATCAAGATAAAAGCA, Coverage: 1
Edge 3: CACGGTATCAAGATAAAAGCA -> ACGGTATCAAGATAAAAGCAG, Coverage: 1
Edge 4: ACGGTATCAAGATAAAAGCAG -> CGGTATCAAGATAAAAGCAGG, Coverage: 1
Edge 5: CGGTATCAAGATAAAAGCAGG -> GGTATCAAGATAAAAGCAGGT, Coverage: 1
Edge 6: GGTATCAAGATAAAAGCAGGT -> GTATCAAGATAAAAGCAGGTA, Coverage: 1
Edge 7: GTATCAAGATAAAAGCAGGTA -> TATCAAGATAAAAGCAGGTAA, Coverage: 1
Edge 8: TATCAAGATAAAAGCAGGTAA -> ATCAAGATAAAAGCAGGTAAC, Coverage: 1
Edge 9: ATCAAGATAAAAGCAGGTAAC -> TCAAGATAAAAGCAGGTAACG, Coverage: 1
Edge 10: TCAAGATAAAAGCAGGTAACG -> CAAGATAAAAGCAGGTAACGT, Coverage: 1
Edge 11: CAAGATAAAAGCAGGTAACGT -> AAGATAAAAGCAGGTAACGTT, Coverage: 1
Edge 12: AAGATAAAAGCAGGTAACGTT -> AGATAAAAGCAGGTAACGTTG, Coverage: 1
Edge 13: AGATAAAAGCAGGTAACGTTG -> GATAAAAGCAGGTAACGTTGT, Coverage: 1
Edge 14: GATAAAAGCAGGTAACGTTGT -> ATAAAAGCAGGTAACGTTGTC, Coverage: 1
Edge 15: A

## 2

In [17]:
from collections import defaultdict, Counter

class DeBruijnGraphCompress():
    def __init__(self, k):
        self.k = k
        self.graph = defaultdict(list)
        self.kmer_counts = Counter()
        self.edge_coverage = defaultdict(float)
        self.edge_length = defaultdict(int)
        
    def add_read(self, read):
        for i in range(len(read) - self.k + 1):
            kmer = read[i:i+self.k]
            self.kmer_counts[kmer] += 1
            if i + self.k < len(read):
                next_kmer = read[i+1:i+self.k+1]
                self.graph[kmer].append(next_kmer)
                self.edge_length[(kmer, next_kmer)] = len(next_kmer)
        
    def get_edges(self):
        edges = []
        for node in self.graph:
            for neighbor in self.graph[node]:
                edges.append((node, neighbor))
        return edges
        
    def get_coverage(self, kmer):
        return self.kmer_counts[kmer]

    def compress_graph(self):
        compressed_graph = defaultdict(list)
        compressed_edge_coverage = defaultdict(float)
        
        for node in self.graph:
            neighbors = self.graph[node]
            if len(neighbors) == 0:
                continue
            
            current_edge = neighbors[0]
            current_coverage = self.edge_coverage[(node, current_edge)]
            current_length = self.edge_length[(node, current_edge)]
            
            for neighbor in neighbors[1:]:
                next_coverage = self.edge_coverage[(node, neighbor)]
                next_length = self.edge_length[(node, neighbor)]
                
                # считаем среднее
                new_coverage = (current_coverage * current_length + next_coverage * next_length) / (current_length + next_length)
                
                # обновляем ребра 
                current_edge = current_edge + neighbor[self.k-1:]
                current_coverage = new_coverage
                current_length += next_length
            
            compressed_graph[node].append(current_edge)
            compressed_edge_coverage[(node, current_edge)] = current_coverage
            self.edge_length[(node, current_edge)] = current_length
        
        self.graph = compressed_graph
        self.edge_coverage = compressed_edge_coverage

In [18]:
graph = build_debruijn_graph_from_fastq(DeBruijnGraphCompress, k)
edges = graph.get_edges()

graph.compress_graph()
compressed_edges = graph.get_edges()

print_graph(graph, compressed_edges)

9979 Edges
********************
Edge 1: CTCACGGTATCAAGATAAAAG -> TCACGGTATCAAGATAAAAGC, Coverage: 1
Edge 2: TCACGGTATCAAGATAAAAGC -> CACGGTATCAAGATAAAAGCA, Coverage: 1
Edge 3: CACGGTATCAAGATAAAAGCA -> ACGGTATCAAGATAAAAGCAG, Coverage: 1
Edge 4: ACGGTATCAAGATAAAAGCAG -> CGGTATCAAGATAAAAGCAGG, Coverage: 1
Edge 5: CGGTATCAAGATAAAAGCAGG -> GGTATCAAGATAAAAGCAGGT, Coverage: 1
Edge 6: GGTATCAAGATAAAAGCAGGT -> GTATCAAGATAAAAGCAGGTA, Coverage: 1
Edge 7: GTATCAAGATAAAAGCAGGTA -> TATCAAGATAAAAGCAGGTAA, Coverage: 1
Edge 8: TATCAAGATAAAAGCAGGTAA -> ATCAAGATAAAAGCAGGTAAC, Coverage: 1
Edge 9: ATCAAGATAAAAGCAGGTAAC -> TCAAGATAAAAGCAGGTAACG, Coverage: 1
Edge 10: TCAAGATAAAAGCAGGTAACG -> CAAGATAAAAGCAGGTAACGT, Coverage: 1
Edge 11: CAAGATAAAAGCAGGTAACGT -> AAGATAAAAGCAGGTAACGTT, Coverage: 1
Edge 12: AAGATAAAAGCAGGTAACGTT -> AGATAAAAGCAGGTAACGTTG, Coverage: 1
Edge 13: AGATAAAAGCAGGTAACGTTG -> GATAAAAGCAGGTAACGTTGT, Coverage: 1
Edge 14: GATAAAAGCAGGTAACGTTGT -> ATAAAAGCAGGTAACGTTGTC, Coverage: 1
Edge 15: AT

## 3

In [19]:
import numpy as np

class DeBruijnGraphTails:
    def __init__(self, k):
        self.k = k
        self.graph = defaultdict(list)
        self.kmer_counts = Counter()
        self.edge_coverage = defaultdict(float)
        self.edge_length = defaultdict(int)
        
    def add_read(self, read):
        for i in range(len(read) - self.k + 1):
            kmer = read[i:i+self.k]
            self.kmer_counts[kmer] += 1
            if i + self.k < len(read):
                next_kmer = read[i+1:i+self.k+1]
                self.graph[kmer].append(next_kmer)
                self.edge_length[(kmer, next_kmer)] = len(next_kmer)
        
    def get_edges(self):
        edges = []
        for node in self.graph:
            for neighbor in self.graph[node]:
                edges.append((node, neighbor))
        return edges
    
    def compress_graph(self):
        compressed_graph = defaultdict(list)
        compressed_edge_coverage = defaultdict(float)
        
        for node in self.graph:
            neighbors = self.graph[node]
            if len(neighbors) == 0:
                continue
            
            current_edge = neighbors[0]
            current_coverage = self.edge_coverage[(node, current_edge)]
            current_length = self.edge_length[(node, current_edge)]
            
            for neighbor in neighbors[1:]:
                next_coverage = self.edge_coverage[(node, neighbor)]
                next_length = self.edge_length[(node, neighbor)]
                
                # считаем среднее
                new_coverage = (current_coverage * current_length + next_coverage * next_length) / (current_length + next_length)
                
                # обновляем ребра 
                current_edge = current_edge + neighbor[self.k-1:]
                current_coverage = new_coverage
                current_length += next_length
            
            compressed_graph[node].append(current_edge)
            compressed_edge_coverage[(node, current_edge)] = current_coverage
            self.edge_length[(node, current_edge)] = current_length
        
        self.graph = compressed_graph
        self.edge_coverage = compressed_edge_coverage

    def get_coverage(self, kmer):
        return self.kmer_counts[kmer]
    
    def remove_low_coverage_tails(self):
        ratios = []
        for edge in self.edge_coverage:
            coverage = self.edge_coverage[edge]
            length = self.edge_length[edge]
            if length > 0:
                ratio = coverage / length
                ratios.append(ratio)
        
        # ищем хвосты с худшим покрытием
        if ratios:
            threshold = np.percentile(ratios, 30)
            for edge in list(self.edge_coverage.keys()):
                coverage = self.edge_coverage[edge]
                length = self.edge_length[edge]
                if length > 0:
                    ratio = coverage / length
                    if ratio < threshold:
                        del self.edge_coverage[edge]
                        del self.edge_length[edge]
                        if edge[0] in self.graph and edge[1] in self.graph[edge[0]]:
                            self.graph[edge[0]].remove(edge[1])
                        if len(self.graph[edge[0]]) == 0:
                            del self.graph[edge[0]]

In [20]:
graph = build_debruijn_graph_from_fastq(DeBruijnGraphTails, k)
edges = graph.get_edges()

graph.compress_graph()
graph.remove_low_coverage_tails()

# обновим данные и в печать
compressed_edges = graph.get_edges()
print_graph(graph, compressed_edges)

9979 Edges
********************
Edge 1: CTCACGGTATCAAGATAAAAG -> TCACGGTATCAAGATAAAAGC, Coverage: 1
Edge 2: TCACGGTATCAAGATAAAAGC -> CACGGTATCAAGATAAAAGCA, Coverage: 1
Edge 3: CACGGTATCAAGATAAAAGCA -> ACGGTATCAAGATAAAAGCAG, Coverage: 1
Edge 4: ACGGTATCAAGATAAAAGCAG -> CGGTATCAAGATAAAAGCAGG, Coverage: 1
Edge 5: CGGTATCAAGATAAAAGCAGG -> GGTATCAAGATAAAAGCAGGT, Coverage: 1
Edge 6: GGTATCAAGATAAAAGCAGGT -> GTATCAAGATAAAAGCAGGTA, Coverage: 1
Edge 7: GTATCAAGATAAAAGCAGGTA -> TATCAAGATAAAAGCAGGTAA, Coverage: 1
Edge 8: TATCAAGATAAAAGCAGGTAA -> ATCAAGATAAAAGCAGGTAAC, Coverage: 1
Edge 9: ATCAAGATAAAAGCAGGTAAC -> TCAAGATAAAAGCAGGTAACG, Coverage: 1
Edge 10: TCAAGATAAAAGCAGGTAACG -> CAAGATAAAAGCAGGTAACGT, Coverage: 1
Edge 11: CAAGATAAAAGCAGGTAACGT -> AAGATAAAAGCAGGTAACGTT, Coverage: 1
Edge 12: AAGATAAAAGCAGGTAACGTT -> AGATAAAAGCAGGTAACGTTG, Coverage: 1
Edge 13: AGATAAAAGCAGGTAACGTTG -> GATAAAAGCAGGTAACGTTGT, Coverage: 1
Edge 14: GATAAAAGCAGGTAACGTTGT -> ATAAAAGCAGGTAACGTTGTC, Coverage: 1
Edge 15: AT

## 4

In [21]:
class DeBruijnGraphBubble:
    def __init__(self, k):
        self.k = k
        self.graph = defaultdict(list)
        self.kmer_counts = Counter()
        self.edge_coverage = defaultdict(float)
        self.edge_length = defaultdict(int)
        
    def add_read(self, read):
        for i in range(len(read) - self.k + 1):
            kmer = read[i:i+self.k]
            self.kmer_counts[kmer] += 1
            if i + self.k < len(read):
                next_kmer = read[i+1:i+self.k+1]
                self.graph[kmer].append(next_kmer)
                self.edge_length[(kmer, next_kmer)] = len(next_kmer)
        
    def get_edges(self):
        edges = []
        for node in self.graph:
            for neighbor in self.graph[node]:
                edges.append((node, neighbor))
        return edges
    
    def compress_graph(self):
        compressed_graph = defaultdict(list)
        compressed_edge_coverage = defaultdict(float)
        
        for node in self.graph:
            neighbors = self.graph[node]
            if len(neighbors) == 0:
                continue
            
            current_edge = neighbors[0]
            current_coverage = self.edge_coverage[(node, current_edge)]
            current_length = self.edge_length[(node, current_edge)]
            
            for neighbor in neighbors[1:]:
                next_coverage = self.edge_coverage[(node, neighbor)]
                next_length = self.edge_length[(node, neighbor)]
                
                # считаем среднее
                new_coverage = (current_coverage * current_length + next_coverage * next_length) / (current_length + next_length)
                
                # обновляем ребра 
                current_edge = current_edge + neighbor[self.k-1:]
                current_coverage = new_coverage
                current_length += next_length
            
            compressed_graph[node].append(current_edge)
            compressed_edge_coverage[(node, current_edge)] = current_coverage
            self.edge_length[(node, current_edge)] = current_length
        
        self.graph = compressed_graph
        self.edge_coverage = compressed_edge_coverage

    def get_coverage(self, kmer):
        return self.kmer_counts[kmer]
    
    def remove_low_coverage_tails(self):
        ratios = []
        for edge in self.edge_coverage:
            coverage = self.edge_coverage[edge]
            length = self.edge_length[edge]
            if length > 0:
                ratio = coverage / length
                ratios.append(ratio)
        
        # ищем хвосты с худшим покрытием
        if ratios:
            threshold = np.percentile(ratios, 30)
            for edge in list(self.edge_coverage.keys()):
                coverage = self.edge_coverage[edge]
                length = self.edge_length[edge]
                if length > 0:
                    ratio = coverage / length
                    if ratio < threshold:
                        del self.edge_coverage[edge]
                        del self.edge_length[edge]
                        if edge[0] in self.graph and edge[1] in self.graph[edge[0]]:
                            self.graph[edge[0]].remove(edge[1])
                        if len(self.graph[edge[0]]) == 0:
                            del self.graph[edge[0]]

    def remove_bubbles(self):
        visited = set()
        for node in self.graph:
            if node not in visited:
                self._dfs(node, visited)
    
    def _dfs(self, start_node, visited):
        stack = [(start_node, None)]  # (current_node, parent_node)
        path = []
        while stack:
            current_node, parent_node = stack.pop()
            if current_node in visited:
                continue
            visited.add(current_node)
            path.append((current_node, parent_node))
            
            if len(self.graph[current_node]) > 1:
                for neighbor in self.graph[current_node]:
                    if neighbor != parent_node:
                        stack.append((neighbor, current_node))
        
        # рассмотрим потенциальный пузырь
        if len(path) >= 2:
            bubble_length = len(path)
            if bubble_length <= 2 * self.k:
                # если возможно, удалим путь из пузыря
                for i in range(1, len(path)):
                    if path[i-1][0] in self.graph and path[i][0] in self.graph[path[i-1][0]]:
                        self.graph[path[i-1][0]].remove(path[i][0])
                        if len(self.graph[path[i-1][0]]) == 0:
                            del self.graph[path[i-1][0]]
                        break

In [22]:
graph = build_debruijn_graph_from_fastq(DeBruijnGraphBubble, k)
edges = graph.get_edges()

graph.compress_graph()
graph.remove_low_coverage_tails()
graph.remove_bubbles()


# обновим данные и в печать
compressed_edges = graph.get_edges()
print_graph(graph, compressed_edges)

9979 Edges
********************
Edge 1: CTCACGGTATCAAGATAAAAG -> TCACGGTATCAAGATAAAAGC, Coverage: 1
Edge 2: TCACGGTATCAAGATAAAAGC -> CACGGTATCAAGATAAAAGCA, Coverage: 1
Edge 3: CACGGTATCAAGATAAAAGCA -> ACGGTATCAAGATAAAAGCAG, Coverage: 1
Edge 4: ACGGTATCAAGATAAAAGCAG -> CGGTATCAAGATAAAAGCAGG, Coverage: 1
Edge 5: CGGTATCAAGATAAAAGCAGG -> GGTATCAAGATAAAAGCAGGT, Coverage: 1
Edge 6: GGTATCAAGATAAAAGCAGGT -> GTATCAAGATAAAAGCAGGTA, Coverage: 1
Edge 7: GTATCAAGATAAAAGCAGGTA -> TATCAAGATAAAAGCAGGTAA, Coverage: 1
Edge 8: TATCAAGATAAAAGCAGGTAA -> ATCAAGATAAAAGCAGGTAAC, Coverage: 1
Edge 9: ATCAAGATAAAAGCAGGTAAC -> TCAAGATAAAAGCAGGTAACG, Coverage: 1
Edge 10: TCAAGATAAAAGCAGGTAACG -> CAAGATAAAAGCAGGTAACGT, Coverage: 1
Edge 11: CAAGATAAAAGCAGGTAACGT -> AAGATAAAAGCAGGTAACGTT, Coverage: 1
Edge 12: AAGATAAAAGCAGGTAACGTT -> AGATAAAAGCAGGTAACGTTG, Coverage: 1
Edge 13: AGATAAAAGCAGGTAACGTTG -> GATAAAAGCAGGTAACGTTGT, Coverage: 1
Edge 14: GATAAAAGCAGGTAACGTTGT -> ATAAAAGCAGGTAACGTTGTC, Coverage: 1
Edge 15: AT