### BWT Homework

Today we will construct BWT from scratch. First we will construct a suffix array for a given string using pysuffixarray.


In [14]:
! pip install pysuffixarray

from pysuffixarray.core import SuffixArray
suffix_array = SuffixArray('ACAACG')
print(suffix_array.suffix_array())


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
[6, 2, 0, 3, 1, 4, 5]


## Task 1: Create BWT using suffix array:

- Using BioPython upload SARS-CoV-2 reference genome from fasta file (genome.fa in BWT_folder)
- Construct suffix array
- Construct BWT from suffix array
- Don't forget to add special symbol (but after SA construction)

![correct](BWT_folder/BWT1.png)

In [None]:
! pip install biopython

from Bio import SeqIO

record = SeqIO.read("BWT_folder/genome.fa", "fasta")
seq = str(record.seq)
suffix_array = sorted(range(len(seq)), key=lambda i: seq[i:])


# BWT[i] = (idx == 0) ? '$' : seq[idx - 1], где idx = SA[i]
bwt_chars = []
for idx in suffix_array:
    if idx == 0:
        bwt_chars.append("$")
    else:
        bwt_chars.append(seq[idx - 1])
bwt = "".join(bwt_chars)

with open("suffix_array.txt", "w") as f_sa:
    for pos in suffix_array:
        f_sa.write(f"{pos}\n")

with open("bwt.txt", "w") as f_bwt:
    f_bwt.write(bwt)



[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


## Task 2: Create FM index
- Construct Occurence array
- Construct Count dictionary
- Make a class BWTSearcher

![correct](BWT_folder/BWT3.png)

In [None]:
from collections import Counter


with open("bwt.txt", "r") as f:
    bwt = f.read().strip()

alphabet = sorted(set(bwt))

freq = Counter(bwt)

C = {}
total = 0
for c in alphabet:
    C[c] = total
    total += freq[c]

Occ = {c: [0]*(len(bwt)+1) for c in alphabet}

for i, char in enumerate(bwt, start=1):
    for c in alphabet:
        Occ[c][i] = Occ[c][i-1] + (1 if char == c else 0)

class BWTSearcher:
    def __init__(self, bwt_string):
        self.bwt = bwt_string
        self.n = len(bwt_string)
        
        from collections import Counter
        self.alphabet = sorted(set(self.bwt))
        freq = Counter(self.bwt)
    
        self.C = {}
        total = 0
        for c in self.alphabet:
            self.C[c] = total
            total += freq[c]
        
        self.Occ = {c: [0]*(self.n + 1) for c in self.alphabet}
        for i, ch in enumerate(self.bwt, start=1):
            for c in self.alphabet:
                self.Occ[c][i] = self.Occ[c][i-1] + (1 if ch == c else 0)
    
    def occ(self, c, pos):
        if c not in self.Occ:
            return 0
        return self.Occ[c][pos]
    
    def backward_search(self, pattern):
        l = 0
        r = self.n - 1
        
        for ch in reversed(pattern):
            if ch not in self.C:
                return None, None
            l = self.C[ch] + self.occ(ch, l)
            r = self.C[ch] + self.occ(ch, r + 1) - 1
            if l > r:
                return None, None
        
        return l, r

# TEST
bwt_searcher = BWTSearcher(bwt)
pattern = "ATG"
l, r = bwt_searcher.backward_search(pattern)
if l is None:
    print(f"Паттерн '{pattern}' НЕ найден.")
else:
    print(f"Паттерн '{pattern}' найден в интервале SA: [{l}, {r}]")


Пример значений Occurrence:
Символ '$': Occ[$][10] = 0
Символ 'A': Occ[A][10] = 10
Символ 'C': Occ[C][10] = 0
Символ 'G': Occ[G][10] = 0
Символ 'T': Occ[T][10] = 0

Count dictionary (C):
C['$'] = 0
C['A'] = 1
C['C'] = 8954
C['G'] = 14446
C['T'] = 20309
Паттерн 'ATG' найден в интервале SA: [7456, 8180]


### Task 4:
- There are 100 reads that were randomly sampled from genome.fa
- Some of them are error free, some contain one mutation, and some contain 5 mutations
- Could you use your BWTSearcher class to classify them? Think about the solution and implement it. You can add any functions of class members
- How many reads of each class did you find?

In [5]:
from Bio import SeqIO
# Process each read through the BWTSearcher
with open("BWT_folder/sample_reads.fasta", "r") as file:
    for record in SeqIO.parse(file, "fasta"):
        read_sequence = str(record.seq)

        # Here is just a placeholder to demonstrate using the read with the BWTSearcher
        print("Processing read:", read_sequence)

Processing read: TATGGACAACTCACCTAATTTAGCATGGCCTCTTATTGTAACAGCTTTAAGGGCCAATCCTTCTGTAAAATTACAGAATAATGATCTTAGTCCTGTTGCA
Processing read: TTGTGTTCCCTTGAACATAATACCTCTTACAACAGCAGCCAAACTAATGGTTGTCATACCAGACTATAACACATATAAAAATACGTTTGATGGTACAACA
Processing read: AGTGTTACCACAGAGATTCTACCTGTTTCTATGACCAAGACATGAGTAGATTGTACAATGTACATTTGTGGTGATTCAACTGAATGCAGCAATCTCTTGT
Processing read: AAAGGGGTAAGGCTAGACTTTATAATGATTCAATGAGTTATGAGGATCAAGATGCACTTTTCGCATATACAACATGTAATGTCATCCCTACTATCACTCA
Processing read: TTTTGATAAATGCAAAGTGAATTCAACATTAGAACAGTATGTCTTTTGTACTGTAAATGCATTGCCTGAGACGACAGCAGATATAGTTGTCTTTGATGAA
Processing read: AGTTGGTAACATCTGTTACACACCATCAAAACTTATAGAGTACACTGACTTTGCAACATCAGCTTGTGTTTTGGCTGCTGAATGTACAATTTTTAAAGAT
Processing read: TTAAAACTCTTTGACCGTTATTTTAAATATTGGGATCAGACATACCACCCAAATTGTGTTAACTGTTTGGATCACAGATGCATTCTGCATTGTGCAAACT
Processing read: TGTTGAAGGTTTTAATTGTTACTTTCCTTTACAATCATAGGGTTTCCAACCCACTAATGGTGTTGGTTACCAACCATACAGAGTAGTAGTACTTTCTTTT
Processing read: TACAAATTAAATGTTGGTGATTATTTTGTGCTGACATCA

In [6]:
# Your impletementation of BWT

class BWTSearcher:
    def __init__(self, bwt_string):
        self.bwt = bwt_string
        self.n = len(bwt_string)
        from collections import Counter
        self.alphabet = sorted(set(self.bwt))
        freq = Counter(self.bwt)
        self.C = {}
        total = 0
        for c in self.alphabet:
            self.C[c] = total
            total += freq[c]
        self.Occ = {c: [0]*(self.n + 1) for c in self.alphabet}
        for i, ch in enumerate(self.bwt, start=1):
            for c in self.alphabet:
                self.Occ[c][i] = self.Occ[c][i-1] + (1 if ch == c else 0)

    def occ(self, c, pos):
        if c not in self.Occ:
            return 0
        return self.Occ[c][pos]
    def bwt_pattern_search(self, pattern):
        l = 0
        r = self.n - 1
        for ch in reversed(pattern):
            if ch not in self.C:
                return None, None
            l = self.C[ch] + self.occ(ch, l)
            r = self.C[ch] + self.occ(ch, r + 1) - 1
            if l > r:
                return None, None
        return l, r

In [11]:
def classify_read(read, genome, SA, bwt_searcher, k_max=5):
    L = len(read)
    l, r = bwt_searcher.bwt_pattern_search(read)
    if l is not None:
        return 0
    
    seg_count = k_max + 1
    seg_len = (L + seg_count - 1) // seg_count
    seen_starts = set()
    
    for seg_idx in range(seg_count):
        seg_start = seg_idx * seg_len
        seg_end = min((seg_idx + 1) * seg_len, L)
        segment = read[seg_start:seg_end]
        if not segment:
            continue
        l_seg, r_seg = bwt_searcher.bwt_pattern_search(segment)
        if l_seg is None:
            continue
        for sa_idx in range(l_seg, r_seg + 1):
            genome_pos = SA[sa_idx]
            full_start = genome_pos - seg_start
            if full_start < 0 or full_start + L > len(genome):
                continue
            if full_start in seen_starts:
                continue
            seen_starts.add(full_start)
            genome_sub = genome[full_start:full_start + L]
            mismatches = sum(1 for a, b in zip(read, genome_sub) if a != b)
            if mismatches == 0:
                return 0
            if mismatches == 1:
                return 1
            if mismatches == 5:
                return 5
    return None

In [None]:
from collections import Counter
from Bio import SeqIO

required_files = ["genome.fa", "suffix_array.txt", "bwt.txt", "reads.fa"]

record = SeqIO.read("BWT_folder/genome.fa", "fasta")
genome = str(record.seq)
    
    
SA = []
with open("suffix_array.txt", "r") as f_sa:
    for line in f_sa:
        line = line.strip()
        if line:
            SA.append(int(line))
    

with open("bwt.txt", "r") as f_bwt:
    bwt_string = f_bwt.read().strip()

    
bwt_searcher = BWTSearcher(bwt_string)
reads = []
id = 0
read_ids = []
with open("BWT_folder/sample_reads.fasta", "r") as file:
    for record in SeqIO.parse(file, "fasta"):
        id += 1
        read_sequence = str(record.seq)

        read_ids.append(id)
        reads.append(read_sequence)
    
counts = Counter()
total_reads = 0
for record in SeqIO.parse("BWT_folder/sample_reads.fasta", "fasta"):
    total_reads += 1
    read_sequence = str(record.seq)
    cls = classify_read(read_sequence, genome, SA, bwt_searcher, k_max=5)
    if cls is None:
        counts["none"] += 1
    else:
        counts[cls] += 1

# Results
print(f"\nОбщее число ридов: {total_reads}")
print("Результаты классификации:")
print(f"  - 0 мутаций: {counts.get(0, 0)}")
print(f"  - 1 мутация : {counts.get(1, 0)}")
print(f"  - 5 мутаций: {counts.get(5, 0)}")
print(f"  - Не классифицировано: {counts.get('none', 0)}")

Загружён суффиксный массив (длина = 29903).
Загружена BWT-строка (длина = 29903).
Инициализирован объект BWTSearcher.

Общее число ридов: 100
Результаты классификации:
  - 0 мутаций: 45
  - 1 мутация : 31
  - 5 мутаций: 24
  - Не классифицировано: 0
