## 1 

In [71]:
import numpy as np
import random

genome_length = 50000
read_count = 50000
mean_read_length = 250
std_dev_read_length = 30
max_error_rate = 75
num_readings = 100

genome = ''.join(random.choices('ATGC', k=genome_length))

In [93]:
def generate_reads(genome, read_count, mean_length, std_dev_length):
    """генерирует риды"""
    reads = []
    positions = []
    genome_length = len(genome)
    
    for _ in range(read_count):
        read_length = int(np.random.normal(mean_length, std_dev_length))
        read_length = max(1, min(genome_length, read_length))
        
        start_pos = random.randint(0, genome_length - read_length)
        read = genome[start_pos:start_pos + read_length]
        reads.append(read)
        positions.append(start_pos)
    
    return reads, positions

def simulate_sequencing(read, num_readings, max_error_rate):
    """имитирует ошибку при секвенировании"""
    sequenced_read = ""
    correct_reads, error_probability = [], []
    error_count = random.randint(0, max_error_rate)
    
    for i, base in enumerate(read):
        # ошибка в N ридах из 100
        # графическое чисел как точек на отрезке [0;1]
        illumina_reads = {"A": 0, "T": 0, "G": 0, "C": 0}
        illumina_reads[base] = 100 - error_count 
        
        for _ in range(error_count):
            if base == "A":
                new_base = random.choice("TGC")
            elif base == "T":
                new_base = random.choice("AGC")
            elif base == "G":
                new_base = random.choice("ATC")
            elif base == "C":
                new_base = random.choice("ATG")
                
            illumina_reads[new_base] += 1
        
        freq_base = max(illumina_reads, key=illumina_reads.get)
        sequenced_read += freq_base
        error_probability.append(1 - illumina_reads[freq_base]/100)
        # print(f"read {i} P = {1 - illumina_reads[freq_base]/100}")
        
        sequenced_read += freq_base

        if (freq_base != base):
            correct_reads.append((i, base))
        error_positions.append(i)

    return sequenced_read, correct_reads, error_probability

In [94]:
reads, positions = generate_reads(genome, read_count, mean_read_length, std_dev_read_length)

In [97]:
import math

fastq_data = []
error_data = []

for read_id, read in enumerate(reads):
    sequenced_read, mismatched_bases, error_probability = simulate_sequencing(read, num_readings, max_error_rate)
    # https://en.wikipedia.org/wiki/FASTQ_format
    # https://en.wikipedia.org/wiki/Phred_quality_score
    quality_scores = ["I"]*len(read)
    for pos in range(len(read)):
        if error_probability[pos] == 0:
            continue
        quality_scores[pos] = chr(int(-10 * math.log10(error_probability[pos])) + 33)
    
    fastq_entry = f"@read_{read_id}\n{sequenced_read}\n+\n{"".join(quality_scores)}\n"
    fastq_data.append(fastq_entry)

    # сепаратор в файле ошибок - пробел
    error_data_entry = f"Read_ID: {read_id} Start_Pos: {positions[read_id]} Read_Length: {len(read)} Errors: " + ' '.join([f"Pos: {pos} True_Base: {base}" for pos, base in mismatched_bases]) + "\n"
    error_data.append(error_data_entry)

with open("simulated_reads.fastq", "w") as f:
    f.writelines(fastq_data)

with open("error_positions.dat", "w") as f:
    f.writelines(error_data)


В симуляции при значении max_error_rate=75 риды читаются практически без ошибок

Сделать до дедлайна симуляцию более естественной не удалось, поэтому отправляю так

## 2

In [10]:
%%bash
fastqc ./simulated_reads.fastq

Started analysis of simulated_reads.fastq
Approx 5% complete for simulated_reads.fastq
Approx 10% complete for simulated_reads.fastq
Approx 15% complete for simulated_reads.fastq
Approx 20% complete for simulated_reads.fastq
Approx 25% complete for simulated_reads.fastq
Approx 30% complete for simulated_reads.fastq
Approx 35% complete for simulated_reads.fastq
Approx 40% complete for simulated_reads.fastq
Approx 45% complete for simulated_reads.fastq
Approx 50% complete for simulated_reads.fastq
Approx 55% complete for simulated_reads.fastq
Approx 60% complete for simulated_reads.fastq
Approx 65% complete for simulated_reads.fastq
Approx 70% complete for simulated_reads.fastq
Approx 75% complete for simulated_reads.fastq
Approx 80% complete for simulated_reads.fastq
Approx 85% complete for simulated_reads.fastq
Approx 90% complete for simulated_reads.fastq
Approx 95% complete for simulated_reads.fastq
Approx 100% complete for simulated_reads.fastq


Analysis complete for simulated_reads.fastq


          Step options:

               ILLUMINACLIP:<fastaWithAdaptersEtc>:<seed mismatches>:<palindrome clip threshold>:<simple clip threshold>
                   fastaWithAdaptersEtc: specifies the path to a fasta file containing all the adapters, PCR sequences etc.
                   The naming of the various sequences within this file determines how they are used. See below.
                   seedMismatches: specifies the maximum mismatch count which will still allow a full match to be performed
                   palindromeClipThreshold: specifies how accurate the match between the two ´adapter ligated´ reads must be for PE palindrome read alignment.
                   simpleClipThreshold: specifies how accurate the match between any adapter etc. sequence must be against a read.
                   .
                   The adapters are installed on the Debian system at /usr/share/trimmomatic/.

               SLIDINGWINDOW:<windowSize>:<requiredQuality>
                   windowSize: specifies the number of bases to average across
                   requiredQuality: specifies the average quality required.

               LEADING:<quality>
                   quality: Specifies the minimum quality required to keep a base.

               TRAILING:<quality>
                   quality: Specifies the minimum quality required to keep a base.

               CROP:<length>
                   length: The number of bases to keep, from the start of the read.

               HEADCROP:<length>
                   length: The number of bases to remove from the start of the read.

               MINLENGTH:<length>
                   length: Specifies the minimum length of reads to be kept.

           Trimming Order

           Trimming  occurs in the order which the steps are specified on the command line. It is recommended in most cases
           that adapter clipping, if required, is done as early as possible.


In [38]:
%%bash
java -jar ./Trimmomatic-0.39/trimmomatic-0.39.jar SE  ./simulated_reads.fastq trimmomatic.fastq ILLUMINACLIP:TruSeq3-SE:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 -phred33 -threads 8

TrimmomaticSE: Started with arguments:
 ./simulated_reads.fastq trimmomatic.fastq ILLUMINACLIP:TruSeq3-SE:2:30:10 LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36 -phred33 -threads 8
java.io.FileNotFoundException: /Data/BioinformaticsCourse2024/homework/2_1/TruSeq3-SE (Нет такого файла или каталога)
	at java.base/java.io.FileInputStream.open0(Native Method)
	at java.base/java.io.FileInputStream.open(FileInputStream.java:216)
	at java.base/java.io.FileInputStream.<init>(FileInputStream.java:157)
	at org.usadellab.trimmomatic.fasta.FastaParser.parse(FastaParser.java:54)
	at org.usadellab.trimmomatic.trim.IlluminaClippingTrimmer.loadSequences(IlluminaClippingTrimmer.java:110)
	at org.usadellab.trimmomatic.trim.IlluminaClippingTrimmer.makeIlluminaClippingTrimmer(IlluminaClippingTrimmer.java:71)
	at org.usadellab.trimmomatic.trim.TrimmerFactory.makeTrimmer(TrimmerFactory.java:32)
	at org.usadellab.trimmomatic.Trimmomatic.createTrimmers(Trimmomatic.java:59)
	at org.usadellab.trimmomatic.Tri

In [None]:
# Сравнение с учетом того, какой получилась симуляция

In [57]:
from Bio import SeqIO

error_positions = []
with open("error_positions.dat", "r") as f:
    for line in f:
        data = line.split(" ")
        fields = len(data)
        if fields == 7:
            error_positions.append({ "start_pos": int(data[1]), "lenght": int(data[3]), "errors": 0 })
        else:
            ## f"Read_ID: {read_id} Start_Pos: {positions[read_id]} Read_Length: {len(read)} Errors: " + ' '.join([f"Pos: {pos} True_Base: {base}" for pos, base in errors])
            error_positions.append({ "start_pos": int(data[1]), "lenght": int(data[3]), "errors": (fields-7)//2, "error_pos": list(map(int, data[8::2])), "true_base": data[9::2]} )


def read_fastq(file):
    reads = []
    for record in SeqIO.parse(file, "fastq"):
        reads.append({"id": record.id, "seq": record.seq})
    return reads

def equal_reads(file1, file2):
    reads1 = read_fastq(file1)
    reads2 = read_fastq(file2)
    all_equal = True

    for read1, read2 in zip(reads1, reads2):
        seq1 = read1["seq"]
        seq2 = read2["seq"]
        
        if (seq1 != seq2):
            id = int(read1["id"].split("_", 1)[1])
            true_seq = seq1
            for pos, base in zip(error_positions[id]["error_pos"], error_positions[id]["true_base"]):
                true_seq = true_seq[:pos] + base + true_seq[pos+1:]
            print(f"{read1["id"]} was corrected\tErrors: {sum(1 for a,b in zip(true_seq, seq2) if a != b)}")
            
            if all_equal:
                all_equal = False
    if all_equal:
        print(f"All reads in {file1} and {file2} are match")


ID: read_0
Name: read_0
Description: read_0
Number of features: 0
Per letter annotation for: phred_quality
Seq('GTATCGAATACCGTCACTCACACGCAATTTAGGAGGTATTGTGGTACGTTATTA...TGG')


'read_0'

In [56]:
equal_reads("simulated_reads.fastq", "trimmomatic.fastq")

read_0 was corrected	Errors: 160
read_1 was corrected	Errors: 177
read_2 was corrected	Errors: 167
read_3 was corrected	Errors: 155
read_4 was corrected	Errors: 181
read_5 was corrected	Errors: 173
read_6 was corrected	Errors: 197
read_7 was corrected	Errors: 177
read_8 was corrected	Errors: 145
read_9 was corrected	Errors: 174
read_10 was corrected	Errors: 175
read_11 was corrected	Errors: 130
read_12 was corrected	Errors: 168
read_13 was corrected	Errors: 190
read_14 was corrected	Errors: 188
read_15 was corrected	Errors: 163
read_16 was corrected	Errors: 166
read_17 was corrected	Errors: 176
read_18 was corrected	Errors: 161
read_19 was corrected	Errors: 187
read_20 was corrected	Errors: 181
read_21 was corrected	Errors: 191
read_22 was corrected	Errors: 203
read_23 was corrected	Errors: 179
read_24 was corrected	Errors: 169
read_25 was corrected	Errors: 204
read_26 was corrected	Errors: 170
read_27 was corrected	Errors: 199
read_28 was corrected	Errors: 169
read_29 was corrected	Er

## 3
https://www.csd.uwo.ca/~ilie/RACER/

In [39]:
%%bash
./racer/RACER ./simulated_reads.fastq racer.fastq 50000

reading input file...done reading input file
Genome length = 50000
Coverage = 249
Maximum read length = 378
Number of reads = 50000
Number of threads = 8
correcting reads...done correcting reads
writing back the reads...done
Total time building witnesses and counters = 1 seconds.
Total time correcting = 3 seconds.
Total time = 4 seconds.
Peak Memory = 87 MB.
