<a href="https://colab.research.google.com/github/Codeptor/DNA-Sequencing/blob/main/minor_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DNA Sequence Error Detection using Pysam: A Comprehensive Guide


## 1. Dataset Selection and Preparation
For this guide, we'll use a smaller dataset from the Sequence Read Archive (SRA) that's more manageable for research projects with limited computational resources.

### 1.1 Download a Small Dataset
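We'll use a single sequencing run of *E. coli* (SRA accession SRR1770413). The *E. coli* K-12 genome is only about 4.6 Mb, so the reads, the reference, and the resulting alignments are far smaller than those of a typical human dataset.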



1. Install the SRA Toolkit:


In [None]:
!wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/current/sratoolkit.current-ubuntu64.tar.gz
!tar -xzf sratoolkit.current-ubuntu64.tar.gz
!export PATH=$PATH:$PWD/sratoolkit.3.0.0-ubuntu64/bin
!sudo apt-get install sra-toolkit

2. Download a small E. coli dataset:


In [None]:
# Download the SRR1770413 run from the Sequence Read Archive.
!prefetch SRR1770413
# Convert the run to FASTQ; --split-files writes the reads of each pair to
# separate files (SRR1770413_1.fastq and SRR1770413_2.fastq).
!fastq-dump --split-files SRR1770413

This will give you two FASTQ files: `SRR1770413_1.fastq` and `SRR1770413_2.fastq`.



### 1.2 Download the Reference Genome
Download the E. coli reference genome:


In [None]:
# Fetch the gzipped E. coli K-12 MG1655 (assembly ASM584v2) top-level FASTA
# from Ensembl Genomes release 45.
!wget ftp://ftp.ensemblgenomes.org/pub/bacteria/release-45/fasta/bacteria_0_collection/escherichia_coli_str_k_12_substr_mg1655/dna/Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa.gz
# Decompress in place; this leaves the plain .fa file that the later steps read.
!gunzip Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa.gz

### 1.3 Align Reads to Reference
Now, let's align the reads to the reference genome using BWA:


In [None]:
# Install the BWA aligner.
!sudo apt-get install bwa

# Build the BWA index for the reference FASTA; `bwa mem` needs it.
!bwa index Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa

# Align the two FASTQ files as a read pair and redirect the SAM output to a file.
!bwa mem Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa SRR1770413_1.fastq SRR1770413_2.fastq > aligned_reads.sam


In [None]:
# Install samtools, used in the next cell to convert, sort, and index the alignments.
!sudo apt-get install samtools

In [None]:
# Convert the SAM text output to binary BAM.
!samtools view -bS aligned_reads.sam > aligned_reads.bam

# Sort the alignments (samtools sorts by reference coordinate by default).
!samtools sort aligned_reads.bam -o aligned_reads_sorted.bam

# Index the sorted BAM so that region queries (fetch/pileup) work later on.
!samtools index aligned_reads_sorted.bam

Now you have a smaller, manageable BAM file (`aligned_reads_sorted.bam`) to work with for your error detection project.

## 2. Prerequisites
Ensure you have:
- Python 3.7 or later
- Basic understanding of DNA sequencing concepts
- Familiarity with Python programming

## 3. Installation and Setup
Install required libraries:


In [None]:
# pysam reads the BAM/FASTA files; numpy, matplotlib, and seaborn are used by
# the analysis and plotting cells below.
!pip install pysam numpy matplotlib seaborn

## 4. Reading BAM Files with Pysam
Let's start by reading our BAM file:


In [None]:
import pysam

def read_bam_file(bam_path):
    """Open a BAM file and print its mapped and unmapped read counts.

    Args:
        bam_path: Path to the BAM file. The counts are read through
            ``AlignmentFile.mapped`` / ``AlignmentFile.unmapped``, which
            pysam documents as coming from the BAM index, so the file is
            expected to be indexed.

    Returns:
        The open ``pysam.AlignmentFile`` (the caller is responsible for
        closing it), or ``None`` if the file could not be opened or its
        counts could not be read. Failures are reported with ``print``
        rather than raised.
    """
    bam_file = None
    try:
        bam_file = pysam.AlignmentFile(bam_path, "rb")
        print(f"Successfully opened BAM file: {bam_path}")
        print(f"Number of mapped reads: {bam_file.mapped}")
        print(f"Number of unmapped reads: {bam_file.unmapped}")
    except Exception as e:
        # Best-effort helper: report the problem and signal failure with None.
        print(f"Error opening BAM file: {e}")
        # The open itself may have succeeded before a later statement failed
        # (e.g. the index is missing); close the handle instead of leaking it.
        if bam_file is not None:
            bam_file.close()
        return None
    return bam_file

bam_path = "aligned_reads_sorted.bam"
bam_file = read_bam_file(bam_path)

# Preview at most the first five alignments, then release the handle.
# read_bam_file returns None on failure, in which case nothing is printed.
if bam_file:
    for i, read in zip(range(5), bam_file.fetch()):
        print(f"Read {i+1}: {read.query_name}, Position: {read.reference_start}, CIGAR: {read.cigarstring}")

    bam_file.close()

## 5. Error Detection Algorithms
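Let's implement two simple detectors and run them on our E. coli dataset: one reports read bases that differ from the reference (mismatches), and the other reports insertions and deletions (indels). Note that both report *differences from the reference*, so genuine variants in the sequenced strain are counted together with sequencing errors.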




In [None]:
import pysam
import numpy as np

def detect_mismatches(bam_file, reference_genome, chromosome, start, end):
    """Collect read bases that differ from the reference within a region.

    Args:
        bam_file: Open, indexed ``pysam.AlignmentFile`` to pile up.
        reference_genome: Open ``pysam.FastaFile`` holding ``chromosome``.
        chromosome: Name of the reference sequence to scan.
        start: 0-based start of the region (inclusive); ``None`` means the
            beginning of the sequence.
        end: 0-based end of the region (exclusive); ``None`` means the end
            of the sequence.

    Returns:
        A list of ``(reference_pos, ref_base, read_base)`` tuples, one per
        mismatching read base, with both bases upper-cased. Reads that
        have a deletion or a reference skip at a column are ignored there.
        Only reads that pass pysam's default pileup filters are seen.
    """
    # Fetch the region's reference once instead of one FASTA lookup per
    # pileup column. This holds the whole region in memory.
    region_start = start or 0
    region_sequence = reference_genome.fetch(chromosome, start, end).upper()

    mismatches = []
    # truncate=True restricts the columns to [start, end). By default pileup
    # also yields columns outside the region that overlapping reads cover.
    for pileup_column in bam_file.pileup(chromosome, start, end, truncate=True):
        position = pileup_column.reference_pos
        offset = position - region_start
        # Slicing (not indexing) yields "" past the end of the fetched
        # sequence, which is what a single-base fetch would return there.
        ref_base = region_sequence[offset:offset + 1]
        for pileup_read in pileup_column.pileups:
            # query_position is None for deletions and reference skips.
            if pileup_read.is_del or pileup_read.is_refskip:
                continue
            read_base = pileup_read.alignment.query_sequence[pileup_read.query_position].upper()
            if read_base != ref_base:
                mismatches.append((position, ref_base, read_base))
    return mismatches

def detect_indels(bam_file, chromosome, start, end):
    """Collect insertions and deletions reported by the pileup in a region.

    Args:
        bam_file: Open, indexed ``pysam.AlignmentFile`` to pile up.
        chromosome: Name of the reference sequence to scan.
        start: 0-based start of the region (inclusive).
        end: 0-based end of the region (exclusive).

    Returns:
        A list of ``(reference_pos, indel)`` tuples, one per read and
        column where ``PileupRead.indel`` is non-zero. Per the pysam
        documentation, ``indel`` is the length of the indel that follows
        the column: positive for an insertion, negative for a deletion.
    """
    # truncate=True restricts the columns to [start, end). By default pileup
    # also yields columns outside the region that overlapping reads cover.
    return [
        (pileup_column.reference_pos, pileup_read.indel)
        for pileup_column in bam_file.pileup(chromosome, start, end, truncate=True)
        for pileup_read in pileup_column.pileups
        if pileup_read.indel != 0
    ]

In [None]:
bam_path = "aligned_reads_sorted.bam"
reference_path = "Escherichia_coli_str_k_12_substr_mg1655.ASM584v2.dna.toplevel.fa"

# Name of the sequence to scan; it has to match a record in the reference
# FASTA and a contig in the BAM header.
chromosome = "Chromosome"
start = 0

# The context managers close both handles even if detection raises part-way.
with pysam.AlignmentFile(bam_path, "rb") as bam_file, \
        pysam.FastaFile(reference_path) as reference_genome:
    # Scan the whole sequence: [0, length).
    end = reference_genome.get_reference_length(chromosome)

    mismatches = detect_mismatches(bam_file, reference_genome, chromosome, start, end)
    indels = detect_indels(bam_file, chromosome, start, end)

print(f"Found {len(mismatches)} mismatches and {len(indels)} indels.")

## 6. Analyzing Error Patterns
Now, let's analyze the error patterns we've detected:


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

def analyze_mismatch_types(mismatches):
    """Count mismatches per substitution type.

    Args:
        mismatches: Iterable of ``(position, ref_base, read_base)`` tuples.

    Returns:
        A dict mapping ``"<ref_base>><read_base>"`` (e.g. ``"A>G"``) to the
        number of times it occurs, in order of first appearance.
    """
    tally = {}
    for _position, reference, observed in mismatches:
        label = f"{reference}>{observed}"
        if label in tally:
            tally[label] += 1
        else:
            tally[label] = 1
    return tally

def analyze_indel_sizes(indels):
    """Return the size (second field) of every indel record, in input order.

    Args:
        indels: Iterable of records whose item at index 1 is the indel size,
            e.g. ``(position, size)`` tuples.

    Returns:
        A new list with one size per record.
    """
    sizes = []
    for record in indels:
        sizes.append(record[1])
    return sizes

# Summarise both error classes before plotting.
mismatch_types = analyze_mismatch_types(mismatches)
indel_sizes = analyze_indel_sizes(indels)

# Bar chart: one bar per substitution type, written to a PNG file.
mismatch_fig, mismatch_ax = plt.subplots(figsize=(12, 6))
sns.barplot(
    x=list(mismatch_types.keys()),
    y=list(mismatch_types.values()),
    ax=mismatch_ax,
)
mismatch_ax.set_title("Mismatch Types in E. coli Genome")
mismatch_ax.set_xlabel("Mismatch Type")
mismatch_ax.set_ylabel("Count")
# Tilt the category labels so neighbouring ones do not overlap.
plt.setp(mismatch_ax.get_xticklabels(), rotation=45)
mismatch_fig.tight_layout()
mismatch_fig.savefig("ecoli_mismatch_types.png")
plt.close(mismatch_fig)

# Histogram of indel sizes with a KDE overlay, written to a PNG file.
indel_fig, indel_ax = plt.subplots(figsize=(12, 6))
sns.histplot(indel_sizes, bins=20, kde=True, ax=indel_ax)
indel_ax.set_title("Indel Size Distribution in E. coli Genome")
indel_ax.set_xlabel("Indel Size")
indel_ax.set_ylabel("Count")
indel_fig.savefig("ecoli_indel_sizes.png")
plt.close(indel_fig)

print("Analysis complete. Check the generated PNG files for visualizations.")

## 7. Implementing Quality Control Measures
Let's implement a simple quality-control filter: a read is kept only if both its mapping quality and its mean base quality reach a minimum threshold (20 by default for each).


In [None]:
def filter_low_quality_reads(bam_file, min_mapping_quality=20, min_base_quality=20):
    """Yield the reads that pass the mapping- and base-quality thresholds.

    Args:
        bam_file: Open, indexed ``pysam.AlignmentFile``; it is iterated with
            ``fetch()``.
        min_mapping_quality: Minimum ``mapping_quality`` a read must have.
        min_base_quality: Minimum mean Phred base quality a read must have.

    Yields:
        Each read whose mapping quality and mean base quality both reach
        their thresholds. Reads without stored base qualities are counted
        as filtered.

    Once the generator is exhausted it prints how many reads were filtered
    out of the total seen.
    """
    filtered_reads = 0
    total_reads = 0

    for read in bam_file.fetch():
        total_reads += 1
        # query_qualities already holds numeric Phred scores, so no
        # ASCII-offset decoding is needed. It is None when the read stores
        # no qualities, which must not crash the filter.
        base_qualities = read.query_qualities
        has_qualities = base_qualities is not None and len(base_qualities) > 0
        if (
            has_qualities
            and read.mapping_quality >= min_mapping_quality
            and sum(base_qualities) / len(base_qualities) >= min_base_quality
        ):
            yield read
        else:
            filtered_reads += 1

    print(f"Filtered {filtered_reads} out of {total_reads} reads.")

filtered_bam_path = "filtered_aligned_reads.bam"

# The context managers close the input and finalise the output even if
# filtering or writing raises part-way through. The output reuses the input's
# header through template=.
with pysam.AlignmentFile(bam_path, "rb") as bam_file, \
        pysam.AlignmentFile(filtered_bam_path, "wb", template=bam_file) as filtered_bam:
    for read in filter_low_quality_reads(bam_file):
        filtered_bam.write(read)

# Index only after the output file has been closed and is complete on disk.
pysam.index(filtered_bam_path)

print(f"Created filtered BAM file: {filtered_bam_path}")

## 8. Comparative Analysis
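Now, let's compare the error rates before and after quality control. Here the error rate is the number of detected mismatches plus indels, divided by the total number of read bases overlapping the region: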


In [None]:
def calculate_error_rate(bam_path, reference_path, chromosome, start, end):
    """Compute (mismatches + indels) per read base for a region of a BAM file.

    Args:
        bam_path: Path to an indexed BAM file.
        reference_path: Path to the reference FASTA the reads were aligned to.
        chromosome: Name of the reference sequence to scan.
        start: 0-based start of the region (inclusive).
        end: 0-based end of the region (exclusive).

    Returns:
        The number of detected mismatches plus indels, divided by the summed
        ``query_length`` of every read that ``fetch`` returns for the
        region. Returns ``0.0`` when no read bases are found, e.g. for an
        empty BAM file, instead of dividing by zero.
    """
    # The context managers close both handles even if detection raises.
    with pysam.AlignmentFile(bam_path, "rb") as bam_file, \
            pysam.FastaFile(reference_path) as reference_genome:
        mismatches = detect_mismatches(bam_file, reference_genome, chromosome, start, end)
        indels = detect_indels(bam_file, chromosome, start, end)

        # NOTE(review): the denominator counts whole reads that overlap the
        # region, so bases outside it are included; the rate is therefore an
        # approximation. Confirm this is the intended definition.
        total_bases = sum(read.query_length for read in bam_file.fetch(chromosome, start, end))

    if total_bases == 0:
        return 0.0
    return (len(mismatches) + len(indels)) / total_bases

# Error rate of the raw alignments versus the quality-filtered ones.
original_error_rate = calculate_error_rate(bam_path, reference_path, chromosome, start, end)
filtered_error_rate = calculate_error_rate(filtered_bam_path, reference_path, chromosome, start, end)

print(f"Original error rate: {original_error_rate:.6f}")
print(f"Filtered error rate: {filtered_error_rate:.6f}")

labels = ['Original', 'Filtered']
error_rates = [original_error_rate, filtered_error_rate]

# Side-by-side bars for the two rates, written to a PNG file.
comparison_fig, comparison_ax = plt.subplots(figsize=(8, 6))
sns.barplot(x=labels, y=error_rates, ax=comparison_ax)
comparison_ax.set_title("Error Rate Comparison in E. coli Genome")
comparison_ax.set_ylabel("Error Rate")
comparison_fig.savefig("ecoli_error_rate_comparison.png")
plt.close(comparison_fig)

print("Comparison complete. Check the generated PNG file for visualization.")