# In [6]:
import pandas as pd
import pysam
from collections import Counter

# Input and output locations used by this analysis
sam_file_path = "/home/abozar/pathogenereads/blast/aligned.sam"
snp_output_path = "/home/abozar/pathogenereads/SNP/SNP_output.xlsx"
reference_path = "/home/abozar/pathogenereads/blast/fetched_sequences.fasta"

# Alignment handle in text-SAM read mode; it is closed at the end of the script
samfile = pysam.AlignmentFile(sam_file_path, "r")

# Load the reference file as one raw string, exactly as stored on disk
# (nothing is stripped or parsed here)
with open(reference_path) as fasta_handle:
    reference_genome = fasta_handle.read()

# Helper: turn reference text into per-sequence strings of bases only
def _parse_reference(reference_text):
    """Split reference text into a ``{name: sequence}`` mapping.

    FASTA header lines (``>name description``) start a new record whose name
    is the first whitespace-delimited token; line breaks inside a record are
    removed. Sequence text that precedes any header (for example a plain,
    header-less sequence string) is stored under the key ``None``.
    """
    records = {}
    name = None
    chunks = []
    for raw_line in reference_text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if line.startswith('>'):
            # Flush the record collected so far before starting a new one
            if chunks or name is not None:
                records[name] = ''.join(chunks)
            fields = line[1:].split()
            name = fields[0] if fields else ''
            chunks = []
        else:
            chunks.append(line)
    if chunks or name is not None:
        records[name] = ''.join(chunks)
    return records


# Function to find SNPs
def find_snps(samfile, reference_genome):
    """Collect single-base mismatches between aligned reads and the reference.

    Args:
        samfile: An open ``pysam.AlignmentFile`` (or any object whose
            ``fetch()`` yields reads exposing ``is_unmapped``,
            ``reference_name``, ``query_sequence`` and
            ``get_aligned_pairs``).
        reference_genome: Reference as a string. It may be a bare sequence
            or the raw text of a FASTA file; header lines and line breaks
            are removed before positions are looked up. For multi-record
            FASTA text the record matching ``read.reference_name`` is used.

    Returns:
        dict mapping a 0-based reference position to
        ``{'ref_base': str, 'query_bases': Counter, 'depth': int}`` where
        ``query_bases`` counts the mismatching read bases seen at that
        position and ``depth`` is the number of reads with an aligned base
        there (matching or not). Bases are reported in upper case.

    NOTE: keys are bare positions, as before, so existing callers keep
    working; if reads align to several reference sequences, equal
    coordinates on different sequences end up in the same entry.
    """
    references = _parse_reference(reference_genome)
    # With exactly one sequence, use it for every read regardless of its name
    sole_sequence = next(iter(references.values())) if len(references) == 1 else None

    snps = {}
    depth = Counter()
    # until_eof=True iterates the file sequentially and needs no index
    for read in samfile.fetch(until_eof=True):
        # Skip unmapped reads
        if read.is_unmapped:
            continue

        # Records such as secondary alignments may carry no sequence at all
        query_seq = read.query_sequence
        if not query_seq:
            continue

        ref_seq = references.get(read.reference_name, sole_sequence)
        if ref_seq is None:
            continue
        ref_length = len(ref_seq)

        # Walk the CIGAR-derived (query index, reference index) pairs so that
        # soft clips, insertions and deletions do not shift the comparison
        for query_pos, ref_pos in read.get_aligned_pairs(matches_only=True):
            if ref_pos >= ref_length:
                # Alignment runs past the supplied reference; nothing to compare
                continue
            ref_base = ref_seq[ref_pos].upper()
            query_base = query_seq[query_pos].upper()
            depth[ref_pos] += 1
            if ref_base != query_base:
                entry = snps.setdefault(
                    ref_pos, {'ref_base': ref_base, 'query_bases': Counter()}
                )
                entry['query_bases'][query_base] += 1

    # Attach the total read depth to every position that showed a mismatch
    for position, entry in snps.items():
        entry['depth'] = depth[position]
    return snps

# Find SNPs; the alignment file is only needed for this step, so release it
# right afterwards even if the scan fails
try:
    snps = find_snps(samfile, reference_genome)
finally:
    samfile.close()

# Prepare data for Excel output, one row per (position, alternative allele).
# Positions are reported exactly as keyed in `snps` (pysam coordinates are
# 0-based) and rows are emitted in ascending position order.
output_columns = ['Position', 'Reference', 'Allele', 'Frequency', 'Coverage']
snp_data = []
for position in sorted(snps):
    info = snps[position]
    mismatching_reads = sum(info['query_bases'].values())
    # Use the per-position read depth when the record carries one ('depth');
    # otherwise only the mismatching reads are known, so fall back to their total
    coverage = info.get('depth', mismatching_reads)
    for query_base, count in info['query_bases'].items():
        snp_data.append({
            'Position': position,
            'Reference': info['ref_base'],
            'Allele': query_base,
            'Frequency': count / coverage,
            'Coverage': coverage,
        })

# Create a DataFrame and save as Excel; naming the columns explicitly keeps
# the header row in the sheet even when no SNP was found
df = pd.DataFrame(snp_data, columns=output_columns)
df.to_excel(snp_output_path, index=False)

# Print a success message
print(f"SNP discovery is complete. Detailed SNP information is saved in {snp_output_path}")


# Output:
# SNP discovery is complete. Detailed SNP information is saved in /home/abozar/pathogenereads/SNP/SNP_output.xlsx
