In [42]:
import sys
import os
import math
import matplotlib.pyplot as plt
import numpy as np
from scipy.special import logsumexp

from io import StringIO
from Bio import Phylo
from Bio import Align
from Bio.Align import PairwiseAligner
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

import pandas as pd
import numpy as np

from copy import deepcopy
from itertools import product
from itertools import combinations
import pickle

In [86]:
def align_sequences(reference, sequences):
    """
    Align a list of sequences to a reference genome using pairwise alignment.
    Ensures all aligned sequences have the same length as the reference genome.
    """
    aligner = PairwiseAligner()
    aligner.mode = "global"  # Use global alignment (can be adjusted to 'local' if needed)
    aligned_records = []
    
    for seq_record in sequences:
        alignment = aligner.align(reference, seq_record.seq)
        best_alignment = alignment[0]  # Get the best alignment
        
        # Extract aligned reference and query sequences as strings
        aligned_ref = str(best_alignment.target)
        aligned_query = str(best_alignment.query)

        # Ensure both sequences have the same length as the reference
        if len(aligned_ref) != len(reference):
            raise ValueError("Alignment length mismatch with reference genome.")
        
        aligned_records.append(
            SeqRecord(Seq(aligned_query), id=seq_record.id, description="aligned")
        )
    return aligned_records

In [87]:
# File paths
reference_file = "../dat/COVID/EPI_ISL_402124.fasta"  # Replace with the path to your reference genome file
query_file = "../dat/COVID/1736547195158.sequences.fasta"      # Replace with the path to your query sequences file

# Load the reference genome
reference_record = SeqIO.read(reference_file, "fasta")
reference_seq = reference_record.seq

# Load the query sequences
query_sequences = list(SeqIO.parse(query_file, "fasta"))

# Perform alignments
alignments = align_sequences(reference_seq, query_sequences[0:10])


In [88]:
SeqIO.write(alignments, "../dat/COVID/aligned.fasta", "fasta")

10

In [95]:
len(str(alignments[0].seq))

29903

In [96]:
len(str(alignments[1].seq))

29682

In [99]:
alignments[0].seq

Seq('ANNNAAGGTTTATACCTTCCTAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA')

In [102]:
len(reference_record.seq)

29891

In [103]:
alignments[0]

SeqRecord(seq=Seq('ANNNAAGGTTTATACCTTCCTAGGTAACAAACCAACCAACTTTCGATCTCTTGT...AAA'), id='hCoV-19/USA/TX-TAMGHRC-SHS-113823/2024', name='<unknown name>', description='aligned', dbxrefs=[])

In [104]:
alignments[0].target

AttributeError: 'SeqRecord' object has no attribute 'target'

In [106]:
alignment = aligner.align(reference_record, query_sequences[0].seq)

In [109]:
print(alignment[0])

hCoV-19/W         0 ATT---AAAGGTTTATACCTTCCC-AGGTAACAAACCAACCAACTTTCGATCTCTTGTAG
                  0 |-----||-||||||||||||||--|||||||||||||||||||||||||||||||||||
query             0 A--NNNAA-GGTTTATACCTTCC-TAGGTAACAAACCAACCAACTTTCGATCTCTTGTAG

hCoV-19/W        56 ATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTG
                 60 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query            56 ATCTGTTCTCTAAACGAACTTTAAAATCTGTGTGGCTGTCACTCGGCTGCATGCTTAGTG

hCoV-19/W       116 CACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTC
                120 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           116 CACTCACGCAGTATAATTAATAACTAATTACTGTCGTTGACAGGACACGAGTAACTCGTC

hCoV-19/W       176 TATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAG
                180 ||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||
query           176 TATCTTCTGCAGGCTGCTTACGGTTTCGTCCGTGTTGCAGCCGATCATCAGCACATCTAG

hCoV-19/W       236 GTTT