In [15]:
import os
import subprocess
from tqdm import tqdm
from Bio import SeqIO
from io import StringIO
from Bio.Align.Applications import MafftCommandline

In [3]:
# Working directory that every other path in this notebook is joined onto.
# The default is the original analysis location; set the PBER_WOLB_PHYLO_WD
# environment variable to run the notebook against a copy of the data elsewhere.
wd = os.environ.get('PBER_WOLB_PHYLO_WD') or (
    '/Volumes/Diane-Saunders/loizos/Pber/assemble_Pb01_genome/14_wolbachia/2-phylo'
)
# FASTA file (inside wd) whose record ids and sequence lengths are printed below.
consensus = 'all_Pber_wolb_cons.fna'
# Directory (relative to wd) holding the per-sample FASTA files; the sample name
# is taken from the part of each file name before the first underscore.
pre_comb = '../1-extract_contigs/consensus'

In [4]:
# Report the id and sequence length of every record in the combined consensus FASTA.
consensus_path = os.path.join(wd, consensus)
with open(consensus_path, 'r') as handle:
    for rec in SeqIO.parse(handle, 'fasta'):
        rec_length = len(rec.seq)
        print(rec.id, rec_length)

Pb01 1455753
Pb02 1455752
Pb03 1455915
Pb04 1455899
Pb06 1455762
Pb07 1455760
Pb08 1455905
Pb09 1455911
Pb10 1455802


In [None]:
# Load every per-sample FASTA found under pre_comb into a nested mapping:
#     seq_dict[sample][record.id] = record.seq
# where `sample` is the part of the file name before the first underscore.
seq_dict = {}
comb_dir = os.path.join(wd, pre_comb)
# os.listdir() returns entries in arbitrary order; sorting makes the sample
# order (and therefore the order of the files written later) reproducible.
for file in sorted(os.listdir(comb_dir)):
    path = os.path.join(comb_dir, file)
    # Skip hidden entries (e.g. .DS_Store or ._* files on a macOS volume) and
    # sub-directories: they are not FASTA files and would either fail to
    # decode or add a bogus sample name.
    if file.startswith('.') or not os.path.isfile(path):
        continue
    sample = file.split('_')[0]
    with open(path, 'r') as f:
        for record in SeqIO.parse(f, 'fasta'):
            # The sample entry is created only once a record is actually seen,
            # so empty files do not add empty samples.
            seq_dict.setdefault(sample, {})[record.id] = record.seq

In [19]:
# Invert seq_dict (sample -> contig -> sequence) into
# contig_dict (contig -> sample -> sequence) so each contig can be handled
# across all samples at once.
contig_dict = {}
for sample_name, sample_contigs in seq_dict.items():
    for cid, sequence in sample_contigs.items():
        per_sample = contig_dict.setdefault(cid, {})
        per_sample[sample_name] = sequence

In [20]:
# For every contig whose sequences differ in length between samples, align the
# per-sample sequences with MAFFT and store the aligned (gapped) sequences back
# into seq_dict[sample][contig_id]. Contigs whose sequences all share one
# length are left untouched.
for contig_id, samples in tqdm(contig_dict.items()):
    lengths = {len(seq) for seq in samples.values()}
    if len(lengths) <= 1:
        # All samples already have the same length for this contig.
        continue

    temp_path = f"{contig_id}_temp.fasta"
    try:
        # Write sequences to a temporary file for alignment
        with open(temp_path, 'w') as temp_fasta:
            for sample, seq in samples.items():
                temp_fasta.write(f">{sample}\n{seq}\n")

        print(f"Aligning {contig_id}...")
        # Align sequences using MAFFT; the alignment is returned on stdout
        mafft_cline = MafftCommandline(input=temp_path)
        stdout, stderr = mafft_cline()
        print(f"Finished aligning {contig_id}")
    finally:
        # Remove the temporary file even when writing or MAFFT fails, so a
        # failed run does not leave *_temp.fasta files behind.
        if os.path.exists(temp_path):
            os.remove(temp_path)

    # Parse the whole alignment first, so seq_dict is never left half-updated.
    aligned = {
        aligned_record.id: aligned_record.seq
        for aligned_record in SeqIO.parse(StringIO(stdout), 'fasta')
    }
    if set(aligned) != set(samples):
        raise RuntimeError(
            f"MAFFT output for {contig_id} does not contain exactly the input samples: "
            f"expected {sorted(samples)}, got {sorted(aligned)}"
        )

    # Replace original sequences with aligned sequences
    for sample, aligned_seq in aligned.items():
        seq_dict[sample][contig_id] = aligned_seq

  0%|          | 0/5 [00:00<?, ?it/s]

Aligning contig_10393_pilon...


 20%|██        | 1/5 [04:23<17:35, 263.89s/it]

Finished aligning contig_10393_pilon
Aligning contig_6635_pilon...


 40%|████      | 2/5 [04:38<05:52, 117.38s/it]

Finished aligning contig_6635_pilon
Aligning contig_6636_pilon...


 60%|██████    | 3/5 [04:58<02:25, 72.97s/it] 

Finished aligning contig_6636_pilon
Aligning contig_6637_pilon...


 80%|████████  | 4/5 [12:08<03:33, 213.62s/it]

Finished aligning contig_6637_pilon
Aligning contig_6638_pilon...


100%|██████████| 5/5 [12:08<00:00, 145.69s/it]

Finished aligning contig_6638_pilon





In [None]:
# Write two FASTA files into wd:
#   * correctly_aligned_sequences.fna - one record per sample, all contigs
#     concatenated into a single sequence;
#   * separated_aligned_sequences.fna - one record per (sample, contig).
#
# The concatenated rows only line up if every sample contributes the same
# contigs, in the same order, with the same width. So first derive one contig
# order (first-seen order across samples) and the width of each contig.
contig_order = []
contig_width = {}
for sample, contigs in seq_dict.items():
    for contig_id, seq in contigs.items():
        if contig_id not in contig_width:
            contig_order.append(contig_id)
            contig_width[contig_id] = len(seq)
        elif len(seq) != contig_width[contig_id]:
            # Raised before any file is opened, so no truncated output is left.
            raise ValueError(
                f"{contig_id} has length {len(seq)} in {sample} but "
                f"{contig_width[contig_id]} in an earlier sample; "
                "run the alignment step before writing the concatenated file"
            )

with open(os.path.join(wd,'correctly_aligned_sequences.fna'), 'w') as output1, open(os.path.join(wd,'separated_aligned_sequences.fna'), 'w') as output2:
    for sample, contigs in seq_dict.items():
        output1.write(f">{sample}\n")
        for contig_id in contig_order:
            seq = contigs.get(contig_id)
            if seq is None:
                # Sample lacks this contig: pad with gaps so the following
                # contigs stay in register with the other samples.
                output1.write('-' * contig_width[contig_id])
                continue
            output1.write(f"{seq}")
            output2.write(f">{sample}_{contig_id}\n{seq}\n")
        output1.write("\n")