In [59]:
import re
import os
import tempfile
from tqdm import tqdm
from collections import defaultdict
import Bio
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.Align.Applications import MafftCommandline
from pymsaviz import MsaViz

In [5]:
# Directory that is listed for input files by the loading cell below.
# Despite the name, this is not the process working directory; it is an
# absolute path to a mounted volume, so the notebook only runs where that
# mount is available.
# NOTE(review): presumably one consensus FASTA per isolate lives here - confirm.
cwd='/Volumes/Diane-Saunders/main/MARPLE/PST/consensus'

In [6]:
# Collect, for every file in `cwd`, the line that follows the header of the
# target gene ('jgi.p|Pucstr1|474'). Result: {isolate key: sequence string}.
# NOTE(review): only the single line after the header is read, so this assumes
# unwrapped (one-line-per-record) FASTA - confirm against the input files.
seq_dict = {}
for file in tqdm(os.listdir(cwd)):
    path = os.path.join(cwd, file)
    # os.listdir also returns sub-directories, which cannot be opened as text.
    if not os.path.isfile(path):
        continue
    with open(path, 'r') as f:
        for line in f:
            if 'jgi.p|Pucstr1|474\n' in line:
                # A header on the very last line has no sequence after it;
                # next() with a default avoids an unhandled StopIteration.
                seq_line = next(f, None)
                if seq_line is not None:
                    # The isolate key is the fourth '_'-separated field from
                    # the end of the file name; files sharing that field
                    # overwrite each other in seq_dict.
                    isol = file.split('_')[-4]
                    seq_dict[isol] = seq_line.strip()
                # Only the first matching record per file is used.
                break

100%|██████████| 434/434 [20:43<00:00,  2.86s/it]  


In [63]:
# Count how many isolates carry each distinct sequence. Dicts preserve
# insertion order, so the keys of seq_counts end up in first-seen order.
seq_counts = defaultdict(int)
for seq in tqdm(seq_dict.values()):
    seq_counts[seq] += 1

# Label every distinct sequence H1, H2, ... in first-seen order and pair it
# with its isolate count: {'H1': (sequence, count), ...}.
# Building this from seq_counts in one pass replaces the repeated linear scans
# of uniq_seqs.values() and the nested matching loop, and no longer reuses a
# single `count` variable for both the label index and the isolate count.
uniq_seqs = {
    f'H{index}': (seq, n_isolates)
    for index, (seq, n_isolates) in enumerate(seq_counts.items(), start=1)
}


100%|██████████| 434/434 [00:00<00:00, 828174.68it/s]


In [99]:
# Keep only sequences without any '?' character, renumber them, print each
# record and write them all to haplotypes.fna.
# Result: haps = {'H01': sequence, 'H02': sequence, ...}.
# NOTE(review): 'H0' is a literal prefix, not zero padding - the tenth
# haplotype is named 'H010'. Left unchanged so existing labels stay stable.
haps = {}
new_index = 0

# Mode 'w' truncates any file left over from a previous run, so the file is
# opened once instead of being removed and then re-opened in append mode for
# every record. (An empty file is created when no sequence passes the filter.)
with open('haplotypes.fna', 'w') as f:
    for seq, count in uniq_seqs.values():
        # Empty sequences are skipped explicitly: a coverage ratio over
        # len(seq) would divide by zero for them.
        if not seq or '?' in seq:
            continue
        new_index += 1
        record = f'>H0{new_index}({count})\n{seq}'
        print(record)
        haps[f'H0{new_index}'] = seq
        f.write(f'{record}\n')

>H01(282)
ATGAAGACTTCAATCCAGTTGGTTATTGTCATCGGCACACTCATCACAGCTTGTGCAGGATTGATTTCAACCATCACACATGAGAACATCGAATATAATATCGTCAACGCACTAACCGACCTTCATGGGGTCGATGGTCCACTCCATAATTTTGTCTCCGATAATGCAGTTCGAATTGGACACACCAATATTGCAAATGATTCACCCAGAAAGGTCGGCTTCTACACCAACTTGCCTGGAAGCACATATGTATTGTGGTTAGATCCCGGGGAAAAAGGTACTCTACATTTCAACAAGGATTATCCCTGGGTATTGGCGTCTGGAGAGCCTCGAGCCAATTTAAACAAGGAGATATACGAGATCCTTGTACCCCAGGTAATCAACATGAACGAGGACGTTTTGGGCATAATGACTTTTCTTCGCTCTTTGCGAGATTAA
>H02(6)
ATGAAGACTTCAATCCAGTTGGTTATTGTCATCGGCACACTCATCACAGCTTGTGCAGGATTGATTTCAACCATCACACATGAGAACATCGAATATAATATCGTCAACGCACTAACCGACCTTCATGGGGTCGATGGTCCACTCCATAATTTTGTCTCCGATAATGCAGTTCGAATTRGACACACCAATATTGCAAATGATTCACCCAGAAAGGTCGGCTTCTACACCAACTTGCCTGGAAGCACATATGTATTGTGGTTAGATCCCGGGGAAAAAGGTACTCTACATTTCAACAAGGATTATCCCTGGGTATTGGCGTCTGGAGAGCCTCGAGCCAATTTAAACAAGGAGATATACGAGATCCTTGTACCCCAGGTAATCAACATGAACGAGGACGTTTTGGGCATAATGACTTTTCTTCGCTCTTTGCGAGATTAA
>H03(3)
ATGAAGACTTCAATCCAGTTGGTTATTGTCATCGGCACACTCATCACAGCTTGTGCAGGATTGATTTCAACCATCACACATGAGAACATCGAATAT

In [93]:
# Translate every retained haplotype and keep each distinct translation once,
# labelled I01, I02, ... in first-seen order. Each new translation is printed
# and written to isoforms.fna. Result: isof = {'I01': Seq, 'I02': Seq, ...}.
# NOTE(review): the records are protein translations but the file keeps the
# '.fna' name (conventionally nucleotide FASTA) so existing consumers still
# find it - confirm whether a rename is wanted.
isof = {}
isof_count = 0
seen_translations = set()  # plain strings, for constant-time duplicate checks

# Mode 'w' truncates output from a previous run; the file is opened once
# rather than removed and re-opened in append mode per record. (An empty file
# is created when `haps` is empty.)
with open('isoforms.fna', 'w') as f:
    for seq in haps.values():
        aa = Seq(seq).translate()
        aa_text = str(aa)
        if aa_text in seen_translations:
            continue
        seen_translations.add(aa_text)
        isof_count += 1
        print(f'I0{isof_count}\t{aa}')
        isof[f'I0{isof_count}'] = aa
        f.write(f'>I0{isof_count}\n{aa}\n')

I01	MKTSIQLVIVIGTLITACAGLISTITHENIEYNIVNALTDLHGVDGPLHNFVSDNAVRIGHTNIANDSPRKVGFYTNLPGSTYVLWLDPGEKGTLHFNKDYPWVLASGEPRANLNKEIYEILVPQVINMNEDVLGIMTFLRSLRD*
I02	MKTSIQLVIVIGTLITACAGLISTITHENIEYNIVNALTDLHGVDGPLHNFVSDNAVRIXHTNIANDSPRKVGFYTNLPGSTYVLWLDPGEKGTLHFNKDYPWVLASGEPRANLNKEIYEILVPQVINMNEDVLGIMTFLRSLRD*
I03	MKTSIQLVIVIGTLITACAGLISTITHENIEYNIVNALTDLHGVDGPLHNFVSDNAVRIRHTNIANDSPRKVGFYTNLPGSTYVLWLDPGEKGTLHFNKDYPWVLASGEPRANLNKEIYEILVPQVINMNEDVLGIMTFLRSLRD*
I04	MKTSIQLVIVIGTLITACAGLISTITHENIEYNIVNALTDLHGVDGPLHNFVSDNXVRIGHTNIANDSPRKVGFYTNLPGSTYVLWLDPGEKGTLHFNKDYPWVLASGEPRANLNKEIYEILVPQVINMNEDVLGIMTFLRSLRD*


In [101]:
from shutil import copyfile

def align_seqs(sampleid, sample_dict, both=True):
    """Align sequences with MAFFT, save the alignment and a plot, and return it.

    Parameters
    ----------
    sampleid : str
        Output prefix; ``{sampleid}.aln`` and ``{sampleid}.png`` are written
        relative to the current working directory.
    sample_dict : dict
        Maps a record name to its sequence; every item is written as one
        FASTA record of the MAFFT input.
    both : bool, optional
        Not used by the function body; kept so existing calls keep working.

    Returns
    -------
    dict
        Maps each record id in the alignment file to its aligned sequence as
        a string. (Previously this dict was built and then discarded, so the
        function returned None.)
    """
    temp_paths = []
    try:
        # delete=False keeps the file on disk after the handle is closed so
        # MAFFT can read it by name; it is removed in the `finally` below.
        with tempfile.NamedTemporaryFile(mode="w+", delete=False) as temp_input:
            temp_paths.append(temp_input.name)
            for sample, qseq in sample_dict.items():
                temp_input.write(f">{sample}\n{qseq}\n")
        # Leaving the `with` block closes (and therefore flushes) the input,
        # so no per-record flush is needed before MAFFT runs.

        temp_output_name = temp_input.name + ".aln"
        temp_paths.append(temp_output_name)

        mafft_cline = MafftCommandline(input=temp_input.name)
        stdout, _ = mafft_cline()

        # upper() is applied to the whole MAFFT output, header lines included,
        # so record names containing lower-case letters are upper-cased too.
        with open(temp_output_name, "w") as temp_output:
            temp_output.write(stdout.upper())

        copyfile(temp_output_name, f"{sampleid}.aln")
        mv = MsaViz(temp_output_name, show_count=True)
        mv.savefig(f"{sampleid}.png")

        return {
            record.id: str(record.seq)
            for record in SeqIO.parse(temp_output_name, 'fasta')
        }
    finally:
        # Remove both temporary files, including when MAFFT or plotting fails.
        for path in temp_paths:
            if os.path.exists(path):
                os.remove(path)


In [102]:
# Align the retained haplotypes in `haps`; with the prefix 'haplotypes' this
# writes haplotypes.aln and haplotypes.png.
align_seqs('haplotypes', haps)