In [1]:
from typing import Union
from pathlib import Path


from Bio.Align import MultipleSeqAlignment as MSA
from Bio import AlignIO
from Bio.Seq import Seq, MutableSeq
from Bio.SeqRecord import SeqRecord

In [2]:
# Dataset-size tag; it selects both the input folder and the file name below.
NSEQS = 100

# Input alignment, read as FASTA further down in the notebook.
path_msa = f"/data/analysis-paper/msas/paper/{NSEQS}/{NSEQS}-sars-cov-2-ena.fa"

## Pangeblocks

In [3]:
# Characters kept as-is in the Pangeblocks version of the alignment.
ALPHABET_PG = ["a", "c", "g", "t", "n", "-"]

# Output file for the Pangeblocks version; make sure its folder exists.
path_save_pg = f"/data/analysis-paper/msas/paper/{NSEQS}-pangeblocks/{NSEQS}-sars-cov-2-ena.fa"
Path(path_save_pg).parent.mkdir(parents=True, exist_ok=True)

In [4]:
# Load the input alignment (FASTA) and record its dimensions.
msa = AlignIO.read(path_msa, "fasta")
n_seqs = len(msa)                    # number of rows (sequences)
n_cols = msa.get_alignment_length()  # number of alignment columns

In [5]:
# Display the alignment dimensions as (columns, sequences).
n_cols, n_seqs

(29904, 100)

In [67]:
# Build a copy of the alignment restricted to the Pangeblocks alphabet:
# every character that is not in ALPHABET_PG is replaced by "n".
# NOTE(review): the membership test is case-sensitive, so uppercase bases
# would also be turned into "n" -- confirm the input MSA is lowercase.
records = []

# Iterate over the alignment itself rather than range(n_seqs), so the loop
# cannot go out of sync with an `msa` that was reloaded after n_seqs was set.
for record in msa:
    sequence = str(record.seq)
    # Only the distinct characters of the row need to be checked.
    for char in set(sequence):
        if char not in ALPHABET_PG:
            sequence = sequence.replace(char, "n")

    records.append(
        SeqRecord(
            seq=Seq(sequence),
            id=record.id,
            name=record.name,
            description=record.description,
        )
    )
new_msa = MSA(records)

In [68]:
# Print the text summary of the alignment currently bound to new_msa.
print(new_msa)

Alignment with 50 rows and 29903 columns
----aaggtntataccttcccaggtaacaaaccaaccaactttc...--- ENA|MW565758|MW565758.1
----------tataccttcccaggtaacaaaccaaccaactttc...--- ENA|MW565759|MW565759.1
--------------------------------------actttc...--- ENA|MW565760|MW565760.1
---------------------------------------ctttc...--- ENA|MW565761|MW565761.1
-----------ataccttcccaggtaacaaaccaaccaactttc...--- ENA|MT970305|MT970305.1
--------------------------------------------...--- ENA|MT811295|MT811295.1
--------------------------------------------...--- ENA|MT407656|MT407656.1
------------taccttcccaggtaacaaaccaaccaactttc...--- ENA|MW626827|MW626827.1
----------------------ggtaacaaaccaaccaactttc...--- ENA|MT520494|MT520494.1
--------------------------------------------...--- ENA|MW420377|MW420377.1
------ggtttataccttcccaggtaacaaaccaaccaactttc...--- ENA|MT520495|MT520495.1
--------------------------------------------...--- ENA|MW420388|MW420388.1
----aaggtttataccttcccaggtaacaaaccaaccaactttc...--- ENA|MT97

In [69]:
# Write new_msa to path_save_pg in FASTA format.
# new_msa is reassigned in the VG section, so run this right after the
# Pangeblocks transform cell.
with open(path_save_pg, mode="w") as handle:
    AlignIO.write(new_msa, handle, "fasta")

## VG

In [70]:
# Characters kept as-is in the VG version of the alignment (no "n").
ALPHABET_VG = ["a", "c", "g", "t", "-"]

# Output file for the VG version; make sure its folder exists.
path_save_vg = f"/data/analysis-paper/msas/paper/{NSEQS}-vg/{NSEQS}-sars-cov-2-ena.fa"
Path(path_save_vg).parent.mkdir(parents=True, exist_ok=True)


In [71]:
# Build a copy of the alignment restricted to the VG alphabet:
# every character that is not in ALPHABET_VG is replaced by the gap "-".
# NOTE(review): the membership test is case-sensitive, so uppercase bases
# would also be turned into "-" -- confirm the input MSA is lowercase.
records = []

# Iterate over the alignment itself rather than range(n_seqs), so the loop
# cannot go out of sync with an `msa` that was reloaded after n_seqs was set.
for record in msa:
    sequence = str(record.seq)
    # Only the distinct characters of the row need to be checked.
    for char in set(sequence):
        if char not in ALPHABET_VG:
            sequence = sequence.replace(char, "-")

    records.append(
        SeqRecord(
            seq=Seq(sequence),
            id=record.id,
            name=record.name,
            description=record.description,
        )
    )
new_msa = MSA(records)

In [72]:
# Write new_msa to path_save_vg in FASTA format.
# new_msa is also assigned in the Pangeblocks section, so run this right
# after the VG transform cell.
with open(path_save_vg, mode="w") as handle:
    AlignIO.write(new_msa, handle, "fasta")

In [73]:
# Print the text summary of the alignment currently bound to new_msa.
print(new_msa)

Alignment with 50 rows and 29903 columns
----aaggt-tataccttcccaggtaacaaaccaaccaactttc...--- ENA|MW565758|MW565758.1
----------tataccttcccaggtaacaaaccaaccaactttc...--- ENA|MW565759|MW565759.1
--------------------------------------actttc...--- ENA|MW565760|MW565760.1
---------------------------------------ctttc...--- ENA|MW565761|MW565761.1
-----------ataccttcccaggtaacaaaccaaccaactttc...--- ENA|MT970305|MT970305.1
--------------------------------------------...--- ENA|MT811295|MT811295.1
--------------------------------------------...--- ENA|MT407656|MT407656.1
------------taccttcccaggtaacaaaccaaccaactttc...--- ENA|MW626827|MW626827.1
----------------------ggtaacaaaccaaccaactttc...--- ENA|MT520494|MT520494.1
--------------------------------------------...--- ENA|MW420377|MW420377.1
------ggtttataccttcccaggtaacaaaccaaccaactttc...--- ENA|MT520495|MT520495.1
--------------------------------------------...--- ENA|MW420388|MW420388.1
----aaggtttataccttcccaggtaacaaaccaaccaactttc...--- ENA|MT97