In [13]:
# Author: Edgars Liepa
# Date: 6.5.22 
# Description: Create a list of amino acid substitutions from multiple sequences
# in a single FASTA file.
# The program first creates a pairwise alignment to the reference sequence
# and then counts the changed positions.
# 
# Dependencies: BioPython 

In [1]:
from Bio import SeqIO
from datetime import datetime

In [2]:
# Read every entry of the FASTA alignment into a list so the sequences can be
# indexed; the first entry is treated as the reference further down.
records = [record for record in SeqIO.parse("spike.aln", "fasta")]

In [3]:
# The first record in the file is the reference sequence: show its id and,
# on the next line, its description.
print(records[0].id, records[0].description, sep="\n")

Spike|hCoV-19/Wuhan/WIV04/2019|2019-12-30|EPI_ISL_402124|Original|hCoV-19^^Hubei|Human|Wuhan
Spike|hCoV-19/Wuhan/WIV04/2019|2019-12-30|EPI_ISL_402124|Original|hCoV-19^^Hubei|Human|Wuhan Jinyintan Hospital|Wuhan Institute of Virology|Shi|China


In [6]:
# Per-sequence substitution maps; filled in while the sequences are compared.
haploTypeList = []

# One dict per position of the reference sequence, seeded with the reference
# letter at a count of 0 so it can be told apart from counted substitutions.
mutationList = [{letter: 0} for letter in records[0].seq]

In [7]:
# Count substitutions of every sequence relative to the reference (records[0]).
# NOTE: counts are accumulated in place in mutationList / haploTypeList, so the
# cell that initialises those lists must be re-run before re-running this one.
reference_seq = records[0].seq

for record in records[1:]:
    # Maps 0-based position -> [substituted letter] for this sequence only.
    haploType = {}
    for index, letter in enumerate(record.seq):
        if letter == reference_seq[index]:
            continue  # position unchanged relative to the reference
        position_counts = mutationList[index]
        # First occurrence starts at 1, later occurrences are incremented.
        position_counts[letter] = position_counts.get(letter, 0) + 1
        haploType[index] = [letter]
    # Record each sequence's substitutions exactly once. This used to sit
    # inside the inner loop, appending the same dict once per position.
    haploTypeList.append(haploType)
    print(haploType)

# Print Results
# The reference letter keeps its count of 0 and is therefore skipped.
for index, position in enumerate(mutationList):
    for aa, count in position.items():
        if count != 0:
            print("Postion: ", index+1, "aaSubst: ", aa, "Count: ", count)

{0: ['A'], 11: ['F'], 489: ['Y'], 613: ['G']}
{0: ['B'], 360: ['T'], 613: ['G'], 653: ['Q'], 761: ['*']}
{0: ['C'], 476: ['N'], 613: ['G']}
{0: ['C'], 613: ['G'], 784: ['L'], 806: ['T'], 807: ['N'], 1107: ['I'], 1109: ['M'], 1110: ['N'], 1111: ['H'], 1112: ['K'], 1113: ['S'], 1114: ['L'], 1115: ['L'], 1116: ['Q'], 1117: ['T'], 1118: ['T'], 1119: ['H'], 1120: ['L'], 1121: ['C'], 1122: ['L'], 1123: ['V'], 1124: ['T'], 1125: ['V'], 1126: ['M'], 1127: ['L'], 1128: ['*']}
Postion:  1 aaSubst:  A Count:  1
Postion:  1 aaSubst:  B Count:  1
Postion:  1 aaSubst:  C Count:  2
Postion:  12 aaSubst:  F Count:  1
Postion:  361 aaSubst:  T Count:  1
Postion:  477 aaSubst:  N Count:  1
Postion:  490 aaSubst:  Y Count:  1
Postion:  614 aaSubst:  G Count:  4
Postion:  654 aaSubst:  Q Count:  1
Postion:  762 aaSubst:  * Count:  1
Postion:  785 aaSubst:  L Count:  1
Postion:  807 aaSubst:  T Count:  1
Postion:  808 aaSubst:  N Count:  1
Postion:  1108 aaSubst:  I Count:  1
Postion:  1110 aaSubst:  M Cou

In [None]:
# Print Results and save them as CSV
import csv
import sys

header = ['Postion', 'aaSubst', 'Count']
# Single source of truth for the output location, reused in the final message.
output_path = 'mutations.csv'

# One row per counted substitution: 1-based position, letter, count.
# Entries whose count is 0 are skipped.
with open(output_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerow(header)

    for index, position in enumerate(mutationList):
        for aa, count in position.items():
            if count != 0:
                print("Postion: ", index+1, "aaSubst: ", aa, "Count: ", count)
                writer.writerow([index+1, aa, count])

# Report where the file was written (the message previously named no location).
print("AA substitution counts saved at", output_path)

# Timestamp (hours:minutes:seconds) of when the export finished.
now = datetime.now()
current_time = now.strftime("%H:%M:%S")
print("Current Time =", current_time)