## 30/01/2024 Modifications
* SARS-CoV-2 ORF1ab has a frameshift
* end of ORF1a also the start of ORF1b
* 266...13468, 13468...21555
* python index: 265:13568, 13568:21555

### Amino acids translation (altered ORF1ab)

reference (frame shift normal translation and truncated translation)

The UCSC SARS-CoV-2 Genome Browser https://www.nature.com/articles/s41588-020-0700-8 

In [6]:
import Bio
from Bio import AlignIO, SeqIO, SeqRecord, Seq


In [24]:
from Bio.Align import MultipleSeqAlignment
from Bio.SeqRecord import SeqRecord

hcov_19_aligned = AlignIO.read('gisaid_hcov_19_aligned_2.fasta', 'fasta')

# There is a frameshift of ORF1ab
# ORF1ab: join(266..13468,13468..21555)

ORF = {'ORF1a': [265, 13483], 'ORF1a_truncated': [265, 13468], 'ORF1b_continued': [13467, 21555], 'S': [21562, 25384], 'ORF_3a': [25392, 26220], 
       'E': [26244, 26472], 'M': [26522, 27191], 'ORF_6': [27201, 27387], 
       'ORF_7a': [27393, 27759], 'ORF_8': [27893, 28259], 'N': [28273, 29533], 
       'ORF_10': [29557, 29674]}
ORF_nt = {}
for k,v in ORF.items():
  ORF_nt[k] = hcov_19_aligned[:, v[0]:v[1]]
  # print the ORF: ORF_1a, ORF_1b, S, ORF_3a, E, M, ORF_6, ORF_7a, ORF_8, N, ORF_10
  print(k)
  print(ORF[k])
  print(ORF_nt[k])

print(ORF_nt)

ORF1a
[265, 13483]
Alignment with 4 rows and 13218 columns
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...taa NC_045512.2
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...taa hCoV-19/Italy/VEN-IZSVe-21RS1571-1_VI/2021|EPI_ISL_2927997|2021-06-05
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...taa hCoV-19/Italy/VEN-IZSVe-21RS1721-7_VI/2021|EPI_ISL_3006795|2021-06-28
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...taa hCoV-19/Italy/VEN-IZSVe-21RS8150-1_VI/2021|EPI_ISL_4968925|2021-09-22
ORF1a_truncated
[265, 13468]
Alignment with 4 rows and 13203 columns
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...aac NC_045512.2
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...aac hCoV-19/Italy/VEN-IZSVe-21RS1571-1_VI/2021|EPI_ISL_2927997|2021-06-05
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...aac hCoV-19/Italy/VEN-IZSVe-21RS1721-7_VI/2021|EPI_ISL_3006795|2021-06-28
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...aac hCoV-19/Italy/VEN-IZSVe-21RS8150-1_VI/2021|EPI_ISL_4968925|2021-09-22
ORF1b_continued
[134

In [25]:
from Bio.Align import MultipleSeqAlignment
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

for k,v in ORF_nt.items():
    print(k)
    print(v)

codons = {'ttt': 'F', 'ttc': 'F', 'tta': 'L', 'ttg': 'L', 'ctt': 'L', 'ctc': 'L', 'cta': 'L', 
           'ctg': 'L', 'att': 'I', 'atc': 'I', 'ata': 'I', 'atg': 'M', 'gtt': 'V', 'gtc': 'V', 
           'gta': 'V', 'gtg': 'V', 'tct': 'S', 'tcc': 'S', 'tca': 'S', 'tcg': 'S', 'cct': 'P', 
           'ccc': 'P', 'cca': 'P', 'ccg': 'P', 'act': 'T', 'acc': 'T', 'aca': 'T', 'acg': 'T', 
           'gct': 'A', 'gcc': 'A', 'gca': 'A', 'gcg': 'A', 'tat': 'Y', 'tac': 'Y', 'taa': '*', 
           'tag': '*', 'cat': 'H', 'cac': 'H', 'caa': 'Q', 'cag': 'Q', 'aat': 'N', 'aac': 'N', 
           'aaa': 'K', 'aag': 'K', 'gat': 'D', 'gac': 'D', 'gaa': 'E', 'gag': 'E', 'tgt': 'C', 
           'tgc': 'C', 'tga': '*', 'tgg': 'W', 'cgt': 'R', 'cgc': 'R', 'cga': 'R', 'cgg': 'R', 
           'agt': 'S', 'agc': 'S', 'aga': 'R', 'agg': 'R', 'ggt': 'G', 'ggc': 'G', 'gga': 'G', 
           'ggg': 'G'}

def translate_multiple_alignment_nt(alignment):
    aa_alignment = []
    
    for record in alignment:
        seq = record.seq.lower()
        aa_seq = ''
        for i in range(0, len(seq), 3):
            codon = seq[i:i+3]
            if codon in codons:
                aa_seq += codons[codon]
            else:
                aa_seq += 'X'
        aa_alignment.append(SeqRecord(Seq(aa_seq), id=record.id, description=''))

    return MultipleSeqAlignment(aa_alignment)


ORF_aa = {}
for k, v in ORF_nt.items():
    ORF_aa[k] = translate_multiple_alignment_nt(v)
    print(k)
    print(ORF_aa[k])

ORF1a
Alignment with 4 rows and 13218 columns
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...taa NC_045512.2
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...taa hCoV-19/Italy/VEN-IZSVe-21RS1571-1_VI/2021|EPI_ISL_2927997|2021-06-05
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...taa hCoV-19/Italy/VEN-IZSVe-21RS1721-7_VI/2021|EPI_ISL_3006795|2021-06-28
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...taa hCoV-19/Italy/VEN-IZSVe-21RS8150-1_VI/2021|EPI_ISL_4968925|2021-09-22
ORF1a_truncated
Alignment with 4 rows and 13203 columns
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...aac NC_045512.2
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...aac hCoV-19/Italy/VEN-IZSVe-21RS1571-1_VI/2021|EPI_ISL_2927997|2021-06-05
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...aac hCoV-19/Italy/VEN-IZSVe-21RS1721-7_VI/2021|EPI_ISL_3006795|2021-06-28
atggagagccttgtccctggtttcaacgagaaaacacacgtcca...aac hCoV-19/Italy/VEN-IZSVe-21RS8150-1_VI/2021|EPI_ISL_4968925|2021-09-22
ORF1b_continued
Alignment with 4 rows and 8088

In [26]:
print(type(ORF_aa))
print(type(v for k,v in ORF_aa.items()))
for k,v in ORF_aa.items():
    print(type(k))
    print(type(v))

<class 'dict'>
<class 'generator'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>
<class 'str'>
<class 'Bio.Align.MultipleSeqAlignment'>


#### Compare AA sequences differences
* data type of ORF_aa: dictionary
* dictionary contains k = str and v = Bio.Align.MultipleSeqAlignment

In [28]:
# write the ORF_aa to fasta file
    # ORF1a, ORF1a_truncated, ORF1b_continued, S, ORF_3a, E, M, ORF_6, ORF_7a, ORF_8, N, ORF_10
# read in the fasta file again
# use the old program to compare the differences in amino acids sequence

for k,v in ORF_aa.items():
    SeqIO.write(v, k + '.fasta', 'fasta')


#### Use AlignIO.read to read the fasta file again

In [30]:
# read in the fasta file again
    
ORF1a_fasta = AlignIO.read('ORF1a.fasta', 'fasta')
ORF1a_truncated_fasta = AlignIO.read('ORF1a_truncated.fasta', 'fasta')
ORF1b_continued_fasta = AlignIO.read('ORF1b_continued.fasta', 'fasta')
ORF_3a_fasta = AlignIO.read('ORF_3a.fasta', 'fasta')
E_fasta = AlignIO.read('E.fasta', 'fasta')
M_fasta = AlignIO.read('M.fasta', 'fasta')
ORF_6_fasta = AlignIO.read('ORF_6.fasta', 'fasta')
ORF_7a_fasta = AlignIO.read('ORF_7a.fasta', 'fasta')
ORF_8_fasta = AlignIO.read('ORF_8.fasta', 'fasta')
N_fasta = AlignIO.read('N.fasta', 'fasta')
ORF_10_fasta = AlignIO.read('ORF_10.fasta', 'fasta')

# Check the read in files

for seq in ORF1a_fasta:
    print(seq)

display = 0
i = 0
while display < 10 and i < 10:
    characters = ORF1a_fasta[:, i]
    print(characters)
    display += 1
    i += 1

print(ORF1a_fasta)

ID: NC_045512.2
Name: NC_045512.2
Description: NC_045512.2
Number of features: 0
Seq('MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLV...AV*')
ID: hCoV-19/Italy/VEN-IZSVe-21RS1571-1_VI/2021|EPI_ISL_2927997|2021-06-05
Name: hCoV-19/Italy/VEN-IZSVe-21RS1571-1_VI/2021|EPI_ISL_2927997|2021-06-05
Description: hCoV-19/Italy/VEN-IZSVe-21RS1571-1_VI/2021|EPI_ISL_2927997|2021-06-05
Number of features: 0
Seq('MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLV...AV*')
ID: hCoV-19/Italy/VEN-IZSVe-21RS1721-7_VI/2021|EPI_ISL_3006795|2021-06-28
Name: hCoV-19/Italy/VEN-IZSVe-21RS1721-7_VI/2021|EPI_ISL_3006795|2021-06-28
Description: hCoV-19/Italy/VEN-IZSVe-21RS1721-7_VI/2021|EPI_ISL_3006795|2021-06-28
Number of features: 0
Seq('MESLVPGFNEKTHVQLSLPVLQVRDVLVRGFGDSVEEVLSEARQHLKDGTCGLV...AV*')
ID: hCoV-19/Italy/VEN-IZSVe-21RS8150-1_VI/2021|EPI_ISL_4968925|2021-09-22
Name: hCoV-19/Italy/VEN-IZSVe-21RS8150-1_VI/2021|EPI_ISL_4968925|2021-09-22
Description: hCoV-19/Italy/VEN-IZSVe-21RS8150-1_VI/2021

#### Summarization of amino acid sequences differences in each ORF

In [None]:
# for a single sequence

length_aa = hcov_19_translated.get_alignment_length()
diff_pos_aa = {}
i = 0

while i < length_aa:
    residue = hcov_19_translated[:, i]
    residue = [aa for aa in residue if aa != '-']

    if len(set(residue)) > 1:
        diff_pos_aa[i] = residue

    i += 1

print(diff_pos_aa)

for k,v in diff_pos_aa.items():
  print(f'position: {k}: {v}')

In [31]:
# find the differences in amino acids sequence

def find_diff_aa(alignment):
    length_aa = alignment.get_alignment_length()
    diff_pos_aa = {}
    i = 0

    while i < length_aa:
        residue = alignment[:, i]
        residue = [aa for aa in residue if aa != '-']

        if len(set(residue)) > 1:
            diff_pos_aa[i] = residue

        i += 1

    return diff_pos_aa

ORF1a_diff_aa = find_diff_aa(ORF1a_fasta)
ORF1a_truncated_diff_aa = find_diff_aa(ORF1a_truncated_fasta)
ORF1b_continued_diff_aa = find_diff_aa(ORF1b_continued_fasta)
ORF_3a_diff_aa = find_diff_aa(ORF_3a_fasta)
E_diff_aa = find_diff_aa(E_fasta)
M_diff_aa = find_diff_aa(M_fasta)
ORF_6_diff_aa = find_diff_aa(ORF_6_fasta)
ORF_7a_diff_aa = find_diff_aa(ORF_7a_fasta)
ORF_8_diff_aa = find_diff_aa(ORF_8_fasta)
N_diff_aa = find_diff_aa(N_fasta)
ORF_10_diff_aa = find_diff_aa(ORF_10_fasta)

In [34]:
# Check the differences in amino acids sequences

for k,v in ORF1a_diff_aa.items():
    print(f'position: {k}: {v}')
print()

for k,v in ORF1a_truncated_diff_aa.items():
    print(f'position: {k}: {v}')
print()

for k,v in ORF1b_continued_diff_aa.items():
    print(f'position: {k}: {v}')
print()

for k,v in ORF_3a_diff_aa.items():
    print(f'position: {k}: {v}')
print()

for k,v in E_diff_aa.items():
    print(f'position: {k}: {v}')
print()

for k,v in M_diff_aa.items():
    print(f'position: {k}: {v}')
print()

for k,v in ORF_6_diff_aa.items():
    print(f'position: {k}: {v}')
    print()

for k,v in ORF_7a_diff_aa.items():
    print(f'position: {k}: {v}')
print()

for k,v in ORF_8_diff_aa.items():
    print(f'position: {k}: {v}')
print()

for k,v in N_diff_aa.items():
    print(f'position: {k}: {v}')
print()

for k,v in ORF_10_diff_aa.items():
    print(f'position: {k}: {v}')
print()

position: 609: ['S', 'L', 'L', 'L']
position: 1014: ['E', 'E', 'K', 'E']
position: 2494: ['T', 'I', 'T', 'I']
position: 2966: ['T', 'T', 'T', 'I']
position: 3115: ['L', 'L', 'F', 'L']
position: 3254: ['T', 'T', 'I', 'I']
position: 3605: ['L', 'L', 'F', 'L']
position: 4219: ['P', 'P', 'L', 'P']

position: 609: ['S', 'L', 'L', 'L']
position: 1014: ['E', 'E', 'K', 'E']
position: 2494: ['T', 'I', 'T', 'I']
position: 2966: ['T', 'T', 'T', 'I']
position: 3115: ['L', 'L', 'F', 'L']
position: 3254: ['T', 'T', 'I', 'I']
position: 3605: ['L', 'L', 'F', 'L']
position: 4219: ['P', 'P', 'L', 'P']

position: 313: ['P', 'L', 'L', 'L']
position: 1690: ['V', 'L', 'L', 'L']
position: 2221: ['A', 'V', 'A', 'V']

position: 44: ['W', 'L', 'L', 'L']
position: 174: ['T', 'I', 'T', 'I']


position: 81: ['I', 'T', 'S', 'T']



position: 29: ['G', 'R', 'G', 'G']
position: 66: ['P', 'S', 'S', 'S']
position: 202: ['R', 'K', 'K', 'K']
position: 203: ['G', 'L', 'R', 'R']




#### Write the differences in position into csv file

In [None]:
# for a single file
# Write the differences into a csv file

import csv

with open('AA_mismatch.csv', 'w', newline = '') as f:

  # Create a csv writer object
  writer = csv.writer(f)

  # Write the header row
  writer.writerow(['Position', 'Amino Acids variations'])
  # Write one key-value tuple per row
  for k, v in diff_pos_aa.items():
    writer.writerow([k, v])

In [35]:
# for iterations

import csv

ORF_diff_aa = {'ORF1a': ORF1a_diff_aa, 'ORF1a_truncated': ORF1a_truncated_diff_aa, 'ORF1b_continued': ORF1b_continued_diff_aa, 'ORF_3a': ORF_3a_diff_aa,
                'E': E_diff_aa, 'M': M_diff_aa, 'ORF_6': ORF_6_diff_aa, 'ORF_7a': ORF_7a_diff_aa, 'ORF_8': ORF_8_diff_aa, 'N': N_diff_aa, 'ORF_10': ORF_10_diff_aa}
for k,v in ORF_diff_aa.items():
  with open(k + '_AA_mismatch.csv', 'w', newline = '') as f:
    writer = csv.writer(f)
    writer.writerow(['Position', 'Amino Acids variations'])
    for k, v in v.items():
      writer.writerow([k, v])