In [183]:
import pandas as pd

### Read CSV files

In [170]:
import csv

# Load the side-by-side codon comparison: one dict per data row holding the
# three columns (position, virus codon, vaccine codon). Values stay as the
# strings produced by the csv module.
codons = []

with open('data/side-by-side.csv', newline='') as side_by_side_file:
    side_by_side_rows = csv.reader(side_by_side_file, delimiter=',')
    next(side_by_side_rows)  # discard the header row
    codons.extend(
        {'abspos': abspos, 'virus': virus, 'vaccine': vaccine}
        for abspos, virus, vaccine in side_by_side_rows
    )

print(len(codons))


1274


In [171]:
# Load the codon table: one dict per data row mapping a codon to the amino
# acid listed next to it in the file.
codon_table = []

with open('data/codon-table-grouped.csv', newline='') as codon_table_file:
    codon_table_rows = csv.reader(codon_table_file, delimiter=',')
    next(codon_table_rows)  # discard the header row
    codon_table.extend(
        {'aminoacid': aminoacid, 'codon': codon}
        for aminoacid, codon in codon_table_rows
    )

print(len(codon_table))

64


### Group the codon table by the number of 'good' characters (G or C) in each codon


In [172]:
def calculate_codon_score(codon):
    """Return the number of characters in ``codon`` that are 'G' or 'C'.

    The comparison is case-sensitive: lowercase 'g' / 'c' do not count.

    Args:
        codon: Iterable of single characters, normally a codon string
            such as ``'GTC'``.

    Returns:
        int: Count of 'G' and 'C' characters (0 for an empty codon).
    """
    # Compare by value, not identity: `char is 'G'` only happens to work
    # because CPython caches one-character strings, and it triggers a
    # SyntaxWarning on Python 3.8+.
    return sum(1 for char in codon if char in ('G', 'C'))

In [173]:
# Bucket every codon-table row under its G/C score (0 through 3).
grouped_codon_table = {score: [] for score in range(4)}

for codon_row in codon_table:
    row_score = calculate_codon_score(codon_row['codon'])
    grouped_codon_table[row_score].append(codon_row)

### Find the suggested vaccine codons

In [174]:
def find_codon_with_score(score, target_aminoacid):
    """Find a codon row with the given score that encodes ``target_aminoacid``.

    Args:
        score: Key into ``grouped_codon_table`` (the G/C count bucket).
        target_aminoacid: Amino acid value to match against each row's
            ``'aminoacid'`` entry.

    Returns:
        The first matching row dict from ``grouped_codon_table[score]``,
        or ``False`` when no row in that bucket matches.
    """
    for codon_row in grouped_codon_table[score]:
        # Value equality, not identity: two equal strings are not guaranteed
        # to be the same object, so `is` can miss genuine matches.
        if codon_row['aminoacid'] == target_aminoacid:
            return codon_row

    return False

In [175]:
def find_target_aminoacid(codon):
    """Look up the amino acid listed for ``codon`` in ``codon_table``.

    Returns the ``'aminoacid'`` value of the first row whose ``'codon'``
    equals ``codon``, or ``None`` when the codon is not in the table.
    """
    matching_aminoacids = (
        row['aminoacid'] for row in codon_table if row['codon'] == codon
    )
    return next(matching_aminoacids, None)

In [176]:
# Attach a 'suggestion' codon to every entry: the highest-scoring codon that
# encodes the same amino acid as the virus codon, or the virus codon itself
# when nothing scores strictly higher.
for codon_entry in codons:
    virus_codon = codon_entry['virus']
    target_aminoacid = find_target_aminoacid(virus_codon)

    # Walk the score buckets from the best (3) down to just above the virus
    # codon's own score and stop at the first bucket with a match.
    for score in range(3, calculate_codon_score(virus_codon), -1):
        best_codon = find_codon_with_score(score, target_aminoacid)
        if best_codon:
            suggestion = best_codon['codon']
            break
    else:
        # No better-scoring codon exists; keep the virus codon.
        suggestion = virus_codon

    codon_entry['suggestion'] = suggestion

### Check mismatches

In [177]:
# Report every position where the vaccine codon and the suggested codon
# translate to different amino acids.
for codon in codons:
    virus_target, vaccine_target, suggestion_target = (
        find_target_aminoacid(codon[column])
        for column in ('virus', 'vaccine', 'suggestion')
    )

    if vaccine_target == suggestion_target:
        continue

    print(f"MISMATCH - virus: {codon['virus']} -> {virus_target}, vaccine: {codon['vaccine']} -> {vaccine_target}, suggestion: {codon['suggestion']} -> {suggestion_target}")

MISMATCH - virus: AAA -> K, vaccine: CCT -> P, suggestion: AAG -> K
MISMATCH - virus: GTT -> V, vaccine: CCT -> P, suggestion: GTC -> V


### Calculate score

In [178]:
# Percentage of positions where the suggested codon is exactly the codon
# used in the vaccine.
match_count = sum(
    entry['vaccine'] == entry['suggestion'] for entry in codons
)

percentage = 100 * match_count / len(codons)

f"Result: {percentage}%"


'Result: 62.71585557299843%'

### Print results

In [179]:
# Print one CSV-style line per position and accumulate the total G/C score
# of the vaccine codons and of the suggested codons.
total_pfizer_score = 0
total_suggestion_score = 0

print("abspos, virus, vaccine, suggestion, vaccine_match, score_difference")

for codon in codons:
    pfizer_score = calculate_codon_score(codon['vaccine'])
    suggestion_score = calculate_codon_score(codon['suggestion'])
    total_pfizer_score += pfizer_score
    total_suggestion_score += suggestion_score

    # Positive difference means the suggestion has more G/C than the vaccine.
    score_difference = suggestion_score - pfizer_score
    same_result = codon['vaccine'] == codon['suggestion']

    print(f"{codon['abspos']}, {codon['virus']}, {codon['vaccine']}, {codon['suggestion']}, {same_result}, {score_difference}")

abspos, virus, vaccine, suggestion, vaccine_match, score_difference
0, ATG, ATG, ATG, True, 0
3, TTT, TTC, TTC, True, 0
6, GTT, GTG, GTC, False, 0
9, TTT, TTC, TTC, True, 0
12, CTT, CTG, CTC, False, 0
15, GTT, GTG, GTC, False, 0
18, TTA, CTG, CTC, False, 0
21, TTG, CTG, CTC, False, 0
24, CCA, CCT, CCC, False, 1
27, CTA, CTG, CTC, False, 0
30, GTC, GTG, GTC, False, 0
33, TCT, TCC, AGC, False, 0
36, AGT, AGC, AGC, True, 0
39, CAG, CAG, CAG, True, 0
42, TGT, TGT, TGC, False, 1
45, GTT, GTG, GTC, False, 0
48, AAT, AAC, AAC, True, 0
51, CTT, CTG, CTC, False, 0
54, ACA, ACC, ACC, True, 0
57, ACC, ACC, ACC, True, 0
60, AGA, AGA, CGC, False, 2
63, ACT, ACA, ACC, False, 1
66, CAA, CAG, CAG, True, 0
69, TTA, CTG, CTC, False, 0
72, CCC, CCT, CCC, False, 1
75, CCT, CCA, CCC, False, 1
78, GCA, GCC, GCC, True, 0
81, TAC, TAC, TAC, True, 0
84, ACT, ACC, ACC, True, 0
87, AAT, AAC, AAC, True, 0
90, TCT, AGC, AGC, True, 0
93, TTC, TTT, TTC, False, 1
96, ACA, ACC, ACC, True, 0
99, CGT, AGA, CGC, False, 2

### Total scores

_The totals can differ slightly because Pfizer changed two codons so that they encode a different amino acid (the two MISMATCH rows above)._

In [180]:
# Report the totals accumulated by the per-codon results loop. The names must
# match the accumulators defined there (`total_pfizer_score` and
# `total_suggestion_score`); the previous `total_pfizer` / `total_suggestion`
# are not defined anywhere in the notebook and fail on a fresh kernel.
print(f"Total pfizer score: {total_pfizer_score} (higher is better)")
print(f"Total suggestion score: {total_suggestion_score} (higher is better)")

Total pfizer score: 733 (higher is better)
Total suggestion score: 993 (higher is better)


### Print entire result

In [181]:
# Concatenate every suggested codon, in order, into one sequence string.
result = "".join(entry['suggestion'] for entry in codons)

print(result)

ATGTTCGTCTTCCTCGTCCTCCTCCCCCTCGTCAGCAGCCAGTGCGTCAACCTCACCACCCGCACCCAGCTCCCCCCCGCCTACACCAACAGCTTCACCCGCGGCGTCTACTACCCCGACAAGGTCTTCCGCTCCAGCGTCCTCCACAGCACCCAGGACCTCTTCCTCCCCTTCTTCTCCAACGTCACCTGGTTCCACGCCATCCACGTCAGCGGGACCAACGGCACCAAGCGCTTCGACAACCCCGTCCTCCCCTTCAACGACGGCGTCTACTTCGCCTCCACCGAGAAGAGCAACATCATCCGCGGCTGGATCTTCGGCACCACCCTCGACTCGAAGACCCAGTCCCTCCTCATCGTCAACAACGCCACCAACGTCGTCATCAAGGTCTGCGAGTTCCAGTTCTGCAACGACCCCTTCCTCGGCGTCTACTACCACAAGAACAACAAGAGCTGGATGGAGAGCGAGTTCCGCGTCTACAGCAGCGCGAACAACTGCACCTTCGAGTACGTCAGCCAGCCCTTCCTCATGGACCTCGAGGGCAAGCAGGGCAACTTCAAGAACCTCCGCGAGTTCGTGTTCAAGAACATCGACGGCTACTTCAAGATCTACAGCAAGCACACGCCCATCAACCTCGTGCGCGACCTCCCCCAGGGCTTCTCGGCCCTCGAGCCCCTCGTCGACCTCCCCATCGGCATCAACATCACCCGCTTCCAGACCCTCCTCGCCCTCCACCGCAGCTACCTCACCCCCGGCGACAGCAGCAGCGGCTGGACCGCCGGCGCCGCCGCCTACTACGTGGGCTACCTCCAGCCCCGCACCTTCCTCCTCAAGTACAACGAGAACGGCACCATCACCGACGCCGTCGACTGCGCCCTCGACCCCCTCAGCGAGACCAAGTGCACGCTCAAGTCCTTCACCGTCGAGAAGGGCATCTACCAGACCAGCAACTTCCGCGTCCAGCCCACCGAGAGCATCGTCCGCTTCCCCAACATCACCA