### Read CSV files

In [187]:
import csv

codons = []

with open('data/side-by-side.csv', newline='') as csvfile:
    csvreader = iter(csv.reader(csvfile, delimiter=','))
    # Skip first row
    next(csvreader)

    for abspos, virus, vaccine in csvreader:
        codons.append({
            'abspos': abspos,
            'virus': virus,
            'vaccine': vaccine
        })

print(len(codons))


1274


In [188]:
import csv

codon_table = []

with open('data/codon-table-grouped.csv', newline='') as csvfile:
    csvreader = iter(csv.reader(csvfile, delimiter=','))
    # Skip first row
    next(csvreader)

    for aminoacid, codon in csvreader:
        codon_table.append({
            'aminoacid': aminoacid,
            'codon': codon,
        })

print(len(codon_table))

64


### Categorize codon table by amount of 'good' chars.


In [189]:
def calculate_codon_score(codon):
    score = 0

    for char in codon:
        if char is 'G' or char is 'C':
            score += 1

    return score

In [190]:
grouped_codon_table = { 0: [], 1: [], 2: [], 3: [] }

for codon_row in codon_table:
    codon = codon_row['codon']
    score = calculate_codon_score(codon)

    grouped_codon_table[score].append(codon_row)

### Find a codon with the highest possible score for each aminoacid

In [191]:
aminoacids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 's', 'S', 'T', 'V', 'W', 'Y']
best_codons = {}

for aminoacid in aminoacids:
    for score in range(3, 0, -1):
        for codon_row in grouped_codon_table[score]:
            if codon_row['aminoacid'] is aminoacid:
                best_codons[aminoacid] = codon_row['codon']
                continue

print(best_codons)
print(f"Found all: {len(aminoacids) == len(best_codons)}")

{'A': 'GCT', 'C': 'TGT', 'D': 'GAT', 'E': 'GAA', 'F': 'TTC', 'G': 'GGT', 'H': 'CAT', 'I': 'ATC', 'K': 'AAG', 'L': 'TTG', 'M': 'ATG', 'N': 'AAC', 'P': 'CCT', 'Q': 'CAA', 'R': 'AGA', 's': 'TGA', 'S': 'TCT', 'T': 'ACT', 'V': 'GTT', 'W': 'TGG', 'Y': 'TAC'}
Found all: True


### Target

> It is known that a higher fraction of G and C characters improves the efficiency of an mRNA vaccine.



In [192]:
def find_codon_with_score(score, target_aminoacid):
    for codon_row in grouped_codon_table[score]:
        if codon_row['aminoacid'] is target_aminoacid:
            return codon_row

    return False

In [196]:
for i in range(len(codons)):
    virus_codon = codons[i]['virus']
    virus_codon_score = calculate_codon_score(virus_codon)
  
    # Find target aminoacid
    target_aminoacid = None
    for codon_row in codon_table:
        if codon_row['codon'] == virus_codon:
            target_aminoacid = codon_row['aminoacid']
            break

    best_matching_codon = best_codons[target_aminoacid] 
    best_matching_codon_score = calculate_codon_score(best_matching_codon)
        
    # If virus codon has the best score, use that one.
    if virus_codon_score == best_matching_codon_score:
        codons[i]['suggestion'] = virus_codon
    else:
        codons[i]['suggestion'] = best_matching_codon

### Calculate score

In [194]:
match_count = 0

for codon in codons:
    if codon['vaccine'] == codon['suggestion']:
        match_count += 1

percentage = 100 / len(codons) * match_count

f"Result: {percentage}%"


'Result: 34.850863422291994%'

### Print results

In [195]:
print("abspos, virus, suggestion, vaccine")

for codon in codons:
    print(f"{codon['abspos']}, {codon['virus']}, {codon['suggestion']}, {codon['vaccine']}")

abspos, virus, suggestion, vaccine
0, ATG, ATG, ATG
3, TTT, TTC, TTC
6, GTT, GTT, GTG
9, TTT, TTC, TTC
12, CTT, CTT, CTG
15, GTT, GTT, GTG
18, TTA, TTG, CTG
21, TTG, TTG, CTG
24, CCA, CCA, CCT
27, CTA, CTA, CTG
30, GTC, GTT, GTG
33, TCT, TCT, TCC
36, AGT, AGT, AGC
39, CAG, CAA, CAG
42, TGT, TGT, TGT
45, GTT, GTT, GTG
48, AAT, AAC, AAC
51, CTT, CTT, CTG
54, ACA, ACA, ACC
57, ACC, ACT, ACC
60, AGA, AGA, AGA
63, ACT, ACT, ACA
66, CAA, CAA, CAG
69, TTA, TTG, CTG
72, CCC, CCT, CCT
75, CCT, CCT, CCA
78, GCA, GCA, GCC
81, TAC, TAC, TAC
84, ACT, ACT, ACC
87, AAT, AAC, AAC
90, TCT, TCT, AGC
93, TTC, TTC, TTT
96, ACA, ACA, ACC
99, CGT, AGA, AGA
102, GGT, GGT, GGC
105, GTT, GTT, GTG
108, TAT, TAC, TAC
111, TAC, TAC, TAC
114, CCT, CCT, CCC
117, GAC, GAT, GAC
120, AAA, AAG, AAG
123, GTT, GTT, GTG
126, TTC, TTC, TTC
129, AGA, AGA, AGA
132, TCC, TCT, TCC
135, TCA, TCA, AGC
138, GTT, GTT, GTG
141, TTA, TTG, CTG
144, CAT, CAT, CAC
147, TCA, TCA, TCT
150, ACT, ACT, ACC
153, CAG, CAA, CAG
156, GAC, GAT, 