# Profiling results for original  code

In [2]:
%%file rna_translation.py

import sys

try:
    profile
except:
    profile = lambda f: f

    
triplets_txt = """UUU CUU AUU GUU UUC CUC AUC GUC UUA CUA AUA GUA
                  UUG CUG AUG GUG UCU CCU ACU GCU UCC CCC ACC GCC
                  UCA CCA ACA GCA UCG CCG ACG GCG UAU CAU AAU GAU
                  UAC CAC AAC GAC UAA CAA AAA GAA UAG CAG AAG GAG
                  UGU CGU AGU GGU UGC CGC AGC GGC UGA CGA AGA GGA
                  UGG CGG AGG GGG"""

aas_txt = """F L I V F L I V L L I V L L M V S P T A S P T A S P
             T A S P T A Y H N D Y H N D . Q K E . Q K E C R S G
             C R S G . R R G W R R G"""


def remove_whitespace(txt):
    # remove spaces and line breaks from string
    return txt.replace(" ", "").replace("\n", "")


def translate(rna_sequence):
    """
    uses codon table to translate rna sequence.
    example:

    UUU AUC GUU -> F I V

    spaces can be omitted.
    """
    
    # cleanup
    rna_sequence = remove_whitespace(rna_sequence)

    aas = ""
    for start_idx in range(0, len(rna_sequence), 3):
        triplet = rna_sequence[start_idx : start_idx + 3]
        aa = lookup_aa(triplet)
        aas += aa
    return aas


@profile
def lookup_aa(triplet):
    """finds aa symbol for given triplet.
    returns 'X' for invalid triplet"""
    
    # cleanup the multiline strings
    triplets = remove_whitespace(triplets_txt)
    aas = remove_whitespace(aas_txt)

    if triplet not in triplets:
        return "X"
    start_idx = triplets.index(triplet)
    return aas[start_idx // 3]


long_sequence = 20_000 * remove_whitespace(triplets_txt)
translate(long_sequence)

Overwriting rna_translation.py


In [3]:
!time python rna_translation.py


real	0m7.592s
user	0m7.537s
sys	0m0.033s


In [4]:
!pyinstrument rna_translation.py


  _     ._   __/__   _ _  _  _ _/_   Recorded: 20:50:56  Samples:  8459
 /_//_/// /_\ / //_// / //_'/ //     Duration: 8.471     CPU time: 8.452
/   _/                      v3.3.0

Program: rna_translation.py

[31m8.470[0m [48;5;24m[38;5;15m<module>[0m  [2mrna_translation.py:2[0m
└─ [31m8.468[0m [48;5;24m[38;5;15mtranslate[0m  [2mrna_translation.py:27[0m
   ├─ [31m7.913[0m [48;5;24m[38;5;15mlookup_aa[0m  [2mrna_translation.py:48[0m
   │  ├─ [31m6.841[0m [48;5;24m[38;5;15mremove_whitespace[0m  [2mrna_translation.py:22[0m
   │  │  ├─ [31m6.333[0m str.replace[0m  [2m<built-in>:0[0m
   │  │  └─ [32m0.508[0m [self][0m  [2m[0m
   │  ├─ [32m0.720[0m [self][0m  [2m[0m
   │  └─ [92m[2m0.352[0m str.index[0m  [2m<built-in>:0[0m
   └─ [32m0.551[0m [self][0m  [2m[0m

To view this report with different options, run:
    pyinstrument --load-prev 2021-01-16T20-50-56 [options]



We can see that the majority of the time is spent in `str.replace` within `remove_whitespace`. Let's check it with line-profiler too:

In [5]:
!kernprof -vl rna_translation.py

Wrote profile results to rna_translation.py.lprof
Timer unit: 1e-06 s

Total time: 9.83939 s
File: rna_translation.py
Function: lookup_aa at line 48

Line #      Hits         Time  Per Hit   % Time  Line Contents
    48                                           @profile
    49                                           def lookup_aa(triplet):
    50                                               """finds aa symbol for given triplet.
    51                                               returns 'X' for invalid triplet"""
    52                                               
    53                                               # cleanup the multiline strings
    54   1280000    4721622.0      3.7     48.0      triplets = remove_whitespace(triplets_txt)
    55   1280000    3246243.0      2.5     33.0      aas = remove_whitespace(aas_txt)
    56                                           
    57   1280000     595546.0      0.5      6.1      if triplet not in triplets:
    58                   

# First improvement

We can see that `remove_whitespace` is called 128000 for both arguments times! Since the inputs and outputs will not change for each call of `lookup_aa`, we can try to move this call outside of `lookup_aa`:

In [6]:
%%file rna_translation_2.py

import sys

try:
    profile
except:
    profile = lambda f: f

    
triplets_txt = """UUU CUU AUU GUU UUC CUC AUC GUC UUA CUA AUA GUA
                  UUG CUG AUG GUG UCU CCU ACU GCU UCC CCC ACC GCC
                  UCA CCA ACA GCA UCG CCG ACG GCG UAU CAU AAU GAU
                  UAC CAC AAC GAC UAA CAA AAA GAA UAG CAG AAG GAG
                  UGU CGU AGU GGU UGC CGC AGC GGC UGA CGA AGA GGA
                  UGG CGG AGG GGG"""

aas_txt = """F L I V F L I V L L I V L L M V S P T A S P T A S P
             T A S P T A Y H N D Y H N D . Q K E . Q K E C R S G
             C R S G . R R G W R R G"""


def remove_whitespace(txt):
    # remove spaces and line breaks from string
    return txt.replace(" ", "").replace("\n", "")


def translate(rna_sequence):
    """
    uses codon table to translate rna sequence.
    example:

    UUU AUC GUU -> F I V

    spaces can be omitted.
    """
    
    # cleanup
    rna_sequence = remove_whitespace(rna_sequence)

    aas = ""
    for start_idx in range(0, len(rna_sequence), 3):
        triplet = rna_sequence[start_idx : start_idx + 3]
        aa = lookup_aa(triplet)
        aas += aa
    return aas


# cleanup the multiline strings
triplets = remove_whitespace(triplets_txt)
aas = remove_whitespace(aas_txt)


@profile
def lookup_aa(triplet):
    """finds aa symbol for given triplet.
    returns 'X' for invalid triplet"""
   
    if triplet not in triplets:
        return "X"
    start_idx = triplets.index(triplet)
    return aas[start_idx // 3]


long_sequence = 20_000 * remove_whitespace(triplets_txt)
translate(long_sequence)

Overwriting rna_translation_2.py


In [7]:
!time python rna_translation_2.py


real	0m1.048s
user	0m1.023s
sys	0m0.016s


This is around 7-8 times faster!

In [9]:
!pyinstrument rna_translation_2.py


  _     ._   __/__   _ _  _  _ _/_   Recorded: 20:52:35  Samples:  1242
 /_//_/// /_\ / //_// / //_'/ //     Duration: 1.247     CPU time: 1.242
/   _/                      v3.3.0

Program: rna_translation_2.py

[31m1.246[0m [48;5;24m[38;5;15m<module>[0m  [2mrna_translation_2.py:2[0m
└─ [31m1.244[0m [48;5;24m[38;5;15mtranslate[0m  [2mrna_translation_2.py:27[0m
   ├─ [31m0.770[0m [48;5;24m[38;5;15mlookup_aa[0m  [2mrna_translation_2.py:53[0m
   │  ├─ [33m0.446[0m [self][0m  [2m[0m
   │  └─ [33m0.324[0m str.index[0m  [2m<built-in>:0[0m
   └─ [33m0.469[0m [self][0m  [2m[0m

To view this report with different options, run:
    pyinstrument --load-prev 2021-01-16T20-52-35 [options]



# Can we do better?

The approriate data structure for lookup and associations are dictionaries! Let's try this.

In [10]:
%%file rna_translation_3.py

import sys

try:
    profile
except:
    profile = lambda f: f

    
triplets_txt = """UUU CUU AUU GUU UUC CUC AUC GUC UUA CUA AUA GUA
                  UUG CUG AUG GUG UCU CCU ACU GCU UCC CCC ACC GCC
                  UCA CCA ACA GCA UCG CCG ACG GCG UAU CAU AAU GAU
                  UAC CAC AAC GAC UAA CAA AAA GAA UAG CAG AAG GAG
                  UGU CGU AGU GGU UGC CGC AGC GGC UGA CGA AGA GGA
                  UGG CGG AGG GGG"""

aas_txt = """F L I V F L I V L L I V L L M V S P T A S P T A S P
             T A S P T A Y H N D Y H N D . Q K E . Q K E C R S G
             C R S G . R R G W R R G"""


def remove_whitespace(txt):
    # remove spaces and line breaks from string
    return txt.replace(" ", "").replace("\n", "")


def translate(rna_sequence):
    """
    uses codon table to translate rna sequence.
    example:

    UUU AUC GUU -> F I V

    spaces can be omitted.
    """
    
    # cleanup
    rna_sequence = remove_whitespace(rna_sequence)

    aas = ""
    for start_idx in range(0, len(rna_sequence), 3):
        triplet = rna_sequence[start_idx : start_idx + 3]
        aa = lookup_aa(triplet)
        aas += aa
    return aas


# cleanup the multiline strings
triplets = remove_whitespace(triplets_txt)
aas = remove_whitespace(aas_txt)

mapping = {}
for start_idx in range(0, len(triplets), 3):
    mapping[triplets[start_idx: start_idx + 3]] = aas[start_idx // 3]


@profile
def lookup_aa(triplet):
    """finds aa symbol for given triplet.
    returns 'X' for invalid triplet"""
    return mapping.get(triplet, "X")    


long_sequence = 20_000 * remove_whitespace(triplets_txt)
translate(long_sequence)

Overwriting rna_translation_3.py


In [11]:
!time python rna_translation_3.py


real	0m0.641s
user	0m0.621s
sys	0m0.014s


In [12]:
!pyinstrument rna_translation_3.py


  _     ._   __/__   _ _  _  _ _/_   Recorded: 20:52:47  Samples:  832
 /_//_/// /_\ / //_// / //_'/ //     Duration: 0.837     CPU time: 0.836
/   _/                      v3.3.0

Program: rna_translation_3.py

[31m0.836[0m [48;5;24m[38;5;15m<module>[0m  [2mrna_translation_3.py:2[0m
└─ [31m0.834[0m [48;5;24m[38;5;15mtranslate[0m  [2mrna_translation_3.py:27[0m
   ├─ [31m0.524[0m [self][0m  [2m[0m
   └─ [33m0.305[0m [48;5;24m[38;5;15mlookup_aa[0m  [2mrna_translation_3.py:57[0m
      ├─ [33m0.175[0m [self][0m  [2m[0m
      └─ [32m0.130[0m dict.get[0m  [2m<built-in>:0[0m

To view this report with different options, run:
    pyinstrument --load-prev 2021-01-16T20-52-47 [options]



Observations:
1. We saved another 40% of run time
2. time for `lookup_aa` reduced from ~750ms to ~300ms, which is faster by a factor ~2.5. This speedup will be much more for larger data collections.