In [1]:
#In-chapter exercises for Chapter 8 - Dictionaries:

#Storing paired data is an incredibly common problem that must be addressed in programming.
#For example, we may want to store paired items like (1) codons and their associated amino 
#acid residues, (2) colleagues' names and their email addresses, (3) words and their 
#definitions, etc. In each of these examples, we have a key and a value. For example 1, our
#key could be called "codon" and its value "amino acid residue." In example 2, the key is
#"name" and the value "email address"; in example 3, key = "word" and value = "definition".
#For the last example, we have a tool for storing words and definitions in Python called a 
#dictionary.

#Creating a dictionary involves curly brackets. Each pair of data, consisting of a key and
#a value, is called an "item". Each item is separated with a comma; keys and values are 
#separated with colons. We can write a dictionary on a single line, like so:
    #enzymes = {'EcoRI':r'GAATTC', 'AvaII':r'GG(A|T)CC', 'BisI':r'GC[ATGC]GC'}
#But splitting a dictionary definition over several lines makes it easier to read:
#Each item sits on its own line: the key comes before the colon, the value after it.
enzymes = {
    'EcoRI': r'GAATTC',
    'AvaII': r'GG(A|T)CC',
    'BisI': r'GC[ATGC]GC',
}
#Getting one value out of a dictionary looks like indexing a list, except that we put
#a key (not a position) inside the square brackets and get its value back:
bisi_pattern = enzymes['BisI']
print(bisi_pattern)
#Here's how you would create an empty dictionary and add elements to it:
tRNA_codons = {}
#Assigning to a key that is not in the dictionary yet creates a new item:
tRNA_codons['alanine'] = r'GC[ATGC]'
tRNA_codons['tryptophan'] = r'TGG'
tRNA_codons['phenylalanine'] = r'TT[TC]'
#Removing an item takes only its key; the value stored under that key goes with it:
del tRNA_codons['alanine']
#Let's make a dictionary of counts for a DNA sequence:
dna = "AATGATCGATCGTACGCTGA"
bases = ['A', 'T', 'G', 'C']
#Every position at which a three-base window can start (empty if dna has fewer than 3 bases):
window_starts = range(len(dna) - 2)
counts = {}
#The nested loops build all 64 possible trinucleotides; only the ones that actually occur
#in dna are stored, so the dictionary stays small.
for base1 in bases:
    for base2 in bases:
        for base3 in bases:
            trinucleotide = base1 + base2 + base3
            #dna.count() only counts non-overlapping matches (for example,
            #'AAAA'.count('AAA') is 1, not 2), so we test every window start instead:
            count = sum(1 for start in window_starts if dna.startswith(trinucleotide, start))
            if count > 0:
                counts[trinucleotide] = count
print(counts)
#This is great, but now we can't look up counts for codons whose count is 0, because we only
#stored things in our dictionary with a count of 1 or greater. We will get a KeyError if we try
#to look up any of those counts with square brackets. The 'get' method avoids the error: it
#returns the stored value if the key exists and a default value otherwise. In the example
#below, 0 is the default value, which is returned only when the item does not exist in the
#counts dictionary:
#The same four lookups written as a loop; 'get' falls back to 0 for codons that are absent:
for codon in ['TGA', 'AAA', 'GTA', 'TTT']:
    print("count for " + codon + " is " + str(counts.get(codon, 0)))

#We can iterate over a dictionary to do something to all of our items. This way of doing so,
#in order to list all codons with a count of 2, is kind of inefficient:
#Build all 64 possible trinucleotides again and print the ones counted exactly twice;
#'get' supplies 0 for combinations that were never stored:
for first in 'ATGC':
    for second in 'ATGC':
        for third in 'ATGC':
            candidate = first + second + third
            if counts.get(candidate, 0) == 2:
                print(candidate)
#So let's use the 'keys' method instead. This will print a list of all the keys in our counts
#dictionary:
print(counts.keys())
#Here's how we can list all keys with a count of 2 more concisely. Wrapping the keys in
#"sorted" returns them in alphabetical order. Without it we would get them in the order
#they were inserted, which is guaranteed since Python 3.7; older versions made no promise
#about dictionary order:
ordered_keys = sorted(counts.keys())
for codon in ordered_keys:
    #Every key in ordered_keys came from counts, so a plain lookup cannot fail here.
    if counts[codon] == 2:
        print(codon)
#We can also iterate over items in a dictionary:
for codon, occurrences in counts.items():
    #Each pass of the loop receives one key together with its value.
    if occurrences == 2:
        print(codon)
#The 'items' method gives us (key, value) pairs, which the loop unpacks into two variables,
#something we haven't seen before.

GC[ATGC]GC
{'AAT': 1, 'ATG': 1, 'ATC': 2, 'ACG': 1, 'TAC': 1, 'TGA': 2, 'TCG': 2, 'GAT': 2, 'GTA': 1, 'GCT': 1, 'CTG': 1, 'CGA': 1, 'CGT': 1, 'CGC': 1}
count for TGA is 2
count for AAA is 0
count for GTA is 1
count for TTT is 0
ATC
TGA
TCG
GAT
dict_keys(['AAT', 'ATG', 'ATC', 'ACG', 'TAC', 'TGA', 'TCG', 'GAT', 'GTA', 'GCT', 'CTG', 'CGA', 'CGT', 'CGC'])
ATC
GAT
TCG
TGA
ATC
TGA
TCG
GAT


In [2]:
#End-of-chapter exercises:

In [13]:
#DNA Translation:
#Write a program that will translate a DNA sequence into protein. Your program
#should use the standard genetic code which can be found at the provided NCBI URL.

#Example DNA sequence to translate (hard-coded here, not generated at run time):
#The literal is split over two lines only to keep them short; Python joins adjacent
#string literals into a single string.
sequence = (
    'gctagtatgttcgcggacgcgtcagaacaaagaaatggccgctcgcactt'
    'ggcgctcatactctaagctggtcagattgattcaggggggaatctacgaa'
)
#Upper-case copy of the sequence, matching the case of the codon keys defined below:
dna = sequence.upper()

#Make a dictionary for codon translations:
translations = {
    #Keys are upper-case DNA codons, values are one-letter amino acid codes; '*' marks
    #the stop codons (TAA, TAG, TGA).
    #First base T:
    'TTT': 'F', 'TCT': 'S', 'TAT': 'Y', 'TGT': 'C',
    'TTC': 'F', 'TCC': 'S', 'TAC': 'Y', 'TGC': 'C',
    'TTA': 'L', 'TCA': 'S', 'TAA': '*', 'TGA': '*',
    'TTG': 'L', 'TCG': 'S', 'TAG': '*', 'TGG': 'W',
    #First base C:
    'CTT': 'L', 'CCT': 'P', 'CAT': 'H', 'CGT': 'R',
    'CTC': 'L', 'CCC': 'P', 'CAC': 'H', 'CGC': 'R',
    'CTA': 'L', 'CCA': 'P', 'CAA': 'Q', 'CGA': 'R',
    'CTG': 'L', 'CCG': 'P', 'CAG': 'Q', 'CGG': 'R',
    #First base A:
    'ATT': 'I', 'ACT': 'T', 'AAT': 'N', 'AGT': 'S',
    'ATC': 'I', 'ACC': 'T', 'AAC': 'N', 'AGC': 'S',
    'ATA': 'I', 'ACA': 'T', 'AAA': 'K', 'AGA': 'R',
    'ATG': 'M', 'ACG': 'T', 'AAG': 'K', 'AGG': 'R',
    #First base G:
    'GTT': 'V', 'GCT': 'A', 'GAT': 'D', 'GGT': 'G',
    'GTC': 'V', 'GCC': 'A', 'GAC': 'D', 'GGC': 'G',
    'GTA': 'V', 'GCA': 'A', 'GAA': 'E', 'GGA': 'G',
    'GTG': 'V', 'GCG': 'A', 'GAG': 'E', 'GGG': 'G',
}

#Write a function to translate DNA:
def translate_dna(sequence):
    """Translate a DNA sequence into a string of one-letter amino acid codes.

    The sequence is upper-cased and read as consecutive three-base codons,
    starting at the first base. Each codon is looked up in the module-level
    ``translations`` dictionary.

    Args:
        sequence: DNA sequence as a string, in upper or lower case.

    Returns:
        One character per complete codon. Codons missing from ``translations``
        (for example those containing an unknown base 'N') are reported as 'X',
        standing for an unknown amino acid residue. One or two bases left over
        at the end of the sequence are ignored.
    """
    upper_sequence = sequence.upper()
    residues = []
    #Stopping at len - 2 means a codon is only started where three bases remain.
    for codon_start in range(0, len(upper_sequence) - 2, 3):
        codon = upper_sequence[codon_start:codon_start + 3]
        #'X' is the default returned for any codon that is not a key in the dictionary.
        residues.append(translations.get(codon, 'X'))
    return "".join(residues)
    
#Translate DNA to test if the function works:
protein = translate_dna(dna)
print(protein)

#Let's assert some things just to make sure. 'assert' is a statement, not a function,
#so the condition is written without wrapping parentheses:
assert translate_dna('ACTACTACTACT') == 'TTTT'
#Lower-case input must give the same result as upper-case input:
assert translate_dna('actactactact') == 'TTTT'
#Codons containing the unknown base 'n' must come out as 'X':
assert translate_dna('actactactactnnnnnanaaaaa') == 'TTTTXXXK'

ASMFADASEQRNGRSHLALIL*AGQIDSGGNLR
