# Dictionaries

In [7]:
# Create a dictionary from a literal of key: value pairs
my_dict = {'a': 6, 'b': 7, 'c': 27.6}

# A dictionary can also be created by passing keyword arguments to dict():
dict(a=6, b=7, c=27.6)

{'a': 6, 'b': 7, 'c': 27.6}

In [4]:
my_dict
# key - value, example: a - 6

{'a': 6, 'b': 7, 'c': 27.6}

In [6]:
my_dict['b']
# access key value

7

Mutable objects, such as lists, cannot be used as dictionary keys.

In [9]:
# Could have resolved Exercise 1.4 with it
def complement_seq_reversed(seq, material='DNA'):
    """Return the reverse complement of a nucleic acid sequence.

    Parameters
    ----------
    seq : str
        Sequence of upper-case bases. Characters without an entry in the
        complement table are left unchanged.
    material : str, optional
        'DNA' (default) pairs A with T; 'RNA' pairs A with U. Any other
        value is treated like 'DNA'.

    Returns
    -------
    str
        The complementary strand, reversed.
    """
    if material == 'RNA':
        pairs = {"A": "U", "C": "G", "U": "A", "G": "C"}
    else:
        pairs = {"A": "T", "C": "G", "T": "A", "G": "C"}

    # Named `complement_table` rather than `dict` so the built-in is not shadowed.
    complement_table = str.maketrans(pairs)
    return seq.translate(complement_table)[::-1]

In [10]:
# Map one-letter amino acid codes to their three-letter abbreviations.
aa_dict = {
    "A": "Ala",
    "R": "Arg",
    "N": "Asn",
    "D": "Asp",
    "C": "Cys",
    "Q": "Gln",
    "E": "Glu",
    "G": "Gly",
    "H": "His",
    "I": "Ile",
    "L": "Leu",
    "K": "Lys",
    "M": "Met",
    "F": "Phe",
    "P": "Pro",
    "S": "Ser",
    "T": "Thr",
    "W": "Trp",
    "Y": "Tyr",
    "V": "Val",
}
# This could have been written on fewer lines, but one entry per line is very legible.

In [11]:
aa_dict['L']

'Leu'

In [13]:
# Codes https://en.wikipedia.org/wiki/DNA_codon_table
# The set of DNA bases
bases = ['T', 'C', 'A', 'G']

# Build the list of all 64 codons. The first base varies slowest and the
# third base fastest, which is the order `amino_acids` below is written in.
codon_list = [
    first_base + second_base + third_base
    for first_base in bases
    for second_base in bases
    for third_base in bases
]

# The amino acids that are coded for (* = STOP codon)
amino_acids = 'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG'

# zip() silently truncates to the shorter input, so check the lengths first.
if len(codon_list) != len(amino_acids):
    raise ValueError('codon_list and amino_acids must have the same length.')

# Build dictionary from an iterator of (codon, amino acid) 2-tuples
codons = dict(zip(codon_list, amino_acids))

# Show that we did it
print(codons)

{'TTT': 'F', 'TTC': 'F', 'TTA': 'L', 'TTG': 'L', 'TCT': 'S', 'TCC': 'S', 'TCA': 'S', 'TCG': 'S', 'TAT': 'Y', 'TAC': 'Y', 'TAA': '*', 'TAG': '*', 'TGT': 'C', 'TGC': 'C', 'TGA': '*', 'TGG': 'W', 'CTT': 'L', 'CTC': 'L', 'CTA': 'L', 'CTG': 'L', 'CCT': 'P', 'CCC': 'P', 'CCA': 'P', 'CCG': 'P', 'CAT': 'H', 'CAC': 'H', 'CAA': 'Q', 'CAG': 'Q', 'CGT': 'R', 'CGC': 'R', 'CGA': 'R', 'CGG': 'R', 'ATT': 'I', 'ATC': 'I', 'ATA': 'I', 'ATG': 'M', 'ACT': 'T', 'ACC': 'T', 'ACA': 'T', 'ACG': 'T', 'AAT': 'N', 'AAC': 'N', 'AAA': 'K', 'AAG': 'K', 'AGT': 'S', 'AGC': 'S', 'AGA': 'R', 'AGG': 'R', 'GTT': 'V', 'GTC': 'V', 'GTA': 'V', 'GTG': 'V', 'GCT': 'A', 'GCC': 'A', 'GCA': 'A', 'GCG': 'A', 'GAT': 'D', 'GAC': 'D', 'GAA': 'E', 'GAG': 'E', 'GGT': 'G', 'GGC': 'G', 'GGA': 'G', 'GGG': 'G'}


In [14]:
my_dict

{'a': 6, 'b': 7, 'c': 27.6}

Dictionaries are mutable.

In [17]:
# Can add key/value
my_dict['d'] = 'Bootcamp is so much fun!'

In [16]:
my_dict

{'a': 6, 'b': 7, 'c': 27.6, 'd': 'Bootcamp is so much fun!'}

In [19]:
# Can change a value
my_dict['a'] = 'I was not satisfied with entry a.'
my_dict

{'a': 'I was not satisfied with entry a.',
 'b': 7,
 'c': 27.6,
 'd': 'Bootcamp is so much fun!'}

In [23]:
# membership in a dictionary is based on a key
7 in my_dict

False

In [21]:
'c' in my_dict

True

In [24]:
'e' not in my_dict

True

In [25]:
7 not in my_dict

True

In [27]:
# Iterate over the keys
for key in my_dict:
    print(key)

a
b
c
d


In [28]:
# Iterate over the keys and values
for key, value in my_dict.items():
    print(key, ":", value)

a : I was not satisfied with entry a.
b : 7
c : 27.6
d : Bootcamp is so much fun!


In [30]:
# Rebinding the loop variable only changes the name `value`;
# the dictionary itself is left untouched.
for key, value in my_dict.items():
    value = 3252
my_dict
# Nothing changes, just as when reassigning the loop variable while iterating over a list

{'a': 'I was not satisfied with entry a.',
 'b': 7,
 'c': 27.6,
 'd': 'Bootcamp is so much fun!'}

In [32]:
# Assign through the key to actually replace each value stored in the dictionary.
for key, value in my_dict.items():
    my_dict[key] = 3252
my_dict
# The key must be specified to change the value stored in the dictionary!

{'a': 3252, 'b': 3252, 'c': 3252, 'd': 3252}

In [33]:
list(aa_dict.keys())

['A',
 'R',
 'N',
 'D',
 'C',
 'Q',
 'E',
 'G',
 'H',
 'I',
 'L',
 'K',
 'M',
 'F',
 'P',
 'S',
 'T',
 'W',
 'Y',
 'V']

In [34]:
list(aa_dict.values())

['Ala',
 'Arg',
 'Asn',
 'Asp',
 'Cys',
 'Gln',
 'Glu',
 'Gly',
 'His',
 'Ile',
 'Leu',
 'Lys',
 'Met',
 'Phe',
 'Pro',
 'Ser',
 'Thr',
 'Trp',
 'Tyr',
 'Val']

In [35]:
aa_dict.pop('K')
# pop() returns the value and removes the key from the dictionary

'Lys'

In [37]:
aa_dict['L']

'Leu'

In [38]:
aa_dict['K']
# Raises a KeyError: the key 'K' no longer exists

KeyError: 'K'

In [39]:
aa_dict.get('L')

'Leu'

In [40]:
aa_dict.get('K') 
# No error

In [42]:
print(aa_dict.get('K'))
# indicate absent

None


In [47]:
aa_dict.get('K', 'NaAA') 
# Can specify output if absent

'NaAA'

In [45]:
aa_dict['K']

KeyError: 'K'

In [48]:
def concatenate_sequences(a, b, **kwargs):
    """Join `a` and `b`, then append every extra keyword-argument value.

    The extra sequences are appended in the iteration order of `kwargs`.
    """
    combined = a + b
    for extra in kwargs.values():
        combined += extra
    return combined

In a function signature, `**kwargs` means that any additional keyword arguments are collected into a dictionary that is passed into the function!

In [49]:
concatenate_sequences(a='AGATAGATA', b='GATGGAGA')

'AGATAGATAGATGGAGA'

In [50]:
dict(a='AGATAGATA', b='GATGGAGA')

{'a': 'AGATAGATA', 'b': 'GATGGAGA'}

In [57]:
def concat_sequences(say_hi=False, **kwargs):
    """Concatenate the values of all keyword arguments into one string.

    If `say_hi` is true, "Hi." is printed before the sequences are joined.
    """
    if say_hi:
        print("Hi.")

    return ''.join(kwargs.values())

In [58]:
concat_sequences(say_hi=True, a='AGATAGATA', b='GATGGAGA')

Hi.


'AGATAGATAGATGGAGA'

In [59]:
seq_dict = dict(a='AGATAGATA', b='GATGGAGA')
seq_dict

{'a': 'AGATAGATA', 'b': 'GATGGAGA'}

In [60]:
concat_sequences(**seq_dict)

'AGATAGATAGATGGAGA'

In [61]:
seq_dict = dict(a='AGATAGATA', b='GATGGAGA', say_hi=True)
concat_sequences(**seq_dict)

Hi.


'AGATAGATAGATGGAGA'

In [62]:
def mean(data):
    """Compute the mean of a collection of numbers.

    Parameters
    ----------
    data : iterable of numbers
        A sized container (list, tuple, ...) or any other iterable, such as
        a generator.

    Returns
    -------
    The sum of the items divided by their count.

    Raises
    ------
    ZeroDivisionError
        If `data` is empty.
    """
    try:
        count = len(data)
    except TypeError:
        # Generators and other unsized iterables have no len(); materialize
        # them once so they can be both counted and summed.
        data = list(data)
        count = len(data)

    return sum(data) / count

In [63]:
mean([1, 2, 3, 4, 5])

3.0

In [64]:
mean([42, 3252, 71])

1121.6666666666667

In [66]:
# A mean is needed so frequently that a function for it must exist, even though it is not a built-in Python function.
# --> such functions exist in modules

# Packages and modules

In [67]:
import numpy

In [68]:
numpy.mean([1, 2, 3, 4, 5])

3.0

In [71]:
numpy.median?
# or check online documentation

Signature: numpy.median(a, axis=None, out=None, overwrite_input=False, keepdims=False)
Docstring:
Compute the median along the specified axis.

Returns the median of the array elements.

Parameters
----------
a : array_like
    Input array or object that can be converted to an array.
axis : {int, sequence of int, None}, optional
    Axis or axes along which the medians are computed. The default
    is to compute the median along a flattened version of the array.
    A sequence of axes is supported since version 1.9.0.
out : ndarray, optional
    Alternative output array in which to place the result. It must
    have the same shape and buffer length as the expected output,
    but the type (of the output) will be cast if n

In [73]:
import numpy as np
# more than 300 000 packages!

We can also avoid copying code (= bad) by importing packages we have built ourselves! See the website.

In [74]:
import jb_bootcamp

ModuleNotFoundError: No module named 'jb_bootcamp'

# File I/O

In [97]:
pwd # check if in the correct folder

'/Users/anaishaget/git/bootcamp'

In [89]:
import os
import glob

In [90]:
# Use a PDB (Protein Data Bank) file
# p53 is an essential protein; if it is mutated, we are very likely to get cancer

In [92]:
with open('data/1OLG.pdb', 'r') as f:
    print(type(f))
    
# This is the Pythonic way to open a file for reading ('r').
# Mode 'w' opens a file for writing, but it erases any existing content!
# f, the object we get when we open a file, is a TextIOWrapper.
# A file that has been opened must also be closed.
# A context manager is great for this: the `with` block closes the file automatically, as here!

<class '_io.TextIOWrapper'>


In [93]:
with open('data/1OLG.pdb', 'r') as f:
    f_str = f.read() # method to read a file and return it as a string

In [94]:
f_str[:1000]

'HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG              \nTITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION             \nTITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR                               \nCOMPND    MOL_ID: 1;                                                            \nCOMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);             \nCOMPND   3 CHAIN: A, B, C, D;                                                   \nCOMPND   4 ENGINEERED: YES                                                      \nSOURCE    MOL_ID: 1;                                                            \nSOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;                                   \nSOURCE   3 ORGANISM_COMMON: HUMAN;                                              \nSOURCE   4 ORGANISM_TAXID: 9606                                                 \nKEYWDS    ANTI-ONCOGENE                                                         \nEXPDTA    SOLUT

In [95]:
with open('data/1OLG.pdb', 'r') as f:
    f_list = f.readlines() # method to read a file and return it as a list of lines

In [96]:
f_list[:10]

['HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG              \n',
 'TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION             \n',
 'TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR                               \n',
 'COMPND    MOL_ID: 1;                                                            \n',
 'COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);             \n',
 'COMPND   3 CHAIN: A, B, C, D;                                                   \n',
 'COMPND   4 ENGINEERED: YES                                                      \n',
 'SOURCE    MOL_ID: 1;                                                            \n',
 'SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;                                   \n',
 'SOURCE   3 ORGANISM_COMMON: HUMAN;                                              \n']

In [98]:
f_list[0]

'HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG              \n'

In [99]:
f_list[0].rstrip()

'HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG'

In [113]:
# Print only the first 20 lines of the file.
with open('data/1OLG.pdb', 'r') as f:
    for i, line in enumerate(f):
        if i >= 20:
            # Stop reading here instead of scanning the rest of the file.
            break
        print(line.rstrip())

HEADER    ANTI-ONCOGENE                           13-JUN-94   1OLG
TITLE     HIGH-RESOLUTION SOLUTION STRUCTURE OF THE OLIGOMERIZATION
TITLE    2 DOMAIN OF P53 BY MULTI-DIMENSIONAL NMR
COMPND    MOL_ID: 1;
COMPND   2 MOLECULE: TUMOR SUPPRESSOR P53 (OLIGOMERIZATION DOMAIN);
COMPND   3 CHAIN: A, B, C, D;
COMPND   4 ENGINEERED: YES
SOURCE    MOL_ID: 1;
SOURCE   2 ORGANISM_SCIENTIFIC: HOMO SAPIENS;
SOURCE   3 ORGANISM_COMMON: HUMAN;
SOURCE   4 ORGANISM_TAXID: 9606
KEYWDS    ANTI-ONCOGENE
EXPDTA    SOLUTION NMR
AUTHOR    G.M.CLORE,J.G.OMICHINSKI,A.M.GRONENBORN
REVDAT   4   24-FEB-09 1OLG    1       VERSN
REVDAT   3   01-APR-03 1OLG    1       JRNL
REVDAT   2   08-MAR-95 1OLG    1       REMARK
REVDAT   1   26-JAN-95 1OLG    0
JRNL        AUTH   G.M.CLORE,J.G.OMICHINSKI,K.SAKAGUCHI,N.ZAMBRANO,
JRNL        AUTH 2 H.SAKAMOTO,E.APPELLA,A.M.GRONENBORN


In [101]:
# DO NOT RUN THIS: mode 'w' would obliterate yogi.txt if the file already exists!
# Write text to a file. No '\n' is written, so the three sentences run together.
with open('yogi.txt', 'w') as f:
    f.write('When you come to a fork in the road, take it.')
    f.write('You can observe a lot by just watching.')
    f.write('I never said most of the things I said.')

In [103]:
# For safety, first check that the file does not already exist:
if os.path.isfile('yogi.txt'):
    raise RuntimeError('File yogi.txt already exists.')
    
with open('yogi.txt', 'w') as f:
    f.write('When you come to a fork in the road, take it.')
    f.write('You can observe a lot by just watching.')
    f.write('I never said most of the things I said.')
    # No newline characters are written, so everything ends up on the same line
    # Everything on the same line

RuntimeError: File yogi.txt already exists.

In [104]:
# Refuse to overwrite an existing file.
if os.path.isfile('yogi.txt'):
    raise RuntimeError('File yogi.txt already exists.')

# Each quote ends with its own newline, so each lands on its own line.
with open('yogi.txt', 'w') as f:
    f.write(
        'When you come to a fork in the road, take it.\n'
        'You can observe a lot by just watching.\n'
        'I never said most of the things I said.\n'
    )

In [109]:
outfile = 'atoms_chain_A.txt'

# Refuse to overwrite an existing output file.
if os.path.isfile(outfile):
    # Report the actual file name, not the literal word "outfile".
    raise RuntimeError('File {} already exists.'.format(outfile))

with open('data/1OLG.pdb', 'r') as f, open(outfile, 'w') as f_out:
    for line in f:
        # Keep the ATOM records of chain A. The chain identifier is read from
        # index 21 of the line (column position taken from the PDB specification).
        if len(line) > 21 and line.startswith('ATOM') and line[21] == 'A':
            f_out.write(line)

In [112]:
!head -10 atoms_chain_A.txt

ATOM      1  N   LYS A 319      18.634  25.437  10.685  1.00  4.81           N  
ATOM      2  CA  LYS A 319      17.984  25.295   9.354  1.00  4.32           C  
ATOM      3  C   LYS A 319      18.160  23.876   8.818  1.00  3.74           C  
ATOM      4  O   LYS A 319      19.259  23.441   8.537  1.00  3.67           O  
ATOM      5  CB  LYS A 319      18.609  26.282   8.371  1.00  4.67           C  
ATOM      6  CG  LYS A 319      18.003  26.056   6.986  1.00  5.15           C  
ATOM      7  CD  LYS A 319      16.476  26.057   7.091  1.00  5.90           C  
ATOM      8  CE  LYS A 319      16.014  27.341   7.784  1.00  6.51           C  
ATOM      9  NZ  LYS A 319      16.388  28.518   6.952  1.00  7.33           N  
ATOM     10  H1  LYS A 319      18.414  24.606  11.281  1.00  5.09           H  
