# Lesson 3 - 2024/10/24

How (not) to count:

In [2]:
def manual_count_v1(seq, character):
    """Count the occurrences of ``character`` in ``seq`` using a list of flags.

    One boolean is stored per element of ``seq`` (True on a match, False
    otherwise); the flags are summed at the end.
    """
    flags = []
    for symbol in seq:
        if symbol == character:
            flags.append(True)
        else:
            flags.append(False)
    return sum(flags)

def manual_count_v2(seq, character):
    """Count the occurrences of ``character`` in ``seq``.

    Only matching elements leave a ``True`` marker in the list, so the sum
    of the markers equals the number of matches.
    """
    matches = []
    for symbol in seq:
        if symbol != character:
            continue
        matches.append(True)
    return sum(matches)

def manual_count_v3(seq, character):
    """Count the occurrences of ``character`` in ``seq`` with a plain counter.

    No intermediate list is built: an integer is incremented on each match.
    """
    tally = 0
    for symbol in seq:
        if symbol != character:
            continue
        tally += 1
    return tally

How to generate random strings:

In [3]:
import random

def generate_string(n, alphabet):
    """Return a random string of ``n`` symbols drawn from ``alphabet``.

    Each symbol is picked independently with ``random.choice``, so symbols
    may repeat. ``alphabet`` must be a non-empty sequence whenever ``n`` is
    positive; with ``n == 0`` the empty string is returned.
    """
    # Join the picks once instead of growing the string with ``+=`` on every
    # iteration; ``random.choice`` is still called ``n`` times in order.
    return "".join(random.choice(alphabet) for _ in range(n))

The `range()` function returns an object that produces a sequence of integers.

In [4]:
generate_string(10, 'ATCG')

'TATTATTGAT'

### List Comprehension
Building a list with the aid of a <code>for</code> loop can often be condensed to a single line by using <strong>list comprehensions</strong>.

The list comprehension is a construct which returns lists:<br/>

<center><code>[expr for element in iterable]</code></center><br/>

It consists of square brackets containing an expression followed by a <code>for</code> clause: the expression is evaluated once <code>for</code> each element produced by iterating over the iterable.

In [5]:
my_list = [c for c in 'TAATA']

my_list

['T', 'A', 'A', 'T', 'A']

In [6]:
my_list = []
for c in 'TAATA':
    my_list.append(c)

my_list

['T', 'A', 'A', 'T', 'A']

Another example:

In [7]:
['1' + c + '2' for c in 'TAATA']

['1T2', '1A2', '1A2', '1T2', '1A2']

Using the list comprehension in the `manual_count_v1` function, we get:

In [8]:
def manual_count_v1_lc(seq, character):
    """Count the occurrences of ``character`` in ``seq``.

    Same idea as the flag-list version, with the list of booleans built by
    a list comprehension instead of an explicit loop.
    """
    return sum([symbol == character for symbol in seq])

Is it just a question of readability? List comprehensions provide a concise way to create lists, but not only that.

In [9]:
def count(seq, character):
    """Count the occurrences of ``character`` by delegating to ``seq.count``."""
    occurrences = seq.count(character)
    return occurrences

In [10]:
from time import time

dna = generate_string(2000000, 'ACGT')

# Counting implementations to compare, in the order their timings are stored.
candidates = [manual_count_v1, manual_count_v2, manual_count_v3, manual_count_v1_lc, count]

timings = []
for counter in candidates:
    # Time a single call of each implementation on the same sequence.
    start = time()
    result = counter(dna, 'A')
    elapsed = time() - start

    timings.append(elapsed)

    print('{}, {} sec'.format(result, elapsed))

499746, 0.053919315338134766 sec
499746, 0.0325777530670166 sec
499746, 0.030370712280273438 sec
499746, 0.04849529266357422 sec
499746, 0.0033903121948242188 sec


In [11]:
print('manual_count_v1 vs. manual_count_v1_lc: {:.2f}'.format(timings[0]/timings[3]))

manual_count_v1 vs. manual_count_v1_lc: 1.11


In [12]:
print('manual_count_v3 vs. count: {:.2f}'.format(timings[2]/timings[-1]))

manual_count_v3 vs. count: 8.96


A helper function to measure the average execution time; its argument `trials` has a default value.

In [13]:
def get_counts_and_time(function, seq, trials=10):
    """Call ``function(seq)`` ``trials`` times and report the average duration.

    Args:
        function: Callable taking ``seq`` as its only argument.
        seq: The value passed to ``function`` on every trial.
        trials: Number of timed calls; must be at least 1.

    Returns:
        A tuple ``(counts, average)`` where ``counts`` is the value returned
        by the last call and ``average`` is the mean time per call, in
        seconds.

    Raises:
        ValueError: If ``trials`` is smaller than 1. Without this check
            there would be no result to return and nothing to average.
    """
    # Imported locally so this block does not depend on the file's imports;
    # perf_counter is the clock intended for measuring short durations.
    from time import perf_counter

    if trials < 1:
        raise ValueError('trials must be at least 1, got {}'.format(trials))

    total_time = 0.0
    for _ in range(trials):
        t0 = perf_counter()
        counts = function(seq)
        total_time += perf_counter() - t0

    return counts, total_time / trials

### Dictionary (II)
Count how many As, Ts, Cs, and Gs there are in a sequence.

In [14]:
def count_all_bases_v1(seq):
    """Return the counts of A, T, C and G in ``seq`` as a list, in that order.

    ``manual_count_v1_lc`` is called once per base, so ``seq`` is scanned
    four times.
    """
    bases = ['A', 'T', 'C', 'G']
    return [manual_count_v1_lc(seq, base) for base in bases]

get_counts_and_time(count_all_bases_v1, dna)

([499746, 500040, 499585, 500629], 0.17413153648376464)

The function reads the whole sequence 4 times. Ideally, it should count the 4 bases simultaneously, in a single pass.

In [15]:
def count_all_bases_v2(seq):
    """Return the counts of A, T, C and G in ``seq`` as a tuple, in that order.

    The sequence is read once; every base has its own counter, selected by
    a chain of ``if`` tests. Any other symbol is ignored.
    """
    As = Ts = Cs = Gs = 0

    for base in seq:
        if base == 'A':
            As += 1
        elif base == 'T':
            Ts += 1
        elif base == 'C':
            Cs += 1
        elif base == 'G':
            Gs += 1

    totals = (As, Ts, Cs, Gs)
    return totals

get_counts_and_time(count_all_bases_v2, dna)

((499746, 500040, 499585, 500629), 0.07060825824737549)

The series of <code>if</code> tests is somewhat cumbersome, especially if we want to extend the code to other bioinformatics problems where the alphabet is larger (amino acids).

<center><pre>
'A' --> As += 1
'T' --> Ts += 1
'C' --> Cs += 1
'G' --> Gs += 1
</pre></center>

We need a better way <strong>to map</strong> a specific base to a specific counter.

In [16]:
# Lists' arithmetic
counts = [0]*4

counts

[0, 0, 0, 0]

The operators (such as <code>+</code> and <code>*</code>) have a meaning that depends on the objects to which they are applied.

How to map each base to its counter?

In [18]:
'ATCG'.index('C')

2

The `index()` method returns the index of the first occurrence of a substring inside the string, and raises a `ValueError` if the substring is not found.

In [17]:
def count_all_bases_v3(seq):
    """Return the counts of A, T, C and G in ``seq`` as a list, in that order.

    The counter of each base lives at the position the base has in the
    string ``'ATCG'``; that position is looked up with ``str.index`` for
    every element of ``seq``.
    """
    counts = [0] * 4

    for base in seq:
        counts['ATCG'.index(base)] += 1

    return counts

get_counts_and_time(count_all_bases_v3, dna)

([499746, 500040, 499585, 500629], 0.15538487434387208)

The <code>index</code> method is called for each base! What we want is a mapping from <code>base</code>, which is a character, to the corresponding index 0, 1, 2, or 3. A <strong>dictionary</strong> may represent such mappings.

In [18]:
base2index = {
    'A': 0,
    'T': 1,
    'C': 2,
    'G': 3
}

print('G', base2index['G'])
print('G', base2index.get('G', 'not found'))

G 3
G 3


In [21]:
print('U', base2index['U'])

KeyError: 'U'

In [22]:
print('U', base2index.get('U', 'not found'))

U not found


How to add an element:

In [23]:
base2index['U'] = 99

print('U', base2index.get('U', 'not found'))

U 99


In [24]:
print(base2index.keys())
print(base2index.values())

dict_keys(['A', 'T', 'C', 'G', 'U'])
dict_values([0, 1, 2, 3, 99])


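The main characteristic of a dictionary is that it stores values of <strong>arbitrary</strong> type, <strong>indexed</strong> by keys rather than by position (since Python 3.7 the keys keep their insertion order, as the output above shows).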

In [22]:
def count_all_bases_v4(seq):
    """Return the counts of A, T, C and G in ``seq`` as a list, in that order.

    A dictionary maps each base to the position of its counter, replacing
    the ``str.index`` search with a key lookup.
    """
    base2index = {'A': 0, 'T': 1, 'C': 2, 'G': 3}
    counts = [0] * 4

    for base in seq:
        counts[base2index[base]] += 1

    return counts

get_counts_and_time(count_all_bases_v4, dna)

([499746, 500040, 499585, 500629], 0.07595083713531495)

A dictionary may represent the counters.

In [21]:
def count_all_bases_v5(dna):
    """Return a dictionary mapping each of A, T, C and G to its count in ``dna``.

    The dictionary itself holds the counters, so no separate index mapping
    is needed.
    """
    counts = {'A': 0, 'T': 0, 'C': 0, 'G': 0}

    for symbol in dna:
        counts[symbol] = counts[symbol] + 1

    return counts

get_counts_and_time(count_all_bases_v5, dna)

({'A': 499746, 'T': 500040, 'C': 499585, 'G': 500629}, 0.07698302268981934)

### Set

How many proteins share the DNA replication and gene expression pathways?

In [27]:
# UniprotId from REACTOME
DNA_replication_proteins = ['P28340', 'P49005', 'Q15054', 'Q9HCU8', 'P49642', 'P09884', 'Q14181', 'P49643', 'P12004', 'P35250', 'P35251', 'P40937', 'P35249', 'P40938', 'P18858', 'P39748', 'P27694', 'P35244', 'P15927', 'P51530', 'Q9UJA3', 'O75419', 'Q9BRT9', 'Q9BRX5', 'Q14691', 'Q9Y248', 'P25205', 'P33991', 'P33992', 'Q14566', 'P33993', 'P49736', 'P56282', 'Q07864', 'Q9NRF9', 'Q9NR33', 'P20248', 'P78396', 'P24941', 'Q9H211', 'Q9UBD5', 'O43913', 'O43929', 'Q13416', 'Q9Y5N6', 'Q13415', 'Q99741', 'P63208', 'Q13309', 'P62877', 'Q13616', 'P0CG48', 'P0CG47', 'P62987', 'P62979', 'O43242', 'P60900', 'Q13200', 'P28066', 'Q9UNM6', 'P25789', 'O00232', 'P25788', 'O00231', 'P25787', 'P60896', 'O75832', 'P25786', 'Q99460', 'O00487', 'P62333', 'P62195', 'P43686', 'P17980', 'P35998', 'P62191', 'P28065', 'P28062', 'A5LHX3', 'Q14997', 'Q99436', 'P28072', 'Q8TAA3', 'Q92530', 'P61289', 'P28074', 'Q9UL46', 'P28070', 'Q06323', 'O00233', 'P49720', 'P48556', 'P49721', 'P51665', 'Q15008', 'P40306', 'Q16401', 'P20618', 'P55036', 'O14818', 'Q9UM11', 'Q9UJX3', 'Q9UJX4', 'Q8NHZ8', 'O00762', 'Q9NYG5', 'Q13042', 'Q9H1A4', 'P60006', 'Q96DE5', 'P51965', 'Q9UJX6', 'Q9UJX5', 'P30260', 'Q9UJX2', 'Q9UM13', 'P51668', 'P24864', 'O96020', 'Q14209', 'Q01094', 'O00716', 'O75496', 'Q9UBU7', 'O00311', 'Q7L590', 'Q13156']
gene_expression_proteins = ['Q14493', 'P52298', 'Q09161', 'Q8WTR7', 'P62304', 'P62306', 'Q969L4', 'P62308', 'P14678', 'P83369', 'P62318', 'Q92989', 'O94913', 'O43809', 'Q8N684', 'Q01130', 'P84103', 'Q13242', 'O95391', 'P26368', 'Q15287', 'Q9BZI7', 'Q86V81', 'Q9Y5S9', 'P61326', 'Q96A72', 'P38919', 'O15234', 'Q8IYB3', 'Q05048', 'P33240', 'Q9H0L4', 'Q12996', 'Q86U42', 'O95639', 'Q92797', 'Q10570', 'Q9P2I0', 'Q9UKF6', 'Q9C0J8', 'Q6UN15', 'P51003', 'Q08170', 'Q01081', 'Q8WU68', 'Q05519', 'Q13247', 'Q9BY77', 'Q96QD9', 'Q9P127', 'O75152', 'Q9Y3Y2', 'P82979', 'Q13838', 'O00148', 'Q13769', 'Q96J01', 'Q8NI27', 'Q96FV9', 'Q6I9Y2', 'Q86W42', 'Q16629', 'Q13243', 'Q92620', 'Q07955', 'O60508', 'Q00403', 'P52655', 'P52657', 'Q5VWG9', 'Q6P1X5', 'Q9HBM6', 'Q92750', 'Q15543', 'Q16514', 'Q15544', 'Q12962', 'Q5H9L4', 'Q15545', 'P49848', 'Q16594', 'P20226', 'Q15542', 'O00268', 'Q8IZX4', 'P21675', 'Q92804', 'P29084', 'P29083', 'P35269', 'P13984', 'P30876', 'P19388', 'P53803', 'P19387', 'P62875', 'P52434', 'P61218', 'O15514', 'P36954', 'P52435', 'P62487', 'P24928', 'Q92759', 'P19447', 'Q13889', 'Q13888', 'P32780', 'Q6ZYL4', 'P50613', 'P51948', 'P51946', 'P18074', 'O60942', 'O00267', 'O43148', 'Q08945', 'Q9Y5B9', 'Q8IXH7', 'P18615', 'Q9H3P2', 'Q8WX92', 'P55199', 'P63272', 'Q9Y5B0', 'Q96JC9', 'Q9UHB7', 'P42568', 'P50750', 'O60563', 'O75909', 'O60583', 'Q96CJ1', 'Q03111', 'Q7KZ85', 'P23193', 'Q6PD62', 'Q92541', 'Q8N7H5', 'Q6P1J9', 'Q8WVC0', 'Q9GZS3', 'Q96ST2', 'Q3SY89', 'A6NLF2', 'Q14241', 'Q8NG57', 'Q8IYF1', 'Q15370', 'Q15369', 'Q7Z7C8', 'Q5SXM2', 'Q13487', 'O75971', 'Q92966', 'Q16533', 'P09086', 'P14859', 'P08047', 'P52747', 'Q8IXW5', 'Q96P16', 'Q9NQG5', 'Q5VT52', 'O00472', 'Q9HB65', 'Q8N5P1', 'Q659A1', 'Q9Y2F5', 'Q96HW7', 'Q96CB8', 'Q9NV88', 'Q68E01', 'Q9NVH2', 'Q9NVM9', 'Q8N201', 'Q96SY0', 'Q5TA45', 'Q96AH0', 'Q9BQ15', 'Q75QN2', 'Q9UL03', 'Q9NVR2', 'Q6P9B9', 'Q9H0H0', 'Q9H814', 'Q9BXP5', 'Q9NP77', 'P24863', 'Q9UHV7', 'O75586', 'O75448', 'Q9Y2X0', 'O43513', 'Q96HR3', 'Q9Y3C7', 'Q9H944', 
'Q6P2C8', 'Q9NVC6', 'Q9NPJ6', 'Q93074', 'Q9BTT4', 'O60244', 'P49336', 'Q15648', 'Q9ULK4', 'O75461', 'Q14186', 'Q14188', 'P38398', 'O14757', 'Q96KQ7', 'Q9H9B1', 'Q8IWI9', 'P61244', 'Q99496', 'Q06587', 'Q969R5', 'Q9BYE7', 'Q8IY57', 'Q13185', 'P31350', 'Q01094', 'Q9H2F5', 'Q15022', 'Q15910', 'O75530', 'Q16576', 'Q09028', 'O00311', 'P35226', 'P35227', 'P78364', 'Q8NDX5', 'Q8N488', 'Q9UBK9', 'P45973', 'Q06609', 'O14727', 'Q99708', 'Q13951', 'Q01196', 'P46937', 'P17542', 'Q99081', 'P15923', 'P25791', 'P25800', 'Q86U70', 'P23771', 'P10242', 'Q96J02', 'O15350', 'P0CG48', 'P0CG47', 'P62987', 'P62979', 'Q99879', 'Q96A08', 'P58876', 'Q93079', 'Q16778', 'P23527', 'P57053', 'Q99877', 'P62807', 'P06899', 'P33778', 'O60814', 'Q99880', 'Q8N257', 'P84243', 'P68431', 'Q71DI3', 'P0C5Y9', 'Q71UI9', 'P0C0S5', 'P20671', 'P16104', 'Q9BTM1', 'Q16777', 'Q93077', 'P04908', 'Q6FI13', 'Q99878', 'P62805', 'P17947', 'Q03164', 'P23769', 'P15976', 'P00519', 'O43242', 'P60900', 'Q13200', 'P28066', 'Q9UNM6', 'P25789', 'O00232', 'P25788', 'O00231', 'P25787', 'P60896', 'O75832', 'P25786', 'Q99460', 'O00487', 'P62333', 'P62195', 'P43686', 'P17980', 'P35998', 'P62191', 'P28065', 'P28062', 'A5LHX3', 'Q14997', 'Q99436', 'P28072', 'Q8TAA3', 'Q92530', 'P61289', 'P28074', 'Q9UL46', 'P28070', 'Q06323', 'O00233', 'P49720', 'P48556', 'P49721', 'P51665', 'Q15008', 'P40306', 'Q16401', 'P20618', 'P55036', 'O14818', 'O15169', 'P03372', 'Q8NC69', 'Q9HCL2', 'Q8NDV7', 'Q9UPQ9', 'Q9HCJ0', 'Q9HCE1', 'Q9UL18', 'Q9HCK5', 'Q9H9G7', 'P30279', 'P24385', 'P30281', 'P12931', 'Q06124', 'Q9UKV8', 'P29590', 'Q00534', 'P32519', 'P42702', 'P08700', 'P05771', 'P04141', 'Q92793', 'P17931', 'Q13950-2', 'Q13950-1', 'P51451', 'Q15723', 'Q02548', 'Q9BZS1', 'Q13469', 'P60568', 'P16410', 'Q9Y5U5', 'P01589', 'P17927', 'P01579', 'Q07157', 'Q16625', 'O00501', 'Q9H2X6', 'Q12824', 'Q8TAQ2', 'Q92922', 'P51532', 'P51531', 'Q68CP9', 'Q96GM5', 'Q86U86', 'Q92925', 'Q6STE5', 'Q8NFD5', 'O96019', 'O14497', 'Q969G3', 'O94805', 'Q09472', 'Q96GD3-2', 
'Q8IXK0', 'Q14781', 'Q9HC52', 'O00257', 'O95503', 'Q86SE9', 'Q8WXX7', 'P19784', 'P68400', 'P67870', 'P07359', 'O75182', 'Q96ST3', 'Q96LA8', 'Q13547', 'Q9UBL3', 'Q15291', 'P61964', 'Q9C005', 'Q9UPS6', 'O14686', 'O15047', 'Q9UMN6', 'Q8NEZ4', 'Q8IZD2', 'Q92831', 'Q99873', 'Q8IX07', 'P08514', 'Q92570', 'P02776', 'P07996', 'Q16621', 'Q04759', 'P24844', 'Q9UIV8', 'P43235', 'O60911', 'P07711', 'O14543', 'Q8WXH5', 'Q9BXY4', 'P23511', 'P25208', 'Q13952', 'P98177', 'Q12778', 'O43524', 'Q99967', 'Q15831', 'P41182', 'Q9BXH1', 'P35638', 'Q9BXM7', 'O43521', 'P48023', 'Q08999', 'Q13485', 'P84022', 'Q15796', 'Q16589', 'P55316', 'P38936', 'P57723', 'O43474', 'P24522', 'Q03135', 'P46527', 'O14793', 'P62324', 'O00253', 'Q9HCM2', 'A8MYZ6', 'P04179', 'P54252', 'P35557', 'Q92769', 'P01189', 'P04150', 'Q9NTG7', 'P01308', 'Q9UBK2', 'Q969P5', 'P35558', 'P01303', 'Q8N139', 'Q9HD89', 'P08833', 'P36956', 'Q969Q1', 'P35575', 'P04040', 'P31751', 'Q9Y243', 'P31749', 'P27348', 'P63104', 'P31947', 'P61981', 'P31946', 'Q96EB6', 'P10599', 'Q9H3M7', 'O75376', 'Q9Y618', 'P04150-2', 'Q14994-2', 'P04150-3', 'P51843-2', 'Q96RI1-3', 'Q15406-3', 'Q92570-2', 'Q15406-5', 'P04150-7', 'O75469-5', 'O00482-2', 'O75469-3', 'O00482-3', 'Q96RI1-2', 'O75469-6', 'O75469-4', 'P08235-2', 'Q15406-4', 'O75469-2', 'Q9Y5X4-2', 'Q15406-2', 'P08235-4', 'Q13133-2', 'P06401-2', 'P04150-6', 'P04150-8', 'Q96RI1-4', 'O75469-7', 'P04150-5', 'Q96F24-2', 'P08235-3', 'P04150-9', 'P04150-4', 'P55055', 'Q14995', 'P35398', 'Q13133-1', 'Q07869', 'P04150-1', 'Q14994-1', 'Q9Y5X4-1', 'Q14541', 'P13056', 'P06401', 'P22736', 'P49116', 'Q86WQ0', 'Q9Y466', 'Q9UHY1', 'P19793', 'P11474', 'P20393', 'P62508', 'Q15406-1', 'O75469-1', 'P10588', 'P10276', 'Q92731', 'P41235', 'P28702', 'Q96F24-1', 'P13631', 'P10275', 'P37231', 'O95718', 'O00482-1', 'P48443', 'P11473', 'P10589', 'Q92753', 'Q13285', 'Q96RI1-1', 'P51843-1', 'P08235-1', 'P10827', 'P10826', 'P10828', 'Q03181', 'Q92570-1', 'P51449', 'P43354', 'Q15466', 'O15297', 'Q8WVJ9', 'Q15672', 'P08648', 
'O95965', 'P45452', 'Q9UBN7', 'P48436', 'P42224', 'Q86YN6', 'P63208', 'Q13309', 'P62877', 'Q13616', 'Q9HCE7', 'Q5T1R4', 'Q9H0M0', 'P49841', 'Q969H0-1', 'Q9UNE7', 'P56179', 'P56178', 'P12643', 'P78367', 'P35548', 'P11802', 'Q07812', 'P06493', 'P14635', 'O43541', 'Q15797', 'Q96K83', 'O15379', 'Q9GZV5', 'P06400', 'Q8TDD2', 'P02818', 'O75444', 'P02452', 'P28482', 'P27361', 'Q9UPW6', 'P07947', 'Q8WVF2', 'Q9Y5J3', 'Q9UBP5', 'Q14469', 'P10071', 'Q14623', 'P10070', 'P56524', 'Q96PK6', 'P61296', 'Q99594', 'Q15561', 'Q15562', 'P28347', 'Q99593', 'P29279', 'P01160', 'P52952', 'P43694', 'Q86Z02', 'Q13263', 'Q99676', 'Q5HYK9', 'Q9Y6Q3', 'P17014', 'Q8TF47', 'Q6ZS27', 'P98182', 'Q5SXM1', 'Q15928', 'Q96JC4', 'Q6ZMY9', 'Q5MCW4', 'Q8N988', 'Q6P280', 'Q9Y2H8', 'Q8N9F8', 'Q9HAH1', 'Q8TF39', 'Q8TA94', 'Q9ULM2', 'Q8IVP9', 'Q8WV37', 'Q9H707', 'Q9BX82', 'Q08ER8', 'Q96H40', 'Q969W8', 'Q9Y2A4', 'Q8NEP9', 'Q96IT1', 'Q7Z340', 'Q86TJ5', 'Q5JVG2', 'Q8TB69', 'Q9NWS9', 'Q8TBZ8', 'O60304', 'Q96NI8', 'Q5VIY5', 'Q96K75', 'Q3ZCX4', 'Q6NSZ9', 'Q9H7X3', 'Q8N859', 'Q9H963', 'Q9NV72', 'Q8N1W2', 'Q9UC06', 'Q6ZNC4', 'Q8NEM1', 'P17021', 'Q8N3J9', 'Q15776', 'Q96BR6', 'P52738', 'Q8TD23', 'P17019', 'Q6AZW8', 'Q96K58', 'O95780', 'Q8IYX0', 'Q96N22', 'Q8TAW3', 'Q5T5D7', 'Q8NCK3', 'Q9BR84', 'Q03936', 'P59923', 'Q86UE3', 'Q86XU0', 'O95201', 'Q9P255', 'Q6V9R5', 'Q5JUW0', 'P17022', 'Q6ZR52', 'Q8NDQ6', 'Q9H7R5', 'P51786', 'Q96MR9', 'Q8NHY6', 'Q3MIS6', 'Q9BS34', 'Q12901', 'Q7Z398', 'P0C7X2', 'Q06732', 'Q6P9A3', 'Q5JVG8', 'Q6NX49', 'Q9NZV7', 'Q8N184', 'Q8N7Q3', 'Q6P2D0', 'Q96NG5', 'Q96CS4', 'Q96PE6', 'Q8N9K5', 'Q8NEK5', 'Q8N587', 'P17023', 'P52744', 'Q6P9A1', 'Q9Y473', 'Q9H7S9', 'Q9P0L1', 'A8MWA4', 'P52737', 'Q5VV52', 'Q9Y2L8', 'Q5TEC3', 'Q6ZN57', 'Q9NQZ8', 'P52742', 'Q9BU19', 'Q9UJW8', 'A8MVS1', 'Q15973', 'Q3SXZ3', 'Q14929', 'Q9H0M5', 'P21506', 'P0CH99', 'Q9HCG1', 'Q9Y462', 'P17029', 'Q96N38', 'O95125', 'Q8N972', 'A2RRD8', 'Q6ZN79', 'A8MXY4', 'Q32M78', 'Q8IZC7', 'A6NP11', 'P52736', 'O43345', 'Q96C28', 'Q8IYN0', 
'Q9Y5V0', 'O14709', 'A8MUZ8', 'Q13106', 'Q9BY31', 'P17017', 'Q9BSG1', 'O14628', 'Q8NC26', 'O75820', 'P17024', 'Q3SY52', 'Q5TYW1', 'Q9BS31', 'Q96N77', 'Q9BSK1', 'Q86UQ0', 'Q96I27', 'Q7Z3V5', 'Q96LX8', 'Q86T29', 'Q8WXB4', 'Q7L945', 'Q8N8J6', 'Q08AN1', 'Q8IYB9', 'Q9NXT0', 'Q96NG8', 'Q4V348', 'Q6P3V2', 'Q6ZNG0', 'Q9UJL9', 'Q6ZMW2', 'Q8N393', 'Q86XN6', 'P10072', 'Q6AW86', 'Q6ECI4', 'A8K8V0', 'Q8TAF7', 'Q75MW2', 'Q96SQ5', 'Q6PF04', 'Q96SK3', 'Q8IVC4', 'Q8N9Z0', 'Q8TC21', 'Q96NL3', 'Q68DY1', 'Q8N823', 'Q8N883', 'Q8N720', 'Q86YE8', 'Q49AA0', 'Q9P2J8', 'Q6ZNG1', 'Q52M93', 'Q8N2I2', 'Q6ZSS3', 'Q96ND8', 'O94892', 'Q8IZ26', 'Q8TAQ5', 'Q9HCZ1', 'Q9BWM5', 'Q86V71', 'Q8TD17', 'P17032', 'Q8N8Z8', 'Q9H8G1', 'Q9UK10', 'Q9UL58', 'Q9UK13', 'Q9UK12', 'Q9UL59', 'Q9NZL3', 'O14771', 'Q9UK11', 'Q9UDV6', 'Q13398', 'Q8TAU3', 'Q9C0F3', 'O60765', 'Q6P1L6', 'Q96LW1', 'Q09FC8', 'Q9H7R0', 'Q8TF45', 'Q9NX65', 'Q8TF32', 'Q9Y3M9', 'P17038', 'Q9NQX6', 'Q8IYI8', 'Q14584', 'Q96HQ0', 'Q14588', 'Q06730', 'Q9NYT6', 'Q8NA42', 'Q9Y2Q1', 'Q8N7K0', 'Q9UJU3', 'Q6IV72', 'O43296', 'Q9GZX5', 'P17030', 'Q86Y25', 'Q8NDP4', 'Q14590', 'Q96SE7', 'P17031', 'Q9BUY5', 'Q96JL9', 'Q53GI3', 'P51814', 'Q96SR6', 'Q6PK81', 'O75437', 'Q9Y2P7', 'O75346', 'Q9UIE0', 'Q8NDW4', 'Q86WZ6', 'P15622', 'Q96GE5', 'P17027', 'Q6PG37', 'Q14586', 'Q3KQV3', 'O14978', 'P51815', 'A6NK53', 'Q96N20', 'Q969J2', 'Q9BRR0', 'P17035', 'Q96PQ6', 'Q9HBT7', 'Q9UDV7', 'Q96GC6', 'Q5JNZ3', 'Q96RE9', 'Q9NR11', 'P17036', 'O75467', 'Q9HCX3', 'Q9HBT8', 'P17039', 'Q14587', 'Q14593', 'Q96NJ3', 'Q14592', 'Q6ZN11', 'Q9ULD5', 'Q6IQ21', 'Q3KP31', 'Q68DI1', 'Q92670', 'Q15937', 'Q6NX45', 'Q68DY9', 'Q96BV0', 'Q15935', 'Q32MQ0', 'Q96MU6', 'Q7L3S4', 'A5PL33', 'Q9NQW5', 'Q96H86', 'Q9HCL3', 'Q9Y2G7', 'Q6NUN9', 'A4D1E1', 'A8MTY0', 'Q9BV97', 'O43361', 'Q8NE65', 'B4DX44', 'P0CB33', 'A8MUV8', 'O75373', 'B4DXR9', 'Q6ZMV8', 'Q16587', 'A8K0R7', 'Q8TF20', 'A6NHJ4', 'Q8NDX6', 'A6NN14', 'Q7Z2F6', 'A6NNF4', 'Q15940', 'A6NDX5', 'Q02386', 'Q13875', 'P51608-2', 'P51608-1', 'Q06413', 
'Q9NWB1', 'P16220', 'P48431', 'P42858', 'Q14739', 'P17612', 'P0DP23', 'Q16566', 'Q13227', 'Q9BZK7', 'O60907', 'Q96GD4', 'Q13557', 'Q13555', 'Q13554', 'Q9UQM7', 'Q14353', 'P51617', 'P23560', 'P61278', 'P06850', 'O00548', 'Q05329', 'Q99259', 'O00141', 'P20472', 'P41145', 'P08581', 'Q7Z2K8', 'P18031', 'P29074', 'Q13224', 'P11169', 'Q13507', 'Q12879', 'P42262', 'P35372', 'Q13451', 'Q8WYQ5', 'P60484', 'Q13761', 'Q9NQB0', 'Q9HCS4', 'Q9UJU2', 'P35222', 'P36402', 'P01106', 'Q00987', 'Q8N726', 'Q9HAU4', 'P01137', 'P25440', 'P01116', 'P46531', 'Q06330', 'Q13573', 'Q13495', 'Q8IZL2', 'Q92585', 'Q96JK9', 'Q92830', 'P78504', 'Q15911', 'P04637', 'P10451', 'P13612', 'P20701', 'P51449-2', 'Q9NQ88', 'Q13131', 'P54646', 'Q9Y478', 'O43741', 'Q9UGJ0', 'P54619', 'Q9UGI9', 'Q92574', 'P49815', 'Q9NX09', 'Q04917', 'P62258', 'Q8N122', 'P42345', 'Q9BVC4', 'Q8NBW4', 'O43504', 'Q6IAA8', 'Q9Y2Q5', 'Q9UHA4', 'Q0VGL1', 'Q9HB90', 'Q9NQL2', 'Q7L523', 'Q5VZM2', 'Q15382', 'Q16881', 'P18283', 'P30044-2', 'Q06830', 'P32119', 'O43819', 'Q7LG56', 'P11413', 'Q9H3D4', 'O94925', 'Q9UI32', 'P58005', 'P58004', 'Q9Y6P5-1', 'Q9Y6P5-3', 'P09669', 'P14854', 'O00483', 'P00403', 'P12074', 'P15954', 'P10606', 'P13073', 'O14548', 'P20674', 'P24311', 'P00395', 'P00414', 'P10176', 'P99999', 'Q8N8Q8', 'Q96I36', 'Q9P0S2', 'Q5RI15', 'Q9Y6N1', 'Q3E731', 'Q9BSH4', 'O75880', 'Q15526', 'P42704', 'P06744', 'P00390-2', 'Q13625', 'Q96KQ4', 'O14763', 'O00220', 'O14798', 'Q9UBN6', 'Q86XT9', 'P17936', 'P25445', 'O43715', 'Q96N28', 'Q9Y255', 'Q9HCN2', 'Q13794', 'Q9BRQ8', 'Q96A56', 'Q13315', 'P55957', 'O60238', 'Q658P3', 'Q9BZR8', 'Q96FX8', 'Q92597', 'Q53FA7', 'Q92696', 'P53611', 'P24386', 'O15392', 'Q9NPP4', 'P29466', 'P55212', 'Q9HB75', 'P42575', 'P78560', 'Q92851', 'Q86X55', 'P12004', 'O14965', 'Q96PM9', 'P30307', 'Q16254', 'P28749', 'Q9H4B4', 'O95628', 'Q9H9A5', 'A5YKK6', 'O75175', 'Q9NZN8', 'Q9UIV1', 'Q9UFF9', 'Q92600', 'Q9UKZ1', 'Q9ULM6', 'Q96LI5', 'Q9C0C2', 'P78543', 'Q9UM63', 'Q9NYY3', 'P06748', 'Q9H4X1', 'Q9HC77', 'Q99856', 
'A0AVK6', 'Q96AV8', 'P24864', 'O96020', 'P24941', 'P20248', 'P78396', 'Q9BVI0', 'Q01851', 'Q8WUF5', 'Q12837', 'Q8N9N5', 'Q8WYH8', 'Q92794', 'Q9HAF1', 'O95696', 'Q9ULD4', 'P55201', 'Q9NPI1', 'Q9H160', 'P78356', 'Q8TBX8', 'P48426', 'Q13526', 'P52564', 'Q86T03', 'Q86YP4', 'Q8WXI9', 'O95983', 'O94776', 'Q12873', 'Q14839', 'Q00535', 'Q15078', 'O60285', 'O96017', 'Q9Y3T9', 'Q8IW41', 'Q16539', 'Q15759', 'Q9ULW0', 'Q92630', 'P35250', 'O75943', 'P40937', 'P35249', 'P40938', 'Q9BSD3', 'Q92547', 'O60921', 'Q99638', 'Q6WBX8', 'O60671', 'Q9BX63', 'P51530', 'P54132', 'Q13472', 'Q96E14', 'Q9H9A7', 'Q14191', 'Q9UQ84', 'O60934', 'P49959', 'Q92878', 'Q92993', 'Q99728', 'P27694', 'P35244', 'P15927', 'Q8WXE1', 'Q13535', 'O15151', 'Q96S44', 'Q93009', 'Q8WZ73', 'Q969K3', 'Q9BPZ7', 'Q6R327', 'P85299', 'O75604', 'P51959', 'P62714', 'P67775', 'P30153', 'P30154', 'Q13362', 'O15530', 'Q9UER7', 'O75626', 'Q8N0Z6', 'Q9NRG4', 'Q8N9B5', 'O14744', 'Q9NQR1', 'Q9Y468', 'Q8NE22', 'P40692', 'Q00597', 'P43246', 'P15336', 'P05412', 'P01100', 'O75771', 'Q14676', 'Q9BXW9', 'Q9NVI1', 'Q9NYV4', 'Q92466', 'Q14004', 'P54278', 'Q719H9', 'Q6VUC0', 'Q7Z6R9', 'Q92754', 'Q92481', 'P05549', 'P63165', 'P63279', 'P05549-1', 'Q9NZC7', 'Q96SI1', 'P10809', 'P10244', 'P46087', 'Q99697', 'Q99966', 'Q96RK1', 'O95619', 'P35659', 'P01135', 'P25490', 'P01215', 'P0DN86', 'P00533', 'P15692', 'Q6PL18', 'P04626', 'P10721', 'Q9UGL1', 'P02649', 'Q04721', 'Q9UM47', 'Q99466', 'Q9BY41', 'Q96DB2', 'Q9UKV0', 'Q9UQL6', 'Q8WUI4', 'Q969S8', 'O95402', 'Q96RN5', 'Q96G25', 'Q71SY5', 'P35813', 'P09874', 'Q9UPN9', 'P12755', 'P12757', 'Q6ZNA4', 'Q96PU5', 'Q15583', 'Q9GZN2', 'P51668', 'P61077', 'Q93008', 'O15105', 'Q9UN42', 'P17275', 'P05121', 'Q15329', 'O00255', 'P42772', 'Q15361', 'Q03468', 'Q9BTC8', 'Q13330', 'O75533', 'O60264', 'Q9UIG0', 'O00159-3', 'Q9BQG0', 'Q9NR30', 'O95602', 'Q9H9Y6', 'Q3B726', 'O15160', 'O15446', 'P0DPB5', 'Q9GZS1', 'Q9P1U0', 'Q15572', 'Q15573', 'Q53T94', 'Q9H5J8', 'P60709', 'Q9UJW3', 'Q9UBC3', 'P26358', 'Q96T88', 
'Q9Y6K1', 'Q13569', 'Q8NFU7', 'Q6N021', 'O43151', 'P17480', 'Q9H0E3', 'Q9H7L9', 'Q9UHR5', 'Q9HAJ7', 'O75446', 'Q4LE39', 'O00422', 'Q9UIF9', 'Q9UBB5', 'O43159', 'O43463', 'Q5T6S3', 'O43189', 'Q92833', 'Q9Y483', 'Q6ZN18', 'Q6NZI2', 'Q9NYV6', 'Q00059', 'O00411', 'Q9H5Q4', 'Q99551', 'O75569', 'Q9UPY3', 'Q15633', 'Q15631', 'Q99598', 'P62826', 'Q9HAV4', 'Q9NRR4', 'Q7Z5W3', 'O15397', 'P52948-5', 'Q96EE3-1', 'P57740', 'Q8NFH3', 'Q12769', 'Q8NFH4', 'Q9BW27', 'Q8WUM0', 'P55735', 'P12270', 'P49790', 'Q99567', 'Q5SRE5', 'P35658', 'Q9BTX1', 'Q8TEM1', 'Q96HA1', 'A8CG34', 'Q8NFH5', 'Q8N1F7', 'O75694', 'Q92621', 'Q96EE3-2', 'P78406', 'P52948-3', 'P52948-4', 'P37198', 'Q9BVL2-2', 'Q9BVL2-1', 'Q7Z3B4', 'Q9UKX7', 'Q9NRG9', 'P49792', 'O15504', 'Q5T8I9', 'Q7Z3Z4', 'Q96JY0', 'Q9Y2W6', 'Q8NDG6', 'Q8TC59', 'Q587J7', 'Q8WWH4', 'Q9NQI0', 'Q9BXT6', 'Q9BXT4', 'Q96J94', 'O60522', 'Q8N2A8', 'P07900', 'O75344', 'P10243', 'Q9BUI4', 'Q9H1D9', 'O15318', 'Q9BT43', 'Q9Y2Y1', 'P05423', 'Q9NVU0', 'O14802', 'Q9NW08', 'O75575', 'Q9Y535', 'Q14938', 'Q12857', 'P05455', 'P08651', 'O00712', 'Q12789', 'Q969F1', 'Q9UKN8', 'Q9Y5Q9', 'Q9Y5Q8', 'Q8WUA4', 'A6H8Y1', 'Q92994', 'Q92664', 'Q9HAW0']

# Check your data!
print('DNA_replication_proteins: {}'.format(len(DNA_replication_proteins)))
print('gene_expression_proteins: {}'.format(len(gene_expression_proteins)))

DNA_replication_proteins: 127
gene_expression_proteins: 1498


Possible solutions:

In [28]:
def proteins_in_common_v1(proteinsA, proteinsB):
    """Count the entries of ``proteinsA`` that also occur in ``proteinsB``.

    Every entry of ``proteinsA`` is tested for membership in ``proteinsB``;
    an entry repeated in ``proteinsA`` is counted each time it appears.
    """
    return sum(protein in proteinsB for protein in proteinsA)

In [29]:
common = proteins_in_common_v1(DNA_replication_proteins, gene_expression_proteins)

common

70

In [30]:
%timeit proteins_in_common_v1(DNA_replication_proteins, gene_expression_proteins)

914 µs ± 12.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [31]:
%timeit proteins_in_common_v1(gene_expression_proteins, DNA_replication_proteins)

1.36 ms ± 26.2 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [32]:
def proteins_in_common_v2(proteinsA, proteinsB):
    """Count the entries of the shorter list that also occur in the longer one.

    The outer iteration always runs over the shorter argument, so the
    result and the work done do not depend on the argument order. When the
    lengths are equal, ``proteinsB`` is the list iterated over.
    """
    if len(proteinsA) < len(proteinsB):
        shorter, longer = proteinsA, proteinsB
    else:
        shorter, longer = proteinsB, proteinsA

    shared = [protein for protein in shorter if protein in longer]
    return len(shared)

In [33]:
%timeit proteins_in_common_v2(DNA_replication_proteins, gene_expression_proteins)

930 µs ± 15.3 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [34]:
%timeit proteins_in_common_v2(gene_expression_proteins, DNA_replication_proteins)

922 µs ± 14.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


A <strong>set</strong> is a structure similar to a list, except that:

In [35]:
my_set = {'A', 'C', 'T'}

In [36]:
my_set

{'A', 'C', 'T'}

- every element is <strong>unique</strong>:

In [37]:
my_set.add('T')

my_set

{'A', 'C', 'T'}

- its elements <strong>do not preserve</strong> insertion order (they are displayed in some order, but that order is arbitrary and should not be relied upon):

In [38]:
my_set.add('G')

my_set

{'A', 'C', 'G', 'T'}

In [40]:
def proteins_in_common_v3(proteinsA, proteinsB):
    """Count the entries of the shorter list that also occur in the longer one.

    NOTE: ``set(...)`` sits inside the membership test, so a new set is
    built from the longer list for every entry that is examined.
    """
    if len(proteinsA) < len(proteinsB):
        shorter, longer = proteinsA, proteinsB
    else:
        shorter, longer = proteinsB, proteinsA

    return sum(1 for protein in shorter if protein in set(longer))

%timeit proteins_in_common_v3(gene_expression_proteins, DNA_replication_proteins)

2.2 ms ± 106 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [41]:
def proteins_in_common_v4(proteinsA, proteinsB):
    """Count the entries of the shorter list that also occur in the longer one.

    The longer list is converted to a set once, before the loop, so every
    membership test is a set lookup instead of a scan of a list.

    Args:
        proteinsA: First list of protein identifiers.
        proteinsB: Second list of protein identifiers.

    Returns:
        The number of entries of the shorter list found in the longer one
        (``proteinsB`` is the one iterated over when the lengths are equal).
    """
    num_proteins_in_common = 0

    if len(proteinsA) < len(proteinsB):
        list1 = proteinsA
        set2 = set(proteinsB)
    else:
        list1 = proteinsB
        set2 = set(proteinsA)

    for protein in list1:
        if protein in set2:
            num_proteins_in_common += 1

    return num_proteins_in_common


# Recorded notebook output that had been interleaved with the function source
# above (presumably produced by the %timeit call below):
# 19.3 µs ± 205 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


%timeit proteins_in_common_v4(gene_expression_proteins, DNA_replication_proteins)

A set is a structure frequently found in mathematics, therefore it supports the mathematical operations: intersections, unions, and differences.

Intersection:<br/>
<img src="http://bioinfo.mbb.yale.edu/course/classes/c4/add-1.gif"/>

In [42]:
def proteins_in_common_v5(proteinsA, proteinsB):
    """Return how many distinct entries the two protein lists share.

    The shorter list is turned into a set, which is then intersected with
    the longer list; the size of the intersection is the result.
    """
    if len(proteinsA) < len(proteinsB):
        shorter, longer = proteinsA, proteinsB
    else:
        shorter, longer = proteinsB, proteinsA

    shared = set(shorter).intersection(longer)
    return len(shared)

%timeit proteins_in_common_v5(gene_expression_proteins, DNA_replication_proteins)

15.1 µs ± 539 ns per loop (mean ± std. dev. of 7 runs, 100,000 loops each)


## File (II)

In [44]:
# Build the tab-separated record first, then write it out in one call.
record = '\t'.join(['geneA', str(1.201)])
with open('result.txt', 'w') as f:
    f.write(record)

In [45]:
'\t'.join(['geneA', str(1.201)])

'geneA\t1.201'

In [46]:
# 'r' is the default mode, so open('result.txt') would behave the same way.
with open('result.txt', 'r') as f:
    fields = f.read().split('\t')
    print(fields)

['geneA', '1.201']
