In [1]:
	import random

# Review of previous exercises
#### What's indexing

In [2]:
# Consider 100 randomly mutated sequences. Index them from 0 to 99. Select only those with at least 2 deletions and GC% > 0.45 using higher order functions. Return their index and GC% value.
def generate_dna():
    random_dna_string = "".join(random.choices(["A", "C", "G", "T"], weights=[29, 21, 21, 29], k=random.randint(5, 50)))
    return random_dna_string

def mutate(n_times, input_dna):
    for i in range(n_times):
        mut_pos = random.randint(0, len(input_dna)-1)     # mut_pos is [0,len(input_dna)]
        mut_type = random.randint(0, 2)
        if mut_type == 0:
            # substitution
            replacement = random.choice(["A", "C", "G", "T"])
            input_dna = input_dna[:mut_pos] + replacement+ input_dna[mut_pos+1:]
        elif mut_type == 1:
            # insertion
            inserted_elem = random.choice(["A", "C", "G", "T"])
            input_dna = input_dna[:mut_pos] + inserted_elem + input_dna[mut_pos+1:]
        else:
            # deletion
            input_dna = input_dna[:mut_pos] + '-' + input_dna[mut_pos+1:]
    return input_dna

random.seed(16)
input_dna = [mutate(10, generate_dna()) for i in range(100)]
index = zip(list(range(len(input_dna))), input_dna)

del_filter = filter(lambda i: i[1].count('-') >= 2, index)
gc_count = map(lambda i: (i[0],(i[1].count('G') + i[1].count('C'))/len(i[1])), del_filter)
gc_filter = filter(lambda i: i[1]>0.45, gc_count)

result = list(gc_filter)
print(result)

[(2, 0.4666666666666667), (9, 0.46808510638297873), (13, 0.47619047619047616), (19, 0.6296296296296297), (24, 0.47368421052631576), (42, 0.4523809523809524), (49, 0.5), (53, 0.4772727272727273), (62, 0.5), (66, 0.5454545454545454), (71, 0.4857142857142857), (73, 0.4583333333333333), (75, 0.48), (82, 0.48), (83, 0.47058823529411764), (94, 0.45652173913043476)]


#### Alternative "pipelined" style

In [3]:
random.seed(16)
data = [mutate(10, generate_dna()) for i in range(100)]

def index(data):
	return zip(list(range(len(data))), data)

pipeline = [
	(index,),
	(filter, lambda i: i[1].count('-') >= 2),
	(map, lambda i: (i[0],(i[1].count('G') + i[1].count('C'))/len(i[1])) ),
	(filter, lambda i: i[1]>0.45)
]

for stage in pipeline:
	if len(stage) == 1:
		whole_dataset_f = stage[0]
		data = whole_dataset_f(data)
	elif len(stage) == 2:
		apply, single_point_f = stage
		data = apply(single_point_f, data)
print(list(data))

[(2, 0.4666666666666667), (9, 0.46808510638297873), (13, 0.47619047619047616), (19, 0.6296296296296297), (24, 0.47368421052631576), (42, 0.4523809523809524), (49, 0.5), (53, 0.4772727272727273), (62, 0.5), (66, 0.5454545454545454), (71, 0.4857142857142857), (73, 0.4583333333333333), (75, 0.48), (82, 0.48), (83, 0.47058823529411764), (94, 0.45652173913043476)]


# Correction of homework

In [4]:
# ex.21 on 26.10.22
annotations  =  [('chr1',  10,  20,  'GeneA.ex1'),  ('chr1',  25,  40,  'GeneA.ex2'),
	('chr1',  50,  90,  'GeneA.ex3'),  ('chr1',  10,  90,  'GeneA'),
	('chr1',  100,  150,  'GeneB.ex1'),  ('chr1',  180,  190,  'GeneB.ex2'),
	('chr1',  100,  210,  'GeneB'),  ('chr3',  90,  135,  'GeneC.ex1'),
	('chr3',  180,  190,  'GeneC.ex2'),  ('chr3',  90,  190,  'GeneC')]
mutations  =  [('chr1',  81),  ('chr1',  239),  ('chr1',  165),  ('chr2',  123),  ('chr3',  50),
	('chr1',  162),  ('chr3',  101),  ('chr2',  185),  ('chr2',  149),  ('chr3',  183),
	('chr2',  158),  ('chr3',  25),  ('chr2',  250),  ('chr1',  299),  ('chr2',  114),
	('chr2',  184),  ('chr2',  33),  ('chr3',  290),  ('chr1',  138),  ('chr2',  147)]

mutated_regions_with_rep = [ann[3] for ann in annotations for mut in mutations if ann[0] == mut[0] and ann[1] <= mut[1] <= ann[2] ]
mutated_regions = set(mutated_regions_with_rep)
print(mutated_regions)
mutation_count = {reg: mutated_regions_with_rep.count(reg) for reg in mutated_regions}
print(mutation_count)

{'GeneC.ex1', 'GeneC', 'GeneA', 'GeneC.ex2', 'GeneB.ex1', 'GeneA.ex3', 'GeneB'}
{'GeneC.ex1': 1, 'GeneC': 2, 'GeneA': 1, 'GeneC.ex2': 1, 'GeneB.ex1': 1, 'GeneA.ex3': 1, 'GeneB': 3}


# 1-on-1 tutoring exercises

In [5]:
# ex. 30 # A list of number is "lucky" if it contains at least one multiple of 7.
# Randomly generate a list of 100 lists, where each inner list is made of 5 random whole numbers between 1 and 70 (inclusive)
# Then, select only the lucky ones.
import random
numbers = [[random.randint(1,70) for i in range(5)] for i in range(100)]

def lucky_list(l):
	return len([x for x in l if x%7==0])>0

list(filter(lucky_list, numbers))

[[61, 10, 22, 8, 14],
 [56, 39, 23, 56, 46],
 [18, 26, 5, 56, 24],
 [61, 66, 29, 66, 14],
 [53, 63, 2, 52, 17],
 [40, 14, 33, 42, 65],
 [19, 67, 7, 33, 30],
 [60, 14, 23, 59, 57],
 [59, 70, 31, 56, 8],
 [21, 28, 19, 68, 8],
 [20, 32, 45, 49, 55],
 [56, 47, 48, 4, 18],
 [56, 68, 30, 51, 11],
 [35, 45, 11, 58, 11],
 [8, 25, 49, 18, 24],
 [44, 29, 14, 64, 43],
 [16, 58, 64, 38, 70],
 [6, 63, 66, 39, 32],
 [36, 35, 9, 19, 63],
 [40, 58, 9, 1, 14],
 [61, 47, 37, 37, 63],
 [7, 59, 58, 10, 20],
 [13, 41, 55, 34, 7],
 [17, 30, 27, 21, 65],
 [14, 22, 9, 59, 21],
 [39, 45, 12, 21, 28],
 [63, 7, 44, 2, 27],
 [38, 49, 19, 24, 11],
 [21, 1, 66, 1, 40],
 [42, 22, 62, 50, 54],
 [10, 44, 59, 14, 57],
 [67, 41, 52, 20, 21],
 [20, 68, 5, 49, 40],
 [6, 27, 68, 21, 7],
 [62, 6, 58, 49, 10],
 [6, 10, 11, 70, 8],
 [37, 3, 42, 21, 15],
 [23, 27, 23, 24, 42],
 [28, 64, 11, 46, 60],
 [12, 4, 22, 1, 14],
 [42, 30, 35, 67, 50],
 [44, 56, 22, 54, 68],
 [39, 20, 35, 66, 32],
 [28, 13, 12, 6, 2],
 [25, 49, 62, 46, 

use of any

In [8]:
any([1,2])

True

In [3]:
any([])

False

In [5]:
any([False, False])

False

In [4]:
def lucky_list(l):
	return any([x for x in l if x%7==0])

list(filter(lucky_list, numbers))

[[60, 2, 42, 38, 3],
 [17, 68, 35, 14, 57],
 [1, 5, 37, 61, 42],
 [32, 50, 49, 5, 39],
 [36, 37, 46, 42, 14],
 [6, 32, 35, 14, 31],
 [32, 35, 7, 11, 51],
 [56, 19, 54, 44, 59],
 [55, 18, 28, 54, 19],
 [47, 9, 56, 46, 60],
 [68, 51, 5, 38, 49],
 [49, 62, 67, 31, 65],
 [34, 54, 4, 3, 56],
 [63, 5, 48, 4, 67],
 [10, 61, 7, 44, 35],
 [61, 42, 2, 31, 51],
 [40, 56, 61, 56, 36],
 [19, 20, 6, 21, 20],
 [17, 35, 44, 26, 68],
 [25, 35, 46, 8, 54],
 [49, 31, 33, 26, 55],
 [53, 11, 14, 2, 11],
 [32, 49, 45, 26, 16],
 [18, 64, 42, 36, 66],
 [28, 66, 13, 30, 19],
 [49, 54, 29, 36, 30],
 [3, 70, 30, 67, 37],
 [50, 51, 28, 9, 5],
 [27, 70, 5, 37, 31],
 [4, 63, 40, 57, 12],
 [4, 56, 7, 3, 44],
 [15, 42, 65, 59, 29],
 [5, 7, 43, 18, 53],
 [50, 2, 28, 22, 31],
 [54, 40, 70, 46, 14],
 [22, 3, 9, 7, 35],
 [55, 10, 34, 40, 14],
 [67, 63, 61, 6, 14],
 [50, 35, 20, 30, 63],
 [49, 51, 70, 56, 11],
 [35, 55, 6, 48, 66],
 [35, 61, 23, 51, 52],
 [60, 28, 28, 53, 29],
 [65, 44, 6, 70, 46],
 [5, 70, 38, 51, 52],
 

In [30]:
# ex.28 # Let's consider a collection of mutations the in the format of a list of tuples: [(seq_id,protein,pos,ref,alt)]
# 1. count how many distinct seq_ids are present in the database
# 2. select only mutations on the S protein and sort them by position
# 3. find the most common mutation
# 4. transform the mutations in the format <Ref><Position><Alt> (eg.'614', 'D', 'G' => 'D614G')
# copy from  shorturl.at/apVYZ
muts = [
	("MW173245", "S", 614, "D", "G"),
	("MW173245", "N", 12, "P", "M"),
	("MW166097", "S", 54, "D", "L"),
	("MW166097", "S", 614, "D", "G"),
	("MW166097", "E", 33, "L", "K"),
	("MW166097", "Orf7", 251, "K", "G")]
seq_ids = [t[0] for t in muts]
count_distinct = len({s for s in seq_ids})
req2 = sorted([t for t in muts if t[1] == "S"], key=lambda x: x[2])

unique_muts = [t[1:] for t in muts]
unique_muts = {t: unique_muts.count(t) for t in unique_muts}
maximum_count = {k:v for k,v in unique_muts.items() if v == max(unique_muts.values())}
#transformed_list = [mut[3]+str(mut[2])+mut[4] for mut in muts]
transformed_list = list(map(lambda mut: mut[3]+str(mut[2])+mut[4], muts))
print(transformed_list)

['D614G', 'P12M', 'D54L', 'D614G', 'L33K', 'K251G']


# Homework

In [7]:
# ex.35 # Consider a matrix represented as a list of lists
M = [[1,4,5,6,1],
	 [4,6,2,3,7],
	 [1,0,-1,0,1],
	 [3,4,1,5,6],
	 [1,3,4,5,3],
	 [0,0,0,0,0]
]
# perform using list/set/dictionary comprehension or higher order functions :
# (1) sanity check: ensures that all the rows have the same length
# (2) data cleaning: removes from the matrix any row such that
#	it contains only zeros
#	it contains at least one negative value
# (3) normalization: divide each element by the maximum of the row
# (4) analysis:
#	computes the list sum_row with contains the sum of th elements of each row
#	computes the list sum_col which contains the sum of the elements of each column

# suggestion: you may consider using the functions any() and all() to solve some requests

### ex 40.
**Identify all the 4-mers inside the following sequences and print them in a list sorted by count across all the sequences**<br>
`seqs = ["acctctgtcgatg", "acctgtgcgatg", "accttggtttc", "tttctttccgccaa"]`<br>
*For example, in the first sequence we see the 4-mer "acct" and it appears 3 times (in the 1st, 2nd and 3rd sequence), so its count is 3.<br> the second 4-mer is "cctc" and its count is 1.*

In [6]:
seqs = ["acctctgtcgatg", "acctgtgcgatg", "accttggtttc", "tttctttccgccaa"]

kmer_counts = {}
for seq in seqs:
	for i in range(len(seq)-3):
		kmer = seq[i : i + 4]
		if kmer in kmer_counts:
			kmer_counts[kmer] += 1
		else:
			kmer_counts[kmer] = 1

for kmer in sorted(kmer_counts, key = kmer_counts.get, reverse = True):
	print(kmer, kmer_counts[kmer])

acct 3
tttc 3
ctgt 2
cgat 2
gatg 2
cctc 1
ctct 1
tctg 1
tgtc 1
gtcg 1
tcga 1
cctg 1
tgtg 1
gtgc 1
tgcg 1
gcga 1
cctt 1
cttg 1
ttgg 1
tggt 1
ggtt 1
gttt 1
ttct 1
tctt 1
cttt 1
ttcc 1
tccg 1
ccgc 1
cgcc 1
gcca 1
ccaa 1


In [None]:
# ex. 42 # consider a bank log as a list of tuples: (account_id, withdrawal / deposit, amount)
# Create a function that returns the net amount for each account.
# E.g. input:
# (111, “D”, 200)
# (222, “W”, 100)
# (111, “D”, 200)
# (111, “W”, 100)
# (222, “D”, 100)
# (333, “D”, 500)
# Output:
# (111, 300)
# (222, 0)
# (333, 500)