# Bioinformatics I: Finding Hidden Messages in DNA
## Week 1: Where in the Genome Does DNA Replication Begin?

### Problem 1.1: Pattern Count

In [1]:
def pattern_count(text, pattern):
	"""Count the number of times that a pattern appears in a text.

	Args:
		text (str): The whole String (DNA sequence)
		pattern (str): The pattern to be searched in the text.

	Returns:
		int: The number of times that the pattern appears in the text.
	"""
	count = 0
	for i in range(len(text) - len(pattern) + 1):
		if text[i:i + len(pattern)] == pattern:
			count += 1
	return count

In [30]:
text= 'CGCGATACGTTACATACATGATAGACCGCGCGCGATCATATCGCGATTATC'
pattern= 'CGCG'
pattern_count(text, pattern)

5

### Problem 1.2: Frequent Words

In [3]:
def frequency_table(text, k):
	"""Generate a frequency table for a given text and k-mer length.

	Args:
		text (str): The whole String (DNA sequence)
		k (int): The length of the k-mer.

	Returns:
		dict: A dictionary with the k-mers as keys and their frequency as values.
	"""
	freqMap = {}
	n = len(text)
	for i in range(n-k+1):
		pattern = text[i:i+k]
		if pattern not in freqMap:
			freqMap[pattern] = 1
		else:
			freqMap[pattern] += 1
	return freqMap

In [4]:
def best_frequent_words(text, k):
	"""Find the most frequent k-mers in a text.

	Args:
		text (str): The whole String (DNA sequence)
		k (int): The length of the k-mer.

	Returns:
		list: A list with the most frequent k-mers in the text.
	"""
	freqMap = frequency_table(text, k)
	maxValue = max(freqMap.values())
	mostFreqPatterns = []
	for key in freqMap:
		if freqMap[key] == maxValue:
			mostFreqPatterns.append(key)
	return mostFreqPatterns

In [29]:
text= 'TAAACGTGAGAGAAACGTGCTGATTACACTTGTTCGTGTGGTAT'
k= 3
best_frequent_words(text, k)

['GTG']

### Problem 1.3: Reverse Complement

In [6]:
def reverse_complement(pattern):
	complement= {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A'}
	reverse_complement= ''
	for base in pattern:
		reverse_complement= complement[base] + reverse_complement
	return reverse_complement

In [31]:
pattern= 'GCTAGCT'
reverse_complement(pattern)

'AGCTAGC'

### Problem 1.4: Pattern Matching

In [8]:
def pattern_matching(pattern, genome):
	"""Find all occurrences of a pattern in a genome.

	Args:
		pattern (str): The pattern to be searched in the genome.
		genome (str): The whole String (DNA sequence).

	Returns:
		list: A list with the positions of the pattern in the genome.
	"""
	positions = []
	for i in range(len(genome) - len(pattern) + 1):
		if genome[i:i + len(pattern)] == pattern:
			positions.append(i)
	return positions

In [32]:
pattern= 'AA'
genome= 'AAACATAGGATCAAC'
results= pattern_matching(pattern, genome)
for result in results:
	print(result, end=' ')

0 1 12 

In [10]:
pattern= 'ATGATCAAG'
with open('Vibrio_cholerae.txt', 'r') as file:
    vibro_cholerae_genome = file.read()
results= pattern_matching(pattern, vibro_cholerae_genome)
for result in results:
	print(result, end=' ')

116556 149355 151913 152013 152394 186189 194276 200076 224527 307692 479770 610980 653338 679985 768828 878903 985368 

### Problem 1.5: Clump Finding

In [21]:
from tqdm import tqdm

In [22]:
def clump_finding(genome, k, L, t):
	"""Find patterns forming clumps in a genome.

	Args:
		genome (str): The whole String (DNA sequence).
		k (int): The length of the pattern.
		L (int): The length of the clump.
		t (int): The minimum number of times that the pattern must appear in the clump.

	Returns:
		list: A list with the patterns forming clumps in the genome.
	"""
	
	patterns= []
	n= len(genome)
	for i in tqdm(range(n - L + 1)):
		window= genome[i:i + L]
		freq_map= frequency_table(window, k)
		for key in freq_map:
			if freq_map[key] >= t:
				patterns.append(key)
	return set(patterns)

In [23]:
genome= 'CGACTATATGTAACGCCGTAACCGCCAGGGGCGAAAACTTGCTTAGCATTACTTTGCCCGGTATGGTATCTGGCATGTGCTAGTGTAGGCAGGGAGAACCTGGAAAGCGACCCGGAAAGGCGCCTGGATGAACTCGCCTATTGATGAGTAACGAGTAAGGATGGATTAACCAGAGTAAGGATGTAAGGATACCACCTCGAGTAAGGAGTAGAGTAAGGATAGGATTCGAAACGAATTAAGAAAAAATCATTTCCTCACAATGATGTTTGACTTCTATAGAACGCTCTTTCGGCGTGGACGGAGAAAGACCGGAGAAAGAAACGGAGAAAGAACGGAGAAAGAGCGGAGAAAGAATCAACTTACGACGGCGGAGAAAGATCAACCGGAGAAAGACGATCAACTTACGACCAGGGACTCAACTTACGGCGACGTCAACTTACGACGTGGCTAAATAGAAGCTTTGGGGTCGAGACCCTCTTGGGGTAGAACAGATGGCAACATTGCTCTGATGACTATCCTATCCCTTGCTATATGCGTTAGGCTGGCGACAAAGTTGTATTGTAGTTTGTAGTCCAATTGTAGTCCATGTAGTCCATCGCCGGACTAACTTGTAGTCCAGTCTTTTGTAGTCCATAGCCTGCGGTCCATGGTCTGACTCAGGTCAGCCCCGTTATGACCCTCCCGCAAACCGATCCAGCAGTGAACCACTGCCAGCGCCACAATCAATACGGTTCGCCTTTGGCTGAGTTGCTAAGATCGCCAAACCATCCCATCTATTTCTAGGAACCAGCTCCTATCGTGCGAGGAGCCTCACGCCACTCAGCTTTAACAGTGCAGCGGCTTAGGCCGGGCGCAGCGGCTTACAGCGGCTTACAGCAGCGGCTTAGCGGCTTACAGCGGCTTAAGCGGGCAGCGGCTTAGGAATTCCAGTTTCTGCAGACGTGTTTGGATGTTCCTAGCCGTTGACCTATGCGAGTGGTCACGTCGATTCGGGAATCTATGGGATCGATCGAGCTTTACACGAAGAACCTTATGTATTATTCACGAACCTTATGCATTATTCTGGCAGGGATACTGGCAGGGATGATCGGAACCTTATGTATGGCTAGCTATTATTCTTCTGAGCCCGTGCACTGCTGGCATGGCAGGGATGCAGGGATGTCCACGGCTCCAAGCGCACGGCTCCAAGCTCCAACTTTAGAGACAACGGCTCCAAATTAGCAGAGTACCTATTAACGGCTACGGCTCCAAGACTAGATTATTAGCAGAGAGTATTAGCAGAGTGTCGTCTCCTAAGGCCCTGGTCTTTCACGTGGCCCTGGTCCTGGTCGTCGATCAGGCGGCCCTGGCCCTGGTCCTGGTCACTGTCACATTGCGCTGATTGGTTCTGTTCTGGATCTTTCAATAGCAGAAGATAGCAGAAGAGCTTCGATACAAACAAGCTTCCATAGCAGAAGAGCAGATAGCAGAAGAAATAATAGCAGAAGACCATCCGACAGACTTCTCACCCTCCGAGTCTTACTAATACGCGTCACTTCCATCGCGCATCGACAATCAGGCATTCGTTTGGACCCGCCTTGCTCAAGCACCCCATCGAGACATAAAAGAGAAGACGCCGCCCTCCACAGTTTCGCTCGTGTAGCTCAGGGCGCGGGTCGCTGACTAGTTCTCCGAAGACTGTGACGGTACGCCTGACACTGCGACCAAAGACGGGGGACTGGCGTACGGCCGATCTGGAACCCATTCGAATGTTCGAATTCGAATGATCCTGAAAGCATTACTTCGAATGATATGCTTAGACTTCGAATGATTCTTCGAATGATATTGACTTCAGCTAATGCTAGGGCTTGAATGCAGCAGATACATGATGTGAATGTACAAAGAACAGATACATGGATCAGATACATGAATCCAGATACATGATGCAGATACATGGTGAATCGCCAGGTTAACATCACGGATGGGAATGTAGCGAGGGATACGGATGCGCGCACTGAACG'
clumps= clump_finding(genome, 10, 100, 4)
for clump in clumps:
	print(clump, end=' ')

100%|██████████| 1901/1901 [00:00<00:00, 34371.69it/s]

TAGCAGAAGA TTGTAGTCCA GCAGCGGCTT CAGATACATG TTCGAATGAT TCAACTTACG CGGAGAAAGA ATAGCAGAAG ACGGCTCCAA CAGCGGCTTA 




In [33]:
with open('E_coli.txt', 'r') as file:
	e_coli_genome = file.read()
clumps= clump_finding(e_coli_genome, 9, 500, 3)

100%|██████████| 4639176/4639176 [14:14<00:00, 5429.80it/s]


In [34]:
for clump in clumps:
	print(clump, end=' ')

GGAACAGCA AACTGTAGG AGAGCAGTT CGCCTGATG GGTTGCCTG ATCCTGCAC CGAGCTCTT ACGCGGGGT CAAAGCGCG CCAATTCCG GCGCTGGCG GGTGATGTT AATTGGTCG CCGCAACAA GCCAGACAG ATATTGGTG CCGTTGCCG CGCATCCGA ATAACCCGA TGGTCGCAG GCATCGGGA TATGTTCCG CCGAACCGT CCTACGGAG CGAGTCATC GGGGCTTCT GCGCTGCTC GAATCTGTA CTCACTGCG GCCGCCAGT GCCAGCAGG GACAGCGCA GATGCATCG AATCAATTG CGGTTGCGC GCGTTAGCG CAGAATATT TCTTATCAG GCGGATGCG CGACCTACG ATCAGCAGC TTATCCCCG GCTCGTCGG GCGGTTCGA TGATCCAGT GTCGGCGGT GAATGCCGG TTTCTTTTG TCAACGCCT CGACATTAT CGTCTTATC CTGTCGCCA CAAGCGTCG GGAGGATTC AAGCGTCGC CGGATAAGG TTTGGCAAC TTAAATAAT GGCCTACAA CGGGTCGTT GACGCACAG TGCCTGCGG ACGCTGTCG CCTGGTAGC ATGCCTGAT GGGGGGACT TGCACGACC TGCCGAACC GATGAAATG ATCGGGAAA CGGCTGCCG AACGCGTCT CACCTGAAG CAGTTGACT TTATGTTCC GTCCAACTG TAGCTCGTC AGGCACTTG AGTCGGCAC TGAAGCTAG ACGACGTTC TCTGTAGGC CTCATCCTT CCTACGGAT AAAAAGCCG TGCGGCGCG TCGTCGGTT GAAATGATG ATGGCGGTG AGGCGGTTA AAAGCAGAA CACTTGTGC TGATTTAGC CAGGCTAAT AGGCATCTG AAGTTGAAG TGTTCATAT GACTTGAAC TATGGATTA ATAAGACGC 

In [35]:
len(clumps)

1904