# Skew problem

In [3]:
def parse_file(file_name):
    """Read a text file and return its contents as a list of lines.

    Parameters
    ----------
    file_name : str
        Path of the file to read.

    Returns
    -------
    list of str
        The lines of the file with line terminators removed.
    """
    # The context manager closes the handle even if read() raises.
    with open(file_name, 'r') as f:
        return f.read().splitlines()

In [25]:
# Find every position at which the running G-minus-C skew is minimal.
Text = parse_file('dataset_7_10.txt')[0]
n = len(Text)
# skews[i] is the skew of the prefix Text[:i]; positions run from 0 to n
# inclusive, so the list needs n + 1 entries (skews[0] stays 0).
skews = [0] * (n + 1)
for i in range(1, n + 1):
    if Text[i-1] == 'G':
        skews[i] = skews[i-1] + 1
    elif Text[i-1] == 'C':
        skews[i] = skews[i-1] - 1
    else:
        skews[i] = skews[i-1]

min_skew = min(skews)
indexes = [i for i in range(n + 1) if skews[i] == min_skew]
print(*indexes, sep = ' ')

5072 5077


# Hamming distance
### The number of mismatches between 2 genome strings

In [26]:
# The first two lines of the dataset are the two strings to compare.
Text = parse_file('dataset_9_3.txt')
Text1, Text2 = Text[0], Text[1]

def hamming_distance(text1, text2):
    """Count the positions at which text1 and text2 hold different symbols.

    Every index of text1 is compared against the same index of text2, so
    text2 must be at least as long as text1.
    """
    return sum(1 for pos in range(len(text1)) if text1[pos] != text2[pos])

# Last expression of the cell, so its value is shown as the cell output.
hamming_distance(text1=Text1, text2=Text2)

756

## Approximate Pattern Matching and Pattern Count Problems

In [28]:
# Count the windows of `text` that are within Hamming distance d of `pattern`.
data = parse_file('dataset_9_6.txt')
pattern, text, d = data[0], data[1], int(data[2])

indexes = []
pattern_count = 0
for i in range(len(text) - len(pattern) + 1):
    hamm_dist = hamming_distance(text[i:i+len(pattern)], pattern)
    if hamm_dist > d:
        continue
    # Window starting at i is an approximate occurrence of the pattern.
    indexes.append(i)
    pattern_count += 1
print(pattern_count)

10


# d-Neighborhood

In [7]:
def first_symbol(pattern):
    """Return the leading symbol of pattern (empty if pattern is empty)."""
    head = pattern[:1]
    return head
# Quick manual check of first_symbol; the result is not stored, and this
# rebinds the module-level name `pattern`.
pattern = 'AAGG'
first_symbol(pattern=pattern)

def suffix(pattern):
    """Return pattern with its first symbol removed."""
    tail = pattern[1:]
    return tail

In [8]:
def find_neighbors(pattern, d):
    """Return the d-neighborhood of pattern over the alphabet A, C, G, T.

    Parameters
    ----------
    pattern : str
        The string whose neighbors are generated.
    d : int
        Maximum number of mismatching positions allowed.

    Returns
    -------
    set of str
        Every string of the same length as pattern that differs from it in
        at most d positions. The result is always a set, including for
        d == 0 (a one-element set) and for an empty pattern.
    """
    nucleotides = ('A', 'C', 'G', 'T')
    # With no mismatches allowed (or nothing to mutate) the only neighbor is
    # the pattern itself. Returning a set keeps the type consistent for
    # callers that iterate over the result, and the empty-pattern guard
    # prevents unbounded recursion.
    if d == 0 or not pattern:
        return {pattern}
    if len(pattern) == 1:
        return set(nucleotides)

    head, tail = pattern[0], pattern[1:]
    neighborhood = set()
    for candidate in find_neighbors(tail, d):
        # candidate has the same length as tail, so zip covers every position.
        mismatches = sum(a != b for a, b in zip(tail, candidate))
        if mismatches < d:
            # Budget left over: the first symbol may be anything.
            neighborhood.update(nt + candidate for nt in nucleotides)
        else:
            # Budget exhausted: the first symbol must match the pattern.
            neighborhood.add(head + candidate)
    return neighborhood


In [34]:
# Print every neighbor of the pattern, each followed by a single space.
data = parse_file('dataset_3014_4.txt')
neighbors = find_neighbors(data[0], int(data[1]))
nei_str = [f'{neighbor} ' for neighbor in neighbors]
print(*nei_str, sep='')

GCCAGCGCAA GCGTCTGCTA GTCACCGTAA GCCGCCGAAA ACCACGGCAC TCCACTGCTG GCCAGCGGAA GACACTGCCC GTCACTAGAA AACACTGGAA GCCAGTGCCA TCCTTTGCAA GCGAATGCGA GCCACGGTAC GCGACTAGAA CGCGCTGCAA GTTACTCCAA ACCCGTGCAA GGCAAAGCAA TTCTCTGCAA CCCACTATAA GCAACAACAA GCAAGGGCAA GTAACAGCAA GCCACAGGGA GCGACTGTTA GCCATCTCAA CCCAACGCAA CCCATTGAAA GCCCCTGATA ACCACTATAA TCCATGGCAA TCTAGTGCAA GACGATGCAA GCCACCGCCA GCCTCTGGAA TCCACTTAAA GCCAGTCGAA GCCACGTCTA GACCCCGCAA GCGACTGGAA GCGACTACCA GCGACTGTCA GACAATGCTA GGCCCTGCAG CCCATTCCAA TCTACCGCAA TGCACTGCTA GCAAGTTCAA GGCACGTCAA GCCAAAGCAC ACCTCTGCAG GATCCTGCAA GCCACGGAAC TCCTCTGTAA TGCACCGCAA GCCGCTTCTA GCGACTACAC ACAACCGCAA GGAACTGCTA GCTGCTGCAG GCCACGCCAC GCTGCAGCAA GTCCATGCAA GCGAACGCAA GCCCCTACGA GCTCCTGAAA GGCATTGCAA GCCGCTACCA GCACCTGCAC CCAACTGCGA GCGAATGGAA ACCACTTCAC GCCAATGAAT TCCACTGTTA CCCACTGACA GCCACTGTAC TACACTGCGA GGCACGGCAG GTCACCGCCA ACCACTGGAG GCCACGGGTA GCCATTGTAG TCCCCTCCAA GACGCTGCGA GCTACTGTAG TCCGCTGCAA GCCCCGGCCA GACACCGAAA CTCACTGCAG TGCACTCCAA

In [10]:
def max_value_dict(dictionary):
    """Return the largest absolute value among the values of dictionary."""
    return max(map(abs, dictionary.values()))

# Frequent Words with Mismatches Problem

In [17]:
def frequent_words_with_mismatches(text, k, d):
    """Find the most frequent k-mers in text, allowing up to d mismatches.

    Each length-k window of text adds one to the count of every string in
    its d-neighborhood (as produced by find_neighbors).

    Parameters
    ----------
    text : str
        The string to scan.
    k : int
        Length of the k-mers.
    d : int
        Maximum number of mismatches passed to find_neighbors.

    Returns
    -------
    list of str
        All k-mers that reach the maximum count, in the order they were
        first seen. Empty when text holds no window of length k.
    """
    freq_dict = {}
    # There are len(text) - k + 1 windows; the "+ 1" includes the final k-mer.
    for i in range(len(text) - k + 1):
        for neighbor in find_neighbors(text[i:i+k], d):
            freq_dict[neighbor] = freq_dict.get(neighbor, 0) + 1
    if not freq_dict:
        # No window fits (e.g. k > len(text)): nothing to report.
        return []
    max_freq = max_value_dict(freq_dict)
    return [pattern for pattern, count in freq_dict.items() if count == max_freq]

In [18]:
# Most frequent 6-mers with up to 3 mismatches in the first dataset line.
data = parse_file('dataset_9_9.txt')
frequent_words_with_mismatches(data[0], k=6, d=3)

['ATTATT']

# Frequent Words with Mismatches and Reverse Complement Problem 

In [19]:
def find_reverse_complement(Text):
    """Return the reverse complement of Text.

    A pairs with T and C pairs with G; any other symbol is left out of the
    result.
    """
    pairing = {'A': 'T', 'T': 'A', 'C': 'G', 'G': 'C'}
    # Walking the input backwards yields the complement already reversed.
    return ''.join(pairing[base] for base in reversed(Text) if base in pairing)

In [20]:
def frequent_words_with_mismatches_and_reverse_comp(text, k, d):
    """Find the most frequent k-mers counting mismatches and reverse complements.

    Each length-k window of text adds one to the count of every string in
    its d-neighborhood and one to every string in the d-neighborhood of its
    reverse complement.

    Parameters
    ----------
    text : str
        The string to scan.
    k : int
        Length of the k-mers.
    d : int
        Maximum number of mismatches passed to find_neighbors.

    Returns
    -------
    list of str
        All k-mers that reach the maximum combined count, in the order they
        were first seen. Empty when text holds no window of length k.
    """
    freq_dict = {}
    # There are len(text) - k + 1 windows; the "+ 1" includes the final k-mer.
    for i in range(len(text) - k + 1):
        pattern = text[i:i+k]
        # Count the window's own neighborhood first, then that of its
        # reverse complement.
        for source in (pattern, find_reverse_complement(pattern)):
            for neighbor in find_neighbors(source, d):
                freq_dict[neighbor] = freq_dict.get(neighbor, 0) + 1
    if not freq_dict:
        # No window fits (e.g. k > len(text)): nothing to report.
        return []
    max_freq = max_value_dict(freq_dict)
    return [pattern for pattern, count in freq_dict.items() if count == max_freq]

In [21]:
# Most frequent 7-mers with up to 3 mismatches, counting reverse complements.
text = parse_file('dataset_9_10.txt')[0]
set1 = frequent_words_with_mismatches_and_reverse_comp(text, k=7, d=3)
print(set1)

['GAGAGAG', 'CTCTCTC']
