In [5]:
from ipynb.fs.full.replication_ori_functions import HammingDistance, Neighbors, NumberToPattern;

**Brute-force algorithm for motif finding**

In [4]:
def MotifEnumeration(Dna, k, d):
    """Find every k-mer that appears in all DNA strings with at most d mismatches.

    Brute-force search: for each k-mer window of each string, the candidates
    returned by ``Neighbors(window, d)`` are tested, and a candidate is kept
    when every string has at least one k-length window whose
    ``HammingDistance`` to the candidate is ``<= d``.

    Args:
        Dna: Either a single whitespace-separated string of sequences
            (the original calling convention) or an iterable of sequence
            strings (the convention used by the other helpers in this notebook).
        k: Length of the motifs to look for.
        d: Maximum number of mismatches allowed per string.

    Returns:
        A set containing every candidate that passed the check; empty when
        no candidate qualifies or when no sequences were given.
    """
    # Accept both "s1 s2 s3" and ["s1", "s2", "s3"].
    strings = Dna.split() if isinstance(Dna, str) else list(Dna)

    def occurs_with_mismatches(candidate, text):
        # True when some k-length window of `text` is within d of `candidate`.
        return any(
            HammingDistance(candidate, text[start:start + k]) <= d
            for start in range(len(text) - k + 1)
        )

    Patterns = set()
    # The same candidate shows up in many neighbourhoods; verify each one
    # against all strings only once.
    checked = set()
    for text in strings:
        for start in range(len(text) - k + 1):
            # NOTE(review): Neighbors is defined in another notebook; it is
            # presumably the set of k-mers within Hamming distance d -- confirm.
            for candidate in Neighbors(text[start:start + k], d):
                if candidate in checked:
                    continue
                checked.add(candidate)
                if all(occurs_with_mismatches(candidate, other) for other in strings):
                    Patterns.add(candidate)
    return Patterns

**Entropy calculation**

In [15]:
import math

def entropy(strings):
    """Return the Shannon entropy (in bits) of a motif matrix, summed over columns.

    For every column position of the first row, the relative frequency ``p``
    of each symbol in that column is computed and ``-p * log2(p)`` is added
    to the running total.

    Symbols are compared case-insensitively, and every distinct symbol gets
    its own frequency (previously anything other than ``A``/``T``/``G`` --
    including lowercase letters -- was counted as ``C``).

    Args:
        strings: Sequence of equally long motif strings (the rows).

    Returns:
        The summed column entropy as a float; ``0.0`` when ``strings`` is empty.

    Raises:
        IndexError: If a row is shorter than the first row. Characters beyond
            the first row's length in longer rows are ignored.
    """
    dna = list(strings)
    if not dna:
        # No rows means no columns to measure.
        return 0.0

    num_dna = len(dna)
    canonical = ('A', 'T', 'G', 'C')
    entropy_cal = 0.0

    for i in range(len(dna[0])):
        counts = {}
        for row in dna:
            symbol = row[i].upper()
            counts[symbol] = counts.get(symbol, 0) + 1

        # Keep the historical A, T, G, C summation order so results for plain
        # uppercase input stay bit-for-bit the same; extra symbols follow in
        # sorted order to keep the sum deterministic.
        ordered = [symbol for symbol in canonical if symbol in counts]
        ordered += sorted(symbol for symbol in counts if symbol not in canonical)

        for symbol in ordered:
            p = counts[symbol] / num_dna  # always > 0, so log2 is defined
            entropy_cal -= p * math.log2(p)

    return entropy_cal

**Solving the Median String Problem**

In [18]:
def DistanceBetweenPatternAndStrings(Pattern, Dna):
    """Sum, over all strings in Dna, the best-window distance to Pattern.

    For each string, every window of ``len(Pattern)`` characters is compared
    with ``Pattern`` via ``HammingDistance`` and the smallest value is taken;
    the per-string minima are added together.

    Args:
        Pattern: The k-mer to score.
        Dna: Iterable of sequence strings.

    Returns:
        The accumulated distance (``0`` when ``Dna`` is empty).

    Raises:
        ValueError: From ``min`` when a string is shorter than ``Pattern``
            and therefore has no window to compare.
    """
    width = len(Pattern)
    total = 0
    for sequence in Dna:
        window_count = len(sequence) - width + 1
        total += min(
            HammingDistance(Pattern, sequence[offset:offset + width])
            for offset in range(window_count)
        )
    return total

In [19]:
def median_string_all_kmers(dna, k):
    """Exhaustively search all 4**k patterns for the one closest to dna.

    Patterns are produced as ``NumberToPattern(i, k)`` for ``i`` in
    ``range(4**k)`` and scored with ``DistanceBetweenPatternAndStrings``.

    Args:
        dna: Collection of sequence strings to score against.
        k: Pattern length.

    Returns:
        The pattern with the smallest distance; on ties, the one produced
        first in enumeration order.
    """
    def score(candidate):
        return DistanceBetweenPatternAndStrings(candidate, dna)

    # min() returns the first minimal item, matching a strict "<" update rule.
    candidates = (NumberToPattern(index, k) for index in range(4 ** k))
    return min(candidates, key=score)

In [20]:
def all_kmers_from_dna(dna, k):
    """Collect the distinct length-k substrings found in any of the strings.

    Args:
        dna: Iterable of sequence strings.
        k: Substring length.

    Returns:
        A set of every k-length window; strings shorter than ``k``
        contribute nothing.
    """
    return {
        sequence[start:start + k]
        for sequence in dna
        for start in range(len(sequence) - k + 1)
    }

In [21]:
def median_string_kmers_from_dna(dna, k):
    """Pick the k-mer occurring in dna that minimises the distance to dna.

    Only k-mers returned by ``all_kmers_from_dna(dna, k)`` are considered,
    so this is a restricted search: a pattern that never occurs verbatim in
    ``dna`` cannot be returned even if it would score lower.

    Args:
        dna: Collection of sequence strings.
        k: Pattern length.

    Returns:
        The candidate with the smallest ``DistanceBetweenPatternAndStrings``
        value; on ties the lexicographically smallest candidate. ``None``
        when there are no candidates.
    """
    best_distance = float('inf')
    median = None

    # Iterate in sorted order: the candidates come from a set of str, whose
    # iteration order can change between interpreter runs, which previously
    # made the winner among equally good k-mers non-deterministic.
    candidate_kmers = sorted(all_kmers_from_dna(dna, k))

    for pattern in candidate_kmers:
        dist = DistanceBetweenPatternAndStrings(pattern, dna)
        if dist < best_distance:
            best_distance = dist
            median = pattern

    return median

**Profile-most Probable k-mer Problem**

In [7]:
def profile_probability(kmer, profile):
    """Multiply together the profile entries selected by each letter of kmer.

    Args:
        kmer: String over the letters ``A``, ``C``, ``G``, ``T``.
        profile: Indexable as ``profile[row][column]`` with rows ordered
            A, C, G, T and one column per k-mer position.

    Returns:
        The product of the selected entries, starting from ``1.0`` (so an
        empty ``kmer`` yields ``1.0``).

    Raises:
        KeyError: If ``kmer`` contains a letter other than A, C, G or T.
    """
    row_of = dict(zip('ACGT', range(4)))
    likelihood = 1.0
    column = 0
    for base in kmer:
        likelihood = likelihood * profile[row_of[base]][column]
        column += 1
    return likelihood

In [8]:
def profile_most_probable_kmer(dna, k, profile):
    """Scan dna for the k-length window scoring highest under profile.

    Each window is scored with ``profile_probability``; the first window
    reaching the highest score wins.

    Args:
        dna: A single sequence string.
        k: Window length.
        profile: Profile matrix passed through to ``profile_probability``.

    Returns:
        A ``(score, kmer)`` tuple. When ``dna`` has no window of length
        ``k`` the result is ``(-1, dna[:k])``.
    """
    top_kmer = dna[:k]
    top_score = -1
    windows = (dna[start:start + k] for start in range(len(dna) - k + 1))
    for window in windows:
        score = profile_probability(window, profile)
        if score > top_score:
            top_score, top_kmer = score, window
    return top_score, top_kmer