In [1]:
def profile_from_motifs(motifs, pseudocount):
    """Build a per-column nucleotide frequency profile with pseudocounts.

    Every nucleotide count in every column starts at ``pseudocount`` and is
    incremented once for each motif showing that nucleotide in that column.
    Counts are then divided by ``len(motifs) + 4 * pseudocount`` so that each
    column sums to 1.

    Args:
        motifs: non-empty list of strings over 'A', 'C', 'G', 'T'; the first
            ``len(motifs[0])`` characters of each string are used.
        pseudocount: value added to each of the four counts in every column.

    Returns:
        dict mapping 'A', 'C', 'G', 'T' to a list of per-column frequencies.

    Raises:
        IndexError: if ``motifs`` is empty or a motif is shorter than the first.
        KeyError: if a motif contains a character other than A, C, G, T.
    """
    n = len(motifs[0])
    total = len(motifs) + pseudocount * 4
    # Seed with the pseudocount itself (not a fixed 1) so that values other
    # than 1 are honoured in the numerator as well as in the denominator.
    counts = {x: [pseudocount] * n for x in ['A', 'C', 'G', 'T']}
    for motif in motifs:
        for i in range(n):
            counts[motif[i]][i] += 1
    return {x: [c / total for c in column] for x, column in counts.items()}

In [2]:
# Sanity check: profile of four 4-character motifs with a pseudocount of 1.
profile_from_motifs(['ACGT', 'GCAT', 'ACCT', 'AATC'], 1)

{'A': [0.5, 0.25, 0.25, 0.125],
 'C': [0.125, 0.5, 0.25, 0.25],
 'G': [0.25, 0.125, 0.25, 0.125],
 'T': [0.125, 0.125, 0.25, 0.5]}

In [3]:
def Profile_score(matrix, k_mer):
    """Multiply together the profile entries selected by ``k_mer``.

    For each position ``p`` in ``k_mer`` the factor is
    ``matrix[k_mer[p]][p]``. An empty ``k_mer`` yields 1.

    Args:
        matrix: mapping from character to a list indexed by position.
        k_mer: string whose characters are keys of ``matrix``.

    Returns:
        The product of the selected entries.
    """
    probability = 1
    for position, nucleotide in enumerate(k_mer):
        probability = probability * matrix[nucleotide][position]
    return probability

In [4]:
def most_probable_k_mer(string, profile, k):
    """Return the length-``k`` window of ``string`` with the highest Profile_score.

    Windows are scanned left to right and only a strictly higher score
    replaces the current best, so ties keep the earliest window. If no window
    scores above 0 (or ``string`` is shorter than ``k``), ``string[:k]`` is
    returned.

    Args:
        string: text to scan.
        profile: profile passed through to ``Profile_score``.
        k: window length.

    Returns:
        The selected substring of ``string``.
    """
    best_prob = 0
    k_best = string[:k]
    for start in range(len(string) - k + 1):
        k_mer = string[start:start + k]
        # Score each window once; the original evaluated it twice on every improvement.
        candidate_prob = Profile_score(profile, k_mer)
        if candidate_prob > best_prob:
            best_prob = candidate_prob
            k_best = k_mer
    return k_best

In [5]:
# Worked example: a 5-column profile (one list of per-position values per nucleotide).
matrix = {
    'A': [0.2, 0.2, 0.3, 0.2, 0.3],
    'C': [0.4, 0.3, 0.1, 0.5, 0.1],
    'G': [0.3, 0.3, 0.5, 0.2, 0.4],
    'T': [0.1, 0.2, 0.1, 0.1, 0.2]
}
# Window length; matches the number of columns in `matrix`.
k = 5
text = 'ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT'
# Last expression of the cell, so the selected k-mer is displayed as the cell output.
most_probable_k_mer(text, matrix, k)

'CCGAG'

In [6]:
def motif_matrix_score(motifs):
    """Score a list of motifs by counting mismatches against the column majority.

    For every column (as many as the first motif has characters) the
    occurrences of 'A', 'C', 'G', 'T' are counted across all motifs; the
    column contributes ``len(motifs)`` minus the largest of those counts.

    Args:
        motifs: non-empty list of strings over 'A', 'C', 'G', 'T'.

    Returns:
        Sum of the per-column contributions (0 when every column is uniform).
    """
    alphabet = ['A', 'C', 'G', 'T']
    width = len(motifs[0])
    depth = len(motifs)
    tallies = {base: [0] * width for base in alphabet}
    for col in range(width):
        for row in motifs:
            tallies[row[col]][col] += 1
    return sum(
        depth - max(tallies[base][col] for base in alphabet)
        for col in range(width)
    )

In [7]:
import random
def motif_matrix_from_dna(dna, k):
    """
    Pick one randomly positioned slice of length k from every string in dna.

    The start index is drawn with random.randint from 0 to len(string) - k
    inclusive, so every full-length window is a possible choice.

    return: list with one slice per string, in the order of dna
    """
    picks = []
    for line in dna:
        last_start = len(line) - k
        start = random.randint(0, last_start)
        picks.append(line[start:start + k])
    return picks

In [8]:
# Smoke test on arbitrary (non-DNA) strings: one random 3-character slice per string.
motif_matrix_from_dna(['asgasg', 'asgasg', 'asbjas', 'asmfbasf'], 3)

['gas', 'gas', 'sbj', 'mfb']

In [9]:
def motifs_from_prifile(profile, dna, k):
    """
    Select one k-mer per string of dna using the given profile.

    Each string is passed to most_probable_k_mer together with profile and k;
    the results are returned as a list in the same order as dna.
    """
    chosen = []
    for line in dna:
        chosen.append(most_probable_k_mer(line, profile, k))
    return chosen

In [48]:
def randomised_motif_search(dna, k, t, min_iterations=30):
    """Run one randomized motif search starting from random k-mers.

    Starting from one random k-mer per string, the loop repeatedly builds a
    profile (pseudocount 1) from the current motifs and re-selects motifs
    from that profile. The lowest-scoring motif set seen so far is kept. The
    search stops at the first iteration that does not improve on the best
    score once at least ``min_iterations`` iterations have run.

    Args:
        dna: list of strings to search.
        k: motif length.
        t: accepted for interface compatibility; not used by this function.
        min_iterations: minimum number of iterations before a non-improving
            iteration ends the search (defaults to the previously hard-coded 30).

    Returns:
        The motif list with the lowest motif_matrix_score encountered.
    """
    motifs = motif_matrix_from_dna(dna, k)
    best_motifs = motifs[:]
    # Cache the best score instead of re-scoring best_motifs on every iteration.
    best_score = motif_matrix_score(best_motifs)
    iteration = 0
    while True:
        iteration += 1
        profile = profile_from_motifs(motifs, 1)
        motifs = motifs_from_prifile(profile, dna, k)
        current_score = motif_matrix_score(motifs)
        if current_score < best_score:
            best_motifs, best_score = motifs, current_score
        elif iteration >= min_iterations:
            return best_motifs

In [49]:
def get_fucking_shit(dna, k, t, iterations=1000):
    """Repeat randomised_motif_search and keep the lowest-scoring motif set.

    Each time a run produces a lower score than the best so far, that score
    is printed and the run's motifs become the current best.

    Args:
        dna: list of strings to search.
        k: motif length.
        t: used with ``k`` for the initial score bound ``k * t`` and passed
            through to randomised_motif_search.
        iterations: number of independent searches to run (defaults to the
            previously hard-coded 1000).

    Returns:
        The best motif list found, or None when ``iterations`` is 0.
    """
    score = k * t
    # Start from None so the function never hits an unbound `best` when no
    # run beats the initial k * t bound; the first run is always accepted.
    best = None
    for _ in range(iterations):
        motifs = randomised_motif_search(dna, k, t)
        current = motif_matrix_score(motifs)  # scored once per run
        if best is None or current < score:
            score = current
            print(score)
            best = motifs
    return best

In [50]:
# Sample input: motif length k and t, which here equals the number of strings in dna.
k, t = 8, 5
dna = [
    'CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA',
    'GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG',
    'TAGTACCGAGACCGAAAGAAGTATACAGGCGT',
    'TAGATCAAGTTTCAGGTGCACGTCGGTGAACC',
    'AATCCACCAGCTCCACGTGCAATGTTGGCCTA'
]

In [52]:
# Run the repeated randomized search on the current dna, k, t; each improved score is
# printed as it is found and the returned motif list is shown as the cell output.
get_fucking_shit(dna, k, t)

12
11
10
9


['TCTCGGGG', 'CCAAGGTG', 'TACAGGCG', 'TTCAGGTG', 'TCCACGTG']

In [53]:
# Load the dataset: the first line holds the two integers k and t, and every
# following non-blank line is taken as one DNA string.
with open('data/dataset_161_5.txt', 'r') as f:
    # split() without arguments tolerates repeated spaces or tabs between the integers.
    k, t = map(int, f.readline().split())
    # Drop blank lines (e.g. a trailing newline at end of file) so that no empty
    # string ends up in dna, where it would make random.randint fail later.
    dna = [line.strip() for line in f if line.strip()]

In [54]:
from ipywidgets import IntProgress
from IPython.display import display
import time
# Number of independent randomized searches to run.
n = 1000
prgBar = IntProgress(min = 0, max = n) # Create the progress bar
display(prgBar) # Show the progress bar on screen

score = k * t
# Reset `best` explicitly so a value left over from an earlier cell run cannot
# be printed by mistake, and so the final print never hits an undefined name.
best = None
for i in range(n):
    prgBar.value += 1
    motifs = randomised_motif_search(dna, k, t)
    motifs_score = motif_matrix_score(motifs)  # scored once per run
    # The first run is always accepted; later runs must strictly improve.
    if best is None or motifs_score < score:
        score = motifs_score
        best = motifs
        print(score)
print('\n'.join(best))

IntProgress(value=0, max=1000)

143
131
108
62


KeyboardInterrupt: 