In [None]:
import numpy as np

def form_profile(motifs):
    '''Build a pseudocount-smoothed nucleotide profile from a list of motifs.

    motifs -> non-empty list of equal-length strings over the alphabet ACGT

    Returns a dict keyed by row index (0 = A, 1 = C, 2 = G, 3 = T); each value
    is a list of length k holding, for every motif position, the smoothed
    frequency (count + 1) / (t + 4), where t is the number of motifs.
    '''
    alphabet = 'ACGT'
    k = len(motifs[0])
    t = len(motifs)
    # One pseudocount is added for each of the four nucleotides, so the
    # denominator must grow by len(alphabet) for every column to sum to 1.
    total = t + len(alphabet)
    profile = {row: [0] * k for row in range(len(alphabet))}

    for i in range(k):
        column = [motif[i] for motif in motifs]
        for row, nucleotide in enumerate(alphabet):
            profile[row][i] = (column.count(nucleotide) + 1) / total
    return profile


def Score(motifs):
    '''Return (score, consensus) for a list of equal-length motifs.

    For every column the occurrences of A, C, G and T are tallied. The
    consensus takes the most frequent nucleotide of each column (ties go to
    the earliest of A, C, G, T) and the score is the total number of tallied
    letters that differ from that consensus nucleotide.
    '''
    alphabet = 'ACGT'
    width = len(motifs[0])
    counts = np.zeros((4, width))
    score = 0
    letters = []

    for pos in range(width):
        column = [motif[pos] for motif in motifs]
        for row, nucleotide in enumerate(alphabet):
            counts[row][pos] = column.count(nucleotide)

        tallies = counts[:, pos]
        score += tallies.sum() - tallies.max()
        # argmax returns the first maximum, i.e. A before C before G before T.
        letters.append(alphabet[int(np.argmax(tallies))])

    return score, ''.join(letters)


def GreedyMotifFinding(seq, k, profile):
    '''Find the profile-most-probable k-mer in a sequence.

    seq -> string to scan
    k -> length of the k-mer
    profile -> indexable as profile[row][column] with rows 0..3 standing for
               A, C, G, T (for example the dict built by form_profile)

    Returns the first k-mer of seq with the highest probability under the
    profile. Characters other than A, C, G, T leave the probability unchanged.

    Raises ValueError if no k-mer could be selected (e.g. seq is shorter
    than k).
    '''
    row_of = {'A': 0, 'C': 1, 'G': 2, 'T': 3}
    max_prob = -1
    most_probable_kmer = None
    for i in range(len(seq) - k + 1):
        kmer = seq[i:i+k]
        prob = 1
        for j in range(k):
            row = row_of.get(kmer[j])
            if row is not None:
                prob *= profile[row][j]

        # Strict comparison keeps the earliest k-mer when probabilities tie.
        if prob > max_prob:
            max_prob = prob
            most_probable_kmer = kmer

    if most_probable_kmer is None:
        raise ValueError(
            'no %d-mer could be selected from a sequence of length %d'
            % (k, len(seq))
        )
    return most_probable_kmer

def greedysearch(dna, k, t):
    '''Greedy motif search over the first t strings of dna.

    dna -> list of DNA strings
    k -> length of the motif
    t -> number of strings to draw motifs from

    Every k-mer of dna[0] seeds a candidate motif set; each following string
    contributes its most probable k-mer under the profile of the motifs
    chosen so far. The candidate set with the lowest Score wins, with the
    leading k-mers of all strings as the initial best.

    Returns the consensus string of the best motif set found.
    '''
    bestmotifs = [seq[:k] for seq in dna]
    # Score the starting motifs up front so a consensus always exists, even
    # when no candidate set improves on them.
    best_score, consensus = Score(bestmotifs)

    for i in range(len(dna[0]) - k + 1):
        motifs_it = [dna[0][i:i+k]]
        for j in range(1, t):
            profile = form_profile(motifs_it)
            motifs_it.append(GreedyMotifFinding(dna[j], k, profile))

        candidate_score, candidate_consensus = Score(motifs_it)
        # Strict comparison: on ties the earlier motif set is kept.
        if candidate_score < best_score:
            bestmotifs = motifs_it
            best_score = candidate_score
            consensus = candidate_consensus
    return consensus
   
# Input file layout: first line holds "k t", the remainder holds the
# whitespace-separated DNA strings.
with open("dataset_30306_9.txt", 'r') as file:
    first_line = file.readline().strip().split()
    k, t = map(int, first_line)
    dna = file.read().strip().split()

# Run the search once and reuse the result for both printouts.
result = greedysearch(dna, k, t)
print(' '.join(result))
print(len(result))
# dna = ['GGCGTTCAGGCA', 'AAGAATCAGTCA', 'CAAGGAGTTCGC', 'CACGTCAATCAC', 'CAATAATATTCG']
# k, t = 3, 5
# print(' '.join(greedysearch(dna, k, t)))

G G G A A G C T T C C T
12


In [None]:
def Neighbors(pattern, d):
    '''Generate all strings within Hamming distance d of a pattern.

    pattern -> string over the alphabet ACGT
    d -> maximum number of mismatching positions

    Returns a set of strings of the same length as pattern. With d == 0 or an
    empty pattern, the set contains only the pattern itself.
    '''
    # Always hand back a set, so callers can iterate over whole strings
    # instead of the characters of a bare string; the empty-pattern guard
    # also stops the recursion below from never terminating.
    if d == 0 or not pattern:
        return {pattern}
    alphabet = {'A', 'C', 'G', 'T'}
    if len(pattern) == 1:
        return alphabet

    suffix = pattern[1:]
    neighbors = set()
    for text in Neighbors(suffix, d):
        # Hamming distance between the suffix and its neighbour
        # (both strings have the same length).
        mismatches = sum(a != b for a, b in zip(suffix, text))
        if mismatches < d:
            # One mismatch is still available: any first letter is allowed.
            for nucleotide in alphabet:
                neighbors.add(nucleotide + text)
        else:
            # Mismatch budget is used up: the first letter must match.
            neighbors.add(pattern[0] + text)
    return neighbors

def allstrings_try(k):
    '''Generate all 4^k k-mers over the alphabet ACGT.

    k -> length of the k-mers

    Returns a list of strings in lexicographic order, so the result (and any
    tie-breaking that depends on its order) is the same on every run.
    '''
    # Imported locally so the function does not depend on the file's
    # top-level imports.
    from itertools import product

    return [''.join(letters) for letters in product('ACGT', repeat=k)]

def DistanceBetweenPatternAndStrings(pattern, dna):
    '''Sum, over all dna strings, the distance from pattern to its best match.

    pattern -> the k-mer to compare
    dna -> a list of dna strings

    For each string the smallest Hamming distance between pattern and any
    window of the same length is taken; a string with no such window
    contributes infinity. Returns the sum of these per-string minima.
    '''
    width = len(pattern)
    total = 0
    for text in dna:
        window_distances = (
            HammingDistance(text[start:start + width], pattern)
            for start in range(len(text) - width + 1)
        )
        total += min(window_distances, default=float('inf'))
    return total

def HammingDistance(seq1, seq2):
    '''Count the positions at which two sequences differ.

    Positions are compared pairwise; when the lengths differ, only the
    positions covered by the shorter sequence are compared.
    '''
    return sum(1 for left, right in zip(seq1, seq2) if left != right)

def MedianString(dna, k):
    '''Find a k-mer minimising the total distance to a collection of strings.

    dna -> list of dna strings
    k -> length of the k-mer

    Every candidate produced by allstrings_try(k) is scored with
    DistanceBetweenPatternAndStrings. When several candidates share the
    minimum, the one enumerated last is returned; with no candidates the
    result is the empty string.
    '''
    best_distance = float('inf')
    median = ''
    for candidate in allstrings_try(k):
        distance = DistanceBetweenPatternAndStrings(candidate, dna)
        # "<=" lets a later candidate replace an equally good earlier one.
        if distance <= best_distance:
            best_distance, median = distance, candidate
    return median



# Sample run: look for the median 7-mer of three DNA strings.
k = 7
dna = [
    'CTCGATGAGTAGGAAAGTAGTTTCACTGGGCGAACCACCCCGGCGCTAATCCTAGTGCCC',
    'GCAATCCTACCCGAGGCCACATATCAGTAGGAACTAGAACCACCACGGGTGGCTAGTTTC',
    'GGTGTTGAACCACGGGGTTAGTTTCATCTATTGTAGGAATCGGCTTCAAATCCTACACAG',
]

print(MedianString(dna, k))

GTAGGAA
