## Evaluate SIM tests
* Look at results from MEME, MEMECHIP, DREME, and mEpigram
* See if the inserted motifs are found 
* mEpigram: top 3 motifs; other tools: top 3 motifs with k >= 6
* Method to use: motif similarity score 

In [2]:
from scipy import stats
from sys import argv
'''
TODO: Change the algorithm for motif alignment-distance calculation

Remove similar motifs:
Usage:
python filter_motiflist.py input output
'''
def load_motifs(filename):
    '''Parse a motif file whose matrix rows hold five tab-separated values.

    The file text is split on the literal token "MOTIF"; everything before
    the first occurrence is discarded. Within each motif chunk the first two
    lines are kept verbatim as metadata and every following non-blank line is
    parsed as one matrix row with columns in the order A, C, G, T, E.

    The key of a motif is the integer that precedes the first underscore on
    its name line (e.g. "MOTIF 3_foo" gives key 3). If two motifs share a
    key, the later one replaces the earlier one.

    Args:
        filename: path of the motif file to read.

    Returns:
        A pair (motifs, meta). motifs maps key -> dict with the keys
        'A', 'C', 'G', 'T', 'E', each a list of floats (one per position).
        meta maps key -> list holding the first two lines of the chunk.

    Raises:
        ValueError: a motif key is not an integer or a value is not a float.
        IndexError: a matrix row has fewer than five columns.
    '''
    alphabet = ('A', 'C', 'G', 'T', 'E')

    # The context manager closes the handle even if parsing fails below.
    with open(filename) as handle:
        chunks = handle.read().split("MOTIF")[1:]

    motifs = {}
    meta = {}
    for chunk in chunks:
        lines = chunk.strip().split("\n")
        motif_id = int(lines[0].split('_')[0])
        meta[motif_id] = lines[:2]

        columns = dict((base, []) for base in alphabet)
        for row in lines[2:]:
            fields = row.strip().split("\t")
            if fields == ['']:
                # A blank line inside the matrix carries no data.
                continue
            # Index explicitly (instead of zip) so that a short row raises
            # IndexError rather than silently producing ragged columns.
            for index, base in enumerate(alphabet):
                columns[base].append(float(fields[index]))
        motifs[motif_id] = columns

    return motifs, meta

def makevector(motif,start,end):
    '''Flatten the positions start..end-1 of a motif into a single list.

    Values are emitted position by position; within one position the order
    of the letters is A, C, G, T, E. An empty list is returned when end is
    not greater than start.
    '''
    letters = ('A', 'C', 'G', 'T', 'E')
    return [motif[letter][position]
            for position in range(start, end)
            for letter in letters]

#align at least 4 positions together. 



def slide(m1,m2):
    '''Return the best end-overlap Pearson score between two motifs.

    For every overlap length k from 4 up to the length of the shorter motif,
    two alignments are scored with stats.pearsonr on the flattened columns:
    the first k positions of m1 against the last k positions of m2, and the
    last k positions of m1 against the first k positions of m2. The largest
    result is returned, compared the way the original max() compared them
    (by coefficient first, then by p-value).

    Args:
        m1, m2: motif dicts with the keys 'A', 'C', 'G', 'T', 'E', each a
            list with one value per position.

    Returns:
        The winning stats.pearsonr result, indexable as (coefficient,
        p-value). When no alignment can be scored -- either motif is shorter
        than 4 positions, or every correlation is NaN -- the plain tuple
        (0.0, 1.0) is returned so that callers testing result[0] against a
        threshold treat the pair as dissimilar instead of crashing.
    '''
    len1 = len(m1['A'])
    len2 = len(m2['A'])

    best = None
    for k in range(4, min(len1, len2) + 1):
        candidates = (
            stats.pearsonr(makevector(m1, 0, k), makevector(m2, len2 - k, len2)),
            stats.pearsonr(makevector(m1, len1 - k, len1), makevector(m2, 0, k)),
        )
        for score in candidates:
            # pearsonr yields NaN for a constant vector. NaN compares false
            # against everything, so a NaN that reached max() first would
            # mask every real score; x != x is true only for NaN.
            if score[0] != score[0]:
                continue
            if best is None or score > best:
                best = score

    if best is None:
        return (0.0, 1.0)
    return best
def revcomplmotif(motif):
    '''Placeholder for reverse-complementing a motif.

    Not implemented yet: the argument is ignored and None is returned.
    '''
    return None
    

def main():
    '''Remove near-duplicate motifs from a motif file and write the rest.

    Command line: python filter_motiflist.py input output

    Every ordered pair of motifs that are both still present is scored with
    slide(). When the coefficient (first element of the score) is at least
    0.9, the motif with the larger id is deleted. The surviving motifs are
    written to the output file in ascending order of their original ids and
    renumbered consecutively from 0.

    Raises:
        SystemExit: fewer than two command-line arguments were given.
    '''
    if len(argv) < 3:
        raise SystemExit("Usage: python filter_motiflist.py input output")
    source = argv[1]
    output = argv[2]

    motifs, meta = load_motifs(source)

    # Iterate over a snapshot of the ids actually present in the file rather
    # than range(len(motifs)), which silently skipped any motif whose id was
    # not in 0..n-1. The snapshot also stays valid while entries are deleted.
    motif_ids = sorted(motifs)
    for m1 in motif_ids:
        for m2 in motif_ids:
            if m1 == m2 or m1 not in motifs or m2 not in motifs:
                continue
            score = slide(motifs[m1], motifs[m2])
            if score[0] >= 0.9:
                # Single-argument print calls emit the same text under
                # Python 2 and Python 3.
                print("FOUND %s %s %s" % (m1, m2, score))
                duplicate = max(m1, m2)
                print("deleting %s" % duplicate)
                del motifs[duplicate]

    header="MEME version 4.5\nALPHABET= A,C,G,T,mC \nstrands: + \nBackground letter frequencies (from \nA 0.295 C 0.205 G 0.205 T 0.295\n"
    alphabet = ['A', 'C', 'G', 'T', 'E']
    # The context manager closes the output even if a write fails.
    with open(output, 'w') as target:
        target.write(header + '\n')
        for newname, m in enumerate(sorted(motifs)):
            # Drop the old id (text before the first underscore of the
            # metadata) and prepend the new consecutive id.
            h1 = 'MOTIF ' + str(newname) + "_" + '_'.join('\n'.join(meta[m]).split('_')[1:])
            target.write(h1 + '\n')
            for i in range(len(motifs[m]['A'])):
                row = '\t'.join(str(motifs[m][char][i]) for char in alphabet)
                target.write(row + '\n')
            target.write('\n')

# Run the filter only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
