nuin / beginning-python-for-bioinformatics

repository for the code featured in the blog

This URL has Read+Write access

beginning-python-for-bioinformatics / scripts / motifs / pymotif.py
100644 51 lines (42 sloc) 1.447 kb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
#!/usr/bin/env python
 
import fasta
import sys
from collections import defaultdict
 
def choose(n, k):
    if 0 <= k <= n:
        ntok = 1
        ktok = 1
        for t in xrange(1, min(k, n - k) + 1):
            ntok *= n
            ktok *= t
            n -= 1
        return ntok // ktok
    else:
        return 0
    
def get_quorums(seqs, mlen):
    """
add seq id_no to a set
use explicit counter to create seq_no
"""
    quorum = defaultdict(int)
    for seq in seqs:
        for n in range(len(seq) - mlen):
            quorum[seq[n:n + mlen]] += 1
    return quorum
    
def calculate_motifs(input_seqs, input_seqs2, width):
    
    print input_seqs, input_seqs2
    input_seqs = fasta.read_seqs(open(input_seqs).readlines())
    input_seqs2 = fasta.read_seqs(open(input_seqs2).readlines())
 
    foreground = get_quorums(input_seqs, width)
    background = get_quorums(input_seqs2, width)
 
    N = len(input_seqs) + len(input_seqs2)
 
    res_motifs = []
    for i in foreground:
        term1 = choose(background[i], foreground[i])
        term2 = choose((N - background[i]), len(input_seqs) - 1)
        term3 = choose(N, len(input_seqs))
        p = (float(term1) * float(term2)) / term3
        if 0 < p <= 0.0001:
            res_motifs.append(i + '\t' + str(foreground[i]) + '\t' + str(background[i]) + '\t' + str(p))
    
    res_motifs.sort()
    return res_motifs