In [1]:
import numpy as np
import os
import pickle

In [2]:
# JASPAR count-matrix files this notebook can process; append new motifs here.
jaspar_list = [
    "MA0049.1_hb.jaspar",
    "MA0216.2_cad.jaspar",
    "MA0212.1_bcd.jaspar",
    "MA0447.1_gt.jaspar",
]

# Directory containing the files above -- adjust for your machine.
# The trailing slash matters: later cells build paths by plain string concatenation.
jaspar_path = "/home/zhanyuan/uc-berkeley/discoverydna/team_neural_network/data/input/jaspar_pwm/"

In [3]:
# Change the index according to the jaspar_list
# Read the selected motif file line by line. The context manager guarantees the
# file handle is closed as soon as the lines are in memory.
with open(jaspar_path + jaspar_list[3], 'r') as jaspar_file:
    data = jaspar_file.readlines()

# Extract the name of the motif for buffering.
# The header line is whitespace-separated ('>ID<tab>name'); field 1 is the motif name.
motif_name = data[0].split()[1]

# Each count row looks like 'A  [  n  n ... n ]\n'.
# START (4) is the first character after the opening '[';
# END drops the trailing ' ]\n' (3 characters), measured on the first count row.
start = 4
end = len(data[1]) - 3

data

['>MA0447.1\tgt\n',
 'A  [    28      0      1     54      0      7      0     55     60      2 ]\n',
 'C  [     5      0      1      0     53      0      6      3      0     25 ]\n',
 'G  [    25      0      3      6      0     53      0      1      0      5 ]\n',
 'T  [     2     60     55      0      7      0     54      1      0     28 ]\n']

In [4]:
# Parse every count row (everything after the header line) into a list of floats:
# keep the text before any tab, slice out the span between the brackets using
# start/end, then split on whitespace and cast each token.
pre_pwm = [
    [float(token) for token in count_line.split("\t")[0][start:end].split()]
    for count_line in data[1:]
]
pre_pwm

[[28.0, 0.0, 1.0, 54.0, 0.0, 7.0, 0.0, 55.0, 60.0, 2.0],
 [5.0, 0.0, 1.0, 0.0, 53.0, 0.0, 6.0, 3.0, 0.0, 25.0],
 [25.0, 0.0, 3.0, 6.0, 0.0, 53.0, 0.0, 1.0, 0.0, 5.0],
 [2.0, 60.0, 55.0, 0.0, 7.0, 0.0, 54.0, 1.0, 0.0, 28.0]]

In [5]:
# Convert the 2D list into a plain 2-D ndarray.
# np.matrix is no longer recommended by NumPy; element-wise division, np.log,
# .shape and [i, j] indexing behave the same on a regular array.
hbpwm = np.array(pre_pwm, dtype=float)

# Find the length of the words (one motif position per column)
word_length = len(pre_pwm[0])

# Determine the background model (assuming ATCG show up uniformly):
# sum the counts of all rows at the first position and split them evenly over
# the four bases.
# NOTE(review): this uses the first column only -- presumably every position
# has the same total count; confirm for each motif file.
total_obs = sum(base_counts[0] for base_counts in pre_pwm)
b = total_obs/4

# Transform the entries by taking log of (count / background).
# Zero counts intentionally become -inf, so the divide-by-zero warning that
# np.log would emit for them is suppressed for this statement only.
with np.errstate(divide="ignore"):
    log_hbpwm = np.log(hbpwm/b)
print(log_hbpwm)

[[ 0.62415431        -inf -2.7080502   1.28093385        -inf -0.76214005
         -inf  1.29928298  1.38629436 -2.01490302]
 [-1.09861229        -inf -2.7080502         -inf  1.26224171        -inf
  -0.91629073 -1.60943791        -inf  0.51082562]
 [ 0.51082562        -inf -1.60943791 -0.91629073        -inf  1.26224171
         -inf -2.7080502         -inf -1.09861229]
 [-2.01490302  1.38629436  1.29928298        -inf -0.76214005        -inf
   1.28093385 -2.7080502         -inf  0.62415431]]


  


In [6]:
# Collect the (row, column) index of every entry whose log-ratio is strictly
# positive, scanning row by row. -inf entries never qualify.
n_rows, n_cols = log_hbpwm.shape
positive_entries = [
    (r, c)
    for r in range(n_rows)
    for c in range(n_cols)
    if log_hbpwm[r, c] > 0
]
positive_entries

[(0, 0),
 (0, 3),
 (0, 7),
 (0, 8),
 (1, 4),
 (1, 9),
 (2, 0),
 (2, 5),
 (3, 1),
 (3, 2),
 (3, 6),
 (3, 9)]

In [7]:
# Row index -> nucleotide letter, in the same order as the count rows (A, C, G, T).
ACGT = {0:"A", 1:"C", 2:"G", 3:"T"}

# For each motif position, list the letters whose entry was positive at that
# position, in the order they appear in positive_entries.
pairs = [
    [ACGT[row] for row, col in positive_entries if col == position]
    for position in range(word_length)
]
print(pairs)

[['A', 'G'], ['T'], ['T'], ['A'], ['C'], ['G'], ['T'], ['A'], ['A'], ['C', 'T']]


In [8]:
# Build all words one position at a time: W[i] holds every prefix of length
# i + 1, obtained by appending each letter allowed at position i to every
# prefix in W[i - 1] (letters vary in the outer loop, prefixes in the inner).
W = [pairs[0]]
for allowed_letters in pairs[1:]:
    prefixes = W[-1]
    W.append([prefix + letter for letter in allowed_letters for prefix in prefixes])

In [9]:
# Display the entry of W at index word_length - 1, i.e. the words that span
# all word_length motif positions.
W[word_length - 1]

['ATTACGTAAC', 'GTTACGTAAC', 'ATTACGTAAT', 'GTTACGTAAT']

In [10]:
# Output file lives next to the JASPAR inputs and is named after the motif.
buffer_file_path = "".join([jaspar_path, "words_generated_by_", motif_name])

In [11]:
# Serialize the full-length word list to buffer_file_path in binary mode.
# The file is a pickle: only load it back from locations you trust.
with open(buffer_file_path, "wb") as buff:
    pickle.dump(W[word_length - 1], buff)