**Authors:** Zhanyuan Zhang, Adam Stafford  
**Purpose:** This notebook automates the generation of words from a motif's position weight matrix (PWM).  
**Usage:** 
* Update the jaspar_list if necessary  
* Modify the jaspar_path if necessary
* Set `motif_index` to the position of the desired motif in `jaspar_list`
* Run the rest of the notebook, and check if the intermediate outputs match expectations
* Finally, set `buffer_file_path`, the file to which the list of words is written (pickled)

In [1]:
import numpy as np
import os
import pickle
import json

In [2]:
# JASPAR PWM files this notebook can process; extend the list as new motifs are needed
jaspar_list = [
    "MA0049.1_hb.jaspar",
    "MA0216.2_cad.jaspar",
    "MA0212.1_bcd.jaspar",
    "MA0447.1_gt.jaspar",
]

# Directory containing the files listed above (note the trailing slash).
# Adjust this to match the local machine.
jaspar_path = "/home/zhanyuan/uc-berkeley/discoverydna/team_neural_network/data/input/jaspar_pwm/"

In [3]:
# Change the index according to the jaspar_list
motif_index = 1

# Read every line of the selected JASPAR file. The context manager closes the
# file handle as soon as the lines are read, and os.path.join builds the path
# correctly whether or not jaspar_path ends with a separator.
with open(os.path.join(jaspar_path, jaspar_list[motif_index]), 'r') as jaspar_file:
    data = jaspar_file.readlines()

# Extract the name of the motif for buffering: the header line has the form
# ">ID<whitespace>name", so the second whitespace-separated field is the name
motif_name = data[0].split()[1]

# Slice bounds for the counts inside the brackets of a matrix row:
# START is the first index after "A  [" and END is the index of the blank
# that precedes the closing "]" (rows are expected to end with " ]\n")
start = 4
end = len(data[1]) - 3

data

['>MA0216.2\tcad\n',
 'A  [   854    575      0    745   2117      0   2236   2303   2303   1637   1046 ]\n',
 'C  [     0    341   1481   1475      0     95      0      0      0    115    755 ]\n',
 'G  [  1143   1387      0      0    186      0      0      0      0    147      0 ]\n',
 'T  [   306      0    822     83      0   2208     67      0      0    404    502 ]\n']

In [4]:
# Parse each matrix row of the JASPAR file into a list of float counts
pre_pwm = []
for a_line in data[1:]:
    if not a_line.strip():
        # Ignore blank lines (e.g. an extra newline at the end of the file),
        # which would otherwise add an empty row to the matrix
        continue
    opening = a_line.find("[")
    closing = a_line.rfind("]")
    if 0 <= opening < closing:
        # Use this row's own brackets, so rows whose width differs from the
        # first row are still parsed completely
        row = a_line[opening + 1:closing]
    else:
        # No brackets on this line: fall back to the fixed START/END slice
        row = a_line.split("\t")[0][start:end]
    numbers = row.split()
    entries = [float(k) for k in numbers]  # cast to float
    pre_pwm.append(entries)
pre_pwm

[[854.0,
  575.0,
  0.0,
  745.0,
  2117.0,
  0.0,
  2236.0,
  2303.0,
  2303.0,
  1637.0,
  1046.0],
 [0.0, 341.0, 1481.0, 1475.0, 0.0, 95.0, 0.0, 0.0, 0.0, 115.0, 755.0],
 [1143.0, 1387.0, 0.0, 0.0, 186.0, 0.0, 0.0, 0.0, 0.0, 147.0, 0.0],
 [306.0, 0.0, 822.0, 83.0, 0.0, 2208.0, 67.0, 0.0, 0.0, 404.0, 502.0]]

In [5]:
# Convert the 2D list into a 2D array (one row per base, one column per
# motif position). A plain ndarray is used because NumPy no longer recommends
# np.matrix; element access with [i, j] and .shape work the same way.
hbpwm = np.array(pre_pwm)

# Find the length of the words
word_length = len(pre_pwm[0])

# Determine the background model (assuming ATCG show up uniformly):
# b is the total count observed at the first position divided by 4.
# NOTE(review): this assumes every position has the same total as the first
# one - confirm for each motif file.
total_obs = sum(row[0] for row in pre_pwm)
b = total_obs / 4

# Transform the entries by taking log of the ratio to the background.
# Zero counts map to -inf by design, so the divide-by-zero warning that
# np.log emits for them is silenced locally.
with np.errstate(divide="ignore"):
    log_hbpwm = np.log(hbpwm / b)
print(log_hbpwm)

[[ 3.94257655e-01 -1.30349790e-03            -inf  2.57710680e-01
   1.30208173e+00            -inf  1.35677030e+00  1.38629436e+00
   1.38629436e+00  1.04494704e+00  5.97055106e-01]
 [           -inf -5.23791061e-01  9.44799276e-01  9.40739730e-01
             -inf -1.80179665e+00            -inf            -inf
             -inf -1.61074141e+00  2.71044211e-01]
 [ 6.85738125e-01  8.79224882e-01            -inf            -inf
  -1.12992686e+00            -inf            -inf            -inf
             -inf -1.36524095e+00            -inf]
 [-6.32088437e-01            -inf  3.56066856e-01 -1.93683293e+00
             -inf  1.34416887e+00 -2.15098092e+00            -inf
             -inf -3.54258661e-01 -1.37073419e-01]]


  


In [6]:
# Collect the (row, column) index of every strictly positive log-ratio entry,
# scanning row by row; -inf entries (zero counts) never qualify
n_rows, n_cols = log_hbpwm.shape
positive_entries = [
    (row, col)
    for row in range(n_rows)
    for col in range(n_cols)
    if log_hbpwm[row, col] > 0
]
positive_entries

[(0, 0),
 (0, 3),
 (0, 4),
 (0, 6),
 (0, 7),
 (0, 8),
 (0, 9),
 (0, 10),
 (1, 2),
 (1, 3),
 (1, 10),
 (2, 0),
 (2, 1),
 (3, 2),
 (3, 5)]

In [7]:
# Row index of the matrix -> nucleotide letter
ACGT = {0: "A", 1: "C", 2: "G", 3: "T"}

# For every word position, list the letters whose entry at that position is
# positive (in the order they appear in positive_entries); a position with no
# positive entry yields an empty list
pairs = [
    [ACGT[row] for row, col in positive_entries if col == position]
    for position in range(word_length)
]
print(pairs)

[['A', 'G'], ['G'], ['C', 'T'], ['A', 'C'], ['A'], ['T'], ['A'], ['A'], ['A'], ['A'], ['A', 'C']]


In [8]:
# Build the words incrementally: W[k] holds every word of length k + 1, and
# each new level appends one allowed letter to every word of the previous level
W = [pairs[0]]
for letters in pairs[1:]:
    prefixes = W[-1]
    # Outer loop over letters, inner loop over prefixes, so the newest
    # position varies slowest in the resulting order
    W.append([prefix + letter for letter in letters for prefix in prefixes])

In [9]:
# Display the complete words: the list at index word_length - 1 holds the
# words that span every position of the motif
W[word_length - 1]

['AGCAATAAAAA',
 'GGCAATAAAAA',
 'AGTAATAAAAA',
 'GGTAATAAAAA',
 'AGCCATAAAAA',
 'GGCCATAAAAA',
 'AGTCATAAAAA',
 'GGTCATAAAAA',
 'AGCAATAAAAC',
 'GGCAATAAAAC',
 'AGTAATAAAAC',
 'GGTAATAAAAC',
 'AGCCATAAAAC',
 'GGCCATAAAAC',
 'AGTCATAAAAC',
 'GGTCATAAAAC']

In [10]:
# Output file for the generated words, placed next to the JASPAR files.
# os.path.join keeps the path valid whether or not jaspar_path ends with a
# path separator.
buffer_file_path = os.path.join(jaspar_path, "words_generated_by_" + motif_name)

In [11]:
# Serialize the full-length words to buffer_file_path as a pickle
with open(buffer_file_path, mode="wb") as word_buffer:
    pickle.dump(W[word_length - 1], word_buffer)