**Author:** Boren Tsai
**Purpose:** This notebook randomly generates DNA sequences by a motif's position weight matrix (PWM)

In [51]:
import numpy as np
import random
import os

In [52]:
# Motifs
jaspar_files = ["MA0049.1_hb.jaspar", "MA0216.2_cad.jaspar", "MA0212.1_bcd.jaspar", "MA0447.1_gt.jaspar"]

# Path (modify accordingly)
file_path = "C:/Users/nicho/Desktop/team_neural_network/data/input/jaspar_pwm/"

In [53]:
# Change the index according to the jaspar_list
motif_index = 1
data = open(file_path + jaspar_files[motif_index], 'r').readlines()

# Extract the name of the motif for buffering
motif_name = data[0].split()[1]

# The first digit shows up at START, the last one shows up at END
start = 4
end = len(data[1]) - 3

data

['>MA0216.2\tcad\n',
 'A  [   854    575      0    745   2117      0   2236   2303   2303   1637   1046 ]\n',
 'C  [     0    341   1481   1475      0     95      0      0      0    115    755 ]\n',
 'G  [  1143   1387      0      0    186      0      0      0      0    147      0 ]\n',
 'T  [   306      0    822     83      0   2208     67      0      0    404    502 ]\n']

In [54]:
#Convert jaspar into numpy matrix 
PWM = []
for a_line in data[1:]:
    row = a_line.split("\t")[0][start:end]
    numbers = row.split() 
    entries = [int(k) for k in numbers] # cast to int
    PWM.append(entries)

rows = sum([row[0] for row in PWM])
columns = len(PWM[0])

DNA_Array = np.zeros(shape=(rows, columns))

print("Position Weight Matrix: " 
      + jaspar_files[motif_index] 
      + "\n", PWM, "\n")

Position Weight Matrix: MA0216.2_cad.jaspar
 [[854, 575, 0, 745, 2117, 0, 2236, 2303, 2303, 1637, 1046], [0, 341, 1481, 1475, 0, 95, 0, 0, 0, 115, 755], [1143, 1387, 0, 0, 186, 0, 0, 0, 0, 147, 0], [306, 0, 822, 83, 0, 2208, 67, 0, 0, 404, 502]] 



In [55]:
# Places FILL into MATRIX at column POSITION
# NUM amount of times
def fillIn(matrix, fill, position, num):
    counter = 0
    random_position = random.sample(range(rows), rows)
    while (len(random_position) > 0):
        if counter > num:
            break
        else:
            place = random_position.pop()
            while(matrix[place][position] != 0) :
                if len(random_position) > 0:
                    place = random_position.pop()
                else:
                    break
            if matrix[place][position] == 0:
                matrix[place][position] = fill
                counter += 1

In [56]:
# Creates randomly generator DNA sequences based off
# the information given by the specified posiiton weight matrix
def randomizedSeqGenerator(WeightMatrix):
    for k in range(1, 5):
        for i in range(columns):
            fillIn(DNA_Array, k, i, WeightMatrix[k-1][i])

    reader = {1 : "A", 2 : "C", 3 : "G", 4: "T"}

    lst = ["" for i in range(rows)]

    for i in range(rows):
        for j in range(columns):
            lst[i] += reader[DNA_Array[i][j]]

    seq = {}
    
    for i in range(len(lst)):
        if not (lst[i] in seq.keys()):
            seq[lst[i]] = 1
        else:
            pass
    
    for i in seq.keys():
        yield i

In [59]:
generator = randomizedSeqGenerator

In [62]:
for i in generator:
    print(i)

TypeError: 'function' object is not an iterator