In [1]:
from Bio import SeqIO
from Bio import Seq
import pandas as pd
import numpy as np
import csv
import re
import os

In [2]:
def seqToString(motif):
    """
    motif: the Seq holding the motif

    Gives back the motif as a plain String, assembled by reading the
    Seq one position at a time from the first index to the last.
    """
    letters = [motif[position] for position in range(len(motif))]
    return "".join(letters)

In [3]:
def getNegative(pos_seq):
    """
    pos_seq = dna sequence in the positive direction reading from the file

    Returns the negative counterpart of the positive sequence: the sequence is
    read back to front and every base is replaced by its complement
    (A<->T, G<->C). Gap characters ('-') and unknown bases ('N') are kept as
    they are. The input is case-insensitive; the result is always upper case.

    Raises KeyError for any character other than A, C, G, T, N or '-'.
    """
    # Named `complement` rather than `dict` so the builtin type is not shadowed.
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', '-': '-', 'N': 'N'}
    # join builds the result in one pass instead of repeated string concatenation.
    return "".join(complement[base.upper()] for base in reversed(pos_seq))

In [10]:
def getMotifLength(motif_key, motif_dir="../data/jaspar_fm/modified/"):
    """
    motif_key: file name of the motif matrix, e.g. 'MA0212.1_bcd.jaspar'
    motif_dir: directory that holds the motif files; defaults to the location
               that was previously hard-coded

    Returns the number of whitespace-separated entries on the last non-blank
    line of the motif file, which callers use as the motif length.
    Returns 0 when the file contains no non-blank line.
    """
    # NOTE(review): assumes every matrix row has one column per motif position - confirm
    # against the files in the motif directory.
    length = 0
    # The with-block closes the handle even if reading fails; it used to stay open.
    with open(os.path.join(motif_dir, motif_key), 'r') as f:
        for line in f:
            entries = line.split()
            # Skip blank lines so a trailing empty line cannot reset the length to 0.
            if entries:
                length = len(entries)
    return length

In [4]:
# Numeric ids; presumably the number part of the 'VT...' keys used in the data
# file names (e.g. VT11048) - TODO confirm. Not referenced by the functions
# visible in this section.
species = [
    11048, 16679, 19895, 40548, 43692,
    44110, 48156, 50550, 59000, 60074,
    61209, 6436, 6705, 7859, 8646,
]

# Member codes; codes of this form appear as the third '|'-separated field of the
# fasta record ids (e.g. 'VT11048|0|MEMB005D|-|287') - TODO confirm the list is complete.
member = [
    'Dkik',
    'MEMB002A', 'MEMB002B', 'MEMB002C', 'MEMB002D', 'MEMB002E', 'MEMB002F',
    'MEMB003A', 'MEMB003B', 'MEMB003C', 'MEMB003D', 'MEMB003E', 'MEMB003F',
    'MEMB004A', 'MEMB004B', 'MEMB004E',
    'MEMB005D',
    'MEMB006B', 'MEMB006C',
    'MEMB007A', 'MEMB007B', 'MEMB007C', 'MEMB007D',
    'MEMB008C',
]

In [7]:
def getRawMotifStrings(csv_file, verbose=True):
    """
    csv_file: name of a csv file inside ../data/output/map_motif_bcd_with_threshold/
              that contains the 'filtered' data frame
    verbose: when True (the default) the header row and every data row are
             printed as they are read

    Iterates over the rows of the csv file. For each row it reads the record id
    (column 2), the raw position (column 3), the strand (column 4) and the motif
    file name (column 6), looks the record up in the raw fasta file named after
    the part of the id before the first '|', and slices out the motif starting
    at the raw position. Rows whose strand is "negative" are passed through
    getNegative.

    Returns a list of motif strings, one per data row, in file order.
    An empty csv file yields an empty list.
    """
    motifs = []
    # Each fasta file and each motif file is read once and reused for later rows,
    # instead of being re-parsed for every csv row.
    records_by_key = {}
    motif_lengths = {}
    # newline='' is the mode the csv module asks for; the with-block closes the
    # file even when a row fails to process.
    with open("../data/output/map_motif_bcd_with_threshold/" + csv_file, 'r', newline='') as f:
        data = csv.reader(f)
        header = next(data, None)
        if header is None:
            return motifs
        if verbose:
            print(header)
        for line in data:
            # csv.reader yields [] for blank lines; there is nothing to extract from them.
            if not line:
                continue
            if verbose:
                print(line)
            # Called record_id (not species) so the module-level `species` list is not shadowed.
            record_id = line[2]
            direction = line[4]
            # The position may be written as a float string such as '39.0'.
            raw_index = int(float(line[3]))
            key = record_id.split('|')[0]
            if key not in records_by_key:
                records_by_key[key] = SeqIO.to_dict(
                    SeqIO.parse("../data/raw/outlier_rm_with_length_" + key + ".fa", "fasta"))
            sequence = records_by_key[key][record_id]
            motif_key = line[6]
            if motif_key not in motif_lengths:
                motif_lengths[motif_key] = getMotifLength(motif_key)
            motif_length = motif_lengths[motif_key]
            string_motif = seqToString(sequence[raw_index:raw_index + motif_length])
            if direction == "negative":
                string_motif = getNegative(string_motif)
            motifs.append(string_motif)
    return motifs

In [11]:
# Extract the motif strings for the VT11048 occurrence table. The rows echoed in the
# output are the csv header and data rows printed by getRawMotifStrings; the final
# list is the function's return value.
getRawMotifStrings('occurance_align_outlier_rm_with_length_VT11048.fa.csv')

['', 'score', 'species', 'raw_position', 'strand', 'align_position', 'motif']
['0', '11.612828254699707', 'VT11048|0|MEMB005D|-|287', '39', 'negative', '60', 'MA0212.1_bcd.jaspar']
['1', '11.612828254699707', 'VT11048|0|MEMB006A|+|286', '38', 'negative', '60', 'MA0212.1_bcd.jaspar']


['TAATCC', 'TAATCC']