## Goals:
    1. Take in DF's from 'filtered' (using pandas)
    2. Read first row, get first item in species column
    3. Use species[1] to look up sequence in fasta file
    4. Use the first row's raw position # to get the nucleotides from that position to position + n (n = length of motif)
    5. Put this nucleotide sequence into new column

TO-DO:
1. read in motif file and get motif length
2. make alignment file method for raw file
3. when reading file, need to account for negative/positive
4. make alignment file method --> need to account for spaces???

Account for negative strand reading GGATTA
                                    CCTAAT
                                    
-bcd:
- GGATTA positive
- TAATCC negative

In [18]:
from Bio import SeqIO
from Bio import Seq
import pandas as pd
import numpy as np
import csv
import re
import os

In [19]:
def seqToString(motif):
    """
    motif: a Seq that represents the motif (any sized object that can be
           indexed by integer position and yields strings works)

    Returns the String representation of the motif, built by joining the
    item at every position from 0 to len(motif) - 1 in order.
    """
    return "".join(motif[position] for position in range(len(motif)))

In [20]:
def getMotifLength(motif_key):
    """
    motif_key: file name of a motif matrix inside ../data/jaspar_fm/modified/

    Returns the number of whitespace-separated fields on the last non-blank
    line of the file; callers use this value as the motif length.
    Returns 0 when the file contains no non-blank line.
    """
    length = 0
    # The context manager closes the handle even if reading fails.
    with open("../data/jaspar_fm/modified/" + motif_key, 'r') as f:
        for line in f:
            fields = line.split()
            # Skip blank lines so a trailing empty line cannot reset the
            # length to 0.
            if fields:
                length = len(fields)
    return length

In [21]:
# Check the motif length reported for the hb matrix file.
getMotifLength("MA0049.1_hb.jaspar")

10

In [22]:
def getNegative(pos_seq):
    """
    pos_seq = dna sequence in the positive direction reading from the file

    Returns the negative counterpart of the positive sequence: the reverse
    complement, always in upper case. Alignment gaps ('-') and the ambiguous
    base 'N' map to themselves.

    Raises KeyError if the sequence contains any other character.
    """
    # Named `complement` rather than `dict` so the builtin is not shadowed.
    complement = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G', '-': '-', 'N': 'N'}
    # Walk the sequence back to front and complement each base.
    return "".join(complement[base.upper()] for base in reversed(pos_seq))

In [23]:
def getRawMotifStrings(csv_file):
    """
    csv_file: name of a csv file in ../data/output/bcd/ that contains the
              'filtered' data frame

    Skips the header row, then for every data row reads the species ID
    (column 2), raw position (column 3), strand direction (column 4) and
    motif file name (column 6). The species is looked up in the raw fasta
    file named after its key (the text before the first '|') and the motif
    is sliced out starting at the raw position. Rows whose direction is
    "negative" are converted with getNegative.

    Returns a list of motif strings, one per data row, in file order
    (an empty list when the file has no data rows).
    """
    # Parsing a fasta file and reading a motif file are expensive and give
    # the same answer for every row that shares a key, so do each once.
    records_by_key = {}
    length_by_motif = {}
    motifs = []
    # newline='' is what the csv module expects for files it reads.
    with open("../data/output/bcd/" + csv_file, 'r', newline='') as f:
        data = csv.reader(f)
        next(data, None)  # discard the header row, if there is one
        for line in data:
            species = line[2]
            direction = line[4]
            # Positions are stored as floats such as "39.0".
            raw_index = int(float(line[3]))
            key = species.split('|')[0]
            if key not in records_by_key:
                records_by_key[key] = SeqIO.to_dict(
                    SeqIO.parse("../data/raw/outlier_rm_with_length_" + key + ".fa", "fasta"))
            sequence = records_by_key[key][species]
            motif_key = line[6]
            if motif_key not in length_by_motif:
                length_by_motif[motif_key] = getMotifLength(motif_key)
            motif_length = length_by_motif[motif_key]
            motif = sequence[raw_index:raw_index + motif_length]
            string_motif = seqToString(motif)
            if direction == "negative":
                string_motif = getNegative(string_motif)
            motifs.append(string_motif)
    return motifs

In [24]:
def getAlignMotifStrings(csv_file):
    """
    csv_file: name of a csv file in ../data/output/bcd/ that contains the
              'filtered' data frame

    Skips the header row, then for every data row reads the species ID
    (column 2), strand direction (column 4), alignment position (column 5)
    and motif file name (column 6). The species is looked up in the
    alignment fasta file named after its key (the text before the first
    '|') and the motif is sliced out starting at the alignment position.
    Rows whose direction is "negative" are converted with getNegative.

    Returns a list of motif strings, one per data row, in file order
    (an empty list when the file has no data rows).
    """
    # Parsing a fasta file and reading a motif file are expensive and give
    # the same answer for every row that shares a key, so do each once.
    records_by_key = {}
    length_by_motif = {}
    motifs = []
    # newline='' is what the csv module expects for files it reads.
    with open("../data/output/bcd/" + csv_file, 'r', newline='') as f:
        data = csv.reader(f)
        next(data, None)  # discard the header row, if there is one
        for line in data:
            species = line[2]
            direction = line[4]
            # Positions are stored as floats such as "60.0".
            align_index = int(float(line[5]))
            key = species.split('|')[0]
            if key not in records_by_key:
                records_by_key[key] = SeqIO.to_dict(
                    SeqIO.parse("../data/alignments/align_outlier_rm_with_length_" + key + ".fa", "fasta"))
            sequence = records_by_key[key][species]
            motif_key = line[6]
            if motif_key not in length_by_motif:
                length_by_motif[motif_key] = getMotifLength(motif_key)
            motif_length = length_by_motif[motif_key]
            motif = sequence[align_index:align_index + motif_length]
            string_motif = seqToString(motif)
            if direction == "negative":
                string_motif = getNegative(string_motif)
            motifs.append(string_motif)
    return motifs

In [25]:
# Manual trial: extract the aligned motif strings for one output file and
# store them as a single-column frame named after the file.
motif = getAlignMotifStrings("occurance_align_outlier_rm_with_length_VT6436.fa.csv")
df = pd.DataFrame()
df["VT6436.fa.csv"] = motif
# print(df)
# Same again for a second output file.
motif2 = getAlignMotifStrings("occurance_align_outlier_rm_with_length_VT6705.fa.csv")
df1 = pd.DataFrame()
df1["VT6705.fa.csv"] = motif2
# print(df1)
# Put the two columns side by side (axis=1), keeping their column names.
df = pd.concat([df,df1], ignore_index=False, axis = 1)
#print(df)

In [26]:
# Scratch check: a freshly constructed DataFrame reports .empty as True.
# Note that this rebinds the notebook-level name `df`.
df = pd.DataFrame()
if df.empty:
    print("empty")

empty


In [27]:
def alignmentsDataFrame():
    """
    Builds one table of aligned motif strings with a column per csv file in
    ../data/output/bcd/, writes it to
    ../data/output/2.motif_extraction_output/align/bcd_motif.fa.csv and
    prints it.

    Each column is named after its file with the leading
    "occurance_align_outlier_rm_with_length_" removed; a file without that
    prefix keeps its full name. Files are processed in sorted name order so
    the column order is the same on every run, and directory entries that
    do not end in ".csv" are ignored. Columns of different lengths are
    aligned on the row index, so shorter ones are padded with NaN.

    Returns None.
    """
    # Replaces the magic slice [39:], which only worked because this prefix
    # happens to be 39 characters long.
    prefix = "occurance_align_outlier_rm_with_length_"
    columns = []
    for file in sorted(os.listdir("../data/output/bcd/")):
        if not file.endswith(".csv"):
            continue
        column_name = file[len(prefix):] if file.startswith(prefix) else file
        columns.append(pd.Series(getAlignMotifStrings(file), name=column_name))
    # pd.concat refuses an empty list, so fall back to an empty frame.
    df = pd.concat(columns, ignore_index=False, axis=1) if columns else pd.DataFrame()
    df.to_csv("../data/output/2.motif_extraction_output/align/bcd_motif.fa.csv")
    print(df)

In [28]:
# Build, save and print the combined table of aligned motifs for the files in ../data/output/bcd/.
alignmentsDataFrame()

    VT11048.fa.csv VT16679.fa.csv VT19895.fa.csv VT40548.fa.csv  \
0           TAATCC         TTATCC         TAATCC         TAATCC   
1           TAATCC         -----C         TT-ATC         TTATCC   
2              NaN         TAATCC         TAATCC         TTATCC   
3              NaN         TTATCC         TT-ATC         TAATCC   
4              NaN         T-----         TAATCC         TTATCC   
5              NaN         -----C         TAATCC         TAATCC   
6              NaN         TAATCC         TAATCC         TAATCC   
7              NaN         TTATCC         TAATCC         TTATCC   
8              NaN         TTATCC         A--TCC         TAATCC   
9              NaN         TTATCC         TTATCC         TAATCC   
10             NaN         -----C         TAATCC         TAATCC   
11             NaN         TAATCC         A--TCC         TAATCC   
12             NaN         TTATCC         TAATCC         TTA---   
13             NaN         TTATCC         TAATCC         TAATC

In [29]:
def csvToDataFrame(csv_file):
    """
    csv_file: name of a csv file located in ../data/output/bcd/

    Returns the contents of that file loaded as a pandas DataFrame.
    """
    csv_path = "../data/output/bcd/" + csv_file
    return pd.read_csv(csv_path)

In [30]:
def csvToArray(csv_file):
    """
    csv_file: path of the csv file to read

    Returns the file as a list of rows (each row a list of strings). The
    first row is the file's header with 'motif sequence' appended as an
    extra label; the remaining rows are returned unchanged. An empty file
    yields a single header row containing only 'motif sequence'.
    """
    # The context manager closes the handle even if parsing fails;
    # newline='' is what the csv module expects for files it reads.
    with open(csv_file, 'r', newline='') as f:
        data = csv.reader(f)
        # Default to an empty header so an empty file does not leak
        # StopIteration to the caller.
        labels = next(data, [])
        labels.append('motif sequence')
        array = [labels]
        array.extend(data)
    return array

In [35]:
def addRawMotifStrings(csv_file):
    """
    csv_file: a csv file that contains the 'filtered' data frame

    Extracts the raw motif sequences for the file, loads the file as a
    data frame and attaches the sequences as the column
    'raw motif sequences'. The result is written to
    ../data/output/2.motif_extraction_output/raw/ under the name
    "motif+" followed by csv_file.

    Returns the data frame with the added column.
    """
    sequences = getRawMotifStrings(csv_file)
    frame = csvToDataFrame(csv_file)
    frame['raw motif sequences'] = sequences
    output_path = "../data/output/2.motif_extraction_output/raw/motif+" + csv_file
    frame.to_csv(output_path)
    return frame

In [36]:
# Try the raw-motif extraction on a single file; the returned frame is kept in `d` for inspection.
d = addRawMotifStrings("occurance_align_outlier_rm_with_length_VT11048.fa.csv")

In [37]:
d

Unnamed: 0.1,Unnamed: 0,score,species,raw_position,strand,align_position,motif,raw motif sequences
0,0,11.612828,VT11048|0|MEMB005D|-|287,39,negative,60,MA0212.1_bcd.jaspar,TAATCC
1,1,11.612828,VT11048|0|MEMB006A|+|286,38,negative,60,MA0212.1_bcd.jaspar,TAATCC


In [179]:
# Run the raw-motif extraction for every entry in ../data/output/bcd/.
# The returned frames are discarded.
csvList = os.listdir("../data/output/bcd/")
for file in csvList:
    addRawMotifStrings(file)