In [9]:
from Bio import SeqIO
from Bio import Seq
import pandas as pd
import numpy as np
import csv
import re
import os

In [10]:
def seqToString(motif):
    """
    motif: a Seq that represents the motif 
    
    Returns the String representation of the motif
    """
    i = 0
    string = ""
    length = motif.__len__()
    while i < length:
        string += motif.__getitem__(i)
        i += 1
    return string

In [11]:
def getNegative(pos_seq):
    """
    pos_seq = dna sequence in the positive direction reading from the file
    
    Returns the negative counterpart of the positive sequence.
    """
    dict = {"A":'T','T':'A','G':'C','C':'G','-':'-'}
    negative = ""
    last_index = len(pos_seq) - 1
    while last_index > -1:
        negative += dict[pos_seq[last_index].upper()]
        last_index -= 1
    return negative

In [12]:
def getMotifLength(motif_key):
    f = open("../data/jaspar_fm/modified/" + motif_key, 'r')
    length = 0
    for line in f:
        a = line.split()
        length = len(a)
    return length

In [13]:
species = [11048, 16679, 19895, 40548, 43692, 44110, 48156, 50550, 59000, 60074, 61209, 6436, 6705, 7859, 8646]
member = ['Dkik', 'MEMB002A', 'MEMB002B', 'MEMB002C', 'MEMB002D', 'MEMB002E', 'MEMB002F', 'MEMB003A', 'MEMB003B', 
          'MEMB003C', 'MEMB003D', 'MEMB003E', 'MEMB003F', 'MEMB004A', 'MEMB004B', 'MEMB004E', 'MEMB005D', 'MEMB006B', 
          'MEMB006C', 'MEMB007A', 'MEMB007B', 'MEMB007C', 'MEMB007D', 'MEMB008C']

In [2]:
thresh = pd.read_csv("../data/output/map_motif_bcd_with_threshold/occurance_align_outlier_rm_with_length_VT11048.fa.csv")
no_thresh = pd.read_csv("../data/output/map_motif_bcd_no_threshold/VT11048.fa.csv")

In [48]:
""" Getting the threshold csv in the format we want:
        - drop duplicates of aligned positions
        - drop motif, raw_postition, Unnamed: 0   """ 

thresh = thresh.drop_duplicates(subset = 'align_position')
thresh = thresh.drop(columns = ['motif', 'raw_position', 'Unnamed: 0'])

In [49]:
thresh

Unnamed: 0,score,species,strand,align_position
0,11.612828,VT11048|0|MEMB005D|-|287,negative,60


In [50]:
""" Getting the no threshold csv in the format that we want it. 
        - drop the columns: 'score', 'strand', 'motif', 'Unnamed:0'. """

no_thresh = no_thresh.drop(columns = ['score', 'motif', 'strand', 'Unnamed: 0'])

In [51]:
no_thresh

Unnamed: 0,species,raw_position,align_position
0,VT11048|0|MEMB002A|+|284,0,21
1,VT11048|0|MEMB002A|+|284,0,21
2,VT11048|0|MEMB002A|+|284,1,22
3,VT11048|0|MEMB002A|+|284,1,22
4,VT11048|0|MEMB002A|+|284,2,23
5,VT11048|0|MEMB002A|+|284,2,23
6,VT11048|0|MEMB002A|+|284,3,24
7,VT11048|0|MEMB002A|+|284,3,24
8,VT11048|0|MEMB002A|+|284,4,25
9,VT11048|0|MEMB002A|+|284,4,25


In [89]:
len(result)

24

In [54]:
""" Merging the two dfs to get the corresponding raw_positions for every species
        - merge based on align_position
        - drop ever duplicate of the species_y  """
result = pd.merge(thresh, no_thresh, on = 'align_position')
result = result.drop_duplicates(subset = 'species_y')

In [55]:
result

Unnamed: 0,score,species_x,strand,align_position,species_y,raw_position
0,11.612828,VT11048|0|MEMB005D|-|287,negative,60,VT11048|0|MEMB002A|+|284,39
2,11.612828,VT11048|0|MEMB005D|-|287,negative,60,VT11048|0|MEMB002B|-|284,39
4,11.612828,VT11048|0|MEMB005D|-|287,negative,60,VT11048|0|MEMB002C|-|284,39
6,11.612828,VT11048|0|MEMB005D|-|287,negative,60,VT11048|0|MEMB002D|+|269,35
8,11.612828,VT11048|0|MEMB005D|-|287,negative,60,VT11048|0|MEMB002E|+|288,40
10,11.612828,VT11048|0|MEMB005D|-|287,negative,60,VT11048|0|MEMB002F|+|278,36
12,11.612828,VT11048|0|MEMB005D|-|287,negative,60,VT11048|0|MEMB003A|+|278,39
14,11.612828,VT11048|0|MEMB005D|-|287,negative,60,VT11048|0|MEMB003B|+|289,43
16,11.612828,VT11048|0|MEMB005D|-|287,negative,60,VT11048|0|MEMB003C|-|284,39
18,11.612828,VT11048|0|MEMB005D|-|287,negative,60,VT11048|0|MEMB003D|+|274,35


In [14]:
def get_raw(species, length):
    record_dict = SeqIO.to_dict(SeqIO.parse("../data/raw/outlier_rm_with_length_VT" + str(species) + ".fa", "fasta"))
    print(record_dict)
    seq = []
    before = []
    after = []
    for index, row in result.iterrows():
        spec = row['species_y']
        print(spec)
        pos = row['raw_position']
        print(pos)
        strand = row['strand']
        print(strand)
        seq = record_dict[spec]
        if strand == 'negative':
            sequences.append(getNegative(seqToString(seq[pos:pos + 6])))
            before.append(getNegative(seqToString(seq[pos - length:pos])))
            after.append(getNegative(seqToString(seq[pos + 6:pos + 6 + length])))
        else:
            sequences.append(seqToString(seq[pos:pos + 6]))
            before.append(seqToString(seq[pos - length:pos]))
            after.append(seqToString(seq[pos + 6:pos + 6 + length]))

In [102]:
record_dict = SeqIO.to_dict(SeqIO.parse("../data/raw/outlier_rm_with_length_VT11048.fa", "fasta"))
sequences = []
before = []
after = []
length = 5
for index, row in result.iterrows():
    spec = row['species_y']
    print(spec)
    pos = row['raw_position']
    print(pos)
    strand = row['strand']
    print(strand)
    seq = record_dict[spec]
    if strand == 'negative':
        sequences.append(getNegative(seqToString(seq[pos:pos + 6])))
        before.append(getNegative(seqToString(seq[pos - length:pos])))
        after.append(getNegative(seqToString(seq[pos + 6:pos + 6 + length])))
    else:
        sequences.append(seqToString(seq[pos:pos + 6]))
        before.append(seqToString(seq[pos - length:pos]))
        after.append(seqToString(seq[pos + 6:pos + 6 + length]))
# print(sequences)
# print(len(sequences))
# print(sequences.index('TAAGCC'))
result['raw_seq'] = np.array(sequences)
result['before_seq'] = np.array(before)
result['after_seq'] = np.array(after)
result
result.to_csv("../data/output/full_raw_motif_extraction/11048/11048_final_raw.fa.csv")

VT11048|0|MEMB002A|+|284
39
negative
VT11048|0|MEMB002B|-|284
39
negative
VT11048|0|MEMB002C|-|284
39
negative
VT11048|0|MEMB002D|+|269
35
negative
VT11048|0|MEMB002E|+|288
40
negative
VT11048|0|MEMB002F|+|278
36
negative
VT11048|0|MEMB003A|+|278
39
negative
VT11048|0|MEMB003B|+|289
43
negative
VT11048|0|MEMB003C|-|284
39
negative
VT11048|0|MEMB003D|+|274
35
negative
VT11048|0|MEMB003F|-|285
39
negative
VT11048|0|MEMB004A|+|284
39
negative
VT11048|0|MEMB004B|+|285
39
negative
VT11048|0|MEMB004E|-|285
39
negative
VT11048|0|MEMB005B|+|298
60
negative
VT11048|0|MEMB005D|-|287
39
negative
VT11048|0|MEMB006A|+|286
38
negative
VT11048|0|MEMB006B|-|275
38
negative
VT11048|0|MEMB006C|-|285
39
negative
VT11048|0|MEMB007B|-|284
39
negative
VT11048|0|MEMB007C|+|283
38
negative
VT11048|0|MEMB007D|-|274
38
negative
VT11048|0|MEMB008C|-|285
39
negative
VT11048|0|dkik|-|290
39
negative


In [6]:
thresh2 = pd.read_csv("../data/output/map_motif_bcd_with_threshold/occurance_align_outlier_rm_with_length_VT11048.fa.csv")
thresh2 = thresh2.aggregate('align_position')




In [7]:
thresh2

0    60
1    60
Name: align_position, dtype: int64

In [17]:
get_raw(16679, 5)

{'VT16679|0|MEMB002A|-|1805': SeqRecord(seq=Seq('CCGGCCTGGAAAGGGCGAAGGTTCGAAATGAAATGGGATTTCCGCAGGATGTTC...GAC', SingleLetterAlphabet()), id='VT16679|0|MEMB002A|-|1805', name='VT16679|0|MEMB002A|-|1805', description='VT16679|0|MEMB002A|-|1805', dbxrefs=[]), 'VT16679|0|MEMB002D|-|1976': SeqRecord(seq=Seq('CCGGCCTGCAGAATCGGAAATGGAATTTCCGCAGGATGTTCGATTGGTGGCATT...GAC', SingleLetterAlphabet()), id='VT16679|0|MEMB002D|-|1976', name='VT16679|0|MEMB002D|-|1976', description='VT16679|0|MEMB002D|-|1976', dbxrefs=[]), 'VT16679|0|MEMB002E|-|1989': SeqRecord(seq=Seq('CCGGCCTGCAGAATGAGGGGAAATGGGAAATGGAATTTCCGCAGGATGTTCGAT...GAC', SingleLetterAlphabet()), id='VT16679|0|MEMB002E|-|1989', name='VT16679|0|MEMB002E|-|1989', description='VT16679|0|MEMB002E|-|1989', dbxrefs=[]), 'VT16679|0|MEMB002F|-|1741': SeqRecord(seq=Seq('CCGGCCTGTGGAAAGGTCGAAATGTAATGGAAATTCGCAGGATGttcgattggt...GAC', SingleLetterAlphabet()), id='VT16679|0|MEMB002F|-|1741', name='VT16679|0|MEMB002F|-|1741', description='VT16679|0|MEMB00

NameError: name 'result' is not defined

In [19]:
#def raw_positions(name, thresh_file, length):
for file in os.listdir('../data/output/map_motif_bcd_with_threshold'):
    thresh = pd.read_csv(file)
    num = file.split('length_', 1)[1]
    no_thresh = pd.read_csv("../data/output/map_motif_bcd_no_threshold/"+str(num)+".fa.csv")
    print(num)

FileNotFoundError: [Errno 2] File b'occurance_align_outlier_rm_with_length_VT7859.fa.csv' does not exist: b'occurance_align_outlier_rm_with_length_VT7859.fa.csv'

In [28]:
def raw_string(spec, length):
    thresh = pd.read_csv("../data/output/map_motif_bcd_with_threshold/occurance_align_outlier_rm_with_length_VT"+str(spec)+".fa.csv")
    no_thresh = pd.read_csv("../data/output/map_motif_bcd_no_threshold/VT"+str(spec)+".fa.csv")
    thresh = thresh.drop_duplicates(subset = 'align_position')
    thresh = thresh.drop(columns = ['motif', 'raw_position', 'Unnamed: 0'])
    no_thresh = no_thresh.drop(columns = ['score', 'motif', 'strand', 'Unnamed: 0'])
    result = pd.merge(thresh, no_thresh, on = 'align_position')
    result = result.drop_duplicates(subset = 'species_y')
    record_dict = SeqIO.to_dict(SeqIO.parse("../data/raw/outlier_rm_with_length_VT11048.fa", "fasta"))
    sequences = []
    before = []
    after = []
    length = length
    for index, row in result.iterrows():
        spec = row['species_y']
        #print(spec)
        pos = row['raw_position']
        #print(pos)
        strand = row['strand']
        #print(strand)
        seq = record_dict[spec]
        if strand == 'negative':
            sequences.append(getNegative(seqToString(seq[pos:pos + 6])))
            before.append(getNegative(seqToString(seq[pos - length:pos])))
            after.append(getNegative(seqToString(seq[pos + 6:pos + 6 + length])))
        else:
            sequences.append(seqToString(seq[pos:pos + 6]))
            before.append(seqToString(seq[pos - length:pos]))
            after.append(seqToString(seq[pos + 6:pos + 6 + length]))
    # print(sequences)
    # print(len(sequences))
    # print(sequences.index('TAAGCC'))
    result['raw_seq'] = np.array(sequences)
    result['before_seq'] = np.array(before)
    result['after_seq'] = np.array(after)
    result
    if os.path.exists("../data/output/full_raw_motif_extraction/"+str(spec)) == False:
        os.mkdir("../data/output/full_raw_motif_extraction/"+str(spec))
    result.to_csv("../data/output/full_raw_motif_extraction/"+str(spec)+"/"+str(spec)+"_final_raw.fa.csv")

In [29]:
s = 11048
raw_string(s, 5)

VT11048|0|MEMB002A|+|284
39
negative
VT11048|0|MEMB002B|-|284
39
negative
VT11048|0|MEMB002C|-|284
39
negative
VT11048|0|MEMB002D|+|269
35
negative
VT11048|0|MEMB002E|+|288
40
negative
VT11048|0|MEMB002F|+|278
36
negative
VT11048|0|MEMB003A|+|278
39
negative
VT11048|0|MEMB003B|+|289
43
negative
VT11048|0|MEMB003C|-|284
39
negative
VT11048|0|MEMB003D|+|274
35
negative
VT11048|0|MEMB003F|-|285
39
negative
VT11048|0|MEMB004A|+|284
39
negative
VT11048|0|MEMB004B|+|285
39
negative
VT11048|0|MEMB004E|-|285
39
negative
VT11048|0|MEMB005B|+|298
60
negative
VT11048|0|MEMB005D|-|287
39
negative
VT11048|0|MEMB006A|+|286
38
negative
VT11048|0|MEMB006B|-|275
38
negative
VT11048|0|MEMB006C|-|285
39
negative
VT11048|0|MEMB007B|-|284
39
negative
VT11048|0|MEMB007C|+|283
38
negative
VT11048|0|MEMB007D|-|274
38
negative
VT11048|0|MEMB008C|-|285
39
negative
VT11048|0|dkik|-|290
39
negative
