In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import sklearn
import math
%matplotlib inline
import matplotlib as plt

In [3]:
# Positive examples: a headerless, tab-separated BED-style file, so the six
# column names are supplied explicitly.
positive = pd.read_csv(
    "data_new/g_quad_AAAA_unique_noSSE2.bed",
    sep="\t",
    names=["chr", "start", "end", "seq", "col", "strand"],
)
positive.head()

Unnamed: 0,chr,start,end,seq,col,strand
0,chr1,835046,835069,CCCTCTCCCTTGCCTCCCTCCCC,1111,-
1,chr1,876250,876281,CCCACACTCGCCCACACTCCCCCACACTCCC,1111,-
2,chr1,902899,902925,GGGGACGAGGGGGCCCGGGATGCGGG,1111,+
3,chr1,904022,904047,CCCCTGCAGACCCTGTGCCCAGCCC,1111,-
4,chr1,904114,904140,GGGAGGCCTGGGGCGGAGGGCCGGGG,1111,+


In [4]:
wrong_index = []

def test_data(df):
    def test(row):
        if row["end"] - row["start"] != len(row["seq"]):
            wrong_index.append(row.name)
            
    df.apply(lambda row: test)

In [5]:
# Run the start/end vs. sequence-length check on the positive set; mismatching
# rows are meant to end up in wrong_index.
# NOTE(review): a count of 0 is only meaningful if test_data really invokes its
# per-row check - confirm.
test_data(positive)

In [8]:
# Number of rows flagged so far (wrong_index is a single module-level list).
len(wrong_index)

0

In [9]:
# Negative examples: same headerless, tab-separated layout as the positive
# set, so the same six column names are supplied.
negative = pd.read_csv(
    "data_new/g_quad_NovaSeq_SSEs_50bp_clean2.bed",
    sep="\t",
    names=["chr", "start", "end", "seq", "col", "strand"],
)
negative.head()

Unnamed: 0,chr,start,end,seq,col,strand
0,chr1,955891,955916,CCCTACCCCCCTTCACCCCCTCCCC,1111,-
1,chr1,984237,984268,GGGCCTGGGGGGCAAGTCGGGGGGCGGGGGG,1111,+
2,chr1,984273,984299,GGGCAGGGTCCCCTGGGAGGATGGGG,1111,+
3,chr1,1035169,1035192,GGGGGGGGGGGGGTGGGCAGGGG,1111,+
4,chr1,1035208,1035229,GGGCAAAGGGATGGGACAGGG,1111,+


In [10]:
# Run the start/end vs. sequence-length check on the negative set; mismatching
# rows are meant to end up in wrong_index.
# NOTE(review): a count of 0 is only meaningful if test_data really invokes its
# per-row check - confirm.
test_data(negative)

In [11]:
# wrong_index is never reset between test_data calls, so this count covers
# every frame checked so far, not only `negative`.
len(wrong_index)

0

In [12]:
# One-hot table: each of the letters A, C, G, T, N maps to a 5-element 0/1 list
# with the single 1 at that letter's position in "ACGTN".
bases = {
    letter: [1 if slot == position else 0 for slot in range(5)]
    for position, letter in enumerate("ACGTN")
}
bases


{'A': [1, 0, 0, 0, 0],
 'C': [0, 1, 0, 0, 0],
 'G': [0, 0, 1, 0, 0],
 'T': [0, 0, 0, 1, 0],
 'N': [0, 0, 0, 0, 1]}

In [13]:
def transform(df):
    """One-hot encode every sequence in ``df["seq_new"]``.

    Each character is looked up in the module-level ``bases`` table; a
    character missing from that table raises ``KeyError``.

    Args:
        df: DataFrame with a ``seq_new`` column of strings.

    Returns:
        A list with one entry per row, each a list of the per-character
        vectors taken from ``bases``.
    """
    return [
        [bases[letter] for letter in sequence]
        for sequence in df["seq_new"]
    ]

In [14]:
def get_window_start(row, flank=200):
    """Return the left edge of a window centred on the row's motif.

    The edge lies ``flank`` positions before the motif midpoint, where the
    midpoint is ``start + len(seq) // 2``.

    Args:
        row: mapping/Series with ``start`` (int) and ``seq`` (str).
        flank: half-width of the window. The default of 200 yields the
            400-base windows that check_data_consistent expects.

    Returns:
        ``row["start"] - flank + len(row["seq"]) // 2``.
    """
    half_length = len(row["seq"]) // 2
    return row["start"] - flank + half_length


def get_window_end(row, flank=200):
    """Return the right edge of a window centred on the row's motif.

    Counterpart of get_window_start: the half-length is rounded *up* here, so
    that ``end_new - start_new == 2 * flank`` whenever
    ``end - start == len(seq)``.

    Args:
        row: mapping/Series with ``end`` (int) and ``seq`` (str).
        flank: half-width of the window (default 200).

    Returns:
        ``row["end"] + flank - ceil(len(row["seq"]) / 2)`` as a Python int.
    """
    length = len(row["seq"])
    # (length + 1) // 2 is ceil(length / 2) without going through floats.
    return int(row["end"] + flank - (length + 1) // 2)


def get_ranges(df, flank=200):
    """Add ``start_new`` / ``end_new`` window columns to ``df``.

    The frame is modified in place and also returned, so the returned object
    is the same DataFrame that was passed in.

    Args:
        df: DataFrame with ``start``, ``end`` and ``seq`` columns.
        flank: half-width of the window, forwarded to both edge helpers.

    Returns:
        ``df`` itself, with the two new columns.
    """
    # DataFrame.apply forwards extra keyword arguments to the row function.
    df["start_new"] = df.apply(get_window_start, axis=1, flank=flank)
    df["end_new"] = df.apply(get_window_end, axis=1, flank=flank)

    return df


In [15]:
# get_ranges adds start_new/end_new to the frame it receives and returns that
# same object, so this name and `positive` refer to one DataFrame.
positive_good_data_with_ranges = get_ranges(positive)
positive_good_data_with_ranges.head()

Unnamed: 0,chr,start,end,seq,col,strand,start_new,end_new
0,chr1,835046,835069,CCCTCTCCCTTGCCTCCCTCCCC,1111,-,834857,835257
1,chr1,876250,876281,CCCACACTCGCCCACACTCCCCCACACTCCC,1111,-,876065,876465
2,chr1,902899,902925,GGGGACGAGGGGGCCCGGGATGCGGG,1111,+,902712,903112
3,chr1,904022,904047,CCCCTGCAGACCCTGTGCCCAGCCC,1111,-,903834,904234
4,chr1,904114,904140,GGGAGGCCTGGGGCGGAGGGCCGGGG,1111,+,903927,904327


In [16]:
# get_ranges adds start_new/end_new to the frame it receives and returns that
# same object, so this name and `negative` refer to one DataFrame.
negative_good_data_with_ranges = get_ranges(negative)
negative_good_data_with_ranges.head()


Unnamed: 0,chr,start,end,seq,col,strand,start_new,end_new
0,chr1,955891,955916,CCCTACCCCCCTTCACCCCCTCCCC,1111,-,955703,956103
1,chr1,984237,984268,GGGCCTGGGGGGCAAGTCGGGGGGCGGGGGG,1111,+,984052,984452
2,chr1,984273,984299,GGGCAGGGTCCCCTGGGAGGATGGGG,1111,+,984086,984486
3,chr1,1035169,1035192,GGGGGGGGGGGGGTGGGCAGGGG,1111,+,1034980,1035380
4,chr1,1035208,1035229,GGGCAAAGGGATGGGACAGGG,1111,+,1035018,1035418


In [17]:
def check_data_consistent(df, window_size=400):
    """Report rows whose window does not span exactly ``window_size`` positions.

    Prints ``"<index label> is not ok"`` for each row where
    ``end_new - start_new`` differs from ``window_size``, followed by a single
    summary line: ``"all good"`` if no row was reported, otherwise
    ``"wrong data"``.

    Args:
        df: DataFrame with numeric ``start_new`` and ``end_new`` columns.
        window_size: required window length (default 400).

    Returns:
        None. The result is reported on stdout only.
    """
    window_length = df["end_new"] - df["start_new"]
    # Whole-column comparison: no per-row callback, so an empty frame simply
    # yields no offending rows.
    offending = df.index[(window_length != window_size).values]

    for label in offending:
        print("{} is not ok".format(label))

    if len(offending) == 0:
        print("all good")
    else:
        print("wrong data")


In [18]:
# Confirm every negative window (end_new - start_new) spans exactly 400 positions.
check_data_consistent(negative_good_data_with_ranges)


all good


In [19]:
# Confirm every positive window (end_new - start_new) spans exactly 400 positions.
check_data_consistent(positive_good_data_with_ranges)


all good


In [20]:
from Bio.Seq import Seq
from Bio import SeqIO


# Parse the FASTA file and materialise every record into a list, so all
# sequences are held in memory and can be indexed by position.
records = list(SeqIO.parse("./data/genome.fa", "fasta"))
len(records)



2580

In [21]:
def add_new_seq(df):
    """Attach each row's window sequence as a ``seq_new`` column.

    For every row the slice ``[start_new:end_new]`` is taken from the FASTA
    record whose ``id`` equals the row's ``chr`` value and stored as a plain
    string. Records come from the module-level ``records`` list. The frame is
    modified in place and also returned.

    Args:
        df: DataFrame with ``chr``, ``start_new`` and ``end_new`` columns.

    Returns:
        ``df`` itself, with the new ``seq_new`` column.

    Raises:
        KeyError: if a row names a chromosome that has no record with a
            matching id.
    """
    # Index the records by id once so each row is cut from its own chromosome
    # rather than always from the first record in the file.
    # NOTE(review): assumes the FASTA ids use the same naming as the `chr`
    # column (e.g. "chr1") - confirm against the genome file.
    records_by_id = {record.id: record for record in records}

    def new_seq(row):
        chrom = row["chr"]
        try:
            record = records_by_id[chrom]
        except KeyError:
            raise KeyError(
                "no FASTA record with id {!r}".format(chrom)
            ) from None
        return str(record.seq[row["start_new"]: row["end_new"]])

    df["seq_new"] = df.apply(new_seq, axis=1)

    return df


In [22]:
# add_new_seq writes a seq_new column into the frame it receives and returns
# that same frame, so negative_with_seq aliases negative_good_data_with_ranges.
negative_with_seq = add_new_seq(negative_good_data_with_ranges)
negative_with_seq.head()

Unnamed: 0,chr,start,end,seq,col,strand,start_new,end_new,seq_new
0,chr1,955891,955916,CCCTACCCCCCTTCACCCCCTCCCC,1111,-,955703,956103,TCCCTTTTCAGGAAGGAAAGAAGGTGGGGCCGCTCCAACTGGCCCC...
1,chr1,984237,984268,GGGCCTGGGGGGCAAGTCGGGGGGCGGGGGG,1111,+,984052,984452,TGGAGTTGGCCTGAGGCTTCAGGGGAAGCCCTTCCCTGTATCCAGC...
2,chr1,984273,984299,GGGCAGGGTCCCCTGGGAGGATGGGG,1111,+,984086,984486,CCTGTATCCAGCCCAGTCATGACCCTTCCTGGTGGGAGGGTGGCTG...
3,chr1,1035169,1035192,GGGGGGGGGGGGGTGGGCAGGGG,1111,+,1034980,1035380,GCGGTGGACTCTTCCAGGGAAGGGGGTCCTGCCTGCACCCCTGTGG...
4,chr1,1035208,1035229,GGGCAAAGGGATGGGACAGGG,1111,+,1035018,1035418,CCCTGTGGCTGGGGCCCCATCTGACAGGGGTCAGGCCATGACTATT...


In [23]:
# add_new_seq writes a seq_new column into the frame it receives and returns
# that same frame, so positive_with_seq aliases positive_good_data_with_ranges.
positive_with_seq = add_new_seq(positive_good_data_with_ranges)
positive_with_seq.head()

Unnamed: 0,chr,start,end,seq,col,strand,start_new,end_new,seq_new
0,chr1,835046,835069,CCCTCTCCCTTGCCTCCCTCCCC,1111,-,834857,835257,ATAACTAGATATACGAGTTTAAATAATTTTGTCAGAAACTGTTTCT...
1,chr1,876250,876281,CCCACACTCGCCCACACTCCCCCACACTCCC,1111,-,876065,876465,CCCCACACTCCCCCATACTCGCCCACACTCCCCCACACCCCACACT...
2,chr1,902899,902925,GGGGACGAGGGGGCCCGGGATGCGGG,1111,+,902712,903112,GCAGTGAGCCGAGATCGCGCCATTGCACCCAGCCTGGGCAACCAGA...
3,chr1,904022,904047,CCCCTGCAGACCCTGTGCCCAGCCC,1111,-,903834,904234,GGCCCTGAGGCTGGGCAAGGCTGTCCACCCCGCTGTCAGAACCCCA...
4,chr1,904114,904140,GGGAGGCCTGGGGCGGAGGGCCGGGG,1111,+,903927,904327,CCAGCGGGCACAAGGTTGGGGCAGCTCTGTTCCCAGCAGGCCGAGC...


In [24]:
# Persist both annotated frames as CSV. index=False is not passed, so pandas'
# default of writing the row index as a leading unnamed column applies.
negative_with_seq.to_csv("./data_new/negative_examples.csv")
positive_with_seq.to_csv("./data_new/positive_examples.csv")