In [39]:
import pandas as pd
import numpy as np
import tensorflow as tf
import keras
import sklearn
import intervaltree
import math


In [2]:
# A motif is four runs of three G's (GGG), with 1-7 nucleotides between consecutive runs; those spacer nucleotides can be G too.
# The minus-strand motifs appear as runs of C. The -/+ column is the strand; + is the strand.
# The BED file contains only the motif itself. With deep learning we can also look further upstream and downstream.

In [3]:
# Map each nucleotide letter to its position in "ACGT".
bases = {letter: position for position, letter in enumerate("ACGT")}
bases

{'A': 0, 'C': 1, 'G': 2, 'T': 3}

In [4]:
# Read the headerless, tab-separated BED file into a frame with named columns.
positive = pd.read_csv(
    "data/g_quad_AAAA_unique_noSSE.bed",
    sep="\t",
    header=None,
    names=["chr", "start", "end", "seq", "col", "strand"],
)
positive.head()

Unnamed: 0,chr,start,end,seq,col,strand
0,chr1,790094,790096,GGGATGGGATGGGATGGGATGGGATGGG,1111,+
1,chr1,790097,790113,GGGATGGGATGGGATGGGATGGGATGGG,1111,+
2,chr1,834091,834096,GGGGTTTGGGGGCTGGGGCCTGGGAGGG,1111,+
3,chr1,835046,835069,CCCTCTCCCTTGCCTCCCTCCCC,1111,-
4,chr1,844622,844631,GGGTCTGCGGGGAGTAGGGTGGGG,1111,+


In [5]:
# Read the headerless, tab-separated BED file into a frame with named columns.
negative = pd.read_csv(
    "data/g_quad_NovaSeq_SSEs_50bp_clean.bed",
    sep="\t",
    header=None,
    names=["chr", "start", "end", "seq", "col", "strand"],
)
negative.head()


Unnamed: 0,chr,start,end,seq,col,strand
0,chr1,904296,904299,GGGTGGGGGGGTGGGGGGCGGCATCGGG,1111,+
1,chr1,937401,937411,CCCCCCCCCCCACCC,1111,-
2,chr1,955891,955916,CCCTACCCCCCTTCACCCCCTCCCC,1111,-
3,chr1,984237,984268,GGGCCTGGGGGGCAAGTCGGGGGGCGGGGGG,1111,+
4,chr1,984273,984299,GGGCAGGGTCCCCTGGGAGGATGGGG,1111,+


In [31]:
def verify_data(df):
    """Split row labels by whether the interval length matches the sequence length.

    A row is "good" when ``end - start`` equals ``len(seq)`` and "wrong"
    otherwise (a row whose ``seq`` is missing counts as wrong).

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``start``, ``end`` and ``seq`` columns.

    Returns
    -------
    (list, list)
        ``(wrong_data, good_data)``: index labels of the mismatching and of
        the matching rows, each in the frame's row order.
    """
    interval_len = df["end"] - df["start"]
    seq_len = df["seq"].str.len()
    # Compare by value. The previous ``is not`` tested object identity, which
    # only agrees with equality for the small integers CPython caches, so
    # longer intervals were always reported as wrong.
    mismatch = (interval_len != seq_len).astype(bool).values
    wrong_data = df.index[mismatch].tolist()
    good_data = df.index[~mismatch].tolist()
    return wrong_data, good_data


In [32]:
# (number of wrong rows, number of good rows), computed with a single verify_data pass.
tuple(map(len, verify_data(negative)))


Unnamed: 0,chr,start,end,seq,col,strand
3,chr1,835046,835069,CCCTCTCCCTTGCCTCCCTCCCC,1111,-
11,chr1,876250,876281,CCCACACTCGCCCACACTCCCCCACACTCCC,1111,-
17,chr1,902899,902925,GGGGACGAGGGGGCCCGGGATGCGGG,1111,+
18,chr1,904022,904047,CCCCTGCAGACCCTGTGCCCAGCCC,1111,-
19,chr1,904114,904140,GGGAGGCCTGGGGCGGAGGGCCGGGG,1111,+


In [34]:
# Drop the rows that verify_data reports as wrong, then preview what is left.
negative_good_data = negative.drop(index=verify_data(negative)[0])
negative_good_data.head()


Unnamed: 0,chr,start,end,seq,col,strand
3,chr1,835046,835069,CCCTCTCCCTTGCCTCCCTCCCC,1111,-
11,chr1,876250,876281,CCCACACTCGCCCACACTCCCCCACACTCCC,1111,-
17,chr1,902899,902925,GGGGACGAGGGGGCCCGGGATGCGGG,1111,+
18,chr1,904022,904047,CCCCTGCAGACCCTGTGCCCAGCCC,1111,-
19,chr1,904114,904140,GGGAGGCCTGGGGCGGAGGGCCGGGG,1111,+


In [35]:
# (number of wrong rows, number of good rows), computed with a single verify_data pass.
tuple(map(len, verify_data(positive)))



Unnamed: 0,chr,start,end,seq,col,strand
3,chr1,835046,835069,CCCTCTCCCTTGCCTCCCTCCCC,1111,-
11,chr1,876250,876281,CCCACACTCGCCCACACTCCCCCACACTCCC,1111,-
17,chr1,902899,902925,GGGGACGAGGGGGCCCGGGATGCGGG,1111,+
18,chr1,904022,904047,CCCCTGCAGACCCTGTGCCCAGCCC,1111,-
19,chr1,904114,904140,GGGAGGCCTGGGGCGGAGGGCCGGGG,1111,+


In [36]:

# Drop the rows that verify_data reports as wrong, then preview what is left.
positive_good_data = positive.drop(index=verify_data(positive)[0])
positive_good_data.head()


Unnamed: 0,chr,start,end,seq,col,strand
3,chr1,835046,835069,CCCTCTCCCTTGCCTCCCTCCCC,1111,-
11,chr1,876250,876281,CCCACACTCGCCCACACTCCCCCACACTCCC,1111,-
17,chr1,902899,902925,GGGGACGAGGGGGCCCGGGATGCGGG,1111,+
18,chr1,904022,904047,CCCCTGCAGACCCTGTGCCCAGCCC,1111,-
19,chr1,904114,904140,GGGAGGCCTGGGGCGGAGGGCCGGGG,1111,+


In [55]:
def get_window_start(row):
    """Return the left edge of the window for one row.

    The edge is ``start - 200`` shifted right by half the length of ``seq``
    (rounded down).

    Parameters
    ----------
    row : mapping
        Must provide ``"seq"`` (a sized sequence) and ``"start"`` (a number).
    """
    half_len = len(row["seq"]) // 2
    return row["start"] - 200 + half_len


def get_window_end(row):
    """Return the right edge of the window for one row.

    The edge is ``end + 200`` shifted left by half the length of ``seq``
    (rounded up), computed with integer arithmetic only.

    Parameters
    ----------
    row : mapping
        Must provide ``"seq"`` (a sized sequence) and ``"end"`` (an integer).

    Returns
    -------
    int
        The window end coordinate.
    """
    length = len(row["seq"])
    # (length + 1) // 2 is ceil(length / 2) without a detour through floats:
    # even and odd lengths need no separate branches, and large coordinates
    # are never rounded.
    return int(row["end"] + 200 - (length + 1) // 2)
    
    
def get_ranges(df):
    """Return a copy of ``df`` with ``start_new`` and ``end_new`` columns added.

    ``start_new`` comes from ``get_window_start`` and ``end_new`` from
    ``get_window_end``, each applied row by row.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns those two helpers read
        (``seq``, ``start``, ``end``).

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.
    """
    # Work on a copy so that re-running this cell (or an edited version of it)
    # never leaves stale columns behind on the caller's frame.
    with_ranges = df.copy()
    with_ranges["start_new"] = df.apply(get_window_start, axis=1)
    with_ranges["end_new"] = df.apply(get_window_end, axis=1)
    return with_ranges


In [56]:
# Add the start_new/end_new window columns to the verified positive rows and preview them.
positive_good_data_with_ranges = get_ranges(positive_good_data)
positive_good_data_with_ranges.head()

Unnamed: 0,chr,start,end,seq,col,strand,start_,end_,start_new,end_new
3,chr1,835046,835069,CCCTCTCCCTTGCCTCCCTCCCC,1111,-,834857,835257,834857,835257
11,chr1,876250,876281,CCCACACTCGCCCACACTCCCCCACACTCCC,1111,-,876065,876465,876065,876465
17,chr1,902899,902925,GGGGACGAGGGGGCCCGGGATGCGGG,1111,+,902712,903112,902712,903112
18,chr1,904022,904047,CCCCTGCAGACCCTGTGCCCAGCCC,1111,-,903834,904234,903834,904234
19,chr1,904114,904140,GGGAGGCCTGGGGCGGAGGGCCGGGG,1111,+,903927,904327,903927,904327


In [62]:
def check_data_consistent(df):
    """Print whether every row's window is exactly 400 long.

    For each row where ``end_new - start_new`` differs from 400, prints
    ``"<label> is not ok"``; afterwards prints ``"all good"`` if no such row
    was found and ``"wrong data"`` otherwise. Returns nothing.
    """
    window_len = df["end_new"] - df["start_new"]
    bad_labels = df.index[(window_len != 400).values]

    for label in bad_labels:
        print("{} is not ok".format(label))

    print("all good" if len(bad_labels) == 0 else "wrong data")


all good


In [63]:
# Confirm that every positive window spans exactly 400 positions (prints the verdict).
check_data_consistent(positive_good_data_with_ranges)


all good


In [60]:
# Add the start_new/end_new window columns to the verified negative rows and preview them.
negative_good_data_with_ranges = get_ranges(negative_good_data)
negative_good_data_with_ranges.head()


all good


In [64]:
# Confirm that every negative window spans exactly 400 positions (prints the verdict).
check_data_consistent(negative_good_data_with_ranges)



all good


In [None]:
from Bio.Seq import Seq
from Bio import SeqIO

# Parse the genome FASTA file and keep every record in a list (all of it is
# held in memory), then show how many records were read.
records = list(SeqIO.parse("./data/genome.fa", "fasta"))
len(records)


all good


In [74]:
def add_new_seq(df, genome_records=None):
    """Return a copy of ``df`` with a ``seq_new`` column holding each window's sequence.

    For every row, the record whose ``id`` equals the row's ``chr`` is sliced
    from ``start_new`` to ``end_new`` and stored as a plain string.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``chr``, ``start_new`` and ``end_new`` columns.
    genome_records : iterable, optional
        Objects exposing ``id`` and a sliceable ``seq``. Defaults to the
        notebook-level ``records`` list.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is left unmodified.

    Raises
    ------
    KeyError
        If a row's ``chr`` matches no record id.
    """
    if genome_records is None:
        genome_records = records
    # Look sequences up by chromosome id. Always slicing the first record
    # returned the wrong bases for every row that is not on that record.
    seq_by_chrom = {record.id: record.seq for record in genome_records}

    def new_seq(row):
        chrom = row["chr"]
        if chrom not in seq_by_chrom:
            raise KeyError("chromosome {!r} not found in genome records".format(chrom))
        # NOTE(review): a negative start_new would wrap around under Python
        # slicing -- confirm no window starts before the chromosome start.
        return str(seq_by_chrom[chrom][row["start_new"]: row["end_new"]])

    with_seq = df.copy()
    with_seq["seq_new"] = df.apply(new_seq, axis=1)
    return with_seq


In [75]:
# Attach the window sequence (seq_new) to the negative rows and preview them.
negative_with_seq = add_new_seq(negative_good_data_with_ranges)
negative_with_seq.head()

Unnamed: 0,chr,start,end,seq,col,strand,start_new,end_new,seq_new
2,chr1,955891,955916,CCCTACCCCCCTTCACCCCCTCCCC,1111,-,955703,956103,TCCCTTTTCAGGAAGGAAAGAAGGTGGGGCCGCTCCAACTGGCCCC...
3,chr1,984237,984268,GGGCCTGGGGGGCAAGTCGGGGGGCGGGGGG,1111,+,984052,984452,TGGAGTTGGCCTGAGGCTTCAGGGGAAGCCCTTCCCTGTATCCAGC...
4,chr1,984273,984299,GGGCAGGGTCCCCTGGGAGGATGGGG,1111,+,984086,984486,CCTGTATCCAGCCCAGTCATGACCCTTCCTGGTGGGAGGGTGGCTG...
6,chr1,1035169,1035192,GGGGGGGGGGGGGTGGGCAGGGG,1111,+,1034980,1035380,GCGGTGGACTCTTCCAGGGAAGGGGGTCCTGCCTGCACCCCTGTGG...
7,chr1,1035208,1035229,GGGCAAAGGGATGGGACAGGG,1111,+,1035018,1035418,CCCTGTGGCTGGGGCCCCATCTGACAGGGGTCAGGCCATGACTATT...


In [77]:
# Attach the window sequence (seq_new) to the positive rows and preview them.
positive_with_seq = add_new_seq(positive_good_data_with_ranges)
positive_with_seq.head()

Unnamed: 0,chr,start,end,seq,col,strand,start_,end_,start_new,end_new,seq_new
3,chr1,835046,835069,CCCTCTCCCTTGCCTCCCTCCCC,1111,-,834857,835257,834857,835257,ATAACTAGATATACGAGTTTAAATAATTTTGTCAGAAACTGTTTCT...
11,chr1,876250,876281,CCCACACTCGCCCACACTCCCCCACACTCCC,1111,-,876065,876465,876065,876465,CCCCACACTCCCCCATACTCGCCCACACTCCCCCACACCCCACACT...
17,chr1,902899,902925,GGGGACGAGGGGGCCCGGGATGCGGG,1111,+,902712,903112,902712,903112,GCAGTGAGCCGAGATCGCGCCATTGCACCCAGCCTGGGCAACCAGA...
18,chr1,904022,904047,CCCCTGCAGACCCTGTGCCCAGCCC,1111,-,903834,904234,903834,904234,GGCCCTGAGGCTGGGCAAGGCTGTCCACCCCGCTGTCAGAACCCCA...
19,chr1,904114,904140,GGGAGGCCTGGGGCGGAGGGCCGGGG,1111,+,903927,904327,903927,904327,CCAGCGGGCACAAGGTTGGGGCAGCTCTGTTCCCAGCAGGCCGAGC...


In [78]:
# Write both example sets to CSV. With pandas' default settings the DataFrame
# index (the original row labels) is written as the first, unnamed column.
negative_with_seq.to_csv("./data/negative_examples.csv")
positive_with_seq.to_csv("./data/positive_examples.csv")