In [1]:
import numpy as np
from Bio import SeqIO
import gzip
import os
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, MaxPooling1D, Flatten, GlobalAveragePooling1D, Dropout
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve

# Seed NumPy's global random number generator so NumPy-based randomness is repeatable.
# NOTE(review): this presumably does not seed the Keras/TensorFlow backend itself - confirm
# whether a backend seed is also needed for reproducible training runs.
np.random.seed(454)

def load_data(path):
    """Open ``sequences.fa.gz`` located in directory ``path`` for text-mode reading.

    Parameters
    ----------
    path : str
        Directory that contains the gzip-compressed FASTA file.

    Returns
    -------
    file object
        An open text-mode handle on the decompressed content. The caller owns
        the handle and is responsible for closing it.
    """
    fasta_path = os.path.join(path, "sequences.fa.gz")
    return gzip.open(fasta_path, "rt")


def get_seq(protein, t_data):
    """One-hot encode every sequence of a CLIP sample.

    Parameters
    ----------
    protein : str
        Name of the experiment directory under ``data/clip``.
    t_data : str
        ``'train'`` reads ``training_sample_0``; ``'test'`` reads ``test_sample_0``.

    Returns
    -------
    numpy.ndarray
        One row per FASTA record. Each record is encoded as a
        (sequence_length, 4) one-hot matrix with columns A, T, G, C, flattened
        to length 4 * sequence_length, and a trailing axis of size 1 is added.
        When all sequences have the same length the result has shape
        (n_sequences, 4 * sequence_length, 1).

    Raises
    ------
    ValueError
        If ``t_data`` is neither ``'train'`` nor ``'test'``.
    KeyError
        If a sequence contains a character other than A, T, G, C or N.
    """
    sample_dirs = {'train': 'training_sample_0', 'test': 'test_sample_0'}
    # Validate before touching the file system so a typo fails fast and clearly.
    if t_data not in sample_dirs:
        raise ValueError("t_data must be 'train' or 'test', got %r" % (t_data,))

    # Numeric code of each base; codes 0-3 are one-hot columns, while 'N' (code 4)
    # has no column and therefore stays an all-zero row.
    nucleotide = {'A': 0, 'T': 1, 'G': 2, 'C': 3, 'N': 4}

    x_train = []
    # The context manager closes the gzip handle even if parsing fails.
    with load_data("data/clip/%s/5000/%s" % (protein, sample_dirs[t_data])) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            num_seq = [nucleotide[base] for base in record.seq]  # sequence in numeric form

            X = np.zeros((len(num_seq), 4))
            for i, code in enumerate(num_seq):
                if code <= 3:
                    X[i, code] = 1

            x_train.append(X.flatten())

    x_train = np.array(x_train)
    x_train = np.expand_dims(x_train, axis=2)
    return x_train
        
  
def get_class(protein, t_data):
    """Read the class label of every sequence of a CLIP sample as a two-column array.

    Parameters
    ----------
    protein : str
        Name of the experiment directory under ``data/clip``.
    t_data : str
        ``'train'`` reads ``training_sample_0``; ``'test'`` reads ``test_sample_0``.

    Returns
    -------
    numpy.ndarray
        One row per FASTA record: ``[1, 0]`` when the integer following the
        first ``":"`` of the record description is 0, ``[0, 1]`` otherwise.

    Raises
    ------
    ValueError
        If ``t_data`` is neither ``'train'`` nor ``'test'`` (previously this
        surfaced as an ``UnboundLocalError``), or if the label is not an integer.
    """
    sample_dirs = {'train': 'training_sample_0', 'test': 'test_sample_0'}
    # Validate before touching the file system so a typo fails fast and clearly.
    if t_data not in sample_dirs:
        raise ValueError("t_data must be 'train' or 'test', got %r" % (t_data,))

    y_train = []
    # The context manager closes the gzip handle even if parsing fails.
    with load_data("data/clip/%s/5000/%s" % (protein, sample_dirs[t_data])) as data:
        for record in SeqIO.parse(data, "fasta"):
            # The label is the integer right after the first ":" in the description.
            v = int((record.description).split(":")[1])
            y_train.append([int(v == 0), int(v != 0)])

    y_train = np.array(y_train)
    return y_train

Using TensorFlow backend.


In [3]:

# Load the co-binding matrix of the training sample and regroup its columns
# into a 3-D array of shape (samples, positions, tracks).
# NOTE(review): the following cell (In [5]) repeats this loading step; one of
# the two can be deleted.
protein = "1_PARCLIP_AGO1234_hg19"
with gzip.open(("data/clip/%s/5000/training_sample_0/matrix_Cobinding.tab.gz" % protein), "rt") as f:
    cobinding_data = np.loadtxt(f, skiprows=1)  # the first line is skipped (presumably a header)

# Every run of 101 consecutive columns becomes one slice along the last axis
# (presumably 101 positions per co-binding experiment - confirm against the data docs).
n_positions = 101
n_samples, n_columns = cobinding_data.shape
# Floor division: array shapes and indices must be integers, and "/" yields a float.
n_tracks = n_columns // n_positions

cobinding = np.zeros((n_samples, n_positions, n_tracks), dtype=int)  # empty target array

for n in range(0, n_columns, n_positions):
    a = cobinding_data[:, n:(n + n_positions)]
    cobinding[:, :, n // n_positions] = a


In [5]:
# Load the co-binding matrix of the training sample and regroup its columns
# into a 3-D array of shape (samples, positions, tracks).
protein = "1_PARCLIP_AGO1234_hg19"
with gzip.open(("data/clip/%s/5000/training_sample_0/matrix_Cobinding.tab.gz" % protein), "rt") as f:
    cobinding_data = np.loadtxt(f, skiprows=1)  # the first line is skipped (presumably a header)

# Every run of 101 consecutive columns becomes one slice along the last axis
# (presumably 101 positions per co-binding experiment - confirm against the data docs).
n_positions = 101
n_samples, n_columns = cobinding_data.shape
# Floor division: array shapes and indices must be integers, and "/" yields a float.
n_tracks = n_columns // n_positions

cobinding = np.zeros((n_samples, n_positions, n_tracks), dtype=int)  # empty target array

for n in range(0, n_columns, n_positions):
    a = cobinding_data[:, n:(n + n_positions)]
    cobinding[:, :, n // n_positions] = a




In [23]:
# 3-fold cross-validation of a 1-D CNN trained on the co-binding features.
# X is only used to generate the fold indices; the model itself is trained on `cobinding`.
X = get_seq(protein, "train")
y = get_class(protein, "train")

# The fold indices address samples, so features and labels must line up.
assert len(cobinding) == len(y) == len(X), "feature/label sample counts differ"

# NOTE(review): the folds are consecutive and unshuffled; if the records are
# ordered by class, consider KFold(..., shuffle=True, random_state=...).
kf = KFold(n_splits=3)  # 3 different splits of the data

fold_scores = []  # ROC AUC of every fold
for fold, (train_index, test_index) in enumerate(kf.split(X), start=1):
    # Build a fresh model for every fold so no weights leak between folds.
    model = Sequential()
    # The input shape is taken from the data instead of hard-coding (101, 26).
    # NOTE(review): this layer has no activation (relu was commented out in the original).
    model.add(Conv1D(120, 48, data_format='channels_last', input_shape=cobinding.shape[1:],
                     strides=1, padding='valid'))
    model.add(MaxPooling1D(pool_size=12, strides=2, padding='valid'))
    model.add(Conv1D(12, 2, activation='relu'))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(200, activation='relu'))
    # NOTE(review): 'softmax' is the usual output activation for two one-hot classes
    # with categorical_crossentropy; 'sigmoid' is kept to leave the model unchanged.
    model.add(Dense(2, activation='sigmoid'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    if fold == 1:
        # summary() prints by itself and returns None, so it is not wrapped in print();
        # the architecture is the same in every fold, so it is shown once.
        model.summary()

    cobinding_train, cobinding_test = cobinding[train_index], cobinding[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(cobinding_train, y_train, epochs=10, batch_size=16, verbose=0)

    # Score every fold on its own held-out part (previously only the last fold was scored).
    y_scores = model.predict(cobinding_test)  # predictions for the held-out co-binding samples
    fold_auc = roc_auc_score(y_test, y_scores)
    fold_scores.append(fold_auc)
    print("fold %d: AUC = %.4f, predictions shape = %s" % (fold, fold_auc, y_scores.shape))

# `score` stays a scalar as before, but now summarises all folds instead of the last one.
score = float(np.mean(fold_scores))
print("mean AUC over %d folds: %.4f" % (len(fold_scores), score))
        


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_25 (Conv1D)           (None, 54, 120)           149880    
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 22, 120)           0         
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 21, 12)            2892      
_________________________________________________________________
global_average_pooling1d_11  (None, 12)                0         
_________________________________________________________________
dense_21 (Dense)             (None, 200)               2600      
_________________________________________________________________
dense_22 (Dense)             (None, 2)                 402       
Total params: 155,774
Trainable params: 155,774
Non-trainable params: 0
_________________________________________________________________
None

In [25]:
# Show the shape of X, the encoded sequence array returned by get_seq above.
X.shape

(5000, 404, 1)