In [1]:
import numpy as np
from Bio import SeqIO
import gzip
import os
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, MaxPooling1D, Flatten, GlobalAveragePooling1D, Dropout
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve

# Seed NumPy's global random generator so NumPy-driven randomness repeats between runs.
# NOTE(review): this call only touches NumPy's generator; confirm whether the
# Keras backend needs its own seed for training to be reproducible.
np.random.seed(454)

def load_data(path):
    """Open the gzipped FASTA file of a sample directory for reading as text.

    Args:
        path: directory that contains ``sequences.fa.gz``.

    Returns:
        A text-mode file handle on the decompressed content. The handle is
        returned open; closing it is left to the caller.
    """
    fasta_path = os.path.join(path, "sequences.fa.gz")
    return gzip.open(fasta_path, "rt")


def get_seq(protein, t_data):
    """Load the sequences of one CLIP experiment as flattened one-hot vectors.

    Args:
        protein: experiment directory name under ``data/clip``.
        t_data: ``'train'`` reads ``training_sample_0``; ``'test'`` reads
            ``test_sample_0`` (the same convention as ``get_class``).

    Returns:
        A float array with one entry per FASTA record. Each sequence is
        one-hot encoded as (length, 4) with columns A, T, G, C, flattened
        row by row, and a trailing axis of size 1 is appended. When all
        sequences share the same length the result has shape
        (n_sequences, 4 * length, 1).

    Raises:
        ValueError: if ``t_data`` is neither ``'train'`` nor ``'test'``.
        KeyError: if a sequence contains a character other than A, T, G, C, N.
    """
    if t_data == 'train':
        sample_dir = "data/clip/%s/5000/training_sample_0" % protein
    elif t_data == 'test':
        sample_dir = "data/clip/%s/5000/test_sample_0" % protein
    else:
        raise ValueError("t_data must be 'train' or 'test', got %r" % (t_data,))

    # Column index of every base in the one-hot row. 'N' maps to 4, which has
    # no column, so an 'N' position stays an all-zero row.
    nucleotide = {'A': 0, 'T': 1, 'G': 2, 'C': 3, 'N': 4}
    x_train = []

    # The context manager closes the gzip handle once parsing is done.
    with load_data(sample_dir) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # Sequence in numeric format.
            num_seq = [nucleotide[base] for base in record.seq]

            X = np.zeros((len(num_seq), 4))
            for i, code in enumerate(num_seq):
                if code <= 3:
                    X[i, code] = 1

            x_train.append(X.flatten())

    x_train = np.array(x_train)
    # Add the channel axis expected by a Conv1D input.
    x_train = np.expand_dims(x_train, axis=2)
    return x_train
        
  
def get_class(protein, t_data):
    """Load the class labels of one CLIP experiment as one-hot rows.

    Args:
        protein: experiment directory name under ``data/clip``.
        t_data: ``'train'`` reads ``training_sample_0``; ``'test'`` reads
            ``test_sample_0``.

    Returns:
        An integer array with one row per FASTA record. Each row is
        ``[1, 0]`` when the parsed label is 0 and ``[0, 1]`` otherwise.

    Raises:
        ValueError: if ``t_data`` is neither ``'train'`` nor ``'test'``.
    """
    if t_data == 'train':
        sample_dir = "data/clip/%s/5000/training_sample_0" % protein
    elif t_data == 'test':
        sample_dir = "data/clip/%s/5000/test_sample_0" % protein
    else:
        # Fail with a clear message instead of an unbound-variable error below.
        raise ValueError("t_data must be 'train' or 'test', got %r" % (t_data,))

    y_train = []
    # The context manager closes the gzip handle once parsing is done.
    with load_data(sample_dir) as handle:
        for record in SeqIO.parse(handle, "fasta"):
            # The label is the integer between the first and second ':' of
            # the record description.
            v = int(record.description.split(":")[1])
            y_train.append([int(v == 0), int(v != 0)])

    return np.array(y_train)

Using TensorFlow backend.


In [None]:
# 10-fold cross-validated evaluation of the CNN on one CLIP experiment.
protein = "1_PARCLIP_AGO1234_hg19"
X = get_seq(protein, "train")
y = get_class(protein, "train")

# 10 different splits of the data.
# NOTE(review): KFold does not shuffle unless asked to; if the FASTA file is
# ordered by class, a fold can end up with a single class and roc_auc_score
# will raise. Consider KFold(n_splits=10, shuffle=True, random_state=...).
kf = KFold(n_splits=10)
score = []
for train_index, test_index in kf.split(X):
    # Build a fresh, untrained model for every fold so no weights carry over
    # from one split to the next.
    model = Sequential()
    # The input shape comes from the data instead of a hard-coded (404, 1).
    # Each nucleotide occupies 4 consecutive values of the flattened one-hot
    # vector, so kernel 24 / stride 4 is a 6-nucleotide window moving one
    # nucleotide at a time. This first layer has no activation (linear).
    model.add(Conv1D(500, 24, input_shape=X.shape[1:], strides=4, padding='valid'))
    model.add(MaxPooling1D(pool_size=24, strides=6, padding='valid'))
    model.add(Conv1D(120, 4, activation='relu'))
    model.add(GlobalAveragePooling1D())
    model.add(Dense(200, activation='relu'))
    # The labels are mutually exclusive one-hot rows trained with
    # categorical cross-entropy, so the output must be a probability
    # distribution over the two classes: softmax, not independent sigmoids.
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

    model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0)

    # Predictions for the sequences in X_test.
    y_scores = model.predict(X_test)

    # Area under the ROC curve for this split, computed on the positive-class
    # column (column 1 of the one-hot labels).
    score.append(roc_auc_score(y_test[:, 1], y_scores[:, 1]))

print("AUC is %s" % (score,))
print(np.mean(score))

# summary() prints the table itself and returns None, so it is not wrapped in
# print(). This describes the model of the last fold.
model.summary()