In [1]:
import numpy as np
from Bio import SeqIO
import gzip
import os
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, MaxPooling1D, Flatten, GlobalAveragePooling1D, Dropout
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve

np.random.seed(454)

def load_data(path):
    data = gzip.open(os.path.join(path,"sequences.fa.gz"),"rt")
    return data


def get_seq(protein, t_data):
    training_data = load_data("data/clip/%s/5000/training_sample_0"% protein)
    x_train = []
    
    
    for record in SeqIO.parse(training_data,"fasta"):
        sequence = list(record.seq)                
        nucleotide = {'A' : 0, 'T' : 1, 'G' : 2, 'C' : 3, 'N' : 4} 
        num_seq = list() #sekvenca v številskem formatu
        

        for i in range(0,len(sequence)):
            num_seq.append(nucleotide[sequence[i]])


        X = np.zeros((len(num_seq),4))

        for i in range (len(num_seq)):
            if num_seq[i] <= 3:
                X[i,num_seq[i]] = 1               
               
        x_train.append(X.flatten())


    x_train = np.array(x_train)
    x_train = np.expand_dims(x_train, axis=2)
    return x_train
        
  
def get_class(protein, t_data):
    y_train = []
    

    if t_data == 'train':
        data = load_data("data/clip/%s/5000/training_sample_0"% protein)

    elif t_data == 'test':
        data = load_data("data/clip/%s/5000/test_sample_0"% protein)


    for record in SeqIO.parse(data,"fasta"):
        v = int((record.description).split(":")[1])
        y_train.append([int(v == 0), int(v != 0)])

    y_train = np.array(y_train)
    return y_train

Using TensorFlow backend.


In [3]:
for x in range (0,1):
    protein = "1_PARCLIP_AGO1234_hg19"
    X = get_seq(protein,"train")
    y = get_class(protein,"train")
        
        
    kf = KFold(n_splits=10) #10 različnih delitev podatkov
    score = []
    for train_index, test_index in kf.split(X):
        # sestavljanje modela
        model = Sequential()
        model.add(Conv1D(120, 24, input_shape=(404, 1), strides = 4, padding='valid')) #, activation='relu'))
        model.add(MaxPooling1D(pool_size=24, strides=6, padding='valid'))
        model.add(Conv1D(120, 4, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        
        #print ("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0)

        y_scores =(model.predict(X_test))# pridobim napovedi za sekvence iz X_test
          
        score.append(roc_auc_score(y_test, y_scores))#izračun povrsine pod krivuljo za vsako delitev
        
    print ("AUC is %s" % (score))
    print (np.mean(score))

    print(model.summary())

AUC is [0.7223587223587223, 0.6271160854844624, 0.6752951235657547, 0.6927470914998427, 0.7173113810741688, 0.7117580305240974, 0.6849921834288693, 0.7424129631008214, 0.6811926018776702, 0.6879375]
0.6943121682914409
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_39 (Conv1D)           (None, 96, 120)           3000      
_________________________________________________________________
max_pooling1d_20 (MaxPooling (None, 13, 120)           0         
_________________________________________________________________
conv1d_40 (Conv1D)           (None, 10, 120)           57720     
_________________________________________________________________
global_average_pooling1d_20  (None, 120)               0         
_________________________________________________________________
dense_39 (Dense)             (None, 200)               24200     
________________________________________________________

In [4]:
for x in range (0,1):
    protein = "1_PARCLIP_AGO1234_hg19"
    X = get_seq(protein,"train")
    y = get_class(protein,"train")
        
        
    kf = KFold(n_splits=10) #10 različnih delitev podatkov
    score = []
    for train_index, test_index in kf.split(X):
        # sestavljanje modela
        model = Sequential()
        model.add(Conv1D(120, 48, input_shape=(404, 1), strides = 4, padding='valid')) #, activation='relu'))
        model.add(MaxPooling1D(pool_size=48, strides=6, padding='valid'))
        model.add(Conv1D(120, 4, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        
        #print ("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0)

        y_scores =(model.predict(X_test))# pridobim napovedi za sekvence iz X_test
          
        score.append(roc_auc_score(y_test, y_scores))#izračun povrsine pod krivuljo za vsako delitev
        
    print ("AUC is %s" % (score))
    print (np.mean(score))

    print(model.summary())

AUC is [0.7064014160788354, 0.6974941234823488, 0.6833627537511033, 0.655945393564616, 0.7358669011082695, 0.6875101322402093, 0.6780813419687832, 0.7171641976227698, 0.7006344171292624, 0.7152625]
0.6977723176946198
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_59 (Conv1D)           (None, 90, 120)           5880      
_________________________________________________________________
max_pooling1d_30 (MaxPooling (None, 8, 120)            0         
_________________________________________________________________
conv1d_60 (Conv1D)           (None, 5, 120)            57720     
_________________________________________________________________
global_average_pooling1d_30  (None, 120)               0         
_________________________________________________________________
dense_59 (Dense)             (None, 200)               24200     
_________________________________________________________

In [21]:
for x in range (0,1):
    protein = "1_PARCLIP_AGO1234_hg19"
    X = get_seq(protein,"train")
    y = get_class(protein,"train")
        
        
    kf = KFold(n_splits=10) #10 različnih delitev podatkov
    score = []
    for train_index, test_index in kf.split(X):
        # sestavljanje modela
        model = Sequential()
        model.add(Conv1D(120, 48, input_shape=(404, 1), strides = 4, padding='valid')) #, activation='relu'))
        model.add(MaxPooling1D(pool_size=48, strides=6, padding='valid'))
        model.add(Conv1D(120, 8, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        
        
        #print ("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0)

        y_scores =(model.predict(X_test))# pridobim napovedi za sekvence iz X_test
          
        score.append(roc_auc_score(y_test, y_scores))#izračun povrsine pod krivuljo za vsako delitev
        
    print ("AUC is %s" % (score))
    print (np.mean(score))
    
    print(model.summary())

   

AUC is [0.6652664394599879, 0.6640033641715728, 0.6178011915269197, 0.6780735771931663, 0.6313272591645354, 0.7048333680724426, 0.642869550112906, 0.6483039281371746, 0.6829832953876851, 0.6961375000000001]
0.663159947322639
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_118 (Conv1D)          (None, 90, 120)           5880      
_________________________________________________________________
max_pooling1d_63 (MaxPooling (None, 8, 120)            0         
_________________________________________________________________
conv1d_119 (Conv1D)          (None, 1, 120)            115320    
_________________________________________________________________
global_average_pooling1d_51  (None, 120)               0         
_________________________________________________________________
dense_101 (Dense)            (None, 200)               24200     
_________________________________________________

In [2]:
for x in range (0,1):
    protein = "17_ICLIP_HNRNPC_hg19"
    X = get_seq(protein,"train")
    y = get_class(protein,"train")
        
        
    kf = KFold(n_splits=10) #10 različnih delitev podatkov
    score = []
    for train_index, test_index in kf.split(X):
        # sestavljanje modela
        model = Sequential()
        model.add(Conv1D(120, 24, input_shape=(404, 1), strides = 4, padding='valid')) #, activation='relu'))
        model.add(MaxPooling1D(pool_size=24, strides=6, padding='valid'))
        model.add(Conv1D(120, 4, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        
        #print ("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=1)

        y_scores =(model.predict(X_test))# pridobim napovedi za sekvence iz X_test
          
        score.append(roc_auc_score(y_test, y_scores))#izračun povrsine pod krivuljo za vsako delitev
        
    print ("AUC is %s" % (score))
    print (np.mean(score))

    print(model.summary())

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
E

In [10]:
score_list = []
protein_list = ["1_PARCLIP_AGO1234_hg19", "2_PARCLIP_AGO2MNASE_hg19","3_HITSCLIP_Ago2_binding_clusters","4_HITSCLIP_Ago2_binding_clusters_2","5_CLIPSEQ_AGO2_hg19", "6_CLIP-seq-eIF4AIII_1","7_CLIP-seq-eIF4AIII_2","8_PARCLIP_ELAVL1_hg19","9_PARCLIP_ELAVL1MNASE_hg19", "10_PARCLIP_ELAVL1A_hg19", "10_PARCLIP_ELAVL1A_hg19", "12_PARCLIP_EWSR1_hg19", "13_PARCLIP_FUS_hg19", "14_PARCLIP_FUS_mut_hg19", "15_PARCLIP_IGF2BP123_hg19", "16_ICLIP_hnRNPC_Hela_iCLIP_all_clusters", "17_ICLIP_HNRNPC_hg19", "18_ICLIP_hnRNPL_Hela_group_3975_all-hnRNPL-Hela-hg19_sum_G_hg19--ensembl59_from_2337-2339-741_bedGraph-cDNA-hits-in-genome", "19_ICLIP_hnRNPL_U266_group_3986_all-hnRNPL-U266-hg19_sum_G_hg19--ensembl59_from_2485_bedGraph-cDNA-hits-in-genome", "20_ICLIP_hnRNPlike_U266_group_4000_all-hnRNPLlike-U266-hg19_sum_G_hg19--ensembl59_from_2342-2486_bedGraph-cDNA-hits-in-genome", "21_PARCLIP_MOV10_Sievers_hg19", "22_ICLIP_NSUN2_293_group_4007_all-NSUN2-293-hg19_sum_G_hg19--ensembl59_from_3137-3202_bedGraph-cDNA-hits-in-genome", "23_PARCLIP_PUM2_hg19", "24_PARCLIP_QKI_hg19", "25_CLIPSEQ_SFRS1_hg19","26_PARCLIP_TAF15_hg19", "27_ICLIP_TDP43_hg19", "28_ICLIP_TIA1_hg19", "29_ICLIP_TIAL1_hg19", "30_ICLIP_U2AF65_Hela_iCLIP_ctrl_all_clusters", "31_ICLIP_U2AF65_Hela_iCLIP_ctrl+kd_all_clusters"]
for x in range (0,31):
    protein = protein_list[x]
    X = get_seq(protein,"train")
    y = get_class(protein,"train")
        
        
    kf = KFold(n_splits=10) #10 različnih delitev podatkov
    score = []
    for train_index, test_index in kf.split(X):
        # sestavljanje modela
        model = Sequential()
        model.add(Conv1D(120, 24, input_shape=(404, 1), strides = 4, padding='valid')) #, activation='relu'))
        model.add(MaxPooling1D(pool_size=24, strides=6, padding='valid'))
        model.add(Conv1D(120, 4, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        
        
        #print ("TRAIN:", train_index, "TEST:", test_index)
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0)

        y_scores =(model.predict(X_test))# pridobim napovedi za sekvence iz X_test
          
        score.append(roc_auc_score(y_test, y_scores))#izračun povrsine pod krivuljo za vsako delitev
        
    print ("%s finishhed" % (protein))
    score_list.append(np.mean(score))
    
print (score_list)

1_PARCLIP_AGO1234_hg19 finishhed
2_PARCLIP_AGO2MNASE_hg19 finishhed
3_HITSCLIP_Ago2_binding_clusters finishhed
4_HITSCLIP_Ago2_binding_clusters_2 finishhed
5_CLIPSEQ_AGO2_hg19 finishhed
6_CLIP-seq-eIF4AIII_1 finishhed
7_CLIP-seq-eIF4AIII_2 finishhed
8_PARCLIP_ELAVL1_hg19 finishhed
9_PARCLIP_ELAVL1MNASE_hg19 finishhed
10_PARCLIP_ELAVL1A_hg19 finishhed
10_PARCLIP_ELAVL1A_hg19 finishhed
12_PARCLIP_EWSR1_hg19 finishhed
13_PARCLIP_FUS_hg19 finishhed
14_PARCLIP_FUS_mut_hg19 finishhed
15_PARCLIP_IGF2BP123_hg19 finishhed
16_ICLIP_hnRNPC_Hela_iCLIP_all_clusters finishhed
17_ICLIP_HNRNPC_hg19 finishhed
18_ICLIP_hnRNPL_Hela_group_3975_all-hnRNPL-Hela-hg19_sum_G_hg19--ensembl59_from_2337-2339-741_bedGraph-cDNA-hits-in-genome finishhed
19_ICLIP_hnRNPL_U266_group_3986_all-hnRNPL-U266-hg19_sum_G_hg19--ensembl59_from_2485_bedGraph-cDNA-hits-in-genome finishhed
20_ICLIP_hnRNPlike_U266_group_4000_all-hnRNPLlike-U266-hg19_sum_G_hg19--ensembl59_from_2342-2486_bedGraph-cDNA-hits-in-genome finishhed
21_PARC

In [34]:
import pandas as pd
import numpy as np


protein_list = ["1_PARCLIP_AGO1234_hg19", "2_PARCLIP_AGO2MNASE_hg19","3_HITSCLIP_Ago2_binding_clusters","4_HITSCLIP_Ago2_binding_clusters_2","5_CLIPSEQ_AGO2_hg19", "6_CLIP-seq-eIF4AIII_1","7_CLIP-seq-eIF4AIII_2","8_PARCLIP_ELAVL1_hg19","9_PARCLIP_ELAVL1MNASE_hg19", "10_PARCLIP_ELAVL1A_hg19", "10_PARCLIP_ELAVL1A_hg19", "12_PARCLIP_EWSR1_hg19", "13_PARCLIP_FUS_hg19", "14_PARCLIP_FUS_mut_hg19", "15_PARCLIP_IGF2BP123_hg19", "16_ICLIP_hnRNPC_Hela_iCLIP_all_clusters", "17_ICLIP_HNRNPC_hg19", "18_ICLIP_hnRNPL_Hela_group_3975_all-hnRNPL-Hela-hg19_sum_G_hg19--ensembl59_from_2337-2339-741_bedGraph-cDNA-hits-in-genome", "19_ICLIP_hnRNPL_U266_group_3986_all-hnRNPL-U266-hg19_sum_G_hg19--ensembl59_from_2485_bedGraph-cDNA-hits-in-genome", "20_ICLIP_hnRNPlike_U266_group_4000_all-hnRNPLlike-U266-hg19_sum_G_hg19--ensembl59_from_2342-2486_bedGraph-cDNA-hits-in-genome", "21_PARCLIP_MOV10_Sievers_hg19", "22_ICLIP_NSUN2_293_group_4007_all-NSUN2-293-hg19_sum_G_hg19--ensembl59_from_3137-3202_bedGraph-cDNA-hits-in-genome", "23_PARCLIP_PUM2_hg19", "24_PARCLIP_QKI_hg19", "25_CLIPSEQ_SFRS1_hg19","26_PARCLIP_TAF15_hg19", "27_ICLIP_TDP43_hg19", "28_ICLIP_TIA1_hg19", "29_ICLIP_TIAL1_hg19", "30_ICLIP_U2AF65_Hela_iCLIP_ctrl_all_clusters", "31_ICLIP_U2AF65_Hela_iCLIP_ctrl+kd_all_clusters"]
score = [0.702070450752216, 0.5750680929629388, 0.8406778010526474, 0.8312447282045671, 0.5978275820465371, 0.9096374227321012, 0.9239325562278464, 0.9050051026383841, 0.574279343186374, 0.8864106877430856, 0.8894066248748068, 0.8739343360588911, 0.8936358477075315, 0.9060995075236727, 0.6681995136349987, 0.9352192003189522, 0.961080563074127, 0.6707604012076377, 0.6706377440694384, 0.6524711753424095, 0.7996138569779265, 0.7504903524557578, 0.9397567924751795, 0.9528254485490679, 0.8643351939743337, 0.9217688953931902, 0.8863846973956108, 0.8807389649675208, 0.8564911129059997, 0.9088547815381828, 0.8965754492173857]
ionmf = [0.601, 0.518, 0.641, 0.660, 0.523, 0.776, 0.762, 0.682, 0.530, 0.660,0.690, 0.749, 0.761, 0.724, 0.570, 0.768, 0.970, 0.627, 0.614, 0.599,0.652, 0.715, 0.746, 0.845, 0.718, 0.768, 0.696, 0.682, 0.706, 0.754, 0.695]

difference = np.subtract(score,ionmf)
pd.set_option('display.max_columns', 4)


              
table_data = {'Score' : score, 'iONMF':ionmf, 'Difference':difference}
table = pd.DataFrame(table_data, index = protein_list)

print (table)

                                                    Difference     Score  iONMF
1_PARCLIP_AGO1234_hg19                                0.101070  0.702070  0.601
2_PARCLIP_AGO2MNASE_hg19                              0.057068  0.575068  0.518
3_HITSCLIP_Ago2_binding_clusters                      0.199678  0.840678  0.641
4_HITSCLIP_Ago2_binding_clusters_2                    0.171245  0.831245  0.660
5_CLIPSEQ_AGO2_hg19                                   0.074828  0.597828  0.523
6_CLIP-seq-eIF4AIII_1                                 0.133637  0.909637  0.776
7_CLIP-seq-eIF4AIII_2                                 0.161933  0.923933  0.762
8_PARCLIP_ELAVL1_hg19                                 0.223005  0.905005  0.682
9_PARCLIP_ELAVL1MNASE_hg19                            0.044279  0.574279  0.530
10_PARCLIP_ELAVL1A_hg19                               0.226411  0.886411  0.660
10_PARCLIP_ELAVL1A_hg19                               0.199407  0.889407  0.690
12_PARCLIP_EWSR1_hg19                   