In [2]:
import numpy as np
from Bio import SeqIO
import gzip
import os
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, MaxPooling1D, Flatten, GlobalAveragePooling1D, Dropout
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve

np.random.seed(454)

def load_data(path):
    data = gzip.open(os.path.join(path,"sequences.fa.gz"),"rt")
    return data


def get_seq(protein, t_data):
    training_data = load_data("data/clip/%s/5000/training_sample_0"% protein)
    x_train = []
    
    
    for record in SeqIO.parse(training_data,"fasta"):
        sequence = list(record.seq)                
        nucleotide = {'A' : 0, 'T' : 1, 'G' : 2, 'C' : 3, 'N' : 4} 
        num_seq = list() #sekvenca v številskem formatu
        

        for i in range(0,len(sequence)):
            num_seq.append(nucleotide[sequence[i]])


        X = np.zeros((len(num_seq),4))

        for i in range (len(num_seq)):
            if num_seq[i] <= 3:
                X[i,num_seq[i]] = 1               
               
        x_train.append(X.flatten())


    x_train = np.array(x_train)
    x_train = np.expand_dims(x_train, axis=2)
    return x_train
        
  
def get_class(protein, t_data):
    y_train = []
    

    if t_data == 'train':
        data = load_data("data/clip/%s/5000/training_sample_0"% protein)

    elif t_data == 'test':
        data = load_data("data/clip/%s/5000/test_sample_0"% protein)


    for record in SeqIO.parse(data,"fasta"):
        v = int((record.description).split(":")[1])
        y_train.append([int(v == 0), int(v != 0)])

    y_train = np.array(y_train)
    return y_train

Using TensorFlow backend.


In [3]:

protein = "1_PARCLIP_AGO1234_hg19"
with gzip.open(("data/clip/%s/5000/training_sample_0/matrix_Cobinding.tab.gz"% protein), "rt") as f:
  cobinding_data = np.loadtxt(f, skiprows=1) 

    
#print (cobinding_data.shape)

cobinding = np.zeros((5000,101,cobinding_data.shape[1]/101),dtype=np.int) #ustvarim prazen array
#print (cobinding.shape)

for n in range(0,cobinding_data.shape[1],101):
    a = cobinding_data[:,n:(n+101)]
    a[0,0] = 1 #zgolj da preverim kam se zapiše a ODSTRANI!
    cobinding[:,:,(n/101)] = a
    #print (a, a.shape)
   
    
    
    

#print (cobinding)


In [39]:
protein = "1_PARCLIP_AGO1234_hg19"
with gzip.open(("data/clip/%s/5000/training_sample_0/matrix_Cobinding.tab.gz"% protein), "rt") as f:
  cobinding_data = np.loadtxt(f, skiprows=1) 


cobinding = np.zeros((5000,101,cobinding_data.shape[1]/101),dtype=np.int) #ustvarim prazen array


for n in range(0,cobinding_data.shape[1],101):
    a = cobinding_data[:,n:(n+101)]
 
    cobinding[:,:,(n/101)] = a




In [23]:
for x in range (0,1):
   
    X = get_seq(protein,"train")
    y = get_class(protein,"train")
        
    kf = KFold(n_splits=3) #10 različnih delitev podatkov  
    
    score = []
    for train_index, test_index in kf.split(X):
        
        # sestavljanje modela
        model = Sequential()
        model.add(Conv1D(120, 48,data_format='channels_last', input_shape=(101,26), strides = 1, padding='valid'))
        model.add(MaxPooling1D(pool_size=12, strides=2, padding='valid'))
        model.add(Conv1D(12, 2, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        print(model.summary())  
        
        cobinding_train, cobinding_test = cobinding[train_index], cobinding[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(cobinding_train, y_train, epochs=10, batch_size=16, verbose=0)
   
    

    y_scores =(model.predict(cobinding_test))# pridobim napovedi za sekvence iz cobinding
    score = roc_auc_score(y_test, y_scores)      
    print(y_scores, score, y_scores.shape)
        


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_25 (Conv1D)           (None, 54, 120)           149880    
_________________________________________________________________
max_pooling1d_13 (MaxPooling (None, 22, 120)           0         
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 21, 12)            2892      
_________________________________________________________________
global_average_pooling1d_11  (None, 12)                0         
_________________________________________________________________
dense_21 (Dense)             (None, 200)               2600      
_________________________________________________________________
dense_22 (Dense)             (None, 2)                 402       
Total params: 155,774
Trainable params: 155,774
Non-trainable params: 0
_________________________________________________________________
None

In [25]:
X.shape

(5000, 404, 1)

In [7]:
for x in range (0,1):
   
    X = get_seq(protein,"train")
    y = get_class(protein,"train")
        
    kf = KFold(n_splits=3) #10 različnih delitev podatkov  
    
    score = []
    for train_index, test_index in kf.split(X):
        
        # sestavljanje modela
        model = Sequential()
        model.add(Conv1D(120, 4,data_format='channels_first', input_shape=(101,26), strides = 1, padding='valid'))
        model.add(MaxPooling1D(pool_size=12, strides=2, padding='valid'))
        model.add(Conv1D(12, 2, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        print(model.summary())  
        
        cobinding_train, cobinding_test = cobinding[train_index], cobinding[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(cobinding_train, y_train, epochs=10, batch_size=16, verbose=0)
   
    

    y_scores =(model.predict(cobinding_test))# pridobim napovedi za sekvence iz cobinding
    score = roc_auc_score(y_test, y_scores)      
    print(y_scores, score, y_scores.shape)
        



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_6 (Conv1D)            (None, 120, 23)           48600     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 55, 23)            0         
_________________________________________________________________
conv1d_7 (Conv1D)            (None, 54, 12)            564       
_________________________________________________________________
global_average_pooling1d_3 ( (None, 12)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 200)               2600      
_________________________________________________________________
dense_6 (Dense)              (None, 2)                 402       
Total params: 52,166
Trainable params: 52,166
Non-trainable params: 0
_________________________________________________________________
None
_

In [25]:
#spreminjam obliko podatkov za sekvenco; spremenjena koda za get_seq

training_data = load_data("data/clip/%s/5000/training_sample_0"% protein)
x_train = []
    
    
for record in SeqIO.parse(training_data,"fasta"):
        sequence = list(record.seq)                
        nucleotide = {'A' : 0, 'T' : 1, 'G' : 2, 'C' : 3, 'N' : 4} 
        num_seq = list() #sekvenca v številskem formatu
        
        for i in range(0,len(sequence)):
            num_seq.append(nucleotide[sequence[i]])
            
        
        x_train.append(num_seq)

x_train = np.array(x_train)

        

#x_train.append(X.flatten())

#x_train = np.expand_dims(x_train, axis=2

In [27]:
print(x_train.shape)

(5000, 101)


In [43]:
#uporabim na novo oblikovano sekvenco

print(cobinding.shape)
X = np.dstack((cobinding, x_train))
X.shape


(5000, 101, 26)


(5000, 101, 27)

In [47]:
#konvolucija s sekvenco in cobindingom

for x in range (0,1):
   
    X = X 
    y = get_class(protein,"train")
        
    kf = KFold(n_splits=3) #10 različnih delitev podatkov  
    
    score = []
    for train_index, test_index in kf.split(X):
        
        # sestavljanje modela
        model = Sequential()
        model.add(Conv1D(120, 48,data_format='channels_last', input_shape=(101,27), strides = 1, padding='valid'))
        model.add(MaxPooling1D(pool_size=12, strides=2, padding='valid'))
        model.add(Conv1D(12, 2, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

        print(model.summary())  
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0)
   
    

    y_scores =(model.predict(X_test))# pridobim napovedi za sekvence iz cobinding
    score = roc_auc_score(y_test, y_scores)      
    print(y_scores, score, y_scores.shape)
        


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_32 (Conv1D)           (None, 54, 120)           155640    
_________________________________________________________________
max_pooling1d_16 (MaxPooling (None, 22, 120)           0         
_________________________________________________________________
conv1d_33 (Conv1D)           (None, 21, 12)            2892      
_________________________________________________________________
global_average_pooling1d_16  (None, 12)                0         
_________________________________________________________________
dense_31 (Dense)             (None, 200)               2600      
_________________________________________________________________
dense_32 (Dense)             (None, 2)                 402       
Total params: 161,534
Trainable params: 161,534
Non-trainable params: 0
_________________________________________________________________
None

In [1]:
import numpy as np
from Bio import SeqIO
import gzip
import os
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, MaxPooling1D, Flatten, GlobalAveragePooling1D, Dropout
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve

np.random.seed(454)

def load_data(path):
    data = gzip.open(os.path.join(path,"sequences.fa.gz"),"rt")
    return data

def get_seq(protein, t_data):
    
    training_data = load_data("data/clip/%s/5000/training_sample_0"% protein)
    x_train = []
    
    
    for record in SeqIO.parse(training_data,"fasta"):
        sequence = list(record.seq)                
        nucleotide = {'A' : 0, 'T' : 1, 'G' : 2, 'C' : 3, 'N' : 4} 
        num_seq = list() #sekvenca v številskem formatu
        
        for i in range(0,len(sequence)):
            num_seq.append(nucleotide[sequence[i]])
            
        
        x_train.append(num_seq)

    x_train = np.array(x_train)
    
    return x_train

def get_class(protein, t_data):
    y_train = []
    

    if t_data == 'train':
        data = load_data("data/clip/%s/5000/training_sample_0"% protein)

    elif t_data == 'test':
        data = load_data("data/clip/%s/5000/test_sample_0"% protein)


    for record in SeqIO.parse(data,"fasta"):
        v = int((record.description).split(":")[1])
        y_train.append([int(v == 0), int(v != 0)])

    y_train = np.array(y_train)
    return y_train
def get_cobinding(protein, t_data):
    with gzip.open(("data/clip/%s/5000/training_sample_0/matrix_Cobinding.tab.gz"% protein), "rt") as f:
        cobinding_data = np.loadtxt(f, skiprows=1) 
    
    
    cobinding = np.zeros((5000,101,cobinding_data.shape[1]/101),dtype=np.int) #ustvarim prazen array


    for n in range(0,cobinding_data.shape[1],101):
        a = cobinding_data[:,n:(n+101)]
        cobinding[:,:,(n/101)] = a
    
    return cobinding
    

Using TensorFlow backend.


In [57]:
for x in range (0,1):
    
    protein = "1_PARCLIP_AGO1234_hg19"
    
    X = np.dstack((get_cobinding(protein,"train"), get_seq(protein,"train")))
    y = get_class(protein,"train")
        
    kf = KFold(n_splits=10) #10 različnih delitev podatkov  
    
    score = []
    for train_index, test_index in kf.split(X):
        
        # sestavljanje modela
        model = Sequential()
        model.add(Conv1D(120, 48,data_format='channels_last', input_shape=(101,27), strides = 1, padding='valid'))
        model.add(MaxPooling1D(pool_size=12, strides=2, padding='valid'))
        model.add(Conv1D(12, 2, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

          
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0)
   
    

        y_scores =(model.predict(X_test))# pridobim napovedi za sekvence iz cobinding
        score.append(roc_auc_score(y_test, y_scores))      
        
    print (score)
    print(model.summary())

[0.907360439618504, 0.8730726100364452, 0.9033539276257723, 0.8293810921287077, 0.8877277813299231, 0.868460362676301, 0.8737809871212685, 0.8856299163750961, 0.8491788391189787, 0.83964375]
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_122 (Conv1D)          (None, 54, 120)           155640    
_________________________________________________________________
max_pooling1d_61 (MaxPooling (None, 22, 120)           0         
_________________________________________________________________
conv1d_123 (Conv1D)          (None, 21, 12)            2892      
_________________________________________________________________
global_average_pooling1d_61  (None, 12)                0         
_________________________________________________________________
dense_121 (Dense)            (None, 200)               2600      
_________________________________________________________________
dense_122 (Dense)

In [4]:
import numpy as np
np.mean([0.907360439618504, 0.8730726100364452, 0.9033539276257723, 0.8293810921287077, 0.8877277813299231, 0.868460362676301, 0.8737809871212685, 0.8856299163750961, 0.8491788391189787, 0.83964375])


0.8717589706030997

In [59]:
# one-hot encoding za sekvenco
training_data = load_data("data/clip/%s/5000/training_sample_0"% protein)
x_train = np.zeros((5000,101,4))
r = 0    
    
for record in SeqIO.parse(training_data,"fasta"):
    sequence = list(record.seq)                
    nucleotide = {'A' : 0, 'T' : 1, 'G' : 2, 'C' : 3, 'N' : 4} 
    num_seq = list() #sekvenca v številskem formatu
     
    
    for i in range(0,len(sequence)):
            num_seq.append(nucleotide[sequence[i]])


    X = np.zeros((1,len(num_seq),4))

    for i in range (len(num_seq)):
            if num_seq[i] <= 3:
                X[:,i,num_seq[i]] = 1               
               
    x_train[r,:,:] = X
    r = r + 1
    


     

In [1]:
import numpy as np
from Bio import SeqIO
import gzip
import os
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv1D, MaxPooling1D, Flatten, GlobalAveragePooling1D, Dropout
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score, roc_curve

np.random.seed(454)

def load_data(path):
    data = gzip.open(os.path.join(path,"sequences.fa.gz"),"rt")
    return data

def get_seq(protein, t_data):
    
    training_data = load_data("data/clip/%s/5000/training_sample_0"% protein)
    x_train = np.zeros((5000,101,4))
    r = 0    

    for record in SeqIO.parse(training_data,"fasta"):
        sequence = list(record.seq)                
        nucleotide = {'A' : 0, 'T' : 1, 'G' : 2, 'C' : 3, 'N' : 4} 
        num_seq = list() #sekvenca v številskem formatu


        for i in range(0,len(sequence)):
                num_seq.append(nucleotide[sequence[i]])


        X = np.zeros((1,len(num_seq),4))

        for i in range (len(num_seq)):
                if num_seq[i] <= 3:
                    X[:,i,num_seq[i]] = 1               

        x_train[r,:,:] = X
        r = r + 1
    
    return x_train

def get_class(protein, t_data):
    y_train = []
    

    if t_data == 'train':
        data = load_data("data/clip/%s/5000/training_sample_0"% protein)

    elif t_data == 'test':
        data = load_data("data/clip/%s/5000/test_sample_0"% protein)


    for record in SeqIO.parse(data,"fasta"):
        v = int((record.description).split(":")[1])
        y_train.append([int(v == 0), int(v != 0)])

    y_train = np.array(y_train)
    return y_train
def get_cobinding(protein, t_data):
    with gzip.open(("data/clip/%s/5000/training_sample_0/matrix_Cobinding.tab.gz"% protein), "rt") as f:
        cobinding_data = np.loadtxt(f, skiprows=1) 
    
    
    cobinding = np.zeros((5000,101,cobinding_data.shape[1]/101),dtype=np.int) #ustvarim prazen array


    for n in range(0,cobinding_data.shape[1],101):
        a = cobinding_data[:,n:(n+101)]
        cobinding[:,:,(n/101)] = a
    
    return cobinding
    





Using TensorFlow backend.


In [77]:
score_list = []
protein_list = ["1_PARCLIP_AGO1234_hg19", "2_PARCLIP_AGO2MNASE_hg19","3_HITSCLIP_Ago2_binding_clusters","4_HITSCLIP_Ago2_binding_clusters_2","5_CLIPSEQ_AGO2_hg19", "6_CLIP-seq-eIF4AIII_1","7_CLIP-seq-eIF4AIII_2","8_PARCLIP_ELAVL1_hg19","9_PARCLIP_ELAVL1MNASE_hg19", "10_PARCLIP_ELAVL1A_hg19", "10_PARCLIP_ELAVL1A_hg19", "12_PARCLIP_EWSR1_hg19", "13_PARCLIP_FUS_hg19", "14_PARCLIP_FUS_mut_hg19", "15_PARCLIP_IGF2BP123_hg19", "16_ICLIP_hnRNPC_Hela_iCLIP_all_clusters", "17_ICLIP_HNRNPC_hg19", "18_ICLIP_hnRNPL_Hela_group_3975_all-hnRNPL-Hela-hg19_sum_G_hg19--ensembl59_from_2337-2339-741_bedGraph-cDNA-hits-in-genome", "19_ICLIP_hnRNPL_U266_group_3986_all-hnRNPL-U266-hg19_sum_G_hg19--ensembl59_from_2485_bedGraph-cDNA-hits-in-genome", "20_ICLIP_hnRNPlike_U266_group_4000_all-hnRNPLlike-U266-hg19_sum_G_hg19--ensembl59_from_2342-2486_bedGraph-cDNA-hits-in-genome", "21_PARCLIP_MOV10_Sievers_hg19", "22_ICLIP_NSUN2_293_group_4007_all-NSUN2-293-hg19_sum_G_hg19--ensembl59_from_3137-3202_bedGraph-cDNA-hits-in-genome", "23_PARCLIP_PUM2_hg19", "24_PARCLIP_QKI_hg19", "25_CLIPSEQ_SFRS1_hg19","26_PARCLIP_TAF15_hg19", "27_ICLIP_TDP43_hg19", "28_ICLIP_TIA1_hg19", "29_ICLIP_TIAL1_hg19", "30_ICLIP_U2AF65_Hela_iCLIP_ctrl_all_clusters", "31_ICLIP_U2AF65_Hela_iCLIP_ctrl+kd_all_clusters"]

for x in range (0,31):

    protein = protein_list[x]
    
    X = np.dstack((get_cobinding(protein,"train"), get_seq(protein,"train")))
    y = get_class(protein,"train")
    
    size = X.shape[2]
    
    kf = KFold(n_splits=10) #10 različnih delitev podatkov  
    
    score = []
    for train_index, test_index in kf.split(X):
        
        # sestavljanje modela
        model = Sequential()
        model.add(Conv1D(120, 24,data_format='channels_last', input_shape=(101, size) , strides = 1, padding='valid'))
        model.add(MaxPooling1D(pool_size=24, strides=6, padding='valid'))
        model.add(Conv1D(120, 4, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

          
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0)
   
    

        y_scores =(model.predict(X_test))# pridobim napovedi za sekvence iz cobinding
        score.append(roc_auc_score(y_test, y_scores))
        
    
    
    print ("%s finishhed" % (protein))
    score_list.append(np.mean(score))
    
    
print(score_list)

1_PARCLIP_AGO1234_hg19 finishhed
2_PARCLIP_AGO2MNASE_hg19 finishhed
3_HITSCLIP_Ago2_binding_clusters finishhed
4_HITSCLIP_Ago2_binding_clusters_2 finishhed
5_CLIPSEQ_AGO2_hg19 finishhed
6_CLIP-seq-eIF4AIII_1 finishhed
7_CLIP-seq-eIF4AIII_2 finishhed
8_PARCLIP_ELAVL1_hg19 finishhed
9_PARCLIP_ELAVL1MNASE_hg19 finishhed
10_PARCLIP_ELAVL1A_hg19 finishhed
10_PARCLIP_ELAVL1A_hg19 finishhed
12_PARCLIP_EWSR1_hg19 finishhed
13_PARCLIP_FUS_hg19 finishhed
14_PARCLIP_FUS_mut_hg19 finishhed
15_PARCLIP_IGF2BP123_hg19 finishhed
16_ICLIP_hnRNPC_Hela_iCLIP_all_clusters finishhed
17_ICLIP_HNRNPC_hg19 finishhed
18_ICLIP_hnRNPL_Hela_group_3975_all-hnRNPL-Hela-hg19_sum_G_hg19--ensembl59_from_2337-2339-741_bedGraph-cDNA-hits-in-genome finishhed
19_ICLIP_hnRNPL_U266_group_3986_all-hnRNPL-U266-hg19_sum_G_hg19--ensembl59_from_2485_bedGraph-cDNA-hits-in-genome finishhed
20_ICLIP_hnRNPlike_U266_group_4000_all-hnRNPLlike-U266-hg19_sum_G_hg19--ensembl59_from_2342-2486_bedGraph-cDNA-hits-in-genome finishhed
21_PARC

In [81]:
import pandas as pd
import numpy as np


protein_list = ["1_PARCLIP_AGO1234_hg19", "2_PARCLIP_AGO2MNASE_hg19","3_HITSCLIP_Ago2_binding_clusters","4_HITSCLIP_Ago2_binding_clusters_2","5_CLIPSEQ_AGO2_hg19", "6_CLIP-seq-eIF4AIII_1","7_CLIP-seq-eIF4AIII_2","8_PARCLIP_ELAVL1_hg19","9_PARCLIP_ELAVL1MNASE_hg19", "10_PARCLIP_ELAVL1A_hg19", "10_PARCLIP_ELAVL1A_hg19", "12_PARCLIP_EWSR1_hg19", "13_PARCLIP_FUS_hg19", "14_PARCLIP_FUS_mut_hg19", "15_PARCLIP_IGF2BP123_hg19", "16_ICLIP_hnRNPC_Hela_iCLIP_all_clusters", "17_ICLIP_HNRNPC_hg19", "18_ICLIP_hnRNPL_Hela_group_3975_all-hnRNPL-Hela-hg19_sum_G_hg19--ensembl59_from_2337-2339-741_bedGraph-cDNA-hits-in-genome", "19_ICLIP_hnRNPL_U266_group_3986_all-hnRNPL-U266-hg19_sum_G_hg19--ensembl59_from_2485_bedGraph-cDNA-hits-in-genome", "20_ICLIP_hnRNPlike_U266_group_4000_all-hnRNPLlike-U266-hg19_sum_G_hg19--ensembl59_from_2342-2486_bedGraph-cDNA-hits-in-genome", "21_PARCLIP_MOV10_Sievers_hg19", "22_ICLIP_NSUN2_293_group_4007_all-NSUN2-293-hg19_sum_G_hg19--ensembl59_from_3137-3202_bedGraph-cDNA-hits-in-genome", "23_PARCLIP_PUM2_hg19", "24_PARCLIP_QKI_hg19", "25_CLIPSEQ_SFRS1_hg19","26_PARCLIP_TAF15_hg19", "27_ICLIP_TDP43_hg19", "28_ICLIP_TIA1_hg19", "29_ICLIP_TIAL1_hg19", "30_ICLIP_U2AF65_Hela_iCLIP_ctrl_all_clusters", "31_ICLIP_U2AF65_Hela_iCLIP_ctrl+kd_all_clusters"]
score = [0.8784274080116949, 0.712064476180941, 0.8264209383405812, 0.847009581737597, 0.6497208726529128, 0.9006135614110178, 0.9433431526968606, 0.9477383033044304, 0.677806730286637, 0.9183066453987854, 0.9097194725956174, 0.9290900434189812, 0.9147761168064094, 0.9806999568739023, 0.8520893736781467, 0.9251161851548252, 0.9609008323111736, 0.7690864626572951, 0.6814716176149236, 0.7182194403291076, 0.9351248802543546, 0.7921685512048391, 0.9658200903838713, 0.9428692308457892, 0.8707048992001145, 0.9701312315204162, 0.8487082760564476, 0.9368091889596142, 0.9132027079540356, 0.9172633094372301, 0.9127423695032235]
ionmf = [0.698, 0.601, 0.761, 0.742, 0.593, 0.863, 0.863, 0.924, 0.594, 0.960, 0.956, 0.863, 0.798, 0.930, 0.699, 0.952, 0.973, 0.788, 0.713, 0.728, 0.907, 0.823, 0.936, 0.901, 0.787, 0.897, 0.733, 0.928, 0.907, 0.818, 0.795]
difference = np.subtract(score,ionmf)
pd.set_option('display.max_columns', 4)


              
table_data = {'Score' : score, 'iONMF':ionmf, 'Difference':difference}
table = pd.DataFrame(table_data, index = protein_list)

print (table)


 
    

                                                    Difference     Score  \
1_PARCLIP_AGO1234_hg19                                0.180427  0.878427   
2_PARCLIP_AGO2MNASE_hg19                              0.111064  0.712064   
3_HITSCLIP_Ago2_binding_clusters                      0.065421  0.826421   
4_HITSCLIP_Ago2_binding_clusters_2                    0.105010  0.847010   
5_CLIPSEQ_AGO2_hg19                                   0.056721  0.649721   
6_CLIP-seq-eIF4AIII_1                                 0.037614  0.900614   
7_CLIP-seq-eIF4AIII_2                                 0.080343  0.943343   
8_PARCLIP_ELAVL1_hg19                                 0.023738  0.947738   
9_PARCLIP_ELAVL1MNASE_hg19                            0.083807  0.677807   
10_PARCLIP_ELAVL1A_hg19                              -0.041693  0.918307   
10_PARCLIP_ELAVL1A_hg19                              -0.046281  0.909719   
12_PARCLIP_EWSR1_hg19                                 0.066090  0.929090   
13_PARCLIP_F

In [2]:
score_list = []
protein_list = ["1_PARCLIP_AGO1234_hg19", "2_PARCLIP_AGO2MNASE_hg19","3_HITSCLIP_Ago2_binding_clusters","4_HITSCLIP_Ago2_binding_clusters_2","5_CLIPSEQ_AGO2_hg19", "6_CLIP-seq-eIF4AIII_1","7_CLIP-seq-eIF4AIII_2","8_PARCLIP_ELAVL1_hg19","9_PARCLIP_ELAVL1MNASE_hg19", "10_PARCLIP_ELAVL1A_hg19", "10_PARCLIP_ELAVL1A_hg19", "12_PARCLIP_EWSR1_hg19", "13_PARCLIP_FUS_hg19", "14_PARCLIP_FUS_mut_hg19", "15_PARCLIP_IGF2BP123_hg19", "16_ICLIP_hnRNPC_Hela_iCLIP_all_clusters", "17_ICLIP_HNRNPC_hg19", "18_ICLIP_hnRNPL_Hela_group_3975_all-hnRNPL-Hela-hg19_sum_G_hg19--ensembl59_from_2337-2339-741_bedGraph-cDNA-hits-in-genome", "19_ICLIP_hnRNPL_U266_group_3986_all-hnRNPL-U266-hg19_sum_G_hg19--ensembl59_from_2485_bedGraph-cDNA-hits-in-genome", "20_ICLIP_hnRNPlike_U266_group_4000_all-hnRNPLlike-U266-hg19_sum_G_hg19--ensembl59_from_2342-2486_bedGraph-cDNA-hits-in-genome", "21_PARCLIP_MOV10_Sievers_hg19", "22_ICLIP_NSUN2_293_group_4007_all-NSUN2-293-hg19_sum_G_hg19--ensembl59_from_3137-3202_bedGraph-cDNA-hits-in-genome", "23_PARCLIP_PUM2_hg19", "24_PARCLIP_QKI_hg19", "25_CLIPSEQ_SFRS1_hg19","26_PARCLIP_TAF15_hg19", "27_ICLIP_TDP43_hg19", "28_ICLIP_TIA1_hg19", "29_ICLIP_TIAL1_hg19", "30_ICLIP_U2AF65_Hela_iCLIP_ctrl_all_clusters", "31_ICLIP_U2AF65_Hela_iCLIP_ctrl+kd_all_clusters"]

for x in range (0,31):

    protein = protein_list[x]
    
    X = np.dstack((get_cobinding(protein,"train"), get_seq(protein,"train")))
    y = get_class(protein,"train")
    
    size = X.shape[2]
    
    kf = KFold(n_splits=10) #10 različnih delitev podatkov  
    
    score = []
    for train_index, test_index in kf.split(X):
        
        # sestavljanje modela
        model = Sequential()
        model.add(Conv1D(120, 24,data_format='channels_last', input_shape=(101, size) , strides = 1, padding='valid'))
        model.add(MaxPooling1D(pool_size=12, strides=2, padding='valid'))
        model.add(Conv1D(120, 4, activation='relu'))
        model.add(GlobalAveragePooling1D())
        model.add(Dense(200, activation='relu'))
        model.add(Dense(2, activation='sigmoid'))
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

          
        
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        model.fit(X_train, y_train, epochs=10, batch_size=16, verbose=0)
   
    

        y_scores =(model.predict(X_test))# pridobim napovedi za sekvence iz cobinding
        score.append(roc_auc_score(y_test, y_scores))
        
    
    
    print ("%s finishhed" % (protein))
    score_list.append(np.mean(score))
    
    
print(score_list)

1_PARCLIP_AGO1234_hg19 finishhed
2_PARCLIP_AGO2MNASE_hg19 finishhed
3_HITSCLIP_Ago2_binding_clusters finishhed
4_HITSCLIP_Ago2_binding_clusters_2 finishhed
5_CLIPSEQ_AGO2_hg19 finishhed
6_CLIP-seq-eIF4AIII_1 finishhed
7_CLIP-seq-eIF4AIII_2 finishhed
8_PARCLIP_ELAVL1_hg19 finishhed
9_PARCLIP_ELAVL1MNASE_hg19 finishhed
10_PARCLIP_ELAVL1A_hg19 finishhed
10_PARCLIP_ELAVL1A_hg19 finishhed
12_PARCLIP_EWSR1_hg19 finishhed
13_PARCLIP_FUS_hg19 finishhed
14_PARCLIP_FUS_mut_hg19 finishhed
15_PARCLIP_IGF2BP123_hg19 finishhed
16_ICLIP_hnRNPC_Hela_iCLIP_all_clusters finishhed
17_ICLIP_HNRNPC_hg19 finishhed
18_ICLIP_hnRNPL_Hela_group_3975_all-hnRNPL-Hela-hg19_sum_G_hg19--ensembl59_from_2337-2339-741_bedGraph-cDNA-hits-in-genome finishhed
19_ICLIP_hnRNPL_U266_group_3986_all-hnRNPL-U266-hg19_sum_G_hg19--ensembl59_from_2485_bedGraph-cDNA-hits-in-genome finishhed
20_ICLIP_hnRNPlike_U266_group_4000_all-hnRNPLlike-U266-hg19_sum_G_hg19--ensembl59_from_2342-2486_bedGraph-cDNA-hits-in-genome finishhed
21_PARC

In [3]:
import pandas as pd
import numpy as np


protein_list = ["1_PARCLIP_AGO1234_hg19", "2_PARCLIP_AGO2MNASE_hg19","3_HITSCLIP_Ago2_binding_clusters","4_HITSCLIP_Ago2_binding_clusters_2","5_CLIPSEQ_AGO2_hg19", "6_CLIP-seq-eIF4AIII_1","7_CLIP-seq-eIF4AIII_2","8_PARCLIP_ELAVL1_hg19","9_PARCLIP_ELAVL1MNASE_hg19", "10_PARCLIP_ELAVL1A_hg19", "10_PARCLIP_ELAVL1A_hg19", "12_PARCLIP_EWSR1_hg19", "13_PARCLIP_FUS_hg19", "14_PARCLIP_FUS_mut_hg19", "15_PARCLIP_IGF2BP123_hg19", "16_ICLIP_hnRNPC_Hela_iCLIP_all_clusters", "17_ICLIP_HNRNPC_hg19", "18_ICLIP_hnRNPL_Hela_group_3975_all-hnRNPL-Hela-hg19_sum_G_hg19--ensembl59_from_2337-2339-741_bedGraph-cDNA-hits-in-genome", "19_ICLIP_hnRNPL_U266_group_3986_all-hnRNPL-U266-hg19_sum_G_hg19--ensembl59_from_2485_bedGraph-cDNA-hits-in-genome", "20_ICLIP_hnRNPlike_U266_group_4000_all-hnRNPLlike-U266-hg19_sum_G_hg19--ensembl59_from_2342-2486_bedGraph-cDNA-hits-in-genome", "21_PARCLIP_MOV10_Sievers_hg19", "22_ICLIP_NSUN2_293_group_4007_all-NSUN2-293-hg19_sum_G_hg19--ensembl59_from_3137-3202_bedGraph-cDNA-hits-in-genome", "23_PARCLIP_PUM2_hg19", "24_PARCLIP_QKI_hg19", "25_CLIPSEQ_SFRS1_hg19","26_PARCLIP_TAF15_hg19", "27_ICLIP_TDP43_hg19", "28_ICLIP_TIA1_hg19", "29_ICLIP_TIAL1_hg19", "30_ICLIP_U2AF65_Hela_iCLIP_ctrl_all_clusters", "31_ICLIP_U2AF65_Hela_iCLIP_ctrl+kd_all_clusters"]
score = [0.8854467745426089, 0.7389716116566503, 0.8486800058647509, 0.8558456066418886, 0.7024250393464896, 0.8966307808447306, 0.935885864297013, 0.9493017292655587, 0.7186794497766076, 0.9292762804991401, 0.9173037219001344, 0.9206776938194293, 0.8934579091067907, 0.9738436234314399, 0.8785229371893042, 0.9172132931900181, 0.9489552102800551, 0.7735668951519872, 0.724478273126381, 0.7632085437513132, 0.9432564453494334, 0.8101318069278541, 0.9730036404398559, 0.9426504774343695, 0.8595774908446335, 0.9606016792315117, 0.858870168513284, 0.9204543898078441, 0.9140122739172712, 0.8805678751447241, 0.8895041219714479]
ionmf = [0.698, 0.601, 0.761, 0.742, 0.593, 0.863, 0.863, 0.924, 0.594, 0.960, 0.956, 0.863, 0.798, 0.930, 0.699, 0.952, 0.973, 0.788, 0.713, 0.728, 0.907, 0.823, 0.936, 0.901, 0.787, 0.897, 0.733, 0.928, 0.907, 0.818, 0.795]
difference = np.subtract(score,ionmf)
pd.set_option('display.max_columns', 4)


              
table_data = {'Score' : score, 'iONMF':ionmf, 'Difference':difference}
table = pd.DataFrame(table_data, index = protein_list)

print (table)


                                                    Difference     Score  \
1_PARCLIP_AGO1234_hg19                                0.187447  0.885447   
2_PARCLIP_AGO2MNASE_hg19                              0.137972  0.738972   
3_HITSCLIP_Ago2_binding_clusters                      0.087680  0.848680   
4_HITSCLIP_Ago2_binding_clusters_2                    0.113846  0.855846   
5_CLIPSEQ_AGO2_hg19                                   0.109425  0.702425   
6_CLIP-seq-eIF4AIII_1                                 0.033631  0.896631   
7_CLIP-seq-eIF4AIII_2                                 0.072886  0.935886   
8_PARCLIP_ELAVL1_hg19                                 0.025302  0.949302   
9_PARCLIP_ELAVL1MNASE_hg19                            0.124679  0.718679   
10_PARCLIP_ELAVL1A_hg19                              -0.030724  0.929276   
10_PARCLIP_ELAVL1A_hg19                              -0.038696  0.917304   
12_PARCLIP_EWSR1_hg19                                 0.057678  0.920678   
13_PARCLIP_F