In [28]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
from eden.util import configure_logging
import logging
logger = logging.getLogger()
configure_logging(logger,verbosity=0)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [29]:
import cPickle as pickle
def save_data(data, fname='data.dat'):
    with open(fname, 'wb') as outfile:
        pickle.dump(data, outfile, pickle.HIGHEST_PROTOCOL)
    if len(data.shape)==1:
        print 'Saved %s (%d)' % (fname, data.shape[0])
    else:
        print 'Saved %s (%d,%d)' % (fname, data.shape[0], data.shape[1])

def load_data(fname='data.dat'):
    with open(fname, 'rb') as infile:
        data = pickle.load(infile)
    if len(data.shape)==1:
        print 'Loaded %s (%d)' % (fname, data.shape[0])
    else:
        print 'Loaded %s (%d,%d)' % (fname, data.shape[0], data.shape[1])
    return data

#Predicitve model performance evaluation

In [30]:
def train_perform(X_train, y_train, X_test,y_test):
    # Induce a predictive model
    print 'Training on data matrix [%d x %d]' %(X_train.shape)
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(average=True, class_weight='auto', shuffle=True, n_jobs=-1)
    estimator.fit(X_train,y_train)

    # Print predictive performance
    from sklearn.metrics import confusion_matrix
    print 'Confusion matrix:'
    print(confusion_matrix(y_test, estimator.predict(X_test)))
    print
    from sklearn.metrics import classification_report
    print 'Classification Report:'
    print classification_report(y_test, estimator.predict(X_test))

#Data preparation

Split data into train/test according to binary clustering

In [31]:
def train_test_ids_split(X, confidence_threshold=1):
    from sklearn.cluster import MiniBatchKMeans
    kmeans = MiniBatchKMeans(n_clusters=2)
    classes = kmeans.fit_predict(X)
    
    from sklearn.linear_model import SGDClassifier
    estimator = SGDClassifier(average=True, class_weight='balanced', shuffle=True, n_jobs=-1)
    estimator.fit(X,classes)
    conf = estimator.decision_function(X)
    
    train_ids = []
    test_ids = []
    for i,(class_info, conf_info) in enumerate(zip(classes, conf)):
        if abs(conf_info) > confidence_threshold:
            if class_info == 0:
                train_ids.append(i)
            else:
                test_ids.append(i)
    return train_ids, test_ids

Retrieve seed sequences from RFam families.

Use EDeN to transform sequences to vectors. Use RNAfold to create structures and EDeN to convert those to vectors.

Use the sequence vectors to guide the train/test split. 

In [32]:
from sklearn.random_projection import SparseRandomProjection
import time

def rfam_uri(family_id):
        return 'http://rfam.xfam.org/family/%s/alignment?acc=%s&format=fastau&download=0'%(family_id,family_id)
    
#RNAVectorizer
def pre_processor(seqs):
    n_neighbors = min(len(seqs),30)
    rs = int(time.time())
    from eden.RNA import Vectorizer as RNAVectorizer
    rnavec=RNAVectorizer(n_neighbors=n_neighbors,
                         sampling_prob=.15,
                         n_iter=5,
                         min_energy=-5,
                         random_state=rs)
    rnavec.fit(seqs)
    graphs = rnavec.graphs(seqs)
    #from eden.modifier.graph import structure 
    #graphs = structure.basepair_to_nesting(graphs)
    return graphs

#RNAfold
def pre_processor(seqs):
    from eden.converter.rna.rnafold import rnafold_to_eden
    graphs = rnafold_to_eden(seqs)
    #from eden.modifier.graph import structure 
    #graphs = structure.basepair_to_nesting(graphs)
    return graphs

#RNAplfold
def pre_processor(seqs):
    from eden.converter.rna.rnaplfold import rnaplfold_to_eden
    graphs = rnaplfold_to_eden(seqs,
                               window_size = 250,
                               max_bp_span = 35,
                               avg_bp_prob_cutoff = 0.01,
                               max_num_edges = 10,
                               no_lonely_bps=True,
                               nesting=True)
    return graphs
    
def rfam_to_matrix(rfam_id, use_structure=True, n_max=50, complexity=2, nbits=10):
    from eden.converter.fasta import fasta_to_sequence
    seqs = fasta_to_sequence(rfam_uri(rfam_id))
    from itertools import islice
    seqs = islice(seqs,n_max)
    seqs = list(seqs)
    if use_structure:
        graphs = pre_processor(seqs)
        from eden.graph import Vectorizer as GraphVectorizer
        vectorizer = GraphVectorizer(complexity=complexity,nbits=nbits)
        X = vectorizer.transform(graphs)
    else:
        from eden.path import Vectorizer as SeqVectorizer
        vectorizer = SeqVectorizer(complexity=complexity,nbits=nbits)
        X = vectorizer.transform(seqs)
    return X

def rfam_data(rfam_ids, n_max=300, complexity=3, nbits=13):
    import numpy as np
    from scipy.sparse import vstack
    seq_train_list = []
    seq_test_list = []
    struct_train_list = []
    struct_test_list = []
    y_train = []
    y_test = []
    for i,rfam_id in enumerate(rfam_ids):
        # seq case
        seq_X=rfam_to_matrix(rfam_id, use_structure=False, n_max=n_max, complexity=complexity, nbits=nbits)
        train_ids, test_ids = train_test_ids_split(seq_X)
        seq_X_train = seq_X[train_ids]
        seq_X_test = seq_X[test_ids]
        seq_train_list.append(seq_X_train)
        seq_test_list.append(seq_X_test)
        y_train += [i] * seq_X_train.shape[0]
        y_test += [i] * seq_X_test.shape[0]
        
        # struct case
        struct_X=rfam_to_matrix(rfam_id, use_structure=True, n_max=n_max, complexity=complexity, nbits=nbits)
        #NOTE: use the same split given by the sequence similarity
        struct_X_train = struct_X[train_ids]
        struct_X_test = struct_X[test_ids]
        struct_train_list.append(struct_X_train)
        struct_test_list.append(struct_X_test)
    seq_X_train = vstack(seq_train_list, format="csr")
    seq_X_test = vstack(seq_test_list, format="csr")
    struct_X_train = vstack(struct_train_list, format="csr")
    struct_X_test = vstack(struct_test_list, format="csr")
    target_train = np.array(y_train)
    target_test = np.array(y_test)
    
    return seq_X_train,\
        seq_X_test,\
        struct_X_train,\
        struct_X_test,\
        target_train,\
        target_test

rfam_ids=['RF00004','RF00005','RF00015','RF00020','RF00026','RF00169',
          'RF00380','RF00386','RF01051','RF01055','RF01234','RF01699',
          'RF01701','RF01705','RF01731','RF01734','RF01745','RF01750',
          'RF01942','RF01998','RF02005','RF02012','RF02034']

print 'num families:', len(rfam_ids)

num families: 23


In [33]:
data_ids=rfam_ids[0:5]
print 'Experiment using %d families' % len(data_ids)
prefix='f3_c4nb12_'

Experiment using 5 families


In [34]:
%%time
seq_X_train,\
seq_X_test,\
struct_X_train,\
struct_X_test,\
y_train,\
y_test = rfam_data(data_ids, n_max=300, complexity=4, nbits=11)

CPU times: user 1min 58s, sys: 11 s, total: 2min 9s
Wall time: 2min 28s


In [35]:
%%time
save_data(seq_X_train, fname=prefix+'seq_X_train.dat')
save_data(seq_X_test, fname=prefix+'seq_X_test.dat')
save_data(struct_X_train, fname=prefix+'struct_X_train.dat')
save_data(struct_X_test, fname=prefix+'struct_X_test.dat')
save_data(y_train, fname=prefix+'y_train.dat')
save_data(y_test, fname=prefix+'y_test.dat')

Saved f3_c4nb12_seq_X_train.dat (353,2049)
Saved f3_c4nb12_seq_X_test.dat (684,2049)
Saved f3_c4nb12_struct_X_train.dat (353,2049)
Saved f3_c4nb12_struct_X_test.dat (684,2049)
Saved f3_c4nb12_y_train.dat (353)
Saved f3_c4nb12_y_test.dat (684)
CPU times: user 15 ms, sys: 107 ms, total: 122 ms
Wall time: 380 ms


In [36]:
%%time
seq_X_train=load_data(fname=prefix+'seq_X_train.dat')
seq_X_test=load_data(fname=prefix+'seq_X_test.dat')
struct_X_train=load_data(fname=prefix+'struct_X_train.dat')
struct_X_test=load_data(fname=prefix+'struct_X_test.dat')
y_train=load_data(fname=prefix+'y_train.dat')
y_test=load_data(fname=prefix+'y_test.dat')

Loaded f3_c4nb12_seq_X_train.dat (353,2049)
Loaded f3_c4nb12_seq_X_test.dat (684,2049)
Loaded f3_c4nb12_struct_X_train.dat (353,2049)
Loaded f3_c4nb12_struct_X_test.dat (684,2049)
Loaded f3_c4nb12_y_train.dat (353)
Loaded f3_c4nb12_y_test.dat (684)
CPU times: user 17 ms, sys: 45.5 ms, total: 62.5 ms
Wall time: 61 ms


In [37]:
%%time
# sparse to dense arrays
Xseq_train = seq_X_train
Xseq_test = seq_X_test
X_train = struct_X_train
X_test = struct_X_test

CPU times: user 31 µs, sys: 5.08 ms, total: 5.11 ms
Wall time: 5.11 ms


In [38]:
%%time
# sparse to dense arrays
Xseq_train = seq_X_train.toarray()
Xseq_test = seq_X_test.toarray()
X_train = struct_X_train.toarray()
X_test = struct_X_test.toarray()

CPU times: user 26.9 ms, sys: 10.2 ms, total: 37.1 ms
Wall time: 36.5 ms


#Sequence features

In [39]:
%%time
train_perform(Xseq_train, y_train, Xseq_test,y_test)

Training on data matrix [353 x 2049]
Confusion matrix:
[[126   0  18   1   0]
 [  0  22 226   3   0]
 [  0   0  61   0   0]
 [  5   0   9  48   0]
 [  1  18 106  22  18]]

Classification Report:
             precision    recall  f1-score   support

          0       0.95      0.87      0.91       145
          1       0.55      0.09      0.15       251
          2       0.15      1.00      0.25        61
          3       0.65      0.77      0.71        62
          4       1.00      0.11      0.20       165

avg / total       0.72      0.40      0.38       684

CPU times: user 69.1 ms, sys: 5.33 ms, total: 74.5 ms
Wall time: 150 ms




#Structure features

In [40]:
%%time
train_perform(X_train, y_train, X_test,y_test)

Training on data matrix [353 x 2049]




Confusion matrix:
[[110   1  29   4   1]
 [  5 136  73  32   5]
 [ 23   5  28   5   0]
 [ 12   3  15  31   1]
 [  1  78  19  37  30]]

Classification Report:
             precision    recall  f1-score   support

          0       0.73      0.76      0.74       145
          1       0.61      0.54      0.57       251
          2       0.17      0.46      0.25        61
          3       0.28      0.50      0.36        62
          4       0.81      0.18      0.30       165

avg / total       0.61      0.49      0.49       684

CPU times: user 77.1 ms, sys: 5.14 ms, total: 82.3 ms
Wall time: 150 ms


#Learned map seq $\mapsto$ structure features

In [24]:
from sklearn.preprocessing import StandardScaler
seqs_scale = StandardScaler(with_mean=True)
data_matrix_in = seqs_scale.fit_transform(Xseq_train)
struct_scale = StandardScaler(with_mean=True)
data_matrix_out = struct_scale.fit_transform(X_train)
n_features_in = data_matrix_in.shape[1]
n_features_out = data_matrix_out.shape[1]
n_features_hidden = max(n_features_in, n_features_out) * 2
print 'n_neurons: #in [%d] -- #hidden [%d] -- #out [%d]' % (n_features_in, n_features_hidden, n_features_out)

n_neurons: #in [2049] -- #hidden [4098] -- #out [2049]


In [25]:
from sknn.mlp import Regressor, Layer

net = Regressor(layers=[
        Layer("Rectifier", units=n_features_hidden),
        Layer("Rectifier", units=n_features_hidden),
        Layer("Rectifier", units=n_features_hidden),
        Layer("Softmax", units=n_features_out)],
                learning_rate=0.000001,
                n_iter=100,
                regularize='L1',
                valid_size=0.2)

In [None]:
%%time
net.fit(data_matrix_in, data_matrix_out)

In [None]:
%%time
# Transform seq features to struct features
X_train_pred = net.predict(seqs_scale.transform(Xseq_train))
X_test_pred = net.predict(seqs_scale.transform(Xseq_test))

In [None]:
%%time
train_perform(X_train_pred, y_train, X_test_pred, y_test)

---