In [None]:
import pandas as pd
import numpy as np
from pylab import *
import scipy.stats
import random
import time
import json
import copy
import re
import math
import operator
rc('mathtext', default='regular')
import os
import string
import ast
import scipy.io as sio
from sklearn.metrics import roc_curve, auc
import MLR_L2_fixed_weights
import MLR_L1
import scipy.ndimage
fsize=14
%matplotlib inline

In [None]:
# Nucleotide alphabet; make_mer_list/add_base extend n-mers in this order.
bases = list('ATCG')
# Base -> integer code, numbered in the order of `bases`.
dna_dict = {base: code for code, base in enumerate(bases)}
# Complement lookup; 'N', '.' and '*' map to themselves.
watsoncrick = dict(zip('ATCGN.*', 'TAGCN.*'))
def add_base(li):
    """Extend every string in ``li`` by one base (helper for make_mer_list).

    For each input string, one new string per entry of the module-level
    ``bases`` list is produced, in the order of ``bases``; input order is
    preserved. Returns a new list and leaves ``li`` untouched.
    """
    return [prefix + base for prefix in li for base in bases]

def make_mer_list(mer_len):
    """Return all n-mers of length ``mer_len`` over ``bases`` as a list of strings.

    The list is built by calling ``add_base`` ``mer_len`` times starting from
    the empty string, so the first base varies slowest and each position
    cycles through ``bases`` in order.

    Parameters
    ----------
    mer_len : int
        Length of the n-mers; must be >= 0. ``0`` yields ``['']``.

    Returns
    -------
    list of str
        A freshly built list with ``len(bases)**mer_len`` entries. It is never
        the module-level ``bases`` object, so callers may mutate it safely.

    Raises
    ------
    ValueError
        If ``mer_len`` is negative.
    """
    if mer_len < 0:
        raise ValueError('mer_len must be non-negative, got %r' % (mer_len,))
    # Start from the single empty prefix rather than aliasing `bases`:
    # the old `li = bases` handed the shared global list back for mer_len <= 1.
    li = ['']
    for _ in range(mer_len):
        li = add_base(li)
    return li
		

def reverse_complement(seq):
    """Return the reverse complement of ``seq`` as a string.

    Each character is translated through the module-level ``watsoncrick``
    table and the result is emitted in reverse order. A character missing
    from the table raises KeyError.
    """
    complemented = [watsoncrick[s] for s in seq]
    return ''.join(complemented[::-1])

In [None]:
# Directory holding this run's saved results (train/test indices, pickled R^2 table).
# Nothing in the visible notebook creates it; the os.mkdir call is left commented out.
result_dir = '../results/20150614.Subsampling/'
#os.mkdir(result_dir)
# Directory the figures are saved into; likewise not created by the visible code.
fig_dir = '../doc/figures/20150614.Subsampling/'
#os.mkdir(fig_dir)

In [None]:
# A5SS
# A5SS
# Load the 'HEK' entry of the .mat file and convert it from sparse to a dense ndarray.
A5SS_data = sio.loadmat('../data/nobackup/A5SS/Alt_5SS_Usage_All_Cells.mat')['HEK'].toarray()
# Indices of rows whose total is non-zero (all-zero rows cannot be normalised).
A5SS_nn = np.flatnonzero(A5SS_data.sum(axis=1))
A5SS_data = A5SS_data[A5SS_nn]
# Rescale every remaining row so that it sums to 1.
A5SS_data = A5SS_data/A5SS_data.sum(axis=1,keepdims=True)
# Keep only the sequences belonging to the retained rows.
A5SS_seqs = pd.read_csv('../data/nobackup/A5SS/Alt_5SS_Tag_to_Seq_Map.csv',index_col=0)['Seq'].values[A5SS_nn]

In [None]:
def make_mer_matrix_no_pos(seqs,mer_len,start,end):
    mer_dict = dict(zip(make_mer_list(mer_len),range(4**mer_len)))
    rows,cols = [],[]
    r = 0
    for i in xrange(len(seqs)):
        cur_seq = seqs[i]
        for b in range(len(cur_seq)-mer_len+1):
            rows.append(r)
            cols.append(mer_dict[cur_seq[b:b+mer_len]])
        if(r%10000)==0:
            print r,
        r+=1
    vals = np.ones_like(cols)
    rows.append(r-1)
    cols.append(4**mer_len-1)
    vals = np.append(vals,0)
    X = scipy.sparse.csr_matrix((vals,(rows,cols)),dtype=np.float64)
    return X

In [None]:
# One feature matrix per k-mer length (3..7): k-mer counts for two sequence
# regions, placed side by side so each region gets its own block of columns.
X = {}
for mer_len in range(3,8):
    # Window bounds are widened by mer_len-1 on both sides of each region.
    r1_start,r1_end = 7-mer_len+1,32+mer_len-1
    r2_start,r2_end = 50-mer_len+1,75+mer_len-1
    X_r1 = make_mer_matrix_no_pos(A5SS_seqs,mer_len,r1_start,r1_end)
    X_r2 = make_mer_matrix_no_pos(A5SS_seqs,mer_len,r2_start,r2_end)
    X[mer_len] = scipy.sparse.hstack((X_r1,X_r2),format='csr')

In [None]:
# Two-column target matrix: column 1 is the first column of A5SS_data,
# column 0 is its complement (1 - value), so each row sums to 1.
first_col = A5SS_data[:,0]
Y = np.matrix(np.column_stack((1-first_col,first_col)))

In [None]:
# Toggle: True draws a fresh random 90/10 train/test split of the sequence
# indices; False reloads the split previously saved under result_dir.
draw_new_split = False
if draw_new_split:
    inds = range(len(A5SS_seqs))
    shuffle(inds)
    n_train = int(len(inds)*0.9)
    train_set,test_set = inds[:n_train],inds[n_train:]
else:
    # loadtxt yields floats, so cast back to integer indices.
    train_set = np.loadtxt(result_dir+'training_inds').astype(int)
    test_set = np.loadtxt(result_dir+'test_inds').astype(int)

In [None]:
#np.savetxt(result_dir+'training_inds',np.array(train_set))
#np.savetxt(result_dir+'test_inds',np.array(test_set))

In [None]:
# Re-import MLR_L1 in the running kernel (Python 2 builtin `reload`),
# presumably to pick up edits made to the module file — TODO confirm still needed.
reload(MLR_L1)

In [None]:
# Training-set sizes: 10**2 ... 10**5.25 in exponent steps of 0.25 (14 values),
# truncated to int64.
data_sizes = (10**np.arange(2,5.26,0.25)).astype(np.int64)
# Values passed as the third argument of MLR_L1.MLR: 1e-1 down to 1e-8.
lambdas = 10**np.arange(-1,-9,-1.)

In [None]:
model_weights = {}
for L in lambdas:
    model_weights[L] = {}
    print '-----------------Lambda:',L
    for mer_len in range(3,8):
        model_weights[L][mer_len] = {}
        print '-----------------mer_len:',mer_len
        sys.stdout.flush()
        print '-----------------Data Size:',
        for data_size in data_sizes:
            print data_size,
            model_weights[L][mer_len][data_size],_,_ = MLR_L1.MLR(X[mer_len][train_set[:data_size]],
                                                         Y[train_set[:data_size]],
                                                         L)

In [None]:
model_preds = {}
for L in lambdas:
    model_preds[L] = {}
    print '-----------------Lambda:',L
    for mer_len in range(3,8):
        model_preds[L][mer_len] = {}
        print '-----------------mer_len:',mer_len
        for data_size in data_sizes:
            print data_size,
            model_preds[L][mer_len][data_size] = MLR_L2_fixed_weights.predict(X[mer_len][test_set],
                                                                       model_weights[L][mer_len][data_size])
        print ''

In [None]:
R2s = {}
for L in lambdas:
    R2s[L] = {}
    for mer_len in range(3,8):
        R2s[L][mer_len] = {}
        print '-----------------mer_len:',mer_len
        for data_size in data_sizes:
            R2s[L][mer_len][data_size] = scipy.stats.pearsonr(model_preds[L][mer_len][data_size][:,1],Y[test_set][:,1])[0][0]**2

In [None]:
# Wrap the nested {lambda: {mer_len: {data_size: R^2}}} dict in a pandas Panel.
# NOTE(review): rebinding R2s makes this cell order-dependent — re-running it feeds a Panel back into pd.Panel.
R2s = pd.Panel(R2s)

In [None]:
# Persist the R^2 table under result_dir so the later cells can run without refitting.
R2s.to_pickle(result_dir+'Subsampling_R2.panel')

In [None]:
# Reload the saved R^2 table (replaces whatever R2s currently holds).
R2s = pd.read_pickle(result_dir+'Subsampling_R2.panel')

In [None]:
# Maximum along axis 0 of the Panel — presumably the lambda axis, i.e. the best
# R^2 over regularisation strengths for each (data size, mer_len); TODO confirm axis order.
R2s.apply(max,axis=0)

In [None]:
# Learning curve: one line per column of R2_maxes (first 14 rows), log-scaled x axis.
R2_maxes = R2s.apply(max,axis=0).iloc[:14]
fig,ax = subplots(figsize=(9,4))
markers = ['o','s','v','D','p']
# One marker style per column; pandas draws onto the current axes (ax).
for c,col in enumerate(R2_maxes.columns):
    R2_maxes[col].plot(label=str(col)+'-mer',marker=markers[c])
ax.set_xscale('log')
# The explicit label list replaces the per-line labels set above.
leg = legend([str(i)+'-mers' for i in range(3,8)],bbox_to_anchor=(1.25,1),numpoints=1,fontsize=fsize)
leg.get_frame().set_alpha(0)
leg.set_title('Features')
setp(leg.get_title(),fontsize=fsize)
ax.set_xlabel('Number of Training Points',fontsize=fsize)
ax.set_ylabel('$R^2$',fontsize=fsize)
ax.tick_params(labelsize=fsize)
ax.set_xlim(90,200000)
ax.set_title('A5SS Library Learning Curve ($SD_1$)',fontsize=fsize)
if True:
    # Save the same figure in every format needed for the write-up.
    figname = 'Learning_Curve'
    for ext in ('.png','.pdf','.eps'):
        fig.savefig(fig_dir+figname+ext,bbox_inches='tight', dpi = 300)

In [None]:
# Quick look: every column of R2s against its index, log-scaled x axis.
fig,ax = subplots()
pd.DataFrame(R2s).plot(ax=ax,marker='o')
ax.set_xscale('log')
leg = ax.legend(bbox_to_anchor=(1.4,1))

In [None]:
# Snapshot of the current R2s as a DataFrame.
# NOTE(review): the name suggests a specific lambda / L1 run, but the value depends on
# whatever R2s held when this cell was executed — confirm provenance.
R2_0001_L1 = pd.DataFrame(R2s)


In [None]:
# Snapshot of the current R2s as a DataFrame.
# NOTE(review): same expression as the neighbouring snapshot cells; which run it captures
# depends on execution order — confirm provenance.
R2_0001 = pd.DataFrame(R2s)

In [None]:
# Snapshot of the current R2s as a DataFrame.
# NOTE(review): same expression as the neighbouring snapshot cells; which run it captures
# depends on execution order — confirm provenance.
R2_00001 = pd.DataFrame(R2s)


In [None]:
# Overlay two saved R^2 snapshots on one log-x axis; the legend is rebuilt after each frame.
fig,ax = subplots()
for snapshot in (R2_00001,R2_0001_L1):
    snapshot.plot(ax=ax,marker='o')
    ax.set_xscale('log')
    leg = legend(bbox_to_anchor=(1.4,1))

In [None]:
# NOTE(review): `R2` is not assigned anywhere in the visible notebook, so this cell
# raises NameError on a fresh kernel unless it is defined elsewhere — likely a leftover.
R2