In [1]:
import pandas as pd
import numpy as np
import scipy
import scipy.sparse
import scipy.stats
import os
import scipy.io as sio
import dnatools
from MLR import MLR
%matplotlib inline
from pylab import *

# Plot settings shared by the figures in this notebook:
fsize = 14  # presumably the font size used when drawing figures; not referenced in the cells shown here
rc('mathtext', default='regular')  # set matplotlib's 'mathtext.default' rc parameter to 'regular'


  .format(openpyxl_compat.start_ver, openpyxl_compat.stop_ver))


### Make directory to save results:

In [2]:
# Output locations for this notebook. Both paths are relative to the
# notebook's working directory and are created if they do not exist yet.
resultsdir = '../results/N9_Training_Joint_A5SS_A3SS_Model/'
figdir = '../figures/N9_Training_Joint_A5SS_A3SS_Model/'
for outdir in (resultsdir, figdir):
    if not os.path.exists(outdir):
        os.makedirs(outdir)

# Set to False to skip writing plots to disk:
SAVEFIGS = True

### Load library data:

In [3]:
# Read-count matrices for both libraries live in a single .mat file.
data = sio.loadmat('../data/Reads.mat')

# ---- A5SS ----
# Densify the stored sparse matrix so it can be row-normalized below.
A5SS_data = np.array(data['A5SS'].todense())
# Indices of minigenes (rows) whose read total is non-zero
A5SS_nn = find(A5SS_data.sum(axis=1))
A5SS_data = A5SS_data[A5SS_nn]
# Divide every row by its own read total
A5SS_data = A5SS_data/A5SS_data.sum(axis=1)[:,newaxis]
# Keep the sequences for the same set of rows
A5SS_seqs = pd.read_csv('../data/A5SS_Seqs.csv',index_col=0).Seq[A5SS_nn]

# ---- A3SS ----
A3SS_data = data['A3SS']
# Only look at SA_1 usage: reads in column 235 divided by the row's total reads.
# Rows with zero total reads come out as NaN and are dropped just below.
A3SS_data = (
    np.array(A3SS_data[:,235].todense()).reshape(-1)
    / np.array(A3SS_data.sum(axis=1),dtype=np.float64).reshape(-1)
)
# Indices of minigenes with a defined (non-NaN) usage value
A3SS_nn = find(pd.notnull(A3SS_data))
A3SS_data = A3SS_data[A3SS_nn]
A3SS_seqs = pd.read_csv('../data/A3SS_Seqs.csv',index_col=0).Seq[A3SS_nn]


### Load the 6-mer scores from the splice site model:

In [4]:
# Load Splice Site Model
# NOTE: this rebinds `data`, which previously held the read matrices.
data = sio.loadmat('../results/N7_A5SS_Model_Predictions/model_full_data.mat')
# The first 4**6*8 entries of 'Mer_scores' are viewed as one row per 6-mer
# with 8 columns; only columns 2..5 are kept.
# (presumably the 8 columns are positions/regions -- confirm against notebook N7)
sd_scores = pd.DataFrame(
    index=dnatools.make_mer_list(6),
    data=data['Mer_scores'][:4**6*8].reshape(4**6,8)[:,2:6],
)

We will use the second randomized region from the A3SS library. However, we will not use the first 3 bases, since they are right on the intron-exon boundary. For the A5SS library we will use the whole first randomized region.

In [5]:
# ---- A3SS library ----
# Features: 6-mer matrix (no position information) built from the last 22 nt
# of each sequence, i.e. the second randomized region minus its first 3 bases.
X_A3 = dnatools.make_mer_matrix_no_pos(pd.Series(A3SS_seqs).str.slice(-22),6)
# Targets: two columns per minigene, (1 - usage, usage)
Y = A3SS_data
Y3 = scipy.matrix(np.column_stack((1-Y,Y)))

# ---- A5SS library ----
# Features: 6-mer matrix built from characters 7..31 of each sequence.
X_A5 = dnatools.make_mer_matrix_no_pos(pd.Series(A5SS_seqs).str.slice(7,32),6)
# Usage of column 44 relative to columns 0 and 44 combined; NaN where both are 0.
Y = A5SS_data[:,44]/(A5SS_data[:,0]+A5SS_data[:,44])
# Keep only the minigenes for which that ratio is defined
nn = find(pd.notnull(Y))
X_A5 = X_A5[nn,:]
Y5 = scipy.matrix(np.column_stack((1-Y[nn],Y[nn])))

0 10000 20000 30000 40000 50000 60000 70000 80000 90000 100000 110000 120000 130000 140000 150000 160000 170000 180000 190000 200000 210000 220000 230000 240000 250000 260000 270000 280000 290000 300000 310000 320000 330000 340000 350000 360000 370000 380000 390000 400000 410000 420000 430000 440000 450000 460000 470000 480000 490000 500000 510000 520000 530000 540000 550000 560000 570000 580000 590000 600000 610000 620000 630000 640000 650000 660000 670000 680000 690000 700000 710000 720000 730000 740000 750000 760000 770000 780000 790000 800000 810000 820000 830000 840000 850000 860000 870000 880000 890000 900000 910000 920000 930000 940000 950000 960000 970000 980000 990000 1000000 1010000 1020000 1030000 1040000 1050000 1060000 1070000 1080000 1090000 1100000 1110000 1120000 1130000 1140000 1150000 1160000 1170000 1180000 1190000 1200000 1210000 1220000 1230000 1240000 1250000 1260000 1270000 1280000 1290000 1300000 1310000 1320000 1330000 1340000 1350000 1360000 1370000 1380000 13

In [6]:
# Stack the A3SS rows on top of the A5SS rows, for features and targets alike:
Xcomb = scipy.sparse.csr_matrix(scipy.sparse.vstack((X_A3,X_A5)))
Y = scipy.vstack((Y3,Y5))
# Extra indicator column: 1 for every A3SS row, 0 for every A5SS row,
# so the A3SS events can get their own bias term.
X_comb_A3_bias = np.zeros(X_A3.shape[0]+X_A5.shape[0])
X_comb_A3_bias[:X_A3.shape[0]] = 1
X_comb_A3_bias = scipy.sparse.csr_matrix(X_comb_A3_bias[:,newaxis])
# Final training matrix: 6-mer features followed by the A3SS indicator column.
XA3A5 = scipy.sparse.csr_matrix(scipy.sparse.hstack((Xcomb,X_comb_A3_bias)))

I'm not bothering to optimize the regularization parameter $\lambda$. Let's just add a little L2 regularization.

In [7]:
# Fit the joint A3SS + A5SS model with a small, untuned L2 penalty.
mlr = MLR(verbose=True)
mlr.fit(
    XA3A5,
    Y,
    reg_type='L2',
    reg_lambda=1e-6,
    maxit=500,
)

0.643368979 0.246430117361 0.231122900451 0.199076908436 0.167345538898 0.148949729845 0.144812737439 0.138456489141 0.133479678746 0.128656811894 0.127269765104 0.126507080996 0.125529387824 0.124842479276 0.125076011919 0.124606977309 0.124446807912 0.124223724281 0.123971936462 0.123824148106 0.124355314712 0.123698025625 0.123596759749 0.123365648984 0.123252619311 0.123032904009 0.122867253489 0.123332093519 0.122850987197 0.122781514866 0.122696222625 0.122582621032 0.122459504875 0.122326227556 0.12227247575 0.122179617547 0.122153257007 0.122109386283 0.122053306507 0.122037568295 0.121940116416 0.12192270601 0.121904854719 0.122078789747 0.121875419246 0.121842648636 0.12181166061 0.121793228437 0.121782505887 0.121757465815 0.121738971045 0.121712108731 0.121760222175 0.121707091873 0.12170107325 0.121698619291 0.121692175185 0.121679509864 0.121665688829 0.121694432531 0.121660777493 0.121653237356 0.121646821452 0.121637334972 0.121630612313 0.121633431391 0.12162942698 0.1

In [8]:
# Pair each 6-mer with twice its weight in column 1 of the fitted weight matrix.
# zip() stops at the shorter input, so any weights beyond the 6-mer list
# (e.g. the extra A3SS indicator column) are not exported.
# NOTE(review): the factor of 2 is presumably a conversion to a log-odds scale -- confirm against MLR.
mer6_names = dnatools.make_mer_list(6)
mer6_weights = mlr.W[:,1]*2
exonic_mer6_scores = pd.Series(dict(zip(mer6_names,mer6_weights)))
exonic_mer6_scores.to_pickle(resultsdir+'exonic_mer6_scores.series')

### Get 3-mer effect sizes for start of exon directly from 3' data:

Now we want to score the first three nucleotides of an exon differently, since this region overlaps the splice acceptor. We can use the A3SS library and look at the odds of splicing at a new splice acceptor depending on the +1 to +3 bases.

In [9]:
# Reload the raw read counts: `data` was rebound to the splice site model
# file when that model was loaded, so the A3SS read matrix must be read again.
data = sio.loadmat('../data/Reads.mat')
# Total reads per minigene as a flat float vector
reads = np.array(data['A3SS'].sum(axis=1)).reshape(-1).astype(float)
# Boolean mask of minigenes with at least one read.
# NOTE(review): assumed to select the same rows as A3SS_nn, which A3SS_seqs was filtered by.
nn = reads>0
# First randomized region: the first 25 nt of each sequence
r1 = pd.Series(A3SS_seqs).str.slice(0,25)
# Second randomized region: the last 25 nt of each sequence.
# This was previously slice(0,25), identical to r1, which made the
# second-region tally group its splice counts by first-region 3-mers.
# The last-25 definition matches the str.slice(-22) used for the model
# features (second region minus its first 3 bases).
r2 = pd.Series(A3SS_seqs).str.slice(-25)
# Read totals restricted to minigenes with reads
read_series = pd.Series(reads[nn])

Count the number of spliced reads at new splice acceptors for each potential +1 to +3 sequence:<br>
AAA = some number of spliced reads<br>
...<br>
TTT = some other number of spliced reads

In [10]:
# First randomized region
splice_reads = {}
read_sum = {}
for i in range(23):
    sliced = r1.str.slice(0+i,3+i).values
    splice_reads[i] = pd.Series(np.array(data['A3SS'][:,235-21-25+i][nn].todense()).reshape(-1)).groupby(sliced).sum()
    read_sum[i] = read_series.groupby(sliced).sum()
    print i,

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22


In [11]:
# Second randomized region
for i in range(23):
    sliced = r2.str.slice(0+i,3+i).values
    splice_reads[i+23] = pd.Series(np.array(data['A3SS'][:,235+3+i][nn].todense()).reshape(-1)).groupby(sliced).sum()
    read_sum[i+23] = read_series.groupby(sliced).sum()
    print i,

0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22


In [12]:
def logit(x):
    """Return the log-odds log(x) - log(1-x) of x."""
    return log(x)-log(1-x)

# One column per window position, one row per 3-mer
splice_reads_df = pd.DataFrame(splice_reads)
read_sum_df = pd.DataFrame(read_sum)
# Score per 3-mer: log-odds of its pooled splicing fraction (summed over all
# window positions) minus the log-odds of the overall pooled fraction.
exonic_acceptor_scores = (
    logit(splice_reads_df.sum(axis=1)/read_sum_df.sum(axis=1))
    - logit(splice_reads_df.sum().sum()/read_sum_df.sum().sum())
)
exonic_acceptor_scores.to_pickle(resultsdir+'exonic_acceptor_scores.series')