In [None]:
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook
from src.utils import *
from src.spectrum import *
from sklearn import svm

In [None]:
# Load the three datasets (indices 0, 1, 2) through the project's `load` helper:
# training inputs, the X=False variant (presumably the training labels — confirm
# against `load`), and the test inputs.
train_Xs = [load(k=idx) for idx in range(3)]
train_Ys = [load(X=False, k=idx) for idx in range(3)]
test_Xs = [load(train=False, k=idx) for idx in range(3)]

# The exploration below works on dataset 0 only.
X, Y = train_Xs[0], train_Ys[0]
print(X)
print(Y)
print(X.shape, Y.shape)

In [None]:
# Length of every row of X; print the shortest and the longest one.
L = np.array(list(map(len, X)))
print(L.min(), L.max())

In [None]:
s = 'ATCG'  # nucleotide letters; the plots below label one-hot channel i with s[i]
def one_hot(X, n_classes=4):
    '''One-hot encode integer codes: (...) -> (..., n_classes), e.g. NxT -> NxTx4.

    Parameters
    ----------
    X : array-like of integer codes, any number of dimensions.
    n_classes : number of channels appended as the last axis (default 4).

    Returns
    -------
    Boolean array of shape X.shape + (n_classes,); entry [..., c] is True
    exactly where X equals c. Codes outside range(n_classes) give an
    all-False row.
    '''
    X = np.asarray(X)
    # The ellipsis keeps this valid for any rank (a single sequence, a batch, ...),
    # and broadcasting against the 1-D class vector adds the trailing axis.
    return X[..., None] == np.arange(n_classes)
# One-hot encode the first training set; the printed shape is X's shape plus a trailing axis of size 4.
X1 = one_hot(X)
print(X1.shape)

In [None]:
# Row indices of the bound (non-zero label) and unbound (zero label) sequences.
# `Y == 0` is used instead of `~Y`: on an integer 0/1 array `~` is a bitwise NOT
# (0 -> -1, 1 -> -2), so every entry would be non-zero and Iunbound would select
# all rows. For a boolean Y both forms give the same result.
Ibound = np.nonzero(Y)[0]
Iunbound = np.nonzero(Y == 0)[0]
def plot_correl(f, X=X1):
    """Draw `f` side by side for the bound and the unbound sequences.

    `f` is called once per panel with the selected rows of `X` (rows `Ibound`
    on the left, rows `Iunbound` on the right) and is expected to draw on the
    current matplotlib axes; a title and a legend are added after each call.
    `X` defaults to the array `X1` as it exists when this definition is run.
    """
    plt.figure(figsize=(15,5))

    panels = ((Ibound, 'Bound sequences'), (Iunbound, 'Unbound sequences'))
    for position, (rows, title) in enumerate(panels, start=1):
        plt.subplot(1, 2, position)
        f(X[rows])
        plt.title(title)
        plt.legend()

    plt.tight_layout()
    plt.show()

In [None]:
# Mean of the labels, shown as a whole percentage.
bound_pct = 100 * Y.mean()
print('{:.0f}% of bound sequences'.format(bound_pct))

In [None]:
def f(X1):
    """Histogram of per-sequence nucleotide counts, one series per letter of `s`."""
    # Summing the one-hot array over axis 1 leaves one count per (sequence, channel).
    counts = X1.sum(axis=1)
    plt.hist(counts, label=list(s))
    plt.xlabel('Number of occurences of nucleotide')
    plt.ylabel('Number of sequences')

plot_correl(f)

In [None]:
def f(X1):
    """Plot, for each of the four channels, its mean over sequences at every index, in percent."""
    # Averaging over axis 0 (sequences) leaves one value per (index, channel).
    percentages = 100 * X1.mean(axis=0)
    for channel, letter in zip(range(4), s):
        plt.plot(percentages[:, channel], label=letter)
    plt.ylabel('Percentage of occurences of nucleotide')
    plt.xlabel('Index')

plot_correl(f)

In [None]:
plt.figure(figsize=(20,5))
# Recode labels and one-hot channels from {0, 1} to {-1, +1} (assumes Y holds
# 0/1 or boolean labels); the per-index mean of their product is positive where
# the presence of a nucleotide and the label tend to agree.
signed_Y = (2*Y - 1)[:,None]
# The x axis follows the data's sequence length instead of a hard-coded 101.
positions = np.arange(X1.shape[1])
for i, c in enumerate(s):
    plt.plot(positions, np.mean((2*X1[:,:,i] - 1) * signed_Y, axis=0), label=c)
plt.legend()
plt.title('Correlation between nucleotide and boundedness')
plt.show()

In [None]:
greedy_Cs = [1e2, 1e5, 1e5]  # SVM constant C used for each dataset index
def correlindex(k=None, w=None, dataset=0, folds=5, stride=1):
    """
    Plots the validation score when using only a slice of the input data.
    The window length is w = 2*k + 1, specify either one.

    For every window center x the columns x-k .. x+k of the training inputs
    are kept, a kernel matrix is built with cum_spectrum(..., k=w) and an SVM
    on that precomputed kernel is scored with evaluate(..., folds=folds).

    Parameters
    ----------
    k : half-width of the window; used only when w is None.
    w : full window width; when given, k is recomputed as (w-1)//2.
    dataset : index into train_Xs, train_Ys and greedy_Cs.
    folds : forwarded to evaluate (presumably the number of CV folds).
    stride : step between consecutive window centers.

    Raises
    ------
    ValueError : if neither k nor w is given, or if no window fits.
    """
    if w is None:
        if k is None:
            raise ValueError('correlindex needs either k or w')
        w = 2*k+1
    else:
        # NOTE(review): for an even w the slice below is w-1 columns wide while
        # cum_spectrum still receives k=w — confirm this is intended.
        k = (w-1)//2
    train_X = train_Xs[dataset]
    train_Y = train_Ys[dataset]
    C = greedy_Cs[dataset]
    # Take the sequence length from the data rather than assuming 101 columns.
    seq_len = train_X.shape[1]

    centers = range(k, seq_len-k, stride)
    if len(centers) == 0:
        raise ValueError('no window of half-width {} fits in sequences of length {}'.format(k, seq_len))

    xs = []
    means = []
    stds = []
    for x in tqdm_notebook(centers):
        xs.append(x)
        # Named `window` so it does not shadow the notebook-level string `s`.
        window = train_X[:, x-k:x+k+1]
        train_K = cum_spectrum(window, k=w)
        # Only the first two values returned by evaluate are used (mean and std of the score).
        mean, std = evaluate(svm.SVC(kernel='precomputed', C=C), train_K, train_Y, folds=folds)[0:2]
        means.append(mean)
        stds.append(std)
    means = np.array(means)
    stds = np.array(stds)

    plt.figure()
    plt.fill_between(xs, means - stds, means + stds, alpha=0.3)
    plt.plot(xs, means, label='Accuracy')
    plt.xlabel('Index in sequence')
    plt.ylabel('Validation score')
    plt.title('Evolution of score when using only a window of input')
    # Same curve with the order of the centers reversed, to compare both halves of the sequence.
    plt.plot(xs, means[::-1], label='Symmetrized accuracy around center')
    plt.fill_between(xs, means[::-1] - stds[::-1], means[::-1] + stds[::-1], alpha=0.3)
    best_x = xs[means.argmax()]
    center = (seq_len - 1) / 2  # 50 for sequences of length 101
    plt.vlines(center, means.min() - 0.1, means.max() + 0.1, label='Center', colors='C2')
    plt.vlines(best_x, means.min() - 0.1, means.max() + 0.1, label='Max {:.0f}% at x={}'.format(100*means.max(), best_x), colors='C3')
    plt.legend()
    plt.show()

# Run the sliding-window analysis with a window of width 11 on each of the three datasets.
for d in range(3):
    correlindex(w=11, dataset=d)

Conclusions:
- $A \approx T \neq C \approx G$
- The most informative region is around index 50, the center of the sequence; positions in 0-49 and 51-100 matter less and less as they approach either end of the sequence.