# LSTM model to a Quality Array. Improvements Of Compression Rate

##

In [271]:
from Bio import SeqIO
import copy
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import plotly.express as px
#import matplotlib.pyplot as plt

# NN NN Architectrue
from keras.models import Model#, Sequential
from keras.layers import Input, LSTM, Flatten, Dense, Permute, Dropout
from keras.optimizers import Adam
from keras.regularizers import L1


In [272]:
# Pre-Processing Functions
def seq2bool(seq, amines):
    return (np.expand_dims(np.array(list(seq)),1) == np.expand_dims(amines,1).T)

def qual2bool(qual):
    uq = np.unique(qual)
    return (np.expand_dims(qual,1) == np.expand_dims(uq,1).T)

def qual2R(qual):
    qual = np.array(qual)/(40.0)
    m,s = np.mean(qual), np.std(qual)
    qual = (qual - m)/s
    return qual, m, s;

def R2qual(qual,m,s):
    qual = qual *s + m
    qual = qual*40.0;
    return qual;

#  test
q1 = np.array([1,2,3,0])
q2, m, s = qual2R(q1)
q3 = R2qual(q2,m,s);
sum((q1 - q3)**2)

0.0

In [273]:
# Input- Output Construction Functions
def mk_xt(sequence_array,qual_array, N = 10**2, method = 's'):
    if method == 'q':
        # Quality
        x1 = sequence_array[:N,:];
        x1 = np.moveaxis(np.expand_dims(x1,2), 1,2)
        
        x2 = qual_array[:N,:];
        x2 = np.moveaxis(np.expand_dims(x2,2), 1,2)        

        x = np.concatenate([x1,x2], axis = 2)
        t = np.roll(x2,1)

    elif method == 's':    
        # Sequence
        x = np.moveaxis(np.expand_dims(sequence_array[:N,:],2), 1,2)
        
        t = np.roll(x,1)

    elif method == 'q+s':
        # Sequence + Quality
        x1 = sequence_array[:N,:];
        x1 = np.moveaxis(np.expand_dims(x1,2), 1,2)
        
        x2 = qual_array[:N,:];
        x2 = np.moveaxis(np.expand_dims(x2,2), 1,2)        
        
        x = np.concatenate([x1,x2], axis = 2)
        t = np.roll(x2,1)
    return x,t

def xt2vec(x,t,timesteps,regressors):
    # Vectorizations (one-hot encoding likewise)
    X = [];
    for i in range(timesteps):
        X.append(np.roll(x,-i))

    X = np.concatenate(X, axis = 1)
    t = t[:,0:1,0:regressors];

    t = np.moveaxis(t, 2, 1);
    return X, t;

def mk_arh(units, timesteps, predictors, regressors,learning_rate):
    # Make Architecture
    i = Input(shape=(timesteps,predictors),name = 'Input')
    l = LSTM (units, return_sequences = True, use_bias = False,name = 'LSTM') (i)
    l *= (1.0)/np.tanh(1).astype(float);
    l = Flatten(name = 'Flatten')(l)
    #l = Dropout(0.75,name = 'Dropout')(l)
    l = Dense(regressors, name = 'Dense', activation = 'linear', use_bias = False) (l)

    regressor = Model (inputs = i, outputs = l)
    regressor.compile(optimizer = Adam(lr=learning_rate), loss = 'mean_squared_error', metrics=['mse'])
    return regressor;

def n_params(nn):
    n = 0;
    w = nn.get_weights()
    for ww in w:
        n += ww.size
    return n;

In [274]:
!head -n 12 ../data/E_4_20_1_short_example.fq

'head' is not recognized as an internal or external command,
operable program or batch file.


In [275]:
qual_array, sequence_array = [],[]
nucleotids = ['A', 'C', 'G', 'T']
with open("../data/E_4_20_1_short_example.fq") as input_handle:
    for read in SeqIO.parse(input_handle, "fastq"):
        sequence_array.append(seq2bool(read.seq, nucleotids))
        qual_array.extend(read._per_letter_annotations["phred_quality"])

# Concatenate sequence
sequence_array = 1*np.concatenate(sequence_array, axis = 0)
qual_array,m,s = qual2R(np.array([qual_array]).T)

In [276]:
N = 2**12
method = 'q'; #'s', 'q+s'
n_timesteps = 4;
n_units = 1;
n_regressors = 1;
epochs = 2*10**2;
learning_rate = 0.075;

x,t = mk_xt(sequence_array,qual_array, N, method); #
X,t = xt2vec(x,t,n_timesteps,n_regressors);
n_predictors = X.shape[2];

nn = mk_arh(n_units, n_timesteps,n_predictors, n_regressors, learning_rate)
nn.fit(X, t, epochs = epochs, verbose = 0) # approximate
n = n_params(nn)


The `lr` argument is deprecated, use `learning_rate` instead.



In [277]:
y = nn(X)
y = np.expand_dims(y, axis = 2);

y = R2qual(y,m,s);
tint = R2qual(t,m,s);
e = y - tint

In [278]:
# Error Decay
regressor_df = pd.DataFrame.from_dict(
    {
        'loss': nn.history.history['loss'],
    }
)

error_df = pd.DataFrame.from_dict(
    {
        'error': np.array(e[:,0,0])
    }
)

fig = px.line(regressor_df, y="loss")
fig.show()

fig = px.area(error_df, y='error')
fig.show()

In [279]:
fig = px.area((error_df.abs().round() >0).rolling(n_timesteps).sum(), y='error')
fig.show()

In [280]:
# Summary
nn.summary()
print(nn.input_shape)
print(nn.output_shape)
print(X.shape)
print(t.shape)
print(n_params(nn))

Model: "model_15"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 4, 5)]            0         
                                                                 
 LSTM (LSTM)                 (None, 4, 1)              24        
                                                                 
 tf.math.multiply_15 (TFOpLa  (None, 4, 1)             0         
 mbda)                                                           
                                                                 
 Flatten (Flatten)           (None, 4)                 0         
                                                                 
 Dense (Dense)               (None, 1)                 4         
                                                                 
Total params: 28
Trainable params: 28
Non-trainable params: 0
______________________________________________________________

In [281]:
(error_df.abs().round()>0).sum()

error    3038
dtype: int64

In [282]:
# Error terms
print((error_df.abs().round()>0).mean())
M = ((error_df.abs().round()>0).sum().values + n + n_timesteps);
print('Compression rate')
print( float(N) / float(M))

error    0.741699
dtype: float64
Compression rate
1.3342019543973942


In [283]:
s = 1.0*(error_df.abs().round());
s = np.expand_dims(s,1);

In [284]:
s_idx = np.expand_dims(np.expand_dims(np.diff(np.where(s)[0],prepend=0),1),2);
max_idx = float(max(s_idx))
s_rng = np.array(s[np.where(s)[0]]);
max_rng = float(max(s_rng))

s_idx = s_idx/max_idx;
s_rng = s_rng/max_rng;

In [285]:
x_int = np.concatenate([s_idx,s_rng], 2)
t_int = np.roll(x_int,-1)

X2,z = xt2vec(x_int,t_int,n_timesteps,n_regressors);
T = np.moveaxis(t_int, 2,1);
print(T.shape)
print(X2.shape)

(3038, 2, 1)
(3038, 4, 2)


In [286]:
n_timesteps = 4;
n_units = 2;
n_regressors = 2;
epochs = 2*10**2;
learning_rate = 0.075;

n_predictors = X2.shape[2];
nn = mk_arh(n_units, n_timesteps,n_predictors, n_regressors, learning_rate)
nn.fit(X2, T, epochs = epochs, verbose = 1) # approximate
n = n_params(nn)

n2, n2_timesteps = copy.deepcopy(n),copy.deepcopy(n_timesteps);

Epoch 1/200



The `lr` argument is deprecated, use `learning_rate` instead.



Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78/200
Epoch 7

Epoch 81/200
Epoch 82/200
Epoch 83/200
Epoch 84/200
Epoch 85/200
Epoch 86/200
Epoch 87/200
Epoch 88/200
Epoch 89/200
Epoch 90/200
Epoch 91/200
Epoch 92/200
Epoch 93/200
Epoch 94/200
Epoch 95/200
Epoch 96/200
Epoch 97/200
Epoch 98/200
Epoch 99/200
Epoch 100/200
Epoch 101/200
Epoch 102/200
Epoch 103/200
Epoch 104/200
Epoch 105/200
Epoch 106/200
Epoch 107/200
Epoch 108/200
Epoch 109/200
Epoch 110/200
Epoch 111/200
Epoch 112/200
Epoch 113/200
Epoch 114/200
Epoch 115/200
Epoch 116/200
Epoch 117/200
Epoch 118/200
Epoch 119/200
Epoch 120/200
Epoch 121/200
Epoch 122/200
Epoch 123/200
Epoch 124/200
Epoch 125/200
Epoch 126/200
Epoch 127/200
Epoch 128/200
Epoch 129/200
Epoch 130/200
Epoch 131/200
Epoch 132/200
Epoch 133/200
Epoch 134/200
Epoch 135/200
Epoch 136/200
Epoch 137/200
Epoch 138/200
Epoch 139/200
Epoch 140/200
Epoch 141/200
Epoch 142/200
Epoch 143/200
Epoch 144/200
Epoch 145/200
Epoch 146/200
Epoch 147/200
Epoch 148/200
Epoch 149/200
Epoch 150/200
Epoch 151/200
Epoch 152/200
Epoch 153/2

Epoch 160/200
Epoch 161/200
Epoch 162/200
Epoch 163/200
Epoch 164/200
Epoch 165/200
Epoch 166/200
Epoch 167/200
Epoch 168/200
Epoch 169/200
Epoch 170/200
Epoch 171/200
Epoch 172/200
Epoch 173/200
Epoch 174/200
Epoch 175/200
Epoch 176/200
Epoch 177/200
Epoch 178/200
Epoch 179/200
Epoch 180/200
Epoch 181/200
Epoch 182/200
Epoch 183/200
Epoch 184/200
Epoch 185/200
Epoch 186/200
Epoch 187/200
Epoch 188/200
Epoch 189/200
Epoch 190/200
Epoch 191/200
Epoch 192/200
Epoch 193/200
Epoch 194/200
Epoch 195/200
Epoch 196/200
Epoch 197/200
Epoch 198/200
Epoch 199/200
Epoch 200/200


In [287]:
y2 = nn(X2)
y2 = np.expand_dims(y2, axis = 2);
e2 = y2 - T

In [288]:

# Error Decay
regressor2_df = pd.DataFrame.from_dict(
    {
        'loss': nn.history.history['loss'],
    }
)

error2_idx = pd.DataFrame.from_dict(
    {
        'ix': e2[:,0,0]*max_idx,
    }
)

error2_rng = pd.DataFrame.from_dict(
    {
        'rng': e2[:,1,0]*max_rng,
    }
)

fig = px.line(regressor2_df, y="loss")
fig.show()

fig = px.scatter(error2_idx.abs().round(), y='ix')
fig.show()

fig = px.scatter(error2_rng.abs().round(), y='rng')
fig.show()

In [289]:
# Summary
nn.summary()
print(nn.input_shape)
print(nn.output_shape)
print(X.shape)
print(t.shape)
print(n_params(nn))

Model: "model_16"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 4, 2)]            0         
                                                                 
 LSTM (LSTM)                 (None, 4, 2)              32        
                                                                 
 tf.math.multiply_16 (TFOpLa  (None, 4, 2)             0         
 mbda)                                                           
                                                                 
 Flatten (Flatten)           (None, 8)                 0         
                                                                 
 Dense (Dense)               (None, 2)                 16        
                                                                 
Total params: 48
Trainable params: 48
Non-trainable params: 0
______________________________________________________________

In [290]:
error2_idx.abs().round().sum()

ix    0.0
dtype: float64

In [291]:
error2_rng.abs().round().sum()

rng    0.0
dtype: float64

In [292]:
## Error terms
print((abs(e2).round()>0).mean())
M = ((error2_idx.abs().round().sum().values +  error2_rng.abs().round().sum().values + n + n_timesteps + n2 + n2_timesteps));
print('Compression rate')
print( float(N) / float(M))

0.0
Compression rate
39.38461538461539


In [293]:
M

array([104.])

In [294]:
N

4096

In [295]:
n

48

In [296]:
n_timesteps

4

In [297]:
n2

48

In [298]:
n2_timesteps

4

In [299]:
n.copy()

AttributeError: 'int' object has no attribute 'copy'