# LSTM model to a Sequence Array. Improvements Of Compression Rate

##

In [384]:
from Bio import SeqIO
import numpy as np
import plotly.graph_objects as go
import pandas as pd
import plotly.express as px
#import matplotlib.pyplot as plt

# NN NN Architectrue
from keras.models import Model#, Sequential
from keras.layers import Input, LSTM, Flatten, Dense, Permute, Dropout
from keras.optimizers import Adam
from keras.regularizers import L1


In [385]:
# Pre-Processing Functions
def seq2bool(seq, amines):
    return (np.expand_dims(np.array(list(seq)),1) == np.expand_dims(amines,1).T)

def qual2bool(qual):
    uq = np.unique(qual)
    return (np.expand_dims(qual,1) == np.expand_dims(uq,1).T)

def qual2R(qual):
    qual = np.array(qual)/(40.0)
    m,s = np.mean(qual), np.std(qual)
    qual = (qual - m)/s
    return qual, m, s;

def R2qual(qual,m,s):
    qual = qual *s + m
    qual = qual*40.0;
    return qual;

#  test
q1 = np.array([1,2,3,0])
q2, m, s = qual2R(q1)
q3 = R2qual(q2,m,s);
sum((q1 - q3)**2)

0.0

In [386]:
# Input- Output Construction Functions
def mk_xt(sequence_array,qual_array, N = 10**2, method = 's'):
    if method == 'q':
        # Quality
        x1 = sequence_array[:N,:];
        x1 = np.moveaxis(np.expand_dims(x1,2), 1,2)
        
        x2 = qual_array[:N,:];
        x2 = np.moveaxis(np.expand_dims(x2,2), 1,2)        

        x = np.concatenate([x1,x2], axis = 2)
        t = np.roll(x2,1)

    elif method == 's':    
        # Sequence
        x = np.moveaxis(np.expand_dims(sequence_array[:N,:],2), 1,2)
        
        t = np.roll(x,1)

    elif method == 'q+s':
        # Sequence + Quality
        x1 = sequence_array[:N,:];
        x1 = np.moveaxis(np.expand_dims(x1,2), 1,2)
        
        x2 = qual_array[:N,:];
        x2 = np.moveaxis(np.expand_dims(x2,2), 1,2)        
        
        x = np.concatenate([x1,x2], axis = 2)
        t = np.roll(x2,1)
    return x,t

def xt2vec(x,t,timesteps,regressors):
    # Vectorizations (one-hot encoding likewise)
    X = [];
    for i in range(timesteps):
        X.append(np.roll(x,-i))

    X = np.concatenate(X, axis = 1)
    t = t[:,0:1,0:regressors];

    t = np.moveaxis(t, 2, 1);
    return X, t;

def mk_arh(units, timesteps, predictors, regressors,learning_rate):
    # Make Architecture
    i = Input(shape=(timesteps,predictors),name = 'Input')
    l = LSTM (units, return_sequences = True, use_bias = False,name = 'LSTM') (i)
    l *= (1.0)/np.tanh(1).astype(float);
    l = Flatten(name = 'Flatten')(l)
    #l = Dropout(0.95,name = 'Dropout')(l)
    l = Dense(regressors, name = 'Dense', activation = 'linear', use_bias = False) (l)

    regressor = Model (inputs = i, outputs = l)
    regressor.compile(optimizer = Adam(lr=learning_rate), loss = 'mean_squared_error', metrics=['mse'])
    return regressor;

def n_params(nn):
    n = 0;
    w = nn.get_weights()
    for ww in w:
        n += ww.size
    return n;

In [387]:
!head -n 12 ../data/E_4_20_1_short_example.fq

'head' is not recognized as an internal or external command,
operable program or batch file.


In [388]:
qual_array, sequence_array = [],[]
nucleotids = ['A', 'C', 'G', 'T']
with open("../data/E_4_20_1_short_example.fq") as input_handle:
    for read in SeqIO.parse(input_handle, "fastq"):
        sequence_array.append(seq2bool(read.seq, nucleotids))
        qual_array.extend(read._per_letter_annotations["phred_quality"])

# Concatenate sequence
sequence_array = 1*np.concatenate(sequence_array, axis = 0)
qual_array,m,s = qual2R(np.array([qual_array]).T)

In [389]:
N = 2**12
method = 's'; #'s', 'q+s'
n_timesteps = 4;
n_units = 1;
n_regressors = 4;
epochs = 1*10**3;
learning_rate = 0.075;

x,t = mk_xt(sequence_array,qual_array, N, method); #
X,t = xt2vec(x,t,n_timesteps,n_regressors);
n_predictors = X.shape[2];

nn = mk_arh(n_units, n_timesteps,n_predictors, n_regressors, learning_rate)
nn.fit(X, t, epochs = epochs, verbose = 0) # approximate
n = n_params(nn)


The `lr` argument is deprecated, use `learning_rate` instead.



In [390]:
y = nn(X)
y = np.expand_dims(y, axis = 2);
e = y - t

In [391]:
# Error Decay
regressor_df = pd.DataFrame.from_dict(
    {
        'loss': nn.history.history['loss'],
    }
)

error_df = pd.DataFrame.from_dict(
    {
        'A_error': np.array(e[:,0,0]),
        'C_error': np.array(e[:,1,0]),
        'G_error': np.array(e[:,2,0]),
        'T_error': np.array(e[:,3,0])
    }
)

fig = px.line(regressor_df, y="loss")
fig.show()

fig = px.area(error_df, y='A_error')
fig.show()

fig = px.area(error_df, y='C_error')
fig.show()

fig = px.area(error_df, y='G_error')
fig.show()

fig = px.area(error_df, y='T_error')
fig.show()

In [392]:
e.shape

(4096, 4, 1)

In [393]:
# Summary
nn.summary()
print(nn.input_shape)
print(nn.output_shape)
print(X.shape)
print(t.shape)
print(n_params(nn))

Model: "model_23"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Input (InputLayer)          [(None, 4, 4)]            0         
                                                                 
 LSTM (LSTM)                 (None, 4, 1)              20        
                                                                 
 tf.math.multiply_23 (TFOpLa  (None, 4, 1)             0         
 mbda)                                                           
                                                                 
 Flatten (Flatten)           (None, 4)                 0         
                                                                 
 Dense (Dense)               (None, 4)                 16        
                                                                 
Total params: 36
Trainable params: 36
Non-trainable params: 0
______________________________________________________________

In [394]:
(e**2).round().sum()

604.0

In [395]:
# Error terms
print((e**2).round().mean())
M = (e**2).round().sum() + n;
print('Compression rate')
print( float(N) / float(M))

0.036865234375
Compression rate
6.4


In [396]:
N

4096

In [397]:
M

640.0

In [398]:
nn.get_weights()

[array([[ 26.309372  ,  10.528496  , -14.142902  ,  -2.8428934 ],
        [-14.616819  ,  -2.952526  ,   5.2517123 ,   0.17140982],
        [ -9.225942  , -15.242914  ,   3.9370863 ,   0.05579814],
        [ -6.318511  ,  -4.3012266 ,   6.6541853 ,  -0.08440068]],
       dtype=float32),
 array([[ 12.432823 ,  11.9631605, -46.559227 ,  79.13174  ]],
       dtype=float32),
 array([[-3.6415203e+00, -1.6893963e+01,  1.3181363e-01, -1.4971890e-01],
        [-5.3277874e+00, -2.0600725e-02, -1.6943350e+01,  6.7930333e-02],
        [-4.6048760e+00, -1.0634136e-02, -9.9900533e-03, -1.6639482e+01],
        [-6.0687518e+00, -2.8388402e-01, -4.8034973e-03, -1.7427674e-02]],
       dtype=float32)]