<a href="https://colab.research.google.com/github/bahrad/Covid/blob/main/Covid_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

Minimal imports (TensorFlow/Keras and its supporting packages, plus pandas for output)

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras

import numpy as np
import os
import csv

import pandas as pd

In [None]:
# Colab-only helpers: `drive` mounts Google Drive into the runtime's filesystem.
# NOTE(review): `files` is imported here but not used in the cells shown in this section.
from google.colab import drive, files
drive.mount('/content/drive')

# Base folder on the mounted Drive; the data, weight and output paths used
# below are built by appending to this prefix (note the trailing slash).
FILELOC = "/content/drive/My Drive/COVID_Python/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Detect a TPU runtime. On success `tpu_strategy` is a TPUStrategy and
# `tpu_env` is True. Otherwise `tpu_env` is False and `tpu_strategy` falls
# back to TensorFlow's default strategy, so later cells that build models
# under `with tpu_strategy.scope():` still run on CPU/GPU instead of failing
# with a NameError.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.TPUStrategy(tpu)
    tpu_env = True
except ValueError:
    print('Not connected to a TPU runtime.')
    tpu_strategy = tf.distribute.get_strategy()  # default (no-op) strategy
    tpu_env = False

Not connected to a TPU runtime.


## Data Files

In [None]:
# Training inputs and their labels, both located under the Drive base folder.
# DATA_FILE is read with csv.reader and LABEL_FILE with np.loadtxt further down.
DATA_FILE = f"{FILELOC}covid_aligned_crossval/TrainData.csv"
LABEL_FILE = f"{FILELOC}covid_aligned_crossval/TrainLabels_Age.csv"

## Parameters

In [None]:
# --- Architecture (forwarded to AttModel by reset_model) ---------------------
ENCDIM = 1500          # embdim: output width of the token/position embeddings
NHEADS = 8             # numheads: attention heads per transformer block
FFDIM = 64             # ffdim: hidden width of the transformer feed-forward net
NC = 300               # Nc: Conv1D filters (also the transformer/attention width)
NL = 1                 # Nl: number of Conv1D layers
NT = 1                 # Nt: number of transformer blocks
NDENSE = 64            # num_dense: units of the dense layer before the output
TRANSDROPRATE = 0.1    # trans_drop: dropout rate inside the transformer blocks
DROPRATE = 0.0         # dropout_rate: dropout before the output (0.0 disables it)

# --- Optimisation -------------------------------------------------------------
LEARN_RATE = 1e-4              # Adam learning rate
BATCH_SIZE = 48                # batch size passed to model.fit
STEPS_PER_EXECUTION = 50       # passed to model.compile(steps_per_execution=...)

# --- Input --------------------------------------------------------------------
# Fixed sequence length fed to the model (the `L` argument of AttModel).
# NOTE(review): presumably the length of the aligned protein sequence -- confirm.
ismlen = 1273

In [None]:
def reset_model():
    """Build and compile a freshly initialised AttModel.

    The architecture is taken from the module-level hyper-parameters
    (ENCDIM, NHEADS, FFDIM, NDENSE, DROPRATE, TRANSDROPRATE, NT, NC, NL),
    the sequence length from `ismlen`, and the vocabulary size from
    `aa_list` (+1 because token id 0 is reserved for padding/unknown).

    Returns:
        A compiled keras.Model trained with mean-squared-error loss and the
        Adam optimizer, reporting 'mse', 'msle' and 'mae' metrics.
    """
    model = AttModel(L=ismlen,
                     vocab_size=len(aa_list)+1,
                     embdim = ENCDIM,
                     numheads = NHEADS,
                     ffdim = FFDIM,
                     num_dense = NDENSE,
                     mask_zero=True,
                     dropout_rate = DROPRATE,
                     trans_drop = TRANSDROPRATE,
                     Nt = NT,
                     W = 1, Nc = NC, Nl = NL,
                     )

    loss = keras.losses.MeanSquaredError()
    optimizer = keras.optimizers.Adam(learning_rate=LEARN_RATE)
    # All entries are Metric objects; 'mae' was previously a Loss instance
    # placed in the metrics list.
    metrics = [keras.metrics.MeanSquaredError(name='mse'),
               keras.metrics.MeanSquaredLogarithmicError(name='msle'),
               keras.metrics.MeanAbsoluteError(name='mae')
               ]

    model.compile(loss=loss, optimizer=optimizer, metrics=metrics,
                  steps_per_execution = STEPS_PER_EXECUTION)

    return model

In [None]:
class TransformerBlock(keras.layers.Layer):
    """Transformer encoder block with post-layer-normalisation.

    Data flow: self-attention -> dropout -> residual add + LayerNorm ->
    two-layer feed-forward net -> dropout -> residual add + LayerNorm.

    Args:
        embed_dim: Width of the last axis of the input; also used as the
            per-head `key_dim` of the attention layer and as the output width
            of the feed-forward net, so the residual additions line up.
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward net.
        rate: Dropout rate applied after attention and after the feed-forward net.
        **kwargs: Standard keras.layers.Layer arguments (e.g. `name`).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Kept only so the layer can describe itself in get_config().
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        # Sub-layer creation order is unchanged, so auto-generated layer names
        # and the weight ordering of saved models are preserved.
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`; `training` toggles the dropout layers.

        `training` defaults to None so the layer can also be called without
        the flag, in which case Keras decides the mode.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialised."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

class TokenAndPositionEmbedding(keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: Number of distinct positions (rows of the position table).
        vocab_size: Number of distinct token ids (rows of the token table).
        embed_dim: Width of both embedding tables.
        mask_zero: Forwarded to both Embedding layers.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, mask_zero=False):
        super().__init__()
        self.token_emb = keras.layers.Embedding(
            input_dim=vocab_size, output_dim=embed_dim, mask_zero=mask_zero)
        # NOTE(review): mask_zero is also applied to the position table, where
        # index 0 is a real position rather than padding -- confirm intent.
        self.pos_emb = keras.layers.Embedding(
            input_dim=maxlen, output_dim=embed_dim, mask_zero=mask_zero)

    def call(self, x):
        """Embed token ids `x` and add the embedding of each position index."""
        seq_len = tf.shape(x)[-1]
        # Position ids 0 .. seq_len-1; their embeddings broadcast over the batch.
        pos_vectors = self.pos_emb(tf.range(seq_len))
        tok_vectors = self.token_emb(x)
        return tok_vectors + pos_vectors

def linear01(x):
    """Activation that is linear inside [0, 1] and clipped to that range outside."""
    lower, upper = 0, 1
    return tf.clip_by_value(x, lower, upper)

def AttModel(L, vocab_size, embdim, numheads, ffdim, num_dense=False,
             mask_zero=False, dropout_rate=False, trans_drop=0.1,
             Nt=1, W=False, Nc=False, Nl=False):
    """Build the (uncompiled) sequence-regression model.

    Pipeline: token + position embedding -> optional Conv1D stack ->
    `Nt` transformer blocks -> attention pooling over the sequence axis ->
    optional dense layer and dropout -> single output clipped to [0, 1].

    Args:
        L: Fixed input sequence length.
        vocab_size: Number of distinct token ids.
        embdim: Embedding width.
        numheads: Attention heads per transformer block.
        ffdim: Hidden width of the transformer feed-forward net.
        num_dense: Units of the dense layer after pooling; falsy to skip it.
        mask_zero: If truthy, add a Masking(0) layer and pass mask_zero to the
            embeddings.
        dropout_rate: Dropout rate before the output layer; falsy to skip it.
        trans_drop: Dropout rate inside the transformer blocks.
        Nt: Number of transformer blocks.
        W: Conv1D kernel size.
        Nc: Conv1D filter count; also the width of the attention pooling head.
        Nl: Number of Conv1D layers. The conv stack is only built when W, Nc
            and Nl are all truthy.

    Returns:
        A keras.Model mapping (batch, L) token ids to a (batch, 1) prediction.
        The softmax over positions is exposed as the layer named 'attention'.
    """
    inpTensor = keras.Input(shape=(L,))
    x = inpTensor

    if mask_zero:
        x = keras.layers.Masking(mask_value=0)(x)

    x = TokenAndPositionEmbedding(L, vocab_size, embdim, mask_zero)(x)

    # Width of x's last axis: embdim after the embedding, Nc after the convs.
    feat_dim = embdim
    if W and Nc and Nl:
        for n in range(Nl):
            x = keras.layers.Conv1D(filters = Nc,
                                kernel_size = W,
                                activation = 'relu',
                                padding = 'same',
                                )(x)
            # Batch norm is only inserted for layers with 1 < n < Nl-1.
            if n > 1 and n < Nl-1:
                x = keras.layers.BatchNormalization()(x)
        feat_dim = Nc

    # The transformer block must match the actual feature width so that its
    # residual additions line up (previously hard-wired to Nc, which broke
    # when the conv stack was disabled).
    for n in range(Nt):
        x = TransformerBlock(feat_dim, numheads, ffdim, rate=trans_drop)(x)

    # Attention pooling: score every position, softmax over the sequence, and
    # take the score-weighted sum of the per-position features.
    att_dim = Nc if Nc else feat_dim
    h = keras.layers.TimeDistributed(keras.layers.Dense(att_dim, activation='tanh'))(x)
    attention = keras.layers.TimeDistributed(keras.layers.Dense(1, activation='tanh'))(h)
    attention = keras.layers.Flatten()(attention)
    attention = keras.layers.Softmax(axis=1, name='attention')(attention) # normalize attention values

    # Repeat the (batch, L) weights to (batch, L, att_dim) so they can be
    # multiplied element-wise with h.
    attention = keras.layers.RepeatVector(att_dim)(attention)
    attention = keras.layers.Permute([2, 1])(attention)
    representation = keras.layers.multiply([h, attention])
    representation = tf.math.reduce_sum(representation, axis = 1)
    x = representation

    if num_dense:
        x = keras.layers.Dense(num_dense, activation = 'relu')(x)
    if dropout_rate:
        # Use the rate directly; the former `Params[dropout_rate]` lookup
        # referenced an undefined name and raised NameError.
        x = keras.layers.Dropout(dropout_rate)(x)

    finalOut = keras.layers.Dense(1, activation=linear01)(x)

    # define the model's start and end points
    model = keras.Model(inpTensor,finalOut)

    return model

In [None]:
# Load the training labels and sequences.
# Each CSV row is treated as a list of fields; rows are padded with '*' or
# truncated so that every row has exactly `ismlen` fields.
# NOTE(review): this assumes one residue per CSV column -- confirm against the file.
labels = np.loadtxt(LABEL_FILE)
with open(DATA_FILE, 'r', newline='') as f:  # newline='' as recommended for csv.reader
    readfile = csv.reader(f)
    rows = []
    for line in readfile:
        if len(line) < ismlen:
            # Pad with one '*' field per missing position.
            line.extend(['*'] * (ismlen - len(line)))
        elif len(line) > ismlen:
            line = line[:ismlen]
        rows.append(line)
    ism_array = np.vstack(rows)

# Residue alphabet: 20 amino acids plus the alignment gap '-'.
aa_list = ['A', 'R', 'N', 'D', 'C', 'Q', 'E',
          'G', 'H', 'I', 'L', 'K', 'M', 'F',
          'P', 'S', 'T', 'W', 'Y', 'V', '-',
          ]
# Token ids start at 1; id 0 is shared by padding ('*') and unknown ('X').
aa_tokenizer = {aa: idx for idx, aa in enumerate(aa_list, start=1)}
aa_tokenizer['*'] = 0
aa_tokenizer['X'] = 0

# Any symbol missing from the table is mapped to 0 as well (like 'X').
# A bare dict.get would return None for such symbols, which np.vectorize
# cannot store in an integer array.
x_train_tok = np.vectorize(lambda aa: aa_tokenizer.get(aa, 0), otypes=[int])(ism_array)

y_train = labels

def _loss_early_stopping(patience):
    """Return an EarlyStopping callback on the training loss.

    Both callbacks below share every setting except `patience`: they monitor
    'loss', use a baseline of 0.2 and restore the best weights when stopping.
    """
    return tf.keras.callbacks.EarlyStopping(
        monitor='loss',
        verbose=1,
        patience=patience,
        mode='auto',
        min_delta=0,
        baseline=0.2,
        restore_best_weights=True,
    )


# Short patience for the initial training attempts, long patience for the
# continued training run.
early_stopping = _loss_early_stopping(20)
early_stopping_2 = _loss_early_stopping(200)

# Fit Models

In [None]:
# Build and compile a fresh model from the hyper-parameters defined above.
model = reset_model()

In [None]:
# Print the layer-by-layer structure and parameter counts of the model.
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 1273)]       0                                            
__________________________________________________________________________________________________
masking (Masking)               (None, 1273)         0           input_1[0][0]                    
__________________________________________________________________________________________________
token_and_position_embedding (T (None, 1273, 1500)   1942500     masking[0][0]                    
__________________________________________________________________________________________________
conv1d (Conv1D)                 (None, 1273, 300)    450300      token_and_position_embedding[0][0
______________________________________________________________________________________________

In [None]:
# Build models under the TPU strategy when a TPU was detected; otherwise use
# TensorFlow's default strategy so this cell also runs on CPU/GPU runtimes.
strategy = tpu_strategy if tpu_env else tf.distribute.get_strategy()

# Re-initialise and retrain until a run lasts at least 35 epochs before early
# stopping ends it.
# NOTE(review): presumably this discards initialisations whose loss never gets
# under the callback's 0.2 baseline; the loop has no retry limit -- confirm.
n_epochs = 0
while n_epochs < 35:
    tf.keras.backend.clear_session()
    with strategy.scope():
        model = reset_model()
    history = model.fit(x_train_tok,
                        y_train,
                        batch_size = BATCH_SIZE,
                        epochs = 600,
                        verbose = 2,
                        callbacks = [early_stopping],
                        )
    n_epochs = len(history.history['loss'])

# If the accepted run stopped before 250 epochs, continue training the same
# model with the more patient early-stopping callback.
if n_epochs < 250:
    history = model.fit(x_train_tok,
                        y_train,
                        batch_size = BATCH_SIZE,
                        epochs = 600,
                        verbose = 2,
                        callbacks = [early_stopping_2],
                        )

model.save_weights(FILELOC + 'covid_trans_fulldata_age_20211025_1.h5', save_format='h5', overwrite=True)

# Validate Model

In [None]:
# Number of trained runs whose weights are evaluated below.
nruns = 5

# Residue alphabet: 20 amino acids followed by the gap symbol '-'.
aa_list = [
    'A', 'R', 'N', 'D', 'C', 'Q', 'E',
    'G', 'H', 'I', 'L', 'K', 'M', 'F',
    'P', 'S', 'T', 'W', 'Y', 'V', '-',
]
# Ids follow list order starting at 1; '*' (padding) and 'X' both map to 0.
aa_tokenizer = {}
for token_id, residue in enumerate(aa_list, start=1):
    aa_tokenizer[residue] = token_id
aa_tokenizer.update({'*': 0, 'X': 0})
def padder(x, length=1273, pad_char='*'):
    """Return `x` padded or truncated to exactly `length` characters.

    Args:
        x: Sequence string.
        length: Target length; defaults to 1273, the model's input length.
        pad_char: Character appended to sequences shorter than `length`.

    Returns:
        `x` unchanged if it already has `length` characters, its first
        `length` characters if it is longer, or `x` right-padded with
        `pad_char` if it is shorter.
    """
    if len(x) >= length:
        return x[:length]
    return x + pad_char * (length - len(x))

Predictions on validation data set + creation of attention and embedding outputs for training and validation datasets.

In [None]:
# Alternative configuration (aligned sequences) kept for reference:
# model_dir = FILELOC + 'covidout_trans_fulldata_age/'
# traindata = pd.read_csv(FILELOC + 'covid_aligned_crossval/covid_patient_0912_aligned.csv')
# valdata = pd.read_csv(FILELOC + 'covid_aligned_crossval/covid_patient_0912_valid1001_aligned.csv')
model_dir = FILELOC + 'covidout_trans_fulldata_age_raw/'
traindata = pd.read_csv(FILELOC + 'covid_rawseqs_crossval/covid_patient_0912_rawseqs.csv')
valdata = pd.read_csv(FILELOC + 'covid_rawseqs_crossval/covid_patient_0912_valid1001_rawseqs.csv')

ismlen = 1273

# Pad/truncate each 'ISM' string with padder(), split it into characters and
# stack the rows into a 2-D character array.
train_array = np.vstack(traindata['ISM'].apply(padder).apply(lambda x: np.array(list(x))))
val_array = np.vstack(valdata['ISM'].apply(padder).apply(lambda x: np.array(list(x))))

# Map characters to token ids. Symbols missing from aa_tokenizer fall back to
# 0 (the id already used for 'X' and padding); a bare dict.get would return
# None for them, which np.vectorize cannot store in an integer array.
tokenize = np.vectorize(lambda aa: aa_tokenizer.get(aa, 0), otypes=[int])
train_tok = tokenize(train_array)
val_tok = tokenize(val_array)
y_train = traindata['Label'].values

MSE = {}

# Build models under the TPU strategy when a TPU was detected; otherwise use
# TensorFlow's default strategy so this cell also runs on CPU/GPU runtimes.
strategy = tpu_strategy if tpu_env else tf.distribute.get_strategy()


def _export_outputs(model, attention_model, embedding_model, tokens, file_stem, run):
    """Predict on `tokens` and save predictions, attention and embeddings as CSV.

    Files are written to `model_dir` as
    `<file_stem>_{pred,att,emb}_age_<run>.csv`.

    Returns:
        Tuple (pred, att, emb) of the arrays that were written.
    """
    suffix = '_age_' + str(run) + '.csv'
    pred = model.predict(tokens, verbose=2).ravel()
    np.savetxt(model_dir + file_stem + '_pred' + suffix, pred, delimiter=',')
    att = attention_model.predict(tokens, verbose=2, batch_size=32)
    np.savetxt(model_dir + file_stem + '_att' + suffix, att, delimiter=',')
    emb = embedding_model.predict(tokens, verbose=2, batch_size=32)
    np.savetxt(model_dir + file_stem + '_emb' + suffix, emb, delimiter=',')
    return pred, att, emb


for run in range(1, nruns+1):
    tf.keras.backend.clear_session()    # reset Tensorflow session
    with strategy.scope():
        model = reset_model()
        # model.load_weights(model_dir + 'covid_trans_fulldata_age_20211025_'+str(run)+'.h5')
        model.load_weights(model_dir + 'covid_rawseqs_fulldata_age_weights_20211025_'+str(run)+'.h5')

        # Sub-models exposing intermediate outputs; built once per run and
        # shared by the training and validation exports.
        # NOTE(review): 'dense_4' is an auto-generated layer name, so it
        # depends on how many Dense layers AttModel creates -- confirm it is
        # still the intended layer after any architecture change.
        get_attention_model = keras.Model(inputs=model.input,outputs=model.get_layer('attention').output)
        get_embedding_model = keras.Model(inputs=model.input,outputs=model.get_layer('dense_4').output)

        # Training set: also keep predictions and attention on the DataFrame.
        pred, att, emb = _export_outputs(model, get_attention_model, get_embedding_model,
                                         train_tok, 'covid_traindata0912', run)
        traindata['Predict'] = pred
        traindata['Attention'] = list(att)   # one attention vector per row

        # Validation set.
        pred, att, emb = _export_outputs(model, get_attention_model, get_embedding_model,
                                         val_tok, 'covid_valdata1001', run)
