<a href="https://colab.research.google.com/github/bahrad/Covid/blob/main/Corona_Taxonomy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Initialization

In [None]:
%tensorflow_version 2.x
import tensorflow as tf
from tensorflow import keras

import numpy as np
import os
import csv

import pandas as pd
import pickle

In [None]:
!python --version

Python 3.7.12


In [None]:
from google.colab import drive, files
# NOTE: the mount call below is commented out, so Google Drive must already be
# mounted at /content/drive for FILELOC to resolve; uncomment it on a fresh runtime.
# drive.mount('/content/drive')

# Base folder that every dataset / weights path in this notebook is built from
# by plain string concatenation, so the trailing slash is required.
FILELOC = "/content/drive/My Drive/COVID_Python/"

In [None]:
# Select the distribution strategy used by every `with tpu_strategy.scope():`
# block in this notebook, and record in `tpu_env` whether a TPU was found.
try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    print('Running on TPU ', tpu.cluster_spec().as_dict()['worker'])
    tf.config.experimental_connect_to_cluster(tpu)
    tf.tpu.experimental.initialize_tpu_system(tpu)
    tpu_strategy = tf.distribute.TPUStrategy(tpu)
    tpu_env = True
except ValueError:
    print('Not connected to a TPU runtime.')
    # Without this fallback `tpu_strategy` would be undefined and the later
    # `with tpu_strategy.scope():` cells would fail with NameError on CPU/GPU.
    # The default strategy's scope is a pass-through, so those cells run as-is.
    tpu_strategy = tf.distribute.get_strategy()
    tpu_env = False

In [None]:
def reset_model(regress, singleclass, multiclass, output_multiheadatt, use_att, nclasses=4,
                output_two=False):
    """Build a fresh model from the notebook-level hyperparameters and compile it.

    Architecture selection (first match wins):
      * ``output_multiheadatt`` -> ``AttMod_2``
      * ``output_two``          -> ``AttMod_3`` (two heads named 'outfirst' and 'outpeak')
      * otherwise               -> ``AttModel``

    The builder is configured from module-level names: ``ismlen``, ``aa_list``,
    ``ENCDIM``, ``NHEADS``, ``FFDIM``, ``NDENSE``, ``DROPRATE``, ``TRANSDROPRATE``,
    ``NT``, ``NC``, ``NL`` and ``LEARN_RATE``.

    Args:
        regress: compile for regression (MSE loss; MSE / MSLE / MAE metrics).
        singleclass: compile for binary classification (binary cross-entropy).
        multiclass: compile for multi-class classification with integer labels
            (sparse categorical cross-entropy).
        output_multiheadatt: select ``AttMod_2``.
        use_att: forwarded to the builder (attention pooling vs. average pooling).
        nclasses: number of classes for the multi-class head.
        output_two: select the two-headed ``AttMod_3`` (ignored when
            ``output_multiheadatt`` is set).

    Returns:
        The compiled ``keras.Model``.

    Raises:
        ValueError: if a single-output model is requested but none of
            ``regress`` / ``singleclass`` / ``multiclass`` is set.

    When several task flags are set the priority is multiclass, then
    singleclass, then regress (the same "last one wins" result as before).
    """
    # Resolve the builder lazily so only the selected one has to be defined.
    two_headed = False
    if output_multiheadatt:
        model_fn = AttMod_2
    elif output_two:
        model_fn = AttMod_3
        two_headed = True
    else:
        model_fn = AttModel

    if not two_headed and not (regress or singleclass or multiclass):
        raise ValueError(
            "reset_model: set one of regress / singleclass / multiclass "
            "for a single-output model")

    model = model_fn(L=ismlen,
                     vocab_size=len(aa_list)+1,   # +1 for the padding token 0
                     embdim = ENCDIM,
                     numheads = NHEADS,
                     ffdim = FFDIM,
                     num_dense = NDENSE,
                     mask_zero=True,
                     dropout_rate = DROPRATE,
                     trans_drop = TRANSDROPRATE,
                     Nt = NT,
                     W = 1, Nc = NC, Nl = NL,
                     regress=regress, singleclass=singleclass,
                     multiclass=multiclass, use_att=use_att,
                     nclasses=nclasses,
                     )

    optimizer = keras.optimizers.Adam(learning_rate=LEARN_RATE)

    if two_headed:
        # Compile exactly once, keyed by the output-layer names of AttMod_3.
        model.compile(loss={'outfirst': 'mean_squared_error',
                            'outpeak': 'mean_squared_error'},
                      loss_weights={'outfirst': 1.0, 'outpeak': 1.0},
                      optimizer=optimizer,
                      metrics=[keras.metrics.MeanSquaredError(name='mse'),
                               keras.metrics.MeanSquaredLogarithmicError(name='msle'),
                               keras.metrics.MeanAbsoluteError(name='mae')])
        return model

    if multiclass:
        loss = keras.losses.SparseCategoricalCrossentropy()
        metrics = [keras.metrics.SparseCategoricalAccuracy(name='acc')]
    elif singleclass:
        loss = keras.losses.BinaryCrossentropy()
        metrics = [keras.metrics.BinaryAccuracy(name='acc'),
                   keras.metrics.AUC(name='auc')]
    else:
        loss = keras.losses.MeanSquaredError()
        # MAE is tracked with the metrics class (the loss class was used here before).
        metrics = [keras.metrics.MeanSquaredError(name='mse'),
                   keras.metrics.MeanSquaredLogarithmicError(name='msle'),
                   keras.metrics.MeanAbsoluteError(name='mae')]

    model.compile(loss=loss, optimizer=optimizer, metrics=metrics)
    # steps_per_execution = STEPS_PER_EXECUTION is intentionally not passed.

    return model

In [None]:
class TransformerBlock(keras.layers.Layer):
    """Self-attention block: attention + residual + LayerNorm, then FFN + residual + LayerNorm.

    Args:
        embed_dim: feature width of the inputs; also the output width of the
            feed-forward network so the residual additions line up.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        rate: dropout rate applied to the attention output and to the FFN output.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super().__init__()
        # key_dim is the per-head size; it is kept at embed_dim (not
        # embed_dim // num_heads) so existing saved weights still load.
        self.att = keras.layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [keras.layers.Dense(ff_dim, activation="relu"), keras.layers.Dense(embed_dim),]
        )
        self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = keras.layers.Dropout(rate)
        self.dropout2 = keras.layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to `inputs`.

        `training` defaults to None so the layer can also be invoked directly
        without the flag; Keras supplies it when the layer runs inside a model.
        """
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

class TokenAndPositionEmbedding(keras.layers.Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        maxlen: number of rows in the position table (maximum sequence length).
        vocab_size: number of rows in the token table.
        embed_dim: width of both embeddings.
        mask_zero: forwarded to the token embedding only, where index 0 is the
            padding token.
    """

    def __init__(self, maxlen, vocab_size, embed_dim, mask_zero=False):
        super().__init__()
        self.token_emb = keras.layers.Embedding(input_dim=vocab_size,
                                                output_dim=embed_dim,
                                                mask_zero=mask_zero)
        # Position index 0 is the first residue, not padding, so the position
        # table is built without mask_zero. The embedding values are unchanged.
        self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return token_emb(x) + pos_emb(0..len-1), with len taken from x's last axis."""
        seq_len = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=seq_len, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions

def linear01(x):
    """Activation that clamps `x` element-wise to the closed interval [0, 1]."""
    lower, upper = 0, 1
    return tf.clip_by_value(x, lower, upper)

In [None]:
def AttMod_2(L, vocab_size, embdim, numheads, ffdim, num_dense=False,
             mask_zero=False, dropout_rate=False, trans_drop=0.1,
             Nt=1, W=False, Nc=False, Nl=False,
             regress=True, singleclass=False, multiclass=False, use_att=True,
             nclasses=4):
    """Build the (uncompiled) variant whose single transformer block is written out inline.

    Pipeline: token+position embedding -> optional Conv1D stack -> one
    multi-head self-attention block with feed-forward sub-layer -> pooling ->
    optional Dense / Dropout -> output head.

    Args:
        L: number of tokens per input sequence.
        vocab_size: size of the token embedding table.
        embdim: embedding width; also used as the attention ``key_dim``.
        numheads: number of attention heads.
        ffdim: hidden width of the feed-forward sub-layer.
        num_dense: units of the optional Dense layer before the head (falsy = skip).
        mask_zero: forwarded to the embedding; also adds a Masking layer on the input.
        dropout_rate: rate of the optional Dropout before the head (falsy = skip).
        trans_drop: dropout rate inside the attention block.
        Nt: accepted for signature parity with AttModel; unused, exactly one block is built.
        W, Nc, Nl: Conv1D kernel size, filter count and layer count; the conv
            stack is built only when all three are truthy. ``Nc`` is also the
            width of the attention-pooling projection (falls back to the
            incoming feature width when falsy).
        regress, singleclass, multiclass: choose the head; when several are set
            the priority is multiclass, then singleclass, then regress.
        use_att: attention pooling when True, global average pooling otherwise.
        nclasses: number of units of the multi-class head.

    Returns:
        keras.Model mapping the token-id input to the selected head.

    Raises:
        ValueError: if none of regress / singleclass / multiclass is set.
    """
    if not (regress or singleclass or multiclass):
        raise ValueError("AttMod_2: set one of regress / singleclass / multiclass")

    inpTensor = keras.Input(shape=(L,))
    x = inpTensor

    if mask_zero:
        # NOTE(review): Masking is applied to the raw token ids, before the
        # embedding -- confirm this has the intended effect.
        x = keras.layers.Masking(mask_value=0)(x)

    x = TokenAndPositionEmbedding(L, vocab_size, embdim, mask_zero)(x)

    if W and Nc and Nl:
        for n in range(Nl):
            x = keras.layers.Conv1D(filters = Nc,
                                kernel_size = W,
                                activation = 'relu',
                                padding = 'same',
                                )(x)
            # Only reached when Nl >= 4 -- NOTE(review): confirm this range is intended.
            if 1 < n < Nl-1:
                x = keras.layers.BatchNormalization()(x)

    # Feature width entering the attention block (Nc after the conv stack,
    # embdim otherwise); the residual additions below need this width.
    feat_dim = int(x.shape[-1])

    # The attention scores are requested but not exposed as a model output.
    y, _attn_scores = keras.layers.MultiHeadAttention(num_heads=numheads, key_dim=embdim,
                                                      )(x, x, return_attention_scores=True)
    y = keras.layers.Dropout(trans_drop)(y)
    z = keras.layers.LayerNormalization(epsilon=1e-6)(x + y)

    # Feed-forward sub-layer. It used to be constructed but never applied
    # (its result was overwritten by Dropout(z)), and it projected to embdim
    # instead of the width of `z`.
    ffn = keras.Sequential([keras.layers.Dense(ffdim, activation="relu"),
                            keras.layers.Dense(feat_dim),])
    z1 = ffn(z)
    z1 = keras.layers.Dropout(trans_drop)(z1)
    x = keras.layers.LayerNormalization(epsilon=1e-6)(z + z1)

    if use_att:
        # Attention pooling: score every position, softmax over positions,
        # then take the score-weighted sum of the projected features.
        att_units = Nc if Nc else feat_dim
        h = keras.layers.TimeDistributed(keras.layers.Dense(att_units, activation='tanh'))(x)
        attention = keras.layers.TimeDistributed(keras.layers.Dense(1, activation='tanh'))(h)
        attention = keras.layers.Flatten()(attention)
        attention = keras.layers.Softmax(axis=1, name='attention')(attention) # normalize attention values
        # Broadcast the per-position weights across the feature axis of `h`.
        attention = keras.layers.RepeatVector(att_units)(attention)
        attention = keras.layers.Permute([2, 1])(attention)
        representation = keras.layers.multiply([h, attention])
        representation = tf.math.reduce_sum(representation, axis = 1)
        x = representation
    else:
        x = keras.layers.GlobalAveragePooling1D()(x)

    if num_dense:
        x = keras.layers.Dense(num_dense, activation = 'relu')(x)
    if dropout_rate:
        # Use the rate directly; `Params` is not defined anywhere in this notebook.
        x = keras.layers.Dropout(dropout_rate)(x)

    if multiclass:
        finalOut = keras.layers.Dense(nclasses, activation='softmax')(x)
    elif singleclass:
        finalOut = keras.layers.Dense(1, activation='sigmoid')(x)
    else:
        finalOut = keras.layers.Dense(1, activation=linear01)(x)

    # define the model's start and end points
    model = keras.Model(inpTensor,finalOut)

    return model

In [None]:
def AttModel(L, vocab_size, embdim, numheads, ffdim, num_dense=False,
             mask_zero=False, dropout_rate=False, trans_drop=0.1,
             Nt=1, W=False, Nc=False, Nl=False,
             regress=True, singleclass=False, multiclass=False, use_att=True,
             nclasses=4):
    """Build the (uncompiled) default sequence model.

    Pipeline: token+position embedding -> optional Conv1D stack -> ``Nt``
    TransformerBlocks -> pooling -> optional Dense / Dropout -> output head.

    Args:
        L: number of tokens per input sequence.
        vocab_size: size of the token embedding table.
        embdim: embedding width.
        numheads: attention heads per TransformerBlock.
        ffdim: hidden width of each block's feed-forward network.
        num_dense: units of the optional Dense layer before the head (falsy = skip).
        mask_zero: forwarded to the embedding; also adds a Masking layer on the input.
        dropout_rate: rate of the optional Dropout before the head (falsy = skip).
        trans_drop: dropout rate inside the TransformerBlocks.
        Nt: number of TransformerBlocks.
        W, Nc, Nl: Conv1D kernel size, filter count and layer count; the conv
            stack is built only when all three are truthy. ``Nc`` is also the
            width of the attention-pooling projection (falls back to the
            incoming feature width when falsy).
        regress, singleclass, multiclass: choose the head; when several are set
            the priority is multiclass, then singleclass, then regress.
        use_att: attention pooling when True, global average pooling otherwise.
        nclasses: number of units of the multi-class head.

    Returns:
        keras.Model mapping the token-id input to the selected head.

    Raises:
        ValueError: if none of regress / singleclass / multiclass is set.
    """
    if not (regress or singleclass or multiclass):
        raise ValueError("AttModel: set one of regress / singleclass / multiclass")

    inpTensor = keras.Input(shape=(L,))
    x = inpTensor

    if mask_zero:
        # NOTE(review): Masking is applied to the raw token ids, before the
        # embedding -- confirm this has the intended effect.
        x = keras.layers.Masking(mask_value=0)(x)

    x = TokenAndPositionEmbedding(L, vocab_size, embdim, mask_zero)(x)

    if W and Nc and Nl:
        for n in range(Nl):
            x = keras.layers.Conv1D(filters = Nc,
                                kernel_size = W,
                                activation = 'relu',
                                padding = 'same',
                                )(x)
            # Only reached when Nl >= 4 -- NOTE(review): confirm this range is intended.
            if 1 < n < Nl-1:
                x = keras.layers.BatchNormalization()(x)

    # Width of the features entering the transformer: Nc after the conv stack,
    # embdim when the stack is disabled. Using the actual width (rather than
    # Nc unconditionally) keeps the blocks' residual additions valid either way.
    feat_dim = int(x.shape[-1])

    for _ in range(Nt):
        x = TransformerBlock(feat_dim, numheads, ffdim, rate=trans_drop)(x)

    if use_att:
        # Attention pooling: score every position, softmax over positions,
        # then take the score-weighted sum of the projected features.
        att_units = Nc if Nc else feat_dim
        h = keras.layers.TimeDistributed(keras.layers.Dense(att_units, activation='tanh'))(x)
        attention = keras.layers.TimeDistributed(keras.layers.Dense(1, activation='tanh'))(h)
        attention = keras.layers.Flatten()(attention)
        attention = keras.layers.Softmax(axis=1, name='attention')(attention) # normalize attention values
        # Broadcast the per-position weights across the feature axis of `h`.
        attention = keras.layers.RepeatVector(att_units)(attention)
        attention = keras.layers.Permute([2, 1])(attention)
        representation = keras.layers.multiply([h, attention])
        representation = tf.math.reduce_sum(representation, axis = 1)
        x = representation
    else:
        x = keras.layers.GlobalAveragePooling1D()(x)

    if num_dense:
        x = keras.layers.Dense(num_dense, activation = 'relu')(x)
    if dropout_rate:
        # Use the rate directly; `Params` is not defined anywhere in this notebook.
        x = keras.layers.Dropout(dropout_rate)(x)

    if multiclass:
        finalOut = keras.layers.Dense(nclasses, activation='softmax')(x)
    elif singleclass:
        finalOut = keras.layers.Dense(1, activation='sigmoid')(x)
    else:
        # Regression head uses a sigmoid here (linear01 was the earlier choice).
        finalOut = keras.layers.Dense(1, activation='sigmoid')(x)

    # define the model's start and end points
    model = keras.Model(inpTensor,finalOut)

    return model

In [None]:
def AttMod_3(L, vocab_size, embdim, numheads, ffdim, num_dense=False,
             mask_zero=False, dropout_rate=False, trans_drop=0.1,
             Nt=1, W=False, Nc=False, Nl=False,
             regress=True, singleclass=False, multiclass=False, use_att=True,
             nclasses=4):
    """Build the (uncompiled) two-headed variant with outputs 'outfirst' and 'outpeak'.

    Pipeline: token+position embedding -> optional Conv1D stack -> ``Nt``
    TransformerBlocks -> pooling -> optional Dense / Dropout -> two sigmoid
    heads sharing the same features.

    Args:
        L: number of tokens per input sequence.
        vocab_size: size of the token embedding table.
        embdim: embedding width.
        numheads: attention heads per TransformerBlock.
        ffdim: hidden width of each block's feed-forward network.
        num_dense: units of the optional Dense layer before the heads (falsy = skip).
        mask_zero: forwarded to the embedding; also adds a Masking layer on the input.
        dropout_rate: rate of the optional Dropout before the heads (falsy = skip).
        trans_drop: dropout rate inside the TransformerBlocks.
        Nt: number of TransformerBlocks.
        W, Nc, Nl: Conv1D kernel size, filter count and layer count; the conv
            stack is built only when all three are truthy. ``Nc`` is also the
            width of the attention-pooling projection (falls back to the
            incoming feature width when falsy).
        regress, singleclass, multiclass, nclasses: accepted for signature
            parity with AttModel and ignored; the model always has the two
            sigmoid heads. (Unused single-output layers were built for these
            flags before and never connected to the model.)
        use_att: attention pooling when True, global average pooling otherwise.

    Returns:
        keras.Model with outputs ``[outfirst, outpeak]``.
    """
    inpTensor = keras.Input(shape=(L,))
    x = inpTensor

    if mask_zero:
        # NOTE(review): Masking is applied to the raw token ids, before the
        # embedding -- confirm this has the intended effect.
        x = keras.layers.Masking(mask_value=0)(x)

    x = TokenAndPositionEmbedding(L, vocab_size, embdim, mask_zero)(x)

    if W and Nc and Nl:
        for n in range(Nl):
            x = keras.layers.Conv1D(filters = Nc,
                                kernel_size = W,
                                activation = 'relu',
                                padding = 'same',
                                )(x)
            # Only reached when Nl >= 4 -- NOTE(review): confirm this range is intended.
            if 1 < n < Nl-1:
                x = keras.layers.BatchNormalization()(x)

    # Actual feature width entering the transformer (Nc after the conv stack,
    # embdim otherwise), so the blocks also work with the conv stack disabled.
    feat_dim = int(x.shape[-1])

    for _ in range(Nt):
        x = TransformerBlock(feat_dim, numheads, ffdim, rate=trans_drop)(x)

    if use_att:
        # Attention pooling: score every position, softmax over positions,
        # then take the score-weighted sum of the projected features.
        att_units = Nc if Nc else feat_dim
        h = keras.layers.TimeDistributed(keras.layers.Dense(att_units, activation='tanh'))(x)
        attention = keras.layers.TimeDistributed(keras.layers.Dense(1, activation='tanh'))(h)
        attention = keras.layers.Flatten()(attention)
        attention = keras.layers.Softmax(axis=1, name='attention')(attention) # normalize attention values
        # Broadcast the per-position weights across the feature axis of `h`.
        attention = keras.layers.RepeatVector(att_units)(attention)
        attention = keras.layers.Permute([2, 1])(attention)
        representation = keras.layers.multiply([h, attention])
        representation = tf.math.reduce_sum(representation, axis = 1)
        x = representation
    else:
        x = keras.layers.GlobalAveragePooling1D()(x)

    if num_dense:
        x = keras.layers.Dense(num_dense, activation = 'relu')(x)
    if dropout_rate:
        # Use the rate directly; `Params` is not defined anywhere in this notebook.
        x = keras.layers.Dropout(dropout_rate)(x)

    # The layer names are the keys reset_model uses for per-output losses.
    out1 = keras.layers.Dense(1, activation='sigmoid', name='outfirst')(x)
    out2 = keras.layers.Dense(1, activation='sigmoid', name='outpeak')(x)
    # define the model's start and end points
    model = keras.Model(inpTensor,[out1,out2])

    return model

In [None]:
# These parameters are currently hard-coded
# reset_model() reads these module-level names each time it builds a model.
ENCDIM = 1500           # passed as `embdim`: width of the token / position embeddings
NC = 300                # passed as `Nc`: Conv1D filter count, also the attention-pooling width
NL = 1                  # passed as `Nl`: number of Conv1D layers
NT = 1                  # passed as `Nt`: number of TransformerBlocks
NHEADS = 8              # passed as `numheads`: attention heads
FFDIM = 64              # passed as `ffdim`: hidden width of the transformer feed-forward network
NDENSE = 64             # passed as `num_dense`: units of the Dense layer before the output head
TRANSDROPRATE = 0.1     # passed as `trans_drop`: dropout rate inside the transformer block
DROPRATE = 0.0          # passed as `dropout_rate`; 0.0 is falsy, so no Dropout layer is added before the head

LEARN_RATE = 0.0001     # learning rate of the Adam optimizer created in reset_model()

BATCH_SIZE = 48         # batch size handed to model.fit(); the training cell assigns it again

STEPS_PER_EXECUTION = 50  # only referenced by a commented-out compile() argument in reset_model()

# Sequence Tokenization

In [None]:
# Fixed model input length (tokens per sequence); reset_model() builds with L=ismlen.
ismlen = 1500

# Residue alphabet. A symbol's token id is its position in this list + 1;
# id 0 is reserved for padding and for unknown / ambiguous symbols.
aa_list = ['A', 'R', 'N', 'D', 'C', 'Q', 'E',
        'G', 'H', 'I', 'L', 'K', 'M', 'F',
        'P', 'S', 'T', 'W', 'Y', 'V', '-',
        ]


def tokenize_sequences(data_dataframe, SeqCol='ISM', seqlen=1273):
    """Convert a column of residue strings into a fixed-width integer token matrix.

    Each sequence is truncated to `seqlen` characters or right-padded with the
    padding token. Symbols in `aa_list` map to 1..len(aa_list); every other
    character maps to 0. That covers the pad symbol '*', the ambiguity codes
    X / B / Z / J, and any unexpected character, which previously mapped to
    None and broke the conversion.

    Args:
        data_dataframe: DataFrame holding the sequences.
        SeqCol: name of the column that contains the sequence strings.
        seqlen: number of tokens per output row.

    Returns:
        Integer ndarray of shape (len(data_dataframe), seqlen). An empty frame
        yields a (0, seqlen) array instead of raising.

    Raises:
        TypeError: if a cell is not sliceable like a string (e.g. NaN).
    """
    token_of = {residue: idx for idx, residue in enumerate(aa_list, start=1)}

    sequences = data_dataframe[SeqCol]
    # Start from all zeros so padding needs no explicit handling.
    tokens = np.zeros((len(sequences), seqlen), dtype=int)
    for row, seq in enumerate(sequences):
        clipped = seq[:seqlen]
        if clipped:
            tokens[row, :len(clipped)] = [token_of.get(ch, 0) for ch in clipped]
    return tokens

In [None]:
# datadf = pd.read_csv(FILELOC + 'species_dataset_reduced_20211121.csv')

# datadf = pd.read_csv(FILELOC + 'species_dataset_reduced_20211127.csv')
# print(len(datadf))
# datadf.drop_duplicates('Seq', inplace=True)
# datadf.reset_index(drop=False, inplace=True)
# datadf.rename(columns={'index':'original_index'}, inplace=True)
# print(len(datadf))
# datadf.to_csv(FILELOC + 'species_dataset_reduced_20211127_dropduplicates.csv', index=False)

# datadf = pd.read_csv(FILELOC + 'species_dataset_reduced_20211127_dropduplicates.csv')

# datadf.to_excel('Supplemental Table 2.xls', index=False)

# REMOVE SEQUENCES THAT WERE FOUND AFTER COVID EMERGED (i.e. near neighbors to SARS-Cov-2)

# from dateutil.parser import parse as dateparse
# datadf['date_parsed'] = datadf.date.apply(dateparse)
# print(len(datadf))
# datadf.drop(datadf[(datadf.Species.str.contains('Pangolin')) & (datadf.date_parsed > dateparse('2020-01-01'))].index,
#             inplace=True)
# print(len(datadf))
# datadf.to_csv(FILELOC + 'species_dataset_20220112.csv')

# Working dataset. Per the commented-out steps above, this file was written after
# dropping Pangolin entries dated later than 2020-01-01.
datadf = pd.read_csv(FILELOC + 'species_dataset_20220112.csv')

In [None]:
# Length of each spike sequence, stored alongside the data for later filtering.
datadf['seqlen'] = datadf['Seq'].map(len)

In [None]:
# Tokenize every sequence. The length comes from `ismlen`, the same value
# reset_model() builds the network with, so the two cannot drift apart.
# Note: the following cell reassigns both `seqtok` and `y`.
seqtok = tokenize_sequences(datadf, SeqCol='Seq', seqlen=ismlen)
y = datadf['genuslabel']

In [None]:
# Keep only sequences longer than 1000 residues. The token length comes from
# `ismlen`, the same value reset_model() builds the network with.
seqtok = tokenize_sequences(datadf.loc[datadf['seqlen'] > 1000], SeqCol='Seq', seqlen=ismlen)
y = datadf.loc[datadf['seqlen'] > 1000, 'genuslabel']

In [None]:
# CODE TO GENERATE RANDOM TRAIN/TEST SPLITS
# saves the identity of those splits for each of the possible input data sets

# trainindex = np.random.choice(range(len(seqtok)), size = int(0.9*len(seqtok)), replace=False)
# testindex = [k for k in range(len(seqtok)) if k not in trainindex]
# np.savetxt(FILELOC + 'species_reduced_trainindex_2021121.csv', trainindex, fmt='%i', delimiter=',')
# trainindex = np.loadtxt(FILELOC + 'species_reduced_trainindex_2021121.csv', dtype=int, delimiter=',')
# testindex = [k for k in range(len(seqtok)) if k not in trainindex]
# xtraintok = seqtok[trainindex]; ytrain = y[trainindex]
# xtesttok = seqtok[testindex]; ytest = y[testindex]

# trainindex = np.random.choice(range(len(seqtok)), size = int(0.9*len(seqtok)), replace=False)
# testindex = [k for k in range(len(seqtok)) if k not in trainindex]
# np.savetxt(FILELOC + 'species_reduced_trainindex_20211127_dropduplicates.csv', trainindex, fmt='%i', delimiter=',')
# trainindex = np.loadtxt(FILELOC + 'species_reduced_trainindex_20211127.csv', dtype=int, delimiter=',')
# trainindex = np.loadtxt(FILELOC + 'species_reduced_trainindex_20211127_dropduplicates.csv', dtype=int, delimiter=',')
# testindex = [k for k in range(len(seqtok)) if k not in trainindex]
# xtraintok = seqtok[trainindex]; ytrain = y[trainindex]
# xtesttok = seqtok[testindex]; ytest = y[testindex]

# trainindex = np.random.choice(range(len(seqtok)), size = int(0.9*len(seqtok)), replace=False)
# testindex = [k for k in range(len(seqtok)) if k not in trainindex]
# np.savetxt(FILELOC + 'species_trainindex_20220112.csv', trainindex, fmt='%i', delimiter=',')
# trainindex = np.loadtxt(FILELOC + 'species_trainindex_20220112.csv', dtype=int, delimiter=',')
# trainindex = np.loadtxt(FILELOC + 'species_trainindex_20220112.csv', dtype=int, delimiter=',')
# testindex = [k for k in range(len(seqtok)) if k not in trainindex]
# xtraintok = seqtok[trainindex]; ytrain = y[trainindex]
# xtesttok = seqtok[testindex]; ytest = y[testindex]


# Class Balancing

In [None]:
# from sklearn.utils import class_weight
# class_weights = list(class_weight.compute_class_weight(class_weight='balanced',
#                                                        classes=np.unique(ytrain), y=ytrain))
# sample_weights = np.array([class_weights[int(y)] for y in ytrain])
# print(class_weights)

from sklearn.utils import class_weight

# 'balanced' weights, one per distinct label, in the order of np.unique(y).
class_labels = np.unique(y)
class_weights = list(class_weight.compute_class_weight(class_weight='balanced',
                                                       classes=class_labels, y=y))
# Look weights up by label value rather than using the label as a list index;
# the index form is only correct when the labels are exactly 0..k-1.
weight_by_label = dict(zip(class_labels, class_weights))
sample_weights = np.array([weight_by_label[yi] for yi in y])
print(class_weights)

[1.1203703703703705, 0.6364894795127354, 0.8668929110105581, 2.6125]


# Training

In [None]:
# Settings for this training run.
NUM_EPOCHS = 20
BATCH_SIZE = 48
VAL_SPLIT = 0.2  # only referenced by the commented-out `validation_split` argument below

# Early-stopping callback on validation loss. NOTE: it is not active in this
# cell, because both `callbacks` and `validation_split` are commented out in
# the fit() call, so every run trains for the full NUM_EPOCHS.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor = 'val_loss',
    verbose = 1,
    patience = 10, #5,
    mode = 'auto',
    min_delta = 0,
    restore_best_weights = True
    )

# range(1,2) performs a single run; `run` only appears in the commented-out
# weights filename at the end of the loop body.
for run in range(1,2):
    # Reset Keras global state so each run starts from a freshly built model.
    tf.keras.backend.clear_session()
    with tpu_strategy.scope():
        # Four-class classifier using the AttModel architecture with attention pooling.
        model = reset_model(regress=False, singleclass=False, multiclass=True,
                            output_multiheadatt=False, use_att=True, nclasses=4)
    # Trains on all of seqtok / y (no held-out split), weighting each example
    # by the class-balancing `sample_weights`.
    history = model.fit(seqtok, y,
                        # xtraintok, ytrain,
                        sample_weight = sample_weights,
                        batch_size = BATCH_SIZE,
                        epochs = NUM_EPOCHS,
                        verbose = 1,
                        # validation_split = VAL_SPLIT,
                        # callbacks = [early_stopping],
                        )
    # model.save_weights(f"{FILELOC}taxonomy_weights_20220112_seq1000_{run}.h5", save_format='h5', overwrite=True)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


Save Model

In [None]:
# model.save_weights(FILELOC + 'coronavirus_spike_taxonomy_20211127_dropduplicates.h5', save_format='h5', overwrite=True)

Show multilabel confusion matrix

In [None]:
# from sklearn.metrics import multilabel_confusion_matrix
# from collections import Counter
# for run in range(1,6):
#     tf.keras.backend.clear_session()
#     with tpu_strategy.scope():
#         model= reset_model(regress=False, singleclass=False, multiclass=True,
#                         output_multiheadatt=False, use_att=True, nclasses=4)
#         model.load_weights(f"{FILELOC}taxonomy_weights_20211213_{run}.h5")
#         model.compile()
#     print(f'Run {run}')
#     print(multilabel_confusion_matrix(ytest, model.predict(xtest).argmax(axis=1)))

## Load Pretrained Model

In [None]:
# Fixed model input length (tokens per sequence); reset_model() builds with L=ismlen.
ismlen = 1500

# Residue alphabet. A symbol's token id is its position in this list + 1;
# id 0 is reserved for padding and for unknown / ambiguous symbols.
aa_list = ['A', 'R', 'N', 'D', 'C', 'Q', 'E',
        'G', 'H', 'I', 'L', 'K', 'M', 'F',
        'P', 'S', 'T', 'W', 'Y', 'V', '-',
        ]


def tokenize_sequences(data_dataframe, SeqCol='ISM', seqlen=1273):
    """Convert a column of residue strings into a fixed-width integer token matrix.

    Each sequence is truncated to `seqlen` characters or right-padded with the
    padding token. Symbols in `aa_list` map to 1..len(aa_list); every other
    character maps to 0. That covers the pad symbol '*', the ambiguity codes
    X / B / Z / J, and any unexpected character, which previously mapped to
    None and broke the conversion.

    Args:
        data_dataframe: DataFrame holding the sequences.
        SeqCol: name of the column that contains the sequence strings.
        seqlen: number of tokens per output row.

    Returns:
        Integer ndarray of shape (len(data_dataframe), seqlen). An empty frame
        yields a (0, seqlen) array instead of raising.

    Raises:
        TypeError: if a cell is not sliceable like a string (e.g. NaN).
    """
    token_of = {residue: idx for idx, residue in enumerate(aa_list, start=1)}

    sequences = data_dataframe[SeqCol]
    # Start from all zeros so padding needs no explicit handling.
    tokens = np.zeros((len(sequences), seqlen), dtype=int)
    for row, seq in enumerate(sequences):
        clipped = seq[:seqlen]
        if clipped:
            tokens[row, :len(clipped)] = [token_of.get(ch, 0) for ch in clipped]
    return tokens

Example of generating predictions, embeddings, and attention values for a pretrained model (using full species dataset)

In [None]:
# Dataset used for the prediction / embedding / attention example in the cells below.
sdf = pd.read_csv(FILELOC + "species_dataset_20211127.csv")

In [None]:
# Reset Keras global state before rebuilding the network.
tf.keras.backend.clear_session()
with tpu_strategy.scope():
    # Rebuild the same four-class architecture the weights file is loaded into.
    model= reset_model(regress=False, singleclass=False, multiclass=True,
                    output_multiheadatt=False, use_att=True, nclasses=4)
    model.load_weights(f"{FILELOC}taxonomy_weights_pretrained.h5")
    # compile() is called with no loss or optimizer; the cells below only call predict().
    model.compile()

In [None]:
# Tokenize with the length the model was built for (reset_model uses L=ismlen).
tok = tokenize_sequences(sdf, 'Seq', ismlen)

with tpu_strategy.scope():
    # The embedding is the output of the Dense layer just before the output
    # head. It is located by position among the model's top-level Dense layers
    # instead of by its auto-generated name ('dense_4'), because that name
    # shifts with the number of layers created earlier in the Keras session.
    top_level_dense = [layer for layer in model.layers
                       if isinstance(layer, keras.layers.Dense)]
    if len(top_level_dense) < 2:
        raise ValueError("model has no Dense layer before its output head "
                         "(was it built with NDENSE = 0?)")
    embedding_layer = top_level_dense[-2]

    get_embedding_model = keras.Model(inputs=model.input, outputs=embedding_layer.output)
    get_embedding_model.compile()
    # 'attention' is the explicitly named Softmax layer of the attention pooling.
    get_attention_model = keras.Model(inputs=model.input, outputs=model.get_layer('attention').output)
    get_attention_model.compile()

    pred = model.predict(tok)
    emb = get_embedding_model.predict(tok)
    att = get_attention_model.predict(tok)

    # One array per row, so each DataFrame cell holds that sequence's vector.
    sdf['pred'] = list(pred)
    sdf['emb'] = list(emb)
    sdf['att'] = list(att)

Instructions for updating:
use `experimental_local_results` instead.


Instructions for updating:
use `experimental_local_results` instead.
