In [1]:
import tensorflow as tf
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from keras import backend as K
from keras.preprocessing import sequence
import Levenshtein
import pickle

# The custom accuracy metric used for this task
def myaccuracy(y_true, y_pred):
    """Per-position correctness, ignoring positions whose true class is 0.

    Both arguments are reduced with argmax over their last axis. Positions
    where the true class index is 0 (presumably padding -- confirm against the
    target encoding) are dropped before comparing.

    Returns:
        A tensor of dtype ``K.floatx()`` holding 1.0 for every kept position
        where the predicted class equals the true class and 0.0 otherwise.
    """
    true_ids = tf.argmax(y_true, axis=-1)
    pred_ids = tf.argmax(y_pred, axis=-1)
    keep = tf.greater(true_ids, 0)
    kept_true = tf.boolean_mask(true_ids, keep)
    kept_pred = tf.boolean_mask(pred_ids, keep)
    return K.cast(K.equal(kept_true, kept_pred), K.floatx())

# Decodes a one-hot (or per-class score) sequence into a string using an index -> character map
def onehot_to_seq(oh_seq, index, length=None):
    """Decode a sequence of per-class score rows into a string.

    Args:
        oh_seq: iterable of rows; the argmax of each row selects the class.
        index: mapping from class id to the character to emit.
        length: if None, decoding stops at the first row whose argmax is
            class 0. Otherwise class 0 is excluded from the argmax and at most
            `length` rows are decoded (fewer if `oh_seq` is shorter).

    Returns:
        The concatenation of the decoded characters.
    """
    symbols = []

    if length is not None:
        for position, row in enumerate(oh_seq):
            # Class 0 is skipped, so the winner is offset by one below.
            best = np.argmax(row[1:])
            if position >= length:
                break
            symbols.append(index[best + 1])
        return ''.join(symbols)

    for row in oh_seq:
        best = np.argmax(row)
        if best == 0:
            # Class 0 terminates the sequence.
            break
        symbols.append(index[best])
    return ''.join(symbols)

# prints the results
def print_results(x, y_, revsere_decoder_index):
    """Print the upper-cased decoding of one prediction.

    `x` (the model input) is accepted for interface compatibility but is not
    used; only the decoded prediction `y_` is printed.
    """
    decoded = onehot_to_seq(y_, revsere_decoder_index)
    print(str(decoded.upper()))

def decode_predictions(y_, revsere_decoder_index, length=None):
    """Return the upper-cased string decoded from one prediction.

    `length` is forwarded unchanged to `onehot_to_seq`.
    """
    decoded = onehot_to_seq(y_, revsere_decoder_index, length=length)
    return str(decoded.upper())


def predict_all(model, test_df, tokenizer_encoder, tokenizer_decoder, n_gram, augmented_input=None, max_len=None, filepath="submission.csv"):
    """Predict a label string for every row of `test_df` and save the results.

    Args:
        model: object with a `predict` method; it receives the padded token
            matrix, or `[token matrix, augmented_input]` when
            `augmented_input` is given.
        test_df: DataFrame with at least the columns 'id' and 'input'.
        tokenizer_encoder: tokenizer used to turn input n-grams into ids.
        tokenizer_decoder: tokenizer whose `word_index` is inverted to decode
            predicted class ids back to characters.
        n_gram: n-gram size passed to `seq2ngrams`.
        augmented_input: optional second model input.
        max_len: length every input is padded/truncated to; defaults to the
            longest input in `test_df`.
        filepath: CSV path for the decoded predictions. The raw model output
            is saved next to it with ".csv" replaced by "_raw_pred.npy".

    Returns:
        DataFrame with columns 'id' and 'expected' (the decoded predictions).
    """
    test_input_ids = test_df['id'].values
    test_input_seqs = test_df['input'].values
    test_input_grams = seq2ngrams(test_input_seqs, n=n_gram)

    # word_index maps character -> class id; decoding needs the inverse.
    revsere_decoder_index = {value: key for key, value in tokenizer_decoder.word_index.items()}

    if max_len is None:
        max_test_length = max(len(grams) for grams in test_input_grams)
    else:
        max_test_length = max_len

    test_input_data_full = tokenizer_encoder.texts_to_sequences(test_input_grams)
    # Truncate at the end, like the training preprocessing does. The library
    # default ('pre') would drop the *start* of over-length inputs and shift
    # every prediction relative to the positions decoded below.
    test_input_data_full = sequence.pad_sequences(
        test_input_data_full, maxlen=max_test_length, padding='post', truncating='post')

    if augmented_input is None:
        y_test_pred = model.predict(test_input_data_full[:])
    else:
        y_test_pred = model.predict([test_input_data_full[:], augmented_input])
    np.save(filepath.replace(".csv", "_raw_pred.npy"), y_test_pred)

    # Decode exactly as many positions as each input has n-grams.
    y_test_pred_decoded = [
        decode_predictions(pred, revsere_decoder_index, length=len(test_input_grams[i]))
        for i, pred in enumerate(y_test_pred)
    ]
    test_pred_df = pd.DataFrame({'id': test_input_ids, "expected": y_test_pred_decoded},
                                columns=['id', 'expected'])

    # Sanity check: one predicted label per input residue.
    predicted_lengths = np.array([len(x) for x in test_pred_df['expected']])
    input_lengths = np.array([len(x) for x in test_df['input']])
    if np.all(predicted_lengths == input_lengths):
        print("All length match")
    else:
        print("Some lengths do not match!")

    test_pred_df.to_csv(filepath, index=False)
    return test_pred_df

def ham_distance(x, y):
    """Count the positions at which `x` and `y` differ.

    Only the overlapping prefix is compared: if the inputs have different
    lengths, the extra tail of the longer one is ignored.
    """
    mismatch_flags = [left != right for left, right in zip(x, y)]
    return np.sum(mismatch_flags)

def edit_score(input_df, pred_df, filepath="edit_score.csv", plot=True):
    """Score predictions against the expected labels and save a report.

    Args:
        input_df: DataFrame with columns 'id', 'input', 'expected' and 'len'.
        pred_df: DataFrame with columns 'id' and 'expected' (predictions),
            in the same row order as `input_df`.
        filepath: CSV path for the per-sample report. A scatter plot is saved
            next to it with ".csv" replaced by "_plot.png".
        plot: when true, the figure is also shown; it is saved either way.

    Returns:
        `(accuracy, output_df)`, where `accuracy` is one minus the total
        Hamming distance divided by the total expected length, and
        `output_df` is `input_df` extended with per-sample scores.
        Returns `(None, None)` if any prediction length differs from the
        length of its input.
    """
    assert np.all(input_df['id'].values == pred_df['id'].values)

    predicted_lengths = np.array([len(p) for p in pred_df['expected']])
    input_lengths = np.array([len(s) for s in input_df['input']])
    if not np.all(predicted_lengths == input_lengths):
        print("Some lengths do not match!")
        return None, None

    output_df = input_df.copy().reset_index(drop=True)

    # Per-sample distances between expected and predicted label strings.
    pairs = list(zip(input_df['expected'], pred_df['expected']))
    lev_dist = [Levenshtein.distance(truth, guess) for truth, guess in pairs]
    ham_dist = [ham_distance(truth, guess) for truth, guess in pairs]
    lev_score = np.mean(lev_dist)
    ham_score = np.mean(ham_dist)

    # Accuracy over all positions pooled together (long samples weigh more).
    accuracy = 1 - np.sum(ham_dist) / input_df['expected'].map(len).sum()

    output_df['predicted'] = pred_df['expected'].values
    output_df['levdist'] = np.array(lev_dist)
    output_df['hamdist'] = np.array(ham_dist)
    output_df['levpercent'] = output_df['levdist'] / output_df['len']
    output_df['hampercent'] = output_df['hamdist'] / output_df['len']
    output_df['accuracy'] = 1 - output_df['hampercent']
    # Unweighted mean of the per-sample accuracies.
    mean_acc = np.mean(output_df['accuracy'])

    output_df.to_csv(filepath, index=False)

    print_str = "total acc: {:.4f}, mean acc: {:.4f}, lev: {:.1f}, ham: {:.1f}".format(accuracy, mean_acc, lev_score, ham_score)
    print(print_str)

    # Accuracy versus length, with the pooled accuracy as a horizontal line.
    output_df.plot("len", "accuracy", kind="scatter")
    plt.hlines(y=accuracy, xmin=0, xmax=output_df['len'].max())
    plt.title(print_str)
    plt.savefig(filepath.replace(".csv", "_plot.png"))
    if plot:
        plt.show()
    plt.close()

    return accuracy, output_df

# Computes and returns the n-grams of each sequence in a collection; n defaults to 3 (trigrams)
def seq2ngrams(seqs, n = 3):
    """Split every sequence into its overlapping n-grams.

    One n-gram is produced per position, so a sequence of length L yields L
    n-grams; the last `n - 1` of them are shorter than `n` because the slice
    runs off the end of the sequence.

    Args:
        seqs: iterable of sliceable sequences (e.g. strings).
        n: n-gram size.

    Returns:
        If all sequences have the same length (or `seqs` is empty), the
        result of `np.array` on the nested n-gram lists. Otherwise a 1-D
        object array with one list of n-grams per sequence.
    """
    grams = [[seq[i : i + n] for i in range(len(seq))] for seq in seqs]

    if len({len(g) for g in grams}) <= 1:
        # Rectangular input: np.array builds a regular array.
        return np.array(grams)

    # Ragged input: np.array(grams) raises ValueError on recent NumPy
    # releases, so fill an object array one row at a time instead.
    ragged = np.empty(len(grams), dtype=object)
    for row, g in enumerate(grams):
        ragged[row] = g
    return ragged

def load_augmented_data(npy_path, max_len, centered=False):
    """Load a feature array and split it into sequences, labels and profiles.

    The array loaded from `npy_path` is reshaped to
    `(n_samples, 700, n_features)`. Features 0-21 are read as a one-hot
    residue encoding, features 22-30 as a one-hot 8-state label encoding and
    features 35-56 as the profile. Positions decoded as 'NoSeq' are dropped
    from the residue and label strings.

    Args:
        npy_path: anything `np.load` accepts.
        max_len: if larger than the profile's sequence length, the profile is
            padded along the sequence axis up to `max_len`; padded positions
            are all zero except the last feature, which is set to 1.0. The
            profile is never truncated.
        centered: currently ignored.

    Returns:
        `(train_df, profile_padded)`, where `train_df` has the columns 'id'
        (1-based), 'len', 'input' and 'expected'.
    """
    residue_alphabet = np.array(['A', 'C', 'E', 'D', 'G', 'F', 'I', 'H', 'K', 'M', 'L', 'N', 'Q', 'P', 'S', 'R', 'T', 'W', 'V', 'Y', 'X','NoSeq'])
    q8_alphabet = np.array(['L', 'B', 'E', 'G', 'I', 'H', 'S', 'T','NoSeq'])

    features = np.load(npy_path)
    features = features.reshape(features.shape[0], 700, -1)
    profile = features[:, :, 35:57]

    n_missing = max_len - profile.shape[1]
    if n_missing > 0:
        padding = np.zeros((profile.shape[0], n_missing, profile.shape[2]))
        # Flag padded positions in the last profile feature.
        padding[:, :, -1] = 1.0
        profile_padded = np.concatenate([profile, padding], axis=1)
    else:
        profile_padded = profile

    def _decode(onehot, alphabet):
        # One string per sample, skipping positions decoded as 'NoSeq'.
        symbols = alphabet[onehot.argmax(2)]
        return [''.join(row[row != 'NoSeq']) for row in symbols]

    residue_str_list = _decode(features[:, :, 0:22], residue_alphabet)
    q8_str_list = _decode(features[:, :, 22:31], q8_alphabet)

    train_df = pd.DataFrame({
        'id': np.arange(1, len(residue_str_list) + 1),
        'len': np.array([len(s) for s in residue_str_list]),
        'input': residue_str_list,
        'expected': q8_str_list,
    })
    return train_df, profile_padded


In [None]:
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.models import *
from keras.layers import *
from keras.optimizers import *
from sklearn.model_selection import train_test_split
from keras.metrics import categorical_accuracy
from keras import backend as K
import tensorflow as tf
from keras.callbacks import TensorBoard, LearningRateScheduler, ModelCheckpoint, ReduceLROnPlateau
from datetime import datetime
import os, pickle


# Optional: uncomment to pin the process to the first GPU.
# os.environ['CUDA_VISIBLE_DEVICES'] = '0'
from tensorflow.compat.v1 import ConfigProto 
from tensorflow.compat.v1 import InteractiveSession 
# Create a TF1-compat session with `allow_growth` enabled, i.e. ask
# TensorFlow to grow its GPU memory use on demand rather than reserving
# it all up front.
config = ConfigProto() 
config.gpu_options.allow_growth = True 
# The session object is kept in a module-level name; nothing below uses it
# directly.
session = InteractiveSession(config=config)


# Length every sequence is padded (or truncated) to before being fed to the model.
# NOTE(review): presumably chosen as a multiple of 16 so that the four
# pool-by-2 / upsample-by-2 stages of the model restore the original
# length -- confirm.
maxlen_seq = 768

# Paths of the test (cb513) and training (cullpdb 6133 filtered) feature arrays.
cb513filename ="../data/cb513+profile_split1.npy.gz"
cb6133filteredfilename = "../data/cullpdb+profile_6133_filtered.npy.gz"


# Each call returns a DataFrame (id, len, input, expected) plus the profile
# features padded along the sequence axis up to maxlen_seq.
train_df, train_augment_data = load_augmented_data(cb6133filteredfilename, maxlen_seq)
test_df, test_augment_data = load_augmented_data(cb513filename, maxlen_seq)

# Hold out 300 randomly chosen training samples for validation. The seed is
# fixed so the same split is drawn on every run.
n_samples = len(train_df)
np.random.seed(0)
validation_idx = np.random.choice(np.arange(n_samples), size=300, replace=False)
# Every index that was not drawn for validation is used for training.
training_idx = np.array(list(set(np.arange(n_samples))-set(validation_idx)))

val_df = train_df.iloc[validation_idx]

# Loading and converting the inputs to ngrams
# (n=1, so each "n-gram" is a single character of the input sequence)
train_input_seqs, train_target_seqs = train_df[['input', 'expected']].values.T
train_input_grams = seq2ngrams(train_input_seqs, n=1)

# Initializing and defining the tokenizer encoders and decoders based on the train set
# (both are fitted on the full training frame, validation rows included)
tokenizer_encoder = Tokenizer()
tokenizer_encoder.fit_on_texts(train_input_grams)
tokenizer_decoder = Tokenizer(char_level = True)
tokenizer_decoder.fit_on_texts(train_target_seqs)

# Using the tokenizer to encode and decode the sequences for use in training
# Inputs
train_input_data = tokenizer_encoder.texts_to_sequences(train_input_grams)
train_input_data = sequence.pad_sequences(train_input_data, maxlen = maxlen_seq, padding = 'post', truncating='post')

# Targets
# Integer-encode, pad at the end to maxlen_seq, then one-hot encode.
train_target_data = tokenizer_decoder.texts_to_sequences(train_target_seqs)
train_target_data = sequence.pad_sequences(train_target_data, maxlen = maxlen_seq, padding = 'post', truncating='post')
train_target_data = to_categorical(train_target_data)

# Computing the number of words and number of tags to be passed as parameters to the keras model
# (+ 1 makes room for index 0; presumably that is the padding id, since
# pad_sequences is called without an explicit padding value -- confirm)
n_words = len(tokenizer_encoder.word_index) + 1
n_tags = len(tokenizer_decoder.word_index) + 1

############################################
# Splitting the data for train and validation sets

X_val = train_input_data[validation_idx]
X_train = train_input_data[training_idx]
y_val = train_target_data[validation_idx]
y_train = train_target_data[training_idx]

# The profile features are split with the same indices so rows stay aligned.
X_train_augment = train_augment_data[training_idx]
X_val_augment = train_augment_data[validation_idx]

############################################
# Save the preprocessed validation/test data and both tokenizers in a
# per-run log directory, so predictions can be reproduced later.

# script_name = os.path.basename(__file__).split(".")[0]
script_name='script_name'
# A timestamp with one-second resolution makes the run directory name unique.
model_name = datetime.now().strftime("%Y%m%d-%H%M%S") + "-" + script_name
log_dir = '../logs/{}'.format(model_name)
# makedirs also creates a missing '../logs' parent, where os.mkdir would
# fail. It still raises if this run's directory already exists, so an earlier
# run is never silently overwritten.
os.makedirs(log_dir)

val_df.to_csv(os.path.join(log_dir, 'val_data.csv'))
np.save(os.path.join(log_dir, 'val_augment_data.npy'), X_val_augment)
test_df.to_csv(os.path.join(log_dir, 'test_data.csv'))
np.save(os.path.join(log_dir, 'test_augment_data.npy'), test_augment_data)

with open(os.path.join(log_dir, 'tokenizer_encoder.pickle'), 'wb') as handle:
    pickle.dump(tokenizer_encoder, handle)

with open(os.path.join(log_dir, 'tokenizer_decoder.pickle'), 'wb') as handle:
    pickle.dump(tokenizer_decoder, handle)

############################################
# Dropout to prevent overfitting. 
# This rate is passed to every conv_block call when the model is built below.
droprate = 0.3


def conv_block(x, n_channels, droprate):
    """Apply two BatchNorm -> ReLU -> Conv1D stages with dropout in between.

    Both convolutions use kernel size 3, 'same' padding and `n_channels`
    filters, so the sequence length of `x` is preserved.

    Args:
        x: input tensor.
        n_channels: number of filters of both convolutions.
        droprate: rate of the Dropout layer between the two stages.

    Returns:
        The output tensor of the second convolution.
    """
    def _bn_relu_conv(tensor):
        # Pre-activation ordering: normalise, activate, then convolve.
        tensor = BatchNormalization()(tensor)
        tensor = ReLU()(tensor)
        return Conv1D(n_channels, 3, padding='same', kernel_initializer='he_normal')(tensor)

    hidden = _bn_relu_conv(x)
    hidden = Dropout(droprate)(hidden)
    return _bn_relu_conv(hidden)

def up_block(x, n_channels):
    """BatchNorm -> ReLU -> upsample by 2 -> Conv1D (kernel size 2).

    Args:
        x: input tensor.
        n_channels: number of filters of the convolution.

    Returns:
        The output tensor of the convolution; the UpSampling1D layer doubles
        the sequence length and the 'same'-padded convolution keeps it.
    """
    normalized = BatchNormalization()(x)
    activated = ReLU()(normalized)
    upsampled = UpSampling1D(size=2)(activated)
    return Conv1D(n_channels, 2, padding='same', kernel_initializer='he_normal')(upsampled)

# Two inputs of variable length: the token ids of the sequence and the
# 22-feature profile for each position.
# NOTE(review): `input` shadows the Python builtin of the same name.
input = Input(shape = (None, ))
augment_input = Input(shape = (None, 22))

# Defining an embedding layer mapping from the words (n_words) to a vector of len 128
embed_input = Embedding(input_dim = n_words, output_dim = 128, input_length = None)(input)

# Concatenate embedding and profile along the feature axis (128 + 22 = 150
# features), then project back to 128 channels.
merged_input = concatenate([embed_input, augment_input], axis = 2)
merged_input = Conv1D(128, 3, padding = 'same', kernel_initializer = 'he_normal')(merged_input) 

# Encoder: four conv blocks, each followed by max pooling that halves the
# sequence length. The conv outputs are kept for the skip connections.
conv1 = conv_block(merged_input, 128, droprate)
pool1 = MaxPooling1D(pool_size=2)(conv1)

conv2 = conv_block(pool1, 192, droprate)
pool2 = MaxPooling1D(pool_size=2)(conv2)

conv3 = conv_block(pool2, 384, droprate)
pool3 = MaxPooling1D(pool_size=2)(conv3)

conv4 = conv_block(pool3, 768, droprate)
pool4 = MaxPooling1D(pool_size=2)(conv4)

# Bottleneck at 1/16 of the input length.
conv5 = conv_block(pool4, 1536, droprate)

# Decoder: each stage upsamples by 2, concatenates the encoder output of the
# same resolution along the feature axis, and applies a conv block.
up4 = up_block(conv5, 768)
up4 = concatenate([conv4,up4], axis = 2)
up4 = conv_block(up4, 768, droprate)

up3 = up_block(up4, 384)
up3 = concatenate([conv3,up3], axis = 2)
up3 = conv_block(up3, 384, droprate)

up2 = up_block(up3, 192)
up2 = concatenate([conv2,up2], axis = 2)
up2 = conv_block(up2, 192, droprate)

up1 = up_block(up2, 128)
up1 = concatenate([conv1,up1], axis = 2)
up1 = conv_block(up1, 128, droprate)

# conv_block ends on a convolution, so normalise and activate once more
# before the classifier.
up1 = BatchNormalization()(up1)
up1 = ReLU()(up1)

# Per-position classifier: a dense softmax over the n_tags classes applied at
# every position of the 128-channel decoder output.
# The following is equivalent to a Conv1D with kernel size 1.
y = TimeDistributed(Dense(n_tags, activation = "softmax"))(up1)


# Defining the model as a whole and printing the summary
model = Model([input, augment_input], y)
model.summary()

# RMSprop with an initial learning rate of 0.002.
# NOTE(review): newer Keras releases name this argument `learning_rate` and
# treat `lr` as a deprecated alias -- confirm against the installed version.
optim = RMSprop(lr=0.002)

def scheduler(i, lr):
    """Learning-rate schedule: halve the rate at epoch index 60.

    Args:
        i: epoch index supplied by the caller.
        lr: current learning rate.

    Returns:
        `lr * 0.5` when `i` equals 60, otherwise `lr` unchanged.
    """
    return lr * 0.5 if i == 60 else lr

# Apply `scheduler` at the start of every epoch; verbose=1 logs the rate that
# is set.
reduce_lr = LearningRateScheduler(schedule=scheduler, verbose=1)
# Alternative kept for reference: lower the rate when validation accuracy
# stops improving.
# reduce_lr = ReduceLROnPlateau(monitor='val_accuracy', factor=0.5,
#                             patience=8, min_lr=0.0005, verbose=1)

# Setting up the model with categorical x-entropy loss, reporting both the
# built-in accuracy and the custom masked accuracy (myaccuracy)
model.compile(optimizer = optim, loss = "categorical_crossentropy", metrics = ["accuracy", myaccuracy])

tensorboard = TensorBoard(log_dir=log_dir)

# Keep only the weights of the epoch with the highest 'val_accuracy'.
# NOTE(review): this monitors the built-in accuracy, which presumably also
# counts padded positions; 'val_myaccuracy' would track the masked metric --
# confirm which one is intended.
checkpoint = ModelCheckpoint(os.path.join(log_dir, "best_val_acc.h5"),
                            monitor='val_accuracy',
                            verbose=1,
                            save_best_only=True,
                            mode='max')

# Training the model on the training data and validating using the validation set
model.fit([X_train, X_train_augment], y_train, batch_size = 128, 
            validation_data = ([X_val, X_val_augment], y_val), verbose = 1,
            callbacks=[tensorboard, reduce_lr, checkpoint], 
            epochs = 90)

# Drop the training graph before reloading the best checkpoint.
K.clear_session()

# The checkpoint was saved from a model compiled with the custom metric
# `myaccuracy`. Keras can only rebuild that compile configuration if the
# function is supplied through `custom_objects`; without it, loading fails
# with an unknown-metric error.
model = load_model(os.path.join(log_dir, "best_val_acc.h5"),
                   custom_objects={"myaccuracy": myaccuracy})

# Predict on the held-out validation rows and score them against the expected
# labels; predictions, scores and the plot are written to log_dir.
val_pred_df = predict_all(model, val_df, tokenizer_encoder, tokenizer_decoder, n_gram=1,  max_len=maxlen_seq, 
                            augmented_input=X_val_augment,
                            filepath = os.path.join(log_dir, "val_pred_{}.csv".format(model_name)))
val_score, val_score_df = edit_score(val_df, val_pred_df,
                                    filepath = os.path.join(log_dir, "val_score_{}.csv".format(model_name)), plot=False)
plt.close()
K.clear_session()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None)]       0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, None, 128)    2816        input_1[0][0]                    
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 22)]   0                                            
__________________________________________________________________________________________________
concatenate (Concatenate)       (None, None, 150)    0           embedding[0][0]                  
                                                                 input_2[0][0]         


Epoch 00001: LearningRateScheduler reducing learning rate to 0.0020000000949949026.
Epoch 1/90
Instructions for updating:
use `tf.profiler.experimental.stop` instead.
Epoch 00001: val_accuracy improved from -inf to 0.79844, saving model to ../logs/20201224-191136-script_name\best_val_acc.h5

Epoch 00002: LearningRateScheduler reducing learning rate to 0.0020000000949949026.
Epoch 2/90
Epoch 00002: val_accuracy did not improve from 0.79844

Epoch 00003: LearningRateScheduler reducing learning rate to 0.0020000000949949026.
Epoch 3/90
Epoch 00003: val_accuracy did not improve from 0.79844

Epoch 00004: LearningRateScheduler reducing learning rate to 0.0020000000949949026.
Epoch 4/90
Epoch 00004: val_accuracy did not improve from 0.79844

Epoch 00005: LearningRateScheduler reducing learning rate to 0.0020000000949949026.
Epoch 5/90
Epoch 00005: val_accuracy improved from 0.79844 to 0.80254, saving model to ../logs/20201224-191136-script_name\best_val_acc.h5

Epoch 00006: LearningRateSche