### Import Statements

In [None]:
import tensorflow as tf
from keras import backend, callbacks
from keras.models import Model, load_model
from keras.layers import add, Add, BatchNormalization, Concatenate, Convolution1D, Dense, Dot, Dropout, Embedding, Input, Lambda, Layer, LayerNormalization, Permute, RepeatVector, Softmax, TimeDistributed
from keras.regularizers import l2
from tensorflow.keras.optimizers import Adam

import math
import numpy as np
import shutil
import sys
import os
import random

from tensorflow.python.client import device_lib
device_lib.list_local_devices()

### Prediction Pipeline Architecture Components

#### Inception Block

In [None]:
def inception_block(x, num_inception_filters=100, drop_rate=0.1):
    """Build a three-branch 1-D inception block on top of ``x``.

    The input is batch-normalised and fed to three parallel branches made of
    Conv1D -> Dropout -> BatchNormalization units with kernel sizes
    ``(1)``, ``(1, 3)`` and ``(1, 3, 3, 3)``. The branch outputs are
    concatenated and batch-normalised once more.

    Args:
        x: Input tensor accepted by ``Convolution1D``.
        num_inception_filters: Number of filters of every convolution.
        drop_rate: Dropout rate applied after every convolution.

    Returns:
        The batch-normalised concatenation of the three branch outputs.
    """
    def conv_unit(tensor, kernel_width):
        # One Conv1D -> Dropout -> BatchNormalization unit.
        tensor = Convolution1D(filters=num_inception_filters, kernel_size=kernel_width, activation="relu", padding="same", kernel_regularizer=l2(0.001))(tensor)
        tensor = Dropout(rate=drop_rate)(tensor)
        return BatchNormalization()(tensor)

    normalized_input = BatchNormalization()(x)

    # Kernel sizes of the consecutive units in each branch; branches are built
    # one after another so layers are created in a fixed order.
    branch_kernel_widths = ((1,), (1, 3), (1, 3, 3, 3))

    branch_outputs = []
    for kernel_widths in branch_kernel_widths:
        branch = normalized_input
        for kernel_width in kernel_widths:
            branch = conv_unit(branch, kernel_width)
        branch_outputs.append(branch)

    merged = Concatenate()(branch_outputs)
    return BatchNormalization()(merged)

#### Scaled Dot-product Submodule of Self-attention Module

In [None]:
def get_shape_list(x):
    """Return the shape of ``x`` as a list whose first entry is set to ``-1``.

    The shape is read with ``backend.int_shape`` unless the active backend
    is Theano, in which case ``x.shape`` is used directly.
    """
    raw_shape = x.shape if backend.backend() == "theano" else backend.int_shape(x)

    dims = list(raw_shape)
    dims[0] = -1  # first (presumably batch) dimension is reported as -1
    return dims

def scaled_dot_product(activations, attn_mask):
    """Apply single-head scaled dot-product self-attention to ``activations``.

    Queries, keys and values are bias-free per-timestep linear projections of
    ``activations`` (each followed by dropout). The query/key scores are
    divided by the square root of the value feature size, ``attn_mask`` is
    repeated along the query axis and added to the scores, and a softmax over
    the last axis (followed by dropout) yields the attention weights.

    Args:
        activations: Rank-3 tensor; its last dimension (``shape[2]``) must be
            statically known because it sets the projection width.
        attn_mask: Additive mask, one value per key position (assumed to be
            0 for real residues and a large negative value for padding —
            TODO confirm against the data loader).

    Returns:
        The attention-weighted values.
    """
    dropout_rate = 0.1
    feature_dim = int(activations.shape[2])

    def project(tensor):
        # Bias-free linear projection applied independently at every timestep.
        projected = TimeDistributed(Dense(units=feature_dim, activation=None, use_bias=False))(tensor)
        return Dropout(rate=dropout_rate)(projected)

    queries = project(activations)
    keys = project(activations)
    values = project(activations)

    scores = Dot(axes=-1, normalize=False)([queries, keys])
    # Scale by sqrt(d), where d is the static last dimension of the values.
    scores = Lambda(lambda pair: pair[0] / backend.sqrt(backend.cast(get_shape_list(pair[1])[-1], "float32")))([scores, values])

    # Repeat the per-key mask once per query row so it can be added to the scores.
    tiled_mask = RepeatVector(n=scores.shape[1])(attn_mask)
    scores = Add()([scores, tiled_mask])
    weights = Softmax(axis=-1)(scores)
    weights = Dropout(rate=dropout_rate)(weights)

    # Dot contracts the last axes, so the values are transposed first.
    values_transposed = Permute(dims=[2, 1])(values)
    return Dot(axes=-1, normalize=False)([weights, values_transposed])

#### Self-attention Module

In [None]:
class CustomLayer(Layer):
    """Blend two tensors with a single trainable scalar weight.

    The layer is called on a pair ``[output_after_attention,
    previous_layer_input]`` and returns
    ``w * output_after_attention + (1 - w) * previous_layer_input``,
    where ``w`` is a scalar variable initialised to 0.2.
    """

    def __init__(self, **kwargs):
        """Forward ``kwargs`` to ``Layer`` and defer creating the weight to ``build``."""
        super().__init__(**kwargs)
        # Placeholder for the blending weight; the variable is created in build().
        self._x = None
    
    def build(self, input_shape):
        """Create the scalar blending weight and register it as trainable."""
        self._x = backend.variable(0.2)
        # The variable is registered by hand through private attributes instead of add_weight().
        self._x._trainable = True
        self._trainable_weights = [self._x]
        super().build(input_shape)

    def call(self, x, **kwargs):
        """Return the weighted blend of the two tensors in ``x``."""
        output_after_attention, previous_layer_input = x
        result = add([self._x * output_after_attention, (1 - self._x) * previous_layer_input])
        return result

    def compute_output_shape(self, input_shape):
        """Return the shape of the first input as the output shape."""
        return input_shape[0]

def get_position_encoding(protein_len: int, d_emb: int) -> np.ndarray:
    """Return a sinusoidal position-encoding table.

    For every position ``pos >= 1`` and channel ``j`` the angle is
    ``pos / 10000 ** (2 * (j // 2) / d_emb)``; even channels hold its sine and
    odd channels its cosine. Row 0 is deliberately left all-zero (it is not
    passed through sin/cos).

    Args:
        protein_len: Number of positions (rows) in the table.
        d_emb: Encoding width (columns).

    Returns:
        A ``float32`` array of shape ``(protein_len, d_emb)``. Empty tables
        (``protein_len == 0`` or ``d_emb == 0``) are returned with that shape
        instead of raising.
    """
    position_encoding = np.zeros((protein_len, d_emb), dtype=np.float32)
    if protein_len <= 1 or d_emb <= 0:
        # Nothing beyond the all-zero row 0 to fill in.
        return position_encoding

    positions = np.arange(1, protein_len, dtype=np.float64)[:, np.newaxis]
    channels = np.arange(d_emb)
    divisors = np.power(10000, 2 * (channels // 2) / d_emb)

    # Angles are computed in float64 and rounded to float32 before sin/cos.
    angles = (positions / divisors).astype(np.float32)

    position_encoding[1:, 0::2] = np.sin(angles[:, 0::2])
    position_encoding[1:, 1::2] = np.cos(angles[:, 1::2])
    return position_encoding

def get_position_embedding(position_ids, output_dim=50):
    """Look up frozen sinusoidal position embeddings for ``position_ids``.

    A non-trainable ``Embedding`` with 700 rows, initialised from
    ``get_position_encoding``, is applied to ``position_ids``; the result goes
    through dropout (rate 0.1) and layer normalisation.

    Args:
        position_ids: Tensor of integer position indices.
        output_dim: Embedding width; converted with ``int``.

    Returns:
        The layer-normalised position embeddings.
    """
    max_positions = 700
    embedding_dim = int(output_dim)

    dropout = Dropout(rate=0.1)
    lookup = Embedding(max_positions, embedding_dim, trainable=False, input_length=max_positions, weights=[get_position_encoding(max_positions, embedding_dim)])

    embedded = dropout(lookup(position_ids))
    return LayerNormalization(epsilon=1e-5)(embedded)

def self_attention_module(x, attention_mask, position_ids=None, drop_rate=0.1, use_attention=True):
    """Optionally refine ``x`` with a self-attention sub-block.

    When ``use_attention`` is false, ``x`` is returned untouched. Otherwise
    position embeddings are added (if ``position_ids`` is given), scaled
    dot-product attention is applied, and ``CustomLayer`` blends the attention
    output with the attention input; dropout and batch normalisation follow.

    Args:
        x: Input features; the last dimension must be statically known.
        attention_mask: Additive mask forwarded to ``scaled_dot_product``.
        position_ids: Optional position indices for the position embedding.
        drop_rate: Dropout rate used after the attention and after the blend.
        use_attention: Switch that disables the whole module when false.

    Returns:
        The refined features, or ``x`` itself when attention is disabled.
    """
    if not use_attention:
        return x

    feature_dim = int(x.shape[-1])
    features = x

    if position_ids is not None:
        # The position embedding is built with the same width as the features.
        positional = get_position_embedding(position_ids=position_ids, output_dim=feature_dim)
        features = Add()([features, positional])

    attended = scaled_dot_product(activations=features, attn_mask=attention_mask)
    attended = Dropout(rate=drop_rate)(attended)

    blended = CustomLayer()([attended, features])
    blended = Dropout(rate=drop_rate)(blended)
    return BatchNormalization()(blended)

#### 2A3I Module

In [None]:
def residual_connection(previous_layer_feature, current_layer_feature, num_residual_filters=300, use_skip_connection=False):
    """Optionally add a skip connection around a layer.

    With ``use_skip_connection`` enabled, ``current_layer_feature`` is passed
    through a kernel-size-1 convolution with ``num_residual_filters`` filters
    and added to ``previous_layer_feature``. Otherwise
    ``current_layer_feature`` is returned unchanged.
    """
    if use_skip_connection:
        projected = Convolution1D(filters=num_residual_filters, kernel_size=1, activation="relu", padding="same", kernel_regularizer=l2(0.001))(current_layer_feature)
        return Add()([previous_layer_feature, projected])

    return current_layer_feature

def deep_2a3i_module(x, attention_mask, position_ids=None, do_batchnorm=True, num_residual_filters=300, num_inception_filters=100, inception_drop_rate=0.1, use_skip_connection=False, use_attention=True):
    """Build one 2A3I module: three inception towers of depth 1, 2 and 4.

    Each tower chains its inception blocks starting from ``x`` (every block
    wrapped by ``residual_connection``), then applies ``self_attention_module``
    (also wrapped by ``residual_connection``). The three tower outputs are
    concatenated and, if ``do_batchnorm`` is set, batch-normalised.

    Args:
        x: Input features shared by all towers.
        attention_mask: Mask forwarded to the attention modules.
        position_ids: Optional position indices forwarded to the attention modules.
        do_batchnorm: Whether to batch-normalise the concatenated output.
        num_residual_filters: Filter count of the skip-connection projections.
        num_inception_filters: Filter count inside the inception blocks.
        inception_drop_rate: Dropout rate inside the inception blocks.
        use_skip_connection: Enables the residual additions.
        use_attention: Enables the attention modules.

    Returns:
        The concatenated (and optionally normalised) tower outputs.
    """
    def skip(previous, current):
        return residual_connection(previous, current, num_residual_filters=num_residual_filters, use_skip_connection=use_skip_connection)

    tower_outputs = []
    # Towers are built sequentially (depth 1, then 2, then 4) so that layers
    # are created in a fixed order.
    for depth in (1, 2, 4):
        features = x
        for _ in range(depth):
            inception_out = inception_block(features, num_inception_filters=num_inception_filters, drop_rate=inception_drop_rate)
            features = skip(features, inception_out)

        attended = self_attention_module(features, attention_mask, position_ids, use_attention=use_attention)
        tower_outputs.append(skip(features, attended))

    merged = Concatenate()(tower_outputs)
    return BatchNormalization()(merged) if do_batchnorm else merged

#### Core Architecture

In [None]:
def _collect_input_features(main_input, args):
    """Slice ``main_input`` into its feature groups and merge them again.

    The enabled embeddings (ProtTrans, ESM1b, ProtTransBFD, in that order)
    occupy consecutive channel ranges. A group is passed through a
    kernel-size-7 convolution unless its configured filter count equals its
    own width. Any channels left after the embeddings are appended unchanged.
    """
    embedding_sources = (
        ("ProtTrans", "num_prottrans_filters"),
        ("ESM1b", "num_esm1b_filters"),
        ("ProtTransBFD", "num_prottransBFD_filters"),
    )

    collected = []
    end_index = 0

    for feature_name, filters_key in embedding_sources:
        spec = args["features"][feature_name]
        if not spec["add_feature"]:
            continue

        start_index, end_index = end_index, end_index + spec["feature_len"]
        group = main_input[:, :, start_index:end_index]

        if args[filters_key] != spec["feature_len"]:
            group = Convolution1D(filters=args[filters_key], kernel_size=7, activation="relu", padding="same", kernel_regularizer=l2(0.001))(group)
        collected.append(group)

    if end_index < args["num_features"]:
        # Remaining channels are used as-is.
        collected.append(main_input[:, :, end_index:])

    if not collected:
        raise ValueError("No input features selected: enable at least one feature or set args['num_features'] > 0.")

    if len(collected) > 1:
        return tf.keras.layers.concatenate(collected, axis=-1)
    return collected[0]


def get_model(args):
    """Build the two-headed prediction model described by ``args``.

    Inputs are ``main_input`` (700 x ``num_features``), ``attention_mask``
    (700) and ``position_ids`` (700, int32). The trunk is one ("Basic1",
    "Residual1") or two ("Basic2", "Residual2") 2A3I modules followed by
    attention, a kernel-size-11 convolution and attention again. Two heads
    produce ``Q8_output`` (8-way softmax) and ``Phi_Psi_output`` (4 tanh
    units). The "Residual*" variants wrap every stage with
    ``residual_connection`` and size the head dense layers with
    ``num_residual_filters`` instead of 256.

    Args:
        args: Mapping with the keys ``architecture``, ``num_features``,
            ``features``, ``num_residual_filters``, ``num_inception_filters``,
            ``inception_drop_rate``, ``use_skip_connection``,
            ``use_attention`` and, for every enabled embedding, its
            ``num_*_filters`` entry.

    Returns:
        An uncompiled ``Model`` with outputs ``[Q8_output, Phi_Psi_output]``.

    Raises:
        ValueError: If ``args["architecture"]`` is not supported, or if no
            input feature is selected.
    """
    architecture = args["architecture"]
    supported_architectures = ("Basic1", "Basic2", "Residual1", "Residual2")
    if architecture not in supported_architectures:
        # Fail early: otherwise the output heads below would reference
        # tensors that were never created.
        raise ValueError(f"Unsupported architecture {architecture!r}; expected one of {supported_architectures}.")

    is_residual = architecture in ("Residual1", "Residual2")
    is_two_module = architecture in ("Basic2", "Residual2")

    drop_rate = 0.4
    dense_units = args["num_residual_filters"] if is_residual else 256

    main_input = Input(shape=(700, args["num_features"]), name="main_input")
    attention_mask = Input(shape=(700,), name="attention_mask")
    position_ids = Input(batch_shape=(None, 700), name="position_ids", dtype="int32")

    input_features = _collect_input_features(main_input, args)

    def module(tensor):
        return deep_2a3i_module(tensor, attention_mask, position_ids, num_residual_filters=args["num_residual_filters"], num_inception_filters=args["num_inception_filters"], inception_drop_rate=args["inception_drop_rate"], use_skip_connection=args["use_skip_connection"], use_attention=args["use_attention"])

    def attend(tensor):
        return self_attention_module(tensor, attention_mask, position_ids, use_attention=args["use_attention"])

    def residual(previous, current):
        # Only the "Residual*" architectures wrap their stages.
        if not is_residual:
            return current
        return residual_connection(previous, current, num_residual_filters=args["num_residual_filters"], use_skip_connection=args["use_skip_connection"])

    def head(tensor):
        # Dense -> Dropout -> attention; shared recipe for both output heads.
        hidden = TimeDistributed(Dense(units=dense_units, activation="relu"))(tensor)
        hidden = Dropout(rate=drop_rate)(hidden)
        return residual(hidden, attend(hidden))

    output_2a3i = residual(input_features, module(input_features))
    if is_two_module:
        output_2a3i = residual(output_2a3i, module(output_2a3i))

    output_2a3i_attention = residual(output_2a3i, attend(output_2a3i))

    conv11 = Convolution1D(filters=100, kernel_size=11, activation="relu", padding="same", kernel_regularizer=l2(0.001))(output_2a3i_attention)
    conv11 = residual(output_2a3i_attention, conv11)
    conv11_attention = residual(conv11, attend(conv11))

    q8_features = head(conv11_attention)
    phi_psi_features = head(conv11_attention)

    Q8_output = TimeDistributed(Dense(units=8, activation="softmax"), name="Q8_output")(q8_features)
    Phi_Psi_output = TimeDistributed(Dense(units=4, activation="tanh"), name="Phi_Psi_output")(phi_psi_features)

    return Model(inputs=[main_input, attention_mask, position_ids], outputs=[Q8_output, Phi_Psi_output])

### Performance Evaluation Methods

#### Secondary Structure 8-state (Q8) Prediction

In [None]:
def custom_categorical_cross_entropy(y_true, y_predicted):
    """Return the masked categorical cross-entropy summed over the batch.

    The per-residue mask is the sum of ``y_true`` over axis 2, so rows of
    ``y_true`` that are all zero (presumably padding) contribute nothing.
    ``sys.float_info.epsilon`` is added inside the logarithm to avoid
    ``log(0)``. The result is a sum, not a mean.
    """
    residue_mask = backend.sum(y_true, axis=2)

    log_probabilities = backend.log(y_predicted + sys.float_info.epsilon)
    per_residue = backend.sum(y_true * log_probabilities, axis=2)
    per_protein = backend.sum(per_residue * residue_mask, axis=1)

    return -1 * backend.sum(per_protein, axis=0)

def average_accuracy(y_true, y_predicted):
    """Return the mean over the batch of each sample's masked accuracy.

    A residue counts only where ``y_true`` summed over axis 2 is non-zero.
    Accuracy is computed per sample (axis 1) and then averaged over axis 0.
    """
    def to_labels(scores):
        return backend.cast(backend.argmax(scores, axis=2), "int8")

    residue_mask = backend.sum(y_true, axis=2)

    hits = backend.cast(backend.equal(to_labels(y_true), to_labels(y_predicted)), "float32")
    correct_per_protein = backend.sum(hits * residue_mask, axis=1)
    residues_per_protein = backend.sum(residue_mask, axis=1)

    return backend.mean(correct_per_protein / residues_per_protein, axis=0)

def total_accuracy(y_true, y_predicted):
    """Return the masked accuracy pooled over every residue in the batch.

    Correct predictions and counted residues are both summed over the whole
    batch before dividing, so longer samples weigh more than shorter ones.
    """
    def to_labels(scores):
        return backend.cast(backend.argmax(scores, axis=2), "int8")

    residue_mask = backend.sum(y_true, axis=2)

    hits = backend.cast(backend.equal(to_labels(y_true), to_labels(y_predicted)), "float32")
    correct_per_protein = backend.sum(hits * residue_mask, axis=1)
    residues_per_protein = backend.sum(residue_mask, axis=1)

    return backend.sum(correct_per_protein, axis=0) / backend.sum(residues_per_protein, axis=0)

def total_correct_prediction(y_true, y_predicted):
    """Return the number of correctly predicted (masked) residues in the batch.

    A residue counts only where ``y_true`` summed over axis 2 is non-zero.
    """
    def to_labels(scores):
        return backend.cast(backend.argmax(scores, axis=2), "int8")

    residue_mask = backend.sum(y_true, axis=2)

    hits = backend.cast(backend.equal(to_labels(y_true), to_labels(y_predicted)), "float32")
    correct_per_protein = backend.sum(hits * residue_mask, axis=1)

    return backend.sum(correct_per_protein, axis=0)

#### Backbone Torsion φ and ψ Angles Prediction

**Performance Metrics:-**
- `tse` -> *Total Squared Error (TSE)*
- `mse` -> *Mean Squared Error (MSE)*
- `mae` -> *Mean Absolute Error (MAE)*
- `sae` -> *Sum of Absolute Errors (SAE)* = *MAE* × `total_residue_count`

In [None]:
def total_tse(y_true, y_predicted):
    """Return the total squared error summed over the four output channels.

    Channels 0-3 are named phi-sine, phi-cosine, psi-sine and psi-cosine in
    this file. Positions whose ``y_true[:, :, 0]`` equals ``-500`` are masked
    out of both the targets and the predictions.
    """
    valid = 1 - backend.cast(backend.equal(y_true[:, :, 0], -500), dtype="float32")

    def channel_squared_error(channel):
        # Masked squared error of one channel, summed over the whole batch.
        difference = backend.abs(y_true[:, :, channel] * valid - y_predicted[:, :, channel] * valid)
        return backend.sum(backend.square(difference))

    phi_sine, phi_cosine, psi_sine, psi_cosine = [channel_squared_error(channel) for channel in range(4)]

    return phi_sine + phi_cosine + psi_sine + psi_cosine

def mean_tse(y_true, y_predicted):
    """Return the total squared error averaged over the four output channels.

    Identical to the sum of the per-channel squared errors multiplied by
    0.25. Positions whose ``y_true[:, :, 0]`` equals ``-500`` are masked out.
    """
    valid = 1 - backend.cast(backend.equal(y_true[:, :, 0], -500), dtype="float32")

    def channel_squared_error(channel):
        # Masked squared error of one channel, summed over the whole batch.
        difference = backend.abs(y_true[:, :, channel] * valid - y_predicted[:, :, channel] * valid)
        return backend.sum(backend.square(difference))

    phi_sine, phi_cosine, psi_sine, psi_cosine = [channel_squared_error(channel) for channel in range(4)]

    return 0.25 * (phi_sine + phi_cosine + psi_sine + psi_cosine)

def total_mse(y_true, y_predicted):
    """Return the sum of the four per-channel mean squared errors.

    Each channel's squared error is summed over the batch and divided by the
    number of unmasked positions (those whose ``y_true[:, :, 0]`` is not
    ``-500``); the four channel means are then added together.
    """
    valid = 1 - backend.cast(backend.equal(y_true[:, :, 0], -500), dtype="float32")
    valid_count = backend.sum(valid)

    def channel_mse(channel):
        # Masked squared error of one channel, normalised by the valid count.
        difference = backend.abs(y_true[:, :, channel] * valid - y_predicted[:, :, channel] * valid)
        return backend.sum(backend.square(difference)) / valid_count

    phi_sine, phi_cosine, psi_sine, psi_cosine = [channel_mse(channel) for channel in range(4)]

    return phi_sine + phi_cosine + psi_sine + psi_cosine

def mean_mse(y_true, y_predicted):
    """Return the average of the four per-channel mean squared errors.

    Each channel's squared error is summed over the batch and divided by the
    number of unmasked positions (those whose ``y_true[:, :, 0]`` is not
    ``-500``); the four channel means are then averaged.
    """
    valid = 1 - backend.cast(backend.equal(y_true[:, :, 0], -500), dtype="float32")
    valid_count = backend.sum(valid)

    def channel_mse(channel):
        # Masked squared error of one channel, normalised by the valid count.
        difference = backend.abs(y_true[:, :, channel] * valid - y_predicted[:, :, channel] * valid)
        return backend.sum(backend.square(difference)) / valid_count

    phi_sine, phi_cosine, psi_sine, psi_cosine = [channel_mse(channel) for channel in range(4)]

    return 0.25 * (phi_sine + phi_cosine + psi_sine + psi_cosine)

def mean_mae(y_true, y_predicted):
    """Return the mean of the phi and psi mean absolute angular errors (degrees).

    Angles are recovered with ``atan2(sine, cosine)`` from channels (0, 1)
    for phi and (2, 3) for psi. The absolute difference ``d`` is wrapped to
    ``360 - d`` whenever it exceeds 180. Positions whose ``y_true[:, :, 0]``
    equals ``-500`` are masked out, and each error sum is divided by the
    number of unmasked positions.
    """
    valid = 1 - backend.cast(backend.equal(y_true[:, :, 0], -500), dtype="float32")
    total_residue_count = backend.sum(valid)

    def masked_angular_error_sum(sine_channel, cosine_channel):
        # Sum of wrapped absolute angle differences over unmasked positions.
        true_angle = tf.atan2(y_true[:, :, sine_channel], y_true[:, :, cosine_channel]) * 180 / np.pi
        pred_angle = tf.atan2(y_predicted[:, :, sine_channel], y_predicted[:, :, cosine_channel]) * 180 / np.pi

        difference = backend.abs(true_angle - pred_angle)
        # Plain tensor arithmetic; no Keras Lambda layer is needed inside a metric.
        wraps = backend.cast(backend.greater(difference, 180), dtype="float32")
        error = difference * (1 - wraps) + (360 - difference) * wraps
        return backend.sum(error * valid)

    phi_error = masked_angular_error_sum(0, 1) / total_residue_count
    psi_error = masked_angular_error_sum(2, 3) / total_residue_count

    return 0.5 * (phi_error + psi_error)

def phi_mae(y_true, y_predicted):
    """Return the mean absolute angular error of phi in degrees.

    Phi is recovered with ``atan2`` from channels 0 (sine) and 1 (cosine).
    The absolute difference ``d`` is wrapped to ``360 - d`` whenever it
    exceeds 180. Positions whose ``y_true[:, :, 0]`` equals ``-500`` are
    masked out, and the error sum is divided by the number of unmasked
    positions.
    """
    true_angle = tf.atan2(y_true[:, :, 0], y_true[:, :, 1]) * 180 / np.pi
    pred_angle = tf.atan2(y_predicted[:, :, 0], y_predicted[:, :, 1]) * 180 / np.pi

    valid = 1 - backend.cast(backend.equal(y_true[:, :, 0], -500), dtype="float32")
    total_residue_count = backend.sum(valid)

    difference = backend.abs(true_angle - pred_angle)
    # Plain tensor arithmetic; no Keras Lambda layer is needed inside a metric.
    wraps = backend.cast(backend.greater(difference, 180), dtype="float32")
    error = difference * (1 - wraps) + (360 - difference) * wraps

    return backend.sum(error * valid) / total_residue_count

def psi_mae(y_true, y_predicted):
    """Return the mean absolute angular error of psi in degrees.

    Psi is recovered with ``atan2`` from channels 2 (sine) and 3 (cosine).
    The absolute difference ``d`` is wrapped to ``360 - d`` whenever it
    exceeds 180. Positions whose ``y_true[:, :, 0]`` equals ``-500`` are
    masked out, and the error sum is divided by the number of unmasked
    positions.
    """
    true_angle = tf.atan2(y_true[:, :, 2], y_true[:, :, 3]) * 180 / np.pi
    pred_angle = tf.atan2(y_predicted[:, :, 2], y_predicted[:, :, 3]) * 180 / np.pi

    valid = 1 - backend.cast(backend.equal(y_true[:, :, 0], -500), dtype="float32")
    total_residue_count = backend.sum(valid)

    difference = backend.abs(true_angle - pred_angle)
    # Plain tensor arithmetic; no Keras Lambda layer is needed inside a metric.
    wraps = backend.cast(backend.greater(difference, 180), dtype="float32")
    error = difference * (1 - wraps) + (360 - difference) * wraps

    return backend.sum(error * valid) / total_residue_count

def mean_sae(y_true, y_predicted):
    """Return the mean of the phi and psi sums of absolute angular errors (degrees).

    Angles are recovered with ``atan2(sine, cosine)`` from channels (0, 1)
    for phi and (2, 3) for psi. The absolute difference ``d`` is wrapped to
    ``360 - d`` whenever it exceeds 180. Positions whose ``y_true[:, :, 0]``
    equals ``-500`` are masked out. Errors are summed, not averaged, over
    positions.
    """
    valid = 1 - backend.cast(backend.equal(y_true[:, :, 0], -500), dtype="float32")

    def masked_angular_error_sum(sine_channel, cosine_channel):
        # Sum of wrapped absolute angle differences over unmasked positions.
        true_angle = tf.atan2(y_true[:, :, sine_channel], y_true[:, :, cosine_channel]) * 180 / np.pi
        pred_angle = tf.atan2(y_predicted[:, :, sine_channel], y_predicted[:, :, cosine_channel]) * 180 / np.pi

        difference = backend.abs(true_angle - pred_angle)
        # Plain tensor arithmetic; no Keras Lambda layer is needed inside a metric.
        wraps = backend.cast(backend.greater(difference, 180), dtype="float32")
        error = difference * (1 - wraps) + (360 - difference) * wraps
        return backend.sum(error * valid)

    phi_error_sum = masked_angular_error_sum(0, 1)
    psi_error_sum = masked_angular_error_sum(2, 3)

    return 0.5 * (phi_error_sum + psi_error_sum)

def phi_sae(y_true, y_predicted):
    """Return the sum of absolute angular errors of phi in degrees.

    Phi is recovered with ``atan2`` from channels 0 (sine) and 1 (cosine).
    The absolute difference ``d`` is wrapped to ``360 - d`` whenever it
    exceeds 180. Positions whose ``y_true[:, :, 0]`` equals ``-500`` are
    masked out.
    """
    true_angle = tf.atan2(y_true[:, :, 0], y_true[:, :, 1]) * 180 / np.pi
    pred_angle = tf.atan2(y_predicted[:, :, 0], y_predicted[:, :, 1]) * 180 / np.pi

    valid = 1 - backend.cast(backend.equal(y_true[:, :, 0], -500), dtype="float32")

    difference = backend.abs(true_angle - pred_angle)
    # Plain tensor arithmetic; no Keras Lambda layer is needed inside a metric.
    wraps = backend.cast(backend.greater(difference, 180), dtype="float32")
    error = difference * (1 - wraps) + (360 - difference) * wraps

    return backend.sum(error * valid)

def psi_sae(y_true, y_predicted):
    """Return the sum of absolute angular errors of psi in degrees.

    Psi is recovered with ``atan2`` from channels 2 (sine) and 3 (cosine).
    The absolute difference ``d`` is wrapped to ``360 - d`` whenever it
    exceeds 180. Positions whose ``y_true[:, :, 0]`` equals ``-500`` are
    masked out.
    """
    true_angle = tf.atan2(y_true[:, :, 2], y_true[:, :, 3]) * 180 / np.pi
    pred_angle = tf.atan2(y_predicted[:, :, 2], y_predicted[:, :, 3]) * 180 / np.pi

    valid = 1 - backend.cast(backend.equal(y_true[:, :, 0], -500), dtype="float32")

    difference = backend.abs(true_angle - pred_angle)
    # Plain tensor arithmetic; no Keras Lambda layer is needed inside a metric.
    wraps = backend.cast(backend.greater(difference, 180), dtype="float32")
    error = difference * (1 - wraps) + (360 - difference) * wraps

    return backend.sum(error * valid)

### `CustomDataLoader` Class Definition

In [None]:
class CustomDataLoader(tf.keras.utils.Sequence):
    """Batch generator that reads per-protein ``.npy`` files from disk.

    The protein list comes from ``<dataset_path>/<dataset_name>_below_700_proteins.txt``,
    one ``name,length`` pair per line. Every sample is padded to 700
    positions. Each batch is a tuple ``(inputs, targets, weight_mask)``.
    """

    def __init__(self, dataset_path, dataset_name, features, num_features, batch_size, shuffle):
        """Read the protein list and optionally shuffle it.

        Args:
            dataset_path: Directory holding the list file and the ``Rawdata`` folder.
            dataset_name: Prefix of the protein list file.
            features: Mapping of feature name to a spec with at least the
                keys ``add_feature`` and ``extension``.
            num_features: Total channel count of ``main_input``.
            batch_size: Number of proteins per batch.
            shuffle: Whether to shuffle the protein order after every epoch.
        """
        super().__init__()
        self.dataset_path, self.dataset_name = dataset_path, dataset_name
        self.features, self.num_features = features, num_features
        self.batch_size, self.shuffle = batch_size, shuffle
        self.proteins_dict, self.protein_names = None, None

        with open(dataset_path + os.sep + dataset_name + "_below_700_proteins.txt", 'r') as proteins_file:
            entries = [line.split(',') for line in proteins_file.read().split('\n') if line != '']

        # protein name -> sequence length
        self.proteins_dict = {fields[0]: int(fields[1]) for fields in entries}
        self.protein_names = list(self.proteins_dict.keys())

        self.on_epoch_end()

    def on_epoch_end(self):
        """Shuffle the protein order in place when shuffling is enabled."""
        if self.shuffle:
            random.shuffle(self.protein_names)

    def __len__(self):
        """Return the number of batches per epoch (the last one may be smaller)."""
        return math.ceil(len(self.protein_names) / self.batch_size)

    def __getitem__(self, index):
        """Load batch ``index``.

        Returns:
            A tuple ``(inputs, targets, weight_mask)``. ``inputs`` holds
            ``main_input``, ``attention_mask`` (``-inf`` on padding, 0
            elsewhere) and ``position_ids`` (0..699). ``targets`` holds
            ``Q8_output`` and ``Phi_Psi_output`` (sin/cos of phi and psi,
            ``-500`` where undefined). ``weight_mask`` is 1 on real residues
            and 0 on padding.
        """
        batch_protein_names = self.protein_names[index * self.batch_size:(index + 1) * self.batch_size]
        batch_len = len(batch_protein_names)

        main_input = np.zeros(shape=(batch_len, 700, self.num_features))
        attention_mask = np.zeros(shape=(batch_len, 700))
        position_ids = np.zeros(shape=(batch_len, 700))
        weight_mask = np.ones(shape=(batch_len, 700))
        Q8_labels = np.zeros(shape=(batch_len, 700, 8))
        phi_psi_labels = np.full(shape=(batch_len, 700, 4), fill_value=-500, dtype=np.float32)

        # Every row carries the same position ids 0..699.
        position_ids[:] = np.arange(700)

        for batch_index, protein_name in enumerate(batch_protein_names):
            length = self.proteins_dict[protein_name]
            data_path = self.dataset_path + os.sep + "Rawdata" + os.sep + protein_name + os.sep + protein_name

            protein_features = []
            for feature_spec in self.features.values():
                if feature_spec["add_feature"]:
                    with open(data_path + '_' + feature_spec["extension"] + ".npy", 'rb') as feature_file:
                        protein_features.append(np.nan_to_num(np.load(file=feature_file), nan=0.0))

            main_input[batch_index, :length] = np.concatenate(protein_features, axis=-1)
            attention_mask[batch_index, length:] = -np.inf
            weight_mask[batch_index, length:] = 0

            with open(data_path + "_ss8.npy", 'rb') as label_file:
                Q8_labels[batch_index, :length] = np.load(file=label_file)

            with open(data_path + "_phi.npy", 'rb') as label_file:
                # Positional shape argument: the ``newshape`` keyword is deprecated in newer NumPy.
                phi_angles = np.reshape(np.load(file=label_file), (-1,))

            with open(data_path + "_psi.npy", 'rb') as label_file:
                psi_angles = np.reshape(np.load(file=label_file), (-1,))

            phi_radians, psi_radians = phi_angles * np.pi / 180, psi_angles * np.pi / 180
            phi_psi_labels[batch_index, :length, 0] = np.sin(phi_radians)
            phi_psi_labels[batch_index, :length, 1] = np.cos(phi_radians)
            phi_psi_labels[batch_index, :length, 2] = np.sin(psi_radians)
            phi_psi_labels[batch_index, :length, 3] = np.cos(psi_radians)

            # Residues whose phi or psi is the -500 sentinel keep the sentinel in all four channels.
            phi_psi_labels[batch_index, np.where(phi_angles == -500)[0], :] = -500
            phi_psi_labels[batch_index, np.where(psi_angles == -500)[0], :] = -500

            # The first and the last residue are always excluded from the angle targets.
            phi_psi_labels[batch_index, 0, :] = -500
            phi_psi_labels[batch_index, length - 1, :] = -500

        return {"main_input": main_input, "attention_mask": attention_mask, "position_ids": position_ids}, {"Q8_output": Q8_labels, "Phi_Psi_output": phi_psi_labels}, weight_mask

### Callbacks

#### `CustomStepLR` Class Definition

In [None]:
class CustomStepLR(callbacks.Callback):
    """Step-wise learning-rate schedule that also prints the epoch's metrics.

    Every ``step_size`` epochs the optimizer's learning rate is multiplied by
    ``factor``. The result is clamped so it never falls below ``min_lr``.

    Args:
        step_size: Number of epochs between two consecutive reductions.
        factor: Multiplier applied to the learning rate at each reduction.
        min_lr: Lower bound on the learning rate.
    """

    def __init__(self, step_size, factor, min_lr):
        super().__init__()
        self.step_size, self.current_step, self.factor, self.min_lr = step_size, 0, factor, min_lr

    @staticmethod
    def _print_epoch_summary(epoch, logs):
        """Print the training and validation metrics logged for ``epoch``."""
        # Validation metrics use the same key names with a "val_" prefix.
        for title, prefix in (("Training", ""), ("Validation", "val_")):
            print(f"{title} Epoch {epoch + 1}:-\nAvg Accuracy: {logs[prefix + 'Q8_output_average_accuracy']}")
            print(f"Total Accuracy: {logs[prefix + 'Q8_output_total_accuracy']}")
            print(f"Avg Total Correct Prediction: {logs[prefix + 'Q8_output_total_correct_prediction']}")
            print(f"Mean MAE: {logs[prefix + 'Phi_Psi_output_mean_mae']}\nPhi MAE: {logs[prefix + 'Phi_Psi_output_phi_mae']}")
            print(f"Psi MAE: {logs[prefix + 'Phi_Psi_output_psi_mae']}\n")

    def on_epoch_end(self, epoch, logs=None):
        """Report the epoch's metrics and reduce the learning rate when a step is due.

        Args:
            epoch: Zero-based epoch index (printed as ``epoch + 1``).
            logs: Metric dictionary; it must contain the ``Q8_output_*`` and
                ``Phi_Psi_output_*`` entries printed here together with their
                ``val_`` counterparts.

        Raises:
            KeyError: If one of the printed metrics is missing from ``logs``.
        """
        self._print_epoch_summary(epoch, logs)

        self.current_step = self.current_step + 1

        if self.current_step == self.step_size:
            self.current_step = 0
            current_lr = float(backend.get_value(self.model.optimizer.lr))

            if current_lr > self.min_lr:
                # Clamp the new rate: without it a reduction could undershoot
                # the configured floor (e.g. 1.5e-8 * 0.5 with min_lr=1e-8).
                reduced_lr = max(current_lr * self.factor, self.min_lr)
                backend.set_value(self.model.optimizer.lr, reduced_lr)

                print(f"Reducing current learning rate to {reduced_lr} from {current_lr}...")
            else:
                print(f"Learning rate reached {self.min_lr}, can not be reduced any further...")

        print(f"\nEpoch {epoch + 1}: Learning rate is {float(backend.get_value(self.model.optimizer.lr))}...\n")

#### `CustomReduceLROnPlateau` Class Definition

In [None]:
class CustomReduceLROnPlateau(callbacks.Callback):
    """Reduce the learning rate when a monitored metric stops improving.

    After every epoch the train/validation metrics are printed and the
    monitored value is compared with the best value seen so far. An epoch
    without improvement (epoch 0 excluded) consumes one unit of patience; once
    the patience is 0, the next such epoch multiplies the learning rate by
    ``factor`` (clamped at ``min_lr``) and restores the patience.

    Args:
        monitor_metric: Key in ``logs`` holding the monitored value.
        monitor_mode: ``"min"`` if lower is better, ``"max"`` if higher is better.
        monitor_value: Initial best value; ``None`` selects ``+inf`` for
            ``"min"`` and ``-inf`` for ``"max"``.
        patience: Number of non-improving epochs tolerated before a reduction.
        factor: Multiplier applied to the learning rate at each reduction.
        min_lr: Lower bound on the learning rate.
    """

    def __init__(self, monitor_metric, monitor_mode, monitor_value, patience, factor, min_lr):
        super().__init__()
        self.monitor_metric, self.monitor_mode = monitor_metric, monitor_mode

        if monitor_value is not None:
            self.monitor_value = monitor_value
        elif monitor_mode == "min":
            self.monitor_value = np.inf
        elif monitor_mode == "max":
            self.monitor_value = -np.inf
        else:
            # Unknown mode without an explicit start value: rejected in on_epoch_end.
            self.monitor_value = None

        self.init_patience, self.current_patience, self.factor, self.min_lr = patience, patience, factor, min_lr

    @staticmethod
    def _print_epoch_summary(epoch, logs):
        """Print the training and validation metrics logged for ``epoch``."""
        # Validation metrics use the same key names with a "val_" prefix.
        for title, prefix in (("Training", ""), ("Validation", "val_")):
            print(f"{title} Epoch {epoch + 1}:-\nAvg Accuracy: {logs[prefix + 'Q8_output_average_accuracy']}")
            print(f"Total Accuracy: {logs[prefix + 'Q8_output_total_accuracy']}")
            print(f"Avg Total Correct Prediction: {logs[prefix + 'Q8_output_total_correct_prediction']}")
            print(f"Mean MAE: {logs[prefix + 'Phi_Psi_output_mean_mae']}\nPhi MAE: {logs[prefix + 'Phi_Psi_output_phi_mae']}")
            print(f"Psi MAE: {logs[prefix + 'Phi_Psi_output_psi_mae']}\n")

    def _consume_patience_or_reduce_lr(self):
        """Spend one unit of patience, or reduce the learning rate when none is left."""
        if self.current_patience > 0:
            self.current_patience = self.current_patience - 1
        elif self.current_patience == 0:
            current_lr = float(backend.get_value(self.model.optimizer.lr))

            if current_lr > self.min_lr:
                # Clamp so a reduction can never undershoot the configured floor.
                reduced_lr = max(current_lr * self.factor, self.min_lr)
                backend.set_value(self.model.optimizer.lr, reduced_lr)
                self.current_patience = self.init_patience

                print(f"Reducing current learning rate to {reduced_lr} from {current_lr}...")
            else:
                print(f"Learning rate reached {self.min_lr}, can not be reduced any further...")

    def on_epoch_end(self, epoch, logs=None):
        """Report the epoch's metrics and update the best value, patience and learning rate.

        Args:
            epoch: Zero-based epoch index (printed as ``epoch + 1``).
            logs: Metric dictionary; it must contain the printed metrics and
                ``monitor_metric``.

        Raises:
            AssertionError: If no start value could be derived because
                ``monitor_mode`` is neither ``"min"`` nor ``"max"``.
            KeyError: If a required metric is missing from ``logs``.
        """
        self._print_epoch_summary(epoch, logs)

        assert self.monitor_value is not None

        if self.monitor_mode in ("min", "max"):
            current_value = logs[self.monitor_metric]

            if self.monitor_mode == "min":
                improved, verb = current_value < self.monitor_value, "decrease"
            else:
                improved, verb = current_value > self.monitor_value, "increase"

            if improved:
                print(f"{self.monitor_metric} {verb}d from {self.monitor_value} to {current_value}...")

                self.monitor_value = current_value
                self.current_patience = self.init_patience
            else:
                print(f"{self.monitor_metric} did not {verb} from {self.monitor_value}...")

                # The very first epoch never counts against the patience budget.
                if epoch > 0:
                    self._consume_patience_or_reduce_lr()

        print(f"\nEpoch {epoch + 1}: Learning rate is {float(backend.get_value(self.model.optimizer.lr))}...\n")

#### `LegacyReduceLROnPlateau` Class Definition

In [None]:
class LegacyReduceLROnPlateau(callbacks.Callback):
    """Plateau scheduler on ``val_Phi_Psi_output_mean_mae`` that also reports "legacy" metrics.

    In addition to the metrics logged for the epoch, per-residue "legacy"
    values are derived from the logged ``*_total_correct_prediction`` and
    ``*_sae`` entries and the dataset sizes in ``data_args``.

    An epoch without improvement (epoch 0 excluded) consumes one unit of
    patience; once the patience is 0, the next such epoch multiplies the
    learning rate by ``factor`` (clamped at ``min_lr``) and restores the
    patience.

    Args:
        monitor_value: Initial best validation mean MAE; ``None`` means ``+inf``.
        patience: Number of non-improving epochs tolerated before a reduction.
        factor: Multiplier applied to the learning rate at each reduction.
        min_lr: Lower bound on the learning rate.
        data_args: Dictionary with ``num_{train,val}_batches``,
            ``num_{train,val}_residues`` and ``num_{train,val}_proteins``.
    """

    def __init__(self, monitor_value, patience, factor, min_lr, data_args):
        super().__init__()
        self.monitor_value = monitor_value if monitor_value is not None else np.inf
        self.init_patience, self.current_patience, self.factor, self.min_lr = patience, patience, factor, min_lr
        self.data_args = data_args

    def _split_summary(self, logs, log_prefix, split):
        """Collect the logged and derived metrics of one split (``"train"`` or ``"val"``)."""
        num_batches = self.data_args[f"num_{split}_batches"]
        num_residues = self.data_args[f"num_{split}_residues"]
        # Two residues per protein are left out of the angle denominators.
        # NOTE(review): presumably because the data loader marks the first and
        # last residue of every protein with -500 in the angle labels — confirm.
        num_angle_residues = num_residues - 2 * self.data_args[f"num_{split}_proteins"]

        q8_prefix = log_prefix + "Q8_output_"
        angle_prefix = log_prefix + "Phi_Psi_output_"
        total_correct = logs[q8_prefix + "total_correct_prediction"]

        # NOTE(review): the "legacy" values assume the logged
        # total_correct_prediction / *_sae entries are per-batch sums averaged
        # over the batches, so that value * num_batches is the epoch total —
        # confirm against the metric definitions.
        return {
            "avg_accuracy": logs[q8_prefix + "average_accuracy"],
            "total_accuracy": logs[q8_prefix + "total_accuracy"],
            "total_correct": total_correct,
            "legacy_accuracy": total_correct * num_batches / num_residues,
            "mean_mae": logs[angle_prefix + "mean_mae"],
            "phi_mae": logs[angle_prefix + "phi_mae"],
            "psi_mae": logs[angle_prefix + "psi_mae"],
            "legacy_mean_mae": logs[angle_prefix + "mean_sae"] * num_batches / num_angle_residues,
            "legacy_phi_mae": logs[angle_prefix + "phi_sae"] * num_batches / num_angle_residues,
            "legacy_psi_mae": logs[angle_prefix + "psi_sae"] * num_batches / num_angle_residues,
        }

    @staticmethod
    def _print_split_summary(title, epoch, summary):
        """Print one split's summary as produced by ``_split_summary``."""
        print(f"{title} Epoch {epoch + 1}:-\nAvg Accuracy: {summary['avg_accuracy']}\nTotal Accuracy: {summary['total_accuracy']}")
        print(f"Avg Total Correct Prediction: {summary['total_correct']}\nLegacy Accuracy: {summary['legacy_accuracy']}")
        print(f"Mean MAE: {summary['mean_mae']}\nPhi MAE: {summary['phi_mae']}\nPsi MAE: {summary['psi_mae']}")
        print(f"Legacy Mean MAE: {summary['legacy_mean_mae']}\nLegacy Phi MAE: {summary['legacy_phi_mae']}\nLegacy Psi MAE: {summary['legacy_psi_mae']}\n")

    def on_epoch_end(self, epoch, logs=None):
        """Report the epoch's metrics and update the best value, patience and learning rate.

        Args:
            epoch: Zero-based epoch index (printed as ``epoch + 1``).
            logs: Metric dictionary; it must contain the ``Q8_output_*`` and
                ``Phi_Psi_output_*`` entries read here and their ``val_``
                counterparts.

        Raises:
            KeyError: If a required entry is missing from ``logs`` or ``data_args``.
        """
        # Gather both splits before printing anything, so a missing key fails
        # before partial output is produced.
        train_summary = self._split_summary(logs, "", "train")
        val_summary = self._split_summary(logs, "val_", "val")

        self._print_split_summary("Training", epoch, train_summary)
        self._print_split_summary("Validation", epoch, val_summary)

        current_value = logs["val_Phi_Psi_output_mean_mae"]

        if current_value < self.monitor_value:
            print(f"val_Phi_Psi_output_mean_mae decreased from {self.monitor_value} to {current_value}...")

            self.monitor_value = current_value
            self.current_patience = self.init_patience
        else:
            print(f"val_Phi_Psi_output_mean_mae did not decrease from {self.monitor_value}...")

            # The very first epoch never counts against the patience budget.
            if epoch > 0 and self.current_patience > 0:
                self.current_patience = self.current_patience - 1
            elif epoch > 0 and self.current_patience == 0:
                current_lr = float(backend.get_value(self.model.optimizer.lr))

                if current_lr > self.min_lr:
                    # Clamp so a reduction can never undershoot the configured floor.
                    reduced_lr = max(current_lr * self.factor, self.min_lr)
                    backend.set_value(self.model.optimizer.lr, reduced_lr)
                    self.current_patience = self.init_patience

                    print(f"Reducing current learning rate to {reduced_lr} from {current_lr}...")
                else:
                    print(f"Learning rate reached {self.min_lr}, can not be reduced any further...")

        print(f"\nEpoch {epoch + 1}: Learning rate is {float(backend.get_value(self.model.optimizer.lr))}...\n")

#### `ModelBackup` Class Definition

In [None]:
class ModelBackup(callbacks.Callback):
    """Copy the best-model file to a backup path whenever the monitored metric improves.

    The callback does not save the model itself: it copies the file at
    ``best_model_path``, so that file must already have been written for the
    current epoch (in this notebook the callback is listed after the
    ``ModelCheckpoint`` that writes it).

    Args:
        monitor_metric: Key in ``logs`` holding the monitored value.
        monitor_mode: ``"min"`` if lower is better, ``"max"`` if higher is better.
        monitor_value: Initial best value; ``None`` selects ``+inf`` for
            ``"min"`` and ``-inf`` for ``"max"``.
        best_model_path: File that is copied on improvement.
        backup_model_path: Destination of the copy.
    """

    def __init__(self, monitor_metric, monitor_mode, monitor_value, best_model_path, backup_model_path):
        super().__init__()
        self.monitor_metric, self.monitor_mode = monitor_metric, monitor_mode

        if monitor_value is not None:
            self.monitor_value = monitor_value
        elif monitor_mode == "min":
            self.monitor_value = np.inf
        elif monitor_mode == "max":
            self.monitor_value = -np.inf
        else:
            # Unknown mode without an explicit start value: rejected in on_epoch_end.
            self.monitor_value = None

        self.best_model_path, self.backup_model_path = best_model_path, backup_model_path

    def on_epoch_end(self, epoch, logs=None):
        """Back up the best-model file if the monitored metric improved this epoch.

        Args:
            epoch: Zero-based epoch index (unused).
            logs: Metric dictionary; it must contain ``monitor_metric``.

        Raises:
            AssertionError: If no start value could be derived because
                ``monitor_mode`` is neither ``"min"`` nor ``"max"``.
            KeyError: If ``monitor_metric`` is missing from ``logs``.
            FileNotFoundError: If ``best_model_path`` does not exist.
        """
        assert self.monitor_value is not None

        if self.monitor_mode == "min":
            improved = logs[self.monitor_metric] < self.monitor_value
        elif self.monitor_mode == "max":
            improved = logs[self.monitor_metric] > self.monitor_value
        else:
            # Any other mode is a no-op.
            return

        if improved:
            self.monitor_value = logs[self.monitor_metric]

            # Make sure the destination directory exists; otherwise the copy
            # would abort training after a full epoch of work.
            backup_dir = os.path.dirname(self.backup_model_path)
            if backup_dir:
                os.makedirs(backup_dir, exist_ok=True)

            shutil.copyfile(src=self.best_model_path, dst=self.backup_model_path)

### Training Pipeline Arguments and Hyperparameters Configuration

In [None]:
# Central configuration of the training pipeline. Entries initialised to None
# are placeholders that are filled in by the statements following this literal.
# The whole dictionary is later written to "model_args_path" via str(args).
args = {
    # Key into `architecture_configs`, which supplies "inception_drop_rate"
    # and "use_skip_connection".
    "architecture": "Basic1", 
    
    # Per-feature switches: only entries with add_feature=True contribute their
    # feature_len to "num_features".
    # NOTE(review): "extension" is presumably the feature-file suffix used by
    # the data loader — confirm against CustomDataLoader.
    "features": {
        "ProtTrans": {"add_feature": False, "feature_len": 1024, "extension": "prottrans"}, 
        "ESM1b": {"add_feature": False, "feature_len": 1280, "extension": "esm"}, 
        "ProtTransBFD": {"add_feature": True, "feature_len": 1024, "extension": "bfd"}, 
        "PSSM": {"add_feature": False, "feature_len": 20, "extension": "pssm"}, 
        "HHM": {"add_feature": False, "feature_len": 30, "extension": "hhm"}, 
        "PCP": {"add_feature": False, "feature_len": 7, "extension": "pcp"}, 
        "AA": {"add_feature": False, "feature_len": 20, "extension": "aa"}, 
        "PSP": {"add_feature": False, "feature_len": 19, "extension": "psp"}, 
        "Win10": {"add_feature": False, "feature_len": 16, "extension": "win10"}, 
        "Win20": {"add_feature": False, "feature_len": 36, "extension": "win20"}, 
        "Win50": {"add_feature": False, "feature_len": 96, "extension": "win50"}
    }, 
    # Placeholder: sum of feature_len over the enabled features.
    "num_features": None, 
    
    # Model hyperparameters.
    # NOTE(review): presumably consumed by get_model(args) — confirm.
    "num_prottrans_filters": 1024, 
    "num_esm1b_filters": 1280, 
    "num_prottransBFD_filters": 1024, 
    "num_residual_filters": 300, 
    "num_inception_filters": 100, 
    # Placeholders: taken from architecture_configs[args["architecture"]].
    "inception_drop_rate": None, 
    "use_skip_connection": None, 
    "use_attention": True, 
    
    # Data loading: batch sizes and dataset directories for the two loaders.
    "train_batch_size": 16, 
    "validation_batch_size": 32, 
    "train_set_path": "../datasets/SPOT-1D-Single/Features/Training", 
    "validation_set_path": "../datasets/SPOT-1D-Single/Features/Validation", 
    
    # Base name of the output files; the two paths below are placeholders
    # derived from it.
    "model_name": "SAINT_Martin_Single_Basic1_BFD1024", 
    "best_model_path": None, 
    "backup_model_path": None, 
    
    # Optimisation schedule. "patience" doubles as the step size when
    # "lr_scheduler" is "CustomStepLR".
    "num_epochs": 100, 
    "init_lr": 1e-3, 
    "patience": 5, 
    "factor": 0.5, 
    "min_lr": 1e-8, 
    
    # "lr_scheduler" is one of "CustomStepLR", "CustomReduceLROnPlateau" or
    # "LegacyReduceLROnPlateau". The monitor settings are used by the
    # checkpoint/backup callbacks and by CustomReduceLROnPlateau.
    "lr_scheduler": "LegacyReduceLROnPlateau", 
    "monitor_metric": "val_Phi_Psi_output_mean_mae", 
    "monitor_mode": "min", 
    
    # Weights of the two output losses passed to model.compile.
    "Q8_loss_weight": 0.5, 
    "Phi_Psi_loss_weight": 0.5, 
    
    # Retraining: when "do_retrain" is True the model is loaded from
    # "best_model_path", the optimizer starts at "retrain_from_lr" and fit()
    # resumes at "retrain_from_epoch".
    # NOTE(review): "retrain_from_monitor_value" is not read anywhere in the
    # visible part of this file — confirm whether it should seed the callbacks.
    "do_retrain": False, 
    "retrain_from_lr": 1e-3, 
    "retrain_from_epoch": 0, 
    "retrain_from_monitor_value": None, 
    
    # Placeholder: path of the text file that receives str(args).
    "model_args_path": None
}

# Architecture-specific hyperparameters, selected through args["architecture"].
architecture_configs = dict(
    Basic1=dict(inception_drop_rate=0.1, use_skip_connection=False),
    Basic2=dict(inception_drop_rate=0.1, use_skip_connection=False),
    Residual1=dict(inception_drop_rate=0.2, use_skip_connection=True),
    Residual2=dict(inception_drop_rate=0.2, use_skip_connection=True),
)

# Fill in the placeholders of `args` that depend on other entries.

# Total input width: every enabled feature contributes its feature_len.
args["num_features"] = sum(
    int(feature_spec["add_feature"]) * feature_spec["feature_len"]
    for feature_spec in args["features"].values()
)

# Copy the two architecture-dependent settings of the selected architecture.
args.update({
    setting: architecture_configs[args["architecture"]][setting]
    for setting in ("inception_drop_rate", "use_skip_connection")
})

# Output locations, all derived from the model name.
args["best_model_path"] = f"../Best_Model{os.sep}{args['model_name']}.h5"
args["backup_model_path"] = f"../Backup_Model{os.sep}{args['model_name']}.h5"
args["model_args_path"] = f"../Best_Model{os.sep}{args['model_name']}_args.txt"

# Name -> object table passed to `load_model(..., custom_objects=...)` when
# retraining. It lists the custom layer, helper, loss and metric objects of
# this notebook under their own names.
custom_objects = {
    "CustomLayer": CustomLayer, 
    "backend": backend, 
    "get_shape_list": get_shape_list, 
    "custom_categorical_cross_entropy": custom_categorical_cross_entropy, 
    "average_accuracy": average_accuracy, 
    "total_accuracy": total_accuracy, 
    "total_correct_prediction": total_correct_prediction, 
    "total_tse": total_tse, 
    "mean_tse": mean_tse, 
    "total_mse": total_mse, 
    "mean_mse": mean_mse, 
    "mean_mae": mean_mae, 
    "phi_mae": phi_mae, 
    "psi_mae": psi_mae, 
    "mean_sae": mean_sae, 
    "phi_sae": phi_sae, 
    "psi_sae": psi_sae
}

# Snapshot the fully resolved configuration as text at args["model_args_path"].
# The target directory is created first so that a missing output directory
# does not make open() fail; "or os.curdir" covers a path without a directory
# component.
os.makedirs(os.path.dirname(args["model_args_path"]) or os.curdir, exist_ok=True)

with open(args["model_args_path"], 'w') as args_file:
    args_file.write(str(args))

### Getting Started

#### Instantiating DataLoaders for Training and Validation

In [None]:
# Both loaders read the same feature set; they differ only in dataset location,
# dataset name, batch size and whether shuffling is enabled.
shared_loader_kwargs = {
    "features": args["features"],
    "num_features": args["num_features"],
}

# Training loader: shuffling enabled.
train_dataloader = CustomDataLoader(
    dataset_name="Training",
    dataset_path=args["train_set_path"],
    batch_size=args["train_batch_size"],
    shuffle=True,
    **shared_loader_kwargs,
)

# Validation loader: shuffling disabled.
validation_dataloader = CustomDataLoader(
    dataset_name="Validation",
    dataset_path=args["validation_set_path"],
    batch_size=args["validation_batch_size"],
    shuffle=False,
    **shared_loader_kwargs,
)

#### Preparing Model and Setting up Training Pipeline

In [None]:
# When retraining, reload the saved best model and start the optimizer at the
# retrain learning rate; otherwise build a new model with the initial rate.
model = (
    load_model(args["best_model_path"], custom_objects=custom_objects)
    if args["do_retrain"]
    else get_model(args)
)
optimizer = Adam(
    learning_rate=args["retrain_from_lr"] if args["do_retrain"] else args["init_lr"]
)

# Two-headed model: every per-output setting is keyed by the output name.
model.compile(
    optimizer=optimizer,
    loss=dict(
        Q8_output=custom_categorical_cross_entropy,
        Phi_Psi_output=total_tse,
    ),
    loss_weights=dict(
        Q8_output=args["Q8_loss_weight"],
        Phi_Psi_output=args["Phi_Psi_loss_weight"],
    ),
    metrics=dict(
        Q8_output=[average_accuracy, total_accuracy, total_correct_prediction],
        Phi_Psi_output=[mean_mae, phi_mae, psi_mae, mean_sae, phi_sae, psi_sae],
    ),
    sample_weight_mode="temporal",
)

# Save the model to the best-model path only when the monitored metric improves.
model_checkpoint = callbacks.ModelCheckpoint(
    filepath=args["best_model_path"],
    save_best_only=True,
    monitor=args["monitor_metric"],
    mode=args["monitor_mode"],
    verbose=1,
)

# Mirror the best-model file to the backup path, tracking the same metric and
# starting from the default best value (monitor_value=None).
model_backup = ModelBackup(
    best_model_path=args["best_model_path"],
    backup_model_path=args["backup_model_path"],
    monitor_metric=args["monitor_metric"],
    monitor_mode=args["monitor_mode"],
    monitor_value=None,
)

def _read_protein_stats(proteins_file_path):
    """Return ``(num_proteins, num_residues)`` for a protein listing file.

    Every non-empty line is split on commas; the number of such lines is the
    protein count and the second field of each line, parsed as an integer, is
    summed into the residue count.
    """
    with open(proteins_file_path, 'r') as proteins_file:
        rows = [row.split(',') for row in proteins_file.read().split('\n') if row != '']

    return len(rows), sum(int(row[1]) for row in rows)


# Build the learning-rate scheduler selected by args["lr_scheduler"].
# NOTE(review): monitor_value is always None here, so
# args["retrain_from_monitor_value"] is not used when retraining — confirm
# whether that is intended.
if args["lr_scheduler"] == "CustomStepLR":
    # The step scheduler reuses "patience" as its step size.
    lr_scheduler = CustomStepLR(step_size=args["patience"], factor=args["factor"], min_lr=args["min_lr"])
elif args["lr_scheduler"] == "CustomReduceLROnPlateau":
    lr_scheduler = CustomReduceLROnPlateau(
        monitor_metric=args["monitor_metric"],
        monitor_mode=args["monitor_mode"],
        monitor_value=None,
        patience=args["patience"],
        factor=args["factor"],
        min_lr=args["min_lr"]
    )
elif args["lr_scheduler"] == "LegacyReduceLROnPlateau":
    # The legacy scheduler needs the dataset sizes to derive its per-residue metrics.
    num_train_proteins, num_train_residues = _read_protein_stats(
        os.path.join(args["train_set_path"], "Training_below_700_proteins.txt")
    )
    num_val_proteins, num_val_residues = _read_protein_stats(
        os.path.join(args["validation_set_path"], "Validation_below_700_proteins.txt")
    )

    data_args = {
        "num_train_batches": len(train_dataloader),
        "num_train_proteins": num_train_proteins,
        "num_train_residues": num_train_residues,
        "num_val_batches": len(validation_dataloader),
        "num_val_proteins": num_val_proteins,
        "num_val_residues": num_val_residues,
    }

    lr_scheduler = LegacyReduceLROnPlateau(
        monitor_value=None,
        patience=args["patience"],
        factor=args["factor"],
        min_lr=args["min_lr"],
        data_args=data_args
    )
else:
    # Fail here with a clear message instead of a NameError on `lr_scheduler`
    # when the training cell runs.
    raise ValueError(
        f"Unknown lr_scheduler {args['lr_scheduler']!r}; expected 'CustomStepLR', "
        "'CustomReduceLROnPlateau' or 'LegacyReduceLROnPlateau'."
    )

#### Training Model

In [None]:
# Run training. Callback order matters: the checkpoint must write the
# best-model file before the backup callback copies it.
model.fit(
    train_dataloader,
    validation_data=validation_dataloader,
    epochs=args["num_epochs"],
    # Resume the epoch counter only when retraining.
    initial_epoch=args["retrain_from_epoch"] if args["do_retrain"] else 0,
    callbacks=[model_checkpoint, lr_scheduler, model_backup],
    verbose=1,
    workers=1,
)