### Import package

In [429]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

pd.set_option('future.no_silent_downcasting', True)

### Read in files

In [430]:
def read_protein_file(file_path):
    with open(file_path, "r") as file:
        # Read the entire contents of the file
        seq = ""
        str = ""

        sequences = []
        strings = []
        count = 0
        for line in file:
            if count < 8:
                pass
            count += 1

            line = line.strip()
            if line == "<>":
                seq = ""
                str = ""
            elif line == "<end>" or line=="end":
                sequences.append(seq)
                strings.append(str)
            else:
                letters = line.split(" ")
                if len(letters) == 2:
                    seq += letters[0]
                    str += letters[1] if letters[1] != "_" else "c"
        return sequences, strings


In [431]:
train_seq, train_str = read_protein_file("data/protein-secondary-structure.train")
test_seq, test_str = read_protein_file("data/protein-secondary-structure.test")

### NOTE TO SELF
Create sliding windows

Note. we need to split the sequences up, but since our test input is of different lengths, what do we do then?

When your test data contains protein sequences of different lengths compared to your training data, you need to handle this discrepancy appropriately. Here are a few strategies you can consider:

Padding:

Pad shorter sequences in the test data to match the length of the longest sequence in your training data.
You can pad the sequences with a special token (e.g., all zeros) so that the neural network can recognize them as padding.
During inference, you would need to trim the predictions to the original length of the test sequence.
Dynamic Input Shape:

Modify your neural network architecture to accept variable-length input sequences.
Use techniques like masking to handle variable-length sequences effectively.
Many deep learning frameworks, such as TensorFlow and PyTorch, support dynamic input shapes and masking.
Bucketing or Binning:

Group sequences of similar lengths together and pad each group separately to the maximum length within that group.
This approach reduces the amount of padding required, leading to more efficient training.
You can organize your test data into buckets or bins based on sequence lengths and process each bucket separately during evaluation.
Batch Processing:

While processing batches during testing, group sequences of similar lengths together.
This allows you to minimize padding within each batch, improving computational efficiency.

One hot encoding the sequence

Encoding Amino Acids:

First, you need to encode each amino acid into a numerical representation.
Common encoding methods include one-hot encoding and embedding.
In one-hot encoding, each amino acid is represented as a binary vector where only one element is 1 (indicating the presence of that amino acid).
In embedding, each amino acid is mapped to a low-dimensional vector space.
Padding:

After encoding amino acids, you can proceed with padding sequences as discussed earlier.
Pad the sequences to match the length of the longest sequence in your training data.

PERFORM NEURAL NETWORK??? use RNN or CNN (keras, tensorflow, pytorch)

https://www.pnas.org/doi/epdf/10.1073/pnas.86.1.152

sliding window = 17

### Create sliding windows

In [432]:
def split_based_on_windows(data_seq, data_str, W=17):
    all_sequences = []
    all_strings = []
    for i in range(len(data_seq)):
        sequences = [data_seq[i][j:j+W] for j in range(0, len(data_seq[i]), W)]
        strings = [data_str[i][j:j+W] for j in range(0, len(data_str[i]), W)]
        # PADDING
        if len(sequences[-1]) != W:
            sequences[-1] = sequences[-1] + "X"*((W-len(sequences[-1])))
        if len(strings[-1]) != W:
            strings[-1] = strings[-1] + "X"*((W-len(strings[-1])))

        all_sequences += sequences
        all_strings += strings

    train_df = pd.DataFrame({"sequence": all_sequences, "string": all_strings})
    return train_df


In [433]:
train_df = split_based_on_windows(train_seq, train_str)
test_df = split_based_on_windows(test_seq, test_str)

### One hot encode the data

In [434]:
def get_matrix_from_letter_cols(letters, df):
    all_columns = []
    for i in letters:
        columns = [str(j)+"_"+i for j in range(0, 17)]
        all_columns += columns

    template = pd.DataFrame(columns=all_columns, dtype=bool)
    unique_columns = template.columns.difference(df.columns)

    one_hot_encoded_full = pd.concat([df, template[unique_columns]], axis=1).fillna(False)
    ohe_df = one_hot_encoded_full.reindex(sorted(one_hot_encoded_full.columns), axis=1).astype(int)
    return ohe_df.values

In [435]:
def get_ohe_matrix(df):
    X_letters = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "X", "Y"]
    Y_letters = ['c', 'e', 'h', 'X']

    by_pos_df = df['sequence'].apply(lambda x:pd.Series(list(x)))
    ohe_df_X = pd.get_dummies(by_pos_df)
    ohe_X = get_matrix_from_letter_cols(X_letters, ohe_df_X)

    by_pos_df = df['string'].apply(lambda x:pd.Series(list(x)))
    ohe_df_Y = pd.get_dummies(by_pos_df)
    ohe_Y = get_matrix_from_letter_cols(Y_letters, ohe_df_Y)

    return ohe_X, ohe_Y

In [436]:
train_ohe_X, train_ohe_Y = get_ohe_matrix(train_df)
test_ohe_X, test_ohe_Y = get_ohe_matrix(test_df)

### Define the neural network

In [437]:
# Custom loss function
def custom_loss(y_true, y_pred):
    # Apply thresholding to the predicted values
    y_pred_binary = tf.cast(tf.greater_equal(y_pred, 0.5), tf.float32)
    
    # Binary cross-entropy loss for binary classification
    bce_loss = tf.keras.losses.binary_crossentropy(y_true, y_pred)
    
    # Penalty term for deviation from 17 ones
    ones_penalty = tf.reduce_sum(tf.abs(tf.reduce_sum(y_pred_binary, axis=1) - 17))  # Compute the absolute difference from 17 ones
    
    # Combine binary cross-entropy and penalty term
    total_loss = bce_loss + 1 * ones_penalty  # Adjust the penalty coefficient as needed
    
    return total_loss

In [438]:
# Custom constraint for the output layer
class OnesConstraint(tf.keras.constraints.Constraint):
    def __init__(self, target_ones):
        self.target_ones = target_ones

    def __call__(self, w):
        return w

    def get_config(self):
        return {'target_ones': self.target_ones}

In [446]:
model = models.Sequential([
    layers.Dense(units=train_ohe_Y.shape[1],input_shape=(train_ohe_X.shape[1],), activation="relu"),
    layers.Dense(train_ohe_Y.shape[1], activation='sigmoid', kernel_constraint=OnesConstraint(17))

])

# Compile the model
model.compile(loss=custom_loss, optimizer= "adam", metrics=['accuracy'])

# Train the model
model.fit(train_ohe_X, train_ohe_Y, epochs=100, batch_size=32, validation_split=0.2)
predictions = model.predict(test_ohe_X)
output = np.where(predictions > 0.5, 1, 0)
output

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

AttributeError: 'Sequential' object has no attribute 'predict_classes'

### Convert the prediction back

In [440]:
output

array([[0, 0, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       ...,
       [0, 1, 0, ..., 1, 0, 0],
       [0, 1, 0, ..., 0, 1, 0],
       [0, 1, 0, ..., 0, 0, 0]])

In [441]:
output.shape

(190, 68)

In [442]:
def convert_mat_back_to_seq(mat):
    letters = ['c', 'e', 'h', 'X']

    all_columns = []
    for i in letters:
        columns = [str(j)+"_"+i for j in range(0, 17)]
        all_columns += columns

    template = pd.DataFrame(mat, columns=all_columns, dtype=bool)

    strings = []
    for i in range(mat.shape[0]):
        string = ""
        for j in range(mat.shape[1]//len(letters)):
            for k in letters:
                if template[f"{j}_{k}"].iloc[i]:
                    string+=k
                    break
        strings.append(string)
    return strings

In [443]:
pred_sequences = convert_mat_back_to_seq(output)


In [444]:
test_df['predict'] = pred_sequences

In [445]:
test_df.head()

Unnamed: 0,sequence,string,predict
0,ENLKLGFLVKQPEEPWF,cccccceeeccccccch,ehceceecehXce
1,QTEWKFADKAGKDLGFE,hhhhhhhhhhhhhcccc,cXhcXhXhche
2,VIKIAVPDGEKTLNAID,cceeeccchhhhhhhhh,hXcXecXeXh
3,SLAASGAKGFVICTPDP,hhhhccccccccccccc,ececchecXhecXh
4,KLGSAIVAKARGYDMKV,cccchhhhhhhhhcccc,chchceXece
