Note: inspired by https://www.kaggle.com/code/helmehelmuto/secondary-structure-prediction-with-keras/notebook

### Import package

In [235]:
import pandas as pd
import numpy as np
import tensorflow as tf

tf.keras.utils.set_random_seed(812)

pd.set_option('future.no_silent_downcasting', True)
np.set_printoptions(suppress=True)


### Read in files

In [236]:
def read_protein_file(file_path):
    with open(file_path, "r") as file:
        # Read the entire contents of the file
        seq = ""
        str = ""

        sequences = []
        strings = []
        count = 0
        for line in file:
            if count < 8:
                pass
            count += 1

            line = line.strip()
            if line == "<>":
                seq = ""
                str = ""
            elif line == "<end>" or line=="end":
                sequences.append(seq)
                strings.append(str)
            else:
                letters = line.split(" ")
                if len(letters) == 2:
                    seq += letters[0]
                    str += letters[1] if letters[1] != "_" else "c"
        return sequences, strings


In [237]:
train_seq, train_str = read_protein_file("data/protein-secondary-structure.train")
test_seq, test_str = read_protein_file("data/protein-secondary-structure.test")

### Create sliding windows

In [238]:
def split_based_on_windows(data_seq, data_str, W=17):
    all_sequences = []
    all_strings = []
    for i in range(len(data_seq)):
        sequences = [data_seq[i][j:j+W] for j in range(0, len(data_seq[i]), W)]
        strings = [data_str[i][j:j+W] for j in range(0, len(data_str[i]), W)]

        all_sequences += sequences
        all_strings += strings

    train_df = pd.DataFrame({"sequence": all_sequences, "string": all_strings})
    return train_df


In [239]:
WINDOW_SIZE = 17

In [240]:
train_df = split_based_on_windows(train_seq, train_str, WINDOW_SIZE)
test_df = split_based_on_windows(test_seq, test_str, WINDOW_SIZE)

In [241]:
train_df.head()

Unnamed: 0,sequence,string
0,GVGTVPMTDYGNDVEYY,ccccccccccccccccc
1,GQVTIGTPGKSFNLNFD,cceecccccceecceee
2,TGSSNLWVGSVQCQASG,cccccceeccccccccc
3,CKGGRDKFNPSDGSTFK,ccccccccccccccccc
4,ATGYDASIGYGDGSASG,cccccccccccccccee


### TEST

In [242]:
from keras.preprocessing import text, sequence
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical

In [243]:
def preprocess_data(df):
    tokenizer_encoder = Tokenizer()
    tokenizer_encoder.fit_on_texts(df['sequence'])
    input_data = tokenizer_encoder.texts_to_sequences(df['sequence'])
    input_data = sequence.pad_sequences(input_data, maxlen=WINDOW_SIZE, padding='post')

    tokenizer_decoder = Tokenizer(char_level=True)
    tokenizer_decoder.fit_on_texts(df['string'])
    target_data = tokenizer_decoder.texts_to_sequences(df['string'])
    target_data = sequence.pad_sequences(target_data, maxlen=WINDOW_SIZE, padding='post')
    target_data = to_categorical(target_data)
    return input_data, target_data

In [244]:
X_train, y_train = preprocess_data(train_df)
X_test, y_test = preprocess_data(test_df)

### Create the neural network

In [245]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed, Embedding, Bidirectional
from tensorflow.keras import layers, Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras import backend  as K


In [246]:
n_words = len(tokenizer_encoder.word_index) + 1
n_tags = len(tokenizer_decoder.word_index) + 1

In [247]:
def q3_acc(y_true, y_pred):
    y = tf.argmax(y_true, axis=-1)
    y_ = tf.argmax(y_pred, axis=-1)
    mask = tf.greater(y, 0)
    return K.cast(K.equal(tf.boolean_mask(y, mask), tf.boolean_mask(y_, mask)), K.floatx())


In [248]:
def create_model():
    drop_out = 0.3

    # model = Sequential([
    # LSTM(128, input_shape=(WINDOW_SIZE, len(PROTEIN_LETTERS)), return_sequences=True),
    # layers.BatchNormalization(),
    # layers.Dropout(drop_out),
    # Dense(64, activation='tanh'),
    # layers.BatchNormalization(),
    # layers.Dropout(drop_out),
    # Dense(128, activation='linear'),
    # layers.BatchNormalization(),
    # layers.Dropout(drop_out),
    # Dense(64, activation='relu'),
    # layers.BatchNormalization(),
    # layers.Dropout(drop_out),
    # Dense(32, activation='relu'),
    # TimeDistributed(Dense(len(SECONDARY_LETTERS), activation='softmax')) # Apply Dense layer to each time step
    # ])

    model = Sequential([
    Embedding(input_dim=n_words, output_dim=128, input_length=WINDOW_SIZE),
    Bidirectional(LSTM(units=64, return_sequences=True, recurrent_dropout=0.1)),
    TimeDistributed(Dense(n_tags, activation="softmax"))
    ])
    model.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['accuracy', q3_acc])

    return model

In [249]:
model = create_model()
model.summary()

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 17, 128)           119296    
                                                                 
 bidirectional_6 (Bidirecti  (None, 17, 128)           98816     
 onal)                                                           
                                                                 
 time_distributed_6 (TimeDi  (None, 17, 4)             516       
 stributed)                                                      
                                                                 
Total params: 218628 (854.02 KB)
Trainable params: 218628 (854.02 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [250]:
model.fit(X_train, y_train, epochs=5, batch_size=8, validation_split=0.2)
loss, accuracy, q3_acc = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}, Test MAE: {q3_acc}')


Epoch 1/5


Epoch 2/5

KeyboardInterrupt: 

In [None]:
predictions = model.predict(X_test)

1/6 [====>.........................] - ETA: 0s



### Convert the prediction back

In [None]:
decoder = Tokenizer(char_level=True)
decoder.fit_on_texts(test_df['string'])
index = {value:key for key,value in decoder.word_index.items()}

In [None]:
def decode_back(oh_seq, index):
    return ''.join(index[np.argmax(o)] for o in oh_seq if np.argmax(o) != 0)

In [None]:
result = [decode_back(pred, index) for pred in predictions]

In [None]:
test_df['prediction'] = result

In [None]:
test_df

Unnamed: 0,sequence,string,prediction
0,ENLKLGFLVKQPEEPWF,cccccceeeccccccch,ccchhhhhhhhhhhhhh
1,QTEWKFADKAGKDLGFE,hhhhhhhhhhhhhcccc,ccchhhhhhhhhhhhhh
2,VIKIAVPDGEKTLNAID,cceeeccchhhhhhhhh,cchhhhhhhhhhhhhhh
3,SLAASGAKGFVICTPDP,hhhhccccccccccccc,ccccccccccccccccc
4,KLGSAIVAKARGYDMKV,cccchhhhhhhhhcccc,ccccccccccccccccc
...,...,...,...
185,VKLVSWYDNEFGYSERV,eeeecccchhhhhhhhh,ccccccccccccccccc
186,VDLMAHMASKE,hhhhhhhhhhc,ccccccccccccccccc
187,SIPPEVKFNKPFVFLMI,cccceeecccceeeeee,ccccccccccccccccc
188,EQNTKSPLFMGKVVNPT,ecccceeeeeeeecccc,ccccccccccccccccc
