### Import packages

In [120]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LSTM, TimeDistributed
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Opt in to the pandas 'future.no_silent_downcasting' behaviour for the whole notebook.
pd.set_option('future.no_silent_downcasting', True)

### Read in files

In [121]:
def read_protein_file(file_path):
    """Parse a protein secondary-structure file into parallel lists of strings.

    The file is processed line by line (surrounding whitespace is ignored):

    * ``<>`` starts a new protein and discards anything accumulated so far,
      which also throws away header/comment lines before the first record.
    * ``<end>`` or ``end`` closes the current protein and stores it.
    * Any other line is used only when it holds exactly two
      whitespace-separated tokens: the residue and its structure label.
      A label of ``_`` is stored as ``c``.

    Args:
        file_path: Path of the data file to read.

    Returns:
        A ``(sequences, structures)`` tuple of two equally long lists of
        strings; ``structures[i]`` holds one label per residue of
        ``sequences[i]``. A record that is never closed by a terminator line
        is not returned, and a terminator that closes nothing (for example a
        repeated ``end``) adds no entry.
    """
    sequences = []
    structures = []
    residues = []
    labels = []

    with open(file_path, "r") as file:
        for raw_line in file:
            line = raw_line.strip()
            if line == "<>":
                residues, labels = [], []
            elif line in ("<end>", "end"):
                if residues:
                    sequences.append("".join(residues))
                    structures.append("".join(labels))
                # Clear the buffers so a second terminator, or stray lines
                # before the next "<>", cannot re-use this protein's data.
                residues, labels = [], []
            else:
                # split() without an argument also accepts tabs and runs of
                # spaces between the residue and its label.
                tokens = line.split()
                if len(tokens) == 2:
                    residue, label = tokens
                    residues.append(residue)
                    labels.append("c" if label == "_" else label)

    return sequences, structures


In [122]:
# Parse the training and test files; each call returns parallel lists of
# residue strings and structure-label strings, one entry per protein.
train_seq, train_str = read_protein_file("data/protein-secondary-structure.train")
test_seq, test_str = read_protein_file("data/protein-secondary-structure.test")

### Split the sequences into fixed-size, non-overlapping windows

In [123]:
def split_based_on_windows(data_seq, data_str, W=17):
    """Cut every protein into consecutive, non-overlapping windows of ``W`` residues.

    The windows start at offsets ``0, W, 2W, ...``; the last window of a
    protein is shorter than ``W`` when the protein length is not a multiple
    of ``W``. Residues and labels are cut at the same offsets so they stay
    aligned.

    Args:
        data_seq: Residue strings, one per protein.
        data_str: Structure-label strings, parallel to ``data_seq``.
        W: Window length; must be at least 1.

    Returns:
        A DataFrame with one row per window and the columns ``sequence`` and
        ``string``.

    Raises:
        ValueError: If ``W`` is smaller than 1, if the two inputs hold a
            different number of proteins, or if a protein's residue and
            label strings differ in length (the windows would be misaligned).
    """
    if W < 1:
        raise ValueError(f"W must be at least 1, got {W}")
    if len(data_seq) != len(data_str):
        raise ValueError(
            f"got {len(data_seq)} sequences but {len(data_str)} structure strings"
        )

    all_sequences = []
    all_strings = []
    for position, (sequence, structure) in enumerate(zip(data_seq, data_str)):
        if len(sequence) != len(structure):
            raise ValueError(
                f"protein {position}: {len(sequence)} residues but "
                f"{len(structure)} structure labels"
            )
        for start in range(0, len(sequence), W):
            all_sequences.append(sequence[start:start + W])
            all_strings.append(structure[start:start + W])

    return pd.DataFrame({"sequence": all_sequences, "string": all_strings})


In [124]:
# Number of residues per window; also used as the time dimension of the model's input_shape.
WINDOW_SIZE = 17

In [125]:
# Cut the proteins of each split into windows of WINDOW_SIZE residues (one DataFrame row per window).
train_df = split_based_on_windows(train_seq, train_str, WINDOW_SIZE)
test_df = split_based_on_windows(test_seq, test_str, WINDOW_SIZE)

In [126]:
# Preview the first few training windows.
train_df.head()

Unnamed: 0,sequence,string
0,GVGTVPMTDYGNDVEYY,ccccccccccccccccc
1,GQVTIGTPGKSFNLNFD,cceecccccceecceee
2,TGSSNLWVGSVQCQASG,cccccceeccccccccc
3,CKGGRDKFNPSDGSTFK,ccccccccccccccccc
4,ATGYDASIGYGDGSASG,cccccccccccccccee


### One-hot encode the data

In [127]:
# Alphabets for the one-hot encoding: a letter's position in the string is its class index.
# 21 residue symbols.
PROTEIN_LETTERS = 'ACDEFGHIKLMNPQRSTVWXY'
# Structure labels; 'c' is what read_protein_file substitutes for '_' in the raw files.
SECONDARY_LETTERS = 'ceh'

In [128]:
def ohe_for_nn(sequences, strings, max_length=None):
    """One-hot encode residue windows and their structure labels.

    Each residue is encoded by its position in ``PROTEIN_LETTERS`` and each
    label by its position in ``SECONDARY_LETTERS``. Windows shorter than
    ``max_length`` are padded at the end with all-zero vectors in both
    arrays, so padding is never encoded as a real residue or label (index 0
    is 'A' / 'c' and must not double as the padding value).

    Args:
        sequences: Iterable of residue strings, one per window.
        strings: Iterable of label strings, parallel to ``sequences``.
        max_length: Size of the time dimension. Defaults to the longest
            window in ``sequences``; pass a fixed value (for example the
            window size the model was built with) so that different data
            splits produce tensors of the same shape.

    Returns:
        ``(X, y)`` float arrays of shape
        ``(n_windows, max_length, len(PROTEIN_LETTERS))`` and
        ``(n_windows, max_length, len(SECONDARY_LETTERS))``.

    Raises:
        ValueError: If the inputs differ in length, a window and its labels
            differ in length, a window is longer than ``max_length``, or a
            symbol is not part of the corresponding alphabet.
    """
    sequences = list(sequences)
    strings = list(strings)
    if len(sequences) != len(strings):
        raise ValueError(
            f"got {len(sequences)} sequences but {len(strings)} label strings"
        )

    if max_length is None:
        # default=0 keeps an empty input from crashing in max().
        max_length = max((len(seq) for seq in sequences), default=0)

    residue_to_index = {letter: idx for idx, letter in enumerate(PROTEIN_LETTERS)}
    label_to_index = {letter: idx for idx, letter in enumerate(SECONDARY_LETTERS)}

    X = np.zeros((len(sequences), max_length, len(PROTEIN_LETTERS)))
    y = np.zeros((len(strings), max_length, len(SECONDARY_LETTERS)))

    for row, (seq, string) in enumerate(zip(sequences, strings)):
        if len(seq) != len(string):
            raise ValueError(
                f"window {row}: {len(seq)} residues but {len(string)} labels"
            )
        if len(seq) > max_length:
            raise ValueError(
                f"window {row} has {len(seq)} residues, more than max_length={max_length}"
            )
        # Positions past len(seq) are left as all-zero padding.
        for col, (residue, label) in enumerate(zip(seq, string)):
            try:
                X[row, col, residue_to_index[residue]] = 1
                y[row, col, label_to_index[label]] = 1
            except KeyError as exc:
                raise ValueError(
                    f"unknown symbol {exc.args[0]!r} at position {col} of window {row}"
                ) from None

    return X, y

In [129]:
# One-hot encode the windows of each split. The time dimension is derived
# separately per call from the longest window in that split.
X_train, y_train = ohe_for_nn(train_df['sequence'], train_df['string'])
X_test, y_test = ohe_for_nn(test_df['sequence'], test_df['string'])

### Create the neural network

In [130]:
# Define the neural network architecture
# Define the neural network architecture: an LSTM that emits one vector per
# window position (return_sequences=True), a stack of Dense layers, and a
# softmax over the structure classes at every position.
model = Sequential([
    LSTM(128, input_shape=(WINDOW_SIZE, len(PROTEIN_LETTERS)), return_sequences=True),
    Dense(64, activation='tanh'),
    Dense(128, activation='linear'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    TimeDistributed(Dense(len(SECONDARY_LETTERS), activation='softmax')) # Apply Dense layer to each time step
])

# The labels are one-hot vectors, hence categorical cross-entropy.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# Train for 100 epochs with validation_split=0.2 of the training windows used for validation.
# NOTE(review): no random seed is set in the visible cells, so results may differ between runs.
model.fit(X_train, y_train, epochs=100, batch_size=32, validation_split=0.2)
# Score the trained model on the test windows.
# NOTE(review): padded positions of short windows are part of X/y, so they
# presumably count towards the reported loss/accuracy - confirm.
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')

# Per-position class probabilities for every test window.
predictions = model.predict(X_test)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

### Convert the predictions back to structure strings

In [131]:
def convert_pred_to_str(predictions, lengths=None):
    """Turn per-position class scores into structure-label strings.

    The class with the highest score along the last axis is picked for every
    position and mapped to its letter (0 -> 'c', 1 -> 'e', 2 -> 'h').

    Args:
        predictions: Array-like of scores whose last axis enumerates the
            classes, e.g. shape ``(n_windows, window_length, 3)``.
        lengths: Optional true length of each window. When given, every
            predicted string is cut to that length so that predictions made
            on padded positions are dropped.

    Returns:
        A list with one string per row of ``predictions``; empty input gives
        an empty list.

    Raises:
        ValueError: If ``lengths`` is given but does not hold one entry per
            predicted string.
        IndexError: If the winning class index has no letter assigned.
    """
    # Position i holds the letter of class i; the order has to match the
    # alphabet the labels were encoded with.
    class_letters = np.array(['c', 'e', 'h'])
    y_pred_classes = np.argmax(predictions, axis=-1)
    # Integer-array indexing maps every class index to its letter at once and,
    # unlike np.vectorize, also works when there are zero rows.
    protein_array = class_letters[y_pred_classes]

    array_strings = ["".join(row) for row in protein_array]
    if lengths is not None:
        if len(lengths) != len(array_strings):
            raise ValueError(
                f"got {len(lengths)} lengths for {len(array_strings)} predictions"
            )
        array_strings = [
            text[:int(length)] for text, length in zip(array_strings, lengths)
        ]
    return array_strings

In [132]:
# One predicted structure string per test window.
result = convert_pred_to_str(predictions)

In [133]:
# Add the predictions as a new column (modifies test_df in place). A predicted
# string covers the full padded window, so it can be longer than `string` for
# a short trailing window.
test_df['prediction'] = result

In [134]:
# Display the test windows next to their true and predicted structure strings.
test_df

Unnamed: 0,sequence,string,prediction
0,ENLKLGFLVKQPEEPWF,cccccceeeccccccch,ccccccccccccchccc
1,QTEWKFADKAGKDLGFE,hhhhhhhhhhhhhcccc,ccceeeechcccccccc
2,VIKIAVPDGEKTLNAID,cceeeccchhhhhhhhh,cececcccccceeehcc
3,SLAASGAKGFVICTPDP,hhhhccccccccccccc,cchhccccccccccccc
4,KLGSAIVAKARGYDMKV,cccchhhhhhhhhcccc,chcchheccccccchhh
...,...,...,...
185,VKLVSWYDNEFGYSERV,eeeecccchhhhhhhhh,cccccccccccceeeec
186,VDLMAHMASKE,hhhhhhhhhhc,cehhhhhhhcccccccc
187,SIPPEVKFNKPFVFLMI,cccceeecccceeeeee,ccccchccccccccccc
188,EQNTKSPLFMGKVVNPT,ecccceeeeeeeecccc,ccccccceeeccccccc
