In [372]:
import pandas as pd
pd.set_option('future.no_silent_downcasting', True)

In [373]:
def read_protein_file(file_path):
    with open(file_path, "r") as file:
        # Read the entire contents of the file
        seq = ""
        str = ""

        sequences = []
        strings = []
        count = 0
        for line in file:
            if count < 8:
                pass
            count += 1

            line = line.strip()
            if line == "<>":
                seq = ""
                str = ""
            elif line == "<end>" or line=="end":
                sequences.append(seq)
                strings.append(str)
            else:
                letters = line.split(" ")
                if len(letters) == 2:
                    seq += letters[0]
                    str += letters[1] if letters[1] != "_" else "c"
        return sequences, strings


In [374]:
train_seq, train_str = read_protein_file("data/protein-secondary-structure.train")
test_seq, test_str = read_protein_file("data/protein-secondary-structure.test")

Get all the test lengths

In [375]:
protein_lengths = []
for i in test_seq:
    protein_lengths.append(len(i))
protein_lengths

[306, 108, 113, 322, 62, 212, 281, 218, 198, 107, 461, 149, 220, 334, 35]

Create sliding windows

Note. we need to split the sequences up, but since our test input is of different lengths, what do we do then?

When your test data contains protein sequences of different lengths compared to your training data, you need to handle this discrepancy appropriately. Here are a few strategies you can consider:

Padding:

Pad shorter sequences in the test data to match the length of the longest sequence in your training data.
You can pad the sequences with a special token (e.g., all zeros) so that the neural network can recognize them as padding.
During inference, you would need to trim the predictions to the original length of the test sequence.
Dynamic Input Shape:

Modify your neural network architecture to accept variable-length input sequences.
Use techniques like masking to handle variable-length sequences effectively.
Many deep learning frameworks, such as TensorFlow and PyTorch, support dynamic input shapes and masking.
Bucketing or Binning:

Group sequences of similar lengths together and pad each group separately to the maximum length within that group.
This approach reduces the amount of padding required, leading to more efficient training.
You can organize your test data into buckets or bins based on sequence lengths and process each bucket separately during evaluation.
Batch Processing:

While processing batches during testing, group sequences of similar lengths together.
This allows you to minimize padding within each batch, improving computational efficiency.

One hot encoding the sequence

Encoding Amino Acids:

First, you need to encode each amino acid into a numerical representation.
Common encoding methods include one-hot encoding and embedding.
In one-hot encoding, each amino acid is represented as a binary vector where only one element is 1 (indicating the presence of that amino acid).
In embedding, each amino acid is mapped to a low-dimensional vector space.
Padding:

After encoding amino acids, you can proceed with padding sequences as discussed earlier.
Pad the sequences to match the length of the longest sequence in your training data.

PERFORM NEURAL NETWORK??? use RNN or CNN (keras, tensorflow, pytorch)

https://www.pnas.org/doi/epdf/10.1073/pnas.86.1.152

sliding window = 17

In [376]:
def split_based_on_windows(data_seq, data_str, W=17):
    all_sequences = []
    all_strings = []
    print([len(i) for i in data_seq])
    for i in range(len(data_seq)):
        sequences = [data_seq[i][j:j+W] for j in range(0, len(data_seq[i]), W)]
        strings = [data_str[i][j:j+W] for j in range(0, len(data_str[i]), W)]
        # PADDING
        if len(sequences[-1]) != W:
            sequences[-1] = sequences[-1] + "X"*((W-len(sequences[-1])))
        if len(strings[-1]) != W:
            strings[-1] = strings[-1] + "X"*((W-len(strings[-1])))

        all_sequences += sequences
        all_strings += strings

    train_df = pd.DataFrame({"sequence": all_sequences, "string": all_strings})
    return train_df


In [377]:
train_df = split_based_on_windows(train_seq, train_str)
test_df = split_based_on_windows(test_seq, test_str)

[324, 129, 125, 123, 256, 83, 111, 108, 46, 71, 118, 103, 136, 240, 207, 146, 54, 147, 29, 174, 70, 67, 149, 145, 85, 239, 30, 329, 130, 164, 129, 153, 153, 26, 124, 111, 36, 107, 293, 124, 65, 56, 247, 194, 318, 323, 85, 256, 127, 293, 146, 106, 95, 87, 75, 57, 153, 222, 325, 61, 114, 114, 181, 141, 151, 107, 184, 478, 207, 112, 237, 98, 146, 99, 415, 230, 224, 50, 316, 82, 437, 159, 138, 222, 153, 307, 333, 58, 54, 374, 498]
[306, 108, 113, 322, 62, 212, 281, 218, 198, 107, 461, 149, 220, 334, 35]


In [378]:
train_df.shape

(934, 2)

In [379]:
test_df.shape

(190, 2)

In [380]:
test_df.tail()

Unnamed: 0,sequence,string
185,VKLVSWYDNEFGYSERV,eeeecccchhhhhhhhh
186,VDLMAHMASKEXXXXXX,hhhhhhhhhhcXXXXXX
187,SIPPEVKFNKPFVFLMI,cccceeecccceeeeee
188,EQNTKSPLFMGKVVNPT,ecccceeeeeeeecccc
189,QXXXXXXXXXXXXXXXX,cXXXXXXXXXXXXXXXX


#### NEURAL NETWORK

In [381]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models

In [382]:
by_pos_train_df = train_df['sequence'].apply(lambda x:pd.Series(list(x)))

In [383]:
by_pos_train_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,G,V,G,T,V,P,M,T,D,Y,G,N,D,V,E,Y,Y
1,G,Q,V,T,I,G,T,P,G,K,S,F,N,L,N,F,D
2,T,G,S,S,N,L,W,V,G,S,V,Q,C,Q,A,S,G
3,C,K,G,G,R,D,K,F,N,P,S,D,G,S,T,F,K
4,A,T,G,Y,D,A,S,I,G,Y,G,D,G,S,A,S,G
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
929,V,Q,R,F,N,S,A,N,D,D,N,V,T,Q,V,R,T
930,F,Y,L,K,V,L,N,E,E,Q,R,K,R,L,C,E,N
931,I,A,G,H,L,K,D,A,Q,L,F,I,Q,K,K,A,V
932,K,N,F,S,D,V,H,P,E,Y,G,S,R,I,Q,A,L


In [384]:
letters = ["A", "C", "D", "E", "F", "G", "H", "I", "K", "L", "M", "N", "P", "Q", "R", "S", "T", "V", "W", "X", "Y"]

In [385]:
oh_train_df_X = pd.get_dummies(by_pos_train_df)

In [386]:
all_columns = []
for i in letters:
    columns = [str(j)+"_"+i for j in range(0, 17)]
    all_columns += columns

In [387]:
template = pd.DataFrame(columns=all_columns, dtype=bool)

In [388]:
unique_columns = template.columns.difference(oh_train_df_X.columns)

In [389]:
one_hot_encoded_full = pd.concat([oh_train_df_X, template[unique_columns]], axis=1)
ohe_train_df_X = one_hot_encoded_full.reindex(sorted(one_hot_encoded_full.columns), axis=1)

In [390]:
by_pos_train_df_str = train_df['string'].apply(lambda x:pd.Series(list(x)))

In [391]:
oh_train_df_Y = pd.get_dummies(by_pos_train_df_str)

In [392]:
letters = ['c', 'e', 'h', 'X']

In [393]:
all_columns = []
for i in letters:
    columns = [str(j)+"_"+i for j in range(0, 17)]
    all_columns += columns

In [394]:
template = pd.DataFrame(columns=all_columns, dtype=bool).fillna(False)

In [395]:
unique_columns = template.columns.difference(oh_train_df_Y.columns)

In [396]:
one_hot_encoded_full = pd.concat([oh_train_df_Y, template[unique_columns]], axis=1)
ohe_train_df_Y = one_hot_encoded_full.reindex(sorted(one_hot_encoded_full.columns), axis=1)

### MAKE MATRIX OUT OF IT?

In [397]:
ohe_train_df_X.values

array([[False, False, False, ..., False, False, True],
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, False],
       ...,
       [False, False, False, ..., False, False, False],
       [False, False, False, ..., False, False, True],
       [False, False, False, ..., False, True, False]], dtype=object)

In [398]:
ohe_train_df_X = ohe_train_df_X.fillna(False)
ohe_train_df_Y = ohe_train_df_Y.fillna(False)

In [399]:
train_matrix_X = ohe_train_df_X.values
train_matrix_Y = ohe_train_df_Y.values

In [401]:
train_matrix_X.shape

(934, 357)

In [402]:
train_matrix_Y.shape

(934, 68)

#### SET UP NEURAL NETWORK

In [None]:
# Generate some dummy data (replace with your actual data)
# X_train: Input sequences, y_train: Corresponding labels
X_train = np.random.rand(100, 20, 20)  # Example: 100 sequences of length 20 with 20 features
y_train = np.random.randint(3, size=(100, 20))  # Example: 3 secondary structure classes

# Define the RNN model
model_rnn = models.Sequential([
    layers.SimpleRNN(64, return_sequences=True, input_shape=(20, 20)),
    layers.Dense(3, activation='softmax')
])

# Compile the model
model_rnn.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])

# Train the model
model_rnn.fit(X_train, y_train, epochs=10, batch_size=32)


In [400]:
# # Generate some dummy data (replace with your actual data)
# # X_train: Input sequences, y_train: Corresponding labels
# X_train = np.random.rand(100, 20, 20)  # Example: 100 sequences of length 20 with 20 features
# y_train = np.random.randint(3, size=(100, 20))  # Example: 3 secondary structure classes

# # Define the RNN model
# model_rnn = models.Sequential([
#     layers.SimpleRNN(64, return_sequences=True, input_shape=(20, 20)),
#     layers.Dense(3, activation='softmax')
# ])

# # Compile the model
# model_rnn.compile(optimizer='adam',
#                   loss='sparse_categorical_crossentropy',
#                   metrics=['accuracy'])

# # Train the model
# model_rnn.fit(X_train, y_train, epochs=10, batch_size=32)


Epoch 1/10


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.src.callbacks.History at 0x265b2629be0>