# Import

In [2]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Keras and tensorflow
import tensorflow as tf
from tensorflow import keras

# Build RNN
from tensorflow.keras.models import Sequential
from tensorflow.keras import Input
from tensorflow.keras.layers import Dense, Dropout, LSTM, BatchNormalization, Bidirectional, TimeDistributed


from tensorflow.keras.layers import LeakyReLU
from sklearn.preprocessing import MinMaxScaler


# creating PDB
import csv
import re

%matplotlib inline


# Data normalisation and train/test split

In [3]:
# get PDB atom postions for all frames 
DATA_FILE = 'CAM_extracted_pdb_trajectory.npy'
Coordinates_array = np.load(DATA_FILE)
Coordinates_array = Coordinates_array.astype(float)

# Number of Atoms
ATOM_NUM = 8076


# change to pandas DataFrame and remove the time stamp columns
df_coo = pd.DataFrame(Coordinates_array)
df_coo_only = df_coo.iloc[:, 1:]

# Change back to numpy array that only has
cleaned_coo = df_coo_only.to_numpy()


# Use MinMax normalisation on the dataset
scaler = MinMaxScaler()
Coordinates_array_scalled = scaler.fit_transform(cleaned_coo)


##              Split data to train validate and test in a 50%, 25%, 25% split

In [4]:
# Remove the Testing sample

# size of training sample
train_len = int(Coordinates_array_scalled.shape[0]*0.6)

test_len = Coordinates_array_scalled.shape[0] - train_len

end_val = train_len

tv_Coordinates_array_scalled = Coordinates_array_scalled[:end_val]
test_Coordinates_array_scalled = Coordinates_array_scalled[end_val:]

print(f'Shape of Train and Val data {tv_Coordinates_array_scalled.shape}')
print(f'Shape of Testing data {test_Coordinates_array_scalled.shape}')

Shape of Train and Val data (3000, 8076)
Shape of Testing data (2000, 8076)


# AutoEncoder build and data encoding 

In [5]:
def AutoEncode_layer_build(lst,act_func='lr'):
    """ 
    builds an Autoencoder along with both the
    decode and encode 
    """
    # activation function where the defualt is 
    # leaky relu
    if act_func == 'lr':
        AF = keras.layers.LeakyReLU(alpha=0.3)

    # user set activation function
    else:
        AF = act_func

    # build the encoder layers
    # using the denfined list (lst)
    for i, layer in enumerate(lst):

        if i == 0: # first layer (input layer)
            encoder_input = keras.Input((layer,), name='Encoder_input')

        elif i ==1: # second layer 
            layers = keras.layers.Dense(
                            layer, activation=AF, name=f'layer_{i}',
                            kernel_initializer='normal')(encoder_input)

            layers = keras.layers.BatchNormalization(name=f'Batch_Norm_{i}')(layers)


        elif i ==(len(lst)-1): # the bottleneck layer

            encoder_output = keras.layers.Dense(
                            layer, activation=AF, name=f'layer_{i}',
                            kernel_initializer='normal')(layers)      

        else: #other layers between second layer and final layer
            layers = keras.layers.Dense(
                            layer, activation=AF, name=f'layer_{i}',
                            kernel_initializer='normal')(layers)
            
            layers = keras.layers.BatchNormalization(name=f'Batch_Norm_{i}')(layers)

    # build the encoder 
    encoder = keras.Model(encoder_input, encoder_output)

    # build the decoder layer
    for i, layer in enumerate(lst[::-1]):
        if i == 0:
            continue
        elif i == 1:
            de_layer_input = keras.layers.Dense(layer, activation=AF,
                                name=f'de_layer_{i}',
                                kernel_initializer='normal')(encoder_output)

            de_layer_input = keras.layers.BatchNormalization(name=f'Batch_Norm_d_{i}')(de_layer_input)

        elif i ==2:
            de_layer = keras.layers.Dense(
                       layer, activation=AF, name=f'de_layer_{i}', 
                       kernel_initializer='normal')(de_layer_input)

            de_layer = keras.layers.BatchNormalization(name=f'Batch_Norm_d_{i}')(de_layer)
        
        elif i == (len(lst)-1):
            decode_output = keras.layers.Dense(layer, activation=AF, name=f'de_layer_{i}',
                                          kernel_initializer='normal')(de_layer)

        else:
            de_layer = keras.layers.Dense(layer, activation=AF, name=f'de_layer_{i}',
                                          kernel_initializer='normal')(de_layer)

            de_layer = keras.layers.BatchNormalization(name=f'Batch_Norm_d_{i}')(de_layer)

    
    AE = keras.Model(encoder_input, decode_output)
    # get the number of layer in Decoder
    loop = len(lst) + (len(lst)-2)
    print(loop)
    # Build decoder
    for i in range(loop):
        if i == 0:
            dec_input = keras.Input((lst[-1],), name='Encded_input')
        
        elif i == 1:
            decco = AE.layers[-(loop-i)](dec_input)
        
        else:
            decco = AE.layers[-(loop-i)](decco)
         


    decoder = keras.Model(dec_input, decco, name='Decoder_lr')

    return encoder,decoder, AE

### Build the AutoEncoder

In [6]:
# Shape of AE layers
lst2 = [8076, 519, 260, 120, 8, 1]
print(lst2)

# define the AutoEncoder layers
encoder_1, decoder_1, AE_1 = AutoEncode_layer_build(lst2) 
AE_1.summary()

[8076, 519, 260, 120, 8, 1]
10
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Encoder_input (InputLayer)   [(None, 8076)]            0         
_________________________________________________________________
layer_1 (Dense)              (None, 519)               4191963   
_________________________________________________________________
Batch_Norm_1 (BatchNormaliza (None, 519)               2076      
_________________________________________________________________
layer_2 (Dense)              (None, 260)               135200    
_________________________________________________________________
Batch_Norm_2 (BatchNormaliza (None, 260)               1040      
_________________________________________________________________
layer_3 (Dense)              (None, 120)               31320     
_________________________________________________________________
Batch_Norm_3 (BatchNormaliza

In [7]:
# Compile the Autoencoder
opt = keras.optimizers.Adam(lr=0.001, decay=1e-6)

AE_1.compile(optimizer=opt, loss='mse')


history = AE_1.fit(tv_Coordinates_array_scalled, tv_Coordinates_array_scalled,
                   epochs=10, batch_size=8, validation_split=0.25, verbose=0)

# Encode the entire data

# Train data
encoded_train = []
for i in range(tv_Coordinates_array_scalled.shape[0]):
    encoded_coordinate = list(encoder_1.predict(tv_Coordinates_array_scalled[i].reshape((-1, ATOM_NUM)))[0])
    encoded_train.append(encoded_coordinate)


# test data
encoded_test = []
for i in range(test_Coordinates_array_scalled.shape[0]):
    encoded_coordinate = list(encoder_1.predict(
        test_Coordinates_array_scalled[i].reshape((-1, 8076)))[0])
    encoded_test.append(encoded_coordinate)


### Create the Target for the RNN model

In [8]:
# convert both the test and training datasets to DataFrames
df_encoded_train = pd.DataFrame(encoded_train)
df_encoded_test = pd.DataFrame(encoded_test)

# create a 1000 frame by 1000 frame
df_train_features = df_encoded_train.iloc[:1000]
df_train_target = df_encoded_train.iloc[1000:2000]

# create target for val data 
df_val_features = df_encoded_train.iloc[2000:2500]
df_val_target = df_encoded_train.iloc[2500:3000]

# target data for test data
df_test_feature = df_encoded_test[:1000]
df_test_target = df_encoded_test[1000:]

### Normalise data again with MinMax

In [14]:
scaler_2 = MinMaxScaler()

# scale training dataset
train_features_saclled = scaler_2.fit_transform(df_train_features.to_numpy())
train_target_scalled = scaler_2.fit_transform(df_train_target.to_numpy())

# scale valadation dataset
val_features_scalled = scaler_2.fit_transform(df_val_features.to_numpy())
val_target_scalled = scaler_2.fit_transform(df_val_target.to_numpy())

# scale testing dataset
test_feature_scalled = scaler_2.fit_transform(df_test_feature.to_numpy())
test_target_scalled = scaler_2.fit_transform(df_test_target.to_numpy())


# prepare the Dataset for time series ML
# by reshaping the dataset 


# # train
# train_features_saclled = train_features_saclled.reshape(10,100,1)
# train_target_scalled = train_target_scalled.reshape(10,100,1)

# # val
# val_features_scalled = val_features_scalled.reshape(5,100,1)
# val_target_scalled = val_target_scalled.reshape(5,100,1)

# # test
# test_feature_scalled = test_feature_scalled.reshape(10,100,1)
# test_target_scalled = test_target_scalled.reshape(10,100,1)


# train
train_features_saclled = train_features_saclled.reshape(1, 1000,1)
train_target_scalled = train_target_scalled.reshape(1, 1000,1)

# val
val_features_scalled = val_features_scalled.reshape(1, 500, 1)
val_target_scalled = val_target_scalled.reshape(1, 500, 1)

# test
test_feature_scalled = test_feature_scalled.reshape(1, 1000, 1)
test_target_scalled = test_target_scalled.reshape(1, 1000, 1)


In [20]:
train_features_saclled.shape[2]


1

In [21]:
del Coordinates_array
del Coordinates_array_scalled
del df_test_feature
del df_test_target
del df_train_features
del df_train_target
del test_Coordinates_array_scalled
del tv_Coordinates_array_scalled


# Build and train RNN

In [19]:
model_RNN = Sequential()

# create input layer
model_RNN.add(Input(shape=(train_features_saclled.shape[1],train_features_saclled.shape[2]), name='Input_layer'))

model_RNN.add(Bidirectional(LSTM(128, activation='relu',return_sequences=True,),name='Hidden_LSTM_Layer_1'))
model_RNN.add(Dropout(0.2))

model_RNN.add(Bidirectional(LSTM(128, activation='relu',return_sequences=True,),name='Hidden_LSTM_Layer_2'))
model_RNN.add(Dropout(0.2))

model_RNN.add(TimeDistributed(Dense(units=1, activation='linear'), name='Output_Layer')) 
model_RNN.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Hidden_LSTM_Layer_1 (Bidirec (None, 1000, 256)         133120    
_________________________________________________________________
dropout_2 (Dropout)          (None, 1000, 256)         0         
_________________________________________________________________
Hidden_LSTM_Layer_2 (Bidirec (None, 1000, 256)         394240    
_________________________________________________________________
dropout_3 (Dropout)          (None, 1000, 256)         0         
_________________________________________________________________
Output_Layer (TimeDistribute (None, 1000, 1)           257       
Total params: 527,617
Trainable params: 527,617
Non-trainable params: 0
_________________________________________________________________


### Train RNN Model

In [None]:
# Compile the RNN
opt = keras.optimizers.Adam(lr=0.001, decay=1e-6)

model_RNN.compile(optimizer=opt, loss='mse', metrics='mse')

model_RNN.fit(train_features_saclled, train_target_scalled,
              batch_size=1,
              epochs=1000,
              validation_data=(val_features_arr_reshaped,
                               val_targets_arr))


# Test model and export PDB for PyMol analysis