<a href="https://colab.research.google.com/github/Chrisvanhoorn/BioAI/blob/main/DCNN_keras_tensor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
from google.colab import drive

try:
  drive.mount("/content/drive")
except Exception as error:
  if "EBUSY" in str(error):
    # Drive already mounted
    print("Drive already mounted")
  else:
    # Other error occurred
    raise error

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
!pip install tensorrt
#!pip install memory_profiler
!pip install line_profiler



In [8]:
import sys
!nvcc --version
import tensorflow as tf
import tensorrt as trt
print(trt.__version__)
print(tf.__version__)
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2023 NVIDIA Corporation
Built on Tue_Aug_15_22:02:13_PDT_2023
Cuda compilation tools, release 12.2, V12.2.140
Build cuda_12.2.r12.2/compiler.33191640_0
10.1.0
2.15.0
Found GPU at: /device:GPU:0


In [9]:
import os
import pandas as pd
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.utils.class_weight import compute_class_weight

data_path = '/content/drive/My Drive/ColabNotebooks/random_split'


# combine separate CSVs per folder
def read_data(split, data_folder=data_path):
    data = []
    for filename in os.listdir(os.path.join(data_folder, split)):
        with open(os.path.join(data_folder, split, filename)) as f:
            data.append(pd.read_csv(f, index_col=None))
    return pd.concat(data)


# three split folders
df_train = read_data('train')
df_dev = read_data('dev')
df_test = read_data('test')

def remove_duplicate_sequences(df_train, df_dev, df_test):
    # Removes duplicate sequences across and within all three dataframes.
    # Remove duplicates within each split
    df_train_unique = df_train.drop_duplicates(subset='sequence', keep='first')
    df_dev_unique = df_dev.drop_duplicates(subset='sequence', keep='first')
    df_test_unique = df_test.drop_duplicates(subset='sequence', keep='first')

    # Drop sequences from dev and test that are in train
    df_dev_unique = df_dev_unique[~df_dev_unique['sequence'].isin(df_train_unique['sequence'])]
    df_test_unique = df_test_unique[~df_test_unique['sequence'].isin(df_train_unique['sequence'])]

    # Drop sequences from test that are in dev
    df_test_unique = df_test_unique[~df_test_unique['sequence'].isin(df_dev_unique['sequence'])]

    return df_train_unique, df_dev_unique, df_test_unique

# Remove duplicates across and within all splits
df_train_unique, df_dev_unique, df_test_unique = remove_duplicate_sequences(df_train, df_dev, df_test)

#update dfs
df_train = df_train_unique
df_dev = df_dev_unique
df_test = df_test_unique

# data frame looks like:
df_train.head()

Unnamed: 0,family_id,sequence_name,family_accession,aligned_sequence,sequence
0,MORN_2,Q8EI47_SHEON/428-449,PF07661.13,LHGEFRNQTSSGQLLELI.NFNH,LHGEFRNQTSSGQLLELINFNH
1,Plexin_cytopl,H2TB23_TAKRU/1240-1793,PF08337.12,.MPFLDYKTYTDCNFFLPSKDGAND......AMITRKLQIPE.......,MPFLDYKTYTDCNFFLPSKDGANDAMITRKLQIPEARRAIVAQALN...
2,RT_RNaseH,H3H8E9_PHYRM/405-501,PF17917.1,DYSRRFHVFADAS.GH.QIGGVIVQ........................,DYSRRFHVFADASGHQIGGVIVQGRRILACFSRSMTDTQKKYSTME...
3,Transposase_20,Q981X5_RHILO/224-313,PF02371.16,VEAYQAMRGASFLVAVIFAAEI.GDV.RR.FDTPPQLMAFLGLVPG...,VEAYQAMRGASFLVAVIFAAEIGDVRRFDTPPQLMAFLGLVPGERS...
4,Mycobact_memb,MMPS4_MYCLE/16-154,PF05423.13,LSRIWIPLVILVVLVVGGFVVYRVHSYFASEKRESYADSNLGSSKP...,LSRIWIPLVILVVLVVGGFVVYRVHSYFASEKRESYADSNLGSSKP...


In [5]:
###before memory optimisation
@profile
class ProteinHelper:
    def __init__(self, df_train, df_dev, df_test, batch_size=8, shuffle=True, pad=True, class_weights=None):
        self.df_train = df_train
        self.df_dev = df_dev
        self.df_test = df_test

        # get maximum sequence length
        self.max_seq_len = max(len(str(seq)) for df in [df_train, df_dev, df_test] for seq in df['sequence']) # Cast seq to string

        # vocabulary of amino acids
        self.vocab = sorted(set("".join(str(seq) for seq in df_train['sequence']))) + ['_PAD_'] #vocab is unique AAs (includes X), add padding character

        # create numerical indexes for AAs
        self.char2idx = {char: idx for idx, char in enumerate(self.vocab)} #puts '_PAD_' in 0
        self.idx2char = {idx: char for idx, char in enumerate(self.vocab)}

        # Data Generator Attributes
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Prepare all datasets (training, validation, testing)
        self.prepare_data(pad)

        # Store prepared data and labels separately for training, validation, and testing
        self.X = {"train": self.X_train, "dev": self.X_dev, "test": self.X_test}
        self.y = {"train": self.y_train, "dev": self.y_dev, "test": self.y_test}

        # Set initial indices for shuffling
        self.indices = {"train": np.arange(len(self.X_train))}
        self.on_epoch_end()  # Shuffle data initially

        # class weights
        self.class_weights = class_weights


    def _pad_sequence(self, seq, pad=True):
      if pad:
        seq_list = list(seq)  # Convert string to list, trying-place mod for ram usage

        # Calculate padding amounts
        padding_length = max(self.max_seq_len - len(seq_list), 2 * 5)
        left_padding = padding_length // 2
        right_padding = padding_length - left_padding

        # Insert padding characters (in-place modification)
        seq_list = ['_PAD_'] * left_padding + seq_list + ['_PAD_'] * right_padding

        # Truncate if necessary to ensure max length
        seq_list = seq_list[:self.max_seq_len]

        return ''.join(seq_list)  # Convert back to string
      else:
        return seq
#        # Calculate padding to add on each side, called by prepare_data()
#        padding_length = max(self.max_seq_len - len(seq), 2 * 5) #minimum of 5 padding also on the lognest sequence
#        left_padding = padding_length // 2
#        right_padding = padding_length - left_padding

#        # Add padding to both sides
#        return '_PAD_' * left_padding + seq + '_PAD_' * right_padding



    def prepare_data(self, pad=True):
        # Pad sequences and convert to numerical representations
        self.X_train = np.array([[self.char2idx.get(char, 0) for char in self._pad_sequence(str(seq))] for seq in self.df_train['sequence']], dtype=np.int8) # Handle missing characters, assign index 0 (padding)
        self.X_dev = np.array([[self.char2idx.get(char, 0) for char in self._pad_sequence(str(seq))] for seq in self.df_dev['sequence']], dtype=np.int8) #int8 to reduce ram usage.
        self.X_test = np.array([[self.char2idx.get(char, 0) for char in self._pad_sequence(str(seq))] for seq in self.df_test['sequence']], dtype=np.int8)

        # Convert labels to numerical representations
        label2idx = {label: idx for idx, label in enumerate(set(self.df_train['family_accession']))}
        self.y_train = np.array([label2idx[label] for label in self.df_train['family_accession']])
        self.y_dev = np.array([label2idx[label] for label in self.df_dev['family_accession']])
        self.y_test = np.array([label2idx[label] for label in self.df_test['family_accession']])

    def __len__(self):
        # keras calls to determine batches
        return int(np.ceil(len(self.X_train) / float(self.batch_size)))  # Number of batches in the training set

    def __getitem__(self, index, class_weights=None):
        # keras calls this to fetch each batch of data
        batch_indices = self.indices["train"][index * self.batch_size:(index + 1) * self.batch_size]  # Get indices for the current batch
        batch_x = self.X["train"][batch_indices]  # Get sequences for the current batch
        batch_y = self.y["train"][batch_indices]  # Get labels for the current batch

        if self.class_weights is not None:
            batch_sample_weights = np.array([self.class_weights[label] for label in batch_y])
        else:
            batch_sample_weights = np.ones(len(batch_y))

        return batch_x, batch_y, batch_sample_weights

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices["train"])


In [16]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import Sequence
from memory_profiler import profile  # For memory profiling


class ProteinHelper(Sequence):
    def __init__(self, df_train, df_dev, df_test, batch_size=32, shuffle=True, pad=True, max_seq_len=None, class_weights=None):
        # Initialize datasets views of just sequence and family for memory efficiency
        self.df_train = df_train.loc[:, ['sequence', 'family_accession']]
        self.df_dev = df_dev.loc[:, ['sequence', 'family_accession']]
        self.df_test = df_test.loc[:, ['sequence', 'family_accession']]
        #self.df_test = df_test[['sequence', 'family_accession']].copy(deep=False)

        self.original_index_train = self.df_train.index # Store the original train index
        self.original_index_dev = self.df_dev.index
        self.original_index_test = self.df_test.index

        # get maximum sequence length
        self.max_seq_len = max(len(str(seq)) for df in [df_train, df_dev, df_test] for seq in df['sequence']) # Cast seq to string

        # Vocabulary of amino acids (using list)
        self.vocab = sorted(list("".join(df_train['sequence']))) + ['_PAD_']

        # create numerical indexes for AAs
        self.char2idx = {char: idx for idx, char in enumerate(self.vocab)} #puts '_PAD_' in 0
        self.idx2char = {idx: char for idx, char in enumerate(self.vocab)}

        # Data Generator Attributes
        self.batch_size = batch_size
        self.shuffle = shuffle

        # Prepare all datasets (training, validation, testing)
        self.prepare_data(pad)

        # Store prepared data and labels separately for training, validation, and testing
        self.X = {"train": self.X_train, "dev": self.X_dev, "test": self.X_test}
        self.y = {"train": self.y_train, "dev": self.y_dev, "test": self.y_test}

        # Set initial indices for shuffling
        self.indices = {"train": np.arange(len(self.X_train))}
        self.on_epoch_end()  # Shuffle data initially

        # class weights
        self.class_weights = class_weights

    def _pad_sequence(self, seq, pad=True):
      if pad:
        seq_list = list(seq)  # Convert string to list, trying-place mod for ram usage

        # Calculate padding amounts
        padding_length = max(self.max_seq_len - len(seq_list), 2 * 5)
        left_padding = padding_length // 2
        right_padding = padding_length - left_padding

        # Insert padding characters (in-place modification)
        seq_list = ['_PAD_'] * left_padding + seq_list + ['_PAD_'] * right_padding

        # Truncate if necessary to ensure max length
        seq_list = seq_list[:self.max_seq_len]

        return ''.join(seq_list)  # Convert back to string
      else:
        return seq


    def prepare_data(self, pad=True):
        # Pad sequences and convert to numerical representations
        self.X_train = np.array([[self.char2idx.get(char, 0) for char in self._pad_sequence(seq, pad)]
                                 for seq in self.df_train['sequence']], dtype=np.int8)
        self.X_dev = np.array([[self.char2idx.get(char, 0) for char in self._pad_sequence(seq, pad)]
                               for seq in self.df_dev['sequence']], dtype=np.int8)
        self.X_test = np.array([[self.char2idx.get(char, 0) for char in self._pad_sequence(seq, pad)]
                               for seq in self.df_test['sequence']], dtype=np.int8)

        # Convert labels to numerical representations
        label2idx = {label: idx for idx, label in enumerate(set(self.df_train['family_accession']))}
        self.y_train = np.array([label2idx[label] for label in self.df_train['family_accession']], dtype=np.int8) # Smaller datatype
        self.y_dev = np.array([label2idx[label] for label in self.df_dev['family_accession']], dtype=np.int8)
        self.y_test = np.array([label2idx[label] for label in self.df_test['family_accession']], dtype=np.int8)

    def __len__(self):
        # keras calls to determine batches
        return int(np.ceil(len(self.X_train) / float(self.batch_size)))  # Number of batches in the training set

    def __getitem__(self, index, class_weights=None):
        # keras calls this to fetch each batch of data
        batch_indices = self.indices["train"][index * self.batch_size:(index + 1) * self.batch_size]  # Get indices for the current batch
        batch_x = self.X["train"][batch_indices]  # Get sequences for the current batch
        batch_y = self.y["train"][batch_indices]  # Get labels for the current batch

        if self.class_weights is not None:
            batch_sample_weights = np.array([self.class_weights[label] for label in batch_y])
        else:
            batch_sample_weights = np.ones(len(batch_y))

        return batch_x, batch_y, batch_sample_weights

    def on_epoch_end(self):
        if self.shuffle:
            np.random.shuffle(self.indices["train"])

In [11]:
def embedding_layer(self, vocab_size, embedding_dim):
        # embedding layer that learns protein sequence representations
        embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim,
                                    mask_zero=True,  # Mask padding tokens
                                    name="embedding")
        return embedding

def build_model(vocab_size, embedding_dim, num_classes):
    model = keras.Sequential(name="Protein_Dilated_CNN")

    # Embedding Layer (learns protein representations)
    model.add(ProteinHelper.embedding_layer(vocab_size, embedding_dim))

    # Early average pooling for sequence length.

    # Convolutional Block 1
    model.add(layers.Conv1D(filters=128, kernel_size=5, padding='same', dilation_rate=1, activation='relu', name="conv1d_1")) #this is a large filter number, not every AA relevant
    model.add(layers.BatchNormalization(name="batchnorm_1"))
    #no maxpool layer, emphasise local information, for short sequences.

    # Convolutional Block 2
    model.add(layers.Conv1D(filters=256, kernel_size=5, padding='same', dilation_rate=2, activation='relu', name="conv1d_2"))
    model.add(layers.BatchNormalization(name="batchnorm_2"))
    model.add(layers.MaxPooling1D(pool_size=2, name="maxpool_2"))

    # Convolutional Block 3
    model.add(layers.Conv1D(filters=512, kernel_size=5, padding='same', dilation_rate=4, activation='relu', name="conv1d_4"))
    model.add(layers.BatchNormalization(name="batchnorm_3"))
    model.add(layers.MaxPooling1D(pool_size=2, name="maxpool_3"))

    # Convolutional Block 4
    model.add(layers.Conv1D(filters=1024, kernel_size=5, padding='same', dilation_rate=8, activation='relu', name="conv1d_4"))
    model.add(layers.BatchNormalization(name="batchnorm_4"))
    model.add(layers.MaxPooling1D(pool_size=2, name="maxpool_4"))

    # Convolutional Block 5
    model.add(layers.Conv1D(filters=2048, kernel_size=5, padding='same', dilation_rate=16, activation='relu', name="conv1d_5"))
    model.add(layers.BatchNormalization(name="batchnorm_5"))
    model.add(layers.GlobalMaxPooling1D(name="globalmaxpool")) #consider using GlobalAveragePooling1D to reduce weight of length.
    #model.add(layers.GlobalAveragePooling1D(name="globalavgpool")) #consider using GlobalAveragePooling1D to reduce weight of length.


    #ResidualConnection
    #shortcut = layers.Conv1D(filters, 1, padding='same')(input_layer)  # Shortcut connection

    #x = layers.Add()([x, shortcut])  # Add input and output
    #x = layers.Activation('relu')(x)

    # Attention layer 1 # because of long sequences
    #model.add(BahdanauAttention(units=128))

    # Fully Connected Layer
    model.add(layers.Dropout(0.5, name="dropout")) #perhaps L1 (encourages sparcity) or L2 (encourages smaller weights) prevent overfitting especially with large number of filters
    model.add(layers.Dense(units=1024, activation='relu', name="dense_1"))
    model.add(layers.Dropout(0.5, name="dropout_2"))
    model.add(layers.Dense(units=num_classes, activation='softmax', name="dense_2"))

    print(model.summary())
    return model

In [12]:
def calculate_class_weights(y_train):
    class_labels = np.unique(y_train)
    class_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train)

    # Convert to dictionary
    class_weights_dict = dict(enumerate(class_weights))

    # Normalize
    total_weight = sum(class_weights_dict.values())
    class_weights_dict = {label: weight / total_weight for label, weight in class_weights_dict.items()}

    return class_weights_dict

In [None]:
from line_profiler import LineProfiler
lp = LineProfiler()

# Embedding Dimension
embedding_dim = 128  # Hyperparameter to optimize


# Decorate the method(s) you want to profile
lp_wrapper = lp(ProteinHelper.prepare_data)  # Wrap the method

# Create an instance of ProteinHelper and prepare data (this will trigger profiling)
protein_helper = ProteinHelper(df_train, df_dev, df_test, batch_size=4, shuffle=True, pad=True) #removed padding because of computation, but got an error
lp_wrapper(protein_helper, pad=True)

lp.print_stats()  # Print the line-by-line profiling results





In [2]:
from line_profiler import LineProfiler
lp = LineProfiler()
lp_wrapper(protein_helper, pad=True)

lp.print_stats()

NameError: name 'lp_wrapper' is not defined

In [None]:
os.environ['XLA_PYTHON_CLIENT_ALLOCATOR'] = 'platform'
os.environ['XLA_PYTHON_CLIENT_PREALLOCATE'] = 'false'

In [None]:

# Embedding Dimension
embedding_dim = 128  # Hyperparameter to optimize

# Initialize the ProteinHelper
protein_helper = ProteinHelper(df_train, df_dev, df_test, batch_size=8, shuffle=True, pad=False) #removed padding because of computation


In [None]:
# Calculate class weights
class_weights_dict = calculate_class_weights(protein_helper.y_train)


In [None]:
# Vocabulary Size and Number of Classes
vocab_size = len(protein_helper.vocab)
num_classes = len(set(protein_helper.df_train['family_accession']))
print("Vocab Size:", vocab_size)
print("Num Classes:", num_classes)



In [None]:
# Build the Model
model = build_model(vocab_size, embedding_dim, num_classes)


In [None]:
# Compile model (with weighted metrics)
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"], weighted_metrics=['accuracy'])


In [None]:
# Train model
model.fit(protein_helper, epochs=10, validation_data=(protein_helper.X["dev"], protein_helper.y["dev"]),
          class_weight=class_weights_dict, callbacks=[tensorboard_callback, early_stopping, model_checkpoint])


In [None]:
# Evaluate the Model
loss, accuracy = model.evaluate(protein_helper.test_dataset)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)

In [None]:
# Create a KerasClassifier wrapper
model = KerasClassifier(model=build_model, epochs=10, batch_size=32, verbose=0)

# Perform cross-validation
scores = cross_val_score(model, protein_helper.X_train, protein_helper.y_train, cv=5)  # 5-fold CV

print("Cross-validation scores:", scores)
print("Average score:", np.mean(scores))