In [1]:
from warnings import filterwarnings
filterwarnings('ignore')

!pip install -q -U biopython
from Bio import SeqIO

import numpy as np
import pandas as pd

from matplotlib import pyplot as plt

from tqdm import tqdm
tqdm.pandas()

import h5py

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split

In [2]:
from keras.models import load_model
from keras.models import Model
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
from keras.layers import Input, Dense, Dropout, Flatten, Activation
from keras.layers import Conv1D, Add, MaxPooling1D, BatchNormalization

In [3]:
class ProteinsOneHotEncoder:
    """One-hot encode protein sequences read from a FASTA file at a fixed length.

    Pipeline (driven by ``encode_data``):
      1. read up to ``max_sequences`` records from ``fasta_file``;
      2. clip every sequence to ``SEQ_LENGTH`` residues;
      3. drop sequences shorter than ``MIN_LENGTH`` residues;
      4. cyclically repeat the survivors until they are exactly ``SEQ_LENGTH`` long;
      5. one-hot encode each residue over ``possible_aa``.

    Attributes:
        possible_aa: residue alphabet; its order fixes the one-hot column order.
        fasta_file: path of the FASTA file to parse (may be None until parsing is requested).
        max_sequences: cap on the number of records / ids used; 0 means "no cap".
        ids: array loaded from ``id_file`` (truncated to ``max_sequences``). It is
            stored for callers; no method of this class reads it.
        retained_proteins: list of bools, one per parsed record, True where the
            record survived the length filter. Filled by
            ``_clip_and_elongate_sequences``; empty before that.
    """

    # Fixed length (in residues) of every encoded sequence.
    SEQ_LENGTH = 750
    # Sequences shorter than this (after clipping) are discarded.
    MIN_LENGTH = 150

    def __init__(self, id_file, fasta_file= None, max_sequences=0):
        """Load the id array and remember where the sequences live.

        Args:
            id_file: path handed to ``np.load``.
            fasta_file: path of the FASTA file; only needed once sequences are parsed.
            max_sequences: keep at most this many ids / records; 0 keeps all.
        """
        self.possible_aa = ['M', 'N', 'S', 'V', 'T', 'H', 'A', 'P', 'Y', 'I', 'D', 'W', 'E','Q', 'L', 'F', 'R', 'K', 'C', 'G', 'U', 'O', 'X', 'B', 'Z']
        self.fasta_file = fasta_file
        self.max_sequences = max_sequences
        self.ids = np.load(id_file)
        if self.max_sequences:
            self.ids = self.ids[:self.max_sequences]
        self.retained_proteins = []

    def _fasta_to_dataframe(self):
        """Parse the FASTA file into a DataFrame with columns ``Header`` and ``Sequence``.

        Stops after ``max_sequences`` records when that cap is non-zero.

        Raises:
            ValueError: if no ``fasta_file`` was given.
        """
        if self.fasta_file is None:
            raise ValueError("fasta_file not given")

        records = SeqIO.parse(self.fasta_file, "fasta")
        data = []
        for record in tqdm(records, desc='Converting to DataFrame', unit=' Records'):
            if self.max_sequences and len(data) >= self.max_sequences:
                break
            data.append([record.id, str(record.seq)])

        return pd.DataFrame(data, columns=["Header", "Sequence"])

    @staticmethod
    def clip(seq : str, length=SEQ_LENGTH):
        """Return ``seq`` truncated to at most ``length`` residues."""
        # Slicing already leaves shorter sequences untouched, and it honours
        # ``length`` instead of comparing against a hard-coded 750.
        return seq[:length]

    @staticmethod
    def elongate(seq : str, length=SEQ_LENGTH):
        """Cyclically repeat ``seq`` so the result is exactly ``length`` residues long.

        Works for any non-empty sequence: short ones are tiled as often as needed,
        and a sequence longer than ``length`` is cut down to ``length``.

        Raises:
            ValueError: if ``seq`` is empty (there is nothing to repeat).
        """
        n = len(seq)
        if n == 0:
            raise ValueError("cannot elongate an empty sequence")

        # length == full_repeats * n + remainder, so the pieces add up exactly.
        full_repeats, remainder = divmod(length, n)
        return seq * full_repeats + seq[:remainder]

    def _clip_and_elongate_sequences(self):
        """Return the parsed records with a fixed-length ``Seq_clip`` column.

        Side effect: sets ``self.retained_proteins`` to the per-record keep mask
        (list of bools aligned with the parsed records).
        """
        df = self._fasta_to_dataframe()
        df['Seq_clip'] = df['Sequence'].progress_apply(self.clip)

        keep = df['Seq_clip'].map(len) >= self.MIN_LENGTH
        self.retained_proteins = keep.tolist()

        # Index with the boolean Series (a plain empty list would select columns)
        # and copy so the assignment below does not write into a view.
        df = df.loc[keep].copy()
        df['Seq_clip'] = df['Seq_clip'].progress_apply(self.elongate)
        return df

    def encode_data(self):
        """One-hot encode all retained sequences.

        Returns:
            ``np.int8`` array of shape
            ``(n_retained, SEQ_LENGTH, len(possible_aa))``.

        Raises:
            ValueError: from the encoder when a sequence contains a residue that
                is not in ``possible_aa`` (scikit-learn's default
                ``handle_unknown='error'``).
        """
        df = self._clip_and_elongate_sequences()

        # The categories are fixed, so fit once instead of refitting per sequence.
        encoder = OneHotEncoder(sparse_output=False, categories=[self.possible_aa])
        encoder.fit(np.array(self.possible_aa).reshape(-1, 1))

        sequences = df['Seq_clip'].values
        # Preallocate the result so the data is not held twice (list + stacked copy).
        encoded_array = np.zeros(
            (len(sequences), self.SEQ_LENGTH, len(self.possible_aa)), dtype=np.int8
        )
        for row, sequence in enumerate(
            tqdm(sequences, desc='Encoding Sequences', unit=' Encodings')
        ):
            letters = np.array(list(sequence)).reshape(-1, 1)
            encoded_array[row] = encoder.transform(letters).astype(np.int8)

        return encoded_array

In [4]:
max_seq = 100000 #@param {type:"integer"}

# Encode the (optionally capped) training sequences.
train_encoder = ProteinsOneHotEncoder(
    id_file='/kaggle/input/t5embeds/train_ids.npy',
    fasta_file='/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta',
    max_sequences=max_seq,
)
train_encodings = train_encoder.encode_data()

with h5py.File('/kaggle/input/train-labels-cafa5/train_labels.h5', 'r') as hf:
    train_labels = hf['labels'][:]

# Apply the same cap as the encoder (0 means "no cap"), then keep only the rows
# of proteins that survived the encoder's length filter.
label_rows = slice(None, max_seq) if max_seq else slice(None)
train_labels = train_labels[label_rows][train_encoder.retained_proteins]

In [5]:
# The labels were filtered with the encoder's retained_proteins mask, so both
# arrays are expected to have the same number of rows.
print('Sanity Check')
train_encodings.shape, train_labels.shape

In [6]:
# Step 1: hold out 10% of all samples as the test set.
X_rest, X_test, Y_rest, Y_test = train_test_split(
    train_encodings, train_labels, test_size=0.1, random_state=123
)
# Step 2: take ~1/9 of the remaining 90% as validation, i.e. about 10% of the total.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_rest, Y_rest, test_size=0.11111111111, random_state=123
)
# Drop the intermediate 90% split so it does not linger in memory.
del X_rest, Y_rest

X_train.shape, Y_train.shape, X_val.shape, Y_val.shape, X_test.shape, Y_test.shape

In [7]:
# Utility function: plot model's accuracy and loss

plt.style.use('seaborn-v0_8-paper')

def plot_history(history, skip=2):
  """Plot training vs. validation binary accuracy and loss per epoch.

  history: object whose `.history` dict holds the per-epoch lists
    'binary_accuracy', 'val_binary_accuracy', 'loss' and 'val_loss'.
  skip: number of leading epochs left out of the curves (default 2).
    It is ignored when fewer than two epochs would remain, so a short run
    still produces a visible line instead of an empty plot.

  Returns nothing on purpose: returning the figure from the last line of a
  notebook cell would render it a second time.
  """
  curves = history.history
  n_epochs = len(curves['binary_accuracy'])

  # A line needs at least two points; fall back to plotting every epoch otherwise.
  start = skip if n_epochs - skip >= 2 else 0
  epochs = range(1, n_epochs + 1)[start:]

  fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 5))

  ax_acc.plot(epochs, curves['binary_accuracy'][start:], 'b', label='Training acc')
  ax_acc.plot(epochs, curves['val_binary_accuracy'][start:], 'r', label='Validation acc')
  ax_acc.set_title('Training and validation accuracy')
  ax_acc.set_xlabel('Epoch')
  ax_acc.set_ylabel('Binary accuracy')
  ax_acc.legend()

  ax_loss.plot(epochs, curves['loss'][start:], 'b', label='Training loss')
  ax_loss.plot(epochs, curves['val_loss'][start:], 'r', label='Validation loss')
  ax_loss.set_title('Training and validation loss')
  ax_loss.set_xlabel('Epoch')
  ax_loss.set_ylabel('Loss')
  ax_loss.legend()

In [8]:
# Utility function: Display model score(Loss & Accuracy) across all sets.

def display_model_score(model, train, val, test, batch_size):
  """Evaluate `model` on the train, val and test splits and print the scores.

  model: object exposing `evaluate(x, y, batch_size=..., verbose=...)`.
  train, val, test: indexable pairs; item 0 is passed as inputs and item 1 as
    targets.
  batch_size: forwarded to every `evaluate` call.

  The first two entries of each evaluate result are printed as loss and
  binary accuracy; that ordering depends on how the model was compiled.
  """
  named_splits = (('Train', train), ('Val', val), ('Test', test))

  for position, (label, split) in enumerate(named_splits):
    # A ruler separates consecutive splits (none before the first one).
    if position > 0:
      print('-'*70)

    score = model.evaluate(split[0], split[1], batch_size=batch_size, verbose=1)
    print(label + ' loss: ', score[0])
    print(label + ' binary_accuracy: ', score[1])

In [9]:
def residual_block(data, filters, d_rate):
  """
  Residual block: (BatchNorm -> ReLU -> Conv1D) twice, then add the input back.

  data: input tensor; it is added to the block output, so it must be
        shape-compatible with a `filters`-channel convolution output
  filters: number of filters of both convolutions
  d_rate: dilation rate handed to the first convolution
  """
  weight_decay = 0.001

  # first stage: normalise, activate, kernel-size-1 convolution
  # NOTE(review): with a kernel size of 1 there is a single tap, so the
  # dilation rate has no effect on this layer - confirm this is intended.
  normed_in = BatchNormalization()(data)
  activated_in = Activation('relu')(normed_in)
  pointwise = Conv1D(filters, 1, dilation_rate=d_rate, padding='same', kernel_regularizer=l2(weight_decay))(activated_in)

  # second stage: normalise, activate, kernel-size-3 convolution (undilated)
  normed_mid = BatchNormalization()(pointwise)
  activated_mid = Activation('relu')(normed_mid)
  widened = Conv1D(filters, 3, padding='same', kernel_regularizer=l2(weight_decay))(activated_mid)

  # skip connection: convolution output plus the untouched block input
  return Add()([widened, data])

In [10]:
# model: 1-D CNN over one-hot encoded sequences with a multi-label sigmoid head

# 750 sequence positions x 25 one-hot channels (one per entry of possible_aa)
x_input = Input(shape=(750, 25))

#initial conv: kernel size 1 lifts the 25 input channels to 256, matching the
# filter count of the residual blocks so their skip-additions line up
conv = Conv1D(256, 1, padding='same')(x_input)

# per-residue representation
res1 = residual_block(conv, 256, 2)
res2 = residual_block(res1, 256, 3)

x = MaxPooling1D(3)(res2)
x = Dropout(0.5)(x)

# multi-label classifier: 1500 independent sigmoid outputs (not a softmax),
# trained with binary_crossentropy below
# NOTE(review): 1500 presumably matches the number of label columns - confirm
x = Flatten()(x)
x_output = Dense(1500, activation='sigmoid', kernel_regularizer=l2(0.0001))(x)

model_CNN = Model(inputs=x_input, outputs=x_output)
model_CNN.compile(optimizer='adam', loss='binary_crossentropy', metrics=['binary_accuracy'])

model_CNN.summary()


In [11]:
# Early Stopping
# Stop once val_loss has not improved for 3 consecutive epochs. When that
# happens, restore_best_weights rolls the model back to the epoch with the
# lowest val_loss instead of keeping the weights of the later, worse epochs.
es = EarlyStopping(monitor='val_loss', patience=3, verbose=1, restore_best_weights=True)

In [12]:
# Train for at most 10 epochs with batches of 256; the `es` callback may end
# training earlier based on the validation loss. `history` feeds plot_history.
history = model_CNN.fit(
    X_train, Y_train,
    epochs=10, batch_size=256,
    validation_data= (X_val,Y_val),
    callbacks=[es]
    )

In [13]:
# saving model weights.
# Written to the current working directory.
# NOTE(review): newer Keras releases require the file name to end in
# '.weights.h5' - confirm against the installed Keras version.
model_CNN.save_weights('model_CNN.h5')

In [14]:
# Plot the accuracy and loss curves recorded during training.
plot_history(history)

In [15]:
# Report loss and binary accuracy on all three splits, evaluating with the
# same batch size (256) that was used for training.
display_model_score(
    model_CNN,
    [X_train, Y_train],
    [X_val, Y_val],
    [X_test, Y_test],
    256)

In [16]:
from sklearn.metrics import hamming_loss

def _report_hamming_loss(split_name, inputs, targets):
    """Predict on one split, round the outputs, print the Hamming loss.

    Returns the rounded predictions so the caller can keep them.
    """
    rounded = np.round(model_CNN.predict(inputs))
    print(f'Hamming Loss On {split_name} Set = {hamming_loss(targets, rounded)}')
    return rounded

pred_tn = _report_hamming_loss('Training', X_train, Y_train)
pred_tt = _report_hamming_loss('Test', X_test, Y_test)
pred_val = _report_hamming_loss('Validation', X_val, Y_val)

In [17]:
# max_sequences=0 disables the cap, so every record of the test FASTA is read.
# NOTE: this rebinds the notebook-level max_seq that the training cell also sets.
max_seq = 0
test_encoder = ProteinsOneHotEncoder(
    id_file = '/kaggle/input/t5embeds/test_ids.npy',
    fasta_file = '/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta',
    max_sequences = max_seq
)
test_encodings = test_encoder.encode_data()

In [18]:
# Persist the per-record keep/drop mask of the test encoder, so that prediction
# rows can later be matched back to protein ids.
np.save('test_proteins_retained.npy', test_encoder.retained_proteins)

In [19]:
# Replaces the weights currently held by model_CNN with weights read from a
# Kaggle input dataset.
# NOTE(review): this is a different path from the 'model_CNN.h5' written by the
# save cell - presumably a copy uploaded from an earlier run; confirm it
# matches the architecture defined in this notebook.
model_CNN.load_weights('/kaggle/input/train-labels-cafa5/model_CNN.h5')

In [20]:
# One row of sigmoid outputs per retained test protein.
predictions = model_CNN.predict(test_encodings)

In [21]:
# Store the raw (unrounded) predictions in the working directory.
np.save('predictions_CNN.npy', predictions)

In [22]:
# Overwrites `predictions` with an array read from a Kaggle input dataset, so
# the result of the model_CNN.predict cell is discarded from here on.
# NOTE(review): presumably the file saved by an earlier run - confirm its rows
# correspond to the retained-protein mask loaded in the next cell.
predictions = np.load('/kaggle/input/cafa-5-cnn/predictions_CNN.npy')

In [23]:
# NOTE(review): allow_pickle=True unpickles the file content, which can execute
# arbitrary code - only acceptable because the dataset is trusted; confirm.
labels = np.load('/kaggle/input/train-labels-cafa5/top_1500_labels.npy',allow_pickle=True)
# Mask read from the input dataset rather than from test_encoder in this
# session - presumably the array saved as 'test_proteins_retained.npy' by an
# earlier run; verify.
slice_ = np.load('/kaggle/input/train-labels-cafa5/test_proteins_retained.npy')

In [24]:
# Keep only the ids selected by slice_.
# NOTE(review): assumes slice_ is a boolean mask aligned with test_ids and with
# the record order of the test FASTA - confirm.
test_ids = np.load('/kaggle/input/t5embeds/test_ids.npy')
test_protein_ids = test_ids[slice_]

In [25]:
# Repeat every protein id once per prediction column, so the ids line up with
# the row-major flattening of `predictions`.
terms_per_protein = predictions.shape[1]

l = []
for protein_id in tqdm(list(test_protein_ids)):
    l.extend([protein_id] * terms_per_protein)

100%|██████████| 123639/123639 [00:02<00:00, 56633.88it/s]


In [26]:
# Long-format submission table: one row per (protein, GO term) pair.
# - 'Protein ID': each id repeated predictions.shape[1] times (built above in `l`)
# - 'GO Term ID': the whole label vector repeated once per prediction row
# - 'Prediction': predictions flattened row by row, rounded to 3 decimals
# NOTE(review): the three columns only line up if len(labels) equals
# predictions.shape[1] and len(test_protein_ids) equals predictions.shape[0] - confirm.
df_submission = pd.DataFrame(
    {
        'Protein ID': l,
        'GO Term ID': np.tile(labels, predictions.shape[0]),
        'Prediction': np.round(predictions.ravel(),3)
    }
)

In [27]:
# Quick visual check of the first rows before writing the file.
df_submission.head()

Unnamed: 0,Protein ID,GO Term ID,Prediction
0,Q9CQV8,GO:0005575,0.744
1,Q9CQV8,GO:0008150,0.715
2,Q9CQV8,GO:0110165,0.762
3,Q9CQV8,GO:0003674,0.624
4,Q9CQV8,GO:0005622,0.796


In [28]:
# Tab-separated output without a header row and without the index column.
df_submission.to_csv('submission.tsv', sep='\t', header=False, index=False)