In [1]:
MAIN_DIR = "/kaggle/input/cafa-5-protein-function-prediction"

# UTILITARIES
import numpy as np
import pandas as pd
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
plt.style.use('ggplot')
from sklearn.model_selection import train_test_split
# TORCH MODULES FOR METRICS COMPUTATION :
import torch
from torch.utils import data
from torch.utils.data import Dataset
from torch import nn
from torch.utils.data import random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchmetrics.classification import MultilabelF1Score
from torchmetrics.classification import MultilabelAccuracy

import pytorch_lightning as pl
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import WandbLogger
import torch.nn.functional as F
# KERAS
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from keras.callbacks import EarlyStopping
import tensorflow as tf
# WANDB FOR LIGHTNING :
import wandb

# FILES VISUALIZATION
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
from transformers import BertModel, BertTokenizer

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


/kaggle/input/train-targets-top500/train_targets_top500.npy
/kaggle/input/protbert-embeddings-for-cafa5/train_ids.npy
/kaggle/input/protbert-embeddings-for-cafa5/train_embeddings.npy
/kaggle/input/protbert-embeddings-for-cafa5/test_ids.npy
/kaggle/input/protbert-embeddings-for-cafa5/test_embeddings.npy
/kaggle/input/cafa-5-protein-function-prediction/sample_submission.tsv
/kaggle/input/cafa-5-protein-function-prediction/IA.txt
/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta
/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset-taxon-list.tsv
/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv
/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta
/kaggle/input/cafa-5-protein-function-prediction/Train/train_taxonomy.tsv
/kaggle/input/cafa-5-protein-function-prediction/Train/go-basic.obo


In [2]:
class config:
    train_sequences_path = MAIN_DIR  + "/Train/train_sequences.fasta"
    train_labels_path = MAIN_DIR + "/Train/train_terms.tsv"
    test_sequences_path = MAIN_DIR + "/Test (Targets)/testsuperset.fasta"
    
    num_labels = 500
    n_epochs = 5
    batch_size = 128
    lr = 0.001
    MAX_FEATURES = 1024
    LSTM_UNITS = 256
    NUM_AUX_TARGETS = 6
    embed_size = 1024
    TRAINED_MODEL_DIR = 'lstm/'
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### LOAD PROBERT EMBEDDINGS

In [3]:
train_embeds = np.load('/kaggle/input/protbert-embeddings-for-cafa5/train_embeddings.npy')
train_ids = np.load('/kaggle/input/protbert-embeddings-for-cafa5/train_ids.npy')

In [4]:
train_embeds.shape

(142246, 1024)

In [5]:
test_embeds = np.load('/kaggle/input/protbert-embeddings-for-cafa5/test_embeddings.npy')
test_ids = np.load('/kaggle/input/protbert-embeddings-for-cafa5/test_ids.npy')

In [6]:
test_embeds.shape

(141865, 1024)

In [7]:
IX = np.arange(train_embeds.shape[0])
IX_train, IX_test, _,_ = train_test_split( IX, IX, train_size=0.5, random_state=42)

### Generating and saving top K labels

In [None]:
print("GENERATE TARGETS FOR ENTRY IDS ("+str(config.num_labels)+" MOST COMMON GO TERMS)")
ids = np.load("/kaggle/input/protbert-embeddings-for-cafa5/train_ids.npy")
labels = pd.read_csv(config.train_labels_path, sep = "\t")

top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
labels_names = top_terms[:config.num_labels].index.values
train_labels_sub = labels[(labels.term.isin(labels_names)) & (labels.EntryID.isin(ids))]
id_labels = train_labels_sub.groupby('EntryID')['term'].apply(list).to_dict()

go_terms_map = {label: i for i, label in enumerate(labels_names)}
labels_matrix = np.empty((len(ids), len(labels_names)))

for index, id in tqdm(enumerate(ids)):
    id_gos_list = id_labels[id]
    temp = [go_terms_map[go] for go in labels_names if go in id_gos_list]
    labels_matrix[index, temp] = 1

np.save("/kaggle/working/train_targets_top"+str(config.num_labels)+".npy", np.array(labels_matrix))
print("GENERATION FINISHED!")

In [None]:
type(labels_names)

In [None]:
np.save('label-names-top-500.npy',labels_names)

### Load saved labels

In [8]:
Y = np.load('/kaggle/input/train-targets-top500/train_targets_top500.npy')

In [9]:
Y.shape

(142246, 500)

### LSTM MODEL

In [10]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [11]:
torch.cuda.is_available()

True

In [12]:
x_train_torch = torch.tensor(train_embeds[IX_train,:], dtype=torch.float32).to(device)
x_val_torch = torch.tensor(train_embeds[IX_test,:], dtype=torch.float32).to(device)
y_train_torch = torch.tensor(Y[IX_train,:], dtype=torch.float32).to(device)
y_val_torch = torch.tensor(Y[IX_test,:], dtype=torch.float32).to(device)

train_dataset = data.TensorDataset(x_train_torch, y_train_torch)
val_dataset = data.TensorDataset(x_val_torch, y_val_torch)

In [13]:
class SpatialDropout(nn.Dropout2d):
    def forward(self, x):
        x = x.unsqueeze(2)
        x = x.permute(0, 3, 2, 1)
        x = super(SpatialDropout, self).forward(x)
        x = x.permute(0, 3, 2, 1)
        x = x.squeeze(2)
        return x

In [52]:
class Lstm(nn.Module):
    def __init__(self):
        super(Lstm, self).__init__()
#         embed_size = embedding_matrix.shape[1]
#         self.embedding = nn.Embedding(config.MAX_FEATURES, embed_size)
#         self.embedding.weight = nn.Parameter(torch.tensor(embedding_matrix, dtype=torch.float32))
#         self.embedding.weight.requires_grad = False
#         self.embedding_dropout = SpatialDropout(0.3)
        self.lstm1 = nn.LSTM(config.embed_size, config.LSTM_UNITS, bidirectional=False, batch_first=True)
        self.lstm2 = nn.LSTM(config.LSTM_UNITS, config.LSTM_UNITS, bidirectional=False, batch_first=True)
        
        self.linear1 = nn.Linear(256, 128)
        self.linear2 = nn.Linear(config.LSTM_UNITS, 128)
        
        self.linear_out = nn.Linear(config.LSTM_UNITS, 1)
        self.linear_aux_out = nn.Linear(config.LSTM_UNITS, config.NUM_AUX_TARGETS)
    
    def forward(self, x):
#         h_embedding = self.embedding(x)
#         h_embedding = self.embedding_dropout(h_embedding)
        h_lstm1, _ = self.lstm1(x)
        h_lstm2, _ = self.lstm2(h_lstm1)
        
        # global average pooling
        avg_pool = torch.mean(h_lstm2, 1)
        
        # global max pooling
        max_pool, _ = torch.max(h_lstm2, 1)
#         avg_pool = torch.unsqueeze(avg_pool, 0)
#         max_pool = torch.unsqueeze(max_pool, 0)
        print(avg_pool.shape)
        print(max_pool.shape)
        h_conc = torch.cat((max_pool, avg_pool), 0)
        h_conc_linear1 = F.relu(self.linear1(h_conc))
        h_conc_linear2 = F.relu(self.linear2(h_conc))
        h_conc_linear = torch.cat((h_conc_linear1, h_conc_linear2), 0)
        print(h_conc_linear.shape)
        hidden = h_conc + h_conc_linear
        result = self.linear_out(hidden)
        aux_result = self.linear_aux_out(hidden)
        out = torch.cat([result, aux_result], 0)
        return out

In [15]:
def train_model(model, train, test, model_file, model_name, loss_fn, lr=0.001, batch_size=512, n_epochs=10):
    param_lrs = [{'params': param, 'lr': lr} for param in model.parameters()]
    optimizer = torch.optim.Adam(param_lrs, lr=lr)
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lambda epoch: 0.6 ** epoch)
    train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    training_loss = []
    validation_loss = []
    
    best_loss = float("inf")
    for epoch in range(n_epochs):
        start_time = time.time()
        
        model.train()
        avg_loss = 0
        
        for data in tqdm(train_loader, disable=False):
            x_batch = data[:-1]
            y_batch = data[-1]
            if model_name != 'attention':
                y_pred = model(*x_batch)
            else:
                
                y_pred, _ = model(*x_batch, config.MAX_LEN)
            loss = loss_fn(y_pred, y_batch)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_loss += loss.item() / len(train_loader)
            
        training_loss.append(avg_loss)
        model.eval()
        _logger.info(f'... Validating {model_name} ... ')
        avg_val_loss = 0
        for val_data in tqdm(test_loader, disable=False):
            x_batch = val_data[:-1]
            y_batch = val_data[-1]
            if model_name != 'attention':
                y_pred = model(*x_batch)
            else:
                y_pred, _ = model(*x_batch, config.MAXLEN)
            
            val_loss = loss_fn(y_pred, y_batch)
            avg_val_loss += val_loss.item() / len(test_loader)
        
        elapsed_time = time.time() - start_time
        validation_loss.append(avg_val_loss)
        if avg_val_loss < best_loss:
            _logger.info('saving the best model so far')
            best_loss = avg_val_loss
            torch.save(model.state_dict(), model_file)
        _logger.info(
            f'Epoch {epoch + 1}/{n_epochs}\t training_loss={avg_loss:.4f} \t validation_loss={avg_val_loss: 4f} \t time={elapsed_time:.2f}s')
        scheduler.step()
    return training_loss, validation_loss

In [16]:
def custom_loss(data, targets):
        ''' Define custom loss function for weighted BCE on 'target' column '''
        bce_loss_1 = nn.BCEWithLogitsLoss(weight=targets[:, 1:2])(data[:, :1], targets[:, :1])
        bce_loss_2 = nn.BCEWithLogitsLoss()(data[:, 1:], targets[:, 2:])
        return (bce_loss_1 * loss_weights) + bce_loss_2

In [53]:
lstm_model = Lstm()

In [54]:
lstm_model.to(device)

Lstm(
  (lstm1): LSTM(1024, 256, batch_first=True)
  (lstm2): LSTM(256, 256, batch_first=True)
  (linear1): Linear(in_features=256, out_features=128, bias=True)
  (linear2): Linear(in_features=256, out_features=128, bias=True)
  (linear_out): Linear(in_features=256, out_features=1, bias=True)
  (linear_aux_out): Linear(in_features=256, out_features=6, bias=True)
)

In [58]:
model_file = config.TRAINED_MODEL_DIR + '/probert_1024_v1.pt'
training_loss, validation_loss = train_model(lstm_model, train_dataset, val_dataset, model_file, model_name="lstm",
                                                 n_epochs=1,loss_fn=nn.BCEWithLogitsLoss(), batch_size = config.batch_size)

  0%|          | 0/556 [00:00<?, ?it/s]


torch.Size([128])
torch.Size([128])
torch.Size([256])


ValueError: Target size (torch.Size([128, 500])) must be the same as input size (torch.Size([7]))

## SIMPLER IMPLEMENTATION :

In [None]:
new_shape = (1, train_embeds.shape[0])  # Add a dimension at the beginning
# X_train = np.reshape(train_embeds, new_shape)
X_train = np.expand_dims(train_embeds, axis=-1)
X_train.shape

In [None]:
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Enable GPU memory growth
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        print(e)

In [None]:
from tensorflow.python.client import device_lib
print(device_lib.list_local_devices())

In [None]:
sess = tf.compat.v1.Session(config=tf.compat.v1.ConfigProto(log_device_placement=True))

In [None]:
with tf.device('/GPU:1'):
    embed_size = train_embeds.shape[1]
    model = Sequential()
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(1024, activation='relu'))
    model.add(Dense(512, activation='relu'))
    model.add(Dense(500, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    epochs = 5
    batch_size = 64

    history = model.fit(X_train[IX_train,:], Y[IX_train,:], epochs=epochs, batch_size=batch_size,validation_split=0.1,callbacks=[EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)])

In [None]:
model.save("probert_1024_v1")

In [None]:
model_json = model.to_json()
with open("lstm_probert_1024_v1.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

In [None]:
accr = model.evaluate(X_train[IX_train,:],Y[IX_train,:])
print('Test set\n  Loss: {:0.3f}\n  Accuracy: {:0.3f}'.format(accr[0],accr[1]))

In [None]:
X_test = np.expand_dims(test_embeds, axis=-1)

In [None]:
Y_pred = model.predict(X_test)

In [None]:
np.save("lstm_probert_1024_v1.npy",Y_pred)

In [None]:
Y_pred.shape

### USING PYTORCH

In [None]:
class LSTM():
    def __init__(self):
        super(Lstm, self).__init__()
        self.lstm1 = nn.LSTM(config.embed_size, config.LSTM_UNITS, bidirectional=False, batch_first=True)
        self.linear1 = nn.Linear(256, config.LSTM_UNITS)
        
        self.linear_out = nn.Linear(config.LSTM_UNITS * 2, 1)