In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import progressbar
import os
from sklearn.model_selection import train_test_split
import time
from sklearn.metrics import roc_auc_score

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


### Loading ProtBert Train embeddings and labels

In [2]:
# Embedding matrix (model input) and the matching label matrix (targets).
# NOTE(review): the section heading says ProtBert, but this file is read from
# the `t5embeds` dataset while the test embeddings further down come from a
# ProtBert dataset — confirm both splits were produced by the same encoder.
TRAIN_EMBEDDINGS_PATH = '/kaggle/input/t5embeds/train_embeds.npy'
TRAIN_LABELS_PATH = "/kaggle/input/xgbdata/Y_1499.npy"

train_embeddings = np.load(TRAIN_EMBEDDINGS_PATH)
labels_y = np.load(TRAIN_LABELS_PATH)

### Splitting Train Dataset into Train and Validation Dataset

In [3]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split
# identical between runs.
X_trn, X_tst, y_trn, y_tst = train_test_split(
    train_embeddings,
    labels_y,
    test_size=0.2,
    random_state=42,
)

In [4]:
# Wrap the raw embedding matrix in a DataFrame with 1-based feature names
# ("Column_1" ... "Column_<n>").
column_num = train_embeddings.shape[1]
feature_names = ["Column_" + str(idx + 1) for idx in range(column_num)]
train_df = pd.DataFrame(train_embeddings, columns=feature_names)
print(train_df.shape)

(142246, 1024)


In [5]:
# Training terms table from the CAFA 5 competition data (tab-separated file).
train_terms = pd.read_csv(
    "/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv",
    sep="\t",
)
print(train_terms.shape)

(5363863, 3)


### Defining and Training the model

In [None]:
INPUT_SHAPE = (1024, 1)
BATCH_SIZE = 5120

# 1D-CNN over the embedding vector: the flat 1024-feature input is reshaped to
# (1024, 1) so Conv1D can slide along the feature axis; two dense layers then
# feed a 1499-unit sigmoid head (one output per label column).
model = tf.keras.Sequential()
model.add(tf.keras.layers.Reshape(INPUT_SHAPE, input_shape=(1024,)))
for n_filters in (32, 64):
    model.add(
        tf.keras.layers.Conv1D(filters=n_filters, kernel_size=3, activation='relu')
    )
model.add(tf.keras.layers.BatchNormalization())
model.add(tf.keras.layers.MaxPooling1D(pool_size=2))
model.add(tf.keras.layers.Flatten())
for n_units in (712, 1012):
    model.add(tf.keras.layers.Dense(units=n_units, activation='leaky_relu'))
model.add(tf.keras.layers.Dense(units=1499, activation='sigmoid'))

# Weights-only checkpoint, overwritten at the end of every epoch.
checkpoint_path = "training/cp.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
    save_freq='epoch',
)

# Compile model: per-label binary cross-entropy, tracked with accuracy and AUC.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
tracked_metrics = ['binary_accuracy', tf.keras.metrics.AUC()]
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=tracked_metrics,
)

# Train on the training split only; the held-out split is scored in a later cell.
history = model.fit(
    x=X_trn,
    y=y_trn,
    batch_size=BATCH_SIZE,
    epochs=100,
    callbacks=[cp_callback],
)

Epoch 1/100
Epoch 1: saving model to training/cp.ckpt
Epoch 2/100
Epoch 2: saving model to training/cp.ckpt
Epoch 3/100
Epoch 3: saving model to training/cp.ckpt
Epoch 4/100
Epoch 4: saving model to training/cp.ckpt
Epoch 5/100
Epoch 5: saving model to training/cp.ckpt
Epoch 6/100
Epoch 6: saving model to training/cp.ckpt
Epoch 7/100
Epoch 7: saving model to training/cp.ckpt
Epoch 8/100
Epoch 8: saving model to training/cp.ckpt
Epoch 9/100
Epoch 9: saving model to training/cp.ckpt
Epoch 10/100
Epoch 10: saving model to training/cp.ckpt
Epoch 11/100
Epoch 11: saving model to training/cp.ckpt
Epoch 12/100
Epoch 12: saving model to training/cp.ckpt
Epoch 13/100
Epoch 13: saving model to training/cp.ckpt
Epoch 14/100
Epoch 14: saving model to training/cp.ckpt
Epoch 15/100
Epoch 15: saving model to training/cp.ckpt
Epoch 16/100
Epoch 16: saving model to training/cp.ckpt
Epoch 17/100
Epoch 17: saving model to training/cp.ckpt
Epoch 18/100
Epoch 18: saving model to training/cp.ckpt
Epoch 19/1

### Calculating Hamming loss and F1 score on the validation dataset

In [11]:
# Binarise the sigmoid outputs at a fixed cut-off and score them against the
# held-out split with precision/recall/F1 and Hamming loss.
threshold = 0.5
predictions = model.predict(X_tst)
val_preds = np.where(predictions > threshold, 1, 0)
val_labels = y_tst

# Counts are pooled over every (sample, label) cell, i.e. micro-averaging.
tp = np.sum((val_preds == 1) & (val_labels == 1))
fp = np.sum((val_preds == 1) & (val_labels == 0))
fn = np.sum((val_preds == 0) & (val_labels == 1))

# Guard each ratio: with no predicted positives (or no actual positives) the
# plain division is 0/0 and would silently propagate NaN into the score.
precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
recall = tp / (tp + fn) if (tp + fn) > 0 else 0.0
if (precision + recall) > 0:
    f1_score = 2 * (precision * recall) / (precision + recall)
else:
    f1_score = 0.0

# Fraction of (sample, label) cells where prediction and target disagree.
hamming_loss = np.mean(val_preds != val_labels)

print("Hamming loss:", hamming_loss)
# NOTE(review): the value printed below is the micro-F1 at the single threshold
# above, not an F-max (maximum F1 over thresholds). The label is left unchanged
# so it still matches previously recorded output — consider renaming it.
print("F-max score:", f1_score)

Hamming loss: 0.020409529024035943
F-max score: 0.37890877538001333


### If using pre-trained weights

In [4]:
# Testing
INPUT_SHAPE = (1024, 1)
BATCH_SIZE = 5120

# Rebuild the same layer stack that was used for training so that the
# checkpointed weights loaded in a later cell line up with these layers.
conv_stack = [
    tf.keras.layers.Reshape(INPUT_SHAPE, input_shape=(1024,)),
    tf.keras.layers.Conv1D(filters=32, kernel_size=3, activation='relu'),
    tf.keras.layers.Conv1D(filters=64, kernel_size=3, activation='relu'),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.MaxPooling1D(pool_size=2),
    tf.keras.layers.Flatten(),
]
dense_head = [
    tf.keras.layers.Dense(units=712, activation='leaky_relu'),
    tf.keras.layers.Dense(units=1012, activation='leaky_relu'),
    tf.keras.layers.Dense(units=1499, activation='sigmoid'),
]
model = tf.keras.Sequential(conv_stack + dense_head)

# Same optimiser, loss and metrics as the training configuration.
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(
    optimizer=adam,
    loss='binary_crossentropy',
    metrics=['binary_accuracy', tf.keras.metrics.AUC()],
)


In [5]:
# Resolve the most recent checkpoint prefix in the attached weights dataset.
latest = tf.train.latest_checkpoint("/kaggle/input/checkpt")
# latest_checkpoint returns None when the directory holds no checkpoint state;
# fail here with a clear message instead of an obscure error in load_weights.
if latest is None:
    raise FileNotFoundError(
        "No TensorFlow checkpoint found in /kaggle/input/checkpt"
    )
latest

'/kaggle/input/checkpt/cp.ckpt'

In [6]:
# Restore the checkpointed weights into the freshly built model; the returned
# load status is left as the cell's displayed value.
model.load_weights(filepath=latest)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x7d7db1fff850>

In [9]:
# Test-set embedding matrix, wrapped in a DataFrame with 1-based feature names.
# NOTE(review): this path is a ProtBert dataset while the training embeddings
# are read from `t5embeds` — confirm both come from the same encoder.
TEST_EMBEDDINGS_PATH = '/kaggle/input/protbert-embeddings-for-cafa5/test_embeddings.npy'
test_embeddings = np.load(TEST_EMBEDDINGS_PATH)

column_num = test_embeddings.shape[1]
test_feature_names = ["Column_" + str(idx + 1) for idx in range(column_num)]
test_df = pd.DataFrame(test_embeddings, columns=test_feature_names)
print(test_df.shape)

(141865, 1024)


### Getting and Saving Predictions

In [10]:
# Per-label sigmoid scores for every test row (one row per test embedding).
predictions = model.predict(x=test_df)



In [11]:
# Persist the raw test-set scores to the working directory.
np.save(file="pred_0.34.npy", arr=predictions)

In [None]:
# Time inference on the held-out split, then compute one ROC AUC per label.
t0 = time.time()
Y_pred_test = model.predict(X_tst)
tt = time.time() - t0
print("MLP hehe", tt)

l = []  # per-label ROC AUC values; averaged by the summary cell that follows
for i in range(y_tst.shape[1]):
    # ROC AUC is undefined when a label column contains a single class in the
    # held-out targets (roc_auc_score raises ValueError), so the check must be
    # made on that label column — not on the feature matrix — and such labels
    # are scored at chance level.
    if np.unique(y_tst[:, i]).size > 1:
        s = roc_auc_score(y_tst[:, i], Y_pred_test[:, i])
    else:
        s = 0.5
    l.append(s)
    if i % 10 == 0:
        print(i, s)


In [None]:
# One-row summary table for the model: mean per-label ROC AUC, inference time
# (rounded to 0.1 s) and the size of the held-out split.
df_models_stat = pd.DataFrame()
mlp_summary = {
    'RocAuc Mean Test': np.mean(l),
    'Time': np.round(tt, 1),
    'Test Size': len(X_tst),
}
for stat_name, stat_value in mlp_summary.items():
    df_models_stat.loc["MLP", stat_name] = stat_value
df_models_stat

In [None]:
# Peek at the first five rows of the test feature table.
test_df.head(n=5)