# HyperParameterTraining for ANN model - CAFA 5

In [1]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd

import tensorflow as tf

import h5py

from sklearn.model_selection import train_test_split
from sklearn.metrics import hamming_loss

!pip install -U -q keras-tuner
import keras_tuner as kt

[0m

In [2]:
if tf.test.gpu_device_name():
    print("GPU is available : {}".format(tf.test.gpu_device_name()))
else:
    print("GPU is not available, Training CPU instead.")

GPU is available : /device:GPU:0


In [3]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/train-labels-cafa5/top_1500_labels.npy
/kaggle/input/train-labels-cafa5/KerasTunerMLP.h5
/kaggle/input/train-labels-cafa5/train_labels.h5
/kaggle/input/train-labels-cafa5/MultiaLayerPerceptron.h5
/kaggle/input/cafa-5-protein-function-prediction/sample_submission.tsv
/kaggle/input/cafa-5-protein-function-prediction/IA.txt
/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset.fasta
/kaggle/input/cafa-5-protein-function-prediction/Test (Targets)/testsuperset-taxon-list.tsv
/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv
/kaggle/input/cafa-5-protein-function-prediction/Train/train_sequences.fasta
/kaggle/input/cafa-5-protein-function-prediction/Train/train_taxonomy.tsv
/kaggle/input/cafa-5-protein-function-prediction/Train/go-basic.obo
/kaggle/input/t5embeds/train_ids.npy
/kaggle/input/t5embeds/test_embeds.npy
/kaggle/input/t5embeds/train_embeds.npy
/kaggle/input/t5embeds/test_ids.npy


In [None]:
# # Just to involve the CAFA 5 dataset
# train_tax = pd.read_csv('/kaggle/input/cafa-5-protein-function-prediction/Train/train_taxonomy.tsv',sep='\t')
# train_tax.head()

In [4]:
X = np.load('/kaggle/input/t5embeds/train_embeds.npy')

with h5py.File('/kaggle/input/train-labels-cafa5/train_labels.h5', 'r') as hf:
    Y = hf['labels'][:]

In [5]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size= 0.01, random_state=123)

## Check the hamming loss on the [Previous ANN Model](https://www.kaggle.com/code/stomar21/kerastuner?scriptVersionId=135047119)

In [6]:
old_model = tf.keras.models.load_model('/kaggle/input/train-labels-cafa5/KerasTunerMLP.h5')
pred_old = np.round(old_model.predict(X_test))
pred_old_train = np.round(old_model.predict(X_train))

print('Hamming Loss for Old Model on Test Set= {}'.format(hamming_loss(Y_test,pred_old)))
print('Hamming Loss for Old Model on Train Set= {}'.format(hamming_loss(Y_train,pred_old_train)))

Hamming Loss for Old Model on Test Set= 0.017841649098149448
Hamming Loss for Old Model on Train Set= 0.016263271387959827


```
Output - 
------------------------------------------------------------------------
45/45 [==============================] - 3s 2ms/step
4401/4401 [==============================] - 8s 2ms/step
Hamming Loss for Old Model on Test Set= 0.017841649098149448
Hamming Loss for Old Model on Train Set= 0.016263271387959827
------------------------------------------------------------------------
```

# Creating A Keras Tuner HyperBand to look for best hyperparameters
(Code is now Commented Out as the notebook was then used for kaggle submission)

In [20]:
# Creating a hamming_loss function for passing as a metric
def hamming_loss_tf(y_true, y_pred):
    y_pred_rounded = tf.round(y_pred)
    return tf.py_function(hamming_loss, (y_true, y_pred_rounded), tf.float32)

In [34]:
!rm -rf keras_tuner_directory

In [35]:
tuner = kt.Hyperband(model_builder,
                     objective=kt.Objective("val_hamming_loss_tf", direction="min"),
                     max_epochs=10,
                     factor=3,
                     directory='keras_tuner_directory',
                     project_name='keras_optimisation_MLP_hamming_loss')

In [36]:
stop_early = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3)

In [37]:
tuner.search(X_train, Y_train, epochs=25, validation_split=0.2, callbacks=[stop_early])

Trial 30 Complete [00h 04m 08s]
val_hamming_loss_tf: 0.019137214869260788

Best val_hamming_loss_tf So Far: 0.017995186150074005
Total elapsed time: 01h 19m 18s


In [None]:
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

In [None]:
print('Best Parameters After Search')
for hp_name, hp_value in best_hps.values.items():
    print(f'{hp_name}: {hp_value}')

```
Ouput - 
------------------------------------
Best Parameters After Search
activation: relu
layer_1: 1024
layer_2: 256
layer_3: 512
learning_rate: 0.001
tuner/epochs: 10
tuner/initial_epoch: 4
tuner/bracket: 2
tuner/round: 2
tuner/trial_id: 0014
------------------------------------
```

In [None]:
model = tuner.hypermodel.build(best_hps)
history = model.fit(X_train, y_train, epochs=25, validation_data=(X_test,Y_test),
                    callbacks=[stop_early])

In [None]:
model.save('KerasTunerMLP_hamming_loss.h5')

# Checking the new model

In [None]:
new_model = tf.keras.models.load_model('/kaggle/input/train-labels-cafa5/KerasTunerMLP_hamming_loss.h5')
pred_new = new_model.predict(X_test)
pred_new[pred_new<=0.5] = 0
pred_new[pred_new>0.5] = 1
pred_new_train = new_model.predict(X_train)
pred_new_train[pred_new_train<=0.5] = 0
pred_new_train[pred_new_train>0.5] = 1

print('Hamming Loss for New Model on Test Set= {}'.format(hamming_loss(Y_test,pred_new)))
print('Hamming Loss for New Model on Train Set= {}'.format(hamming_loss(Y_train,pred_new_train)))

```
Output - 
------------------------------------------------------------------------
45/45 [==============================] - 0s 5ms/step
4401/4401 [==============================] - 21s 5ms/step
Hamming Loss for New Model on Test Set= 0.017841649098149448
Hamming Loss for New Model on Train Set= 0.016263271387959827
------------------------------------------------------------------------
```

# Creating A New Submission Using the New Model

In [None]:
test_embeddings = np.load('/kaggle/input/t5embeds/test_embeds.npy')
labels = np.load('/kaggle/input/train-labels-cafa5/top_1500_labels.npy',allow_pickle=True)
new_model = tf.keras.models.load_model('/kaggle/input/train-labels-cafa5/KerasTunerMLP.h5')
print('Data Loaded Successfully!')

In [None]:
%%time
predictions = new_model.predict(test_embeddings)

In [None]:
%time
test_protein_ids = np.load('/kaggle/input/t5embeds/test_ids.npy')
l = []
for k in list(test_protein_ids):
    l += [ k] * predictions.shape[1]

In [None]:
%%time
df_submission = pd.DataFrame(
    {
        'Protein ID': l,
        'GO Term ID': np.tile(labels, predictions.shape[0]),
        'Prediction': np.round(predictions.ravel(),3)
    }
)

print('df_submission Created Successfully!')

In [None]:
df_submission.head()

In [None]:
%%time
print('Starting to save submission to .csv . . . . .')
df_submission.to_csv('submission.tsv', sep='\t', header=None, index= None)
print('Done!\nCreated a submission.tsv file!')