# Convolutional Neural Networks

This notebook is used to test convolutional neural networks on each of the three ontologies (Biological Processes, Cellular Component, and Molecular Function).

In [16]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import itertools
import pprint
import yaml

In [17]:
# Record the library versions this notebook was executed with.
print(f"TensorFlow v{tf.__version__}")
print(f"Numpy v{np.__version__}")

TensorFlow v2.15.0
Numpy v1.26.4


In [18]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [19]:
# Load the notebook settings (directories, batch size, fold count, ...) from YAML.
config_path = './config.yaml'
# Explicit UTF-8 so parsing does not depend on the platform's default encoding.
with open(config_path, 'r', encoding='utf-8') as file:
    # safe_load only builds plain Python objects (no arbitrary object construction).
    config = yaml.safe_load(file)

In [20]:
# When enabled in the config, read the '75percent_'-prefixed variants of every file.
USE_75_PERCENT_DATA = config['use_75_percent_datasets']
partial_dataset_prefix = '75percent_' if USE_75_PERCENT_DATA else ''


def _load_preprocessed(kind, ontology):
  """Read one preprocessed training pickle.

  Args:
    kind: 'embeddings' or 'labels' (the middle part of the file name).
    ontology: Ontology suffix of the file name, e.g. 'BiologicalProcesses'.

  Returns:
    Whatever object ``pd.read_pickle`` restores from
    ``<preprocessed_data>/<prefix>train_<kind>_<ontology>.pkl``.
  """
  # NOTE(review): unpickling can execute arbitrary code; only load files
  # produced by this project's own preprocessing step.
  return pd.read_pickle(
      f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_{kind}_{ontology}.pkl")


BP_train_df = _load_preprocessed('embeddings', 'BiologicalProcesses')
CC_train_df = _load_preprocessed('embeddings', 'CellularComponent')
MF_train_df = _load_preprocessed('embeddings', 'MolecularFunction')
BP_label_df = _load_preprocessed('labels', 'BiologicalProcesses')
CC_label_df = _load_preprocessed('labels', 'CellularComponent')
MF_label_df = _load_preprocessed('labels', 'MolecularFunction')

In [21]:
# Maps each ontology's display name to its [embeddings, labels] pair; the
# training functions read element 0 as the features and element 1 as the labels.
train_data_dict = {
    'Biological Processes': [BP_train_df, BP_label_df],
    'Cellular Component': [CC_train_df, CC_label_df],
    'Molecular Function': [MF_train_df, MF_label_df],
}

In [22]:
# Number of units in each model's final sigmoid layer (one output per label).
# NOTE(review): presumably matches the number of columns in the *_label_df frames — confirm.
num_labels = 1500
# Number of cross-validation splits handed to KFold, read from the YAML config.
num_folds = config['num_folds']

## Model 1 Architecture: CNN

In [23]:
# Mini-batch size passed to model.fit by the training functions, read from the YAML config.
BATCH_SIZE = config['batch_size']

In [24]:
def model1_training(dataset_name, data, random_state=None):
  """Grid-search a 1D CNN with K-fold cross-validation and save the best model.

  For every (number of filters, kernel size) combination a fresh model is
  built and trained for 5 epochs on each fold. Whenever a fold's held-out F1
  exceeds the best F1 seen so far -- across all folds and all hyper-parameter
  combinations -- that fold's model is saved under
  ``<config models dir>/CNNMod1/best_{BP,MF,CC}_model``.

  Reads the module-level ``config``, ``num_folds``, ``num_labels`` and
  ``BATCH_SIZE``.

  Args:
    dataset_name: Ontology display name. 'Biological Processes' and
      'Molecular Function' select the BP / MF save paths; any other value is
      treated as 'Cellular Component'.
    data: Two-element sequence ``[features, labels]``. Both elements are
      sliced with ``.iloc``; ``features.shape[1]`` sets the input length.
      NOTE(review): labels presumably have ``num_labels`` columns -- confirm.
    random_state: Optional seed forwarded to ``KFold`` so the shuffled splits
      are reproducible. The default ``None`` keeps the previous unseeded
      behaviour.

  Returns:
    None. Results are printed and the best model is written to disk.
  """
  train = data[0]
  label = data[1]

  INPUT_SHAPE = (train.shape[1], 1)

  # Save-path tag and printed name per ontology; unknown names fall back to
  # Cellular Component, exactly as the former if/elif/else chain did.
  save_targets = {
      'Biological Processes': ('BP', 'Biological Processes'),
      'Molecular Function': ('MF', 'Molecular Function'),
  }
  save_tag, display_name = save_targets.get(dataset_name, ('CC', 'Cellular Component'))

  # Best single-fold F1 seen so far over the whole grid (not a per-config mean).
  best_f1 = 0
  print('=======================================================================')
  print(f'Training for {dataset_name}')

  model_root_path = f'{config["directories"]["models"]}/CNNMod1'

  for num_filters, size_kernel in itertools.product([16, 32], [64, 128, 256]):
    print('----------------------------------------------------------------------')
    print('The number of filters is ', num_filters)
    print('The size of the kernel is ', size_kernel)

    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

    for fold_no, (train_fold, test_fold) in enumerate(kfold.split(train, label), start=1):
      # Many models are created in this loop; drop the previous fold's Keras
      # state so memory use does not grow with every iteration.
      tf.keras.backend.clear_session()

      model1 = tf.keras.Sequential([
          tf.keras.layers.BatchNormalization(input_shape = INPUT_SHAPE),
          tf.keras.layers.Conv1D(filters = num_filters, kernel_size = size_kernel, activation = 'relu'),
          tf.keras.layers.MaxPooling1D(pool_size = 2),
          tf.keras.layers.Flatten(),
          tf.keras.layers.Dense(units = 512, activation = 'relu'),
          tf.keras.layers.Dense(units = num_labels, activation = 'sigmoid')
          ])

      # Compile model
      model1.compile(
          optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
          loss='binary_crossentropy',
          metrics=['binary_accuracy',
                  tf.keras.metrics.AUC(),
                  tf.keras.metrics.Precision(),
                  tf.keras.metrics.Recall(),
                  ] # tf.keras.metrics.F1Score() not appropriate as it is calculated batchwise
          )

      print(f'Training for fold {fold_no} ...')

      # Fit the data to the model
      model1.fit(
          train.iloc[train_fold], label.iloc[train_fold],
          validation_data=(train.iloc[test_fold], label.iloc[test_fold]),
          batch_size=BATCH_SIZE,
          epochs=5
          )

      # Generate metrics. evaluate() returns the loss followed by the compiled
      # metrics in order: [loss, binary_accuracy, auc, precision, recall].
      scores = model1.evaluate(train.iloc[test_fold], label.iloc[test_fold], verbose=0)
      precision = scores[3]
      recall = scores[4]
      # Precision and recall are both 0 when the fold has no true positives;
      # report F1 = 0 instead of dividing by zero.
      pr_sum = precision + recall
      F1_score = 2*precision*recall / pr_sum if pr_sum > 0 else 0.0
      print(f'Score for fold {fold_no}: F1 score of {F1_score}; {model1.metrics_names[0]} of {scores[0]}; {model1.metrics_names[1]} of {scores[1]*100}%')

      if F1_score > best_f1:
        best_f1 = F1_score
        tf.keras.models.save_model(
            model1,
            f'{model_root_path}/best_{save_tag}_model')
        print(f'Current best model for {display_name} has {num_filters} filters, with a kernel size of {size_kernel} and has an F1 score of {F1_score}')

In [25]:
# Run the Model 1 grid search on each ontology in turn.
for dataset_name, data in train_data_dict.items():
  model1_training(dataset_name, data)

Training for Biological Processes
----------------------------------------------------------------------
The number of filters is  16
The size of the kernel is  64
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: F1 score of 0.37484124798104557; loss of 0.07890092581510544; binary_accuracy of 97.44029641151428%
INFO:tensorflow:Assets written to: ./models/CNNMod1/best_BP_model/assets


INFO:tensorflow:Assets written to: ./models/CNNMod1/best_BP_model/assets


Current best model for Biological Processes has 16 filters, with a kernel size of 64 and has an F1 score of 0.37484124798104557
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: F1 score of 0.32249055029598817; loss of 0.08144965022802353; binary_accuracy of 97.41843342781067%
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: F1 score of 0.3891332102075608; loss of 0.07708101719617844; binary_accuracy of 97.52199649810791%
INFO:tensorflow:Assets written to: ./models/CNNMod1/best_BP_model/assets


INFO:tensorflow:Assets written to: ./models/CNNMod1/best_BP_model/assets


Current best model for Biological Processes has 16 filters, with a kernel size of 64 and has an F1 score of 0.3891332102075608
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 4: F1 score of 0.38212572909483056; loss of 0.07803097367286682; binary_accuracy of 97.55370020866394%
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 5: F1 score of 0.3275335157003611; loss of 0.07878384739160538; binary_accuracy of 97.51865267753601%
Training for fold 6 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 6: F1 score of 0.37058668420557933; loss of 0.07630092650651932; binary_accuracy of 97.52975106239319%
Training for fold 7 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 7: F1 score of 0.3025247078115808; loss of 0.0777750015258789; binary_accuracy of 97.461998462677%
Training for fold 8 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 8: F1 score of 0.4244

INFO:tensorflow:Assets written to: ./models/CNNMod1/best_BP_model/assets


Current best model for Biological Processes has 16 filters, with a kernel size of 64 and has an F1 score of 0.42444720605221126
Training for fold 9 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 9: F1 score of 0.29739902590700984; loss of 0.08202087134122849; binary_accuracy of 97.49948382377625%
Training for fold 10 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 10: F1 score of 0.40940489901653004; loss of 0.0771559625864029; binary_accuracy of 97.46654033660889%
----------------------------------------------------------------------
The number of filters is  16
The size of the kernel is  128
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: F1 score of 0.3490705938865439; loss of 0.07742241024971008; binary_accuracy of 97.50298857688904%
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: F1 score of 0.3984031075241416; loss of 0.07791265100240707; binary_accur

KeyboardInterrupt: 

## Model 2 Architecture: CNN with LSTM

In [None]:
# Rebinds the module-level BATCH_SIZE (earlier read from config['batch_size'])
# to a fixed value. Every training function reads this global at call time, so
# model1_training also uses 256 if it is re-run after this cell.
BATCH_SIZE = 256

In [None]:
def model2_training(dataset_name, data, random_state=None):
  """Cross-validate a CNN + bidirectional-LSTM model and save the best model.

  For each candidate number of LSTM units a fresh model is built and trained
  for 5 epochs on each fold. Whenever a fold's held-out F1 exceeds the best
  F1 seen so far -- across all folds and all unit counts -- that fold's model
  is saved under ``<config models dir>/CNNMod2/best_{BP,MF,CC}_model``.

  Reads the module-level ``config``, ``num_folds``, ``num_labels`` and
  ``BATCH_SIZE``.

  Args:
    dataset_name: Ontology display name. 'Biological Processes' and
      'Molecular Function' select the BP / MF save paths; any other value is
      treated as 'Cellular Component'.
    data: Two-element sequence ``[features, labels]``. Both elements are
      sliced with ``.iloc``; ``features.shape[1]`` sets the input length.
      NOTE(review): labels presumably have ``num_labels`` columns -- confirm.
    random_state: Optional seed forwarded to ``KFold`` so the shuffled splits
      are reproducible. The default ``None`` keeps the previous unseeded
      behaviour.

  Returns:
    None. Results are printed and the best model is written to disk.
  """
  train = data[0]
  label = data[1]

  INPUT_SHAPE = (train.shape[1], 1)

  # Save-path tag and printed name per ontology; unknown names fall back to
  # Cellular Component, exactly as the former if/elif/else chain did.
  save_targets = {
      'Biological Processes': ('BP', 'Biological Processes'),
      'Molecular Function': ('MF', 'Molecular Function'),
  }
  save_tag, display_name = save_targets.get(dataset_name, ('CC', 'Cellular Component'))

  # Best single-fold F1 seen so far over all unit counts (not a per-config mean).
  best_f1 = 0
  print('=======================================================================')
  print(f'Training for {dataset_name}')

  model_root_path = f'{config["directories"]["models"]}/CNNMod2'

  for lstm_units in [2, 32]:
    print('----------------------------------------------------------------------')
    print('The number of LSTM units is ', lstm_units)

    kfold = KFold(n_splits=num_folds, shuffle=True, random_state=random_state)

    for fold_no, (train_fold, test_fold) in enumerate(kfold.split(train, label), start=1):
      # Many models are created in this loop; drop the previous fold's Keras
      # state so memory use does not grow with every iteration.
      tf.keras.backend.clear_session()

      forward = tf.keras.layers.LSTM(lstm_units, return_sequences=True)
      # NOTE(review): the backward LSTM uses 'relu' while the forward one keeps
      # the default activation -- confirm this asymmetry is intentional.
      backward = tf.keras.layers.LSTM(lstm_units, activation='relu', return_sequences=True, go_backwards=True)

      model2 = tf.keras.Sequential([
          tf.keras.layers.BatchNormalization(input_shape = INPUT_SHAPE),
          tf.keras.layers.Conv1D(filters = 32, kernel_size = 128, activation = 'relu'),
          tf.keras.layers.MaxPooling1D(pool_size = 2),
          # Bidirectional's second positional parameter is merge_mode, so the
          # custom backward layer has to be passed by keyword.
          tf.keras.layers.Bidirectional(forward, backward_layer=backward),
          tf.keras.layers.Flatten(),
          tf.keras.layers.Dense(units = 512, activation = 'relu'),
          tf.keras.layers.Dense(units = num_labels, activation = 'sigmoid')
          ])

      # Compile model
      model2.compile(
          optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
          loss='binary_crossentropy',
          metrics=['binary_accuracy',
                  tf.keras.metrics.AUC(),
                  tf.keras.metrics.Precision(),
                  tf.keras.metrics.Recall(),
                  ] # tf.keras.metrics.F1Score() not appropriate as it is calculated batchwise
          )

      print(f'Training for fold {fold_no} ...')

      # Fit the data to the model
      model2.fit(
          train.iloc[train_fold], label.iloc[train_fold],
          validation_data=(train.iloc[test_fold], label.iloc[test_fold]),
          batch_size=BATCH_SIZE,
          epochs=5
          )

      # Generate metrics. evaluate() returns the loss followed by the compiled
      # metrics in order: [loss, binary_accuracy, auc, precision, recall].
      scores = model2.evaluate(train.iloc[test_fold], label.iloc[test_fold], verbose=0)
      precision = scores[3]
      recall = scores[4]
      # Precision and recall are both 0 when the fold has no true positives;
      # report F1 = 0 instead of dividing by zero.
      pr_sum = precision + recall
      F1_score = 2*precision*recall / pr_sum if pr_sum > 0 else 0.0
      print(f'Score for fold {fold_no}: F1 score of {F1_score}; {model2.metrics_names[0]} of {scores[0]}; {model2.metrics_names[1]} of {scores[1]*100}%')

      if F1_score > best_f1:
        best_f1 = F1_score
        tf.keras.models.save_model(
            model2,
            f'{model_root_path}/best_{save_tag}_model')
        print(f'Current best model for {display_name} has {lstm_units} bidirectional LSTM units and an F1 score of {F1_score}')

In [None]:
# Run the Model 2 search on each ontology in turn.
for dataset_name, data in train_data_dict.items():
  model2_training(dataset_name, data)

Training for Biological Processes
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: F1 score of 0.8176819614715455; loss of 0.07005693763494492; binary_accuracy of 97.65358567237854%
Current best model for Biological Processes has an F1 score of 0.8176819614715455
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: F1 score of 0.8093954914906214; loss of 0.07036034017801285; binary_accuracy of 97.65002131462097%
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: F1 score of 0.8143425218404675; loss of 0.07011492550373077; binary_accuracy of 97.65574932098389%
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 4: F1 score of 0.816635784118438; loss of 0.06992646306753159; binary_accuracy of 97.65384793281555%
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 5: F1 score of 0.806318392725535