# Feed-forward Neural Networks

This note book will be used to test Feed-forward neural networks on each of the 3 ontologies.

In [10]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import itertools
import yaml

In [2]:
print("TensorFlow v" + tf.__version__)
print("Numpy v" + np.__version__)

TensorFlow v2.15.0
Numpy v1.26.4


In [3]:
# from google.colab import drive
# drive.mount('/content/drive/')

In [3]:
config_path = './config.yaml'
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)

In [4]:
USE_75_PERCENT_DATA = config['use_75_percent_datasets']
partial_dataset_prefix = '75percent_' if USE_75_PERCENT_DATA else ''
BP_train_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_embeddings_BiologicalProcesses.pkl")
CC_train_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_embeddings_CellularComponent.pkl")
MF_train_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_embeddings_MolecularFunction.pkl")
BP_label_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_labels_BiologicalProcesses.pkl")
CC_label_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_labels_CellularComponent.pkl")
MF_label_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_labels_MolecularFunction.pkl")

In [5]:
train_data_dict = {'Biological Processes': [BP_train_df, BP_label_df],
                   'Cellular Component': [CC_train_df, CC_label_df],
                   'Molecular Function': [MF_train_df, MF_label_df]
}

In [6]:
num_labels = 1500
num_folds = config['num_folds']

## Model 1 Architecture: All Dense layers of the same size

In [7]:
BATCH_SIZE = config['batch_size']

In [8]:
def model1_training(dataset_name, data):
  train = data[0]
  label = data[1]

  INPUT_SHAPE = [train.shape[1]]

  best_f1 = 0
  print('=======================================================================')
  print(f'Training for {dataset_name}')
  
  model_root_path = f'{config["directories"]["models"]}/FFNNMod1'

  for unit_param, act_param in itertools.product([512, 256, 128], ['relu', 'tanh']):
    print('----------------------------------------------------------------------')
    print('The number of units in each layer is ', unit_param)
    print('The activation function in each layer is ', act_param)

    kfold = KFold(n_splits=num_folds, shuffle=True)
    fold_no = 1

    for train_fold, test_fold in kfold.split(train, label):

      model1 = tf.keras.Sequential([
          tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
          tf.keras.layers.Dense(units = unit_param, activation = act_param),
          tf.keras.layers.Dense(units = unit_param, activation = act_param),
          tf.keras.layers.Dense(units = unit_param, activation = act_param),
          tf.keras.layers.Dense(units = num_labels, activation = 'sigmoid')
          ])

      # Compile model
      model1.compile(
          optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
          loss='binary_crossentropy',
          metrics=['binary_accuracy',
                  tf.keras.metrics.AUC(),
                  tf.keras.metrics.Precision(),
                  tf.keras.metrics.Recall(),
                  ] # tf.keras.metrics.F1Score() not appropriate as it is calculated batchwise
          )

      print(f'Training for fold {fold_no} ...')

      # Fit the data to the model
      history = model1.fit(
            train.iloc[train_fold], label.iloc[train_fold],
            validation_data=(train.iloc[test_fold], label.iloc[test_fold]),
            batch_size=BATCH_SIZE,
            epochs=5
        )

      # Generate metrics
      scores = model1.evaluate(train.iloc[test_fold], label.iloc[test_fold], verbose=0)
      precision = scores[3]
      recall = scores[4]
      F1_score = 2*precision*recall / (precision + recall)
      print(f'Score for fold {fold_no}: F1 score of {F1_score}; {model1.metrics_names[0]} of {scores[0]}; {model1.metrics_names[1]} of {scores[1]*100}%')

      if F1_score > best_f1:
        best_f1 = F1_score
        if dataset_name == 'Biological Processes':
          tf.keras.models.save_model(
              model1,
              f'{model_root_path}/best_BP_model',
          )
          print(f'Current best model for Biological Processes has {unit_param} units in each layer, uses {act_param} activation function and has an F1 score of {F1_score}')

        elif dataset_name == 'Molecular Function':
          tf.keras.models.save_model(
              model1,
              f'{model_root_path}/best_MF_model',
          )
          print(f'Current best model for Molecular Function has {unit_param} units in each layer, uses {act_param} activation function and has an F1 score of {F1_score}')

        else:
          tf.keras.models.save_model(
              model1,
              f'{model_root_path}/best_CC_model',
          )
          print(f'Current best model for Cellular Component has {unit_param} units in each layer, uses {act_param} activation function and has an F1 score of {F1_score}')

      fold_no += 1

In [None]:
for dataset in train_data_dict:
  dataset_name = dataset
  data = train_data_dict[dataset]
  model1_training(dataset_name, data)

## Model 2 Architecture: Decreasing then increasing layer size

In [11]:
BATCH_SIZE = config['batch_size']

In [12]:
def model2_training(dataset_name, data):
  train = data[0]
  label = data[1]

  INPUT_SHAPE = [train.shape[1]]

  best_f1 = 0
  print('=======================================================================')
  print(f'Training for {dataset_name}')

  model_root_path = f'{config["directories"]["models"]}/FFNNMod2'

  act_param = 'relu'

  kfold = KFold(n_splits=num_folds, shuffle=True)
  fold_no = 1

  for train_fold, test_fold in kfold.split(train, label):

    model2 = tf.keras.Sequential([
        tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
        tf.keras.layers.Dense(units = 512, activation = act_param),
        tf.keras.layers.Dense(units = 256, activation = act_param),
        tf.keras.layers.Dense(units = 128, activation = act_param),
        tf.keras.layers.Dense(units = 256, activation = act_param),
        tf.keras.layers.Dense(units = 512, activation = act_param),
        tf.keras.layers.Dense(units = num_labels, activation = 'sigmoid')
        ])

    # Compile model
    model2.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['binary_accuracy',
                tf.keras.metrics.AUC(),
                tf.keras.metrics.Precision(),
                tf.keras.metrics.Recall(),
                ] # tf.keras.metrics.F1Score() not appropriate as it is calculated batchwise
        )

    print(f'Training for fold {fold_no} ...')

    # Fit the data to the model
    history = model2.fit(
            train.iloc[train_fold], label.iloc[train_fold],
            validation_data=(train.iloc[test_fold], label.iloc[test_fold]),
            batch_size=BATCH_SIZE,
            epochs=5
        )

    # Generate metrics
    scores = model2.evaluate(train, label, verbose=0)
    precision = scores[3]
    recall = scores[4]
    F1_score = 2*precision*recall / (precision + recall)
    print(f'Score for fold {fold_no}: F1 score of {F1_score}; {model2.metrics_names[0]} of {scores[0]}; {model2.metrics_names[1]} of {scores[1]*100}%')

    if F1_score > best_f1:
      best_f1 = F1_score
      if dataset_name == 'Biological Processes':
        tf.keras.models.save_model(
            model2,
            f'{model_root_path}/best_BP_model',
        )
        print(f'Current best model for Biological Processes has an F1 score of {F1_score}')

      elif dataset_name == 'Molecular Function':
        tf.keras.models.save_model(
            model2,
            f'{model_root_path}/best_MF_model',
        )
        print(f'Current best model for Molecular Function has an F1 score of {F1_score}')

      else:
        tf.keras.models.save_model(
            model2,
            f'{model_root_path}/best_CC_model',
        )
        print(f'Current best model for Cellular Component has an F1 score of {F1_score}')

    fold_no += 1

In [13]:
for dataset in train_data_dict:
  dataset_name = dataset
  data = train_data_dict[dataset]
  model2_training(dataset_name, data)

Training for Biological Processes
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: F1 score of 0.40463973992582647; loss of 0.07077210396528244; binary_accuracy of 97.63535857200623%
INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_BP_model/assets


INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_BP_model/assets


Current best model for Biological Processes has an F1 score of 0.40463973992582647
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: F1 score of 0.38762048593469683; loss of 0.0713156908750534; binary_accuracy of 97.62780666351318%
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: F1 score of 0.39679436883418273; loss of 0.07171579450368881; binary_accuracy of 97.61015176773071%
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 4: F1 score of 0.40454342215528893; loss of 0.07122363895177841; binary_accuracy of 97.62759208679199%
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 5: F1 score of 0.4313931855785713; loss of 0.0707026869058609; binary_accuracy of 97.64573574066162%
INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_BP_model/assets


INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_BP_model/assets


Current best model for Biological Processes has an F1 score of 0.4313931855785713
Training for fold 6 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 6: F1 score of 0.42106710599565167; loss of 0.07064038515090942; binary_accuracy of 97.6440966129303%
Training for fold 7 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 7: F1 score of 0.3646359507960561; loss of 0.07176294177770615; binary_accuracy of 97.608482837677%
Training for fold 8 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 8: F1 score of 0.4425751018222649; loss of 0.07133384048938751; binary_accuracy of 97.6239800453186%
INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_BP_model/assets


INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_BP_model/assets


Current best model for Biological Processes has an F1 score of 0.4425751018222649
Training for fold 9 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 9: F1 score of 0.4352342282035109; loss of 0.07128937542438507; binary_accuracy of 97.62567281723022%
Training for fold 10 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 10: F1 score of 0.40138483648139883; loss of 0.07143636047840118; binary_accuracy of 97.62712121009827%
Training for Cellular Component
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: F1 score of 0.46577807557339573; loss of 0.058596041053533554; binary_accuracy of 98.08456301689148%
INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_CC_model/assets


INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_CC_model/assets


Current best model for Cellular Component has an F1 score of 0.46577807557339573
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: F1 score of 0.45157794282735764; loss of 0.05864283815026283; binary_accuracy of 98.07959794998169%
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: F1 score of 0.46490196913770626; loss of 0.059121958911418915; binary_accuracy of 98.07690382003784%
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 4: F1 score of 0.4962278912594993; loss of 0.05954664945602417; binary_accuracy of 98.06178212165833%
INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_CC_model/assets


INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_CC_model/assets


Current best model for Cellular Component has an F1 score of 0.4962278912594993
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 5: F1 score of 0.44502127804478037; loss of 0.05918203666806221; binary_accuracy of 98.06416630744934%
Training for fold 6 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 6: F1 score of 0.46913675108489594; loss of 0.058856237679719925; binary_accuracy of 98.08136820793152%
Training for fold 7 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 7: F1 score of 0.4655249276794386; loss of 0.058543529361486435; binary_accuracy of 98.08377623558044%
Training for fold 8 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 8: F1 score of 0.46257425369230226; loss of 0.05927106738090515; binary_accuracy of 98.07446599006653%
Training for fold 9 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 9: F1 score of 0.47230119727151454; loss of 0.0588369555771350

INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_MF_model/assets


Current best model for Molecular Function has an F1 score of 0.48580617788975133
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: F1 score of 0.4728350216691462; loss of 0.07084241509437561; binary_accuracy of 97.61848449707031%
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: F1 score of 0.47875401548324414; loss of 0.0704716145992279; binary_accuracy of 97.62996435165405%
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 4: F1 score of 0.4766632124685695; loss of 0.070428267121315; binary_accuracy of 97.63559103012085%
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 5: F1 score of 0.47316309734552997; loss of 0.07011868059635162; binary_accuracy of 97.63938188552856%
Training for fold 6 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 6: F1 score of 0.47320043469214335; loss of 0.07029090821743011; bi

INFO:tensorflow:Assets written to: ./models/FFNNMod2/best_MF_model/assets


Current best model for Molecular Function has an F1 score of 0.4962166985258705
Training for fold 8 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 8: F1 score of 0.46500409177285545; loss of 0.07071368396282196; binary_accuracy of 97.61626720428467%
Training for fold 9 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 9: F1 score of 0.4907043230085026; loss of 0.0702923834323883; binary_accuracy of 97.64026403427124%
Training for fold 10 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 10: F1 score of 0.48543521218406693; loss of 0.07013863325119019; binary_accuracy of 97.64605760574341%
