# Feed-forward Neural Networks

This note book will be used to test Feed-forward neural networks on each of the 3 ontologies.

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import itertools
import pprint

In [2]:
print("TensorFlow v" + tf.__version__)
print("Numpy v" + np.__version__)

TensorFlow v2.15.0
Numpy v1.25.2


In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
#%%capture
!unzip '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data.zip'

Archive:  /content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data.zip
  inflating: bio_data/75percent_train_embeddings_BiologicalProcesses.pkl  
  inflating: bio_data/75percent_train_labels_BiologicalProcesses.pkl  
  inflating: bio_data/train_embeddings_CellularComponent.pkl  
  inflating: bio_data/train_embeddings_MolecularFunction.pkl  
  inflating: bio_data/train_labels_CellularComponent.pkl  
  inflating: bio_data/train_labels_MolecularFunction.pkl  


In [5]:
BP_train_df = pd.read_pickle('/content/bio_data/75percent_train_embeddings_BiologicalProcesses.pkl')
CC_train_df = pd.read_pickle('/content/bio_data/train_embeddings_CellularComponent.pkl')
MF_train_df = pd.read_pickle('/content/bio_data/train_embeddings_MolecularFunction.pkl')

In [6]:
BP_label_df = pd.read_pickle('/content/bio_data/75percent_train_labels_BiologicalProcesses.pkl')
CC_label_df = pd.read_pickle('/content/bio_data/train_labels_CellularComponent.pkl')
MF_label_df = pd.read_pickle('/content/bio_data/train_labels_MolecularFunction.pkl')

In [7]:
train_data_dict = {'Biological Processes': [BP_train_df, BP_label_df],
                   'Cellular Component': [CC_train_df, CC_label_df],
                   'Molecular Function': [MF_train_df, MF_label_df]
}

In [8]:
num_labels = 1500
num_folds = 10

## Model 1 Architecture: All Dense layers of the same size

In [9]:
BATCH_SIZE = 256

In [10]:
def model1_training(dataset_name, data):
  train = data[0]
  label = data[1]

  INPUT_SHAPE = [train.shape[1]]

  best_f1 = 0
  print('=======================================================================')
  print(f'Training for {dataset_name}')

  for unit_param, act_param in itertools.product([512, 256, 128], ['relu', 'tanh']):
    print('----------------------------------------------------------------------')
    print('The number of units in each layer is ', unit_param)
    print('The activation function in each layer is ', act_param)

    kfold = KFold(n_splits=num_folds, shuffle=True)
    fold_no = 1

    for train_fold, test_fold in kfold.split(train, label):

      model1 = tf.keras.Sequential([
          tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
          tf.keras.layers.Dense(units = unit_param, activation = act_param),
          tf.keras.layers.Dense(units = unit_param, activation = act_param),
          tf.keras.layers.Dense(units = unit_param, activation = act_param),
          tf.keras.layers.Dense(units = num_labels, activation = 'sigmoid')
          ])

      # Compile model
      model1.compile(
          optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
          loss='binary_crossentropy',
          metrics=['binary_accuracy',
                  tf.keras.metrics.AUC(),
                  tf.keras.metrics.Precision(),
                  tf.keras.metrics.Recall(),
                  ] # tf.keras.metrics.F1Score() not appropriate as it is calculated batchwise
          )

      print(f'Training for fold {fold_no} ...')

      # Fit the data to the model
      history = model1.fit(
          train, label,
          batch_size=BATCH_SIZE,
          epochs=5
          )

      # Generate metrics
      scores = model1.evaluate(train, label, verbose=0)
      precision = scores[3]
      recall = scores[4]
      F1_score = 2*precision*recall / (precision + recall)
      print(f'Score for fold {fold_no}: F1 score of {F1_score}; {model1.metrics_names[0]} of {scores[0]}; {model1.metrics_names[1]} of {scores[1]*100}%')

      if F1_score > best_f1:
        best_f1 = F1_score
        if dataset_name == 'Biological Processes':
          tf.keras.models.save_model(
              model1,
              '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data/FFNNMod1/best_BP_model',
          )
          print(f'Current best model for Biological Processes has {unit_param} units in each layer, uses {act_param} activation function and has an F1 score of {F1_score}')

        elif dataset_name == 'Molecular Function':
          tf.keras.models.save_model(
              model1,
              '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data/FFNNMod1/best_MF_model',
          )
          print(f'Current best model for Molecular Function has {unit_param} units in each layer, uses {act_param} activation function and has an F1 score of {F1_score}')

        else:
          tf.keras.models.save_model(
              model1,
              '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data/FFNNMod1/best_CC_model',
          )
          print(f'Current best model for Cellular Component has {unit_param} units in each layer, uses {act_param} activation function and has an F1 score of {F1_score}')

      fold_no += 1

In [11]:
for dataset in train_data_dict:
  dataset_name = dataset
  data = train_data_dict[dataset]
  model1_training(dataset_name, data)

Training for Biological Processes
----------------------------------------------------------------------
The number of units in each layer is  512
The activation function in each layer is  relu
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: F1 score of 0.4687332350606924; loss of 0.06806392222642899; binary_accuracy of 97.70029783248901%
Current best model for Biological Processes has 512 units in each layer, uses relu activation function and has an F1 score of 0.4687332350606924
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: F1 score of 0.4383225108478719; loss of 0.0681770071387291; binary_accuracy of 97.69333004951477%
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: F1 score of 0.4424663807824361; loss of 0.06769152730703354; binary_accuracy of 97.70452380180359%
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 

## Model 2 Architecture: Decreasing then increasing layer size

In [12]:
BATCH_SIZE = 256

In [13]:
def model2_training(dataset_name, data):
  train = data[0]
  label = data[1]

  INPUT_SHAPE = [train.shape[1]]

  best_f1 = 0
  print('=======================================================================')
  print(f'Training for {dataset_name}')

  act_param = 'relu'

  kfold = KFold(n_splits=num_folds, shuffle=True)
  fold_no = 1

  for train_fold, test_fold in kfold.split(train, label):

    model2 = tf.keras.Sequential([
        tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
        tf.keras.layers.Dense(units = 512, activation = act_param),
        tf.keras.layers.Dense(units = 256, activation = act_param),
        tf.keras.layers.Dense(units = 128, activation = act_param),
        tf.keras.layers.Dense(units = 256, activation = act_param),
        tf.keras.layers.Dense(units = 512, activation = act_param),
        tf.keras.layers.Dense(units = num_labels, activation = 'sigmoid')
        ])

    # Compile model
    model2.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['binary_accuracy',
                tf.keras.metrics.AUC(),
                tf.keras.metrics.Precision(),
                tf.keras.metrics.Recall(),
                ] # tf.keras.metrics.F1Score() not appropriate as it is calculated batchwise
        )

    print(f'Training for fold {fold_no} ...')

    # Fit the data to the model
    history = model2.fit(
        train, label,
        batch_size=BATCH_SIZE,
        epochs=5
        )

    # Generate metrics
    scores = model2.evaluate(train, label, verbose=0)
    precision = scores[3]
    recall = scores[4]
    F1_score = 2*precision*recall / (precision + recall)
    print(f'Score for fold {fold_no}: F1 score of {F1_score}; {model2.metrics_names[0]} of {scores[0]}; {model2.metrics_names[1]} of {scores[1]*100}%')

    if F1_score > best_f1:
      best_f1 = F1_score
      if dataset_name == 'Biological Processes':
        tf.keras.models.save_model(
            model2,
            '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data/FFNNMod2/best_BP_model',
        )
        print(f'Current best model for Biological Processes has an F1 score of {F1_score}')

      elif dataset_name == 'Molecular Function':
        tf.keras.models.save_model(
            model2,
            '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data/FFNNMod2/best_MF_model',
        )
        print(f'Current best model for Molecular Function has an F1 score of {F1_score}')

      else:
        tf.keras.models.save_model(
            model2,
            '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data/FFNNMod2/best_CC_model',
        )
        print(f'Current best model for Cellular Component has an F1 score of {F1_score}')

    fold_no += 1

In [14]:
for dataset in train_data_dict:
  dataset_name = dataset
  data = train_data_dict[dataset]
  model2_training(dataset_name, data)

Training for Biological Processes
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: F1 score of 0.42692201973979943; loss of 0.07086903601884842; binary_accuracy of 97.64177203178406%
Current best model for Biological Processes has an F1 score of 0.42692201973979943
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: F1 score of 0.4209447373143758; loss of 0.0704914927482605; binary_accuracy of 97.64474630355835%
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: F1 score of 0.4241805675883048; loss of 0.07001075893640518; binary_accuracy of 97.65877723693848%
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 4: F1 score of 0.41473337135076077; loss of 0.07036077231168747; binary_accuracy of 97.64835238456726%
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 5: F1 score of 0.425370897668