# Multi-Layer Perceptron

This note book will be used to test a multilayer perceptron on each of the 3 ontologies.

In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
import itertools
import pprint

In [2]:
print("TensorFlow v" + tf.__version__)
print("Numpy v" + np.__version__)

TensorFlow v2.15.0
Numpy v1.25.2


In [3]:
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [4]:
#%%capture
!unzip '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data.zip'

Archive:  /content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data.zip
  inflating: bio_data/75percent_train_embeddings_BiologicalProcesses.pkl  
  inflating: bio_data/75percent_train_labels_BiologicalProcesses.pkl  
  inflating: bio_data/train_embeddings_CellularComponent.pkl  
  inflating: bio_data/train_embeddings_MolecularFunction.pkl  
  inflating: bio_data/train_labels_CellularComponent.pkl  
  inflating: bio_data/train_labels_MolecularFunction.pkl  


In [5]:
BP_train_df = pd.read_pickle('/content/bio_data/75percent_train_embeddings_BiologicalProcesses.pkl')
CC_train_df = pd.read_pickle('/content/bio_data/train_embeddings_CellularComponent.pkl')
MF_train_df = pd.read_pickle('/content/bio_data/train_embeddings_MolecularFunction.pkl')

In [6]:
BP_label_df = pd.read_pickle('/content/bio_data/75percent_train_labels_BiologicalProcesses.pkl')
CC_label_df = pd.read_pickle('/content/bio_data/train_labels_CellularComponent.pkl')
MF_label_df = pd.read_pickle('/content/bio_data/train_labels_MolecularFunction.pkl')

In [7]:
train_data_dict = {'Biological Processes': [BP_train_df, BP_label_df],
                   'Cellular Component': [CC_train_df, CC_label_df],
                   'Molecular Function': [MF_train_df, MF_label_df]
}

In [8]:
num_labels = 1500
num_folds = 10

## Architecture: Dense layers of the same size with linear activation function

In [9]:
BATCH_SIZE = 256

In [11]:
def model_training(dataset_name, data):
  train = data[0]
  label = data[1]

  INPUT_SHAPE = [train.shape[1]]

  act_param = 'linear'
  unit_param = 1024

  best_f1 = 0
  print('=======================================================================')
  print(f'Training for {dataset_name}')

  kfold = KFold(n_splits=num_folds, shuffle=True)
  fold_no = 1

  for train_fold, test_fold in kfold.split(train, label):

    model = tf.keras.Sequential([
        tf.keras.layers.BatchNormalization(input_shape=INPUT_SHAPE),
        tf.keras.layers.Dense(units = unit_param, activation = act_param),
        tf.keras.layers.Dense(units = num_labels, activation = 'sigmoid')
        ])

    # Compile model
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='binary_crossentropy',
        metrics=['binary_accuracy',
                tf.keras.metrics.AUC(),
                tf.keras.metrics.Precision(),
                tf.keras.metrics.Recall(),
                ] # tf.keras.metrics.F1Score() not appropriate as it is calculated batchwise
        )

    print(f'Training for fold {fold_no} ...')

    # Fit the data to the model
    history = model.fit(
        train.iloc[train_fold], label.iloc[train_fold],
        validation_data = (train.iloc[test_fold], label.iloc[test_fold]),
        batch_size=BATCH_SIZE,
        epochs=5
        )

    # Generate metrics
    scores = model.evaluate(train, label, verbose=0)
    precision = scores[3]
    print(f'{model.metrics_names[3]} is {precision}')
    recall = scores[4]
    print(f'{model.metrics_names[4]} is {recall}')
    F1_score = 2*precision*recall / (precision + recall)
    print(f'Score for fold {fold_no}: F1 score of {F1_score}; {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')

    if F1_score > best_f1:
      best_f1 = F1_score
      if dataset_name == 'Biological Processes':
        tf.keras.models.save_model(
            model,
            '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data/MLPMod/best_BP_model',
        )
        print(f'Current best model for Biological Processes has an F1 score of {F1_score}')

      elif dataset_name == 'Molecular Function':
        tf.keras.models.save_model(
            model,
            '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data/MLPMod/best_MF_model',
        )
        print(f'Current best model for Molecular Function has an F1 score of {F1_score}')

      else:
        tf.keras.models.save_model(
            model,
            '/content/drive/MyDrive/ColabNotebooks/BiologicalData/Final_Project/bio_data/MLPMod/best_CC_model',
        )
        print(f'Current best model for Cellular Component has an F1 score of {F1_score}')

    fold_no += 1

In [12]:
for dataset in train_data_dict:
  dataset_name = dataset
  data = train_data_dict[dataset]
  model_training(dataset_name, data)

Training for Biological Processes
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
precision is 0.679935097694397
recall is 0.2596537470817566
Score for fold 1: F1 score of 0.3757977691419078; loss of 0.07381152361631393; binary_accuracy of 97.5385308265686%
Current best model for Biological Processes has an F1 score of 0.3757977691419078
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
precision_1 is 0.652550220489502
recall_1 is 0.2971310019493103
Score for fold 2: F1 score of 0.4083325988869524; loss of 0.07364422082901001; binary_accuracy of 97.54276275634766%
Current best model for Biological Processes has an F1 score of 0.4083325988869524
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
precision_2 is 0.6405624151229858
recall_2 is 0.3077648878097534
Score for fold 3: F1 score of 0.415769153151658; loss of 0.07394573837518692; binary_accuracy of 97.53175377845764%
Current best model for Biological Processe