In [1]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import yaml
from sklearn.metrics import f1_score, accuracy_score
import shutil

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd
2024-02-23 21:27:24.151297: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-23 21:27:24.151388: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-23 21:27:24.229468: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory 

In [2]:
config_path = './config.yaml'
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)


def load_data(data_type):
    data_path = f"{config['directories']['preprocessed_data']}/train_embeddings_{data_type}.pkl"
    label_path = f"{config['directories']['preprocessed_data']}/train_labels_{data_type}.pkl"
    data_df = pd.read_pickle(data_path)
    label_df = pd.read_pickle(label_path)
    return data_df, label_df

datasets = {category: load_data(category) for category in ['BiologicalProcesses', 'CellularComponent', 'MolecularFunction']}

In [3]:
# Assuming parent_path is the parent directory of model_root_path
models_path = config["directories"]["models"]

model_data_map = {
    'BiologicalProcesses': 'best_BP_model',
    'CellularComponent': 'best_CC_model',
    'MolecularFunction': 'best_MF_model'
}

In [4]:
model_folders = [d for d in os.listdir(models_path) if os.path.isdir(os.path.join(models_path, d))]
model_folders

['CNNMod1', 'ResNet1D_BCE', 'FFNNMod1', 'FFNNMod2', 'MLPMod']

In [5]:
best_models = {}

for category in datasets:
    data_df, label_df = datasets[category]
    best_f1_score = 0
    best_model_path = None
    best_accuracy = 0
    
    print(f"Evaluating models for {category}:")
    for model_folder in model_folders:
        print(f'Evaluating {model_folder}...')
        current_model_path = f'{models_path}/{model_folder}/{model_data_map[category]}'
        
        model = tf.keras.models.load_model(current_model_path)
        
        scores = model.evaluate(data_df, label_df, verbose=0)
        accuracy = scores[1]
        precision = scores[3]
        recall = scores[4]
        F1_score = 2*precision*recall / (precision + recall)
      
        print(f"Model: {model_folder}, Accuracy: {accuracy}, F1 Score: {F1_score}")
        
        if F1_score > best_f1_score:
            best_f1_score = F1_score
            best_accuracy = accuracy
            best_model_path = current_model_path
            
    best_models[category] = (best_model_path, best_accuracy, best_f1_score)
    
for category, (path, accuracy, f1) in best_models.items():
    print(f"\nBest model for {category}: {path} with Accuracy: {accuracy}, F1 Score (Sample Avg): {f1}")

Evaluating models for BiologicalProcesses:
Evaluating CNNMod1...


2024-02-23 21:28:14.261264: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 79086 MB memory:  -> device: 0, name: NVIDIA A100-SXM4-80GB, pci bus id: 0000:44:00.0, compute capability: 8.0
2024-02-23 21:28:18.331753: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8904


Model: CNNMod1, Accuracy: 0.969669759273529, F1 Score: 0.4469348519335776
Evaluating ResNet1D_BCE...
Model: ResNet1D_BCE, Accuracy: 0.972293496131897, F1 Score: 0.19853940015414506
Evaluating FFNNMod1...
Model: FFNNMod1, Accuracy: 0.9770002961158752, F1 Score: 0.46438130478447454
Evaluating FFNNMod2...
Model: FFNNMod2, Accuracy: 0.976239800453186, F1 Score: 0.4425751018222649
Evaluating MLPMod...
Model: MLPMod, Accuracy: 0.9752091765403748, F1 Score: 0.36634526301696674
Evaluating models for CellularComponent:
Evaluating CNNMod1...
Model: CNNMod1, Accuracy: 0.9799372553825378, F1 Score: 0.45780743722073797
Evaluating ResNet1D_BCE...
Model: ResNet1D_BCE, Accuracy: 0.9772809743881226, F1 Score: 0.303207578044891
Evaluating FFNNMod1...
Model: FFNNMod1, Accuracy: 0.9811740517616272, F1 Score: 0.4972965738268135
Evaluating FFNNMod2...
Model: FFNNMod2, Accuracy: 0.9806178212165833, F1 Score: 0.4962278912594993
Evaluating MLPMod...
Model: MLPMod, Accuracy: 0.9799655079841614, F1 Score: 0.4149

In [7]:
for category, (path, _, _) in best_models.items():
    shutil.copytree(path, f"{config['directories']['best_performing_k_fold_models']}/{model_data_map[category]}")