# ResNet-like 1D Neural Network

This note book will be used to test resnet inspired neural network on each of the 3 ontologies.

In [10]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import yaml

In [11]:
print("TensorFlow v" + tf.__version__)
print("Numpy v" + np.__version__)

TensorFlow v2.15.0
Numpy v1.26.4


In [12]:
config_path = './config.yaml'
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)

In [13]:
USE_75_PERCENT_DATA = config['use_75_percent_datasets']
partial_dataset_prefix = '75percent_' if USE_75_PERCENT_DATA else ''
BP_train_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_embeddings_BiologicalProcesses.pkl")
CC_train_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_embeddings_CellularComponent.pkl")
MF_train_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_embeddings_MolecularFunction.pkl")
BP_label_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_labels_BiologicalProcesses.pkl")
CC_label_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_labels_CellularComponent.pkl")
MF_label_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_labels_MolecularFunction.pkl")

In [14]:
train_data_dict = {
    'Biological Processes': [BP_train_df, BP_label_df],
    'Cellular Component': [CC_train_df, CC_label_df],
    'Molecular Function': [MF_train_df, MF_label_df]
}

In [15]:
num_labels = 1500
num_folds = config['num_folds']

In [16]:
BATCH_SIZE = config['batch_size']

In [17]:
from tensorflow.keras import layers, models, Input

def resnet1d_block(input_tensor, in_channels, out_channels, stride=1, downsample=None):
    x = input_tensor
    identity = x

    # First convolution
    x = layers.Conv1D(out_channels, kernel_size=3, strides=stride, padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)

    # Second convolution
    x = layers.Conv1D(out_channels, kernel_size=3, padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)

    # Downsample if necessary
    if downsample is not None:
        identity = downsample(input_tensor)

    # Add identity
    x = layers.add([x, identity])
    x = layers.ReLU()(x)

    return x

def ResNet1D(input_shape=(None, 1), num_classes=1500):
    inputs = Input(shape=input_shape)

    # Initial conv and max-pooling
    x = layers.Conv1D(64, kernel_size=7, strides=2, padding='same', use_bias=False)(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.MaxPool1D(pool_size=3, strides=2, padding='same')(x)

    # Utility function to apply ResNet blocks
    def make_layer(x, in_channels, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or in_channels != out_channels:
            downsample = models.Sequential([
                layers.Conv1D(out_channels, kernel_size=1, strides=stride, use_bias=False),
                layers.BatchNormalization(),
            ])

        x = resnet1d_block(x, in_channels, out_channels, stride, downsample)
        for _ in range(1, blocks):
            x = resnet1d_block(x, out_channels, out_channels)
        return x

    # ResNet layers
    x = make_layer(x, 64, 64, 2)
    x = make_layer(x, 64, 128, 2, stride=2)
    x = make_layer(x, 128, 256, 2, stride=2)
    x = make_layer(x, 256, 512, 2, stride=2)

    # Global average pooling and dense layer
    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(num_classes)(x)

    # Create model
    model = models.Model(inputs=inputs, outputs=outputs)

    return model


In [22]:
def train_model(get_model, dataset_name, model_name, data, num_folds, BATCH_SIZE, loss_function='binary_crossentropy'):
    train, label = data

    best_f1 = 0
    print('=======================================================================')
    print(f'Training for {dataset_name}')
    
    model_root_path = f'{config["directories"]["models"]}/{model_name}'

    kfold = KFold(n_splits=num_folds, shuffle=True)
    fold_no = 1

    for train_fold, test_fold in kfold.split(train, label):
        print(f'Training for fold {fold_no} ...')
        
        loss_fn = loss_function

        model = get_model()
        model.compile(
          optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
          loss=loss_fn,
          metrics=['binary_accuracy',
                tf.keras.metrics.AUC(),
                tf.keras.metrics.Precision(),
                tf.keras.metrics.Recall(),
                ]
          )

        history = model.fit(
            train.iloc[train_fold], label.iloc[train_fold],
            validation_data=(train.iloc[test_fold], label.iloc[test_fold]),
            batch_size=BATCH_SIZE,
            epochs=5
        )

        scores = model.evaluate(train.iloc[test_fold], label.iloc[test_fold], verbose=0)
        precision = scores[3]
        recall = scores[4]
        F1_score = 2 * precision * recall / (precision + recall)
        print(f'Score for fold {fold_no}: F1 score of {F1_score}; {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')

        if F1_score > best_f1:
            best_f1 = F1_score
            
            if dataset_name == 'Biological Processes':
              tf.keras.models.save_model(
                  model,
                  f'{model_root_path}/best_BP_model',
              )
              print(f'Current best model for Biological Processes has an F1 score of {F1_score}')

            elif dataset_name == 'Molecular Function':
              tf.keras.models.save_model(
                  model,
                  f'{model_root_path}/best_MF_model',
              )
              print(f'Current best model for Molecular Function has an F1 score of {F1_score}')
            else:
              tf.keras.models.save_model(
                  model,
                  f'{model_root_path}/best_CC_model',
              )
              print(f'Current best model for Cellular Component has an F1 score of {F1_score}')

        fold_no += 1

In [23]:
def train_on_datasets(get_model, model_name, loss_function=None):
    for dataset in train_data_dict:
        dataset_name = dataset
        data = train_data_dict[dataset]
        train_model(get_model, dataset_name, model_name, data, 10, BATCH_SIZE, loss_function=loss_function)

In [25]:
train_on_datasets(lambda: ResNet1D(input_shape=(1024, 1), num_classes=1500), 'ResNet1D_BCE', 'binary_crossentropy')

Training for Biological Processes
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: F1 score of 0.15166213726311673; loss of 0.2813005745410919; binary_accuracy of 97.12539315223694%
INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_BP_model/assets


INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_BP_model/assets


Current best model for Biological Processes has an F1 score of 0.15166213726311673
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: F1 score of 0.16929171355113262; loss of 0.2661374807357788; binary_accuracy of 97.10690975189209%
INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_BP_model/assets


INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_BP_model/assets


Current best model for Biological Processes has an F1 score of 0.16929171355113262
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: F1 score of 0.18169878864334604; loss of 0.23738867044448853; binary_accuracy of 97.24978804588318%
INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_BP_model/assets


INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_BP_model/assets


Current best model for Biological Processes has an F1 score of 0.18169878864334604
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 4: F1 score of 0.16441074107883544; loss of 0.27645185589790344; binary_accuracy of 97.23501205444336%
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 5: F1 score of 0.12221050819809674; loss of 0.2787378132343292; binary_accuracy of 97.2529947757721%
Training for fold 6 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 6: F1 score of 0.17017437235478935; loss of 0.2781556248664856; binary_accuracy of 97.28345274925232%
Training for fold 7 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 7: F1 score of 0.16635547707172976; loss of 0.27647775411605835; binary_accuracy of 97.23699688911438%
Training for fold 8 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 8: F1 score of 0.1251320847866824; loss of 0.287740558385849; b

INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_BP_model/assets


Current best model for Biological Processes has an F1 score of 0.19873373748740614
Training for Cellular Component
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: F1 score of 0.27994781126059964; loss of 0.23696091771125793; binary_accuracy of 97.76147603988647%
INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_CC_model/assets


INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_CC_model/assets


Current best model for Cellular Component has an F1 score of 0.27994781126059964
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: F1 score of 0.28320284040961324; loss of 0.22313320636749268; binary_accuracy of 97.74941802024841%
INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_CC_model/assets


INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_CC_model/assets


Current best model for Cellular Component has an F1 score of 0.28320284040961324
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: F1 score of 0.29940668794297587; loss of 0.22050291299819946; binary_accuracy of 97.8320300579071%
INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_CC_model/assets


INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_CC_model/assets


Current best model for Cellular Component has an F1 score of 0.29940668794297587
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 4: F1 score of 0.2934387448580332; loss of 0.2308375984430313; binary_accuracy of 97.7191686630249%
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 5: F1 score of 0.2830348337904809; loss of 0.22357343137264252; binary_accuracy of 97.7515161037445%
Training for fold 6 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 6: F1 score of 0.30406585367323213; loss of 0.22286590933799744; binary_accuracy of 97.74405360221863%
INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_CC_model/assets


INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_CC_model/assets


Current best model for Cellular Component has an F1 score of 0.30406585367323213
Training for fold 7 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 7: F1 score of 0.28519309058087183; loss of 0.25525516271591187; binary_accuracy of 97.69138097763062%
Training for fold 8 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 8: F1 score of 0.2949493696944791; loss of 0.23753124475479126; binary_accuracy of 97.75220155715942%
Training for fold 9 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 9: F1 score of 0.2908419824231169; loss of 0.23550812900066376; binary_accuracy of 97.73579835891724%
Training for fold 10 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 10: F1 score of 0.23667943077727827; loss of 0.260248064994812; binary_accuracy of 97.65558242797852%
Training for Molecular Function
Training for fold 1 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 1: F1 score of 0.201122509770222

INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_MF_model/assets


Current best model for Molecular Function has an F1 score of 0.20112250977022222
Training for fold 2 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 2: F1 score of 0.21115643328708691; loss of 0.5244831442832947; binary_accuracy of 96.22248411178589%
INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_MF_model/assets


INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_MF_model/assets


Current best model for Molecular Function has an F1 score of 0.21115643328708691
Training for fold 3 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 3: F1 score of 0.23434841765448286; loss of 0.29148027300834656; binary_accuracy of 96.97206616401672%
INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_MF_model/assets


INFO:tensorflow:Assets written to: ./models/ResNet1D_BCE/best_MF_model/assets


Current best model for Molecular Function has an F1 score of 0.23434841765448286
Training for fold 4 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 4: F1 score of 0.1668980568605548; loss of 0.25928983092308044; binary_accuracy of 97.01934456825256%
Training for fold 5 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 5: F1 score of 0.194308300207795; loss of 0.3161976933479309; binary_accuracy of 96.97980880737305%
Training for fold 6 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 6: F1 score of 0.12441579474581542; loss of 0.2854407727718353; binary_accuracy of 97.08724617958069%
Training for fold 7 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 7: F1 score of 0.044643755063070646; loss of 0.27619367837905884; binary_accuracy of 96.92493081092834%
Training for fold 8 ...
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Score for fold 8: F1 score of 0.1923907273581739; loss of 0.2951749563217163; bin