# ResNet-like 1D Neural Network

This note book will be used to test resnet inspired neural network on each of the 3 ontologies.

In [26]:
import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
import yaml

In [27]:
print("TensorFlow v" + tf.__version__)
print("Numpy v" + np.__version__)

TensorFlow v2.15.0
Numpy v1.26.4


In [28]:
config_path = './config.yaml'
with open(config_path, 'r') as file:
    config = yaml.safe_load(file)

In [29]:
USE_75_PERCENT_DATA = config['use_75_percent_datasets']
partial_dataset_prefix = '75percent_' if USE_75_PERCENT_DATA else ''
BP_train_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_embeddings_BiologicalProcesses.pkl")
CC_train_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_embeddings_CellularComponent.pkl")
MF_train_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_embeddings_MolecularFunction.pkl")
BP_label_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_labels_BiologicalProcesses.pkl")
CC_label_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_labels_CellularComponent.pkl")
MF_label_df = pd.read_pickle(f"{config['directories']['preprocessed_data']}/{partial_dataset_prefix}train_labels_MolecularFunction.pkl")

In [30]:
train_data_dict = {
    'Biological Processes': [BP_train_df, BP_label_df],
    'Cellular Component': [CC_train_df, CC_label_df],
    'Molecular Function': [MF_train_df, MF_label_df]
}

In [31]:
num_labels = 1500
num_folds = config['num_folds']

In [32]:
BATCH_SIZE = config['batch_size']

In [33]:
from tensorflow.keras import layers, models, Input

def resnet1d_block(input_tensor, in_channels, out_channels, stride=1, downsample=None):
    x = input_tensor
    identity = x

    # First convolution
    x = layers.Conv1D(out_channels, kernel_size=3, strides=stride, padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)

    # Second convolution
    x = layers.Conv1D(out_channels, kernel_size=3, padding='same', use_bias=False)(x)
    x = layers.BatchNormalization()(x)

    # Downsample if necessary
    if downsample is not None:
        identity = downsample(input_tensor)

    # Add identity
    x = layers.add([x, identity])
    x = layers.ReLU()(x)

    return x

def ResNet1D(input_shape=(None, 1), num_classes=1500):
    inputs = Input(shape=input_shape)

    # Initial conv and max-pooling
    x = layers.Conv1D(64, kernel_size=7, strides=2, padding='same', use_bias=False)(inputs)
    x = layers.BatchNormalization()(x)
    x = layers.ReLU()(x)
    x = layers.MaxPool1D(pool_size=3, strides=2, padding='same')(x)

    # Utility function to apply ResNet blocks
    def make_layer(x, in_channels, out_channels, blocks, stride=1):
        downsample = None
        if stride != 1 or in_channels != out_channels:
            downsample = models.Sequential([
                layers.Conv1D(out_channels, kernel_size=1, strides=stride, use_bias=False),
                layers.BatchNormalization(),
            ])

        x = resnet1d_block(x, in_channels, out_channels, stride, downsample)
        for _ in range(1, blocks):
            x = resnet1d_block(x, out_channels, out_channels)
        return x

    # ResNet layers
    x = make_layer(x, 64, 64, 2)
    x = make_layer(x, 64, 128, 2, stride=2)
    x = make_layer(x, 128, 256, 2, stride=2)
    x = make_layer(x, 256, 512, 2, stride=2)

    # Global average pooling and dense layer
    x = layers.GlobalAveragePooling1D()(x)
    outputs = layers.Dense(num_classes)(x)

    # Create model
    model = models.Model(inputs=inputs, outputs=outputs)

    return model


In [34]:
def train_model(get_model, dataset_name, model_name, data, num_folds, BATCH_SIZE, loss_function='binary_crossentropy'):
    train, label = data

    best_f1 = 0
    print('=======================================================================')
    print(f'Training for {dataset_name}')
    
    model_root_path = f'{config["directories"]["models"]}/{model_name}'

    kfold = KFold(n_splits=num_folds, shuffle=True)
    fold_no = 1

    for train_fold, test_fold in kfold.split(train, label):
        print(f'Training for fold {fold_no} ...')
        
        # label_frequencies = label.iloc[train_fold].mean(axis=0)
        # weights = 1 / (label_frequencies + 1e-8)
        # weights = weights / weights.sum() 
        # def weighted_softmax_cross_entropy_with_logits(labels, logits):
        #     raw_loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=logits)
            
        #     weights_reshaped = tf.reshape(weights, [1, -1])
        #     weights_reshaped = tf.cast(weights_reshaped, dtype=tf.float32)
        #     weighted_loss = raw_loss * weights_reshaped
    
        #     return tf.reduce_mean(weighted_loss)
          
        # loss_fn = loss_function if loss_function is not None else 
        loss_fn = loss_function

        model = get_model()
        model.compile(
          optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
          loss=loss_fn,
          metrics=['binary_accuracy',
                tf.keras.metrics.AUC(),
                tf.keras.metrics.Precision(),
                tf.keras.metrics.Recall(),
                ]
          )

        history = model.fit(
            train.iloc[train_fold], label.iloc[train_fold],
            validation_data=(train.iloc[test_fold], label.iloc[test_fold]),
            batch_size=BATCH_SIZE,
            epochs=5
        )

        scores = model.evaluate(train.iloc[test_fold], label.iloc[test_fold], verbose=0)
        precision = scores[3]
        recall = scores[4]
        F1_score = 2 * precision * recall / (precision + recall)
        print(f'Score for fold {fold_no}: F1 score of {F1_score}; {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')

        if F1_score > best_f1:
            best_f1 = F1_score
            
            if dataset_name == 'Biological Processes':
              tf.keras.models.save_model(
                  model,
                  f'{model_root_path}/best_BP_model',
              )
              print(f'Current best model for Biological Processes has an F1 score of {F1_score}')

            elif dataset_name == 'Molecular Function':
              tf.keras.models.save_model(
                  model,
                  f'{model_root_path}/best_MF_model',
              )
              print(f'Current best model for Molecular Function has an F1 score of {F1_score}')
            else:
              tf.keras.models.save_model(
                  model,
                  f'{model_root_path}/best_CC_model',
              )
              print(f'Current best model for Cellular Component has an F1 score of {F1_score}')

        fold_no += 1

In [35]:
def train_on_datasets(get_model, model_name, loss_function=None):
    for dataset in train_data_dict:
        dataset_name = dataset
        data = train_data_dict[dataset]
        train_model(get_model, dataset_name, model_name, data, 10, BATCH_SIZE, loss_function=None)

In [36]:
# train_on_datasets(lambda: ResNet1D(input_shape=(1024, 1), num_classes=1500), 'ResNet1D_weighted_loss')

In [37]:
train_on_datasets(lambda: ResNet1D(input_shape=(1024, 1), num_classes=1500), 'ResNet1D_BCE', 'binary_crossentropy')

Training for Biological Processes
Training for fold 1 ...


2024-02-23 17:54:16.310944: W external/local_tsl/tsl/framework/bfc_allocator.cc:485] Allocator (GPU_0_bfc) ran out of memory trying to allocate 855.53MiB (rounded to 897084160)requested by op _EagerConst
If the cause is memory fragmentation maybe the environment variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will improve the situation. 
Current allocation summary follows.
Current allocation summary follows.
2024-02-23 17:54:16.311032: I external/local_tsl/tsl/framework/bfc_allocator.cc:1039] BFCAllocator dump for GPU_0_bfc
2024-02-23 17:54:16.311058: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (256): 	Total Chunks: 104, Chunks in use: 104. 26.0KiB allocated for chunks. 26.0KiB in use in bin. 15.2KiB client-requested in use in bin.
2024-02-23 17:54:16.311072: I external/local_tsl/tsl/framework/bfc_allocator.cc:1046] Bin (512): 	Total Chunks: 60, Chunks in use: 60. 30.2KiB allocated for chunks. 30.2KiB in use in bin. 30.0KiB client-requested in use in bin.
2024-02-23 17:

InternalError: Failed copying input tensor from /job:localhost/replica:0/task:0/device:CPU:0 to /job:localhost/replica:0/task:0/device:GPU:0 in order to run _EagerConst: Dst tensor is not initialized.