# Neural Network for single lipids

### Imports

In [2]:
# Basics
import os
import random
import numpy as np
import pandas as pd
# Tensorflow
import tensorflow as tf
import tensorflow_addons as tfa
# Keras
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, Activation
from keras.initializers import he_normal
from keras.callbacks import EarlyStopping
from keras.regularizers import l2
# Sklearn
from sklearn.model_selection import KFold

2023-12-16 19:08:14.839099: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and introduction of new features.
TFA has entered a minimal maintenance and release mode until a planned end of life in May 2024.
Please modify downstream libraries to take dependencies from other repositories in our TensorFlow community (e.g. Keras, Keras-CV, and Keras-NLP). 

For more information see: https://github.com/tensorflow/addons/issues/2807 

 The versions of TensorFlow you are currently using is 2.10.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and 

### Reproducibility

In [3]:
# One seed shared by every random number generator used in this notebook
seed = 42

# Export the seed for Python's hash randomisation setting
os.environ['PYTHONHASHSEED'] = str(seed)

# Seed Python's `random`, NumPy and TensorFlow (TF2 and compat.v1 entry points) in turn
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed, tf.compat.v1.set_random_seed):
    seed_fn(seed)

### Data loading

In [4]:
# Directory holding the pre-processed parquet files
dataset_dir = 'data/processed_data'

# Build the train/test feature and target paths in one pass
(training_input_path,
 training_output_path,
 testing_input_path,
 testing_output_path) = (
    os.path.join(dataset_dir, file_name)
    for file_name in ('train_features.parquet',
                      'train_targets.parquet',
                      'test_features.parquet',
                      'test_targets.parquet')
)

In [5]:
# Read the feature (genes) and target (lipids) tables of both splits
training_input = pd.read_parquet(training_input_path)
training_output = pd.read_parquet(training_output_path)

testing_input = pd.read_parquet(testing_input_path)
testing_output = pd.read_parquet(testing_output_path)

# Rescale the training targets by a factor of 1000
# NOTE(review): testing_output is not rescaled here - confirm this is intended
training_output = training_output.mul(1000)

# Model dimensions: one output node per lipid column, one input node per gene column
OUTPUT_NODES = len(training_output.columns)
input_dim = len(training_input.columns)

# Mini-batch size used by the training generator
batch_size = 32

# When True, rows holding the placeholder/NaN value of a lipid are dropped before training
remove_nans_lipids = False

### Definition of the model

In [6]:
def lr_schedule(epoch):
    """
    Exponentially decayed learning rate for a given epoch
    :param epoch: epoch's number (0 gives the initial learning rate)
    :return: learning rate, i.e. 0.1 * 0.9 ** epoch
    """
    # Starting rate and per-epoch multiplicative decay
    base_rate, decay = 0.1, 0.9
    return base_rate * decay ** epoch

# Create a Sequential model
def build_model(summary=False):
    """
    Assemble the fully connected regression network for a single lipid.

    The network stacks two GELU Dense layers for each width in
    512, 256, 128, 64, 32, 16, 8, 4 (all He-normal initialised with the
    notebook seed) and ends with a single ReLU output node.
    :param summary: if True, print the summary of the model, if False, do not print
    :return: the (uncompiled) model
    """
    # Every hidden width appears twice in a row
    hidden_widths = [width for width in (512, 256, 128, 64, 32, 16, 8, 4) for _ in range(2)]

    model = Sequential()
    for position, width in enumerate(hidden_widths):
        layer_kwargs = {'activation': 'gelu', 'kernel_initializer': he_normal(seed=seed)}
        if position == 0:
            # Only the first layer declares the number of input nodes
            layer_kwargs = {'input_dim': input_dim, **layer_kwargs}
        model.add(Dense(width, **layer_kwargs))

    # Output layer: 1 node (single regression)
    model.add(Dense(1, activation='relu'))

    # Optionally display the model summary
    if summary:
        model.summary()

    return model

# Build the network once just to display its summary
build_model(summary=True)

2023-12-16 19:08:25.534231: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 512)               256512    
                                                                 
 dense_1 (Dense)             (None, 512)               262656    
                                                                 
 dense_2 (Dense)             (None, 256)               131328    
                                                                 
 dense_3 (Dense)             (None, 256)               65792     
                                                                 
 dense_4 (Dense)             (None, 128)               32896     
                                                                 
 dense_5 (Dense)             (None, 128)               16512     
                                                                 
 dense_6 (Dense)             (None, 64)                8

<keras.engine.sequential.Sequential at 0x7f944644ec80>

In [7]:
# Early stopping driven by the validation loss
early_stopping = EarlyStopping(
    monitor='val_loss',          # Monitored metric is the validation loss (MSE)
    restore_best_weights=True,   # Restore the weights from the epoch with the best monitored value
    patience=10,                 # Epochs without improvement after which training stops
)

In [8]:
def data_generator(X, y, batch_size, noise_std=0.1, scale_factor_range=0.3):
    """
    Endless generator of augmented mini batches (data augmentation)

    Each batch is drawn without replacement from X, multiplied by one random
    scale factor shared by the whole batch, and perturbed with Gaussian noise
    on every feature. The targets are returned unchanged.
    :param X: input DataFrame (genes)
    :param y: output DataFrame (single lipid)
    :param batch_size: size of the mini batch (must not exceed the number of rows of X)
    :param noise_std: standard deviation of the additive Gaussian noise
    :param scale_factor_range: the scale factor is drawn uniformly from
        [1 - scale_factor_range, 1 + scale_factor_range]
    :return batch_X: array containing random modification on the input DataFrame with size corresponding to batch size
    :return batch_y: array containing output with size corresponding to batch size
    """
    n_samples = X.shape[0]
    while True:
        # Sample distinct rows for this batch
        indices = np.random.choice(n_samples, batch_size, replace=False)
        batch_values = X.iloc[indices].values

        # Random scaling: a single factor applied to the whole batch
        scale_factor = np.random.uniform(1 - scale_factor_range, 1 + scale_factor_range)

        # Gaussian noise added to all features
        noise = np.random.normal(loc=0, scale=noise_std, size=batch_values.shape)

        # Building a new array keeps the caller's DataFrame untouched
        batch_X = batch_values * scale_factor + noise
        batch_y = y.iloc[indices].values

        yield batch_X, batch_y

In [9]:
def remove_nans(training_input, training_output, lipid, value_to_replace=0.10003404299092956):
    """
    Drop the rows whose target for one lipid is missing, and filter the input DataFrame accordingly

    A target counts as missing when it is NaN or exactly equal to
    ``value_to_replace`` (placeholder value standing in for a missing measurement).
    NOTE(review): the placeholder is matched by exact equality; if the targets
    were rescaled before this call, the rescaled placeholder probably has to be
    passed explicitly - confirm against the caller.
    :param training_input: input DataFrame (genes)
    :param training_output: output DataFrame (lipids)
    :param lipid: number (column position) of the lipid to consider
    :param value_to_replace: placeholder value treated as NaN
    :return training: input DataFrame (genes) restricted to the rows without Nans
    :return output: output Series (selected lipid) restricted to the rows without Nans
    """
    # Column of the selected lipid; the caller's DataFrame is never modified
    output = training_output.iloc[:, lipid]
    # Keep rows that are neither NaN nor the placeholder value
    keep_rows = (output.notna() & (output != value_to_replace)).to_numpy()
    # Apply the same positional mask to both DataFrames so rows stay aligned
    training = training_input.iloc[keep_rows]
    output = output.iloc[keep_rows].copy()

    return training, output

### Training

In [None]:
# DataFrame for results: one row per lipid with the fold-averaged loss and R2.
# The float dtype keeps the columns numeric so they can be ranked afterwards.
lipid_names = [name.strip() for name in training_output.columns.values]
lipids_metrics_avg = pd.DataFrame(columns=['Loss', 'R2'], index=lipid_names, dtype=float)

# Train a model for each lipid
for j in range(OUTPUT_NODES):
    lipid_name = lipid_names[j]
    print('#'*72)
    print(f'Start training for lipid {lipid_name}')

    # Define the K-fold Cross Validator
    num_folds = 5
    k_fold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

    # Metrics for each fold
    loss_per_fold = np.zeros((num_folds,))
    r2_per_fold = np.zeros((num_folds,))

    # Select the target column (optionally dropping rows with missing values)
    if remove_nans_lipids:
        X, y = remove_nans(training_input, training_output, j)
    else:
        X = training_input
        y = training_output.iloc[:, j]

    # K-fold Cross Validation model evaluation
    models = []
    for fold_no, (train_indices, val_indices) in enumerate(k_fold.split(X, y), start=1):
        # Build a fresh model for every fold
        model = build_model()

        # Compile the model; the metrics order fixes the layout of `scores` below
        model.compile(optimizer='adam', loss='mean_squared_error',
                      metrics=[metrics.mean_squared_error, metrics.mean_absolute_error, tfa.metrics.RSquare(), metrics.mean_absolute_percentage_error])

        print('------------------------------------------------------------------------')
        print(f'Training for fold {fold_no} ...')

        # Augmented mini batches drawn from the training part of the fold
        train_generator = data_generator(X.iloc[train_indices], y.iloc[train_indices], batch_size)
        X_val, y_val = X.iloc[val_indices], y.iloc[val_indices]

        # Train the model
        history = model.fit(
            train_generator,
            epochs=50,  # Adjust the number of epochs as needed
            steps_per_epoch=len(train_indices) // batch_size,
            validation_data=(X_val, y_val),
            callbacks=[early_stopping]
        ).history

        # Generate generalization metrics on the held-out part of the fold
        # scores = [loss, MSE, MAE, R2, MAPE]
        scores = model.evaluate(X_val, y_val, verbose=0)
        print(scores)
        loss_per_fold[fold_no-1] = scores[0]
        r2_per_fold[fold_no-1] = scores[3]

        models.append(model)

    # Average the metrics over the folds and store them in the row of the CURRENT lipid
    mean_loss = loss_per_fold.mean()
    mean_r2 = r2_per_fold.mean()
    lipids_metrics_avg.loc[lipid_name] = [mean_loss, mean_r2]
    print('#'*72)
    print(f'Finish training for lipid {lipid_name}')
    print(f'Mean loss: {mean_loss}')
    print(f'Mean r2: {mean_r2}')

# Saving the values
lipids_metrics_avg.to_csv('lipids_metrics_avg_neural_network.csv')

########################################################################
Start training for lipid LPC O- 18:3
------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
[0.03197409585118294, 0.03197409585118294, 0.14039327204227448, 0.2397068738937378, 29.021282196044922]
------------------------------------------------------------------------
Training for fold 2 ...
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50

### Output processing

In [None]:
# Number of best and worst lipids to print for each fold-averaged metric (loss and R2)
k = 5

def format_names(df: pd.DataFrame, just=15):
    """
    Render the index labels of a DataFrame as one comma-separated line
    :param df: DataFrame whose index holds the names (strings)
    :param just: width to which every name is right-justified
    :return: the formatted string
    """
    padded_names = (label.rjust(just) for label in df.index.values)
    return ', '.join(padded_names)

def format_values(df: pd.DataFrame, col: str, just=15):
    """
    Render one column of a DataFrame as one comma-separated line
    :param df: DataFrame holding the values
    :param col: name of the column to print
    :param just: width to which every value is right-justified
    :return: the values in scientific notation (5 decimals), joined by commas
    """
    cells = []
    for value in df[col].values:
        cells.append(format(value, '.5e').rjust(just))
    return ', '.join(cells)

# The results table may hold its metrics as generic Python objects; nsmallest/nlargest
# reject object-dtype columns with a TypeError, so rank a float view of the table instead
numeric_metrics = lipids_metrics_avg.astype(float)

# Loss: the lower the better
best_losses = numeric_metrics.nsmallest(k, 'Loss')
worst_losses = numeric_metrics.nlargest(k, 'Loss')
print("Loss:")
print("  Best:")
print(f"  {format_names(best_losses)}")
print(f"  {format_values(best_losses, 'Loss')}")
print("  Worst:")
print(f"  {format_names(worst_losses)}")
print(f"  {format_values(worst_losses, 'Loss')}")

# R2: the higher the better
best_r2s = numeric_metrics.nlargest(k, 'R2')
worst_r2s = numeric_metrics.nsmallest(k, 'R2')
print("R2:")
print("  Best:")
print(f"  {format_names(best_r2s)}")
print(f"  {format_values(best_r2s, 'R2')}")
print("  Worst:")
print(f"  {format_names(worst_r2s)}")
print(f"  {format_values(worst_r2s, 'R2')}")