# Neural Network for all lipids

### Import

In [69]:
import os
import random
import numpy as np
import pandas as pd

import tensorflow as tf
import tensorflow_addons as tfa
from keras.callbacks import EarlyStopping
from keras import metrics
from keras.models import Sequential
from keras.layers import Dense, BatchNormalization, Dropout, Activation
from keras.initializers import HeNormal
from keras.regularizers import l2

from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold

### Reproducibility

In [70]:
# Fix the seed of every random number generator used below so runs are repeatable
seed = 42

# NOTE(review): presumably PYTHONHASHSEED is only read at interpreter start-up,
# so this assignment may affect child processes only - confirm
os.environ['PYTHONHASHSEED'] = str(seed)

# Python's `random`, NumPy, and both the TF2 and TF1-compat TensorFlow seeders
for seed_fn in (random.seed,
                np.random.seed,
                tf.random.set_seed,
                tf.compat.v1.set_random_seed):
    seed_fn(seed)

### Data loading

In [71]:
# Directory that holds the processed dataset
dataset_dir = 'data/processed_data'

# Parquet files, in order: training features, training targets,
# testing features, testing targets
(training_input_path, training_output_path,
 testing_input_path, testing_output_path) = (
    os.path.join(dataset_dir, file_name)
    for file_name in ('train_features.parquet',
                      'train_targets.parquet',
                      'test_features.parquet',
                      'test_targets.parquet')
)

In [72]:
# Read the four parquet files (training features/targets, testing
# features/targets) into pandas DataFrames
training_input, training_output, testing_input, testing_output = map(
    pd.read_parquet,
    (training_input_path, training_output_path,
     testing_input_path, testing_output_path),
)

# Lipid columns excluded from the targets. The names are matched verbatim,
# including their trailing spaces / non-breaking spaces.
lipids_to_drop = [
    'LPC O-18:3', 'Cer 36:1', 'PE(30:1) ', 'PE 34:2', 'PA(P-38:6)\xa0',
    'PI-Cer(t30:2)\xa0', 'PE 36:3', 'PE O-39:6', 'PA 40:6', 'PA 42:4',
    'PE(O-40:6)\xa0', 'PE(42:6)\xa0', 'PE(40:8)\xa0', 'PGP(34:1) ', 'PE(44:11(OH))\xa0',
    'PS(40:4) ', 'PIP(O-36:5)\xa0', 'PI-Cer(t30:0)\xa0',
]

# NOTE(review): only the training targets are filtered; testing_output keeps
# these columns - confirm this is intended before evaluating on the test set
training_output = training_output.drop(columns=lipids_to_drop)

# Model dimensions derived from the data: one output node per remaining
# lipid column, one input node per feature (gene) column
OUTPUT_NODES = len(training_output.columns)
input_dim = len(training_input.columns)

# Mini-batch size
batch_size = 32

In [73]:
# Create a Sequential model
def build_model(summary=False, input_nodes=None):
    """
    Build the MLP model: three sigmoid hidden layers (128, 64, 32 units)
    followed by a sigmoid output layer with one unit per lipid
    :param summary: if True, print the summary of the model
    :param input_nodes: number of input features; when None, the module-level
                        ``input_dim`` (number of feature columns of the
                        training data) is used instead of a hard-coded width
    :return: MLP model (not compiled)
    """
    if input_nodes is None:
        # Follow the loaded data rather than assuming a fixed feature count
        input_nodes = input_dim

    model = Sequential()
    # First hidden layer (128 units); it also declares the input width
    model.add(Dense(128, activation='sigmoid', input_dim=input_nodes))
    model.add(Dense(64, activation='sigmoid'))
    model.add(Dense(32, activation='sigmoid'))
    # Output layer with OUTPUT_NODES nodes (multi-output regression)
    model.add(Dense(OUTPUT_NODES, activation='sigmoid'))

    if summary:
        # Display the model summary
        model.summary()

    return model

# Build a throw-away model only to display its layer summary
build_model(summary=True)

Model: "sequential_40"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_177 (Dense)           (None, 128)               64128     
                                                                 
 dense_178 (Dense)           (None, 64)                8256      
                                                                 
 dense_179 (Dense)           (None, 32)                2080      
                                                                 
 dense_180 (Dense)           (None, 138)               4554      
                                                                 
Total params: 79,018
Trainable params: 79,018
Non-trainable params: 0
_________________________________________________________________


<keras.engine.sequential.Sequential at 0x7fc9a66195a0>

In [74]:
# Calculate loss for each output node (lipid)
def calculate_losses(lipid_true: pd.DataFrame, lipid_pred: np.ndarray):
    """
    Compute loss (MSE) for each lipid
    :param lipid_true: true values of lipid abundance, one column per lipid
    :param lipid_pred: predicted values of lipid abundance, same shape as
                       ``lipid_true``
    :return: 1-D numpy array with the loss (MSE) of each lipid (column)
    :raises ValueError: if the two inputs do not have the same shape
    """
    true_values = lipid_true.to_numpy(dtype=np.float64)
    pred_values = np.asarray(lipid_pred, dtype=np.float64)
    # The number of lipids comes from the data itself, not from a global, and
    # a mismatch is reported instead of being silently broadcast or truncated
    if true_values.shape != pred_values.shape:
        raise ValueError(
            f'Shape mismatch: true values {true_values.shape} '
            f'vs predictions {pred_values.shape}')
    # Mean of the squared errors over the samples (rows), one value per lipid
    return np.mean((true_values - pred_values) ** 2, axis=0)

# Calculate R2 score for each output node (lipid)
def calculate_r2s(lipid_true: pd.DataFrame, lipid_pred: np.ndarray):
    """
    Compute R2 score for each lipid
    :param lipid_true: true values of lipid abundance, one column per lipid
    :param lipid_pred: predicted values of lipid abundance, same shape as
                       ``lipid_true``
    :return: 1-D numpy array with the R2 score of each lipid (column)
    :raises ValueError: if the two inputs do not have the same shape
    """
    # The number of lipids comes from the data itself, not from a global
    if lipid_true.shape != lipid_pred.shape:
        raise ValueError(
            f'Shape mismatch: true values {lipid_true.shape} '
            f'vs predictions {lipid_pred.shape}')
    n_lipids = lipid_pred.shape[1]
    r2s = np.zeros((n_lipids,))
    # Score each column separately with scikit-learn's r2_score
    for i in range(n_lipids):
        r2s[i] = r2_score(lipid_true.iloc[:, i], lipid_pred[:, i])
    return r2s

In [75]:
# Early-stopping callback driven by the validation loss (MSE):
#   monitor              - metric that is watched
#   patience             - number of epochs with no improvement after which training stops
#   restore_best_weights - restore the weights of the epoch with the best monitored value
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

In [None]:
# Lipid names with surrounding whitespace stripped; used as the results index
lipid_names = [name.strip() for name in training_output.columns.values]

# DataFrame with the per-lipid results averaged over the folds
lipids_metrics_avg = pd.DataFrame(columns=['Loss', 'R2', 'R2_no_scale'], index=lipid_names)

print('Start training')

# Define the K-fold Cross Validator
num_folds = 5
k_fold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)

# Per-fold, per-lipid metrics (rows: folds, columns: lipids)
loss_per_fold = np.zeros((num_folds, len(lipid_names)))
r2_per_fold = np.zeros((num_folds, len(lipid_names)))
r2_per_fold_no_scale = np.zeros((num_folds, len(lipid_names)))

# Bookkeeping across folds
models = []  # NOTE(review): never populated below - confirm whether the fold models should be kept
split_indices = []   # validation row indices of every fold
final_losses = []    # overall validation loss of every fold
final_r_score = []   # overall validation R2 of every fold

# Compute K-fold Cross Validation
for fold_no, (train_idx, val_idx) in enumerate(
        k_fold.split(training_input, training_output), start=1):
    split_indices.append(val_idx)

    # Rows of this fold
    fold_train_input = training_input.iloc[train_idx]
    fold_val_input = training_input.iloc[val_idx]
    fold_train_output = training_output.iloc[train_idx]
    fold_val_output = training_output.iloc[val_idx]

    # Build a fresh model for this fold
    model = build_model()

    # Fit the target scaler on the training rows only, then apply it to both
    # the training and the validation targets
    scaler = MinMaxScaler()
    scaler.fit(fold_train_output)
    training_output_scale = pd.DataFrame(scaler.transform(fold_train_output),
                                         columns=training_output.columns,
                                         index=fold_train_output.index)
    validation_output_scale = pd.DataFrame(scaler.transform(fold_val_output),
                                           columns=training_output.columns,
                                           index=fold_val_output.index)

    # Compile the model
    model.compile(optimizer='adam', loss='mean_squared_error',
                  metrics=[metrics.mean_squared_error, metrics.mean_absolute_error, tfa.metrics.RSquare()])

    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # Train the model
    history = model.fit(
        fold_train_input, training_output_scale,
        epochs=200,  # Adjust the number of epochs as needed
        batch_size=batch_size,  # module-level constant instead of a repeated literal
        validation_data=(fold_val_input, validation_output_scale),
        callbacks=[early_stopping],
        shuffle=True
    ).history

    # Evaluate once on the validation fold. The result is indexed as
    # [loss, <compiled metrics in order>], so index 3 is the RSquare metric.
    evaluation = model.evaluate(fold_val_input, validation_output_scale)
    final_losses.append(evaluation[0])
    final_r_score.append(evaluation[3])

    # Per-lipid generalization metrics on the scaled targets
    prediction = model.predict(fold_val_input)
    losses = calculate_losses(validation_output_scale, prediction)
    r2s = calculate_r2s(validation_output_scale, prediction)
    loss_per_fold[fold_no - 1] = losses
    r2_per_fold[fold_no - 1] = r2s
    # Per-lipid R2 after mapping the predictions back to the original scale
    reversed_data = scaler.inverse_transform(prediction)
    r2_per_fold_no_scale[fold_no - 1] = calculate_r2s(fold_val_output, reversed_data)

# Compute mean loss and mean R2 score for each lipid
mean_loss = loss_per_fold.mean(axis=0)
mean_r2 = r2_per_fold.mean(axis=0)
mean_r2_no_scale = r2_per_fold_no_scale.mean(axis=0)
# Save the metrics
lipids_metrics_avg['Loss'] = mean_loss
lipids_metrics_avg['R2'] = mean_r2
lipids_metrics_avg['R2_no_scale'] = mean_r2_no_scale
print('#'*72)
print('Finish training')

# Compute overall loss and R2 score (mean and standard deviation over folds)
mean = np.mean(np.array(final_losses))
std = np.std(np.array(final_losses))
print(f'The final loss is: {mean} + {std}')
mean = np.mean(np.array(final_r_score))
std = np.std(np.array(final_r_score))
print(f'The final R2 is: {mean} + {std}')

# Save the metrics; create the target directory first so that a missing
# folder cannot discard the results of the whole training run
results_path = 'results/neural_network/lipids_metrics_avg_neural_network.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)
lipids_metrics_avg.to_csv(results_path)

Start training


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


------------------------------------------------------------------------
Training for fold 1 ...
Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200

In [None]:
# Median over the folds of the overall validation R2
print(np.median(final_r_score))

In [None]:
# Print the k best and k worst lipids according to the fold-averaged loss and R2
# k: number of lipids listed in each best/worst group
k = 5

def format_names(df: pd.DataFrame, just=15):
    """
    Build one line with the index labels of a DataFrame
    :param df: DataFrame whose index labels (strings) are listed
    :param just: minimum width each label is right-justified to
    :return: the padded labels joined by ', '
    """
    padded_labels = (label.rjust(just) for label in df.index.values)
    return ', '.join(padded_labels)

def format_values(df: pd.DataFrame, col: str, just=15):
    """
    Build one line with the values of a DataFrame column
    :param df: DataFrame that holds the column
    :param col: name of the column to list
    :param just: minimum width each value is right-justified to
    :return: the values in scientific notation (5 decimals), padded and
             joined by ', '
    """
    cells = []
    for number in df[col].values:
        cells.append(format(number, '.5e').rjust(just))
    return ', '.join(cells)

# Select the k best / worst lipids: a lower loss is better, a higher R2 is better
best_losses = lipids_metrics_avg.nsmallest(k, 'Loss')
worst_losses = lipids_metrics_avg.nlargest(k, 'Loss')
best_r2s = lipids_metrics_avg.nlargest(k, 'R2')
worst_r2s = lipids_metrics_avg.nsmallest(k, 'R2')

# Print, for each metric, the names and values of its best and worst groups
for title, column, best, worst in (("Loss:", 'Loss', best_losses, worst_losses),
                                   ("R2:", 'R2', best_r2s, worst_r2s)):
    print(title)
    for label, group in (("  Best:", best), ("  Worst:", worst)):
        print(label)
        print(f"  {format_names(group)}")
        print(f"  {format_values(group, column)}")