# Regression Task with Late Data Integration

In [None]:
# Import pandas and numpy libraries for data analysis
import numpy as np
import pandas as pd
# Import the feature scaler, the train/test splitter and the mean squared error metric
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Import Tensorflow libraries for deep learning
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.constraints import max_norm
from tensorflow.keras.layers import Dense, Dropout, Input, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD

In [None]:
# Build the feed-forward regression network used for a single data view
def init_model(input_dim, learning_rate, epochs, momentum, neurons, trainable=True):
    """Create a two-hidden-layer regression network for one view.

    The network is: input -> [Dense(sigmoid, max-norm 3) -> Dropout(0.6)] x 2
    -> Dense(1, linear).

    Args:
        input_dim: Number of input features.
        learning_rate: Learning rate passed to the SGD optimizer.
        epochs: Divisor used to derive the optimizer decay
            (``learning_rate / epochs``).
        momentum: Momentum passed to the SGD optimizer.
        neurons: Number of units in each hidden layer.
        trainable: Value assigned to ``model.trainable``; the model is
            compiled only when this is truthy.

    Returns:
        The Keras ``Model``, compiled with a mean-squared-error loss when
        ``trainable`` is truthy and left uncompiled otherwise.
    """
    features = Input(shape=(input_dim,))
    hidden = features
    # Two identical hidden blocks: constrained sigmoid layer followed by dropout
    for _ in range(2):
        hidden = Dense(neurons, activation='sigmoid', kernel_constraint=max_norm(3))(hidden)
        hidden = Dropout(rate=0.6)(hidden)
    output = Dense(1, activation='linear')(hidden)
    network = Model(inputs=features, outputs=output)
    optimizer = SGD(lr=learning_rate, decay=learning_rate / epochs, momentum=momentum)
    network.trainable = trainable
    if trainable:
        network.compile(loss='mean_squared_error', optimizer=optimizer,
                        metrics=['mean_squared_error'])
    return network

In [None]:
# Build the multi-view network that merges the two single-view branches
def init_multi_model(input_dim,input_dim2, learning_rate, epochs, momentum, neurons, metabolic_fluxes_layer, 
                     gene_expression_layer):
    """Create a regression network that combines two view-specific branches.

    Each input is passed through its own callable branch, the two branch
    outputs are concatenated, and a sigmoid hidden layer followed by a
    1-unit linear layer produces the prediction.

    Args:
        input_dim: Number of features of the metabolic flux input.
        input_dim2: Number of features of the gene expression input.
        learning_rate: Learning rate passed to the SGD optimizer.
        epochs: Divisor used to derive the optimizer decay
            (``learning_rate / epochs``).
        momentum: Momentum passed to the SGD optimizer.
        neurons: Number of units in the hidden layer after the merge.
        metabolic_fluxes_layer: Callable (layer or model) applied to the
            metabolic flux input tensor.
        gene_expression_layer: Callable (layer or model) applied to the
            gene expression input tensor.

    Returns:
        A compiled Keras ``Model`` taking ``[fluxes, gene_expression]`` as
        inputs, with mean squared error as both loss and metric.
    """
    flux_in = Input(shape=(input_dim,))
    gene_in = Input(shape=(input_dim2,))
    # Run each view through its own branch before merging
    flux_branch = metabolic_fluxes_layer(flux_in)
    gene_branch = gene_expression_layer(gene_in)
    merged = Concatenate()([flux_branch, gene_branch])
    hidden = Dense(neurons, activation='sigmoid', kernel_constraint=max_norm(3))(merged)
    output = Dense(1, activation='linear')(hidden)
    network = Model(inputs=[flux_in, gene_in], outputs=output)
    optimizer = SGD(lr=learning_rate, decay=learning_rate / epochs, momentum=momentum)
    network.compile(loss='mean_squared_error', optimizer=optimizer,
                    metrics=['mean_squared_error'])
    return network

In [None]:
# Specify the proportion of data to be used as the test set
percent_test = 0.3
# Import flux data
metabolic_data = pd.read_csv('fluxes.csv', encoding='utf-8')
# Disregard null fluxes: keep only the columns in which at least one
# absolute value reaches 1e-7
metabolic_data = metabolic_data.loc[:, (metabolic_data.abs() >= 1e-7).any(axis=0)]
# Import gene expression data; every column but the last is used as a
# feature and the last column is used as the regression target
gene_expression_data = pd.read_csv('gene_expression_data.csv', encoding='utf-8')
X = gene_expression_data.iloc[:, :-1]
Y = gene_expression_data.iloc[:, -1]
# Split both views and the target into training and test sets in a single
# call. The flux view must come from `metabolic_data` (it was previously split
# from `X`, so the flux table was never used). Splitting everything together
# without shuffling keeps the rows of the two views and the target aligned,
# and train_test_split raises if the tables do not have the same number of
# rows.
# NOTE(review): assumes row i of fluxes.csv and row i of
# gene_expression_data.csv describe the same sample — confirm.
(gene_expression_train, gene_expression_test,
 metabolic_fluxes_train, metabolic_fluxes_test,
 Y_train, Y_test) = train_test_split(X, metabolic_data, Y,
                                     test_size=percent_test, shuffle=False)

In [None]:
# Standardize each view with its own scaler, estimating the scaling
# parameters on the training split only
stdscaler_f = StandardScaler()
stdscaler_g = StandardScaler()
stdscaler_f.fit(metabolic_fluxes_train)
stdscaler_g.fit(gene_expression_train)
# Apply the fitted flux scaler to the flux training and test sets
metabolic_fluxes_train = stdscaler_f.transform(metabolic_fluxes_train)
metabolic_fluxes_test = stdscaler_f.transform(metabolic_fluxes_test)
# Apply the fitted gene expression scaler to the gene expression training and test sets
gene_expression_train = stdscaler_g.transform(gene_expression_train)
gene_expression_test = stdscaler_g.transform(gene_expression_test)

In [None]:
# Settings shared by every training run: mini-batch size and the fraction of
# the training data held out for validation
batches = 256
validation = 0.2
# Single-view stage: number of epochs and learning rate
epochs = 6000
lrate = 0.005
# Multi-view stage: number of epochs and learning rate
epochs2 = 500
lrate2 = 0.05
# Stochastic gradient descent optimizer with momentum and learning-rate decay
# NOTE(review): the model builders create their own optimizers; this instance
# does not appear to be used in the visible cells — confirm before removing.
rms = SGD(momentum=0.75, lr=lrate, decay=lrate / epochs)
# Early stopping strategy monitoring the validation loss
# NOTE(review): patience (15000) is larger than `epochs` (6000), so this
# callback presumably can never stop a run early — confirm this is intended.
earlyStopping = EarlyStopping(
    monitor='val_loss',
    patience=15000,
    verbose=0,
    mode='auto',
)

In [None]:
# Initialize separate single-view models for the gene expression and flux datasets
# NOTE(review): the optimizer decay is derived from 3000 epochs here while the
# fits below run for `epochs` — confirm the mismatch is intended.
model_gene_expression = init_model(
    input_dim=gene_expression_train.shape[1],
    learning_rate=lrate,
    epochs=3000,
    momentum=0.75,
    neurons=1000,
)
model_metabolic_fluxes = init_model(
    input_dim=metabolic_fluxes_train.shape[1],
    learning_rate=lrate,
    epochs=3000,
    momentum=0.75,
    neurons=1000,
)
# Fit each single-view model on its own training data against the shared target
model_gene_expression.fit(
    gene_expression_train,
    Y_train,
    batch_size=batches,
    epochs=epochs,
    verbose=0,
    callbacks=[earlyStopping],
    validation_split=validation,
)
model_metabolic_fluxes.fit(
    metabolic_fluxes_train,
    Y_train,
    batch_size=batches,
    epochs=epochs,
    verbose=0,
    callbacks=[earlyStopping],
    validation_split=validation,
)

In [None]:
# Remove the regression head (the last Dropout and the 1-unit output Dense)
# from each single-view model so that it outputs its last hidden Dense layer.
# The truncated models are rebuilt from the trained layers instead of calling
# `model.layers.pop()`: in tf.keras `Model.layers` returns a copy of the layer
# list, so popping from it leaves the model unchanged, and reassigning
# `model.outputs` does not rewire what the model computes when it is called.
# The new models reuse the same layer objects, so the trained weights are kept.
# `layers[-3]` relies on the layer order built by init_model
# (Input, Dense, Dropout, Dense, Dropout, Dense(1)); run this cell only once
# per pair of freshly trained models.
model_gene_expression = Model(inputs=model_gene_expression.inputs,
                              outputs=model_gene_expression.layers[-3].output)
model_metabolic_fluxes = Model(inputs=model_metabolic_fluxes.inputs,
                               outputs=model_metabolic_fluxes.layers[-3].output)

In [None]:
# Initialize the multi-modal model on top of the two single-view networks
multi_modal_model = init_multi_model(
    input_dim=metabolic_fluxes_train.shape[1],
    input_dim2=gene_expression_train.shape[1],
    learning_rate=lrate2,
    epochs=epochs2,
    momentum=0.75,
    neurons=15,
    metabolic_fluxes_layer=model_metabolic_fluxes,
    gene_expression_layer=model_gene_expression,
)
# Fit the multi-modal model using training samples from both views
multi_modal_model.fit(
    [metabolic_fluxes_train, gene_expression_train],
    Y_train,
    batch_size=batches,
    epochs=epochs2,
    verbose=0,
    validation_split=validation,
)
# Generate predictions for the test set samples
predictions = multi_modal_model.predict([metabolic_fluxes_test, gene_expression_test])
# Print mean squared error
print('MSE: ', mean_squared_error(predictions, Y_test))