# Using pre-defined features & parameters to train a time prediction model
Here we will demonstrate how to train a sub-predictor with features identified using sequential feature selection and hyperparameters determined using 5-fold cross-validation

In [2]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import pickle
import math
import random
import os
from chronogauge_model import model_nn, utils

In [3]:
# Define one seed and apply it to every random number generator used here:
# Python's `random`, NumPy and TensorFlow.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting it
# here presumably only affects child processes, not this running kernel - confirm.
os.environ['PYTHONHASHSEED'] = str(SEED)


## 1. Data loading and pre-processing
First we load the training data

In [4]:
# Load the training samples' expression matrix
# (the first CSV column is used as the row index)
X_train = pd.read_csv('../data/expression_matrices/x_training.csv', index_col=0)

# Load the training samples' targets (the first CSV column is used as the row index)
Y_train = pd.read_csv('../data/targets/target_training.csv', index_col=0)

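The model is trained against circular (sine/cosine) values of the 24-hour sampling time rather than the raw hour. With θ = 2π·time/24, the function below converts the target into two columns, -cos(θ + π/2) and sin(θ + π/2), which are equal to sin(θ) and cos(θ) respectively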

In [5]:
def cyclic_time(times):
    """Encode sampling times (in hours) as a two-column circular target.

    The times are first wrapped into a 24-hour period and mapped to the
    angle ``theta = 2*pi*t/24``, shifted by ``pi/2``.

    Parameters
    ----------
    times : pandas.Series or numpy.ndarray
        Sampling times in hours; must support ``%`` and ``.astype``.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, 2)``. Column 0 holds ``-cos(theta + pi/2)``
        (equal to ``sin(theta)``) and column 1 holds ``sin(theta + pi/2)``
        (equal to ``cos(theta)``).
    """
    hours = times % 24
    # Shifted phase angle shared by both output columns
    phase = (2 * np.pi * hours.astype('float64') / 24) + (np.pi / 2)

    first_col = np.asarray(-np.cos(phase))
    second_col = np.asarray(np.sin(phase))

    # 1-D inputs are stacked side by side as the columns of an (n, 2) array
    return np.column_stack((first_col, second_col))

In [6]:
# Convert the Y_train targets (first column) into the two-column circular encoding for each sample
Y_data = cyclic_time(Y_train.iloc[:,0])

Now we load the test data, make sure its features correspond to those of the training data, and z-score scale it using the scaling factors fitted on the training data

In [7]:
# Load the test samples' expression matrix
X_test = pd.read_csv('../data/expression_matrices/x_test_rna.csv', index_col=0)

# Load the test samples' target sampling times that we are trying to predict
Y_test = pd.read_csv('../data/targets/target_test_rna.csv', index_col=0)

# Ensure test has same feature space as training
# (rows are selected and ordered by the training matrix's row index)
X_test = X_test.loc[X_train.index]

# Standardize the training expression values using z-score scaling (StandardScaler).
# The matrix is transposed before scaling and transposed back afterwards, so the
# scaling statistics are computed per row of X_train.
scaler = StandardScaler()
X_train = pd.DataFrame(data=scaler.fit_transform(X_train.T).T, index=X_train.index, columns=X_train.columns)


# Scale the test data with the scaler already fitted on the training data (no re-fitting)
X_test = pd.DataFrame(data=scaler.transform(X_test.T).T, index=X_test.index, columns=X_test.columns)


__NOTE__ if using microarray data, the test data should be standardized using a scaling factor that has been fit to a microarray time-course, not an RNA-seq experiment.

## 2. Model training
In this instance, we will use features and hyperparameters that have been pre-determined for model id 0

The model can theoretically be fit to any of the features found in the training expression matrix, but we recommend using either our ensemble of features or the 17 canonical circadian clock genes, as these have been validated

In [9]:
# Load features for model id 0: take the first row of the feature table
# and drop its missing entries
features = pd.read_csv('../data/model_parameters/gene_features_unadjusted.csv', index_col=0)
features = features.iloc[0].dropna().to_numpy()

# Canonical clock genes in case users wish to use them instead
# clock_genes = ['AT5G61380', 'AT2G46830', 'AT1G01060', 'AT5G02810', 'AT2G46790', 'AT2G25930', 'AT1G22770', 'AT5G42900',
#       'AT3G46640','AT5G59570', 'AT5G60100', 'AT3G22380', 'AT4G39620', 'AT5G57360', 'AT2G31870',  'AT2G21070',
#       'AT3G20810']

# Select only these features (rows) for training and test data
X_train = X_train.loc[features]
X_test = X_test.loc[features]

# Load model hyperparameters and keep the entry for model id 0
# NOTE: unpickling can execute arbitrary code; only load parameter files from a trusted source
with open('../data/model_parameters/model_parameters.p', 'rb') as fin:
    hyperparams = pickle.load(fin)
hyperparams = hyperparams[0]

# Extract specific hyperparameters including learning rate and batch size
lr = float(hyperparams['lr'])
batches = int(hyperparams['batches'])
# The L2 regularization factor is set equal to the learning rate, matching the
# configuration that was used during model tuning
l2 = lr


__NOTE__ during model tuning for each sub-predictor, the learning rate was mistakenly also used as the L2 regularization factor. Although there is no theoretical reason to expect these two hyperparameters to have the same value, this configuration was empirically found to perform optimally based on 5-fold cross-validation. The model's performance under these settings was deemed acceptable and considered optimal for CT prediction.

We call the neural network model from model_nn.py using the aforementioned hyperparameters

In [12]:
# Build the network via MultiOutputNN(...).nn_model(), passing the learning rate,
# L2 factor and batch size extracted from the hyperparameters
model = model_nn.MultiOutputNN(learning_rate=lr,l2_reg=l2, batch_size=batches).nn_model()

The model was optimized using an EarlyStopping callback, which stops training once the training loss stops improving and restores the weights of the epoch giving the smallest loss

In [13]:
# Monitor the training loss ('loss'); stop after 25 epochs without improvement
# and restore the weights of the best epoch
early_stop = EarlyStopping(patience=25, restore_best_weights=True, monitor='loss', mode='min')

We fit the neural network to the training data. Depending on the feature set and hyperparameters used, this may take time.

In [14]:
# Fit on the transposed expression matrix (X_train.T) against the circular targets,
# for at most 5000 epochs; the early-stopping callback may end training sooner
model.fit(X_train.astype('float32').T, Y_data.astype('float32'), epochs=5000, verbose=1, callbacks=[early_stop])


Epoch 1/5000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 1.7051
Epoch 2/5000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.6525 
Epoch 3/5000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.6134 
Epoch 4/5000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.5800 
Epoch 5/5000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.5496 
Epoch 6/5000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.5223 
Epoch 7/5000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.4948 
Epoch 8/5000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - loss: 1.4648 
Epoch 9/5000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.4325 
Epoch 10/5000
[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.3988 

<keras.src.callbacks.history.History at 0x15f3a144bb0>

We can save this specific model so we can use it again. It is a good idea to record the features used, as these will be required when loading and running the model in future.

In [19]:
model.save('

[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step


array([[ 1.6396092e-03,  2.5326002e-02],
       [-3.2094736e-03,  2.5695318e-02],
       [-5.2083092e-04,  1.0002324e-02],
       [-1.0408386e-02,  9.3809757e-03],
       [-1.0003201e-02,  1.0912423e-02],
       [-7.3061259e-03,  2.4408862e-02],
       [-2.9632188e-03,  1.9786457e-02],
       [ 2.7690802e-03,  1.1810710e-02],
       [-6.9589954e-04,  4.4697067e-03],
       [-1.2861886e-02, -1.1332051e-03],
       [-9.0144072e-03,  1.0421447e-02],
       [-8.1061544e-03,  1.3787812e-02],
       [ 1.4288819e-03, -1.4312568e-02],
       [ 1.0324783e-02, -1.3836695e-02],
       [-4.9434444e-03, -1.0504469e-03],
       [ 1.9424232e-03, -1.7619613e-03],
       [-9.2764635e-04,  3.0543166e-03],
       [ 6.9316356e-03, -6.1345496e-04],
       [-2.8327762e-05,  8.2057547e-03],
       [-8.0880076e-03,  5.2644294e-03],
       [-4.2485603e-04,  2.2550423e-03],
       [-9.2012798e-03,  1.1589740e-02],
       [-1.3207435e-02,  1.6234607e-02],
       [-7.7928007e-03,  1.7937686e-02],
       [ 2.39112

## 2. Applying models
Using the gene features corresponding to each model (based on id), we can predict the CT.

Example with model 0:

In [None]:
# Extract id 0 model & features from the model dictionary
# NOTE(review): model_dict is not defined in this part of the notebook; each entry is
# unpacked here as a (model, features) pair - confirm where it is loaded
model_0, features_0 = model_dict[0]



# X_test must use only id 0's features
X_test_0 = X_test.loc[features_0]

# Predict CT of the test data (the model is called on the transposed matrix)
results_0 = model_0(X_test_0.T)
# Show the first 10 predictions
results_0[:10]

Example with models 0-9:

In [None]:
# Raw model outputs keyed by model id
results_dict = {}
# Iterate over model ids 0-9 and predict
for i in range(0, 10):
    # Each model_dict entry is unpacked as a (model, features) pair
    i_model, i_features = model_dict[i]
    # Set test features: keep only the rows this model uses
    i_X_test = X_test.loc[i_features]
    # The model is called on the transposed matrix
    i_results = i_model(i_X_test.T)
    results_dict[i] = i_results
# Show the first 10 outputs of model id 9
np.asarray(results_dict[9])[:10]

## 3. Processing results of individual sub-predictors
The model generates two outputs - circular values representing sin(CT) and cos(CT), where CT is expressed as an angle over the 24-hour cycle. Thus, they must be converted back into an hourly CT value using the following atan2-based function (note this is included in utils.py):

In [None]:
def time24(ipreds):
    """Convert two-column circular predictions into hours.

    Parameters
    ----------
    ipreds : 2-D array-like
        Must expose ``.shape`` and support ``ipreds[row, col]`` indexing.
        Column 0 is used as the first (y) argument of ``atan2`` and
        column 1 as the second (x) argument.

    Returns
    -------
    list of float
        One value per row: the ``atan2`` angle scaled so that ``pi``
        corresponds to 12 hours, with negative results shifted up by 24.
    """
    def _row_to_hour(row):
        # atan2 lies in [-pi, pi]; dividing by pi and multiplying by 12 gives [-12, 12]
        hour = math.atan2(ipreds[row, 0], ipreds[row, 1]) / math.pi * 12
        # Negative hours are wrapped onto the 24-hour clock
        return hour + 24 if hour < 0 else hour

    return [_row_to_hour(row) for row in range(ipreds.shape[0])]

In [None]:
# Convert model 0's circular outputs into 24-hour CT values and show the first 10
pred24_0 = time24(results_dict[0])
pred24_0[:10]

The errors of individual sub-predictors can be analyzed using the following function considering 24 hours as a modulus:

In [None]:
def errors(pred, true):
    """Return signed prediction errors in minutes on a 24-hour circular scale.

    Parameters
    ----------
    pred : array-like
        Predicted times in hours (list, numpy array or pandas Series).
    true : array-like
        True sampling times in hours; wrapped to a 24-hour modulus first.

    Returns
    -------
    pandas.Series
        ``(pred - true)`` with differences above 12 h reduced by 24 and
        differences below -12 h increased by 24, multiplied by 60. When
        ``true`` (or ``pred``) is a Series its index is kept; otherwise a
        default integer index is used.
    """
    # Plain sequences do not support element-wise `%`, so promote them to arrays
    if isinstance(true, (list, tuple)):
        true = np.asarray(true, dtype='float64')
    # Ensure 24-hour modulus in the target
    true = true % 24

    # From 24-hour time predictions, get the raw error in hours
    err = pred - true
    # Array inputs yield an ndarray here; wrap it so the result is always a Series
    if not isinstance(err, pd.Series):
        err = pd.Series(np.asarray(err))

    # Wrap onto the shortest way around the clock. Values of exactly +/-12
    # are left unchanged because the comparisons are strict.
    err = err.mask(err > 12, err - 24)
    err = err.mask(err < -12, err + 24)

    # return error in minutes
    return err * 60


In [None]:
# Signed error (in minutes) of model 0's predictions against the first column of Y_test
error_0 = errors(pred24_0, Y_test.iloc[:,0])
error_0.head()

However, based on cross-validation results, we found individual sub-predictors are unreliable and it is impossible to tell which sub-predictors will perform accurately or inaccurately in unseen test samples.

## 4. Aggregation of results of individual sub-predictors
To overcome this unreliability, we aggregate sub-predictor outputs using a circular mean. Combining predictions as a bagging-like ensemble is essential to ChronoGauge's consistent performance across unseen data.

The following function converts 24-hour CT predictions into sine/cosine values

In [None]:
def cyclic_time(times):
    """Encode times (in hours) as a pair of circular components.

    The times are wrapped into a 24-hour period and mapped to the angle
    ``theta = 2*pi*t/24``, shifted by ``pi/2``.

    Parameters
    ----------
    times : pandas.Series or numpy.ndarray
        Times in hours; must support ``%`` and ``.astype``.

    Returns
    -------
    tuple
        ``(-cos(theta + pi/2), sin(theta + pi/2))``, which equal
        ``sin(theta)`` and ``cos(theta)`` respectively. Each component has
        the same container type as ``times``.
    """
    wrapped = times % 24
    # Shifted phase angle shared by both components
    phase = (2 * np.pi * wrapped.astype('float64') / 24) + (np.pi / 2)

    return -np.cos(phase), np.sin(phase)


The following function uses cyclic_time() to aggregate the results of a dataframe of 24-hour CT predictions

In [None]:
def circular_mean(predictions_24):
    """Aggregate per-model 24-hour predictions with a circular mean.

    Parameters
    ----------
    predictions_24 : pandas.DataFrame
        Each column is passed to ``cyclic_time`` on its own; the columns
        are averaged together, giving one result per row.

    Returns
    -------
    list of float
        The output of ``time24`` applied to the averaged circular
        components, one value per row of ``predictions_24``.
    """
    # Circular components of every column, in column order
    encoded = [
        cyclic_time(predictions_24.iloc[:, col])
        for col in range(predictions_24.shape[1])
    ]
    first_parts = [pair[0] for pair in encoded]
    second_parts = [pair[1] for pair in encoded]

    # Average each component across columns, then shape it as a single column
    mean_first = np.asarray(np.mean(first_parts, axis=0)).reshape(-1, 1)
    mean_second = np.asarray(np.mean(second_parts, axis=0)).reshape(-1, 1)

    # Place the two averaged components side by side and convert back to hours
    return time24(np.hstack((mean_first, mean_second)))



In [None]:
# Get 24 hour CT predictions for each model (ids 0-9)
all_preds = []
for i in range(0, 10):
    i_preds = time24(results_dict[i])
    all_preds.append(i_preds)
# Create dataframe for results: built with one row per model and X_test's column
# labels as columns, then transposed so each column holds one model's predictions
all_preds = pd.DataFrame(data=all_preds, columns=X_test.columns).T
all_preds.head()

In [None]:
# Use circular_mean function to obtain an aggregated prediction across the sub-predictors
circ_pred = circular_mean(all_preds)

# Get error metrics for CT predictions
# The true sampling time is taken from the first column of Y_test, wrapped to 24 hours
final_results = pd.DataFrame(data=Y_test.iloc[:,0].to_numpy() % 24, index=Y_test.index, columns=['Sampling time (hr)'])
final_results['Predicted CT (hr)'] = circ_pred
# Signed error in minutes, then its absolute value
final_results['Error (mins)'] = errors(np.asarray(circ_pred), Y_test.iloc[:,0])
final_results['Absolute error (mins)'] = np.absolute(final_results['Error (mins)'])
final_results.head()

Based on cross-validation results, we expect a larger ensemble of sub-predictors will give more accurate results.