# Setup of Noteboook

The follwing code clones the github repository with course files. 
Subsequently it imports all libraries and custom modules needed for this notebook

In [131]:
!git clone https://github.com/DataHow/analytics-course-scripts.git
!pip install --upgrade scipy==1.7.3

fatal: destination path 'analytics-course-scripts' already exists and is not an empty directory.


In [132]:
# import libraries
import numpy as np
import pandas as pd
import scipy
from scipy.stats import qmc
from scipy.integrate import solve_ivp

import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel

import warnings
warnings.filterwarnings('ignore')
np.random.seed(42)
rng = np.random.default_rng(42)

# Hybrid Models

We use a simple model for cell expansion, accounting for the growth of viable cell density (VCD) and the consumption of glucose (Glc), is used to show how hybrid models are trained and can predict new experimental conditions.

Furthermore we define functions specifying our two hybrid models below which will be utilized further in the script. They consist of two functions. First function defines the ODE equations that describe the dynamics and second function solves the equations using specified parameters and initial conditions.

In [133]:
def ode_fcn(t, y, feed):
  # define parameters
    VCD = y[0]
    Glc = y[1]
    # growth rate
    mu = Glc/(5+Glc)
    # mass balances
    dVCD_dt = mu*VCD
    dGlc_dt = -(0.5*mu+0.05*Glc)*VCD+feed
    dy = [dVCD_dt, dGlc_dt]
    return dy

def run_experiment(VCD_0, Glc_0, feed, t_end):
    fun = lambda t, y: ode_fcn(t,y,feed)
    y0 = [VCD_0, Glc_0]
    t_span = np.arange(0, 0.5*round(2*t_end)+0.25, 0.25)
    sol = solve_ivp(fun, [t_span[0], t_span[-1]], y0, method='LSODA', t_eval=t_span, rtol=1e-6, atol=1e-6)
    t = sol.t.tolist()
    y = sol.y.T
    VCD = y[:, 0]
    Glc = y[:, 1]
    return t, VCD, Glc


In [134]:
def ode_fcn_1st_hybrid(t, y, feed, g_mld, k_mld):
    # mass balances
    dVCD_dt = g_mld.predict(y.reshape(-1, 1).T)
    dGlc_dt = -k_mld.predict(y.reshape(-1, 1).T) + feed
    return [dVCD_dt, dGlc_dt]


def run_1st_hybrid(VCD_0, Glc_0, feed, t_end, g_mld, k_mld):
    fun = lambda t, y: ode_fcn_1st_hybrid(t, y, feed, g_mld, k_mld)
    y0 = np.array([VCD_0, Glc_0])
    t_span = np.arange(0, 0.5 * round(2 * t_end) + 0.25, 0.25)
    sol = solve_ivp(fun, [t_span[0], t_span[-1]], y0, method='LSODA', t_eval=t_span, rtol=1e-6, atol=1e-6)
    t = sol.t.tolist()
    y = sol.y.T
    VCD = y[:, 0]
    Glc = y[:, 1]
    return t, VCD, Glc

In [135]:
def ode_fcn_2nd_hybrid(t, y,feed, g_mld,k_mld):
    # mass balances
    dVCD_dt = g_mld.predict(y.reshape(-1, 1).T)*y[0]
    dGlc_dt = -k_mld.predict(y.reshape(-1, 1).T)*y[0]+feed
    return [dVCD_dt, dGlc_dt]


def run_2nd_hybrid(VCD_0,Glc_0,feed,t_end,mu_mld,k_mld):
    fun = lambda t, y: ode_fcn_2nd_hybrid(t, y, feed, g_mld, k_mld)
    y0 = np.array([VCD_0, Glc_0])
    t_span = np.arange(0, 0.5 * round(2 * t_end) + 0.25, 0.25)
    sol = solve_ivp(fun, [t_span[0], t_span[-1]], y0, method='LSODA', t_eval=t_span, rtol=1e-6, atol=1e-6)
    t = sol.t.tolist()
    y = sol.y.T
    VCD = y[:, 0]
    Glc = y[:, 1]
    return t, VCD, Glc


Firstly, we will run the base experiment experiment:

## Run base experiment

Here we create work with the same insilico model as before. The cell process parameters which we are able to design are following:

*   `VCD_0` Amount of VCD at time 0
*   `Glc_0` Amount of Glc at time 0
*   `feed_rate` Amount of continuous feed each day. Feeding starts at day 1.
*   `feed_end` Day when we stop feeding. This defines aslo the length of experiment.

Subsequently we plot the process evolution over time.


In [136]:
# Set CPP's for run generation
VCD_0 = 0.5
Glc_0 = 10
feed_rate = 10
feed_end = 5

In [137]:
# Plot base experiment run
t, VCD, Glc = run_experiment(VCD_0,Glc_0,feed_rate,feed_end)

fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=t,y=VCD, mode='markers+lines',name='VCD'))
fig.add_trace(go.Scatter(x=t,y=Glc, mode='markers+lines',name='Glucose'),secondary_y=True)
fig.update_layout(showlegend=True,title="Cell process dynamics",xaxis_title="Time",yaxis_title="Glc")
fig.update_yaxes(title_text="VCD", secondary_y=True)
fig.show()

## Create experimental design

Here we create a diverse set of experimental conditions for cell process parameters using Latin Hypercube Sampling and run the experiments.


In [138]:
# Define number of experiments
Nruns_train = 4
# Define initial conditions for process parameters
DoE_train = np.zeros((Nruns_train,3))
sampler = qmc.LatinHypercube(d=3)
DoE_nondim_train = 2*sampler.random(n=Nruns_train)-1
DoE_train[:, 0] = DoE_nondim_train[:,0]*1.5 + 2   # VCD_0
DoE_train[:, 1] = DoE_nondim_train[:,1]*10 + 20   # Glc_0
DoE_train[:, 2] = DoE_nondim_train[:,2]*25 + 30   # feed_rate
# Run experiments
VCD = np.zeros((21, Nruns_train))
Glc = np.zeros((21, Nruns_train))
for nexp in range(Nruns_train):
    t, VCD[:,nexp],Glc[:,nexp] = run_experiment(DoE_train[nexp,0],DoE_train[nexp,1],DoE_train[nexp,2],5)

In [139]:
# Plot experiments
fig = make_subplots(specs=[[{"secondary_y": True}]])
for nexp in range(Nruns_train):
    fig.add_trace(go.Scatter(x=t,y=VCD[:, nexp], mode='markers+lines',name='VCD:'+str(nexp), line=dict(color = px.colors.qualitative.G10[0])))
    fig.add_trace(go.Scatter(x=t,y=Glc[:, nexp], mode='markers+lines',name='Glucose:'+str(nexp),line=dict(color = px.colors.qualitative.G10[1])),secondary_y=True)
fig.update_layout(showlegend=False,title="Cell process dynamics in DOE",xaxis_title="Time",yaxis_title="Glc")
fig.update_yaxes(title_text="VCD", secondary_y=True)
fig.show()

### Create test set

In similar fashion we create a test set significantly larger number of experiments to properly verify the model's performance on a completely unsees data.

In [140]:
# Define number of experiments
Nruns_test = 100
# Define initial conditions for process parameters
DoE_test = np.zeros((Nruns_test,3))
sampler = qmc.LatinHypercube(d=3, seed=10)
DoE_nondim_test = 2*sampler.random(n=Nruns_test)-1
DoE_test[:, 0] = DoE_nondim_test[:,0]*1.5 + 2   # VCD_0
DoE_test[:, 1] = DoE_nondim_test[:,1]*10 + 20   # Glc_0
DoE_test[:, 2] = DoE_nondim_test[:,2]*25 + 30   # feed_rate
# Run experiments
VCD_test = np.zeros((21, Nruns_test))
Glc_test = np.zeros((21, Nruns_test))
for nexp in range(Nruns_test):
    t, VCD_test[:,nexp], Glc_test[:,nexp] = run_experiment(DoE_test[nexp,0],DoE_test[nexp,1], DoE_test[nexp,2],5)

In [141]:
# Plot experiments
fig = make_subplots(specs=[[{"secondary_y": True}]])
for nexp in range(Nruns_test):
    fig.add_trace(go.Scatter(x=t,y=VCD_test[:, nexp], mode='markers+lines',name='VCD:'+str(nexp), line=dict(color = px.colors.qualitative.G10[0])))
    fig.add_trace(go.Scatter(x=t,y=Glc_test[:, nexp], mode='markers+lines',name='Glucose:'+str(nexp),line=dict(color = px.colors.qualitative.G10[1])),secondary_y=True)
fig.update_layout(showlegend=False,title="Cell process dynamics in DOE",xaxis_title="Time",yaxis_title="Glc")
fig.update_yaxes(title_text="VCD", secondary_y=True)
fig.show()

## Train Black-box GP
In this section, we are training a "black-box" Gaussian process, which is directly linking the manipulated variables to the final VCD and the final Glc.

The scope of this exercize is to show that the good predictability of the hybrid models that are shown below in the script cannot be ascribed to the fitting capabilities of GPs or any other model with such few experiments.

We create a models to predict final VCD, final Glc, and also Glucose at mid-point.

In [142]:
# Define Kernel
kernel = 1.0 * RBF(length_scale=1e-1, length_scale_bounds=(1e-2, 1e3)) + WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-10, 1e1))
# Define Regression Model
gpr1 = GaussianProcessRegressor(kernel=kernel, random_state=0, alpha=0, n_restarts_optimizer=3).fit(DoE_train, VCD[20,:]) # Model for final VCD
gpr2 = GaussianProcessRegressor(kernel=kernel, random_state=0, alpha=0, n_restarts_optimizer=3).fit(DoE_train, Glc[20,:]) # Model for final Glc
gpr3 = GaussianProcessRegressor(kernel=kernel, random_state=0, alpha=0, n_restarts_optimizer=3).fit(DoE_train, Glc[10,:]) # Model for Glc at t=2.5
# Predict values on test set
VCD_end_pred = gpr1.predict(DoE_test)
Glc_end_pred = gpr2.predict(DoE_test)
Glc_mid_pred = gpr3.predict(DoE_test)


In [143]:
# Plot Observed vs Predicted
fig = px.scatter(x=VCD_test[20,:], y=VCD_end_pred)
fig.update_layout(showlegend=False,title="GP Black-Box model for Final VCD",xaxis_title="VCD Measured",yaxis_title="VCD Predicted")
fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1, 'layer': 'below'}])
fig.show()

fig = px.scatter(x=Glc_test[20,:], y=Glc_end_pred)
fig.update_layout(showlegend=False,title="GP Black-Box model for Final Glc",xaxis_title="Glc Measured",yaxis_title="Glc Predicted")
fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1, 'layer': 'below'}])
fig.show()

fig = px.scatter(x=Glc_test[10,:], y=Glc_mid_pred)
fig.update_layout(showlegend=False,title="GP Black-Box model for Glc at t=2.5",xaxis_title="Glc Measured",yaxis_title="Glc Predicted")
fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1, 'layer': 'below'}])
fig.show()

We can see that the models cannot learn very much, especially for Glucose.

## First Hybrid Model  

Let's suppose the following model (VCD dependent): 

$$
\begin{aligned}
&\frac{\mathrm{dVCD}}{\mathrm{dt}}=g(\mathrm{VCD}, \mathrm{Glc}) \\
&\frac{\mathrm{dGlc}}{\mathrm{dt}}=-k(\mathrm{VCD}, \mathrm{Glc})+\text { feed }
\end{aligned}
$$

Functions needs to learn the combination of the intrinsic process behavior, like GLC specific consumption with the total amount of cells. So for GLC it estimates the **global** glucose consumption

Let's use the inverse method to get the model. First, let's estimate the derivatives.


### Find and model derivatives

In [144]:
# Estimating derivatives (changes/deltas)
g = np.zeros((21, Nruns_train))
k = np.zeros((21, Nruns_train))
delta_t = t[1]-t[0]
g[0,:] = (VCD[1,:]-VCD[0,:])/delta_t
dGlcdt = (Glc[1,:]-Glc[0,:])/delta_t
k[0,:] = -dGlcdt+DoE_train[:,2].T
for nstep in range(1, 20):
    g[nstep,:] = 0.5*(VCD[nstep+1,:]-VCD[nstep-1,:])/delta_t
    dGlcdt = 0.5*(Glc[nstep+1,:]-Glc[nstep-1,:])/delta_t
    k[nstep,:] = -dGlcdt+DoE_train[:,2].T
g[20,:] = (VCD[20,:]-VCD[19,:])/delta_t
dGlcdt = (Glc[20,:]-Glc[19,:])/delta_t
k[20,:] = -dGlcdt+DoE_train[:,2].T

In [145]:
# Visualize the derivatives
print(pd.DataFrame(g))

            0          1          2          3
0    2.644209   2.479082   0.974547   1.070320
1    3.019668   2.837778   1.119367   1.192954
2    3.850077   3.638752   1.443138   1.465933
3    4.864601   4.634091   1.847371   1.800249
4    6.114741   5.881369   2.358003   2.208676
5    7.655540   7.446731   3.005564   2.705848
6    9.545691   9.407891   3.827779   3.308006
7   11.843008  11.854442   4.871709   4.032190
8   14.592709  14.884366   6.196046   4.894524
9   17.802831  18.592720   7.873473   5.906985
10  21.397487  23.044067   9.992787   7.071517
11  25.139234  28.212994  12.660083   8.370056
12  28.539303  33.871067  15.997052   9.748950
13  30.878193  39.423984  20.132076  11.097516
14  31.594198  43.858179  25.175094  12.227756
15  30.938336  46.239851  31.157306  12.881637
16  29.914650  46.732845  37.903677  12.818077
17  29.294327  46.587696  44.818816  12.001176
18  29.138021  46.738964  50.733725  10.736416
19  29.214323  47.222875  54.387284   9.508276
20  29.282157

In [146]:
# Plot of derivatives for VCD
fig=px.scatter(x=VCD.flatten(),y=Glc.flatten(),color=g.flatten(),color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(coloraxis_colorbar=dict(title="g(VCD,Glc)"))
fig.update_layout(showlegend=False,title="Cell Growth Rate, g(VCD,Glc)",xaxis_title="VCD",yaxis_title="Glc")
fig.show()
# Plot of derivatives for Glc
fig=px.scatter(x=VCD.flatten(),y=Glc.flatten(),color=k.flatten(),color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(coloraxis_colorbar=dict(title="k(VCD,Glc)"))
fig.update_layout(showlegend=False,title="Cell Growth Rate, k(VCD,Glc)",xaxis_title="VCD",yaxis_title="Glc")
fig.show()

Let's fit a model to those derivatives:

In [147]:
X = np.stack((VCD.flatten(), Glc.flatten()), axis=1)
g_mld = GaussianProcessRegressor(kernel=kernel, random_state=0, alpha=0.0, n_restarts_optimizer=3).fit(X, g.flatten())
k_mld = GaussianProcessRegressor(kernel=kernel, random_state=0, alpha=0.0, n_restarts_optimizer=3).fit(X, k.flatten())

### Prediction on train set

Make prediction on the training set:

In [148]:
VCD_train_pred = np.zeros((21, Nruns_train))
Glc_train_pred = np.zeros((21, Nruns_train))
for nexp in range(Nruns_train):
    t,VCD_train_pred[:,nexp],Glc_train_pred[:,nexp] = run_1st_hybrid(DoE_train[nexp,0], DoE_train[nexp,1], DoE_train[nexp,2], 5, g_mld, k_mld)

Plot the fit of our model on a training experiment (1st hybrid):

In [149]:
# Select experiment on which to compare
exp_no = 1

In [150]:
# Plot base experiment run
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=t,y=VCD_train_pred[:,exp_no], mode='lines',name='VCD Predicted',line=dict(color = px.colors.qualitative.G10[0], dash='dash')))
fig.add_trace(go.Scatter(x=t,y=VCD[:,exp_no], mode='markers+lines',name='VCD Measured',line=dict(color = px.colors.qualitative.G10[0])))
fig.add_trace(go.Scatter(x=t,y=Glc_train_pred[:,exp_no], mode='lines',name='Glucose Predicted',line=dict(color = px.colors.qualitative.G10[1], dash='dash')),secondary_y=True)
fig.add_trace(go.Scatter(x=t,y=Glc[:,exp_no], mode='markers+lines',name='Glucose Measured',line=dict(color = px.colors.qualitative.G10[1])),secondary_y=True)

fig.update_layout(showlegend=True,title="Measured vs Predicted Glc and VCD for run: "+str(exp_no)+" in train set",xaxis_title="Time",yaxis_title="Glc")
fig.update_yaxes(title_text="VCD", secondary_y=True)
fig.show()

Plot observed vs predicted in the training set for all times (1st hybrid):

In [151]:
# Plot prediction over Train Set
fig = go.Figure()
for nexp in range(Nruns_train):
    fig.add_trace(go.Scatter(x=VCD[:,nexp],y=VCD_train_pred[:,nexp], mode='markers+lines',name='VCD:'+str(nexp)))
fig.update_layout(showlegend=True,title="Measured vs Predicted for VCD of Experiments in train set ",xaxis_title="VCD Measured",yaxis_title="VCD Predicted")
fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1, 'layer': 'below'}])
fig.show()

fig = go.Figure()
for nexp in range(Nruns_train):
    fig.add_trace(go.Scatter(x=Glc[:,nexp],y=Glc_train_pred[:,nexp], mode='markers+lines',name='Glc:'+str(nexp)))
fig.update_layout(showlegend=True,title="Measured vs Predicted for Glc of Experiments in train set ",xaxis_title="Glc Measured",yaxis_title="Glc Predicted")
fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1, 'layer': 'below'}])
fig.show()

In [172]:
print("Training Set")
print("\nVCD:")
RMSE_VCD = np.sqrt(np.sum((VCD.flatten()-VCD_train_pred.flatten())**2)/Nruns_train)
print("Absolute RMSE: ",RMSE_VCD)
RMSE_VCD_rel = RMSE_VCD/np.std(VCD.flatten())
print("Relative RMSE: ",RMSE_VCD_rel)
print("\nGlc:")
RMSE_Glc = np.sqrt(np.sum((Glc.flatten()-Glc_train_pred.flatten())**2)/Nruns_train)
print("Absolute RMSE: ",RMSE_Glc)
RMSE_Glc_rel = RMSE_Glc/np.std(Glc.flatten())
print("Relative RMSE: ",RMSE_Glc_rel)

Training Set

VCD:
Absolute RMSE:  3.419257143030393
Relative RMSE:  0.10835446056452364

Glc:
Absolute RMSE:  3.4642190390560885
Relative RMSE:  0.12113171670267595


### Prediction on test set

Make prediction on the test set (1st hybrid):

In [153]:
VCD_test_pred = np.zeros((21,Nruns_test))
Glc_test_pred = np.zeros((21,Nruns_test))
for nexp in range(Nruns_test):
    t, VCD_test_pred[:,nexp], Glc_test_pred[:,nexp] = run_1st_hybrid(DoE_test[nexp,0], DoE_test[nexp,1], DoE_test[nexp,2], 5, g_mld, k_mld)


Plot how the trained model fits on a test experiment (1st hybrid):

In [154]:
# Select experiment on which to compare
exp_no = 1

In [155]:
# Plot base experiment run
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=t,y=VCD_test_pred[:,exp_no], mode='lines',name='VCD Predicted',line=dict(color = px.colors.qualitative.G10[0], dash='dash')))
fig.add_trace(go.Scatter(x=t,y=VCD_test[:,exp_no], mode='markers+lines',name='VCD Measured',line=dict(color = px.colors.qualitative.G10[0])))
fig.add_trace(go.Scatter(x=t,y=Glc_test_pred[:,exp_no], mode='lines',name='Glc Predicted',line=dict(color = px.colors.qualitative.G10[1], dash='dash')),secondary_y=True)
fig.add_trace(go.Scatter(x=t,y=Glc_test[:,exp_no], mode='markers+lines',name='Glc Measured',line=dict(color = px.colors.qualitative.G10[1])),secondary_y=True)

fig.update_layout(showlegend=True,title="Measured vs Predicted Glc and VCD for run: "+str(exp_no)+" in test set",xaxis_title="Time",yaxis_title="Glc")
fig.update_yaxes(title_text="VCD", secondary_y=True)
fig.show()

Plot Observed vs Predicted in test set for all times (1st hybrid):

In [156]:
# Plot prediction over Train Set
fig = go.Figure()
for nexp in range(Nruns_test):
    fig.add_trace(go.Scatter(x=VCD_test[:,nexp],y=VCD_test_pred[:,nexp], mode='markers+lines',name='VCD:'+str(nexp)))
fig.update_layout(showlegend=True,title="Measured vs Predicted for VCD of Experiments in test set ",xaxis_title="VCD Measured",yaxis_title="VCD Predicted")
#fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1, 'layer': 'below'}])
fig.show()

fig = go.Figure()
for nexp in range(Nruns_test):
    fig.add_trace(go.Scatter(x=Glc_test[:,nexp],y=Glc_test_pred[:,nexp], mode='markers+lines',name='Glc:'+str(nexp)))
fig.update_layout(showlegend=True,title="Measured vs Predicted for Glc of Experiments in test set ",xaxis_title="Glc Measured",yaxis_title="Glc Predicted")
fig.show()

In [159]:
print("Test set")
print("\n VCD:")
RMSE_VCD = np.sqrt(np.sum((VCD_test.flatten()-VCD_test_pred.flatten())**2)/Nruns_test)
print("Absolute RMSE: ",RMSE_VCD)
RMSE_VCD_rel = RMSE_VCD/np.std(VCD_test.flatten())
print("Relative RMSE: ",RMSE_VCD_rel)
print("\n Glc:")
RMSE_Glc = np.sqrt(np.sum((Glc_test.flatten()-Glc_test_pred.flatten())**2)/Nruns_test)
print("Absolute RMSE: ",RMSE_Glc)
RMSE_Glc_rel = RMSE_Glc/np.std(Glc_test.flatten())
print("Relative RMSE: ",RMSE_Glc_rel)


 VCD:
Absolute RMSE:  8.468743487410727
Relative RMSE:  0.2698266483306773

 Glc:
Absolute RMSE:  8.178826462170603
Relative RMSE:  0.3485120138735656


We observe a similar situation in Glucose.

## Second Hybrid Model

Let's suppose the following model (VCD independent):

$$
\begin{aligned}
&\frac{\mathrm{dVCD}}{\mathrm{dt}}=\mu(\mathrm{VCD}, \mathrm{Glc}) \cdot \mathrm{VCD} \\
&\frac{\mathrm{dGlc}}{\mathrm{dt}}=-k(\mathrm{VCD}, \mathrm{Glc}) \cdot \mathrm{VCD}+\text { feed }
\end{aligned}
$$

Functions learns intrinsic process behavior of each cell, such as how much GLC it consumes. Then, this function is multiplied by the overall amount of cells in the process.

As before, let's use the inverse method to get the model. First, let's estimate the derivatives.

### Find and model derivatives

In [163]:
# Estimating derivatives (changes/deltas)
g = np.zeros((21,Nruns_train))
k = np.zeros((21,Nruns_train))
delta_t = t[1]-t[0]
dVCDdt = (VCD[1,:]-VCD[0,:])/delta_t
g[0,:] = dVCDdt/VCD[0,:]
dGlcdt = (Glc[1,:]-Glc[0,:])/delta_t
k[0,:] = (-dGlcdt+DoE_train[:,2].T)/VCD[1,:]
for nstep in range(1,20):
    dVCDdt = 0.5*(VCD[nstep+1,:]-VCD[nstep-1,:])/delta_t
    g[nstep,:] = dVCDdt/VCD[nstep,:]
    dGlcdt = 0.5*(Glc[nstep+1,:]-Glc[nstep-1,:])/delta_t
    k[nstep,:] = (-dGlcdt+DoE_train[:,2].T)/VCD[nstep,:]
dVCDdt = (VCD[20,:]-VCD[19,:])/delta_t
g[20,:] = dVCDdt/VCD[20,:]
dGlcdt = (Glc[20,:]-Glc[19,:])/delta_t
k[20,:] = (-dGlcdt+DoE_train[:,2].T)/VCD[20,:]

Let's plot the data:

In [164]:
# Plot of derivatives for VCD
fig=px.scatter(x=VCD.flatten(),y=Glc.flatten(),color=g.flatten(),color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(coloraxis_colorbar=dict(title="u(VCD,Glc)"))
fig.update_layout(showlegend=False,title="Cell Growth Rate, u(VCD,Glc)",xaxis_title="VCD",yaxis_title="Glc")
fig.show()
# Plot of derivatives for Glc
fig=px.scatter(x=VCD.flatten(),y=Glc.flatten(),color=k.flatten(),color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(coloraxis_colorbar=dict(title="k(VCD,Glc)"))
fig.update_layout(showlegend=False,title="Cell Growth Rate, k(VCD,Glc)",xaxis_title="VCD",yaxis_title="Glc")
fig.show()

Let's fit a model:

In [165]:
X = np.stack((VCD.flatten(), Glc.flatten()), axis=1)
g_mld = GaussianProcessRegressor(kernel=kernel, random_state=0, alpha=0.0, n_restarts_optimizer=3).fit(X, g.flatten())
k_mld = GaussianProcessRegressor(kernel=kernel, random_state=0, alpha=0.0, n_restarts_optimizer=3).fit(X, k.flatten())

### Prediction on train set

Make prediction on the training set:

In [166]:
VCD_train_pred = np.zeros((21, Nruns_train))
Glc_train_pred = np.zeros((21, Nruns_train))
for nexp in range(Nruns_train):
    t, VCD_train_pred[:,nexp], Glc_train_pred[:,nexp] = run_2nd_hybrid(DoE_train[nexp,0], DoE_train[nexp,1], DoE_train[nexp,2], 5, g_mld, k_mld)

Plot the fit of our model on a training experiment (2nd hybrid):

In [None]:
# Select which experiment to plot
exp_no = 1

In [168]:
# Plot base experiment run
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=t,y=VCD_train_pred[:,exp_no], mode='lines',name='VCD Predicted',line=dict(color = px.colors.qualitative.G10[0], dash='dash')))
fig.add_trace(go.Scatter(x=t,y=VCD[:,exp_no], mode='markers+lines',name='VCD Measured',line=dict(color = px.colors.qualitative.G10[0])))
fig.add_trace(go.Scatter(x=t,y=Glc_train_pred[:,exp_no], mode='lines',name='Glucose Predicted',line=dict(color = px.colors.qualitative.G10[1], dash='dash')),secondary_y=True)
fig.add_trace(go.Scatter(x=t,y=Glc[:,exp_no], mode='markers+lines',name='Glucose Measured',line=dict(color = px.colors.qualitative.G10[1])),secondary_y=True)

fig.update_layout(showlegend=True,title="Measured vs Predicted Glc and VCD for run: "+str(exp_no)+" in train set",xaxis_title="Time",yaxis_title="Glc")
fig.update_yaxes(title_text="VCD", secondary_y=True)
fig.show()

Plot Observed vs Predicted in training set for all times (2nd hybrid):

In [171]:
# Plot prediction over Train Set
fig = go.Figure()
for nexp in range(Nruns_train):
    fig.add_trace(go.Scatter(x=VCD[:,nexp],y=VCD_train_pred[:,nexp], mode='markers+lines',name='VCD:'+str(nexp)))
fig.update_layout(showlegend=True,title="Measured vs Predicted for VCD of Experiments in train set ",xaxis_title="VCD Measured",yaxis_title="VCD Predicted")
fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1, 'layer': 'below'}])
fig.show()

fig = go.Figure()
for nexp in range(Nruns_train):
    fig.add_trace(go.Scatter(x=Glc[:,nexp],y=Glc_train_pred[:,nexp], mode='markers+lines',name='Glc:'+str(nexp)))
fig.update_layout(showlegend=True,title="Measured vs Predicted for Glc of Experiments in train set ",xaxis_title="Glc Measured",yaxis_title="Glc Predicted")
fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1, 'layer': 'below'}])
fig.show()

In [173]:
print("Training Set")
print("\nVCD:")
RMSE_VCD = np.sqrt(np.sum((VCD.flatten()-VCD_train_pred.flatten())**2)/Nruns_train)
print("Absolute RMSE: ",RMSE_VCD)
RMSE_VCD_rel = RMSE_VCD/np.std(VCD.flatten())
print("Relative RMSE: ",RMSE_VCD_rel)
print("\nGlc:")
RMSE_Glc = np.sqrt(np.sum((Glc.flatten()-Glc_train_pred.flatten())**2)/Nruns_train)
print("Absolute RMSE: ",RMSE_Glc)
RMSE_Glc_rel = RMSE_Glc/np.std(Glc.flatten())
print("Relative RMSE: ",RMSE_Glc_rel)

Training Set

VCD:
Absolute RMSE:  3.419257143030393
Relative RMSE:  0.10835446056452364

Glc:
Absolute RMSE:  3.4642190390560885
Relative RMSE:  0.12113171670267595


### Prediction on test set

Make prediction on the test set (2nd hybrid):

In [174]:
VCD_test_pred = np.zeros((21,Nruns_test))
Glc_test_pred = np.zeros((21,Nruns_test))
for nexp in range(Nruns_test):
    t, VCD_test_pred[:,nexp], Glc_test_pred[:,nexp] = run_2nd_hybrid(DoE_test[nexp,0], DoE_test[nexp,1], DoE_test[nexp,2], 5, g_mld, k_mld)

Plot how the model fits on a test experiment (2nd hybrid):

In [177]:
# Select experiment to plot
exp_no = 1

In [178]:
# Plot base experiment run
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=t,y=VCD_test_pred[:,exp_no], mode='lines',name='VCD Predicted',line=dict(color = px.colors.qualitative.G10[0], dash='dash')))
fig.add_trace(go.Scatter(x=t,y=VCD_test[:,exp_no], mode='markers+lines',name='VCD Measured',line=dict(color = px.colors.qualitative.G10[0])))
fig.add_trace(go.Scatter(x=t,y=Glc_test_pred[:,exp_no], mode='lines',name='Glc Predicted',line=dict(color = px.colors.qualitative.G10[1], dash='dash')),secondary_y=True)
fig.add_trace(go.Scatter(x=t,y=Glc_test[:,exp_no], mode='markers+lines',name='Glc Measured',line=dict(color = px.colors.qualitative.G10[1])),secondary_y=True)

fig.update_layout(showlegend=True,title="Measured vs Predicted Glc and VCD for run: "+str(exp_no)+" in test set",xaxis_title="Time",yaxis_title="Glc")
fig.update_yaxes(title_text="VCD", secondary_y=True)
fig.show()

Plot Observed vs Predicted in the test set for all times (2nd hybrid):

In [179]:
# Plot prediction over Train Set
fig = go.Figure()
for nexp in range(Nruns_test):
    fig.add_trace(go.Scatter(x=VCD_test[:,nexp],y=VCD_test_pred[:,nexp], mode='markers+lines',name='VCD:'+str(nexp)))
fig.update_layout(showlegend=True,title="Measured vs Predicted for VCD of Experiments in test set ",xaxis_title="VCD Measured",yaxis_title="VCD Predicted")
#fig.update_layout(shapes = [{'type': 'line', 'yref': 'paper', 'xref': 'paper', 'y0': 0, 'y1': 1, 'x0': 0, 'x1': 1, 'layer': 'below'}])
fig.show()

fig = go.Figure()
for nexp in range(Nruns_test):
    fig.add_trace(go.Scatter(x=Glc_test[:,nexp],y=Glc_test_pred[:,nexp], mode='markers+lines',name='Glc:'+str(nexp)))
fig.update_layout(showlegend=True,title="Measured vs Predicted for Glc of Experiments in test set ",xaxis_title="Glc Measured",yaxis_title="Glc Predicted")
fig.show()

In [180]:
print("Test set")
print("\n VCD:")
RMSE_VCD = np.sqrt(np.sum((VCD_test.flatten()-VCD_test_pred.flatten())**2)/Nruns_test)
print("Absolute RMSE: ",RMSE_VCD)
RMSE_VCD_rel = RMSE_VCD/np.std(VCD_test.flatten())
print("Relative RMSE: ",RMSE_VCD_rel)
print("\n Glc:")
RMSE_Glc = np.sqrt(np.sum((Glc_test.flatten()-Glc_test_pred.flatten())**2)/Nruns_test)
print("Absolute RMSE: ",RMSE_Glc)
RMSE_Glc_rel = RMSE_Glc/np.std(Glc_test.flatten())
print("Relative RMSE: ",RMSE_Glc_rel)

Test set

 VCD:
Absolute RMSE:  4.021430326357565
Relative RMSE:  0.12812869678594596

 Glc:
Absolute RMSE:  3.581667702801138
Relative RMSE:  0.15262021145731147


## Discussion: Advantages and disadvantages of hybrid model


Pros:
*   Works well with small dataset
*   Describes the full dynamics
*   Extrapolation possible
*   Incorporate domain knowledge
*   ...


Cons:
*   Prone to failing due data mistakes
*   Units must be consistent
*   Prone to process dynamic misspecification
*   ...
