# Setup of Notebook

The following code clones the GitHub repository with the course files.
Subsequently, it imports all libraries and custom modules needed for this notebook.

In [53]:
!git clone https://github.com/DataHow/analytics-course-scripts.git

fatal: destination path 'analytics-course-scripts' already exists and is not an empty directory.


In [54]:
# import libraries
import numpy as np
import pandas as pd
import scipy
from scipy.stats import qmc
from scipy.integrate import solve_ivp
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings('ignore')
SEED = np.random.default_rng(42)

# Hybrid Models

We use a simple model of cell expansion, accounting for the growth of viable cell density (VCD) and the consumption of glucose (Glc), to show how hybrid models are trained and how they can predict new experimental conditions.

Furthermore, we define below the functions that specify the simulated process and our two hybrid models; they are used throughout the rest of the notebook. Each model consists of two functions: the first defines the ODE equations that describe the dynamics, and the second solves these equations for the specified parameters and initial conditions.

In [55]:
def ode_fcn(t, y, feed):
    """Right-hand side of the in-silico cell-expansion model.

    Args:
        t: Current time. It does not enter the equations; it is only part of
            the ``(t, y)`` callback signature used by the ODE solver.
        y: State sequence ``[VCD, Glc]``.
        feed: Constant term added to the glucose balance.

    Returns:
        The two-element list ``[dVCD/dt, dGlc/dt]``.
    """
    cells, glucose = y[0], y[1]
    # growth rate: Monod-type saturation in glucose (half-saturation constant 5)
    growth = glucose / (5 + glucose)
    # glucose uptake per cell: growth-associated part plus a glucose-proportional part
    uptake = 0.5 * growth + 0.05 * glucose
    # mass balances
    return [growth * cells, -uptake * cells + feed]

def run_experiment(VCD_0, Glc_0, feed, t_end, time_step=0.25):
    """Simulate one run of the in-silico process on a regular time grid.

    Args:
        VCD_0: Initial VCD.
        Glc_0: Initial glucose.
        feed: Feed term forwarded to ``ode_fcn``.
        t_end: End time; it is rounded to the nearest multiple of 0.5 before use.
        time_step: Spacing of the reported time points.

    Returns:
        ``(t, VCD, Glc)``: the time points as a list and the two state
        trajectories as arrays evaluated at those time points.
    """
    # np.arange is half-open, so one extra step is added to include the end time
    grid = np.arange(0, 0.5*round(2*t_end)+time_step, time_step)
    result = solve_ivp(
        ode_fcn,
        [grid[0], grid[-1]],
        [VCD_0, Glc_0],
        method='LSODA',
        t_eval=grid,
        args=(feed,),
        rtol=1e-6,
        atol=1e-6,
    )
    states = result.y.T  # rows: time points, columns: [VCD, Glc]
    return result.t.tolist(), states[:, 0], states[:, 1]

In [56]:
def ode_fcn_1st_hybrid(t, y, feed, g_mld, k_mld):
    """Right-hand side of the first hybrid model.

    The learned models return the total rates directly:
    ``dVCD/dt = g(VCD, Glc)`` and ``dGlc/dt = -k(VCD, Glc) + feed``.

    Args:
        t: Current time (unused; part of the ODE-solver callback signature).
        y: State ``[VCD, Glc]``.
        feed: Constant term added to the glucose balance.
        g_mld: Fitted model with ``predict(X)`` giving dVCD/dt for X of shape (1, 2).
        k_mld: Fitted model with ``predict(X)`` giving the glucose consumption term.

    Returns:
        ``[dVCD/dt, dGlc/dt]`` as plain scalars.
    """
    state = np.asarray(y, dtype=float).reshape(1, -1)  # one sample, two features
    # predict() returns a length-1 array; unwrap it so the derivative is a flat
    # (2,) vector rather than a (2, 1) array, which ODE solvers do not expect.
    dVCD_dt = float(np.ravel(g_mld.predict(state))[0])
    dGlc_dt = -float(np.ravel(k_mld.predict(state))[0]) + feed
    return [dVCD_dt, dGlc_dt]

def run_1st_hybrid(VCD_0, Glc_0, feed, t_end, g_mld, k_mld,time_step =0.25):
    """Integrate the first hybrid model on a regular time grid.

    Args:
        VCD_0: Initial VCD.
        Glc_0: Initial glucose.
        feed: Feed term forwarded to ``ode_fcn_1st_hybrid``.
        t_end: End time; rounded to the nearest multiple of 0.5 before use.
        g_mld: Fitted model for dVCD/dt.
        k_mld: Fitted model for the glucose consumption term.
        time_step: Spacing of the reported time points.

    Returns:
        ``(t, VCD, Glc)``: time points as a list and the two predicted trajectories.

    Raises:
        RuntimeError: If the ODE solver reports a failure, instead of silently
            returning a truncated trajectory.
    """
    y0 = np.array([VCD_0, Glc_0], dtype=float)
    # np.arange is half-open, so one extra step is added to include the end time
    t_span = np.arange(0, 0.5 * round(2 * t_end) + time_step, time_step)
    sol = solve_ivp(
        ode_fcn_1st_hybrid,
        [t_span[0], t_span[-1]],
        y0,
        method='LSODA',
        t_eval=t_span,
        args=(feed, g_mld, k_mld),
        rtol=1e-6,
        atol=1e-6,
    )
    if not sol.success:
        raise RuntimeError(f"1st hybrid model integration failed: {sol.message}")
    y = sol.y.T
    return sol.t.tolist(), y[:, 0], y[:, 1]

In [57]:
def ode_fcn_2nd_hybrid(t, y, feed, g_mld,k_mld):
    """Right-hand side of the second hybrid model.

    The learned models return per-cell (specific) rates that are multiplied by
    the current VCD: ``dVCD/dt = g(VCD, Glc) * VCD`` and
    ``dGlc/dt = -k(VCD, Glc) * VCD + feed``.

    Args:
        t: Current time (unused; part of the ODE-solver callback signature).
        y: State ``[VCD, Glc]``.
        feed: Constant term added to the glucose balance.
        g_mld: Fitted model with ``predict(X)`` giving the specific growth rate.
        k_mld: Fitted model with ``predict(X)`` giving the specific consumption rate.

    Returns:
        ``[dVCD/dt, dGlc/dt]`` as plain scalars.
    """
    state = np.asarray(y, dtype=float).reshape(1, -1)  # one sample, two features
    vcd_now = state[0, 0]
    # predict() returns a length-1 array; unwrap it so the derivative is a flat
    # (2,) vector rather than a (2, 1) array, which ODE solvers do not expect.
    dVCD_dt = float(np.ravel(g_mld.predict(state))[0]) * vcd_now
    dGlc_dt = -float(np.ravel(k_mld.predict(state))[0]) * vcd_now + feed
    return [dVCD_dt, dGlc_dt]


def run_2nd_hybrid(VCD_0,Glc_0,feed,t_end,mu_mld,k_mld,time_step=0.25):
    """Integrate the second hybrid model on a regular time grid.

    Args:
        VCD_0: Initial VCD.
        Glc_0: Initial glucose.
        feed: Feed term forwarded to ``ode_fcn_2nd_hybrid``.
        t_end: End time; rounded to the nearest multiple of 0.5 before use.
        mu_mld: Fitted model for the specific growth rate.
        k_mld: Fitted model for the specific glucose consumption rate.
        time_step: Spacing of the reported time points.

    Returns:
        ``(t, VCD, Glc)``: time points as a list and the two predicted trajectories.

    Raises:
        RuntimeError: If the ODE solver reports a failure, instead of silently
            returning a truncated trajectory.
    """
    y0 = np.array([VCD_0, Glc_0], dtype=float)
    # np.arange is half-open, so one extra step is added to include the end time
    t_span = np.arange(0, 0.5 * round(2 * t_end) + time_step, time_step)
    sol = solve_ivp(
        ode_fcn_2nd_hybrid,
        [t_span[0], t_span[-1]],
        y0,
        method='LSODA',
        t_eval=t_span,
        # use the growth model passed by the caller, not a global of the same role
        args=(feed, mu_mld, k_mld),
        rtol=1e-6,
        atol=1e-6,
    )
    if not sol.success:
        raise RuntimeError(f"2nd hybrid model integration failed: {sol.message}")
    y = sol.y.T
    return sol.t.tolist(), y[:, 0], y[:, 1]


First, we run the base experiment:

## Run base experiment

Here we work with the same in-silico model as before. The cell process parameters that we are able to design are the following:

*   `VCD_0` Amount of VCD at time 0
*   `Glc_0` Amount of Glc at time 0
*   `feed_rate` Amount of continuous feed each day. (In the simulator defined above, the feed term is applied over the whole run, starting at time 0.)
*   `feed_end` Day when we stop feeding. This also defines the length of the experiment.

Subsequently we plot the process evolution over time.


In [58]:
# --- Process parameters of the single reference run ---
VCD_0 = 0.5       # VCD at time 0
GLC_0 = 10        # glucose at time 0
FEED_RATE = 10    # feed term passed to the simulator
FEED_END = 5      # end time of the run
# --- Measurement granularity: spacing between reported time points ---
TIME_STEP = 0.25

In [59]:
# Simulate the reference run and plot both states over time.
# VCD is drawn on the primary (left) y-axis, glucose on the secondary (right) one.
t, VCD, GLC = run_experiment(VCD_0,GLC_0,FEED_RATE,FEED_END,time_step=TIME_STEP)
fig = make_subplots(specs=[[{"secondary_y": True}]])
fig.add_trace(go.Scatter(x=t,y=VCD, mode='markers+lines',name='VCD'),secondary_y=False)
fig.add_trace(go.Scatter(x=t,y=GLC, mode='markers+lines',name='Glucose'),secondary_y=True)
fig.update_layout(showlegend=True,title="Cell process dynamics",xaxis_title="Time",width=1000)
# Label each y-axis after the trace it actually carries.
fig.update_yaxes(title_text="VCD", secondary_y=False)
fig.update_yaxes(title_text="Glc", secondary_y=True)
fig.show()

## Create experimental design

Here we create a diverse set of experimental conditions for cell process parameters using Latin Hypercube Sampling and run the experiments.


In [60]:
# Number of experiments in the training design
NUM_RUNS = 4
# Per-factor constants used to scale the design of experiments.
# NOTE(review): the DOE cells apply each pair as value = u * PAIR[0] + PAIR[1]
# with u in [-1, 1], i.e. as (half-range, centre) rather than (lower, upper).
# Confirm that the resulting ranges are the intended ones.
VCD_0_BOUNDS = [1.5, 2]
GLC_0_BOUNDS = [10, 20]
FEED_RATE_BOUNDS = [25, 30]

In [61]:
# Run Design of Experiments (training set)
# Unscrambled Latin Hypercube: every sample sits at the centre of its stratum.
# `centered=True` was deprecated in SciPy 1.10 and removed in 1.12 in favour of
# `scramble=False`; older SciPy versions without `scramble` fall back to `centered`.
try:
    sampler = qmc.LatinHypercube(d=3,seed=42,scramble=False)
except TypeError:
    sampler = qmc.LatinHypercube(d=3,seed=42,centered=True)
# rescale the unit-cube samples from [0, 1] to [-1, 1]
doe_train_nondim = 2*sampler.random(n=NUM_RUNS)-1
# columns: [VCD_0, Glc_0, feed rate]; each factor is mapped as u * PAIR[0] + PAIR[1]
doe_train = np.zeros((NUM_RUNS,3))
doe_train[:,0] = doe_train_nondim[:,0]*VCD_0_BOUNDS[0] + VCD_0_BOUNDS[1]
doe_train[:,1] = doe_train_nondim[:,1]*GLC_0_BOUNDS[0] + GLC_0_BOUNDS[1]
doe_train[:,2] = doe_train_nondim[:,2]*FEED_RATE_BOUNDS[0] + FEED_RATE_BOUNDS[1]
# number of time points per run: t = 0, TIME_STEP, ..., FEED_END
LEN_RUNS = int(np.ceil((FEED_END/TIME_STEP)+1))
# Run experiments: one column per run
vcd = np.zeros((LEN_RUNS,NUM_RUNS))
glc = np.zeros((LEN_RUNS,NUM_RUNS))
for i in range(NUM_RUNS):
    t, vcd[:,i],glc[:,i] = run_experiment(doe_train[i,0],doe_train[i,1],doe_train[i,2],FEED_END,time_step=TIME_STEP)

In [62]:
# Plot all training runs: VCD on the primary y-axis, glucose on the secondary one.
fig = make_subplots(specs=[[{"secondary_y": True}]])
for i in range(NUM_RUNS):
    fig.add_trace(go.Scatter(x=t,y=vcd[:, i], mode='markers+lines',name='VCD:'+str(i), line=dict(color = px.colors.qualitative.G10[0])),secondary_y=False)
    fig.add_trace(go.Scatter(x=t,y=glc[:, i], mode='markers+lines',name='Glucose:'+str(i),line=dict(color = px.colors.qualitative.G10[1])),secondary_y=True)
fig.update_layout(showlegend=False,title="Cell process dynamics in DOE",xaxis_title="Time",width=1000)
# Label each y-axis after the traces it actually carries.
fig.update_yaxes(title_text="VCD", secondary_y=False)
fig.update_yaxes(title_text="Glc", secondary_y=True)
fig.show()

### Create test set

In a similar fashion, we create a test set with a significantly larger number of experiments to properly verify the model's performance on completely unseen data.

In [63]:
# Number of experiments in the test design
NUM_TEST = 100

In [64]:
# Design of Experiments for the test set (Latin Hypercube with default settings)
sampler = qmc.LatinHypercube(d=3, seed=42)
doe_test_nondim = 2*sampler.random(n=NUM_TEST)-1
# columns: [VCD_0, Glc_0, feed rate]; each factor is mapped as u * PAIR[0] + PAIR[1]
doe_test = np.zeros((NUM_TEST,3))
for factor_col, factor_pair in enumerate((VCD_0_BOUNDS, GLC_0_BOUNDS, FEED_RATE_BOUNDS)):
    doe_test[:,factor_col] = doe_test_nondim[:,factor_col]*factor_pair[0] + factor_pair[1]
# number of time points per run
LEN_RUNS = int(np.ceil((FEED_END/TIME_STEP)+1))
# Simulate every test condition; one column per run
vcd_test = np.zeros((LEN_RUNS,NUM_TEST))
glc_test = np.zeros((LEN_RUNS,NUM_TEST))
for run_id, (vcd0_i, glc0_i, feed_i) in enumerate(doe_test):
    t, vcd_test[:,run_id], glc_test[:,run_id] = run_experiment(vcd0_i, glc0_i, feed_i, FEED_END, time_step=TIME_STEP)

In [65]:
# Plot all test runs: VCD on the primary y-axis, glucose on the secondary one.
fig = make_subplots(specs=[[{"secondary_y": True}]])
for i in range(NUM_TEST):
    fig.add_trace(go.Scatter(x=t,y=vcd_test[:, i], mode='markers+lines',name='VCD:'+str(i), line=dict(color = px.colors.qualitative.G10[0])),secondary_y=False)
    fig.add_trace(go.Scatter(x=t,y=glc_test[:, i], mode='markers+lines',name='Glucose:'+str(i),line=dict(color = px.colors.qualitative.G10[1])),secondary_y=True)
fig.update_layout(showlegend=False,title="Cell process dynamics in DOE",xaxis_title="Time",width=1000)
# Label each y-axis after the traces it actually carries.
fig.update_yaxes(title_text="VCD", secondary_y=False)
fig.update_yaxes(title_text="Glc", secondary_y=True)
fig.show()

## Train Black-box GP
In this section, we are training a "black-box" Gaussian process, which is directly linking the manipulated variables to the final VCD and the final Glc.

The scope of this exercise is to show that the good predictive performance of the hybrid models shown later in the notebook cannot be ascribed to the fitting capabilities of GPs (or of any other model) when so few experiments are available.

We create models to predict VCD and GLC at the final time step as well as at the midpoint.

In [66]:
# Kernel: scaled RBF plus a white-noise term
kernel = 1.0 * RBF(length_scale=1e-1, length_scale_bounds=(1e-2, 1e2)) + WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-10, 1e1))
# One GP per target, all mapping the design (VCD_0, Glc_0, feed) to a single value:
# final VCD, midpoint VCD, final Glc, midpoint Glc (fitted in this order).
gpr_vcd_end, gpr_vcd_mid, gpr_glc_end, gpr_glc_mid = (
    GaussianProcessRegressor(kernel=kernel, random_state=0, alpha=0, n_restarts_optimizer=3).fit(doe_train, target)
    for target in (vcd[-1,:], vcd[LEN_RUNS//2,:], glc[-1,:], glc[LEN_RUNS//2,:])
)
# Predict the same four targets for every test design
vcd_end_pred, vcd_mid_pred, glc_end_pred, glc_mid_pred = (
    model.predict(doe_test)
    for model in (gpr_vcd_end, gpr_vcd_mid, gpr_glc_end, gpr_glc_mid)
)

In [67]:

# Plot observed vs predicted for the four black-box GPs (test set).
mid_row = LEN_RUNS // 2
# (title prefix, observed values, predicted values), in subplot order (row-major)
parity_panels = [
    ("VCD Final", vcd_test[-1,:], vcd_end_pred),
    ("VCD Midpoint", vcd_test[mid_row,:], vcd_mid_pred),
    ("GLC Final", glc_test[-1,:], glc_end_pred),
    ("GLC Midpoint", glc_test[mid_row,:], glc_mid_pred),
]
parity_titles = []
for label, observed, predicted in parity_panels:
    r2 = round(r2_score(observed, predicted), 3)
    # RMSE via sqrt(MSE): the `squared=False` keyword was removed in scikit-learn 1.6
    rmse = np.sqrt(mean_squared_error(observed, predicted))
    rel_rmse = round(rmse / np.std(np.array(observed)), 3)
    parity_titles.append(f"{label} - Test Set <br> R^2 = {r2} <br> Rel RMSE = {rel_rmse}")
fig = make_subplots(rows=2, cols=2, subplot_titles=parity_titles)
for panel_idx, (label, observed, predicted) in enumerate(parity_panels):
    row, col = panel_idx // 2 + 1, panel_idx % 2 + 1
    fig.add_trace(go.Scatter(x=observed,y=predicted,mode="markers"),row=row,col=col)
    # identity line spanning both observed and predicted values, so it stays
    # visible even when the predictions cover only a narrow range
    lo = min(np.min(observed), np.min(predicted))
    hi = max(np.max(observed), np.max(predicted))
    fig.add_shape(type="line",x0=lo,y0=lo,x1=hi,y1=hi, layer='below', line=dict(dash='dash'),row=row,col=col)
fig.update_layout(title_text = "Observed vs Predicted",showlegend=False, height=1000)
fig.show()

We can see that the models cannot learn very much.


## First Hybrid Model  

Let's suppose the following model (VCD dependent): 

$$
\begin{aligned}
&\frac{\mathrm{dVCD}}{\mathrm{dt}}=g(\mathrm{VCD}, \mathrm{Glc}) \\
&\frac{\mathrm{dGlc}}{\mathrm{dt}}=-k(\mathrm{VCD}, \mathrm{Glc})+\text { feed }
\end{aligned}
$$

The functions $g$ and $k$ need to learn the combination of the intrinsic process behavior (such as the GLC specific consumption) with the total amount of cells. So for GLC, the function estimates the **global** glucose consumption.

To obtain the full profiles in prediction, we reconstruct them step by step. Conceptually, for each time step (the code integrates the same equations with `solve_ivp`):

$$
VCD_{t + \Delta t} = VCD_{t} + \frac{\mathrm{dVCD}}{\mathrm{dt}}\Delta t
$$


Let's use the inverse method to get the model. First, let's estimate the derivative at each time step with a finite-difference approximation over consecutive steps:
$$
\frac{\mathrm{dVCD}}{\mathrm{dt}} \approx \frac{\Delta VCD_t}{\Delta t} = \frac{VCD_{t + \Delta t} - VCD_{t}}{\Delta t}
$$

### Find and model derivatives

In [68]:
# Finite-difference estimates of dVCD/dt (g) and of the glucose consumption
# term (k = -dGlc/dt + feed) for every time point (row) and training run (column).
g = np.zeros((LEN_RUNS, NUM_RUNS))
k = np.zeros((LEN_RUNS, NUM_RUNS))
feed_per_run = doe_train[:,2].T
# first time point: forward difference
g[0,:] = (vcd[1,:]-vcd[0,:])/TIME_STEP
k[0,:] = -((glc[1,:]-glc[0,:])/TIME_STEP)+feed_per_run
# interior time points: central difference, evaluated for all rows at once
g[1:-1,:] = 0.5*(vcd[2:,:]-vcd[:-2,:])/TIME_STEP
k[1:-1,:] = -(0.5*(glc[2:,:]-glc[:-2,:])/TIME_STEP)+feed_per_run
# last time point: backward difference
g[-1,:] = (vcd[-1,:]-vcd[-2,:])/TIME_STEP
k[-1,:] = -((glc[-1,:]-glc[-2,:])/TIME_STEP)+feed_per_run

In [69]:
# Tabulate the estimated VCD derivatives: one row per time point, one column per run
print(pd.DataFrame(data=g))

            0          1          2          3
0    2.956186   2.241224   1.374803   0.776660
1    3.316390   2.574292   1.598957   0.886273
2    4.120518   3.315900   2.086777   1.131453
3    5.108407   4.235715   2.675568   1.436626
4    6.313407   5.390574   3.407420   1.819395
5    7.768012   6.845003   4.325236   2.300939
6    9.496838   8.675804   5.478961   2.907258
7   11.502852  10.974104   6.928368   3.670371
8   13.741885  13.844875   8.744539   4.629505
9   16.079689  17.401557  11.009782   5.832147
10  18.232604  21.749865  13.814300   7.334637
11  19.731882  26.949017  17.245885   9.201521
12  20.047919  32.929307  21.364911  11.502138
13  19.021592  39.343397  26.150070  14.301213
14  17.250982  45.392459  31.393323  17.637186
15  15.674456  49.920163  36.540034  21.475791
16  14.748294  52.235011  40.605105  25.620432
17  14.350630  52.986955  42.603557  29.569885
18  14.219892  53.450166  42.613748  32.419211
19  14.192154  54.175941  41.946461  33.203419
20  14.191265

In [70]:
# Scatter plots of the training states, coloured by the estimated rates:
# top row = state over time, bottom row = VCD vs GLC phase plane;
# left column coloured by g (VCD growth rate), right column by k (GLC consumption rate).
fig = make_subplots(rows=2, cols=2, subplot_titles=(
    "VCD over time - VCD growth rate",
    "GLC over time - GLC consumption rate",
    "VCD vs GLC - VCD growth rate",
    "VCD vs GLC - GLC consumption rate",
))
# vcd/glc/g/k are (time, run) arrays; row-major flattening pairs with t repeated per run
time_flat = np.repeat(t,NUM_RUNS)
vcd_flat, glc_flat = vcd.flatten(), glc.flatten()
g_flat, k_flat = g.flatten(), k.flatten()
viridis = px.colors.sequential.Viridis
fig.add_trace(go.Scatter(x=time_flat,y=vcd_flat,mode='markers',marker=dict(color=g_flat,colorscale=viridis,showscale=True,colorbar=dict(len=1.0, x=0.45)),),row=1,col=1)
fig.add_trace(go.Scatter(x=time_flat,y=glc_flat,mode='markers',marker=dict(color=k_flat,colorscale=viridis,showscale=True,colorbar=dict(len=1.0, x=1.0)),),row=1,col=2)
fig.add_trace(go.Scatter(x=vcd_flat,y=glc_flat,mode='markers',marker=dict(color=g_flat,colorscale=viridis),),row=2,col=1)
fig.add_trace(go.Scatter(x=vcd_flat,y=glc_flat,mode='markers',marker=dict(color=k_flat,colorscale=viridis),),row=2,col=2)
fig.update_layout(title_text = "State plots of process colored by growth/consumption rates",showlegend=False,height=1000)
# (row, col, x title, y title) for each panel
for row, col, x_title, y_title in ((1, 1, "Time", "VCD"), (1, 2, "Time", "GLC"), (2, 1, "VCD", "GLC"), (2, 2, "VCD", "GLC")):
    fig.update_xaxes(title_text=x_title, row=row, col=col)
    fig.update_yaxes(title_text=y_title, row=row, col=col)
fig.show()

Let's fit the same model, but training on the calculated derivatives:

In [71]:
# Training matrix: one row per (time point, run) sample with columns [VCD, Glc]
X = np.column_stack((vcd.ravel(), glc.ravel()))
# One GP per rate, fitted on the finite-difference estimates (g first, then k)
g_mld, k_mld = (
    GaussianProcessRegressor(kernel=kernel, random_state=0, alpha=0.0, n_restarts_optimizer=3).fit(X, rates.ravel())
    for rates in (g, k)
)

### Model Evaluation on Train and Test Set

Make prediction on the training set and testing set:


In [72]:
# Simulate the 1st hybrid model for every training design (one column per run)
vcd_train_pred = np.zeros((LEN_RUNS, NUM_RUNS))
glc_train_pred = np.zeros((LEN_RUNS, NUM_RUNS))
for run_id, (vcd0_i, glc0_i, feed_i) in enumerate(doe_train):
    t, vcd_train_pred[:,run_id], glc_train_pred[:,run_id] = run_1st_hybrid(vcd0_i, glc0_i, feed_i, FEED_END, g_mld, k_mld, time_step=TIME_STEP)
# ... and for every test design
vcd_test_pred = np.zeros((LEN_RUNS,NUM_TEST))
glc_test_pred = np.zeros((LEN_RUNS,NUM_TEST))
for run_id, (vcd0_i, glc0_i, feed_i) in enumerate(doe_test):
    t, vcd_test_pred[:,run_id], glc_test_pred[:,run_id] = run_1st_hybrid(vcd0_i, glc0_i, feed_i, FEED_END, g_mld, k_mld, time_step=TIME_STEP)

Plot the fit of our model (1st hybrid) over time:

In [73]:
# Column indices of the runs shown in the profile plot
PLOT_TRAIN = 0    # run from the training set
PLOT_TEST = 49    # run from the test set

In [74]:
# Measured vs predicted profiles for one training run (left) and one test run (right).
# VCD is drawn on the primary y-axis of each panel, glucose on the secondary one.
fig = make_subplots(rows=1,cols=2,specs=[[{"secondary_y": True},{"secondary_y": True}]],subplot_titles=("Train Set prediction","Test Set prediction"))
vcd_color, glc_color = px.colors.qualitative.G10[0], px.colors.qualitative.G10[1]
# (column, predicted VCD, measured VCD, predicted Glc, measured Glc)
profile_panels = (
    (1, vcd_train_pred[:,PLOT_TRAIN], vcd[:,PLOT_TRAIN], glc_train_pred[:,PLOT_TRAIN], glc[:,PLOT_TRAIN]),
    (2, vcd_test_pred[:,PLOT_TEST], vcd_test[:,PLOT_TEST], glc_test_pred[:,PLOT_TEST], glc_test[:,PLOT_TEST]),
)
for col, vcd_hat, vcd_obs, glc_hat, glc_obs in profile_panels:
    # list each series once in the legend; legendgroup ties the two panels together
    in_legend = col == 1
    fig.add_trace(go.Scatter(x=t,y=vcd_hat, mode='lines',name='VCD Predicted',legendgroup='VCD Predicted',showlegend=in_legend,line=dict(color=vcd_color, dash='dash')),secondary_y=False,row=1,col=col)
    fig.add_trace(go.Scatter(x=t,y=vcd_obs, mode='markers+lines',name='VCD Measured',legendgroup='VCD Measured',showlegend=in_legend,line=dict(color=vcd_color)),secondary_y=False,row=1,col=col)
    fig.add_trace(go.Scatter(x=t,y=glc_hat, mode='lines',name='Glucose Predicted',legendgroup='Glucose Predicted',showlegend=in_legend,line=dict(color=glc_color, dash='dash')),secondary_y=True,row=1,col=col)
    fig.add_trace(go.Scatter(x=t,y=glc_obs, mode='markers+lines',name='Glucose Measured',legendgroup='Glucose Measured',showlegend=in_legend,line=dict(color=glc_color)),secondary_y=True,row=1,col=col)
fig.update_layout(showlegend=True,title="Predicted profile Glc and VCD profile over time for run id: "+str(PLOT_TRAIN)+" in train set (left) & "+str(PLOT_TEST)+" in test set (right)")
# Title the axes of both panels, each y-axis after the traces it actually carries.
fig.update_xaxes(title_text="Time")
fig.update_yaxes(title_text="VCD", secondary_y=False)
fig.update_yaxes(title_text="Glc", secondary_y=True)
fig.show()

Plot observed vs predicted in train and test set for (1st hybrid):

In [75]:
# Observed vs predicted for the 1st hybrid model: train set (left column) and
# test set (right column), VCD in the top row and GLC in the bottom row.
# All arrays are (time point, run); the metrics treat each run as one output.
parity_panels = [
    ("VCD - Train Set", vcd, vcd_train_pred),
    ("VCD - Test Set", vcd_test, vcd_test_pred),
    ("GLC - Train Set", glc, glc_train_pred),
    ("GLC - Test Set", glc_test, glc_test_pred),
]
parity_titles = []
for label, observed, predicted in parity_panels:
    r2 = round(r2_score(observed, predicted), 3)
    # Per-run RMSE averaged over runs. Written as sqrt(MSE) because the
    # `squared=False` keyword was removed in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(observed, predicted, multioutput="raw_values")).mean()
    # normalise by the spread of the data the RMSE was computed on
    # (test-set errors by the test-set std, not by the training-set std)
    rel_rmse = round(rmse / np.std(np.array(observed)), 3)
    parity_titles.append(f"{label} <br> R^2 = {r2} <br> Rel RMSE = {rel_rmse}")
fig = make_subplots(rows=2, cols=2, subplot_titles=parity_titles)
for panel_idx, (label, observed, predicted) in enumerate(parity_panels):
    row, col = panel_idx // 2 + 1, panel_idx % 2 + 1
    for i in range(observed.shape[1]):
        fig.add_trace(go.Scatter(x=observed[:,i],y=predicted[:,i],mode="markers",name=f"run id {i}"),row=row,col=col)
    # identity line over the observed range
    fig.add_shape(type="line",x0=observed.min(),y0=observed.min(),x1=observed.max(),y1=observed.max(), layer='above', line=dict(dash='dash'),row=row,col=col)
fig.update_layout(title_text = "Observed vs Predicted",showlegend=False, height=1000)
fig.show()

We can see that the model generally performs very well, apart from a few experiments.

## Second Hybrid Model

Let's suppose the following model (VCD independent):

$$
\begin{aligned}
&\frac{\mathrm{dVCD}}{\mathrm{dt}}=\mu(\mathrm{VCD}, \mathrm{Glc}) \cdot \mathrm{VCD} \\
&\frac{\mathrm{dGlc}}{\mathrm{dt}}=-c(\mathrm{VCD}, \mathrm{Glc}) \cdot \mathrm{VCD}+\text { feed }
\end{aligned}
$$

The functions learn the intrinsic process behavior of each cell, such as how much GLC it consumes. This function is then multiplied by the overall amount of cells in the process.


To obtain the full profiles in prediction, we reconstruct them step by step. Conceptually, for each time step (the code integrates the same equations with `solve_ivp`):

$$
VCD_{t + \Delta t} = VCD_{t} + \frac{\mathrm{dVCD}}{\mathrm{dt}}\Delta t
$$


As before, let's use the inverse method to get the model. First, let's estimate the derivative at each time step with a finite-difference approximation over consecutive steps:
$$
\frac{\mathrm{dVCD}}{\mathrm{dt}} \approx \frac{\Delta VCD_t}{\Delta t} = \frac{VCD_{t + \Delta t} - VCD_{t}}{\Delta t}
$$


### Find and model derivatives

In [76]:
# Estimate the specific (per-cell) rates for every time point (row) and training run (column).
# np.gradient uses central differences at interior points and one-sided
# (forward / backward) differences at the first and last time point.
dVcddt = np.gradient(vcd, TIME_STEP, axis=0)
dGlcdt = np.gradient(glc, TIME_STEP, axis=0)
# specific growth rate: (dVCD/dt) / VCD
g = dVcddt/vcd
# specific consumption rate: (-dGlc/dt + feed) / VCD, with one feed value per run.
# Every row is divided by the VCD of the same time point, including the first row.
k = (-dGlcdt+doe_train[:,2].T)/vcd

In [77]:
# Tabulate the estimated rates stored in g: one row per time point, one column per run
print(pd.DataFrame(data=g))

           0         1         2         3
0   0.945979  0.943673  0.846032  0.887611
1   0.858269  0.877010  0.812189  0.828939
2   0.861457  0.905453  0.860712  0.858374
3   0.862279  0.922160  0.888277  0.878728
4   0.860442  0.932625  0.905684  0.893415
5   0.855413  0.939146  0.917180  0.904248
6   0.846315  0.942765  0.924738  0.912247
7   0.831766  0.943892  0.929324  0.918002
8   0.809640  0.942498  0.931354  0.921826
9   0.776783  0.938146  0.930859  0.923836
10  0.728935  0.929865  0.927510  0.923970
11  0.661773  0.915891  0.920532  0.921969
12  0.574791  0.893244  0.908492  0.917314
13  0.477442  0.857370  0.888939  0.909090
14  0.388628  0.802887  0.857930  0.895764
15  0.323411  0.727862  0.809952  0.874779
16  0.282390  0.640947  0.740132  0.842017
17  0.256994  0.559511  0.651268  0.791482
18  0.239384  0.494954  0.559502  0.717043
19  0.225450  0.446160  0.483682  0.619816
20  0.213409  0.404234  0.428826  0.533491


Let's plot the data:

In [78]:
# Scatter plots of the training states, coloured by the estimated specific rates:
# top row = state over time, bottom row = VCD vs GLC phase plane;
# left column coloured by g (specific growth rate), right column by k (specific consumption rate).
fig = make_subplots(rows=2, cols=2, subplot_titles=(
    "VCD over time - VCD spec growth rate",
    "GLC over time - GLC spec consumption rate",
    "VCD vs GLC - VCD spec growth rate",
    "VCD vs GLC - GLC spec consumption rate",
))
# vcd/glc/g/k are (time, run) arrays; row-major flattening pairs with t repeated per run
time_flat = np.repeat(t,NUM_RUNS)
vcd_flat, glc_flat = vcd.flatten(), glc.flatten()
g_flat, k_flat = g.flatten(), k.flatten()
viridis = px.colors.sequential.Viridis
fig.add_trace(go.Scatter(x=time_flat,y=vcd_flat,mode='markers',marker=dict(color=g_flat,colorscale=viridis,showscale=True,colorbar=dict(len=1.0, x=0.45)),),row=1,col=1)
fig.add_trace(go.Scatter(x=time_flat,y=glc_flat,mode='markers',marker=dict(color=k_flat,colorscale=viridis,showscale=True,colorbar=dict(len=1.0, x=1.0)),),row=1,col=2)
fig.add_trace(go.Scatter(x=vcd_flat,y=glc_flat,mode='markers',marker=dict(color=g_flat,colorscale=viridis),),row=2,col=1)
fig.add_trace(go.Scatter(x=vcd_flat,y=glc_flat,mode='markers',marker=dict(color=k_flat,colorscale=viridis),),row=2,col=2)
fig.update_layout(title_text = "State plots of process colored by growth/consumption rates",showlegend=False,height=1000)
# (row, col, x title, y title) for each panel
for row, col, x_title, y_title in ((1, 1, "Time", "VCD"), (1, 2, "Time", "GLC"), (2, 1, "VCD", "GLC"), (2, 2, "VCD", "GLC")):
    fig.update_xaxes(title_text=x_title, row=row, col=col)
    fig.update_yaxes(title_text=y_title, row=row, col=col)
fig.show()

Let's fit a model:

In [79]:
# Training matrix: one row per (time point, run) sample with columns [VCD, Glc]
X = np.column_stack((vcd.ravel(), glc.ravel()))
# Disabled variant of the kernel with wider length-scale bounds:
#kernel2 = 1.0 * RBF(length_scale=1e-1, length_scale_bounds=(1e-3, 1e3)) + WhiteKernel(noise_level=1e-2, noise_level_bounds=(1e-10, 1e1))
# One GP per specific rate, fitted on the finite-difference estimates (g first, then k)
g_mld, k_mld = (
    GaussianProcessRegressor(kernel=kernel, random_state=0, alpha=0.0, n_restarts_optimizer=3).fit(X, rates.ravel())
    for rates in (g, k)
)

### Model Evaluation on Train and Test Set

Make prediction on the training set and testing set:


In [None]:
# Simulate the 2nd hybrid model for every training design (one column per run)
vcd_train_pred = np.zeros((LEN_RUNS, NUM_RUNS))
glc_train_pred = np.zeros((LEN_RUNS, NUM_RUNS))
for run_id, (vcd0_i, glc0_i, feed_i) in enumerate(doe_train):
    t, vcd_train_pred[:,run_id], glc_train_pred[:,run_id] = run_2nd_hybrid(vcd0_i, glc0_i, feed_i, FEED_END, g_mld, k_mld, time_step=TIME_STEP)
# ... and for every test design
vcd_test_pred = np.zeros((LEN_RUNS,NUM_TEST))
glc_test_pred = np.zeros((LEN_RUNS,NUM_TEST))
for run_id, (vcd0_i, glc0_i, feed_i) in enumerate(doe_test):
    t, vcd_test_pred[:,run_id], glc_test_pred[:,run_id] = run_2nd_hybrid(vcd0_i, glc0_i, feed_i, FEED_END, g_mld, k_mld, time_step=TIME_STEP)

Plot the fit of our model (2nd hybrid) over time:

In [None]:
# Column indices of the runs shown in the profile plot
PLOT_TRAIN = 0    # run from the training set
PLOT_TEST = 49    # run from the test set

In [None]:
# Measured vs predicted profiles for one training run (left) and one test run (right).
# VCD is drawn on the primary y-axis of each panel, glucose on the secondary one.
fig = make_subplots(rows=1,cols=2,specs=[[{"secondary_y": True},{"secondary_y": True}]],subplot_titles=("Train Set prediction","Test Set prediction"))
vcd_color, glc_color = px.colors.qualitative.G10[0], px.colors.qualitative.G10[1]
# (column, predicted VCD, measured VCD, predicted Glc, measured Glc)
profile_panels = (
    (1, vcd_train_pred[:,PLOT_TRAIN], vcd[:,PLOT_TRAIN], glc_train_pred[:,PLOT_TRAIN], glc[:,PLOT_TRAIN]),
    (2, vcd_test_pred[:,PLOT_TEST], vcd_test[:,PLOT_TEST], glc_test_pred[:,PLOT_TEST], glc_test[:,PLOT_TEST]),
)
for col, vcd_hat, vcd_obs, glc_hat, glc_obs in profile_panels:
    # list each series once in the legend; legendgroup ties the two panels together
    in_legend = col == 1
    fig.add_trace(go.Scatter(x=t,y=vcd_hat, mode='lines',name='VCD Predicted',legendgroup='VCD Predicted',showlegend=in_legend,line=dict(color=vcd_color, dash='dash')),secondary_y=False,row=1,col=col)
    fig.add_trace(go.Scatter(x=t,y=vcd_obs, mode='markers+lines',name='VCD Measured',legendgroup='VCD Measured',showlegend=in_legend,line=dict(color=vcd_color)),secondary_y=False,row=1,col=col)
    fig.add_trace(go.Scatter(x=t,y=glc_hat, mode='lines',name='Glucose Predicted',legendgroup='Glucose Predicted',showlegend=in_legend,line=dict(color=glc_color, dash='dash')),secondary_y=True,row=1,col=col)
    fig.add_trace(go.Scatter(x=t,y=glc_obs, mode='markers+lines',name='Glucose Measured',legendgroup='Glucose Measured',showlegend=in_legend,line=dict(color=glc_color)),secondary_y=True,row=1,col=col)
fig.update_layout(showlegend=True,title="Predicted profile Glc and VCD profile over time for run id: "+str(PLOT_TRAIN)+" in train set (left) & "+str(PLOT_TEST)+" in test set (right)")
# Title the axes of both panels, each y-axis after the traces it actually carries.
fig.update_xaxes(title_text="Time")
fig.update_yaxes(title_text="VCD", secondary_y=False)
fig.update_yaxes(title_text="Glc", secondary_y=True)
fig.show()

Plot Observed vs Predicted for (2nd hybrid):

In [None]:
# Observed vs predicted for the 2nd hybrid model: train set (left column) and
# test set (right column), VCD in the top row and GLC in the bottom row.
# All arrays are (time point, run); the metrics treat each run as one output.
parity_panels = [
    ("VCD - Train Set", vcd, vcd_train_pred),
    ("VCD - Test Set", vcd_test, vcd_test_pred),
    ("GLC - Train Set", glc, glc_train_pred),
    ("GLC - Test Set", glc_test, glc_test_pred),
]
parity_titles = []
for label, observed, predicted in parity_panels:
    r2 = round(r2_score(observed, predicted), 3)
    # Per-run RMSE averaged over runs. Written as sqrt(MSE) because the
    # `squared=False` keyword was removed in scikit-learn 1.6.
    rmse = np.sqrt(mean_squared_error(observed, predicted, multioutput="raw_values")).mean()
    # normalise by the spread of the data the RMSE was computed on
    # (test-set errors by the test-set std, not by the training-set std)
    rel_rmse = round(rmse / np.std(np.array(observed)), 3)
    parity_titles.append(f"{label} <br> R^2 = {r2} <br> Rel RMSE = {rel_rmse}")
fig = make_subplots(rows=2, cols=2, subplot_titles=parity_titles)
for panel_idx, (label, observed, predicted) in enumerate(parity_panels):
    row, col = panel_idx // 2 + 1, panel_idx % 2 + 1
    for i in range(observed.shape[1]):
        fig.add_trace(go.Scatter(x=observed[:,i],y=predicted[:,i],mode="markers",name=f"run id {i}"),row=row,col=col)
    # identity line over the observed range
    fig.add_shape(type="line",x0=observed.min(),y0=observed.min(),x1=observed.max(),y1=observed.max(), layer='above', line=dict(dash='dash'),row=row,col=col)
fig.update_layout(title_text = "Observed vs Predicted",showlegend=False, height=1000)
fig.show()

## Discussion: Advantages and disadvantages of hybrid model


Pros:
*   Works well with small dataset
*   Describes the full dynamics
*   Extrapolation possible
*   Incorporate domain knowledge
*   ...


Cons:
*   Prone to failing due to data mistakes
*   Units must be consistent
*   Prone to process dynamic misspecification
*   ...
