diff --git a/.gitignore b/.gitignore index a960a8b..8524870 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,6 @@ docs/src/**/*.html docs/src/**/*.ipynb docs/src/**/*Manifest.toml docs/src_stash/*.ipynb +docs/src/tutorials/test.* +docs/src/tutorials/tmp* + diff --git a/Project.toml b/Project.toml index e4a390d..c788bd0 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "HybridVariationalInference" uuid = "a108c475-a4e2-4021-9a84-cfa7df242f64" authors = ["Thomas Wutzler and contributors"] -version = "1.0.0-DEV" +version = "0.2" [deps] Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" @@ -10,15 +10,20 @@ ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" Combinatorics = "861a8166-3701-5b0c-9a16-15d98fcdc6aa" CommonSolve = "38540f10-b2f7-11e9-35d8-d573e4eb0ff2" ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" DistributionFits = "45214091-1ed4-4409-9bcf-fdb48a05e921" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" +LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" MLUtils = "f1d291b0-491e-4a28-83b9-f70985020b54" Missings = "e1d29d7a-bbdc-5cf2-9ac0-f12de2c33e28" +NaNMath = "77ba4419-2d1f-58cd-9bb1-8ffee604a2e3" +Optimisers = "3bd65402-5787-11e9-1adc-39752487f4e2" Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" StableRNGs = "860ef19b-820b-49d6-a774-d7a799459cd3" @@ -26,6 +31,7 @@ StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" StatsFuns = "4c63d2b9-4356-54db-8cca-17b64c39e42c" Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" +Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [weakdeps] CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" @@ -47,17 +53,22 @@ ChainRulesCore = "1.25" Combinatorics = "1.0.2" CommonSolve = "0.2.4" ComponentArrays = "0.15.19" +DifferentiationInterface = "0.6.54, 0.7" DistributionFits = "0.3.9" Distributions = "0.25.117" FillArrays = "1.13.0" Flux = "0.14, 0.15, 0.16" Functors = "0.4, 0.5" GPUArraysCore = "0.1, 0.2" +KernelAbstractions = "0.9.34" LinearAlgebra = "1.10" +LogExpFunctions = "0.3.29" Lux = "1.4.2" MLDataDevices = "1.5, 1.6" MLUtils = "0.4.5" Missings = "1.2.0" +NaNMath = "1.1.3" +Optimisers = "0.4.6" Optimization = "3.19.3, 4" Random = "1.10.0" SimpleChains = "0.4" @@ -66,6 +77,7 @@ StaticArrays = "1.9.13" StatsBase = "0.34.4" StatsFuns = "1.3.2" Test = "1.10" +Zygote = "0.7.10" julia = "1.10" [workspace] diff --git a/dev/doubleMM.jl b/dev/doubleMM.jl index 021b936..428a7eb 100644 --- a/dev/doubleMM.jl +++ b/dev/doubleMM.jl @@ -42,7 +42,7 @@ train_dataloader = MLUtils.DataLoader( (xM, xP, y_o, y_unc, 1:n_site); batchsize = n_batch, partial = false) σ_o = exp.(y_unc[:, 1] / 2) -# assign the train_loader, otherwise it eatch time creates another version of synthetic data +# assign the train_loader, otherwise it each time creates another version of synthetic data prob0 = HybridProblem(prob0_; train_dataloader) #tmp = HVI.get_hybridproblem_ϕunc(prob0; scenario) #prob0.covar @@ -248,7 +248,7 @@ end (y2_K1global, θsP2_K1global, θsMs2_K1global) = (y, θsP, θsMs); end -() -> begin # otpimize using LUX +() -> begin # optimize using 
LUX #using Lux g_lux = Lux.Chain( # dense layer with bias that maps to 8 outputs and applies `tanh` activation @@ -560,7 +560,8 @@ end end #ζi = first(eachrow(Array(chain))) -f_allsites = get_hybridproblem_PBmodel(prob0; scenario, use_all_sites = true) +f = get_hybridproblem_PBmodel(probc; scenario) +f_allsites = create_nsite_applicator(f, n_site) #ζs = mapreduce(ζi -> transposeMs(ζi, intm_PMs_gen, true), hcat, eachrow(Array(chain))); ζsP = Array(chain)[:,1:n_θP]' ζsMst = reshape(Array(chain)[:,(n_θP+1) : end], n_sample_NUTS, n_site, n_θM) diff --git a/docs/_quarto.yml b/docs/_quarto.yml index 4fff7ac..6d3e418 100644 --- a/docs/_quarto.yml +++ b/docs/_quarto.yml @@ -3,6 +3,10 @@ project: render: - src/tutorials/basic_cpu.qmd - src/tutorials/*.qmd +# julia: +# # workaround for quarto not picking up the correct environment variable +# # https://github.com/quarto-dev/quarto-cli/issues/13416#issuecomment-3333700286 +# env: ["JULIA_DEPOT_PATH=/User/homes/twutz/scratch/twutz/julia_gpu_depots"] diff --git a/docs/src/explanation/theory_hvi.md b/docs/src/explanation/theory_hvi.md index 18053a7..60834a0 100644 --- a/docs/src/explanation/theory_hvi.md +++ b/docs/src/explanation/theory_hvi.md @@ -28,7 +28,7 @@ In order to learn $\phi_g$, the user needs to provide a batch of $i \in \{1 \ldo ## Estimation using the ELBO In order to find the parameters of the approximation of the posterior, HVI -minizes the KL divergence between the approximation and the true posterior. +minimizes the KL divergence between the approximation and the true posterior. This is achieved by maximizing the evidence lower bound (ELBO). $$\mathcal{L}(\phi) = \mathbb{E}_{q(\theta)} \left[\log p(y,\theta) \right] - \mathbb{E}_{q(\theta)} \left[\log q(\theta) \right]$$ @@ -128,7 +128,7 @@ $\phi = (\phi_P, \phi_g, \phi_u)$, comprises - $\phi_P = \mu_{\zeta_P}$: the means of the distributions of the transformed global parameters, - $\phi_g$: the parameters of the machine learning model, and -- $\phi_u$: paramerization of $\Sigma_\zeta$ that is additional to the means. +- $\phi_u$: parameterization of $\Sigma_\zeta$ that is additional to the means. ### Details Specifically, $\phi_u= (log\sigma^2_P, log\sigma^2_{M0}, log\sigma^2_{M\eta}, a_P, a_M)$, diff --git a/docs/src/tutorials/basic_cpu.md b/docs/src/tutorials/basic_cpu.md index e1a80c9..c3475ec 100644 --- a/docs/src/tutorials/basic_cpu.md +++ b/docs/src/tutorials/basic_cpu.md @@ -27,7 +27,8 @@ The example process based model (PBM) predicts a double-Monod constrained rate for different substrate concentrations, `S1`, and `S2`. $$ -y = r_0+ r_1 \frac{S_1}{K_1 + S_1} \frac{S_2}{K_2 + S_2}$$ +y = r_0+ r_1 \frac{S_1}{K_1 + S_1} \frac{S_2}{K_2 + S_2} +$$ ``` julia function f_doubleMM(θc::CA.ComponentVector{ET}, x) where ET @@ -49,7 +50,7 @@ access the components by their symbolic names in the provided `ComponentArray`. HVI requires the evaluation of the likelihood of the predictions. It corresponds to the cost of predictions given some observations. -The user specifies a function of the negative log-Likehood +The user specifies a function of the negative log-Likelihood `neg_logden(obs, pred, uncertainty_parameters)`, where all of the parameters are arrays with columns for sites. @@ -229,10 +230,10 @@ given a vector of global parameters, and a matrix of site parameters to invocation of the process based model (PBM), defined at the beginning.
``` julia -f_batch = f_allsites = PBMSiteApplicator(f_doubleMM; θP, θM, θFix, xPvec=xP[:,1]) +f_batch = PBMSiteApplicator(f_doubleMM; θP, θM, θFix, xPvec=xP[:,1]) prob = HybridProblem(θP, θM, g_chain_scaled, ϕg0, - f_batch, f_allsites, priors_dict, py, + f_batch, priors_dict, py, transM, transP, train_dataloader, n_covar, n_site, n_batch) ``` @@ -241,6 +242,11 @@ prob = HybridProblem(θP, θM, g_chain_scaled, ϕg0, Eventually, having assembled all the moving parts of the HVI, we can perform the inversion. +``` julia +# silence warning of no GPU backend found (because we did not import CUDA here) +ENV["MLDATADEVICES_SILENCE_WARN_NO_GPU"] = 1 +``` + ``` julia using OptimizationOptimisers import Zygote @@ -313,8 +319,7 @@ The HVI Problem needs to be updated with this new applicator. ``` julia f_batch = PBMPopulationApplicator(f_doubleMM_sites, n_batch; θP, θM, θFix, xPvec=xP[:,1]) -f_allsites = PBMPopulationApplicator(f_doubleMM_sites, n_site; θP, θM, θFix, xPvec=xP[:,1]) -probo_sites = HybridProblem(probo; f_batch, f_allsites) +probo_sites = HybridProblem(probo; f_batch) ``` For numerical efficiency, the number of sites within one batch is part of the @@ -345,8 +350,7 @@ module `Main` to allow for easier reloading with JLD2. ``` julia f_batch = PBMPopulationApplicator(DoubleMM.f_doubleMM_sites, n_batch; θP, θM, θFix, xPvec=xP[:,1]) -f_allsites = PBMPopulationApplicator(DoubleMM.f_doubleMM_sites, n_site; θP, θM, θFix, xPvec=xP[:,1]) -probo2 = HybridProblem(probo; f_batch, f_allsites) +probo2 = HybridProblem(probo; f_batch) ``` ``` julia diff --git a/docs/src/tutorials/basic_cpu.qmd b/docs/src/tutorials/basic_cpu.qmd index 37d8449..39cf0a5 100644 --- a/docs/src/tutorials/basic_cpu.qmd +++ b/docs/src/tutorials/basic_cpu.qmd @@ -36,7 +36,8 @@ The example process based model (PBM) predicts a double-Monod constrained rate for different substrate concentrations, `S1`, and `S2`. $$ -y = r_0+ r_1 \frac{S_1}{K_1 + S_1} \frac{S_2}{K_2 + S_2}$$ +y = r_0+ r_1 \frac{S_1}{K_1 + S_1} \frac{S_2}{K_2 + S_2} +$$ ```{julia} function f_doubleMM(θc::CA.ComponentVector{ET}, x) where ET @@ -58,7 +59,7 @@ access the components by their symbolic names in the provided `ComponentArray`. HVI requires the evaluation of the likelihood of the predictions. It corresponds to the cost of predictions given some observations. -The user specifies a function of the negative log-Likehood +The user specifies a function of the negative log-Likelihood `neg_logden(obs, pred, uncertainty_parameters)`, where all of the parameters are arrays with columns for sites. @@ -239,10 +240,10 @@ given a vector of global parameters, and a matrix of site parameters to invocation of the process based model (PBM), defined at the beginning.
```{julia} -f_batch = f_allsites = PBMSiteApplicator(f_doubleMM; θP, θM, θFix, xPvec=xP[:,1]) +f_batch = PBMSiteApplicator(f_doubleMM; θP, θM, θFix, xPvec=xP[:,1]) prob = HybridProblem(θP, θM, g_chain_scaled, ϕg0, - f_batch, f_allsites, priors_dict, py, + f_batch, priors_dict, py, transM, transP, train_dataloader, n_covar, n_site, n_batch) ``` @@ -265,7 +266,7 @@ y1 = f_batch(CA.getdata(θP), CA.getdata(θMs), CA.getdata(x_batch))[2] #using Cthulhu #@descend_code_warntype f_batch(CA.getdata(θP), CA.getdata(θMs), CA.getdata(x_batch)) prob0 = HVI.DoubleMM.DoubleMMCase() - f_batch0 = get_hybridproblem_PBmodel(prob0; use_all_sites = false) + f_batch0 = get_hybridproblem_PBmodel(prob0) y1f = f_batch0(θP, θMs, x_batch)[2] y1 .- y1f # equal end @@ -276,6 +277,11 @@ end Eventually, having assembled all the moving parts of the HVI, we can perform the inversion. +```{julia} +# silence warning of no GPU backend found (because we did not import CUDA here) +ENV["MLDATADEVICES_SILENCE_WARN_NO_GPU"] = 1 +``` + ```{julia} using OptimizationOptimisers import Zygote @@ -349,8 +355,7 @@ The HVI Problem needs to be updated with this new applicator. ```{julia} f_batch = PBMPopulationApplicator(f_doubleMM_sites, n_batch; θP, θM, θFix, xPvec=xP[:,1]) -f_allsites = PBMPopulationApplicator(f_doubleMM_sites, n_site; θP, θM, θFix, xPvec=xP[:,1]) -probo_sites = HybridProblem(probo; f_batch, f_allsites) +probo_sites = HybridProblem(probo; f_batch) ``` For numerical efficiency, the number of sites within one batch is part of the @@ -380,8 +385,7 @@ module `Main` to allow for easier reloading with JLD2. ```{julia} f_batch = PBMPopulationApplicator(DoubleMM.f_doubleMM_sites, n_batch; θP, θM, θFix, xPvec=xP[:,1]) -f_allsites = PBMPopulationApplicator(DoubleMM.f_doubleMM_sites, n_site; θP, θM, θFix, xPvec=xP[:,1]) -probo2 = HybridProblem(probo; f_batch, f_allsites) +probo2 = HybridProblem(probo; f_batch) ``` ```{julia} @@ -397,4 +401,4 @@ end #| eval: false #| echo: false probo = load(fname, "probo"; iotype = IOStream); -``` \ No newline at end of file +``` diff --git a/docs/src/tutorials/blocks_corr.md b/docs/src/tutorials/blocks_corr.md index 54da1fe..27a8629 100644 --- a/docs/src/tutorials/blocks_corr.md +++ b/docs/src/tutorials/blocks_corr.md @@ -1,4 +1,4 @@ -# How to model indenpendent parameter-blocks in the posterior +# How to model independent parameter-blocks in the posterior ``` @meta diff --git a/docs/src/tutorials/blocks_corr.qmd b/docs/src/tutorials/blocks_corr.qmd index 380903a..16a0323 100644 --- a/docs/src/tutorials/blocks_corr.qmd +++ b/docs/src/tutorials/blocks_corr.qmd @@ -1,5 +1,5 @@ --- -title: "How to model indenpendent parameter-blocks in the posterior" +title: "How to model independent parameter-blocks in the posterior" engine: julia execute: echo: true diff --git a/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-11-output-1.png b/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-11-output-1.png index b5f5d20..67efac2 100644 Binary files a/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-11-output-1.png and b/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-11-output-1.png differ diff --git a/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-12-output-1.png b/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-12-output-1.png index 80db561..7214bb1 100644 Binary files a/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-12-output-1.png and b/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-12-output-1.png differ diff 
--git a/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-13-output-1.png b/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-13-output-1.png index d1f156b..b8f9977 100644 Binary files a/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-13-output-1.png and b/docs/src/tutorials/blocks_corr_files/figure-commonmark/cell-13-output-1.png differ diff --git a/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-10-output-1.png b/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-10-output-1.png index 8ee22fb..d56fea8 100644 Binary files a/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-10-output-1.png and b/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-10-output-1.png differ diff --git a/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-11-output-1.png b/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-11-output-1.png index b8d8e64..fd0d188 100644 Binary files a/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-11-output-1.png and b/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-11-output-1.png differ diff --git a/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-12-output-1.png b/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-12-output-1.png index 7aa7d4d..606f068 100644 Binary files a/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-12-output-1.png and b/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-12-output-1.png differ diff --git a/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-9-output-1.png b/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-9-output-1.png index 1b863b6..6c3e05f 100644 Binary files a/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-9-output-1.png and b/docs/src/tutorials/corr_site_global_files/figure-commonmark/cell-9-output-1.png differ diff --git a/docs/src/tutorials/inspect_results.md b/docs/src/tutorials/inspect_results.md index 0c41fe8..7849c63 100644 --- a/docs/src/tutorials/inspect_results.md +++ b/docs/src/tutorials/inspect_results.md @@ -39,7 +39,7 @@ using function [`sample_posterior`](@ref). using StableRNGs rng = StableRNG(112) n_sample_pred = 400 -(; θsP, θsMs) = sample_posterior(rng, probo; n_sample_pred) +(; θsP, θsMs) = sample_posterior(rng, probo; n_sample_pred, is_testmode = true) ``` Let's look at the results. diff --git a/docs/src/tutorials/inspect_results.qmd b/docs/src/tutorials/inspect_results.qmd index c10bc6c..d22d91e 100644 --- a/docs/src/tutorials/inspect_results.qmd +++ b/docs/src/tutorials/inspect_results.qmd @@ -53,9 +53,7 @@ probo = load(fname, "probo"); _xP_batch = first(probo.train_dataloader)[2] f_batch = PBMPopulationApplicator( f_doubleMM_sites, probo.n_batch; probo.θP, probo.θM, θFix, xPvec=_xP_batch[:,1]) -f_allsites = PBMPopulationApplicator( - f_doubleMM_sites, probo.n_site; probo.θP, probo.θM, θFix, xPvec=_xP_batch[:,1]) -probo = HybridProblem(probo; f_batch, f_allsites) +probo = HybridProblem(probo; f_batch) ``` ## Sample the posterior @@ -66,7 +64,7 @@ using function [`sample_posterior`](@ref). using StableRNGs rng = StableRNG(112) n_sample_pred = 400 -(; θsP, θsMs) = sample_posterior(rng, probo; n_sample_pred) +(; θsP, θsMs) = sample_posterior(rng, probo; n_sample_pred, is_testmode = true) ``` Let's look at the results.
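The hunks above consistently remove the separate `f_allsites` applicator; a problem now stores only `f_batch`, and an applicator for a different number of sites is derived on demand. A minimal sketch of that replacement pattern (assuming a `probo::HybridProblem` as constructed in the tutorials above; both functions are exported by the package in this PR):

``` julia
using HybridVariationalInference

# applicator sized for probo.n_batch sites, as stored in the problem
f_batch = get_hybridproblem_PBmodel(probo)
# derive an applicator for all probo.n_site sites (replaces the old f_allsites)
f_allsites = create_nsite_applicator(f_batch, probo.n_site)
```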
diff --git a/docs/src/tutorials/intermediate/basic_cpu_results.jld2 b/docs/src/tutorials/intermediate/basic_cpu_results.jld2 index 4046a37..4e7d91f 100644 Binary files a/docs/src/tutorials/intermediate/basic_cpu_results.jld2 and b/docs/src/tutorials/intermediate/basic_cpu_results.jld2 differ diff --git a/docs/src/tutorials/logden_user.md b/docs/src/tutorials/logden_user.md index 240f4a1..3ccd55d 100644 --- a/docs/src/tutorials/logden_user.md +++ b/docs/src/tutorials/logden_user.md @@ -5,7 +5,7 @@ CurrentModule = HybridVariationalInference ``` -This guide shows how the user can specify a customized log-density function. +This guide shows how the user can specify a customized log-Likelihood function. ## Motivation diff --git a/docs/src/tutorials/logden_user_files/figure-commonmark/cell-10-output-1.png b/docs/src/tutorials/logden_user_files/figure-commonmark/cell-10-output-1.png index dd132d4..dff53ac 100644 Binary files a/docs/src/tutorials/logden_user_files/figure-commonmark/cell-10-output-1.png and b/docs/src/tutorials/logden_user_files/figure-commonmark/cell-10-output-1.png differ diff --git a/docs/src/tutorials/logden_user_files/figure-commonmark/cell-11-output-1.png b/docs/src/tutorials/logden_user_files/figure-commonmark/cell-11-output-1.png index 64b3b19..d49bedb 100644 Binary files a/docs/src/tutorials/logden_user_files/figure-commonmark/cell-11-output-1.png and b/docs/src/tutorials/logden_user_files/figure-commonmark/cell-11-output-1.png differ diff --git a/docs/src/tutorials/logden_user_files/figure-commonmark/cell-12-output-1.png b/docs/src/tutorials/logden_user_files/figure-commonmark/cell-12-output-1.png index 26aae9f..ee42111 100644 Binary files a/docs/src/tutorials/logden_user_files/figure-commonmark/cell-12-output-1.png and b/docs/src/tutorials/logden_user_files/figure-commonmark/cell-12-output-1.png differ diff --git a/docs/src/tutorials/logden_user_files/figure-commonmark/cell-8-output-1.png b/docs/src/tutorials/logden_user_files/figure-commonmark/cell-8-output-1.png index 0ab0e99..38e4dbf 100644 Binary files a/docs/src/tutorials/logden_user_files/figure-commonmark/cell-8-output-1.png and b/docs/src/tutorials/logden_user_files/figure-commonmark/cell-8-output-1.png differ diff --git a/docs/src/tutorials/logden_user_files/figure-commonmark/cell-9-output-1.png b/docs/src/tutorials/logden_user_files/figure-commonmark/cell-9-output-1.png index 8142630..d6b0d9e 100644 Binary files a/docs/src/tutorials/logden_user_files/figure-commonmark/cell-9-output-1.png and b/docs/src/tutorials/logden_user_files/figure-commonmark/cell-9-output-1.png differ diff --git a/docs/src_stash/Manifest.toml b/docs/src_stash/Manifest.toml index 37f2a35..6b46cfc 100644 --- a/docs/src_stash/Manifest.toml +++ b/docs/src_stash/Manifest.toml @@ -2,7 +2,7 @@ julia_version = "1.11.4" manifest_format = "2.0" -project_hash = "39d0d5866236472d6bc1a58c4e663ea8a2a2e057" +project_hash = "e0bb7b9f384e479f88276cca8506811903b895c6" [[deps.AliasTables]] deps = ["PtrArrays", "Random"] @@ -39,6 +39,23 @@ git-tree-sha1 = "fde3bf89aead2e723284a8ff9cdf5b551ed700e8" uuid = "83423d85-b0ee-5818-9007-b63ccbeb887a" version = "1.18.5+0" +[[deps.ChunkCodecCore]] +git-tree-sha1 = "51f4c10ee01bda57371e977931de39ee0f0cdb3e" +uuid = "0b6fb165-00bc-4d37-ab8b-79f91016dbe1" +version = "1.0.0" + +[[deps.ChunkCodecLibZlib]] +deps = ["ChunkCodecCore", "Zlib_jll"] +git-tree-sha1 = "cee8104904c53d39eb94fd06cbe60cb5acde7177" +uuid = "4c0bbee4-addc-4d73-81a0-b6caacae83c8" +version = "1.0.0" + +[[deps.ChunkCodecLibZstd]] +deps = ["ChunkCodecCore", 
"Zstd_jll"] +git-tree-sha1 = "34d9873079e4cb3d0c62926a225136824677073f" +uuid = "55437552-ac27-4d47-9aa3-63184e8fd398" +version = "1.0.0" + [[deps.CodecZlib]] deps = ["TranscodingStreams", "Zlib_jll"] git-tree-sha1 = "962834c22b66e32aa10f7611c08c8ca4e20749a9" @@ -163,6 +180,16 @@ git-tree-sha1 = "3a948313e7a41eb1db7a1e733e6335f17b4ab3c4" uuid = "b22a6f82-2f65-5046-a5b2-351ab43fb4e5" version = "7.1.1+0" +[[deps.FileIO]] +deps = ["Pkg", "Requires", "UUIDs"] +git-tree-sha1 = "b66970a70db13f45b7e57fbda1736e1cf72174ea" +uuid = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" +version = "1.17.0" +weakdeps = ["HTTP"] + + [deps.FileIO.extensions] + HTTPExt = "HTTP" + [[deps.FileWatching]] uuid = "7b1f6079-737a-58dc-b8bc-7a2ca5c1b5ee" version = "1.11.0" @@ -249,6 +276,11 @@ git-tree-sha1 = "f923f9a774fcf3f5cb761bfa43aeadd689714813" uuid = "2e76f6c2-a576-52d4-95c1-20adfe4de566" version = "8.5.1+0" +[[deps.HashArrayMappedTries]] +git-tree-sha1 = "2eaa69a7cab70a52b9687c8bf950a5a93ec895ae" +uuid = "076d061b-32b6-4027-95e0-9a2c6f6d7e74" +version = "0.2.0" + [[deps.InteractiveUtils]] deps = ["Markdown"] uuid = "b77e0a4c-d291-57a0-90e8-8db25a27a240" @@ -259,6 +291,18 @@ git-tree-sha1 = "e2222959fbc6c19554dc15174c81bf7bf3aa691c" uuid = "92d709cd-6900-40b7-9082-c6be49f344b6" version = "0.2.4" +[[deps.JLD2]] +deps = ["ChunkCodecLibZlib", "ChunkCodecLibZstd", "FileIO", "MacroTools", "Mmap", "OrderedCollections", "PrecompileTools", "ScopedValues"] +git-tree-sha1 = "da2e9b4d1abbebdcca0aa68afa0aa272102baad7" +uuid = "033835bb-8acc-5ee8-8aae-3f567f8a3819" +version = "0.6.2" + + [deps.JLD2.extensions] + UnPackExt = "UnPack" + + [deps.JLD2.weakdeps] + UnPack = "3a884ed6-31ef-47d7-9d2a-63182c4928ed" + [[deps.JLFzf]] deps = ["REPL", "Random", "fzf_jll"] git-tree-sha1 = "82f7acdc599b65e0f8ccd270ffa1467c21cb647b" @@ -668,6 +712,12 @@ version = "1.3.1" uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce" version = "0.7.0" +[[deps.ScopedValues]] +deps = ["HashArrayMappedTries", "Logging"] +git-tree-sha1 = "c3b2323466378a2ba15bea4b2f73b081e022f473" +uuid = "7e506255-f358-4e82-b7e4-beb19740aa63" +version = "1.5.0" + [[deps.Scratch]] deps = ["Dates"] git-tree-sha1 = "9b81b8393e50b7d4e6d0a9f14e192294d3b7c109" diff --git a/docs/src_stash/Project.toml b/docs/src_stash/Project.toml index 43aec5b..f36a540 100644 --- a/docs/src_stash/Project.toml +++ b/docs/src_stash/Project.toml @@ -1,2 +1,3 @@ [deps] +JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80" diff --git a/docs/src_stash/test.md b/docs/src_stash/test.md new file mode 100644 index 0000000..4501b05 --- /dev/null +++ b/docs/src_stash/test.md @@ -0,0 +1,21 @@ +# Basic workflow without GPU + + +``` julia +ENV["JULIA_DEPOT_PATH"] +``` + + "/User/homes/twutz/scratch/twutz/julia_gpu_depots:" + +``` julia +DEPOT_PATH +``` + + 3-element Vector{String}: + "/User/homes/twutz/scratch/twutz/julia_gpu_depots" + "/Net/Groups/Services/HPC_22/apps/julia/julia-1.11.4/local/share/julia" + "/Net/Groups/Services/HPC_22/apps/julia/julia-1.11.4/share/julia" + +``` julia +#println(no_existing_var) +``` diff --git a/docs/src_stash/test.qmd b/docs/src_stash/test.qmd new file mode 100644 index 0000000..6ac17e4 --- /dev/null +++ b/docs/src_stash/test.qmd @@ -0,0 +1,26 @@ +--- +title: "Basic workflow without GPU" +engine: julia +# julia: +# env: ["JULIA_DEPOT_PATH=/User/homes/twutz/scratch/twutz/julia_gpu_depots"] +execute: + echo: true +format: + commonmark: + variant: -raw_html+tex_math_dollars + wrap: preserve +# params: +# JULIA_DEPOT_PATH: 
/User/homes/twutz/scratch/twutz/julia_cluster_depots +--- +```{julia} +ENV["JULIA_DEPOT_PATH"] +``` + +```{julia} +DEPOT_PATH +``` + +```{julia} +#println(no_existing_var) +``` + diff --git a/ext/HybridVariationalInferenceCUDAExt.jl b/ext/HybridVariationalInferenceCUDAExt.jl index 180944c..eb73007 100644 --- a/ext/HybridVariationalInferenceCUDAExt.jl +++ b/ext/HybridVariationalInferenceCUDAExt.jl @@ -12,20 +12,21 @@ function HVI.vec2utri(v::CUDA.CuVector{T}; n=invsumn(length(v)) ) where {T} m = CUDA.allowscalar() do CUDA.CuArray([j >= i ? (k += 1; v[k]) : z for i in 1:n, j in 1:n]) end + # TODO test whether it is faster without allowscalar # m = CUDA.zeros(T,n,n) # @cuda threads = 256 vec2utri_gpu!(m, v) # planned to put v into positions of m return (m) end -function vec2utri_gpu!(m, v::AbstractVector) - index = threadIdx().x # this example only requires linear indexing, so just use `x` - stride = blockDim().x - for i in index:stride:length(v) - row, col = vec2utri_pos(i) - @inbounds m[row, col] = v[i] - end - return nothing # important -end +# function vec2utri_gpu!(m, v::AbstractVector) +# index = threadIdx().x # this example only requires linear indexing, so just use `x` +# stride = blockDim().x +# for i in index:stride:length(v) +# row, col = vec2utri_pos(i) +# @inbounds m[row, col] = v[i] +# end +# return nothing # important +# end #function vec2uutri(v::GPUArraysCore.AbstractGPUVector{T}; n=invsumn(length(v)) + one(T)) where {T} @@ -85,6 +86,7 @@ function HVI._create_randn(rng, v::CUDA.CuVector{T,M}, dims...) where {T,M} res::CUDA.CuArray{T, length(dims),M} end +# TODO replace by KernelAbstractions after type stability is fixed function HVI.ones_similar_x(x::CuArray, size_ret = size(x)) # call CUDA.ones rather than ones for x::CuArray ChainRulesCore.@ignore_derivatives CUDA.ones(eltype(x), size_ret) diff --git a/ext/HybridVariationalInferenceFluxExt.jl b/ext/HybridVariationalInferenceFluxExt.jl index 1d8f462..47136d8 100644 --- a/ext/HybridVariationalInferenceFluxExt.jl +++ b/ext/HybridVariationalInferenceFluxExt.jl @@ -23,7 +23,7 @@ function HVI.construct_ChainsApplicator(rng::AbstractRNG, m::Chain, float_type:: FluxApplicator(rebuild), ϕ end -function HVI.apply_model(app::FluxApplicator, x::T, ϕ) where T +function HVI.apply_model(app::FluxApplicator, x::T, ϕ; is_testmode=false) where T # assume no size information in x -> can hint the type of the result # to be the same as the type of the input m = app.rebuild(ϕ) @@ -38,7 +38,7 @@ function HVI.construct_partric(app::FluxApplicator{RT}, x, ϕ) where RT PartricFluxApplicator{RT, typeof(m), typeof(y)}(app.rebuild) end -function HVI.apply_model(app::PartricFluxApplicator{RT, MT, YT}, x, ϕ) where {RT, MT, YT} +function HVI.apply_model(app::PartricFluxApplicator{RT, MT, YT}, x, ϕ; is_testmode=false) where {RT, MT, YT} m = app.rebuild(ϕ)::MT res = m(x)::YT res diff --git a/ext/HybridVariationalInferenceLuxExt.jl b/ext/HybridVariationalInferenceLuxExt.jl index 4c66175..c8995c9 100644 --- a/ext/HybridVariationalInferenceLuxExt.jl +++ b/ext/HybridVariationalInferenceLuxExt.jl @@ -5,32 +5,85 @@ using HybridVariationalInference: HybridVariationalInference as HVI using ComponentArrays: ComponentArrays as CA using Random using StatsFuns: logistic - +import Functors +import GPUArraysCore # AbstractModelApplicator that stores a Lux.StatefulLuxLayer, so that # it can be applied with given inputs and parameters # The `int_ϕ` ComponentArrayInterpreter, attaches the correct axes to the # supplied parameters, that do not need to keep the Axis information
-struct LuxApplicator{MT, IT} <: AbstractModelApplicator - stateful_layer::MT +struct LuxApplicator{MT1, MT2, IT} <: AbstractModelApplicator + stateful_layer_test::MT1 + stateful_layer_train::MT2 int_ϕ::IT + is_testmode::Bool end +Functors.@functor LuxApplicator () # prevent warning for putting stateful_layers to GPU + function HVI.construct_ChainsApplicator(rng::AbstractRNG, m::Chain, float_type=Float32; device = gpu_device()) ps, st = Lux.setup(rng, m) ps_ca = float_type.(CA.ComponentArray(ps)) st = st |> device - stateful_layer = StatefulLuxLayer{true}(m, nothing, st) - #stateful_layer(x_o_gpu[:, 1:n_site_batch], ps_ca) + stateful_layer_train = StatefulLuxLayer{true}(m, nothing, st) + stateful_layer_test = Lux.testmode(stateful_layer_train) int_ϕ = get_concrete(ComponentArrayInterpreter(ps_ca)) - LuxApplicator(stateful_layer, int_ϕ), ps_ca + LuxApplicator(stateful_layer_test, stateful_layer_train, int_ϕ, false), ps_ca end -function HVI.apply_model(app::LuxApplicator, x, ϕ) - ϕc = app.int_ϕ(CA.getdata(ϕ)) - app.stateful_layer(x, ϕc) +function HVI.apply_model(app::LuxApplicator, x, ϕ; is_testmode=false) + ϕd = CA.getdata(ϕ) + if (ϕ isa SubArray) && (ϕ.parent isa GPUArraysCore.AbstractGPUArray) + # Lux has problems with SubArrays of GPUArrays, need to convert to plain Array + ϕc = app.int_ϕ(ϕd[:]) + else + ϕc = app.int_ϕ(ϕd) + end + # tmp(x, ϕc) + if is_testmode + app.stateful_layer_test(x, ϕc) + else + app.stateful_layer_train(x, ϕc) + end end + + +function HVI.construct_3layer_MLApplicator( + rng::AbstractRNG, prob::HVI.AbstractHybridProblem, ::Val{:Lux}; + scenario::Val{scen}, + p_dropout = 0.2) where scen + (;θM) = get_hybridproblem_par_templates(prob; scenario) + n_out = length(θM) + n_covar = get_hybridproblem_n_covar(prob; scenario) + n_pbm_covars = length(get_hybridproblem_pbmpar_covars(prob; scenario)) + n_input = n_covar + n_pbm_covars + #(; n_covar, n_θM) = get_hybridproblem_sizes(prob; scenario) + float_type = get_hybridproblem_float_type(prob; scenario) + is_using_dropout = :use_dropout ∈ scen + g_chain = if is_using_dropout + Lux.Chain( + # dense layer with bias that maps to n_input * 4 outputs and applies `tanh` activation + Lux.Dense(n_input => n_input * 4, tanh), + Lux.Dropout(p_dropout), + Lux.Dense(n_input * 4 => n_input * 4, tanh), + Lux.Dropout(p_dropout), + # dense layer without bias that maps to n outputs and `logistic` activation + Lux.Dense(n_input * 4 => n_out, logistic, use_bias = false) + ) + else + Lux.Chain( + # dense layer with bias that maps to n_input * 4 outputs and applies `tanh` activation + Lux.Dense(n_input => n_input * 4, tanh), + Lux.Dense(n_input * 4 => n_input * 4, tanh), + # dense layer without bias that maps to n outputs and `logistic` activation + Lux.Dense(n_input * 4 => n_out, logistic, use_bias = false) + ) + end + construct_ChainsApplicator(rng, g_chain, float_type) +end + + # function HVI.HybridProblem(rng::AbstractRNG, # θP::CA.ComponentVector, θM::CA.ComponentVector, g_chain::Chain, # args...; device = gpu_device(), kwargs...)
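The rewritten Lux extension keeps both a train-mode and a test-mode `StatefulLuxLayer` and selects between them via the new `is_testmode` keyword, so that `Dropout` layers are only active during training. A minimal, self-contained sketch of the underlying Lux mechanism (a hypothetical toy chain, not taken from this patch):

``` julia
using Lux, Random

rng = Random.default_rng()
model = Lux.Chain(Lux.Dense(4 => 8, tanh), Lux.Dropout(0.2), Lux.Dense(8 => 2))
ps, st = Lux.setup(rng, model)
x = randn(Float32, 4, 3)

y_train, _ = model(x, ps, st)               # Dropout active: stochastic output
y_test, _ = model(x, ps, Lux.testmode(st))  # Dropout disabled: deterministic output
```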
diff --git a/ext/HybridVariationalInferenceSimpleChainsExt.jl b/ext/HybridVariationalInferenceSimpleChainsExt.jl index b53262d..a572777 100644 --- a/ext/HybridVariationalInferenceSimpleChainsExt.jl +++ b/ext/HybridVariationalInferenceSimpleChainsExt.jl @@ -15,7 +15,7 @@ function HVI.construct_ChainsApplicator(rng::AbstractRNG, m::SimpleChain, FloatT SimpleChainsApplicator(m), ϕ end -HVI.apply_model(app::SimpleChainsApplicator, x, ϕ) = app.m(x, ϕ) +HVI.apply_model(app::SimpleChainsApplicator, x, ϕ; is_testmode=false) = app.m(x, ϕ) function HVI.construct_3layer_MLApplicator( rng::AbstractRNG, prob::HVI.AbstractHybridProblem, ::Val{:SimpleChains}; diff --git a/src/AbstractHybridProblem.jl b/src/AbstractHybridProblem.jl index ad0404a..6c3b499 100644 --- a/src/AbstractHybridProblem.jl +++ b/src/AbstractHybridProblem.jl @@ -91,7 +91,7 @@ end """ get_hybridproblem_transforms(::AbstractHybridProblem; scenario) -Return a NamedTupe of +Return a NamedTuple of - `transP`: Bijectors.Transform for the global PBM parameters, θP - `transM`: Bijectors.Transform for the single-site PBM parameters, θM """ @@ -197,7 +197,7 @@ end Put relevant parts of the DataLoader to gpu, depending on scenario. """ -function gdev_hybridproblem_dataloader(dataloader::MLUtils.DataLoader; gdevs, +function gdev_hybridproblem_dataloader(dataloader::MLUtils.DataLoader; gdevs = nothing, gdev_M = gdevs.gdev_M, gdev_P = gdevs.gdev_P, # scenario::Val{scen} = Val(()), @@ -207,12 +207,26 @@ function gdev_hybridproblem_dataloader(dataloader::MLUtils.DataLoader; gdevs, batchsize = dataloader.batchsize, partial = dataloader.partial ) - xM, xP, y_o, y_unc, i_sites = dataloader.data + # xM, xP, y_o, y_unc, i_sites = dataloader.data + # xM_dev = gdev_M(xM) + # xP_dev, y_o_dev, y_unc_dev = (gdev_P(xP), gdev_P(y_o), gdev_P(y_unc)) + data_dev = gdev_hybridproblem_data(dataloader.data; gdev_M, gdev_P) + train_loader_dev = MLUtils.DataLoader(data_dev; batchsize, partial) + return(train_loader_dev) +end + +function gdev_hybridproblem_data(data::Tuple; gdevs = nothing, + gdev_M = gdevs.gdev_M, + gdev_P = gdevs.gdev_P, + # scenario::Val{scen} = Val(()), + # gdev = gpu_device(), + # gdev_M = :use_gpu ∈ _val_value(scenario) ? gdev : identity, + # gdev_P = :f_on_gpu ∈ _val_value(scenario) ? 
gdev : identity, + ) + xM, xP, y_o, y_unc, i_sites = data xM_dev = gdev_M(xM) xP_dev, y_o_dev, y_unc_dev = (gdev_P(xP), gdev_P(y_o), gdev_P(y_unc)) - train_loader_dev = MLUtils.DataLoader((xM_dev, xP_dev, y_o_dev, y_unc_dev, i_sites); - batchsize, partial) - return(train_loader_dev) + (xM_dev, xP_dev, y_o_dev, y_unc_dev, i_sites) end """ diff --git a/src/ComponentArrayInterpreter.jl b/src/ComponentArrayInterpreter.jl index 960d5ee..f2c481a 100644 --- a/src/ComponentArrayInterpreter.jl +++ b/src/ComponentArrayInterpreter.jl @@ -171,6 +171,7 @@ function ComponentArrayInterpreter( ComponentArrayInterpreter(n_dims, CA.getaxes(ca), m_dims) end + function ComponentArrayInterpreter( n_dims::NTuple{N,<:Integer}, axes::NTuple{A,<:CA.AbstractAxis}, m_dims::NTuple{M,<:Integer}) where {N,A,M} diff --git a/src/DoubleMM/DoubleMM.jl b/src/DoubleMM/DoubleMM.jl index 6b39306..95a0757 100644 --- a/src/DoubleMM/DoubleMM.jl +++ b/src/DoubleMM/DoubleMM.jl @@ -13,6 +13,8 @@ using Distributions, DistributionFits using MLDataDevices import GPUArraysCore # used in conditional breakpoints import StableRNGs +import MLUtils +import ChainRulesCore export f_doubleMM, f_doubleMM_sites, xP_S1, xP_S2 include("f_doubleMM.jl") diff --git a/src/DoubleMM/f_doubleMM.jl b/src/DoubleMM/f_doubleMM.jl index 7c8928a..22edc82 100644 --- a/src/DoubleMM/f_doubleMM.jl +++ b/src/DoubleMM/f_doubleMM.jl @@ -105,17 +105,23 @@ function f_doubleMM_sites(θc::CA.ComponentMatrix, xPc::CA.ComponentMatrix) # # extract the parameters as vectors that are row-repeated into a matrix VT = typeof(CA.getdata(θc)[:,1]) # workaround for non-type-stable Symbol-indexing - n_obs = size(S1, 1) - rep_fac = HVI.ones_similar_x(xPc, n_obs) # to reshape into matrix, avoiding repeat + #n_obs = size(S1, 1) + #rep_fac = HVI.ones_similar_x(xPc, n_obs) # to reshape into matrix, avoiding repeat + #is_dummy = isnan.(S1) .|| isnan.(S2) + is_valid = isfinite.(S1) .&& isfinite.(S2) (r0, r1, K1, K2) = map((:r0, :r1, :K1, :K2)) do par p1 = CA.getdata(θc[:, par]) ::VT + #Main.@infiltrate_main + # tmp = Zygote.gradient(p1 -> sum(repeat_rowvector_dummy(p1', is_dummy)), p1)[1] + #p1_mat = repeat_rowvector_dummy(p1', is_dummy) + p1_mat = is_valid .* p1' # places zeros in dummy positions, prevents gradients there #repeat(p1', n_obs) # matrix: same for each concentration row in S1 #(rep_fac .* p1') # move to computation below to save allocation end # # each variable is a matrix (n_obs x n_site) - #r0 .+ r1 .* S1 ./ (K1 .+ S1) .* S2 ./ (K2 .+ S2) - (rep_fac .* r0') .+ (rep_fac .* r1') .* S1 ./ ((rep_fac .* K1') .+ S1) .* S2 ./ ((rep_fac .* K2') .+ S2) + r0 .+ r1 .* S1 ./ (K1 .+ S1) .* S2 ./ (K2 .+ S2) + #(rep_fac .* r0') .+ (rep_fac .* r1') .* S1 ./ ((rep_fac .* K1') .+ S1) .* S2 ./ ((rep_fac .* K2') .+ S2) end # function f_doubleMM_sites(θc::CA.ComponentMatrix, xPc::CA.ComponentMatrix) @@ -221,7 +227,6 @@ end function HVI.get_hybridproblem_MLapplicator( rng::AbstractRNG, prob::HVI.DoubleMM.DoubleMMCase; scenario::Val{scen}, - use_all_sites = false ) where {scen} ml_engine = select_ml_engine(; scenario) g_nomag, ϕ_g0 = construct_3layer_MLApplicator(rng, prob, ml_engine; scenario) @@ -232,10 +237,12 @@ function HVI.get_hybridproblem_MLapplicator( (; transM) = get_hybridproblem_transforms(prob; scenario) lowers, uppers = HVI.get_quantile_transformed( priors::AbstractVector{<:Distribution}, transM) - n_site, n_batch = get_hybridproblem_n_site_and_batch(prob; scenario) - n_site_batch = use_all_sites ? 
n_site : n_batch - g = NormalScalingModelApplicator( - g_nomag, lowers, uppers, eltype(ϕ_g0)) + #n_site, n_batch = get_hybridproblem_n_site_and_batch(prob; scenario) + g = if (:use_rangescaling ∈ scen) + RangeScalingModelApplicator(g_nomag, lowers, uppers, eltype(ϕ_g0)) + else + NormalScalingModelApplicator(g_nomag, lowers, uppers, eltype(ϕ_g0)) + end return g, ϕ_g0 end @@ -283,7 +290,7 @@ end # cl::CLT # end -function HVI.get_hybridproblem_PBmodel(prob::DoubleMMCase; use_all_sites=false, scenario::Val{scen}) where {scen} +function HVI.get_hybridproblem_PBmodel(prob::DoubleMMCase; scenario::Val{scen}) where {scen} # θall defined in this module above # TODO check and test for population or sites, currently return only site specific pt = get_hybridproblem_par_templates(prob; scenario) @@ -294,8 +301,7 @@ function HVI.get_hybridproblem_PBmodel(prob::DoubleMMCase; use_all_sites=false, PBMSiteApplicator(f_doubleMM; pt.θP, pt.θM, θFix, xPvec) else n_site, n_batch = get_hybridproblem_n_site_and_batch(prob; scenario) - n_site_batch = use_all_sites ? n_site : n_batch - PBMPopulationApplicator(f_doubleMM_sites, n_site_batch; pt.θP, pt.θM, θFix, xPvec) + PBMPopulationApplicator(f_doubleMM_sites, n_batch; pt.θP, pt.θM, θFix, xPvec) end end @@ -350,11 +356,22 @@ function HVI.get_hybridproblem_n_site_and_batch(prob::DoubleMMCase; (n_site, n_batch) end -function HVI.get_hybridproblem_train_dataloader(prob::DoubleMMCase; scenario::Val, +function HVI.get_hybridproblem_train_dataloader(prob::DoubleMMCase; scenario::Val{scen}, rng::AbstractRNG = StableRNG(111), kwargs... -) +) where {scen} n_site, n_batch = get_hybridproblem_n_site_and_batch(prob; scenario) - construct_dataloader_from_synthetic(rng, prob; scenario, n_batch, kwargs...) + if (:driverNAN ∈ scen) + (; xM, xP, y_o, y_unc) = gen_hybridproblem_synthetic(rng, prob; scenario) + n_site = size(xM,2) + i_sites = 1:n_site + # set the last two entries of the S1 drivers and observations of the second site to NaN + view(@view(xP[:S1,2]), 7:8) .= NaN + y_o[7:8,2] .= NaN + train_loader = MLUtils.DataLoader((xM, xP, y_o, y_unc, i_sites); + batchsize = n_batch, partial = false) + else + construct_dataloader_from_synthetic(rng, prob; scenario, n_batch, kwargs...)
+ end end function HVI.gen_hybridproblem_synthetic(rng::AbstractRNG, prob::DoubleMMCase; @@ -371,7 +388,8 @@ function HVI.gen_hybridproblem_synthetic(rng::AbstractRNG, prob::DoubleMMCase; int_θMs_sites = ComponentArrayInterpreter(θM, (n_site,)) # normalize to be distributed around the prescribed true values θMs_true = int_θMs_sites(scale_centered_at(θMs_true0, θM, FloatType(0.1))) - f = get_hybridproblem_PBmodel(prob; scenario, use_all_sites = true) + f_batch = get_hybridproblem_PBmodel(prob; scenario) + f = create_nsite_applicator(f_batch, n_site) #xP = fill((; S1 = xP_S1, S2 = xP_S2), n_site) int_xP_sites = ComponentArrayInterpreter(int_xP1, (n_site,)) xP = int_xP_sites(vcat(repeat(xP_S1, 1, n_site), repeat(xP_S2, 1, n_site))) diff --git a/src/HybridProblem.jl b/src/HybridProblem.jl index c87fdbc..97cc62d 100644 --- a/src/HybridProblem.jl +++ b/src/HybridProblem.jl @@ -6,8 +6,7 @@ Fields: - `θP::ComponentVector`, `θM::ComponentVector`: parameter templates - `g::AbstractModelApplicator`, `ϕg::AbstractVector`: ML model and its parameters - `ϕunc::ComponentVector`: parameters for the Covariance matrix of the approximate posterior -- `f_batch`: Process-based model predicing for `n_batch` sites -- `f_allsites`: Process-based model predicing for `n_site` sites +- `f_batch`: Process-based model predicting for n_batch sites - `priors`: AbstractDict: Prior distributions for all PBM parameters on constrained scale - `py`: Likelihood function - `transM::Stacked`, `transP::Stacked`: bijectors transforming from unconstrained to @@ -27,7 +26,6 @@ struct HybridProblem <: AbstractHybridProblem θP::CA.ComponentVector θM::CA.ComponentVector f_batch::Any - f_allsites::Any g::AbstractModelApplicator ϕg::Any # depends on framework ϕunc::CA.ComponentVector @@ -46,7 +44,6 @@ struct HybridProblem <: AbstractHybridProblem θP::CA.ComponentVector, θM::CA.ComponentVector, g::AbstractModelApplicator, ϕg::AbstractVector, f_batch, - f_allsites, priors::AbstractDict, py, transM::Stacked, @@ -61,7 +58,7 @@ struct HybridProblem <: AbstractHybridProblem ϕunc::CA.ComponentVector = init_hybrid_ϕunc(cor_ends, zero(eltype(θM))), ) where N new( - θP, θM, f_batch, f_allsites, g, ϕg, ϕunc, priors, py, transM, transP, cor_ends, + θP, θM, f_batch, g, ϕg, ϕunc, priors, py, transM, transP, cor_ends, train_dataloader, n_covar, n_site, n_batch, pbm_covars) end end @@ -86,8 +83,7 @@ function HybridProblem(prob::AbstractHybridProblem; scenario = (), θM = get_hybridproblem_par_templates(prob; scenario).θM, g = get_hybridproblem_MLapplicator(prob; scenario)[1], ϕg = get_hybridproblem_MLapplicator(prob; scenario)[2], - f_batch = get_hybridproblem_PBmodel(prob; scenario, use_all_sites = false), - f_allsites = get_hybridproblem_PBmodel(prob; scenario, use_all_sites = true), + f_batch = get_hybridproblem_PBmodel(prob; scenario), priors = get_hybridproblem_priors(prob; scenario), py = get_hybridproblem_neg_logden_obs(prob; scenario), transP = get_hybridproblem_transforms(prob; scenario).transP, @@ -100,7 +96,7 @@ function HybridProblem(prob::AbstractHybridProblem; scenario = (), pbm_covars = get_hybridproblem_pbmpar_covars(prob; scenario), ϕunc = get_hybridproblem_ϕunc(prob; scenario), ) - HybridProblem(θP, θM, g, ϕg, f_batch, f_allsites, priors, py, transM, transP, train_dataloader, + HybridProblem(θP, θM, g, ϕg, f_batch, priors, py, transM, transP, train_dataloader, n_covar, n_site, n_batch, cor_ends, pbm_covars, ϕunc) end @@ -156,8 +152,8 @@ end # (; n_covar=prob.n_covar, n_batch=prob.n_batch, n_θM, n_θP) # end -function 
get_hybridproblem_PBmodel(prob::HybridProblem; scenario = (), use_all_sites=false) - use_all_sites ? prob.f_allsites : prob.f_batch +function get_hybridproblem_PBmodel(prob::HybridProblem; scenario = ()) + prob.f_batch end function get_hybridproblem_MLapplicator(prob::HybridProblem; scenario = ()) diff --git a/src/HybridSolver.jl b/src/HybridSolver.jl index a7c9cb0..4410549 100644 --- a/src/HybridSolver.jl +++ b/src/HybridSolver.jl @@ -7,11 +7,16 @@ end HybridPointSolver(; alg) = HybridPointSolver(alg) function CommonSolve.solve(prob::AbstractHybridProblem, solver::HybridPointSolver; - scenario, rng=Random.default_rng(), - gdevs = get_gdev_MP(scenario), + scenario=Val(()), rng=Random.default_rng(), + gdevs = nothing, # get_gdev_MP(scenario) is_inferred::Val{is_infer} = Val(false), + ad_backend_loss = AutoZygote(), + epochs, + is_omitting_NaNbatches = false, + is_omit_priors = false, kwargs... ) where is_infer + gdevs = isnothing(gdevs) ? get_gdev_MP(scenario) : gdevs par_templates = get_hybridproblem_par_templates(prob; scenario) g, ϕg0 = get_hybridproblem_MLapplicator(prob; scenario) FT = get_hybridproblem_float_type(prob; scenario) @@ -20,46 +25,110 @@ function CommonSolve.solve(prob::AbstractHybridProblem, solver::HybridPointSolve ϕg=1:length(ϕg0), ϕP=par_templates.θP)) #ϕ0_cpu = vcat(ϕg0, par_templates.θP .* FT(0.9)) # slightly disturb θP_true ϕ0_cpu = vcat(ϕg0, apply_preserve_axes(inverse(transP), par_templates.θP)) + n_site, n_batch = get_hybridproblem_n_site_and_batch(prob; scenario) train_loader = get_hybridproblem_train_dataloader(prob; scenario) + #TODO provide different test data + # TODO use 1/10 of the training data + # currently HybridProblem returns only applicators of size n_batch and n_site + # i_test = rand(1:n_site, Integer(floor(n_site/10))) + # test_data = map(train_loader.data) do data_comp + # ndims(data_comp) == 2 ? 
data_comp[:, i_test] : data_comp[i_test] + # end + test_data = train_loader.data gdev = gdevs.gdev_M if gdev isa MLDataDevices.AbstractGPUDevice ϕ0_dev = gdev(ϕ0_cpu) g_dev = gdev(g) - train_loader_dev = gdev_hybridproblem_dataloader(train_loader; scenario, gdev) + train_loader_dev = gdev_hybridproblem_dataloader(train_loader; gdevs) + test_data_dev = gdev_hybridproblem_data(test_data; gdevs) else ϕ0_dev = ϕ0_cpu g_dev = g train_loader_dev = train_loader + test_data_dev = test_data + end + f = get_hybridproblem_PBmodel(prob; scenario) + ftest = create_nsite_applicator(f, size(test_data[1],2)) + if gdevs.gdev_P isa MLDataDevices.AbstractGPUDevice + f_dev = gdevs.gdev_P(f) + ftest_dev = gdevs.gdev_P(ftest) + else + f_dev = f + ftest_dev = ftest end - f = get_hybridproblem_PBmodel(prob; scenario, use_all_sites=false) py = get_hybridproblem_neg_logden_obs(prob; scenario) pbm_covars = get_hybridproblem_pbmpar_covars(prob; scenario) - n_site, n_batch = get_hybridproblem_n_site_and_batch(prob; scenario) + n_site_test = size(test_data[1],2) priors = get_hybridproblem_priors(prob; scenario) priorsP = [priors[k] for k in keys(par_templates.θP)] priorsM = [priors[k] for k in keys(par_templates.θM)] #intP = ComponentArrayInterpreter(par_templates.θP) - loss_gf = get_loss_gf(g_dev, transM, transP, f, py, intϕ; - cdev=infer_cdev(gdevs), pbm_covars, n_site_batch=n_batch, priorsP, priorsM,) + loss_gf = get_loss_gf(g_dev, transM, transP, f_dev, py, intϕ; + n_site_batch=n_batch, + cdev=infer_cdev(gdevs), pbm_covars, priorsP, priorsM, is_omit_priors,) + loss_gf_test = get_loss_gf(g_dev, transM, transP, ftest_dev, py, intϕ; + n_site_batch=n_site_test, + cdev=infer_cdev(gdevs), pbm_covars, priorsP, priorsM, is_omit_priors,) # call loss function once l1 = is_infer ? - Test.@inferred(loss_gf(ϕ0_dev, first(train_loader_dev)...))[1] : + Test.@inferred(loss_gf(ϕ0_dev, first(train_loader_dev)...; is_testmode=true))[1] : # using ShareAdd; @usingany Cthulhu # @descend_code_warntype loss_gf(ϕ0_dev, first(train_loader_dev)...) - loss_gf(ϕ0_dev, first(train_loader_dev)...)[1] + loss_gf(ϕ0_dev, first(train_loader_dev)...; is_testmode=true)[1] # and gradient # xMg, xP, y_o, y_unc = first(train_loader_dev) # gr1 = Zygote.gradient( # p -> loss_gf(p, xMg, xP, y_o, y_unc)[1], # ϕ0_dev) # Zygote.gradient(ϕ0_dev -> loss_gf(ϕ0_dev, data1...)[1], ϕ0_dev) - optf = Optimization.OptimizationFunction((ϕ, data) -> loss_gf(ϕ, data...)[1], - Optimization.AutoZygote()) - # use CA.getdata(ϕ0_dev), i.e. the plain vector to avoid recompiling for specific CA - # loss_gf re-attaches the axes - optprob = OptimizationProblem(optf, CA.getdata(ϕ0_dev), train_loader_dev) - res = Optimization.solve(optprob, solver.alg; kwargs...) 
- ϕ = intϕ(res.u) + if is_omitting_NaNbatches + # implement training loop by hand to skip minibatches with NaN gradients + ps = CA.getdata(ϕ0_dev) + opt_st_new = Optimisers.setup(solver.alg, ps) + n_skips = 0 + # prepare DI.gradient; data_batch must be accessed and updated in the enclosing scope + # because fopt_loss_gf cannot be redefined + data_batch = first(train_loader_dev) + is_testmode = false + function fopt_loss_gf(ϕ) + #@show first(data_batch[5], 2) + loss_gf(ϕ, data_batch...; is_testmode)[1] + end + ad_prep = DI.prepare_gradient(fopt_loss_gf, ad_backend_loss, zero(ps)) + grad = similar(ps) + stime = time() + for epoch in 1:epochs + is_testmode = false + #i,data_batch = first(enumerate(loader)) + for (i, data_batch_) in enumerate(train_loader_dev) + data_batch = data_batch_ # propagate outside the for loop into the scope of fopt_loss_gf + DI.gradient!(fopt_loss_gf, grad, ad_prep, ad_backend_loss, ps) + if any(isnan.(grad)) + n_skips += 1 + #println("Skipped NaN : Batch $i") + print(",$i") + else + Optimisers.update!(opt_st_new, ps, grad) + end + end + ttime = time() - stime + # compute loss for test data + l = loss_gf_test(ps, test_data_dev...; is_testmode = true) + println() + @show round(ttime, digits=1), epoch, l.nLy, l.neg_log_prior, l.loss_penalty + # TODO log + end + res = nothing + ϕ = intϕ(ps) + else + optf = Optimization.OptimizationFunction((ϕ, data) -> loss_gf(ϕ, data...; is_testmode=false)[1], + ad_backend_loss) + # use CA.getdata(ϕ0_dev), i.e. the plain vector to avoid recompiling for specific CA + # loss_gf re-attaches the axes + optprob = OptimizationProblem(optf, CA.getdata(ϕ0_dev), train_loader_dev) + res = Optimization.solve(optprob, solver.alg; epochs, kwargs...) + ϕ = intϕ(res.u) + end θP = cpu_ca(apply_preserve_axes(transP, cpu_ca(ϕ).ϕP)) probo = HybridProblem(prob; ϕg=cpu_ca(ϕ).ϕg, θP) (; ϕ, resopt=res, probo) @@ -141,9 +210,9 @@ function CommonSolve.solve(prob::AbstractHybridProblem, solver::HybridPosteriorS g_dev = g train_loader_dev = train_loader end - f = get_hybridproblem_PBmodel(prob; scenario, use_all_sites=false) + f = get_hybridproblem_PBmodel(prob; scenario) if gdevs.gdev_P isa MLDataDevices.AbstractGPUDevice - f_dev = fmap(gdevs.gdev_P, f) + f_dev = gdevs.gdev_P(f) #fmap(gdevs.gdev_P, f) else f_dev = f end @@ -164,9 +233,10 @@ function CommonSolve.solve(prob::AbstractHybridProblem, solver::HybridPosteriorS # @usingany Cthulhu # @descend_code_warntype loss_elbo(ϕ0_dev, rng, first(train_loader_dev)...) l0 = is_infer ? - (Test.@inferred loss_elbo(ϕ0_dev, rng, first(train_loader_dev)...)) : - loss_elbo(ϕ0_dev, rng, first(train_loader_dev)...) - optf = Optimization.OptimizationFunction((ϕ, data) -> first(loss_elbo(ϕ, rng, data...)), + (Test.@inferred loss_elbo(ϕ0_dev, rng, first(train_loader_dev)...; is_testmode=true)) : + loss_elbo(ϕ0_dev, rng, first(train_loader_dev)...; is_testmode=false) + optf = Optimization.OptimizationFunction( + (ϕ, data) -> first(loss_elbo(ϕ, rng, data...; is_testmode=false)), Optimization.AutoZygote()) optprob = OptimizationProblem(optf, CA.getdata(ϕ0_dev), train_loader_dev) res = Optimization.solve(optprob, solver.alg; kwargs...)
@@ -216,7 +286,7 @@ function get_loss_elbo(g, transP, transMs, f, py; trans_mMs=StackedArray(transMs.stacked, n_MC_mean), priorsP=priorsP, priorsM=priorsM, floss_penalty=floss_penalty - function loss_elbo(ϕ, rng, xM, xP, y_o, y_unc, i_sites) + function loss_elbo(ϕ, rng, xM, xP, y_o, y_unc, i_sites; is_testmode) #ϕc = int_μP_ϕg_unc(ϕ) neg_elbo_gtf( rng, ϕ, g, f, py, xM, xP, y_o, y_unc, i_sites; @@ -224,20 +294,33 @@ function get_loss_elbo(g, transP, transMs, f, py; n_MC, n_MC_cap, n_MC_mean, cor_ends, priors_θP_mean, priors_θMs_mean, cdev, pbm_covar_indices, transP, transMs, trans_mP, trans_mMs, priorsP, priorsM, floss_penalty, #ϕg = ϕc.ϕg, ϕunc = ϕc.unc, + is_testmode, ) end end + +function compute_elbo_components( + prob::AbstractHybridProblem, solver::HybridPosteriorSolver; + scenario, kwargs... + ) + train_loader = get_hybridproblem_train_dataloader(prob; scenario) + data = train_loader.data + compute_elbo_components( + prob, solver, data; + scenario, kwargs... + ) +end + """ Compute the components of the elbo for given initial conditions of the problems for the first batch of the trainloader. """ function compute_elbo_components( - prob::AbstractHybridProblem, solver::HybridPosteriorSolver; + prob::AbstractHybridProblem, solver::HybridPosteriorSolver, data::Tuple; scenario, rng=Random.default_rng(), gdev=gpu_device(), θmean_quant=0.0, - use_all_sites=false, kwargs...) n_site, n_batch = get_hybridproblem_n_site_and_batch(prob; scenario) par_templates = get_hybridproblem_par_templates(prob; scenario) @@ -248,23 +331,27 @@ function compute_elbo_components( (; transP, transM) = get_hybridproblem_transforms(prob; scenario) (; ϕ, transPMs_batch, interpreters, get_transPMs, get_ca_int_PMs) = init_hybrid_params( θP, θM, cor_ends, ϕg0, n_batch; transP, transM, ϕunc0) - train_loader = get_hybridproblem_train_dataloader(prob; scenario) if gdev isa MLDataDevices.AbstractGPUDevice ϕ0_dev = gdev(ϕ) g_dev = gdev(g) # zygote fails if gdev is a CPUDevice, although should be non-op - train_loader_dev = gdev_hybridproblem_dataloader(train_loader; scenario, gdev) + data_dev = gdev_hybridproblem_data(data; gdev_M = gdev, gdev_P = gdev) else ϕ0_dev = ϕ g_dev = g - train_loader_dev = train_loader + data_dev = data end - f = get_hybridproblem_PBmodel(prob; scenario, use_all_sites) + (xM, xP, y_o, y_unc, i_sites) = data_dev + n_site_pred = size(xP,2) + @assert size(xM, 2) == n_site_pred + @assert size(y_o, 2) == n_site_pred + @assert size(y_unc, 2) == n_site_pred + @assert length(i_sites) == n_site_pred + f_batch = get_hybridproblem_PBmodel(prob; scenario) + f = (n_site_pred == n_batch) ? f_batch : create_nsite_applicator(f_batch, n_site_pred) py = get_hybridproblem_neg_logden_obs(prob; scenario) priors_θ_mean = construct_priors_θ_mean( prob, ϕ0_dev.ϕg, keys(θM), θP, θmean_quant, g_dev, transM; scenario, get_ca_int_PMs, gdev, cdev, pbm_covars) - # TODO replace train_loader.data by proper function that pulls all the data - xM, xP, y_o, y_unc, i_sites = use_all_sites ? train_loader_dev.data : first(train_loader_dev) neg_elbo_gtf_components( rng, ϕ0_dev, g_dev, transPMs_batch, f, py, xM, xP, y_o, y_unc, i_sites, interpreters; solver.n_MC, solver.n_MC_cap, cor_ends, priors_θ_mean) @@ -275,7 +362,8 @@ In order to let mean of θ stay close to initial point parameter estimates construct a prior on mean θ to a Normal around initial prediction. 
""" function construct_priors_θ_mean(prob, ϕg, keysθM, θP, θmean_quant, g_dev, transM, transP; - scenario::Val{scen}, get_ca_int_PMs, gdevs, pbm_covars) where {scen} + scenario::Val{scen}, get_ca_int_PMs, gdevs, pbm_covars, + ) where {scen} iszero(θmean_quant) ? ([],[]) : begin gdev=gdevs.gdev_M @@ -294,7 +382,8 @@ function construct_priors_θ_mean(prob, ϕg, keysθM, θP, θmean_quant, g_dev, # ζMs = g_dev(xMP_all, CA.getdata(ϕg))' # transpose to par-last for StackedArray # ζMs_cpu = cdev(ζMs) # θMs = transMs(ζMs_cpu) - θMs = gtrans(g_dev, transMs, xMP_all, CA.getdata(ϕg); cdev=cpu_device()) + θMs = gtrans( + g_dev, transMs, xMP_all, CA.getdata(ϕg); cdev=cpu_device(), is_testmode = true) priors_dict = get_hybridproblem_priors(prob; scenario) priorsP = [priors_dict[k] for k in keys(θP)] priors_θP_mean = map(priorsP, θP) do priorsP, θPi diff --git a/src/HybridVariationalInference.jl b/src/HybridVariationalInference.jl index 0292776..37dc8e8 100644 --- a/src/HybridVariationalInference.jl +++ b/src/HybridVariationalInference.jl @@ -4,6 +4,7 @@ using ComponentArrays: ComponentArrays as CA using Random using StatsBase # fit ZScoreTransform using StatsFuns # norminvcdf +using LogExpFunctions # logistic, loglogistic using Combinatorics # gen_hybridproblem_synthetic/combinations using GPUArraysCore using LinearAlgebra @@ -16,12 +17,18 @@ using MLUtils # dataloader using CommonSolve #using OptimizationOptimisers # default alg=Adam(0.02) using Optimization +import Optimisers # for hand-coded optimization loop using Distributions, DistributionFits using StaticArrays: StaticArrays as SA using Functors using Test: Test # @inferred using Missings using FillArrays +using KernelAbstractions +import NaNMath # ignore missing observations in logDensity +using DifferentiationInterface: DifferentiationInterface as DI +import Zygote + export DoubleMM @@ -31,6 +38,7 @@ export extend_stacked_nrow, StackedArray #public Exp #julia 1.10 public: https://github.com/JuliaLang/julia/pull/55097 VERSION >= v"1.11.0-DEV.469" && eval(Meta.parse("public Exp")) +VERSION >= v"1.11.0-DEV.469" && eval(Meta.parse("public Logistic")) include("bijectors_utils.jl") export AbstractComponentArrayInterpreter, ComponentArrayInterpreter, @@ -42,10 +50,12 @@ include("ComponentArrayInterpreter.jl") export AbstractModelApplicator, construct_ChainsApplicator export construct_3layer_MLApplicator, select_ml_engine export NullModelApplicator, MagnitudeModelApplicator, NormalScalingModelApplicator +export RangeScalingModelApplicator include("ModelApplicator.jl") export AbstractPBMApplicator, NullPBMApplicator, PBMSiteApplicator, PBMPopulationApplicator -export DirectPBMApplicator +export DirectPBMApplicator, PBMPopulationGlobalApplicator +export create_nsite_applicator include("PBMApplicator.jl") # export AbstractGPUDataHandler, NullGPUDataHandler, get_default_GPUHandler @@ -64,7 +74,7 @@ export AbstractHybridProblem, get_hybridproblem_MLapplicator, get_hybridproblem_ get_hybridproblem_pbmpar_covars, gen_cov_pred, construct_dataloader_from_synthetic, - gdev_hybridproblem_dataloader, + gdev_hybridproblem_dataloader, gdev_hybridproblem_data, setup_PBMpar_interpreter, get_gdev_MP include("AbstractHybridProblem.jl") @@ -79,7 +89,7 @@ export HybridProblem export get_quantile_transformed include("HybridProblem.jl") -export gf, get_loss_gf +export gf, get_loss_gf, predict_point_hvi #export map_f_each_site include("gf.jl") @@ -92,6 +102,7 @@ include("util_opt.jl") export cpu_ca, apply_preserve_axes include("util_ca.jl") +export ones_similar_x 
include("util_gpu.jl") export neg_logden_indep_normal, entropy_MvNormal @@ -112,4 +123,7 @@ include("HybridSolver.jl") export DoubleMM include("DoubleMM/DoubleMM.jl") +export RRuleMonitor +include("RRuleMonitor.jl") + end diff --git a/src/ModelApplicator.jl b/src/ModelApplicator.jl index 4d08fc0..d0077cc 100644 --- a/src/ModelApplicator.jl +++ b/src/ModelApplicator.jl @@ -1,5 +1,5 @@ """ - AbstractModelApplicator(x, ϕ) + AbstractModelApplicator(x, ϕ; is_testmode = false) Abstraction of applying a machine learning model at covariate matrix, `x`, using parameters, `ϕ`. It returns a matrix of predictions with the same @@ -20,7 +20,7 @@ abstract type AbstractModelApplicator end function apply_model end -(app::AbstractModelApplicator)(x, ϕ) = apply_model(app, x, ϕ) +(app::AbstractModelApplicator)(x, ϕ; kwargs...) = apply_model(app, x, ϕ; kwargs...) """ @@ -30,7 +30,7 @@ Model applicator that returns its inputs. Used for testing. """ struct NullModelApplicator <: AbstractModelApplicator end -function apply_model(app::NullModelApplicator, x, ϕ) +function apply_model(app::NullModelApplicator, x, ϕ; kwargs...) return x end @@ -102,13 +102,15 @@ struct MagnitudeModelApplicator{M,A} <: AbstractModelApplicator multiplier::M end -function apply_model(app::MagnitudeModelApplicator, x, ϕ) +function apply_model(app::MagnitudeModelApplicator, x, ϕ; kwargs...) #@show size(x), size(ϕ), app.multiplier @assert eltype(app.multiplier) == eltype(ϕ) - apply_model(app.app, x, ϕ) .* app.multiplier + apply_model(app.app, x, ϕ; kwargs...) .* app.multiplier end + + """ NormalScalingModelApplicator(app, μ, σ) NormalScalingModelApplicator(app, priors, transM) @@ -158,14 +160,54 @@ function NormalScalingModelApplicator( NormalScalingModelApplicator(app, μ, σ) end -function apply_model(app::NormalScalingModelApplicator, x, ϕ) - y_perc = apply_model(app.app, x, ϕ) +function apply_model(app::NormalScalingModelApplicator, x, ϕ; kwargs...) + y_perc = apply_model(app.app, x, ϕ; kwargs...) # @show typeof(app.μ) # @show typeof(ϕ) @assert eltype(app.μ) == eltype(ϕ) - norminvcdf.(app.μ, app.σ, y_perc) # from StatsFuns + ans = norminvcdf.(app.μ, app.σ, y_perc) # from StatsFuns + # if !all(isfinite.(ans)) + # @info "NormalScalingModelApplicator.apply_model: encountered non-finite results" + # #@show ans, y_perc, app.μ, app.σ + # #@show app.app, x, ϕ + # #error("error to print stacktrace") + # end + ans +end + +""" + RangeScalingModelApplicator(app, y0) + +Wrapper around AbstractModelApplicator assumed to predict on (0,1) with +a linear mappting to prededfined range. +""" +struct RangeScalingModelApplicator{VF,A} <: AbstractModelApplicator + offset::VF + width::VF + app::A end +function apply_model(app::RangeScalingModelApplicator, x, ϕ; kwargs...) + res0 = apply_model(app.app, x, ϕ; kwargs...) + res = res0 .* app.width .+ app.offset +end + +""" + RangeScalingModelApplicator(app, lowers, uppers, FT::Type; repeat_inner::Integer = 1) + +Provide the target ragen by vectors `lower` and `upper`. The size of both +outputs must correspond to the size of the output of `app`. 
+""" +function RangeScalingModelApplicator( + app::AbstractModelApplicator, + lowers::VT, uppers::VT, + FT::Type) where VT<:AbstractVector + width = collect(FT, uppers .- lowers) + lowersFT = collect(FT, lowers) # convert eltype + RangeScalingModelApplicator(lowersFT, width, app) +end + + diff --git a/src/PBMApplicator.jl b/src/PBMApplicator.jl index c354d02..9543aee 100644 --- a/src/PBMApplicator.jl +++ b/src/PBMApplicator.jl @@ -28,6 +28,12 @@ function (app::AbstractPBMApplicator)(θP::AbstractArray, θMs::AbstractArray, x apply_model(app, θP, θMs, xP) end +""" +create_nsite_applicator(app::AbstractPBMApplicator, n_site) +""" +function create_nsite_applicator end + + """ apply_model(app::AbstractPBMApplicator, θsP::AbstractVector, θsMs::AbstractMatrix, xP::AbstractMatrix) apply_model(app::AbstractPBMApplicator, θsP::AbstractMatrix, θsMs::AbstractArray{ET,3}, xP) @@ -90,6 +96,8 @@ function apply_model(app::NullPBMApplicator, θP::AbstractVector, θMs::Abstract return CA.getdata(θMs) end +create_nsite_applicator(app::NullPBMApplicator, n_site) = app + """ DirectPBMApplicator() @@ -103,6 +111,8 @@ end function apply_model(app::DirectPBMApplicator, θP::AbstractVector, θMs::AbstractMatrix, xP) return app.f(θP, θMs, xP) end +create_nsite_applicator(app::DirectPBMApplicator, n_site) = app + @@ -114,7 +124,7 @@ struct PBMSiteApplicator{F, IT, IXT, VFT} <: AbstractPBMApplicator end """ - PBMSiteApplicator(fθ, n_batch; θP, θM, θFix, xPvec) + PBMSiteApplicator(fθ; θP, θM, θFix, xPvec) Construct AbstractPBMApplicator from process-based model `fθ` that computes predictions for a single site. @@ -137,19 +147,24 @@ function PBMSiteApplicator(fθ; ) intθ1 = get_concrete(ComponentArrayInterpreter(flatten1(CA.ComponentVector(; θP, θM, θFix)))) int_xPsite = get_concrete(ComponentArrayInterpreter(xPvec)) - PBMSiteApplicator(fθ, intθ1, int_xPsite, CA.getdata(θFix)) + PBMSiteApplicator(fθ, intθ1, int_xPsite, θFix) +end + +function create_nsite_applicator(app::PBMSiteApplicator, n_site) + PBMSiteApplicator(app.fθ, app.intθ1, app.int_xPsite, app.θFix) end + function apply_model(app::PBMSiteApplicator, θP::AbstractVector, θMs::AbstractMatrix, xP) function apply_PBMsite(θM, xP1) if (CA.getdata(θP) isa GPUArraysCore.AbstractGPUArray) && - (!(app.θFix isa GPUArraysCore.AbstractGPUArray) || + (!(CA.getdata(app.θFix) isa GPUArraysCore.AbstractGPUArray) || !(CA.getdata(θMs) isa GPUArraysCore.AbstractGPUArray)) error("concatenating GPUarrays with non-gpu arrays θFix or θMs. " * "May fmap PBMModelapplicators to gdev, " * "or compute PBMmodel on CPU") end - θ = vcat(CA.getdata(θP), CA.getdata(θM), app.θFix) + θ = vcat(CA.getdata(θP), CA.getdata(θM), CA.getdata(app.θFix)) θc = app.intθ1(θ); # show errors without ";" xPc = app.int_xPsite(xP1); ans = CA.getdata(app.fθ(θc, xPc)) @@ -189,10 +204,10 @@ end @functor PBMPopulationApplicator (θFixm, rep_fac) """ - PBMPopulationApplicator(fθpop, n_batch; θP, θM, θFix, xPvec) + PBMPopulationApplicator(fθpop, n_site; θP, θM, θFix, xPvec) Construct AbstractPBMApplicator from process-based model `fθ` that computes predictions -across sites for a population of size `n_batch`. +across sites for a population of size `n_site`. The applicator combines enclosed `θFix`, with provided `θMs` and `θP` to a `ComponentMatrix` with parameters with one row for each site, that can be column-indexed by Symbols. @@ -203,32 +218,48 @@ can be column-indexed by Symbols. 
diff --git a/src/PBMApplicator.jl b/src/PBMApplicator.jl
index c354d02..9543aee 100644
--- a/src/PBMApplicator.jl
+++ b/src/PBMApplicator.jl
@@ -28,6 +28,12 @@ function (app::AbstractPBMApplicator)(θP::AbstractArray, θMs::AbstractArray, x
    apply_model(app, θP, θMs, xP)
end

+"""
+    create_nsite_applicator(app::AbstractPBMApplicator, n_site)
+
+Construct a variant of `app` that predicts for `n_site` sites rather than for
+the batch size that `app` was constructed with.
+"""
+function create_nsite_applicator end
+
+
"""
    apply_model(app::AbstractPBMApplicator, θsP::AbstractVector, θsMs::AbstractMatrix, xP::AbstractMatrix)
    apply_model(app::AbstractPBMApplicator, θsP::AbstractMatrix, θsMs::AbstractArray{ET,3}, xP)
@@ -90,6 +96,8 @@ function apply_model(app::NullPBMApplicator, θP::AbstractVector, θMs::Abstract
    return CA.getdata(θMs)
end

+create_nsite_applicator(app::NullPBMApplicator, n_site) = app
+
"""
    DirectPBMApplicator()
@@ -103,6 +111,8 @@ end
function apply_model(app::DirectPBMApplicator, θP::AbstractVector, θMs::AbstractMatrix, xP)
    return app.f(θP, θMs, xP)
end
+create_nsite_applicator(app::DirectPBMApplicator, n_site) = app
+

@@ -114,7 +124,7 @@ struct PBMSiteApplicator{F, IT, IXT, VFT} <: AbstractPBMApplicator
end

"""
-    PBMSiteApplicator(fθ, n_batch; θP, θM, θFix, xPvec)
+    PBMSiteApplicator(fθ; θP, θM, θFix, xPvec)

Construct AbstractPBMApplicator from process-based model `fθ` that computes
predictions for a single site.
@@ -137,19 +147,24 @@ function PBMSiteApplicator(fθ;
    )
    intθ1 = get_concrete(ComponentArrayInterpreter(flatten1(CA.ComponentVector(; θP, θM, θFix))))
    int_xPsite = get_concrete(ComponentArrayInterpreter(xPvec))
-    PBMSiteApplicator(fθ, intθ1, int_xPsite, CA.getdata(θFix))
+    PBMSiteApplicator(fθ, intθ1, int_xPsite, θFix)
+end
+
+function create_nsite_applicator(app::PBMSiteApplicator, n_site)
+    PBMSiteApplicator(app.fθ, app.intθ1, app.int_xPsite, app.θFix)
end
+
function apply_model(app::PBMSiteApplicator, θP::AbstractVector, θMs::AbstractMatrix, xP)
    function apply_PBMsite(θM, xP1)
        if (CA.getdata(θP) isa GPUArraysCore.AbstractGPUArray) &&
-            (!(app.θFix isa GPUArraysCore.AbstractGPUArray) ||
+            (!(CA.getdata(app.θFix) isa GPUArraysCore.AbstractGPUArray) ||
            !(CA.getdata(θMs) isa GPUArraysCore.AbstractGPUArray))
            error("concatenating GPUarrays with non-gpu arrays θFix or θMs. " *
                  "May fmap PBMModelapplicators to gdev, " *
                  "or compute PBMmodel on CPU")
        end
-        θ = vcat(CA.getdata(θP), CA.getdata(θM), app.θFix)
+        θ = vcat(CA.getdata(θP), CA.getdata(θM), CA.getdata(app.θFix))
        θc = app.intθ1(θ); # show errors without ";"
        xPc = app.int_xPsite(xP1);
        ans = CA.getdata(app.fθ(θc, xPc))
@@ -189,10 +204,10 @@ end
@functor PBMPopulationApplicator (θFixm, rep_fac)

"""
-    PBMPopulationApplicator(fθpop, n_batch; θP, θM, θFix, xPvec)
+    PBMPopulationApplicator(fθpop, n_site; θP, θM, θFix, xPvec)

Construct AbstractPBMApplicator from process-based model `fθ` that computes predictions
-across sites for a population of size `n_batch`.
+across sites for a population of size `n_site`.
The applicator combines enclosed `θFix`, with provided `θMs` and `θP`
to a `ComponentMatrix` with parameters with one row for each site, that
can be column-indexed by Symbols.
@@ -203,32 +218,48 @@
    - `θc`: parameters: `ComponentMatrix` (n_site x n_par) with each row a parameter vector
    - `xPc`: observations: `ComponentMatrix` (n_obs x n_site) with each column observations for one site
-- `n_batch`: number of indiduals, i.e. rows in `θMs`
+- `n_site`: number of individuals, i.e. rows in `θMs`
- `θP`: `ComponentVector` template of global process model parameters
- `θM`: `ComponentVector` template of individual process model parameters
- `θFix`: `ComponentVector` of actual fixed process model parameters
- `xPvec`: `ComponentVector` template of model drivers for a single site
"""
-function PBMPopulationApplicator(fθpop, n_batch;
+function PBMPopulationApplicator(fθpop, n_site;
    θP::CA.ComponentVector, θM::CA.ComponentVector, θFix::CA.ComponentVector,
    xPvec::CA.ComponentVector
    )
    intθvec = ComponentArrayInterpreter(flatten1(CA.ComponentVector(; θP, θM, θFix)))
    int_xP_vec = ComponentArrayInterpreter(xPvec)
-    isFix = repeat(axes(θFix, 1)', n_batch)
+    isFix = repeat(axes(θFix, 1)', n_site)
    #
-    intθ = get_concrete(ComponentArrayInterpreter((n_batch,), intθvec))
-    int_xP = get_concrete(ComponentArrayInterpreter(int_xP_vec, (n_batch,)))
-    #isP = repeat(axes(θP, 1)', n_batch)
+    intθ = get_concrete(ComponentArrayInterpreter((n_site,), intθvec))
+    int_xP = get_concrete(ComponentArrayInterpreter(int_xP_vec, (n_site,)))
+    #isP = repeat(axes(θP, 1)', n_site)
    # n_site = size(θMs, 1)
-    rep_fac = ones_similar_x(θP, n_batch) # to reshape into matrix, avoiding repeat
-    θFixm = CA.getdata(θFix[isFix])
+    rep_fac = ones_similar_x(θP, n_site) # to reshape into matrix, avoiding repeat
+    θFixm = CA.ComponentMatrix(θFix[isFix], (CA.FlatAxis(), CA.getaxes(θFix)[1]))
    PBMPopulationApplicator(fθpop, θFixm, rep_fac, intθ, int_xP)
end

+function create_nsite_applicator(app::PBMPopulationApplicator, n_site)
+    θFix = app.θFixm[1,:]
+    isFix = repeat(axes(θFix, 1)', n_site)
+    θFixm = if length(θFix) == 0
+        CA.ComponentMatrix(θFix[isFix], (CA.FlatAxis(), CA.FlatAxis()))
+    else
+        CA.ComponentMatrix(θFix[isFix], (CA.FlatAxis(), CA.getaxes(θFix)[1]))
+    end
+    #
+    intθ = get_concrete(ComponentArrayInterpreter((n_site,), (CA.getaxes(app.intθ)[2],),()))
+    int_xP = get_concrete(ComponentArrayInterpreter(
+        (), (CA.getaxes(app.int_xP)[1],), (n_site,)))
+    rep_fac = ones_similar_x(θFix, n_site) # to reshape into matrix, avoiding repeat
+    PBMPopulationApplicator(app.fθpop, θFixm, rep_fac, intθ, int_xP)
+end
+
function apply_model(app::PBMPopulationApplicator, θP::AbstractVector, θMs::AbstractMatrix, xP)
    if (CA.getdata(θP) isa GPUArraysCore.AbstractGPUArray) &&
-        (!(app.θFixm isa GPUArraysCore.AbstractGPUArray) ||
+        (!(CA.getdata(app.θFixm) isa GPUArraysCore.AbstractGPUArray) ||
        !(CA.getdata(θMs) isa GPUArraysCore.AbstractGPUArray))
        error("concatenating GPUarrays with non-gpu arrays θFixm or θMs. " *
" * "May transfer PBMPopulationApplicator to gdev, " * @@ -241,7 +272,7 @@ function apply_model(app::PBMPopulationApplicator, θP::AbstractVector, θMs::Ab #@benchmark CA.getdata(θP[app.isP]) #@benchmark CA.getdata(repeat(θP', size(θMs,1))) #@benchmark rep_fac .* CA.getdata(θP)' # - local θ = hcat(app.rep_fac .* CA.getdata(θP)', CA.getdata(θMs), app.θFixm) + local θ = hcat(app.rep_fac .* CA.getdata(θP)', CA.getdata(θMs), CA.getdata(app.θFixm)) #local θ = hcat(CA.getdata(θP[app.isP]), CA.getdata(θMs), app.θFixm) #local θ = hcat(CA.getdata(repeat(θP', size(θMs,1))), CA.getdata(θMs), app.θFixm) local θc = app.intθ(CA.getdata(θ)) @@ -250,3 +281,77 @@ function apply_model(app::PBMPopulationApplicator, θP::AbstractVector, θMs::Ab return pred_sites end +struct PBMPopulationGlobalApplicator{MFT, IsT, IgT, IXT, F} <: AbstractPBMApplicator + fθpop::F + θFix::MFT # may be CuVector rather than Vector + intθs::IsT + intθg::IgT + int_xP::IXT +end + +@functor PBMPopulationGlobalApplicator + + +""" + PBMPopulationGlobalApplicator(fθpop, n_site; θP, θM, θFix, xPvec) + +Construct AbstractPBMApplicator from process-based model `fθ` that computes predictions +across sites for a population of size `n_site`. +The applicator combines enclosed `θFix`, with provided `θMs` and `θP` +to a `ComponentMatrix` with parameters with one row for each site, that +can be column-indexed by Symbols. + +## Arguments +- `fθpop`: process model, process model `f(θsc, θgc, xPc)`, which is not + agnostic of the partitioning of parameters into fixed, global, and individual + to increase performance + - `θc`: parameters: `ComponentMatrix` (n_site x n_par_site) with each row a parameter vector + - `θgc`: parameters: `ComponentVector` (n_par_global) + - `xPc`: observations: `ComponentMatrix` (n_obs x n_site) with each column + observationsfor one site +- `n_site`: number of indiduals, i.e. rows in `θMs` +- `θP`: `ComponentVector` template of global process model parameters +- `θM`: `ComponentVector` template of individual process model parameters +- `θFix`: `ComponentVector` of actual fixed process model parameters +- `xPvec`: `ComponentVector` template of model drivers for a single site +""" +function PBMPopulationGlobalApplicator(fθpop, n_site; + θP::CA.ComponentVector, θM::CA.ComponentVector, θFix::CA.ComponentVector, + xPvec::CA.ComponentVector + ) + #intθvec = ComponentArrayInterpreter(flatten1(CA.ComponentVector(; θP, θM, θFix))) + int_xP_vec = ComponentArrayInterpreter(xPvec) + intθs = get_concrete(ComponentArrayInterpreter((n_site,), θM)) + intθg = get_concrete(ComponentArrayInterpreter(vcat(θP, θFix))) + int_xP = get_concrete(ComponentArrayInterpreter(int_xP_vec, (n_site,))) + PBMPopulationGlobalApplicator(fθpop, θFix, intθs, intθg, int_xP) +end + +function create_nsite_applicator(app::PBMPopulationGlobalApplicator, n_site) + @info("called PBMPopulationGlobalApplicator.create_nsite_applicator") + intθs = get_concrete(ComponentArrayInterpreter((n_site,), (CA.getaxes(app.intθs)[2],),())) + int_xP = get_concrete(ComponentArrayInterpreter( + (), (CA.getaxes(app.int_xP)[1],), (n_site,))) + PBMPopulationGlobalApplicator(app.fθpop, app.θFix, intθs, app.intθg, int_xP) +end + + +function apply_model(app::PBMPopulationGlobalApplicator, θP::AbstractVector, θMs::AbstractMatrix, xP) + if (CA.getdata(θP) isa GPUArraysCore.AbstractGPUArray) && + (!(CA.getdata(app.θFix) isa GPUArraysCore.AbstractGPUArray) || + !(CA.getdata(θMs) isa GPUArraysCore.AbstractGPUArray)) + error("concatenating GPUarrays with non-gpu arrays θFixm or θMs. 
" * + "May transfer PBMPopulationGlobalApplicator to gdev, " * + "or compute PBM on CPU.") + end + local θs = CA.getdata(θMs) + local θg = vcat(CA.getdata(θP), CA.getdata(app.θFix)) + local θsc = app.intθs(CA.getdata(θs)) + local θgc = app.intθg(CA.getdata(θg)) + local xPc = app.int_xP(CA.getdata(xP)) + local pred_sites = app.fθpop(θsc, θgc, xPc) + return pred_sites +end + + + diff --git a/src/RRuleMonitor.jl b/src/RRuleMonitor.jl new file mode 100644 index 0000000..88a73ea --- /dev/null +++ b/src/RRuleMonitor.jl @@ -0,0 +1,118 @@ +""" + RRuleMonitor(label, f, [ad_backend::ADTypes.AbstractADType]) + +Identity mapping of Callable or function `f` that intervenes the the pullback +and raises an error if the supplied cotangent or the jacobian +contains non-finitie entries. + +Arguments +- label: id (String, or symbol) used in the error message. +- `ad_backend`: the AD backend used in `DifferentiationInterface.jacobian`. + Defaults to `AutoZygote().` +""" +struct RRuleMonitor{F,L,A} <: AbstractModelApplicator + label::L + f::F + ad_backend::A + #TODO think of preparing: https://juliadiff.org/DifferentiationInterface.jl/DifferentiationInterface/stable/tutorials/basic/#Preparing-for-multiple-gradients +end + +RRuleMonitor(label, f) = RRuleMonitor(label, f, DI.AutoZygote()) +#RRuleMonitor(label, f) = RRuleMonitor{typeof(label), typeof(f), DI.AutoZygote}(label, f, DI.AutoZygote()) + +@functor RRuleMonitor + +function (m::RRuleMonitor)(args...; kwargs...) + apply_RRuleMonitor(m, args...; kwargs...) +end + +function apply_RRuleMonitor(m::RRuleMonitor, args...; kwargs...) + m.f(args...; kwargs...) +end + +# AbstractModelApplicator +apply_model(m::RRuleMonitor, x, ϕ; kwargs...) = m.f(x, ϕ; kwargs...) + +function ChainRulesCore.rrule(::typeof(apply_RRuleMonitor), m::RRuleMonitor, args...; kwargs...) + function apply_RRuleMonitor_pullback(Δy) + # if m.label == "transP" + # @show Δy[:] + # end + if !all(isfinite.(Δy[:])) + @info "apply_RRuleMonitor_pullback: encountered non-finite co-gradients Δy " * + "for RRuleMonitor $(m.label)" + #@show Δy[:] + #Main.@infiltrate_main + error("traceback") + end + # do not call apply_RRuleMonitor because that results in infinite recursion + # for backends other than AutoZygote need to call jacobian for single argument function + jacs = if m.ad_backend isa DI.AutoZygote + function ftmp(args_...) + m.f(args_...; kwargs...) + end + jacs = Zygote.jacobian(ftmp, args...) + else + Tuple(begin + fxi = (x) -> m.f(args[1:(i-1)]..., x, args[i+1:end]...; kwargs...) + DI.jacobian(fxi, m.ad_backend, args[i]) + end for (i,x) in enumerate(args)) + end + for (i,jac) in enumerate(jacs) + if !all(isfinite.(jac)) + #@show jac + @info("apply_RRuleMonitor_pullback: encountered non-finite Jacobian " * + "for argument $(i) in RRuleMonitor $(m.label)") + #Main.@infiltrate_main + error("traceback") + end + end + projectors = (ProjectTo(arg) for arg in args) + # if m.label == "f in gf" && Main.cnt_SteadySOCPools >= 1118 + # Main.@infiltrate_main + # end + #(pr,jac,x) = first(zip(projectors,jacs,args)) + Δx = (@thunk(pr(reshape(jac' * vec(Δy), size(x)))) for (pr,jac,x) in zip( + projectors,jacs,args)) + # Δx = Tuple(begin + # if size(jac',2) != size(Δy,1) + # Main.@infiltrate_main + # end + # Δxi = jac' * Δy[:] + # Δxip = pr(reshape(Δxi, size(x))) + # end for (pr,jac,x) in zip(projectors,jacs,args)) + (NoTangent(), NoTangent(), Δx...) 
diff --git a/src/RRuleMonitor.jl b/src/RRuleMonitor.jl
new file mode 100644
index 0000000..88a73ea
--- /dev/null
+++ b/src/RRuleMonitor.jl
@@ -0,0 +1,118 @@
+"""
+    RRuleMonitor(label, f, [ad_backend::ADTypes.AbstractADType])
+
+Identity mapping of a callable or function `f` that intercepts the pullback
+and raises an error if the supplied cotangent or the Jacobian
+contains non-finite entries.
+
+Arguments
+- label: id (String, or symbol) used in the error message.
+- `ad_backend`: the AD backend used in `DifferentiationInterface.jacobian`.
+    Defaults to `AutoZygote()`.
+"""
+struct RRuleMonitor{F,L,A} <: AbstractModelApplicator
+    label::L
+    f::F
+    ad_backend::A
+    #TODO think of preparing: https://juliadiff.org/DifferentiationInterface.jl/DifferentiationInterface/stable/tutorials/basic/#Preparing-for-multiple-gradients
+end
+
+RRuleMonitor(label, f) = RRuleMonitor(label, f, DI.AutoZygote())
+#RRuleMonitor(label, f) = RRuleMonitor{typeof(label), typeof(f), DI.AutoZygote}(label, f, DI.AutoZygote())
+
+@functor RRuleMonitor
+
+function (m::RRuleMonitor)(args...; kwargs...)
+    apply_RRuleMonitor(m, args...; kwargs...)
+end
+
+function apply_RRuleMonitor(m::RRuleMonitor, args...; kwargs...)
+    m.f(args...; kwargs...)
+end
+
+# AbstractModelApplicator
+apply_model(m::RRuleMonitor, x, ϕ; kwargs...) = m.f(x, ϕ; kwargs...)
+
+function ChainRulesCore.rrule(::typeof(apply_RRuleMonitor), m::RRuleMonitor, args...; kwargs...)
+    function apply_RRuleMonitor_pullback(Δy)
+        # if m.label == "transP"
+        #     @show Δy[:]
+        # end
+        if !all(isfinite.(Δy[:]))
+            @info "apply_RRuleMonitor_pullback: encountered non-finite co-gradients Δy " *
+                  "for RRuleMonitor $(m.label)"
+            #@show Δy[:]
+            #Main.@infiltrate_main
+            error("traceback")
+        end
+        # do not call apply_RRuleMonitor because that results in infinite recursion
+        # for backends other than AutoZygote need to call jacobian for single argument function
+        jacs = if m.ad_backend isa DI.AutoZygote
+            function ftmp(args_...)
+                m.f(args_...; kwargs...)
+            end
+            jacs = Zygote.jacobian(ftmp, args...)
+        else
+            Tuple(begin
+                fxi = (x) -> m.f(args[1:(i-1)]..., x, args[i+1:end]...; kwargs...)
+                DI.jacobian(fxi, m.ad_backend, args[i])
+            end for (i,x) in enumerate(args))
+        end
+        for (i,jac) in enumerate(jacs)
+            if !all(isfinite.(jac))
+                #@show jac
+                @info("apply_RRuleMonitor_pullback: encountered non-finite Jacobian " *
+                      "for argument $(i) in RRuleMonitor $(m.label)")
+                #Main.@infiltrate_main
+                error("traceback")
+            end
+        end
+        projectors = (ProjectTo(arg) for arg in args)
+        # if m.label == "f in gf" && Main.cnt_SteadySOCPools >= 1118
+        #     Main.@infiltrate_main
+        # end
+        #(pr,jac,x) = first(zip(projectors,jacs,args))
+        Δx = (@thunk(pr(reshape(jac' * vec(Δy), size(x)))) for (pr,jac,x) in zip(
+            projectors,jacs,args))
+        # Δx = Tuple(begin
+        #     if size(jac',2) != size(Δy,1)
+        #         Main.@infiltrate_main
+        #     end
+        #     Δxi = jac' * Δy[:]
+        #     Δxip = pr(reshape(Δxi, size(x)))
+        # end for (pr,jac,x) in zip(projectors,jacs,args))
+        (NoTangent(), NoTangent(), Δx...)
+    end
+    return apply_RRuleMonitor(m, args...; kwargs...), apply_RRuleMonitor_pullback
+end
+
+# # with DI support only functions of one argument
+# function ChainRulesCore.rrule(::typeof(apply_RRuleMonitor), m::RRuleMonitor, x; kwargs...)
+#     function apply_RRuleMonitor_pullback(Δy)
+#         if !all(isfinite.(Δy))
+#             @info "apply_RRuleMonitor_pullback: encountered non-finite co-gradients Δy " *
+#                   "for RRuleMonitor $(m.label)"
+#             #@show Δy[:]
+#             #Main.@infiltrate_main
+#             error("traceback")
+#         end
+#         # do not call apply_RRuleMonitor because that results in infinite recursion
+#         ftmp = (x_) -> m.f(x_; kwargs...)
+#         jac = DI.jacobian(ftmp, m.ad_backend, x)
+#         if !all(isfinite.(jac))
+#             #@show jac
+#             @info("apply_RRuleMonitor_pullback: encountered non-finite Jacobian " *
+#                   "in RRuleMonitor $(m.label)")
+#             #Main.@infiltrate_main
+#             error("traceback")
+#         end
+#         pr = ProjectTo(x)
+#         if m.label == "f in gf"
+#             #Main.@infiltrate_main
+#         end
+#         Δx = @thunk(pr(reshape(jac' * vec(Δy), size(x))))
+#         (NoTangent(), NoTangent(), Δx)
+#     end
+#     return apply_RRuleMonitor(m, args...; kwargs...), apply_RRuleMonitor_pullback
+# end
+
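A hedged usage sketch of `RRuleMonitor` (condensed from `test/test_RRuleMonitor.jl` added below): the forward pass is unchanged, but a pullback that produces non-finite values fails loudly instead of silently propagating NaNs:

``` julia
import Zygote
using HybridVariationalInference

# returns NaN for negative entries; its Jacobian is non-finite there
function ftestsqrt(x)
    is_pos = x .>= zero(x)
    sqrt.(is_pos .* x) .+ .!is_pos .* convert(eltype(x), NaN)
end

m = RRuleMonitor("ftestsqrt", ftestsqrt)
m([4.0, -1.0])                                # forward unchanged: [2.0, NaN]
Zygote.gradient(x -> sum(m(x)), [4.0, -1.0])  # throws ErrorException
```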
diff --git a/src/bijectors_utils.jl b/src/bijectors_utils.jl
index 9061ebb..20d437b 100644
--- a/src/bijectors_utils.jl
+++ b/src/bijectors_utils.jl
@@ -27,6 +27,35 @@ end
# end
Bijectors.is_monotonically_increasing(::Exp) = true

+#----------------------- Logistic
+"""
+    Logistic()
+
+A bijector that applies the broadcasted logistic function, i.e. `logistic.(x)`.
+It is equivalent to `elementwise(logistic)` but works better with automatic
+differentiation on GPU.
+"""
+struct Logistic <: Bijector
+end
+
+#Functors.@functor Logistic
+Bijectors.transform(b::Logistic, x) = logistic.(x) # note the broadcast
+Bijectors.transform(ib::Inverse{<:Logistic}, y) = logit.(y)
+
+# `logabsdetjac`
+# https://en.wikipedia.org/wiki/Logistic_function#Derivative
+Bijectors.logabsdetjac(b::Logistic, x) = sum(loglogistic.(x) + log1mlogistic.(x))
+
+# `with_logabsdet_jacobian`
+function Bijectors.with_logabsdet_jacobian(b::Logistic, x)
+    return transform(b,x), logabsdetjac(b,x)
+end
+# function Bijectors.with_logabsdet_jacobian(ib::Inverse{<:Logistic}, y)
+#     x = transform(ib, y)
+#     return x, -logabsdetjac(inverse(ib), x)
+# end
+Bijectors.is_monotonically_increasing(::Logistic) = true
+
"""
    StackedArray(stacked, nrow)
diff --git a/src/cholesky.jl b/src/cholesky.jl
index 24fd444..7784ad2 100644
--- a/src/cholesky.jl
+++ b/src/cholesky.jl
@@ -147,7 +147,7 @@
This can be used to fit parameters that yield an upper Cholesky-Factor of a
Covariance matrix.
It uses the upper triangular matrix rather than the lower because it
-involes a sum across columns, whereas the alternative of a lower triangular
+involves a sum across columns, whereas the alternative of a lower triangular
uses sum across rows. Sum across columns is often faster, because
entries of columns are contiguous.
"""
diff --git a/src/elbo.jl b/src/elbo.jl
index cc26bec..9f6a461 100644
--- a/src/elbo.jl
+++ b/src/elbo.jl
@@ -51,10 +51,11 @@ function neg_elbo_gtf_components(rng, ϕ::AbstractVector{FT}, g, f, py,
    trans_mMs =StackedArray(transMs.stacked, n_MC),
    priorsP, priorsM,
    floss_penalty = zero_penalty_loss,
+    is_testmode,
    ) where {FT}
    n_MCr = isempty(priors_θP_mean) ?
        n_MC : max(n_MC, n_MC_mean)
    ζsP, ζsMs, σ = generate_ζ(rng, g, ϕ, xM; n_MC=n_MCr, cor_ends, pbm_covar_indices,
-        int_unc, int_μP_ϕg_unc)
+        int_unc, int_μP_ϕg_unc, is_testmode)
    ζsP_cpu = cdev(ζsP) # fetch to CPU, because for <1000 sites (n_batch) this is faster
    ζsMs_cpu = cdev(ζsMs) # fetch to CPU, because for <1000 sites (n_batch) this is faster
    #
@@ -194,22 +195,21 @@ function zero_penalty_loss(
end

-
"""
-    predict_hvi([rng], predict_hvi(rng, prob::AbstractHybridProblem)
+    predict_hvi([rng], prob::AbstractHybridProblem)

Prediction function for hybrid variational inference parameter model.

## Arguments
-- The problem for which to predict
-- xM: covariates for the machine-learning model (ML): Matrix (n_θM x n_site_pred).
-- xP: model drivers for process based model (PBM): Matrix with (n_site_pred) rows.
-    If provided a ComponentArray with a Tuple-Axis in rows, the PBM model can
-    access parts of it, e.g. `xP[:S1,...]`.
+- `prob`: The problem for which to predict

## Keyword arguments
- `scenario`
- `n_sample_pred`
+- `xM`: covariates for the machine-learning model (ML): Matrix (n_θM x n_site_pred).
+    Possibility to override the default from `get_hybridproblem_train_dataloader`.
+- `xP`: model drivers for process based model (PBM): Matrix with (n_site_pred) rows.
+    Possibility to override the default from `get_hybridproblem_train_dataloader`.

Returns a NamedTuple `(; y, θsP, θsMs, entropy_ζ)` with entries
- `y`: Array `(n_obs, n_site, n_sample_pred)` of model predictions.
@@ -222,20 +222,27 @@
"""
function predict_hvi(rng, prob::AbstractHybridProblem; scenario=Val(()),
    gdevs = get_gdev_MP(scenario),
+    xM = nothing, xP = nothing,
+    is_testmode = true,
    kwargs...
    )
-    dl = get_hybridproblem_train_dataloader(prob; scenario)
-    dl_dev = gdev_hybridproblem_dataloader(dl; gdevs)
-    xM, xP = dl_dev.data[1:2]
-    (; θsP, θsMs, entropy_ζ) = sample_posterior(rng, prob, xM; scenario, gdevs, kwargs...)
+    if isnothing(xM) || isnothing(xP)
+        dl = get_hybridproblem_train_dataloader(prob; scenario)
+        dl_dev = gdev_hybridproblem_dataloader(dl; gdevs)
+        xM_dl, xP_dl = dl_dev.data[1:2]
+        xM = isnothing(xM) ? xM_dl : xM
+        xP = isnothing(xP) ? xP_dl : xP
+    end
+    (; θsP, θsMs, entropy_ζ) = sample_posterior(
+        rng, prob, xM; scenario, gdevs, is_testmode, kwargs...)
    #
    n_site, n_batch = get_hybridproblem_n_site_and_batch(prob; scenario)
-    n_site_pred = size(θsMs,1)
-    is_predict_batch = (n_site_pred == n_batch)
+    n_site_pred = size(θsMs,1) # determined by size(xM)
    @assert size(xP, 2) == n_site_pred
-    f = get_hybridproblem_PBmodel(prob; scenario, use_all_sites=!is_predict_batch)
+    f_batch = get_hybridproblem_PBmodel(prob; scenario)
+    f = n_site_pred == n_batch ? f_batch : create_nsite_applicator(f_batch, n_site_pred)
    if gdevs.gdev_P isa MLDataDevices.AbstractGPUDevice
-        f_dev = fmap(gdevs.gdev_P, f)
+        f_dev = gdevs.gdev_P(f) # fmap(gdevs.gdev_P, f)
    else
        f_dev = f
    end
@@ -323,11 +330,11 @@ function sample_posterior(rng, g, ϕ::AbstractVector, xM::AbstractMatrix;
    cor_ends, pbm_covar_indices,
    is_inferred::Val{is_infer} = Val(false),
-    kwargs...
+    is_testmode,
    ) where is_infer
    ζsP_gpu, ζsMs_gpu, σ = generate_ζ(rng, g, CA.getdata(ϕ), CA.getdata(xM);
        int_μP_ϕg_unc, int_unc,
-        n_MC=n_sample_pred, cor_ends, pbm_covar_indices)
+        n_MC=n_sample_pred, cor_ends, pbm_covar_indices, is_testmode)
    ζsP = cdev(ζsP_gpu)
    ζsMs = cdev(ζsMs_gpu)
    logdetΣ = 2 * sum(log.(σ))
@@ -354,7 +361,9 @@ each MC sample and then transforming each parameter on block across sites.
function generate_ζ(rng, g, ϕ::AbstractVector{FT}, xM::MT;
    int_μP_ϕg_unc::AbstractComponentArrayInterpreter,
    int_unc::AbstractComponentArrayInterpreter,
-    n_MC=3, cor_ends, pbm_covar_indices) where {FT,MT}
+    n_MC=3, cor_ends, pbm_covar_indices,
+    is_testmode,
+    ) where {FT,MT}
    # see documentation of neg_elbo_gtf
    ϕc = int_μP_ϕg_unc(CA.getdata(ϕ))
    μ_ζP = CA.getdata(ϕc.μP)
@@ -364,7 +373,7 @@
    xMP0 = _append_each_covars(xM, CA.getdata(μ_ζP), pbm_covar_indices)
    #Main.@infiltrate_main
-    μ_ζMs0 = g(xMP0, ϕg)
+    μ_ζMs0 = g(xMP0, ϕg; is_testmode)
    ζP_resids, ζMs_parfirst_resids, σ = sample_ζresid_norm(rng, μ_ζP, μ_ζMs0, ϕc.unc;
        n_MC, cor_ends, int_unc)
    if pbm_covar_indices isa SA.SVector{0}
        # do not need to predict again but just add the residuals to μ_ζP and μ_ζMs
@@ -561,10 +570,11 @@ function _create_blockdiag(
    tmp = vcat(Uσ)
end

+# TODO replace by KA.rand when it becomes available, see ones_similar
+# https://github.com/JuliaGPU/KernelAbstractions.jl/issues/488
function _create_randn(rng, ::AbstractVector{T}, dims...) where {T}
    randn(rng, T, dims...)
end
-
#moved to HybridVariationalInferenceCUDAExt
#function _create_randn(rng, ::CUDA.CuVector{T}, dims...) where {T}
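With the new keywords, prediction no longer has to use the training data. A hedged sketch (following the updated `test_HybridProblem.jl` below; `probo` denotes the problem returned by `solve`, and `xM_new`/`xP_new` are hypothetical held-out covariates and drivers):

``` julia
using StableRNGs
rng = StableRNG(111)

# default: covariates and drivers come from the training dataloader
(; y, θsP, θsMs, entropy_ζ) = predict_hvi(rng, probo; scenario, n_sample_pred = 12)
size(y)  # (n_obs, n_site, n_sample_pred)

# override xM and xP to predict for other sites (columns are sites)
res = predict_hvi(rng, probo; scenario, xM = xM_new, xP = xP_new)
```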
diff --git a/src/gf.jl b/src/gf.jl
index 30b5a76..5578bef 100644
--- a/src/gf.jl
+++ b/src/gf.jl
@@ -49,28 +49,77 @@
# #map_f_each_site(f_double, θMs_true, stack(Iterators.repeated(CA.getdata(θP_true), size(θMs_true,2))))

"""
-composition f ∘ transM ∘ g: mechanistic model after machine learning parameter prediction
+    predict_point_hvi([rng], prob::AbstractHybridProblem)
+
+Prediction function for the hybrid variational inference parameter model that
+omits the sampling step and returns the prediction at the mean in
+unconstrained space.
+
+## Arguments
+- `prob`: The problem for which to predict
+
+## Keyword arguments
+- `scenario`
+- `gdevs`
+- `xM`: covariates for the machine-learning model (ML): Matrix (n_θM x n_site_pred).
+    Possibility to override the default from `get_hybridproblem_train_dataloader`.
+- `xP`: model drivers for process based model (PBM): Matrix with (n_site_pred) rows.
+    Possibility to override the default from `get_hybridproblem_train_dataloader`.
+
+Returns a NamedTuple `(; y_pred, θMs, θP)` with entries
+- `y_pred`: Matrix `(n_obs, n_site)` of model predictions.
+- `θP`: ComponentVector of PBM model parameters
+    that are kept constant across sites.
+- `θMs`: ComponentMatrix `(n_site, n_θM)` of PBM model parameters
+    that vary by site.
"""
-function gf(prob::AbstractHybridProblem; scenario = Val(()), kwargs...)
-    train_loader = get_hybridproblem_train_dataloader(prob; scenario)
-    train_loader_dev = gdev_hybridproblem_dataloader(train_loader; scenario)
-    xM, xP = train_loader_dev.data[1:2]
-    gf(prob, xM, xP; scenario, kwargs...)
+function predict_point_hvi(rng, prob::AbstractHybridProblem; scenario=Val(()),
+    gdevs = get_gdev_MP(scenario),
+    xM = nothing, xP = nothing,
+    is_testmode = true,
+    kwargs...
+    )
+    if isnothing(xM) || isnothing(xP)
+        dl = get_hybridproblem_train_dataloader(prob; scenario)
+        dl_dev = gdev_hybridproblem_dataloader(dl; gdevs)
+        xM_dl, xP_dl = dl_dev.data[1:2]
+        xM = isnothing(xM) ? xM_dl : xM
+        xP = isnothing(xP) ? xP_dl : xP
+    end
+    y_pred, θMs, θP = gf(prob, xM, xP; scenario, gdevs, is_testmode, kwargs...)
+    θPc = ComponentArrayInterpreter(prob.θP)(θP)
+    θMsc = ComponentArrayInterpreter((size(θMs,1),), prob.θM)(θMs)
+    (;y_pred, θMs=θMsc, θP=θPc)
end
+
+
+
+# function gf(prob::AbstractHybridProblem; scenario = Val(()), kwargs...)
+#     train_loader = get_hybridproblem_train_dataloader(prob; scenario)
+#     train_loader_dev = gdev_hybridproblem_dataloader(train_loader; scenario)
+#     xM, xP = train_loader_dev.data[1:2]
+#     gf(prob, xM, xP; scenario, kwargs...)
+# end
+"""
+composition f ∘ transM ∘ g: mechanistic model after machine learning parameter prediction
+"""
function gf(prob::AbstractHybridProblem, xM::AbstractMatrix, xP::AbstractMatrix;
    scenario = Val(()),
-    gdev = :use_gpu ∈ _val_value(scenario) ? gpu_device() : identity,
-    cdev = gdev isa MLDataDevices.AbstractGPUDevice ? cpu_device() : identity,
+    gdevs = nothing, #get_gdev_MP(scenario),
    is_inferred::Val{is_infer} = Val(false),
    kwargs...
    ) where is_infer
+    gdevs = isnothing(gdevs) ? get_gdev_MP(scenario) : gdevs
    g, ϕg = get_hybridproblem_MLapplicator(prob; scenario)
    n_site, n_batch = get_hybridproblem_n_site_and_batch(prob; scenario)
-    is_predict_batch = (n_batch == size(xP,2))
-    n_site_pred = is_predict_batch ? n_batch : n_site
-    @assert size(xP, 2) == n_site_pred
+    n_site_pred = size(xP,2)
    @assert size(xM, 2) == n_site_pred
-    f = get_hybridproblem_PBmodel(prob; scenario, use_all_sites = !is_predict_batch)
+    f_batch = get_hybridproblem_PBmodel(prob; scenario)
+    f = (n_site_pred == n_batch) ? f_batch : create_nsite_applicator(f_batch, n_site_pred)
+    if gdevs.gdev_P isa MLDataDevices.AbstractGPUDevice
+        f_dev = gdevs.gdev_P(f) # fmap(gdevs.gdev_P, f)
+    else
+        f_dev = f
+    end
    (; θP, θM) = get_hybridproblem_par_templates(prob; scenario)
    (; transP, transM) = get_hybridproblem_transforms(prob; scenario)
    transMs = StackedArray(transM, n_site_pred)
@@ -78,14 +127,15 @@ function gf(prob::AbstractHybridProblem, xM::AbstractMatrix, xP::AbstractMatrix;
    pbm_covars = get_hybridproblem_pbmpar_covars(prob; scenario)
    pbm_covar_indices = CA.getdata(intP(1:length(intP))[pbm_covars])
    ζP = inverse(transP)(θP)
-    g_dev, ϕg_dev, ζP_dev = (gdev(g), gdev(ϕg), gdev(CA.getdata(ζP)))
+    gdev, cdev = gdevs.gdev_M, infer_cdev(gdevs)
+    g_dev, ϕg_dev, xM_dev, ζP_dev = gdev(g), gdev(ϕg), gdev(CA.getdata(xM)), gdev(CA.getdata(ζP))
    # most of the properties of prob are not type-inferred
    # hence result is not type-inferred, but may test at this context
    res = is_infer ? Test.@inferred(
        gf(
-            g_dev, transMs, transP, f, xM, xP, ϕg_dev, ζP_dev, pbm_covar_indices;
+            g_dev, transMs, transP, f_dev, xM_dev, xP, ϕg_dev, ζP_dev, pbm_covar_indices;
            cdev, kwargs...)) :
-        gf(g_dev, transMs, transP, f, xM, xP, ϕg_dev, ζP_dev, pbm_covar_indices;
+        gf(g_dev, transMs, transP, f_dev, xM_dev, xP, ϕg_dev, ζP_dev, pbm_covar_indices;
            cdev, kwargs...)
end

@@ -100,7 +150,7 @@ end
function gf(g::AbstractModelApplicator, transMs, transP, f, xM, xP, ϕg, ζP,
    pbm_covar_indices::AbstractVector{<:Integer};
-    cdev)
+    cdev, is_testmode)
    # @show first(xM,5)
    # @show first(ϕg,5)
@@ -110,10 +160,16 @@
    # end
    #xMP = _append_PBM_covars(xM, intP(ζP), pbm_covars)
    xMP = _append_each_covars(xM, CA.getdata(ζP), pbm_covar_indices)
-    θMs = gtrans(g, transMs, xMP, ϕg; cdev)
+    θMs = gtrans(g, transMs, xMP, ϕg; cdev, is_testmode)
+    # transPM = RRuleMonitor("transP", ζP -> transP(ζP))
+    # θP = transPM(CA.getdata(ζP))
    θP = transP(CA.getdata(ζP))
    θP_cpu = cdev(θP)
    y_pred = f(θP_cpu, θMs, xP)
+    # fM = RRuleMonitor("f in gf", (θP_cpu) -> f(θP_cpu, θMs, xP), DI.AutoForwardDiff())
+    # y_pred = fM(θP_cpu)
+    # fM = RRuleMonitor("f in gf", (θP_cpu, θMs) -> f(θP_cpu, θMs, xP))
+    # y_pred = fM(θP_cpu, θMs) # very slow large JvP with θMs
    return y_pred, θMs, θP_cpu
end
@@ -121,17 +177,23 @@
composition transM ∘ g: transformation after machine learning parameter prediction
Provide a `transMs = StackedArray(transM, n_batch)`
"""
-function gtrans(g, transMs, xMP, ϕg; cdev)
+function gtrans(g, transMs, xMP, ϕg; cdev, is_testmode) # TODO remove after removing gf
    # predict the log of the parameters
-    ζMst = g(xMP, ϕg)
+    ζMst = g(xMP, ϕg; is_testmode)
    ζMs = ζMst'
    ζMs_cpu = cdev(ζMs)
    θMs = transMs(ζMs_cpu)
+    if !all(isfinite.(θMs))
+        @info "gtrans: encountered non-finite parameters"
+        #@show θMs, ζMs_cpu, transMs
+        #@show xMP, ϕg, is_testmode
+        #TODO save xMP, ϕg, is_testmode using JLD2
+    end
+    θMs
    #θMs = reduce(hcat, map(transM, eachcol(ζMs_cpu))) # transform each row
end
-
"""
Create a loss function for given
- g(x, ϕ): machine learning model
@@ -168,18 +230,25 @@ function get_loss_gf(g, transM, transP, f, py,
    cdev=cpu_device(),
    pbm_covars, n_site_batch,
    priorsP, priorsM,
    floss_penalty = zero_penalty_loss,
+    is_omit_priors = false,
    kwargs...)
    let g = g, transM = transM, transP = transP, f = f, intϕ = get_concrete(intϕ),
        transMs = StackedArray(transM, n_site_batch),
+        is_omit_priors = is_omit_priors,
        cdev = cdev,
        pbm_covar_indices = CA.getdata(intP(1:length(intP))[pbm_covars]),
-        priorsP = priorsP, priorsM = priorsM, floss_penalty = floss_penalty
+        priorsP = priorsP, priorsM = priorsM, floss_penalty = floss_penalty,
+        cpu_dev = cpu_device() # real cpu, different from infer_cdev(gdevs), which may be identity
        #, intP = get_concrete(intP)
        #inv_transP = inverse(transP),
        kwargs = kwargs

-    function loss_gf(ϕ, xM, xP, y_o, y_unc, i_sites)
+    function loss_gf(ϕ, xM, xP, y_o, y_unc, i_sites; is_testmode)
        ϕc = intϕ(ϕ)
+        # GPUArraysCore.allowscalar(() -> if !all(isfinite.(ϕ))
+        #     @show ϕc.ϕP
+        #     error("invoked loss function loss_gf with non-finite parameters")
+        # end)
        # μ_ζP = ϕc.ϕP
        # xMP = _append_each_covars(xM, CA.getdata(μ_ζP), pbm_covar_indices)
        # ϕ_M = g(xMP, CA.getdata(ϕc.ϕg))
@@ -187,9 +256,14 @@
        # ζP_cpu = cdev(CA.getdata(μ_ζP))
        # ζMs_cpu = cdev(CA.getdata(μ_ζMs))
        # y_pred, _, _ = apply_f_trans(ζP_cpu, ζMs_cpu, f, xP; transM, transP)
+        if !all(isfinite.(ϕ))
+            @info "loss_gf: encountered non-finite ϕ"
+            @show ϕc.ϕP
+            #Main.@infiltrate_main
+        end
        y_pred, θMs_pred, θP_pred = gf(
            g, transMs, transP, f, xM, xP, CA.getdata(ϕc.ϕg), CA.getdata(ϕc.ϕP),
-            pbm_covar_indices; cdev, kwargs...)
+            pbm_covar_indices; cdev, is_testmode, kwargs...)
        #σ = exp.(y_unc ./ 2)
        #nLy = sum(abs2, (y_pred .- y_o) ./ σ)
        nLy = py( y_pred, y_o, y_unc)
@@ -198,10 +272,27 @@
        logpdf_tv = (prior, θ::AbstractVector) -> begin
            map(Base.Fix1(logpdf, prior), θ)::Vector{eltype(θP_pred)}
        end
-        neg_log_prior = -sum(logpdf_t.(priorsP, θP_pred)) - sum(map(
-            (priorMi, θMi) -> sum(logpdf_tv(priorMi, θMi)), priorsM, eachcol(θMs_pred)))
+        #Main.@infiltrate_main
+        #Maybe: move priors to GPU, for now need to move θ to cpu
+        # currently does not work on gpu, moving to cpu has problems with gradient
+        # θP_pred_cpu = cpu_dev(CA.getdata(θP_pred))
+        # θMs_pred_cpu = cpu_dev(CA.getdata(θMs_pred))
+        θP_pred_cpu = CA.getdata(θP_pred)
+        θMs_pred_cpu = CA.getdata(θMs_pred)
+        # TODO account for prior cost on global parameters after debug
+        neg_log_prior = is_omit_priors ? zero(nLy) :
+            -sum(logpdf_t.(priorsP, θP_pred_cpu)) +
+            -sum(map((priorMi, θMi) -> sum(
+                logpdf_tv(priorMi, θMi)), priorsM, eachcol(θMs_pred_cpu)))
+        #neg_log_prior = min(sqrt(floatmax(neg_log_prior0)), neg_log_prior0)
+        if !isfinite(neg_log_prior)
+            @info "loss_gf: encountered non-finite prior density"
+            @show θP_pred, θMs_pred, ϕc.ϕP
+            error("debug get_loss_gf")
+        end
        ϕunc = eltype(θP_pred)[] # no uncertainty parameters optimized
        loss_penalty = floss_penalty(y_pred, θMs_pred, θP_pred, ϕc.ϕg, ϕunc)
+        #@show nLy, neg_log_prior, loss_penalty
        nLjoint_pen = nLy + neg_log_prior + loss_penalty
        return (;nLjoint_pen, y_pred, θMs_pred, θP_pred, nLy, neg_log_prior, loss_penalty)
    end
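The point-prediction counterpart follows the same pattern. A hedged sketch (taken from the updated `test_HybridProblem.jl` below; `probo` again denotes a solved problem):

``` julia
using StableRNGs
rng = StableRNG(111)

(; y_pred, θMs, θP) = predict_point_hvi(rng, probo; scenario)
size(y_pred)  # (n_obs, n_site); θP is a ComponentVector, θMs a (n_site, n_θM) ComponentMatrix
```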
diff --git a/src/logden_normal.jl b/src/logden_normal.jl
index d3cd814..262adea 100644
--- a/src/logden_normal.jl
+++ b/src/logden_normal.jl
@@ -24,11 +24,34 @@ function neg_logden_indep_normal(obs::AbstractArray, μ::AbstractArray, logσ2::
    # optimize argument logσ2 rather than σs for performance
    #nlogL = sum(σfac .* (1/2) .* logσ2 .+ (1/2) .* exp.(.- logσ2) .* abs2.(obs .- μ))
    # specifying logσ2 instead of σ is not transforming a random variable -> no Jacobian
-    obs_data = CA.getdata(obs)
-    μ_data = CA.getdata(μ)
-    nlogL = sum(σfac .* logσ2 .+ abs2.(obs_data .- μ_data) .* exp.(.-logσ2)) / convert(eltype(μ),2)
+    #
+    # obs_data = CA.getdata(obs)
+    # μ_data = CA.getdata(μ)
+    # nlogL = NaNMath.sum( # observations might be NaN for missing
+    #     σfac .* logσ2 .+ abs2.(obs_data .- μ_data) .* exp.(.-logσ2)) / convert(eltype(μ),2)
+    # return (nlogL)
+    #
+    i_finobs = .! isnan.(obs)
+    obs_data = CA.getdata(obs)[i_finobs]
+    μ_data = CA.getdata(μ)[i_finobs]
+    logσ2_fin = logσ2[i_finobs]
+    nlogL = sum( # observations might be NaN for missing
+        σfac .* logσ2_fin .+ abs2.(obs_data .- μ_data) .* exp.(.-logσ2_fin)) / convert(eltype(μ),2)
    return (nlogL)
end
+
+function neg_logden_indep_normal(obs::AbstractGPUArray, μ::AbstractGPUArray, logσ2::AbstractGPUArray{ET};
+    σfac=one(ET)) where ET
+    #cannot use NaNMath.sum on gpu, allocate vectors of non-NaN
+    i_finobs = .! isnan.(obs)
+    obs_data = CA.getdata(obs)[i_finobs]
+    μ_data = CA.getdata(μ)[i_finobs]
+    logσ2_fin = logσ2[i_finobs]
+    nlogL = sum( # observations might be NaN for missing
+        σfac .* logσ2_fin .+ abs2.(obs_data .- μ_data) .* exp.(.-logσ2_fin)) / convert(eltype(μ),2)
+    return (nlogL)
+end
+
# function neg_logden_indep_normal(obss::AbstractMatrix, preds::AbstractMatrix, logσ2::AbstractVector; kwargs...)
#     nlogLs = map(eachcol(obss), eachcol(preds)) do obs, μ
#         neg_logden_indep_normal(obs, μ, logσ2; kwargs...)
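The masking idea behind the new methods, as a standalone sketch (assumptions: plain arrays, no package types): missing observations enter as NaN and are excluded from the sum, so that both the value and its gradient stay finite:

``` julia
# negative log-density of independent normals, skipping NaN observations
function nll_skipnan(obs, μ, logσ2)
    i = .!isnan.(obs)
    sum(logσ2[i] .+ abs2.(obs[i] .- μ[i]) .* exp.(.-logσ2[i])) / 2
end

obs = [1.0, NaN, 3.0]          # second observation is missing
μ = [1.1, 2.0, 2.7]
nll_skipnan(obs, μ, zeros(3))  # finite: uses only entries 1 and 3
```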
diff --git a/src/util_ca.jl b/src/util_ca.jl
index 224ce87..c378582 100644
--- a/src/util_ca.jl
+++ b/src/util_ca.jl
@@ -31,7 +31,7 @@ component information that might be present in the dimensions.
function compose_axes(axtuples::NamedTuple)
    ls = map(axtuple -> Val(prod(axis_length.(axtuple))), axtuples)
    # to work on types, need to construct value types of intervals
-    intervals = _construct_invervals(;lengths=ls)
+    intervals = _construct_intervals(;lengths=ls)
    named_intervals = (;zip(keys(axtuples),intervals)...)
    axc = map(named_intervals, axtuples) do interval, axtuple
        ax = length(axtuple) == 1 ? axtuple[1] : CA.ShapedAxis(axis_length.(axtuple))
@@ -40,7 +40,7 @@
    CA.Axis(; axc...)
end

-function _construct_invervals(;lengths)
+function _construct_intervals(;lengths)
    reduce((ranges,length) -> _add_interval(;ranges, length),
        Iterators.tail(lengths), init=(Val(1:_val_value(first(lengths))),))
end
diff --git a/src/util_gpu.jl b/src/util_gpu.jl
index 7253cbd..d4f2cba 100644
--- a/src/util_gpu.jl
+++ b/src/util_gpu.jl
@@ -12,11 +12,77 @@ function ones_similar_x(x::AbstractArray, size_ret = size(x))
    Ones{eltype(x)}(size_ret)
end

+# TODO replace CUDA-method in extension after type stability is fixed
+# https://github.com/JuliaGPU/KernelAbstractions.jl/issues/634
+# function ones_similar_x(x::GPUArraysCore.AbstractGPUArray, s = size(x))
+#     backend = get_backend(x)
+#     ans = KernelAbstractions.ones(backend, eltype(x), s)
+#     #ans = ChainRulesCore.@ignore_derivatives KernelAbstractions.ones(backend, eltype(x), s)
+#     # https://juliagpu.github.io/KernelAbstractions.jl/stable/quickstart/#Synchronization
+#     # ChainRulesCore.@ignore_derivatives synchronize(backend)
+#     ans
+# end
+
# handle containers and transformations of Arrays
ones_similar_x(x::CA.ComponentArray, s = size(x)) = ones_similar_x(CA.getdata(x), s)
ones_similar_x(x::LinearAlgebra.Adjoint, s = size(x)) = ones_similar_x(parent(x), s)
ones_similar_x(x::SubArray, s = size(x)) = ones_similar_x(parent(x), s)

+function repeat_rowvector_dummy(x::AbstractMatrix{T}, is_dummy::Union{BitVector,AbstractVector{Bool}};
+    ones_vec = ones_similar_x(x, length(is_dummy)),
+    ) where T
+    ones_vec .* x .+ (is_dummy .* convert(T,NaN))
+    #ones_vec .* x .+ (is_dummy)
+end
+
+function ChainRulesCore.rrule(::typeof(repeat_rowvector_dummy), x, is_dummy::Union{BitVector,AbstractVector{Bool}}; kwargs...)
+    function repeat_rowvector_dummy_pullback(Δy)
+        # only sum the partials across non-dummy rows for each column
+        Δx = sum(Δy[.! is_dummy,:]; dims=1)
+        (NoTangent(), Δx, NoTangent())
+    end
+    return repeat_rowvector_dummy(x, is_dummy; kwargs...), repeat_rowvector_dummy_pullback
+end
+
+function repeat_rowvector_dummy(x::AbstractMatrix{T}, is_dummy::Union{BitMatrix,AbstractMatrix{Bool}};
+    ) where T
+    x .+ (is_dummy .* convert(T,NaN))
+end
+
+function ChainRulesCore.rrule(::typeof(repeat_rowvector_dummy),
+    x, is_dummy::Union{BitMatrix,AbstractMatrix{Bool}}; kwargs...)
+    pullback = if !any(is_dummy)
+        # avoid mapping rows if there is no dummy
+        function repeat_rowvector_dummy_pullback_emptybitmatrix(Δy)
+            Δx = sum(Δy; dims=1)
+            #Main.@infiltrate_main
+            (NoTangent(), Δx, ZeroTangent())
+        end
+    else
+        function repeat_rowvector_dummy_pullback_bitmatrix(Δy)
+            #@info "called rrule for repeat_rowvector_dummy with is_dummy matrix"
+            # only sum the partials across non-dummy rows for each column
+            #Δx = similar(x) # errors in copyto!
need the same as xtvec + Δxt = similar(x') + # TODO think of avoiding allocation of temporary vector + # using generator or StaticArray results in scalar indexing + xtvec = map(eachcol(Δy[:,:]), eachcol(is_dummy)) do Δyi, is_dummi_i + sum(Δyi[.! is_dummi_i]) + end + # gen = (sum(Δyi[.! is_dummi_i]) for (Δyi, is_dummi_i) in zip(eachcol(Δy[:,:]), eachcol(is_dummy))) + #xtvec = @SArray[sum(Δyi[.! is_dummi_i]) for (Δyi, is_dummi_i) in zip(eachcol(Δy[:,:]), eachcol(is_dummy))] + # if !all(isfinite.(xtvec)) + # @info "repeat_rowvector_dummy_pullback_bitmatrix: encountered non-finite gradients" + # end + copyto!(Δxt, xtvec) + (NoTangent(), Δxt', ZeroTangent()) + end + end + return repeat_rowvector_dummy(x, is_dummy; kwargs...), pullback +end + + + diff --git a/test/Project.toml b/test/Project.toml index ab63301..0832140 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -4,12 +4,15 @@ Bijectors = "76274a88-744f-5084-9051-94815aaf08c4" BlockDiagonals = "0a1fb500-61f7-11e9-3c65-f5ef3456f9f0" CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" +DifferentiationInterface = "a0c0ee7d-e4b9-4e03-894e-1c5f64a51d63" DistributionFits = "45214091-1ed4-4409-9bcf-fdb48a05e921" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" FillArrays = "1a297f60-69ca-5386-bcde-b61e274b549b" Flux = "587475ba-b771-5e3f-ad9e-33799f191a9c" +ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" Functors = "d9f16b24-f501-4c13-a1f2-28368ffc5196" GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527" +KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Lux = "b2108857-7c20-44ae-9111-449ecde12c47" MLDataDevices = "7e8f7934-dd98-4c1a-8fe8-92b47a384d40" diff --git a/test/runtests.jl b/test/runtests.jl index 87bbd4b..99c5bbb 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -3,6 +3,8 @@ const GROUP = get(ENV, "GROUP", "All") # defined in in CI.yml @time begin if GROUP == "All" || GROUP == "Basic" + #@safetestset "test" include("test/test_RRuleMonitor.jl") + @time @safetestset "test_RRuleMonitor" include("test_RRuleMonitor.jl") #@safetestset "test" include("test/test_bijectors_utils.jl") @time @safetestset "test_bijectors_utils" include("test_bijectors_utils.jl") #@safetestset "test" include("test/test_util.jl") @@ -15,6 +17,8 @@ const GROUP = get(ENV, "GROUP", "All") # defined in in CI.yml @time @safetestset "test_ComponentArrayInterpreter" include("test_ComponentArrayInterpreter.jl") #@safetestset "test" include("test/test_hybridprobleminterpreters.jl") @time @safetestset "test_hybridprobleminterpreters" include("test_hybridprobleminterpreters.jl") + #@safetestset "test" include("test/test_PBMApplicator.jl") + @time @safetestset "test_PBMApplicator" include("test_PBMApplicator.jl") #@safetestset "test" include("test/test_ModelApplicator.jl") @time @safetestset "test_ModelApplicator" include("test_ModelApplicator.jl") #@safetestset "test" include("test/test_gencovar.jl") @@ -40,6 +44,8 @@ const GROUP = get(ENV, "GROUP", "All") # defined in in CI.yml # tests that need fixing (but should not be commented) at the end: #@safetestset "test" include("test/test_HybridProblem.jl") @time @safetestset "test_HybridProblem" include("test_HybridProblem.jl") + #@safetestset "test" include("test/test_missingdriver.jl") + @time @safetestset "test_missingdriver" include("test_missingdriver.jl") end end diff --git a/test/test_ComponentArrayInterpreter.jl b/test/test_ComponentArrayInterpreter.jl index bd9a765..f6dabf9 100644 
--- a/test/test_ComponentArrayInterpreter.jl +++ b/test/test_ComponentArrayInterpreter.jl @@ -12,7 +12,7 @@ using Suppressor gdev = Suppressor.@suppress gpu_device() # not loaded CUDA cdev = cpu_device() -@testset "construct StaticComponentArrayInterepreter" begin +@testset "construct StaticComponentArrayInterpreter" begin intv = @inferred CP.StaticComponentArrayInterpreter(CA.ComponentVector(a=1:3, b=reshape(4:9,3,2))) ints = @inferred CP.StaticComponentArrayInterpreter((;a=Val(3), b = Val((3,2)))) # @descend_code_warntype CP.StaticComponentArrayInterpreter((;a=Val(3), b = Val((3,2)))) diff --git a/test/test_HybridProblem.jl b/test/test_HybridProblem.jl index c8b9c2b..c7bb04e 100644 --- a/test/test_HybridProblem.jl +++ b/test/test_HybridProblem.jl @@ -82,11 +82,11 @@ function construct_problem(; scenario::Val{scen}) where scen #g_chain_scaled = app #ϕunc0 = init_hybrid_ϕunc(cor_ends, zero(FT)) pbm_covars = (:covarK2 ∈ scen) ? (:K2,) : () - f_batch = f_sites = PBMSiteApplicator( + f_batch = PBMSiteApplicator( f_doubleMM; θP, θM, θFix=CA.ComponentVector{FT}(), xPvec=xP[:,1]) HybridProblem(θP, θM, g_chain_scaled, ϕg0, - f_batch, f_sites, priors_dict, py, + f_batch, priors_dict, py, transM, transP, train_dataloader, n_covar, n_site, n_batch, cor_ends, pbm_covars, #ϕunc0, @@ -140,7 +140,7 @@ test_without_flux = (scenario) -> begin n_site, n_batch = get_hybridproblem_n_site_and_batch(prob; scenario) train_loader = get_hybridproblem_train_dataloader(prob; scenario) (xM, xP, y_o, y_unc, i_sites) = first(train_loader) - f = get_hybridproblem_PBmodel(prob; scenario, use_all_sites = false) + f = get_hybridproblem_PBmodel(prob; scenario) py = get_hybridproblem_neg_logden_obs(prob; scenario) par_templates = get_hybridproblem_par_templates(prob; scenario) #f(par_templates.θP, hcat(par_templates.θM, par_templates.θM), xP[1:2]) @@ -156,19 +156,21 @@ test_without_flux = (scenario) -> begin # Pass the site-data for the batches as separate vectors wrapped in a tuple loss_gf = get_loss_gf(g, transM, transP, f, py, intϕ; - pbm_covars, n_site_batch = n_batch, priorsP, priorsM) + pbm_covars, n_site_batch = n_batch, priorsP, priorsM, + ) (_xM, _xP, _y_o, _y_unc, _i_sites) = first(train_loader) - l1 = loss_gf(p0, _xM, _xP, _y_o, _y_unc, _i_sites) + #l1 = loss_gf(p0, _xM, _xP, _y_o, _y_unc, _i_sites; is_testmode = false) l1 = @inferred ( # @descend_code_warntype ( - loss_gf(p0, _xM, _xP, _y_o, _y_unc, _i_sites)) + loss_gf(p0, _xM, _xP, _y_o, _y_unc, _i_sites; is_testmode = true)) tld = first(train_loader) - gr = Zygote.gradient(p -> loss_gf(p, tld...)[1], CA.getdata(p0)) + gr = Zygote.gradient(p -> loss_gf(p, tld...; is_testmode = false)[1], CA.getdata(p0)) @test gr[1] isa Vector () -> begin - optf = Optimization.OptimizationFunction((ϕ, data) -> loss_gf(ϕ, data...)[1], + optf = Optimization.OptimizationFunction( + (ϕ, data) -> loss_gf(ϕ, data...; is_testmode = false)[1], Optimization.AutoZygote()) optprob = OptimizationProblem(optf, p0, train_loader) @@ -198,7 +200,7 @@ gdev = gpu_device() test_with_flux = (scenario) -> begin prob = probc = construct_problem(;scenario); - @testset "HybridPointSolver $(last(CP._val_value(scenario)))" begin + @testset "HybridPointSolver + predict_point_hvi $(last(CP._val_value(scenario)))" begin rng = StableRNG(111) solver = HybridPointSolver(; alg=Adam(0.02)) (; ϕ, resopt, probo) = solve(prob, solver; scenario, rng, @@ -218,12 +220,15 @@ test_with_flux = (scenario) -> begin end)() @test θPo.r0 < 1.5 * θP.r0 @test ϕ.ϕP.K2 < 1.5 * log(θP.K2) + (;y_pred, θMs, θP) = 
predict_point_hvi(rng, probo; scenario) + _,_,y_obs,_ = get_hybridproblem_train_dataloader(prob; scenario).data + @test size(y_pred) == size(y_obs) end; - @testset "HybridPosteriorSolver $(last(CP._val_value(scenario)))" begin + @testset "HybridPosteriorSolver + predict_hvi $(last(CP._val_value(scenario)))" begin rng = StableRNG(111) solver = HybridPosteriorSolver(; alg=Adam(0.02), n_MC=3) - (; ϕ, θP, resopt) = solve(prob, solver; scenario, rng, + (; probo, ϕ, θP, resopt) = solve(prob, solver; scenario, rng, #callback = callback_loss(100), maxiters = 1200, #maxiters = 20 # too small so that it yields error #maxiters=37, # still complains "need to specify maxiters or epochs" @@ -237,6 +242,14 @@ test_with_flux = (scenario) -> begin @test exp(ϕ.μP.K2) == θP.K2 < 1.5 * θP.K2 θP prob.θP + n_sample_pred = 12 + (; y, θsP, θsMs, entropy_ζ) = predict_hvi(rng, probo; scenario, n_sample_pred); + _,_,y_obs,_ = get_hybridproblem_train_dataloader(prob; scenario).data + @test size(y) == (size(y_obs)..., n_sample_pred) + yc = cdev(y) + _ = map(eachslice(yc; dims = 3)) do ycs + @test all(isfinite.(ycs[isfinite.(y_obs)])) + end end; end # test_with flux @@ -321,8 +334,8 @@ test_with_flux_gpu = (scenario) -> begin # moved to solve and predict_hvi # probg = HybridProblem( # probg, - # f_batch = fmap(gdev, probg.f_batch), - # f_allsites = fmap(gdev, probg.f_allsites)) + # f_batch = gdev(probg.f_batch), + # f_allsites = gdev(probg.f_allsites)) #prob = CP.update(probg, transM = identity, transP = identity); solver = HybridPosteriorSolver(; alg=Adam(0.02), n_MC=3) n_site, n_batch = get_hybridproblem_n_site_and_batch(probg; scenario = scenf) diff --git a/test/test_ModelApplicator.jl b/test/test_ModelApplicator.jl index 3b80d62..70521e6 100644 --- a/test/test_ModelApplicator.jl +++ b/test/test_ModelApplicator.jl @@ -1,10 +1,14 @@ using Test using HybridVariationalInference +using HybridVariationalInference: HybridVariationalInference as HVI using ComponentArrays: ComponentArrays as CA using StatsFuns using Distributions using MLDataDevices, CUDA, cuDNN, GPUArraysCore +gdev = gpu_device() +cdev = cpu_device() + @testset "NullModelApplicator" begin g = NullModelApplicator() c1 = CA.ComponentVector(a = (a1 = 1, a2 = 2:3), b = 3:4) @@ -31,7 +35,6 @@ end; p = normcdf.(μ, σ, y) #hcat(r, p) @test p ≈ r - gdev = gpu_device() #cdev = cpu_device() if gdev isa MLDataDevices.AbstractGPUDevice g_gpu = g |> gdev @@ -41,3 +44,25 @@ end; @test y isa GPUArraysCore.AbstractGPUArray end end; + +@testset "RangeScalingModelApplicator" begin + app = NullModelApplicator() + r = logistic.(randn(Float32, 5)) # 0..1 + lowers = collect(exp.(1.0:5.0)) # different magnitudes + uppers = lowers .* 2 + g = RangeScalingModelApplicator(app, lowers, uppers, eltype(r)) + y = g(r, []) + width = uppers .- lowers + @test y ≈(r .* width .+ lowers) + @test eltype(y) == eltype(r) + #cdev = cpu_device() + if gdev isa MLDataDevices.AbstractGPUDevice + g_gpu = g |> gdev + @test g_gpu.offset isa GPUArraysCore.AbstractGPUArray + @test g_gpu.width isa GPUArraysCore.AbstractGPUArray + r_gpu = r |> gdev + y_dev = g_gpu(r_gpu, []) + @test y_dev isa GPUArraysCore.AbstractGPUArray + @test cdev(y_dev) ≈ y + end +end; diff --git a/test/test_PBMApplicator.jl b/test/test_PBMApplicator.jl new file mode 100644 index 0000000..5df1c71 --- /dev/null +++ b/test/test_PBMApplicator.jl @@ -0,0 +1,95 @@ +using Test +using HybridVariationalInference +using ComponentArrays: ComponentArrays as CA + +import Zygote + +using MLDataDevices, CUDA, cuDNN, GPUArraysCore + +f_pop = 
function(θsc, xPc) + local n_obs = size(xPc, 1) + is_valid = isfinite.(CA.getdata(xPc)) + a1 = is_valid .* CA.getdata(θsc[:,:a1])' + a2 = is_valid .* CA.getdata(θsc[:,:a2])' + # b in θP has been expanded in PopulationApplicator + b = is_valid .* CA.getdata(θsc[:,:b])' + y = a1 .+ log.(a2) .* abs2.(cos.(b .- 0.2)) .* xPc.^2 +end + +() -> begin + include("test/test_scratch.jl") +end + +@testset "PBMPopulationApplicator" begin + n_obs = 3 + n_site = 5 + xPvec = CA.ComponentVector(s1 = 1.0:n_obs) + xPc = xPvec .* ones(n_site)' .+ abs2.(randn(n_obs, n_site) .* 0.1) + θP = CA.ComponentVector(b=3.0) + θM = CA.ComponentVector(a1=2.0,a2=1.0) + θFix = CA.ComponentVector(c=1.5) + # + θMs = (ones(n_site) .* θM') .+ abs2.(randn(n_site, length(θM)) .* 0.1) + θs = hcat(ones(n_site) .* θP', θMs) + y_obs = f_pop(θs, xPc) + g = PBMPopulationApplicator(f_pop, n_site; θP, θM, θFix, xPvec) + ret = g(θP, θMs, xPc) + @test ret ≈ y_obs + + gr = Zygote.gradient((θP, θMs) -> sum(g(θP, θMs, xPc)), CA.getdata(θP), CA.getdata(θMs)) + + xPc_NaN = copy(xPc); xPc_NaN[2:3,1] .= NaN + ret2 = g(θP, θMs, xPc_NaN) + gr = Zygote.gradient((θP, θMs) -> sum(g(θP, θMs, xPc_NaN)), CA.getdata(θP), CA.getdata(θMs)) + @test all(isfinite.(gr[1])) + + n_site_m = 4 + gm = create_nsite_applicator(g, n_site_m) + retm = gm(θP, θMs[1:n_site_m,:], xPc[:,1:n_site_m]) + @test retm ≈ y_obs[:,1:n_site_m] +end; + +f_global = function(θsc, θgc, xPc) + local n_obs = size(xPc, 1) + #is_dummy = isnan.(CA.getdata(xPc)) + is_valid = isfinite.(CA.getdata(xPc)) + # rep_fac = ones_similar_x(θsc, n_obs) + # a1 = rep_fac .* CA.getdata(θsc[:,:a1])' + # a2 = rep_fac .* CA.getdata(θsc[:,:a2])' + #a1 = repeat_rowvector_dummy(CA.getdata(θsc[:,:a1])', is_dummy) + #a2 = repeat_rowvector_dummy(CA.getdata(θsc[:,:a2])', is_dummy) + a1 = is_valid .* CA.getdata(θsc[:,:a1])' + a2 = is_valid .* CA.getdata(θsc[:,:a2])' + b = CA.getdata(θgc.b) .* (is_valid) + y = a1 .+ log.(a2) .* abs2.(cos.(b .- 0.2)) .* xPc.^2 +end + +@testset "PBMPopulationGlobalApplicator" begin + n_obs = 3 + n_site = 5 + xPvec = CA.ComponentVector(s1 = 1.0:n_obs) + xPc = xPvec .* ones(n_site)' .+ abs2.(randn(n_obs, n_site) .* 0.1) + θP = CA.ComponentVector(b=3.0) + θM = CA.ComponentVector(a1=2.0,a2=1.0) + θFix = CA.ComponentVector(c=1.5) + # + θMs = (ones(n_site) .* θM') .+ abs2.(randn(n_site, length(θM)) .* 0.1) + y_obs = f_global(θMs, θP, xPc) + g = PBMPopulationGlobalApplicator(f_global, n_site; θP, θM, θFix, xPvec) + ret = g(θP, θMs, xPc) + @test ret ≈ y_obs + + gr = Zygote.gradient((θP, θMs) -> sum(g(θP, θMs, xPc)), CA.getdata(θP), CA.getdata(θMs)) + + xPc_NaN = copy(xPc); xPc_NaN[2:3,1] .= NaN + ret2 = g(θP, θMs, xPc_NaN) + gr = Zygote.gradient((θP, θMs) -> sum(g(θP, θMs, xPc_NaN)), CA.getdata(θP), CA.getdata(θMs)) + @test all(isfinite.(gr[1])) # \thetaP + @test all(isfinite.(gr[2])) # solves finite gradient for a2 for first site + + n_site_m = 4 + gm = create_nsite_applicator(g, n_site_m) + retm = gm(θP, θMs[1:n_site_m,:], xPc[:,1:n_site_m]) + @test retm ≈ y_obs[:,1:n_site_m] +end; + diff --git a/test/test_RRuleMonitor.jl b/test/test_RRuleMonitor.jl new file mode 100644 index 0000000..8825c8c --- /dev/null +++ b/test/test_RRuleMonitor.jl @@ -0,0 +1,66 @@ +using Test +using HybridVariationalInference +using HybridVariationalInference: HybridVariationalInference as HVI +using ComponentArrays: ComponentArrays as CA +# using MLDataDevices +# import CUDA, cuDNN +using DifferentiationInterface: DifferentiationInterface as DI +import Zygote +import ForwardDiff + + +function ftest(x) + 3 .* abs2.(x) 
+end + +function ftest2(x1,x2) + x1 .* abs2.(x2) +end + + +@testset "RRuleMonitor one argument" begin + x = collect(1.0:3.0) + y = ftest(x) + m = RRuleMonitor("ftest", ftest) + y2 = m(x) + @test y2 == y + gr = Zygote.gradient(x -> sum(ftest(x)), x)[1] + gr2 = Zygote.gradient(x -> sum(m(x)), x)[1] + @test gr2 == gr +end + +@testset "RRuleMonitor two argument" begin + x1 = collect(3.1:0.1:3.3) + x2 = collect(1.0:3.0) + y = ftest2(x1, x2) + m = RRuleMonitor("ftest2", ftest2) + y2 = m(x1, x2) + @test y2 == y + gr = Zygote.gradient((x1,x2) -> sum(ftest2(x1,x2)), x1, x2) + gr2 = Zygote.gradient((x1,x2) -> sum(m(x1,x2)), x1, x2) + @test gr2 == gr + md = RRuleMonitor("ftest2_Forward", ftest2, DI.AutoForwardDiff()) + gr3 = Zygote.gradient((x1,x2) -> sum(md(x1,x2)), x1, x2) + @test all(gr3 .≈ gr) +end + +function ftestsqrt(x) + is_pos = x .>= zero(x) + sqrt.(is_pos .* x) .+ .!is_pos .* convert(eltype(x), NaN) +end +@testset "RRuleMonitor non-finite" begin + x = collect(2.0:-1.0:-1.0) + y = ftestsqrt(x) + m = RRuleMonitor("ftestsqrt", ftestsqrt) + y2 = m(x) + @test isequal(y2,y) + gr = Zygote.gradient(x -> sum(ftestsqrt(x)), x)[1] + @test_throws ErrorException Zygote.gradient(x -> sum(m(x)), x) + #@test isequal(gr2,gr) + # + md = RRuleMonitor("ftestsqrt", ftestsqrt, DI.AutoForwardDiff()) + @test_throws ErrorException Zygote.gradient(x -> sum(md(x)), x) +end + + + diff --git a/test/test_bijectors_utils.jl b/test/test_bijectors_utils.jl index 01a62f7..bc5c8de 100644 --- a/test/test_bijectors_utils.jl +++ b/test/test_bijectors_utils.jl @@ -1,6 +1,7 @@ using Test using HybridVariationalInference using HybridVariationalInference: HybridVariationalInference as CP +using StatsFuns using Bijectors @@ -51,6 +52,21 @@ end; @test dys == dy end; +@testset "Logistic" begin + c3 = HybridVariationalInference.Logistic() + c3s = Stacked((c3,c3), (1:3,4:4)) + y1 = @inferred c3(x) + y2 = @inferred c3s(x) + @test all(inverse(c3)(y2) .≈ x) + @test all(inverse(c3s)(y2) .≈ x) + # test logabsdetjac + gr = Zygote.gradient(x -> sum(logistic.(x)), x)[1] + logjac = Bijectors.logabsdetjac(c3s, x) + @test logjac ≈ sum(log.(gr)) + y2b, logjac2= Bijectors.with_logabsdet_jacobian(c3s, x) + @test y2b == y2 + @test logjac2== logjac +end; if gdev isa MLDataDevices.AbstractGPUDevice xd = gdev(x) diff --git a/test/test_doubleMM.jl b/test/test_doubleMM.jl index b4d6936..60a36e1 100644 --- a/test/test_doubleMM.jl +++ b/test/test_doubleMM.jl @@ -91,7 +91,10 @@ end # yg = CP.DoubleMM.f_doubleMM(θg, xPMg, intθ); θvecg = gdev(θvec); # errors without ";" xPMg = CP.apply_preserve_axes(gdev, xPM); + yg = fy(θvecg, xPMg) yg = @inferred fy(θvecg, xPMg); + #@usingany Cthulhu + #@descend_code_warntype fy(θvecg, xPMg) @test cdev(yg) == y_exp ygradg = Zygote.gradient(θv -> sum(fy(θv, xPMg)), θvecg)[1]; @test ygradg isa CA.ComponentArray @@ -128,10 +131,10 @@ end # θg = gdev(θ) # xPMg = gdev(xPM) # yg = CP.DoubleMM.f_doubleMM(θg, xPMg, intθ); - θvecg = gdev(θvec) - xPMg = gdev(xPM) - y_og = gdev(y_o) - y_uncg = gdev(y_unc) + θvecg = gdev(θvec); + xPMg = gdev(xPM); + y_og = gdev(y_o); + y_uncg = gdev(y_unc); costg = fcost(θvecg, xPMg, y_og, y_uncg) @test costg ≈ cost ygradg = Zygote.gradient(θv -> fcost(θv, xPMg, y_og, y_uncg), θvecg)[1]; # errors without ";" @@ -197,8 +200,8 @@ end g, ϕg0 = get_hybridproblem_MLapplicator(prob; scenario) (; transP, transM) = get_hybridproblem_transforms(prob; scenario) n_site, n_site_batch = get_hybridproblem_n_site_and_batch(prob; scenario) - f = get_hybridproblem_PBmodel(prob; scenario, use_all_sites = false) - f2 = 
get_hybridproblem_PBmodel(prob; scenario, use_all_sites = true) + f = get_hybridproblem_PBmodel(prob; scenario) + f2 = create_nsite_applicator(f, n_site) py = get_hybridproblem_neg_logden_obs(prob; scenario) priors = get_hybridproblem_priors(prob; scenario) priorsP = [priors[k] for k in keys(par_templates.θP)] @@ -223,14 +226,14 @@ end pbm_covars, n_site_batch = n_batch, priorsP, priorsM) loss_gf_site = get_loss_gf(g, transM, transP, f2, py, intϕ; pbm_covars, n_site_batch = n_site, priorsP, priorsM) - nLjoint = @inferred first(loss_gf(p0, first(train_loader)...)) + nLjoint = @inferred first(loss_gf(p0, first(train_loader)...; is_testmode=true)) (xM_batch, xP_batch, y_o_batch, y_unc_batch, i_sites_batch) = first(train_loader) # @usingany Cthulhu # @descend_code_warntype loss_gf(p0, xM_batch, xP_batch, y_o_batch, y_unc_batch, i_sites_batch) Zygote.gradient( p0 -> first(loss_gf( - p0, xM_batch, xP_batch, y_o_batch, y_unc_batch, i_sites_batch)), CA.getdata(p0)) - optf = Optimization.OptimizationFunction((ϕ, data) -> first(loss_gf(ϕ, data...)), + p0, xM_batch, xP_batch, y_o_batch, y_unc_batch, i_sites_batch; is_testmode=false)), CA.getdata(p0)) + optf = Optimization.OptimizationFunction((ϕ, data) -> first(loss_gf(ϕ, data...; is_testmode=false)), Optimization.AutoZygote()) optprob = OptimizationProblem(optf, CA.getdata(p0), train_loader) @@ -239,7 +242,7 @@ end optprob, Adam(0.02), maxiters = 2000) (;nLjoint_pen, y_pred, θMs_pred, θP_pred, nLy, neg_log_prior, loss_penalty) = loss_gf_site( - res.u, train_loader.data...) + res.u, train_loader.data...; is_testmode=true) #(nLjoint, y_pred, θMs_pred, θP, nLy, neg_log_prior, loss_penalty) = loss_gf(p0, xM, xP, y_o, y_unc); θMs_pred = CA.ComponentArray(θMs_pred, CA.getaxes(θMs_true')) #TODO @test isapprox(par_templates.θP, intϕ(res.u).ϕP, rtol = 0.15) diff --git a/test/test_elbo.jl b/test/test_elbo.jl index 022d99a..14bd99d 100644 --- a/test/test_elbo.jl +++ b/test/test_elbo.jl @@ -54,8 +54,8 @@ test_scenario = (scenario) -> begin # prediction by g(ϕg, XM) does not correspond to θMs_true, randomly initialized # only the magnitude is there because of NormalScaling and prior g, ϕg0 = get_hybridproblem_MLapplicator(probc; scenario) - f = get_hybridproblem_PBmodel(probc; scenario, use_all_sites=false) - f_pred = get_hybridproblem_PBmodel(probc; scenario, use_all_sites=true) + f = get_hybridproblem_PBmodel(probc; scenario) + f_pred = create_nsite_applicator(f, n_site) n_θM, n_θP = values(map(length, par_templates)) @@ -99,7 +99,7 @@ test_scenario = (scenario) -> begin CP.generate_ζ( rng, g, ϕ_ini, xM[:, 1:n_batch]; n_MC, cor_ends, pbm_covar_indices, - int_unc=interpreters.unc, int_μP_ϕg_unc=interpreters.μP_ϕg_unc) + int_unc=interpreters.unc, int_μP_ϕg_unc=interpreters.μP_ϕg_unc, is_testmode = false) ) @testset "generate_ζ $(last(CP._val_value(scenario)))" begin @@ -116,7 +116,8 @@ test_scenario = (scenario) -> begin _ζsP, _ζsMs, _σ = CP.generate_ζ( rng, g, ϕ, xM[:, 1:n_batch]; n_MC=8, cor_ends, pbm_covar_indices, - int_unc=interpreters.unc, int_μP_ϕg_unc=interpreters.μP_ϕg_unc) + int_unc=interpreters.unc, int_μP_ϕg_unc=interpreters.μP_ϕg_unc, + is_testmode = true) sum(_ζsP) + sum(_ζsMs) + sum(_σ) end, CA.getdata(ϕ_ini)) @test gr[1] isa Vector @@ -156,7 +157,8 @@ test_scenario = (scenario) -> begin CP.generate_ζ( rng, g, _ϕ, xM_batch; n_MC = n_predict, cor_ends, pbm_covar_indices, - int_unc=interpreters.unc, int_μP_ϕg_unc=interpreters.μP_ϕg_unc) + int_unc=interpreters.unc, int_μP_ϕg_unc=interpreters.μP_ϕg_unc, + is_testmode = true) ) ζMs_g = g(xM_batch, 
+@testset "repeat_rowvector_dummy" begin
+    x = collect(1.0:5.0)
+    is_dummy = [false, false, true, true]
+    xm = CP.repeat_rowvector_dummy(x', is_dummy)
+    @test xm[1,:] == xm[2,:] == x
+    @test all(isnan.(xm[3:4,:]))
+    #
+    gr = Zygote.gradient(x -> sum(CP.repeat_rowvector_dummy(x', is_dummy)), x)[1]
+    @test gr == fill(2.0, size(x))
+    gr = Zygote.gradient(x -> sum(CP.repeat_rowvector_dummy(x', is_dummy).^2), x)[1]
+    @test gr == 2 .*2 .*x
+    gr = Zygote.gradient(x -> sum(CP.repeat_rowvector_dummy(x', is_dummy) .* 2.0), x)[1]
+    @test gr == fill(2*2.0,5)
+    gr = Zygote.gradient(x -> sum(abs2, CP.repeat_rowvector_dummy(x', is_dummy)), x)[1]
+    @test gr == 2 .*2 .*x
+    gr = Zygote.gradient(x -> sum(sin, CP.repeat_rowvector_dummy(x', is_dummy)), x)[1]
+    @test gr == 2 .* cos.(x)
+    gr = (Zygote.gradient(x) do x
+        y = CP.repeat_rowvector_dummy(x', is_dummy)
+        z = sum(abs2, y[1,:]) + 3*sum(y[2,:])
+    end)[1]
+    @test gr == 2 .*x .+ 3
+    #
+    gdev = gpu_device()
+    cdev = cpu_device()
+    x_dev = gdev(x)
+    is_dummy_dev = gdev(is_dummy)
+    #is_dummy_dev = x_dev .>= 4.0
+    xm_dev = CP.repeat_rowvector_dummy(x_dev', is_dummy_dev)
+    gr = Zygote.gradient(x -> sum(sin, CP.repeat_rowvector_dummy(x', is_dummy_dev)), x_dev)[1];
+    @test cdev(gr) ≈ 2 .* cos.(x)
+end
+
+@testset "repeat_rowvector_dummymatrix" begin
+    x = collect(1.0:5.0)
+    is_dummy = fill(false, (4, length(x)))
+    is_dummy[3:4, 2] .= true
+    xm = CP.repeat_rowvector_dummy(x', is_dummy)
+    @test xm[:,[1,3,4,5]] == repeat(x[[1,3,4,5]]', 4)
+    @test xm[1:2,2] == fill(x[2], 2)
+    @test all(isnan.(xm[3:4,2]))
+    #
+    #tmp = Zygote.gradient(is_dummy -> sum(repeat_rowvector_dummy(x', is_dummy)), is_dummy)
+    gr = Zygote.gradient(x -> sum(CP.repeat_rowvector_dummy(x', is_dummy)), x)[1]
+    @test gr == [4,2,4,4,4.0]
+    gr = Zygote.gradient(x -> sum(CP.repeat_rowvector_dummy(x', is_dummy).^2), x)[1]
+    @test gr == [4,2,4,4,4.0] .* 2.0 .*x
+    gr = Zygote.gradient(x -> sum(sin, CP.repeat_rowvector_dummy(x', is_dummy)), x)[1]
+    @test gr == [4,2,4,4,4.0] .* cos.(x)
+    gr = (Zygote.gradient(x) do x
+        y = CP.repeat_rowvector_dummy(x', is_dummy)
+        z = sum(abs2, y[1,:]) + 3*sum(y[2:end,:])
+    end)[1]
+    @test gr == 2 .* x .+ [3,1,3,3,3.0] .* 3
+    #
+    x_dev = gdev(x)
+    is_dummy_dev = gdev(is_dummy)
+    #is_dummy_dev = x_dev .>= 4.0
+    xm_dev = CP.repeat_rowvector_dummy(x_dev', is_dummy_dev)
+    #tmp = Zygote.gradient(is_dummy -> sum(repeat_rowvector_dummy(x', is_dummy)), is_dummy)
+    gr = Zygote.gradient(x -> sum(sin, CP.repeat_rowvector_dummy(x', is_dummy_dev)), x_dev)[1];
+    @test cdev(gr) ≈ [4,2,4,4,4.0] .* cos.(x)
+end
+
+
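+# test_driverNaN fits a HybridProblem under a :driverNAN scenario (some driver
+# values missing) and checks that the optimized parameters and the predictions
+# at observed positions stay finite.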
+function test_driverNaN(scenario::Val{scen}) where scen
+    prob = HybridProblem(DoubleMM.DoubleMMCase(); scenario);
+    if (:use_rangescaling ∈ scen)
+        @test prob.g isa RangeScalingModelApplicator
+    end
+    solver_point = HybridPointSolver(; alg=Adam(0.02))
+    rng = StableRNG(111)
+    (;ϕ, resopt, probo) = solve(prob, solver_point; rng,
+        #callback = callback_loss(100), # output during fitting
+        #callback = callback_loss(10), # output during fitting
+        epochs = 2,
+        is_omit_priors = (:f_on_gpu ∈ scen), # prior computation does not work on gpu
+        scenario,
+    );
+    @test all(isfinite.(ϕ))
+    (;y_pred, θMs, θP) = predict_point_hvi(rng, probo; scenario);
+    _,_,y_obs,_ = get_hybridproblem_train_dataloader(prob; scenario).data
+    @test size(y_pred) == size(y_obs)
+    y_predc = cdev(y_pred)
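+    # predictions may be NaN where drivers are missing, but they should be
+    # finite wherever the observations are finite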
+    @test all(isfinite.(y_predc[isfinite.(y_obs)]))
+    #
+    # takes long; only activate when results look suspicious
+    () -> begin
+        solver = HybridPosteriorSolver(; alg=Adam(0.02), n_MC=3)
+        (; probo, interpreters) = solve(prob, solver; rng,
+            callback = callback_loss(10), # output during fitting
+            epochs = 2,
+            scenario,
+        );
+        @test all(isfinite.(probo.θP))
+        n_sample_pred = 12
+        (; y, θsP, θsMs, entropy_ζ) = predict_hvi(rng, probo; scenario, n_sample_pred);
+        @test size(y) == (size(y_pred)..., n_sample_pred)
+        yc = cdev(y)
+        _ = map(eachslice(yc; dims = 3)) do ycs
+            @test all(isfinite.(ycs[isfinite.(y_obs)]))
+        end
+    end
+end
+
+@testset "HybridPointSolver driverNaN cpu" begin
+    scenario = Val((:driverNAN,))
+    test_driverNaN(scenario)
+end
+
+# @testset "HybridPointSolver driverNaN ML on gpu" begin
+#     scenario = Val((:driverNAN, :use_Lux, :use_gpu))
+#     test_driverNaN(scenario)
+# end
+
+@testset "HybridPointSolver driverNaN also PBM on gpu" begin
+    scenario = Val((:driverNAN, :use_rangescaling, :use_Lux, :use_dropout, :use_gpu, :f_on_gpu))
+    test_driverNaN(scenario)
+end
+
diff --git a/test/test_util_ca.jl b/test/test_util_ca.jl
index 1370c9c..a4871ad 100644
--- a/test/test_util_ca.jl
+++ b/test/test_util_ca.jl
@@ -6,7 +6,7 @@ using ComponentArrays: ComponentArrays as CA
 @testset "compose_axes" begin
     @test (@inferred CP._add_interval(;ranges=(Val(1:3),), length = Val(2))) == (Val(1:3), Val(4:5))
     ls = Val.((3,1,2))
-    @test (@inferred CP._construct_invervals(;lengths=ls)) == Val.((1:3, 4:4, 5:6))
+    @test (@inferred CP._construct_intervals(;lengths=ls)) == Val.((1:3, 4:4, 5:6))
     v1 = CA.ComponentVector(A=1:3)
     v2 = CA.ComponentVector(B=1:2)
     v3 = CA.ComponentVector(P=(x=1, y=2), Ms=zeros(3,2))
diff --git a/test/test_util_gpu.jl b/test/test_util_gpu.jl
index 04a2b8e..71ebec8 100644
--- a/test/test_util_gpu.jl
+++ b/test/test_util_gpu.jl
@@ -1,25 +1,25 @@
 using Test
 using HybridVariationalInference: HybridVariationalInference as HVI
-using ComponentArrays
+using ComponentArrays: ComponentArrays as CA
 using MLDataDevices
 import CUDA, cuDNN
 using FillArrays
 
-@testset "ones_similar_x" begin 
+@testset "ones_similar_x" begin
     A = rand(Float64, 3, 4);
-    @test HVI.ones_similar_x(A, 3) isa FillArrays.AbstractFill #Vector
-    @test HVI.ones_similar_x(A, size(A,1)) isa FillArrays.AbstractFill #Vector#Vector
+    @test @inferred HVI.ones_similar_x(A, 3) isa FillArrays.AbstractFill #Vector
+    @test @inferred HVI.ones_similar_x(A, size(A,1)) isa FillArrays.AbstractFill #Vector
 end
 
 gdev = gpu_device()
 if gdev isa MLDataDevices.CUDADevice
     @testset "ones_similar_x" begin
         B = CUDA.rand(Float32, 5, 2); # GPU matrix
-        @test HVI.ones_similar_x(B, size(B,1)) isa CUDA.CuArray
-        @test HVI.ones_similar_x(ComponentVector(b=B), size(B,1)) isa CUDA.CuArray
-        @test HVI.ones_similar_x(B', size(B,1)) isa CUDA.CuArray
-        @test HVI.ones_similar_x(@view(B[:,2]), size(B,1)) isa CUDA.CuArray
-        @test HVI.ones_similar_x(ComponentVector(b=B)[:,1], size(B,1)) isa CUDA.CuArray
+        @test @inferred HVI.ones_similar_x(B, size(B,1)) isa CUDA.CuArray
+        @test @inferred HVI.ones_similar_x(CA.ComponentVector(b=B), size(B,1)) isa CUDA.CuArray
+        @test @inferred HVI.ones_similar_x(B', size(B,1)) isa CUDA.CuArray
+        @test @inferred HVI.ones_similar_x(@view(B[:,2]), size(B,1)) isa CUDA.CuArray
+        @test @inferred HVI.ones_similar_x(CA.ComponentVector(b=B)[:,1], size(B,1)) isa CUDA.CuArray
     end
 end