diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 4b01c51a7..91670535e 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -49,6 +49,7 @@ spack_setup:
 .gpu_node:
   variables:
     bb5_constraint: volta
+    bb5_ntasks: 16
 .test_neuron:
   extends: [.ctest]
   stage: test_neuron
diff --git a/coreneuron/apps/main1.cpp b/coreneuron/apps/main1.cpp
index 410fe5c62..7411dbfa2 100644
--- a/coreneuron/apps/main1.cpp
+++ b/coreneuron/apps/main1.cpp
@@ -470,6 +470,7 @@ static void* load_dynamic_mpi(const std::string& libname) {
 extern "C" void mk_mech_init(int argc, char** argv) {
     // reset all parameters to their default values
     corenrn_param.reset();
+
     // read command line parameters and parameter config files
     corenrn_param.parse(argc, argv);
 
diff --git a/coreneuron/io/core2nrn_data_return.cpp b/coreneuron/io/core2nrn_data_return.cpp
index 2d046a866..6a12c197f 100644
--- a/coreneuron/io/core2nrn_data_return.cpp
+++ b/coreneuron/io/core2nrn_data_return.cpp
@@ -170,6 +170,14 @@ static void core2nrn_corepointer(int tid, NrnThreadMembList* tml) {
  */
 static void core2nrn_tqueue(NrnThread&);
 
+/** @brief Callback to clear NEURON thread queues.
+    In particular need to initialize bin queues to the current time before
+    transferring events.
+ */
+extern "C" {
+void (*core2nrn_clear_queues_)(double t);
+}
+
 /** @brief All activated WATCH statements need activation on NEURON side.
  */
 // vector in unpermuted Memb_list index order of vector of
@@ -200,6 +208,9 @@ void core2nrn_data_return() {
     if (!nrn2core_type_return_) {
         return;
     }
+
+    (*core2nrn_clear_queues_)(nrn_threads[0]._t);  // all threads at same time
+
     for (int tid = 0; tid < nrn_nthread; ++tid) {
         size_t n = 0;
         double* data = nullptr;
diff --git a/coreneuron/io/nrn2core_data_init.cpp b/coreneuron/io/nrn2core_data_init.cpp
index e79ed824d..6838c668d 100644
--- a/coreneuron/io/nrn2core_data_init.cpp
+++ b/coreneuron/io/nrn2core_data_init.cpp
@@ -51,6 +51,8 @@ void direct_mode_initialize() {
     dt2thread(-1.);
     nrn_thread_table_check();
 
+    clear_event_queue();
+
     // Reproduce present NEURON WATCH activation
     // Start from nothing active.
     watch_activate_clear();
diff --git a/coreneuron/io/nrn_setup.cpp b/coreneuron/io/nrn_setup.cpp
index ccd1e7b8c..6bec06fde 100644
--- a/coreneuron/io/nrn_setup.cpp
+++ b/coreneuron/io/nrn_setup.cpp
@@ -180,7 +180,7 @@ void nrn_read_filesdat(int& ngrp, int*& grp, const char* filesdat) {
     FILE* fp = fopen(filesdat, "r");
 
     if (!fp) {
-        nrn_fatal_error("No input file with nrnthreads, exiting...");
+        nrn_fatal_error("No input file ( %s ) with nrnthreads, exiting...", filesdat);
     }
 
     char version[256];
@@ -710,6 +710,9 @@ void nrn_cleanup_ion_map() {
 void nrn_cleanup() {
     clear_event_queue();  // delete left-over TQItem
 
+    for (auto psi: gid2in) {
+        delete psi.second;
+    }
     gid2in.clear();
     gid2out.clear();
 
diff --git a/coreneuron/mpi/lib/mpispike.cpp b/coreneuron/mpi/lib/mpispike.cpp
index 87d073d7e..bbe81ac6c 100644
--- a/coreneuron/mpi/lib/mpispike.cpp
+++ b/coreneuron/mpi/lib/mpispike.cpp
@@ -24,7 +24,7 @@ extern MPI_Comm nrnmpi_comm;
 
 static int np;
 static int* displs{nullptr};
-static int* byteovfl; /* for the compressed transfer method */
+static int* byteovfl{nullptr}; /* for the compressed transfer method */
 static MPI_Datatype spike_type;
 
 static void* emalloc(size_t size) {
@@ -175,7 +175,7 @@ The allgather sends the first part of the buf and the allgatherv buffer
 sends any overflow.
 */
 int nrnmpi_spike_exchange_compressed_impl(int localgid_size,
-                                          unsigned char* spfixin_ovfl,
+                                          unsigned char*& spfixin_ovfl,
                                           int send_nspike,
                                           int* nin,
                                           int ovfl_capacity,
@@ -187,9 +187,10 @@ int nrnmpi_spike_exchange_compressed_impl(int localgid_size,
         np = nrnmpi_numprocs_;
         displs = (int*) emalloc(np * sizeof(int));
        displs[0] = 0;
+    }
+    if (!byteovfl) {
         byteovfl = (int*) emalloc(np * sizeof(int));
     }
-
     MPI_Allgather(
         spikeout_fixed, ag_send_size, MPI_BYTE, spikein_fixed, ag_send_size, MPI_BYTE, nrnmpi_comm);
     int novfl = 0;
diff --git a/coreneuron/mpi/lib/nrnmpi.cpp b/coreneuron/mpi/lib/nrnmpi.cpp
index 070ce05fd..bc84a969e 100644
--- a/coreneuron/mpi/lib/nrnmpi.cpp
+++ b/coreneuron/mpi/lib/nrnmpi.cpp
@@ -35,6 +35,12 @@ static void nrn_fatal_error(const char* msg) {
 }
 
 nrnmpi_init_ret_t nrnmpi_init_impl(int* pargc, char*** pargv, bool is_quiet) {
+    // Execute at most once per launch. Avoid memory leak.
+    static bool executed = false;
+    if (executed) {
+        return {nrnmpi_numprocs_, nrnmpi_myid_};
+    }
+
     nrnmpi_under_nrncontrol_ = true;
 
     if (!nrnmpi_initialized_impl()) {
@@ -62,6 +68,7 @@ nrnmpi_init_ret_t nrnmpi_init_impl(int* pargc, char*** pargv, bool is_quiet) {
 #endif
     }
 
+    executed = true;
     return {nrnmpi_numprocs_, nrnmpi_myid_};
 }
 
diff --git a/coreneuron/mpi/nrnmpidec.h b/coreneuron/mpi/nrnmpidec.h
index f5ac5bf60..a9f12c8f6 100644
--- a/coreneuron/mpi/nrnmpidec.h
+++ b/coreneuron/mpi/nrnmpidec.h
@@ -37,7 +37,7 @@ extern mpi_function nrnmp
 extern "C" int nrnmpi_spike_exchange_impl(int* nin, NRNMPI_Spike* spikeout, int icapacity, NRNMPI_Spike** spikein, int& ovfl, int nout, NRNMPI_Spikebuf* spbufout, NRNMPI_Spikebuf* spbufin);
 extern mpi_function nrnmpi_spike_exchange;
 
-extern "C" int nrnmpi_spike_exchange_compressed_impl(int, unsigned char*, int, int*, int, unsigned char*, int, unsigned char*, int& ovfl);
+extern "C" int nrnmpi_spike_exchange_compressed_impl(int, unsigned char*&, int, int*, int, unsigned char*, int, unsigned char*, int& ovfl);
 extern mpi_function nrnmpi_spike_exchange_compressed;
 
 extern "C" int nrnmpi_int_allmax_impl(int i);
diff --git a/coreneuron/network/multisend_setup.cpp b/coreneuron/network/multisend_setup.cpp
index a11edb54e..72453b7a1 100644
--- a/coreneuron/network/multisend_setup.cpp
+++ b/coreneuron/network/multisend_setup.cpp
@@ -224,7 +224,7 @@ void TarList::alloc() {
 
 // for two phase
 
-static nrnran123_State* ranstate;
+static nrnran123_State* ranstate{nullptr};
 
 static void random_init(int i) {
     if (!ranstate) {
@@ -236,6 +236,14 @@ static unsigned int get_random() {
     return nrnran123_ipick(ranstate);
 }
 
+// Avoid warnings if the global index is changed on subsequent psolve.
+static void random_delete() {
+    if (ranstate) {
+        nrnran123_deletestream(ranstate);
+        ranstate = nullptr;
+    }
+}
+
 static int iran(int i1, int i2) {
     // discrete uniform random integer from i2 to i2 inclusive. Must
     // work if i1 == i2
@@ -575,6 +583,7 @@ static std::vector setup_target_lists(bool use_phase2) {
             phase2organize(tl);
         }
     }
+    random_delete();
 }
 
 // For clarity, use the all2allv_int style of information flow
diff --git a/coreneuron/network/netcvode.cpp b/coreneuron/network/netcvode.cpp
index 4fb1d165f..cecd1f30a 100644
--- a/coreneuron/network/netcvode.cpp
+++ b/coreneuron/network/netcvode.cpp
@@ -135,6 +135,10 @@ void NetCvodeThreadData::interthread_send(double td, DiscreteEvent* db, NrnThrea
     inter_thread_events_.emplace_back(InterThreadEvent{db, td});
 }
 
+void interthread_enqueue(NrnThread* nt) {
+    net_cvode_instance->p[nt->id].enqueue(net_cvode_instance, nt);
+}
+
 void NetCvodeThreadData::enqueue(NetCvode* nc, NrnThread* nt) {
     std::lock_guard lock(mut);
     for (const auto& ite: inter_thread_events_) {
@@ -229,14 +233,14 @@ void NetCvode::clear_events() {
         d.unreffed_event_cnt_ = 0;
         d.inter_thread_events_.clear();
         d.tqe_->nshift_ = -1;
-        d.tqe_->shift_bin(nrn_threads->_t);
+        d.tqe_->shift_bin(nrn_threads->_t - 0.5 * nrn_threads->_dt);
     }
 }
 
 void NetCvode::init_events() {
     for (int i = 0; i < nrn_nthread; ++i) {
         p[i].tqe_->nshift_ = -1;
-        p[i].tqe_->shift_bin(nrn_threads->_t);
+        p[i].tqe_->shift_bin(nrn_threads->_t - 0.5 * nrn_threads->_dt);
     }
     for (int tid = 0; tid < nrn_nthread; ++tid) {
         // can be done in parallel
diff --git a/coreneuron/network/netcvode.hpp b/coreneuron/network/netcvode.hpp
index b5694b10f..6e1da66e5 100644
--- a/coreneuron/network/netcvode.hpp
+++ b/coreneuron/network/netcvode.hpp
@@ -37,6 +37,7 @@ class DiscreteEvent;
 class NetCvode;
 
 extern NetCvode* net_cvode_instance;
+extern void interthread_enqueue(NrnThread*);
 
 struct InterThreadEvent {
     DiscreteEvent* de_;
diff --git a/coreneuron/network/netpar.cpp b/coreneuron/network/netpar.cpp
index cbb2b547c..036e5baab 100644
--- a/coreneuron/network/netpar.cpp
+++ b/coreneuron/network/netpar.cpp
@@ -286,7 +286,10 @@ void nrn_spike_exchange_init() {
         t_exchange_ = t;
         dt1_ = rev_dt;
         usable_mindelay_ = floor(mindelay_ * dt1_ + 1e-9) * dt;
-        assert(usable_mindelay_ >= dt && (usable_mindelay_ * dt1_) < 255);
+        if (usable_mindelay_ * dt1_ >= 255.) {
+            usable_mindelay_ = 255. / dt1_;
+        }
+        assert(usable_mindelay_ >= dt && (usable_mindelay_ * dt1_) <= 255.);
     } else {
 #if nrn_spikebuf_size > 0
         if (spbufout) {
@@ -366,6 +369,7 @@ void nrn_spike_exchange(NrnThread* nt) {
             ps->send(spikein[i].spiketime, net_cvode_instance, nt);
         }
     }
+    nrn_multithread_job(interthread_enqueue);
 
     wt1_ = nrn_wtime() - wt;
 }
@@ -482,6 +486,12 @@ void nrn_spike_exchange_compressed(NrnThread* nt) {
             }
         }
     }
+    // In case of multiple threads some above ps->send events put
+    // NetCon events into interthread buffers. Some of those may
+    // need to be delivered early enough that the interthread buffers
+    // need transfer to the thread event queues before the next dqueue_bin
+    // while loop in deliver_net_events. So enqueue now...
+    nrn_multithread_job(interthread_enqueue);
     t_exchange_ = nrn_threads->_t;
     wt1_ = nrn_wtime() - wt;
 }
@@ -606,6 +616,7 @@ void BBS_netpar_solve(double tstop) {
     }
 
     nrn_timeout(timeout_);
+    nrn_multithread_job(interthread_enqueue);
     ncs2nrn_integrate(tstop * (1. + 1e-11));
     nrn_spike_exchange(nrn_threads);
     nrn_timeout(0);
diff --git a/external/nmodl b/external/nmodl
index 46f8baf2b..3e960d7d9 160000
--- a/external/nmodl
+++ b/external/nmodl
@@ -1 +1 @@
-Subproject commit 46f8baf2bbeaa0d21559d6306ec37b94c601f1ee
+Subproject commit 3e960d7d9e6db1e4f74a1c7fb6b773a6a3cd593c