From 2fdb5f05855c6d9d223c909b0418453b1384ed7f Mon Sep 17 00:00:00 2001 From: Weijia Song Date: Sun, 6 Jun 2021 16:47:24 -0400 Subject: [PATCH] Another critical bugfix for the API change from libfabric 1.7.0 to libfabric 1.12.1 In libfabric v1.12.1, the verbs provider's fi_cq_open() API does not pick a valid size if the given size is zero. In such a case, fi_msg() will always return -FI_EAGAIN, causing an infinite loop in RDMC initialization. The TCP provider is not affected. Instead of letting fi_cq_open() pick one for us, we set it to a fixed value of 2097152. --- src/rdmc/lf_helper.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/rdmc/lf_helper.cpp b/src/rdmc/lf_helper.cpp index de0b3363..b37b6502 100644 --- a/src/rdmc/lf_helper.cpp +++ b/src/rdmc/lf_helper.cpp @@ -665,6 +665,11 @@ bool lf_initialize(const std::map>& ip fail_if_nonzero_retry_on_eagain( "fi_domain() failed", CRASH_ON_FAILURE, fi_domain, g_ctxt.fabric, g_ctxt.fi, &(g_ctxt.domain), nullptr); + /** + * libfabric 1.12 does not pick an adequate default value for completion queue size. + * We simply set it to a large enough one. + */ + g_ctxt.cq_attr.size = 2097152; fail_if_nonzero_retry_on_eagain( "failed to initialize tx completion queue", CRASH_ON_FAILURE, fi_cq_open, g_ctxt.domain, &(g_ctxt.cq_attr), &(g_ctxt.cq), nullptr);