diff --git a/src/common/config_opts.h b/src/common/config_opts.h
index 41cc525958a7f..b0efc7586a944 100644
--- a/src/common/config_opts.h
+++ b/src/common/config_opts.h
@@ -223,6 +223,9 @@ OPTION(ms_async_rdma_send_buffers, OPT_U32, 10240)
 OPTION(ms_async_rdma_receive_buffers, OPT_U32, 10240)
 OPTION(ms_async_rdma_port_num, OPT_U32, 1)
 OPTION(ms_async_rdma_polling_us, OPT_U32, 1000)
+OPTION(ms_async_rdma_local_gid, OPT_STR, "") // GID format: "fe80:0000:0000:0000:7efe:90ff:fe72:6efe", no zero folding; empty = use GID index 0
+OPTION(ms_async_rdma_roce_ver, OPT_INT, 2) // 2=RoCEv2, 1=RoCEv1.5, 0=RoCEv1
+OPTION(ms_async_rdma_sl, OPT_INT, 3) // in RoCE, this means PCP
 
 OPTION(ms_dpdk_port_id, OPT_INT, 0)
 OPTION(ms_dpdk_coremask, OPT_STR, "1")
diff --git a/src/msg/async/rdma/Infiniband.cc b/src/msg/async/rdma/Infiniband.cc
index 591afeb080cad..2913aa1ecc4f7 100644
--- a/src/msg/async/rdma/Infiniband.cc
+++ b/src/msg/async/rdma/Infiniband.cc
@@ -46,6 +46,78 @@ Device::Device(CephContext *cct, ibv_device* d): device(d), device_attr(new ibv_
   }
 }
 
+// Query port attributes and pick the local GID for this port.
+//
+// The GID table is searched for the entry that matches both
+// ms_async_rdma_local_gid and ms_async_rdma_roce_ver; its index is stored in
+// gid_idx and its value in gid. If the configured GID is empty or malformed,
+// GID index 0 is used instead. Aborts if a verbs query fails or if the
+// requested GID is not present in the table.
+Port::Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn): ctxt(ictxt), port_num(ipn), port_attr(new ibv_port_attr) {
+  union ibv_gid cgid;
+  struct ibv_exp_gid_attr gid_attr;
+  bool malformed = false;
+
+  // ibv_query_port() signals failure with a non-zero return value
+  int r = ibv_query_port(ctxt, port_num, port_attr);
+  if (r) {
+    lderr(cct) << __func__ << " query port failed " << cpp_strerror(errno) << dendl;
+    ceph_abort();
+  }
+
+  lid = port_attr->lid;
+
+  // search for requested GID in GIDs table
+  ldout(cct, 1) << __func__ << " looking for local GID " << (cct->_conf->ms_async_rdma_local_gid)
+    << " of type " << (cct->_conf->ms_async_rdma_roce_ver) << dendl;
+  r = sscanf(cct->_conf->ms_async_rdma_local_gid.c_str(),
+             "%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx"
+             ":%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx:%02hhx%02hhx",
+             &cgid.raw[ 0], &cgid.raw[ 1],
+             &cgid.raw[ 2], &cgid.raw[ 3],
+             &cgid.raw[ 4], &cgid.raw[ 5],
+             &cgid.raw[ 6], &cgid.raw[ 7],
+             &cgid.raw[ 8], &cgid.raw[ 9],
+             &cgid.raw[10], &cgid.raw[11],
+             &cgid.raw[12], &cgid.raw[13],
+             &cgid.raw[14], &cgid.raw[15]);
+
+  // sscanf() must convert all 16 bytes; otherwise cgid is only partly
+  // written and must not be compared against the table
+  if (r != 16) {
+    ldout(cct, 1) << __func__ << " malformed or no GID supplied, using GID index 0" << dendl;
+    malformed = true;
+  }
+
+  gid_attr.comp_mask = IBV_EXP_QUERY_GID_ATTR_TYPE;
+
+  for (gid_idx = 0; gid_idx < port_attr->gid_tbl_len; gid_idx++) {
+    r = ibv_query_gid(ctxt, port_num, gid_idx, &gid);
+    if (r) {
+      lderr(cct) << __func__ << " query gid of port " << (int)port_num << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    if (malformed)
+      break; // nothing to match against, stay with gid_idx 0
+
+    r = ibv_exp_query_gid_attr(ctxt, port_num, gid_idx, &gid_attr);
+    if (r) {
+      lderr(cct) << __func__ << " query gid attributes of port " << (int)port_num << " index " << gid_idx << " failed " << cpp_strerror(errno) << dendl;
+      ceph_abort();
+    }
+    if ( (gid_attr.type == cct->_conf->ms_async_rdma_roce_ver) &&
+         (memcmp(&gid, &cgid, 16) == 0) ) {
+      ldout(cct, 1) << __func__ << " found at index " << gid_idx << dendl;
+      break;
+    }
+  }
+
+  if (gid_idx == port_attr->gid_tbl_len) {
+    lderr(cct) << __func__ << " Requested local GID was not found in GID table" << dendl;
+    ceph_abort();
+  }
+}
+
 void Device::binding_port(CephContext *cct, uint8_t port_num) {
   port_cnt = device_attr->phys_port_cnt;
   ports = new Port*[port_cnt];
diff --git a/src/msg/async/rdma/Infiniband.h b/src/msg/async/rdma/Infiniband.h
index 932af8ec6ed9c..b92f1b19d6501 100644
--- a/src/msg/async/rdma/Infiniband.h
+++ b/src/msg/async/rdma/Infiniband.h
@@ -49,30 +49,17 @@ class Port {
   struct ibv_context* ctxt;
   uint8_t port_num;
   struct ibv_port_attr* port_attr;
-  int gid_tbl_len;
   uint16_t lid;
+  int gid_idx;
   union ibv_gid gid;
 
  public:
-  explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn): ctxt(ictxt), port_num(ipn), port_attr(new ibv_port_attr) {
-    int r = ibv_query_port(ctxt, port_num, port_attr);
-    if (r == -1) {
-      lderr(cct) << __func__ << " query port failed " << cpp_strerror(errno) << dendl;
-      ceph_abort();
-    }
-
-    lid = port_attr->lid;
-    r = ibv_query_gid(ctxt, port_num, 0, &gid);
-    if (r) {
-      lderr(cct) << __func__ << " query gid failed " << cpp_strerror(errno) << dendl;
-      ceph_abort();
-    }
-  }
-
+  explicit Port(CephContext *cct, struct ibv_context* ictxt, uint8_t ipn);
   uint16_t get_lid() { return lid; }
   ibv_gid get_gid() { return gid; }
   uint8_t get_port_num() { return port_num; }
   ibv_port_attr* get_port_attr() { return port_attr; }
+  int get_gid_idx() { return gid_idx; }
 };
 
 
@@ -92,6 +79,7 @@ class Device {
   const char* get_name() { return name;}
   uint16_t get_lid() { return active_port->get_lid(); }
   ibv_gid get_gid() { return active_port->get_gid(); }
+  int get_gid_idx() { return active_port->get_gid_idx(); }
   void binding_port(CephContext *c, uint8_t port_num);
   struct ibv_context *ctxt;
   ibv_device_attr *device_attr;
diff --git a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
index b31c0b17bc14e..b214147f63696 100644
--- a/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
+++ b/src/msg/async/rdma/RDMAConnectedSocketImpl.cc
@@ -37,13 +37,16 @@ int RDMAConnectedSocketImpl::activate()
   qpa.ah_attr.is_global = 1;
   qpa.ah_attr.grh.hop_limit = 6;
   qpa.ah_attr.grh.dgid = peer_msg.gid;
-  qpa.ah_attr.grh.sgid_index = 0;
+
+  qpa.ah_attr.grh.sgid_index = infiniband->get_device()->get_gid_idx();
 
   qpa.ah_attr.dlid = peer_msg.lid;
-  qpa.ah_attr.sl = 0;
+  qpa.ah_attr.sl = cct->_conf->ms_async_rdma_sl;
   qpa.ah_attr.src_path_bits = 0;
   qpa.ah_attr.port_num = (uint8_t)(infiniband->get_ib_physical_port());
 
+  ldout(cct, 20) << __func__ << " Choosing gid_index " << (int)qpa.ah_attr.grh.sgid_index << ", sl " << (int)qpa.ah_attr.sl << dendl;
+
   r = ibv_modify_qp(qp->get_qp(), &qpa, IBV_QP_STATE |
       IBV_QP_AV |
       IBV_QP_PATH_MTU |