Skip to content

Commit

Permalink
Fix: controller: Replace node state atomically at DC join ack step
Browse files Browse the repository at this point in the history
Currently, do_dc_join_ack() deletes part of the joining node's resource
history from the CIB and then updates the remaining portion. If we
query the CIB or we're interrupted in some way between the delete and
the update, then we get a bad view of the state, where pieces are
missing.

Here, we use a CIB transaction to ensure that the update happens
atomically as a replacement.

Of note: anything passed as user_data to a CIB client callback must be
freeable by the function passed to cib_client_register_callback_full()
as the free_func argument. That argument is of type
void (*free_func)(void *).

free_xml() can't be passed as the free_func() argument to a CIB client
callback. The behavior is undefined by the C standard, and GCC throws an
error. (void *) and (xmlNode *) are not compatible.
- https://stackoverflow.com/a/559671/7660197

Casting might be safe in practice though:
- https://stackoverflow.com/a/14044244/7660197

Closes T186
Closes CLBZ#5306
Closes RHBZ#2000595

Signed-off-by: Reid Wahl <nrwahl@protonmail.com>
  • Loading branch information
nrwahl2 committed Jun 15, 2023
1 parent 6a074c4 commit 5e3b3d1
Showing 1 changed file with 162 additions and 37 deletions.
199 changes: 162 additions & 37 deletions daemons/controld/controld_join_dc.c
Original file line number Diff line number Diff line change
Expand Up @@ -673,23 +673,164 @@ finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, voi
}
}

/*!
* \internal
* \brief Free XML in a CIB client callback
*
* \param[in,out] xml XML to free
*/
static void
join_update_complete_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
xml_callback_free_fn(void *xml)
{
fsa_data_t *msg_data = NULL;
free_xml((xmlNode *) xml);
}

if (rc == pcmk_ok) {
crm_debug("join-%d node history update (via CIB call %d) complete",
current_join_id, call_id);
check_join_state(controld_globals.fsa_state, __func__);
static void
join_node_state_commit_callback(xmlNode *msg, int call_id, int rc,
xmlNode *output, void *user_data)
{
controld_forget_cib_replace_call(call_id);

if (rc != pcmk_ok) {
fsa_data_t *msg_data = NULL; // for register_fsa_error() macro

crm_crit("join-%d node history update (via CIB call %d) failed: %s",
current_join_id, call_id, pcmk_strerror(rc));
crm_log_xml_debug(msg, "failed");
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
}

crm_debug("join-%d node history update (via CIB call %d) complete",
current_join_id, call_id);
check_join_state(controld_globals.fsa_state, __func__);
}

static void
join_node_state_update_callback(xmlNode *msg, int call_id, int rc,
xmlNode *output, void *user_data)
{
char *join_from = NULL;
cib_t *cib = controld_globals.cib_conn;

if (rc != pcmk_ok) {
fsa_data_t *msg_data = NULL; // for register_fsa_error() macro

crm_crit("join-%d node history update (via CIB call %d) failed: %s",
current_join_id, call_id, pcmk_strerror(rc));
crm_log_xml_debug(msg, "failed");
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
return;
}

crm_debug("join-%d node history update extend-delete (via CIB call %d) "
"succeeded",
current_join_id, call_id);

// Make a copy so that it can be freed when the callback runs
pcmk__str_update(&join_from, (const char *) user_data);

rc = cib->cmds->end_transaction(cib, true, cib_scope_local);
fsa_register_cib_callback(rc, join_from, join_node_state_commit_callback);
if (rc >= 0) {
// Expect a CIB replacement
controld_record_cib_replace_call(rc);
}
}

static void
join_node_state_delete_callback(xmlNode *msg, int call_id, int rc,
xmlNode *output, void *user_data)
{
xmlNode *join_msg = user_data;
char *join_from = NULL;
const int call_options = cib_scope_local|cib_can_create|cib_transaction;

if (rc != pcmk_ok) {
fsa_data_t *msg_data = NULL; // for register_fsa_error() macro

crm_crit("join-%d node history update (via CIB call %d) failed: %s",
current_join_id, call_id, pcmk_strerror(rc));
crm_log_xml_debug(msg, "failed");
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
return;
}

crm_debug("join-%d node history update extend-delete (via CIB call %d) "
"succeeded",
current_join_id, call_id);

// Make a copy so that it can be freed when the callback runs
join_from = crm_element_value_copy(join_msg, F_CRM_HOST_FROM);

if (pcmk__str_eq(join_from, controld_globals.our_nodename,
pcmk__str_casei)) {

xmlNode *state = controld_query_executor_state();

if (state != NULL) {
crm_debug("Updating local node history for join-%d from query "
"result",
current_join_id);
controld_update_cib(XML_CIB_TAG_STATUS, state, call_options,
join_node_state_update_callback, join_from);
free_xml(state);

} else {
crm_warn("Updating local node history from join-%d confirmation "
"because query failed",
current_join_id);
controld_update_cib(XML_CIB_TAG_STATUS, join_msg, call_options,
join_node_state_update_callback, join_from);
}

} else {
crm_err("join-%d node history update (via CIB call %d) failed: %s "
"(next transition may determine resource status incorrectly)",
current_join_id, call_id, pcmk_strerror(rc));
crm_debug("Updating node history for %s from join-%d confirmation",
join_from, current_join_id);
controld_update_cib(XML_CIB_TAG_STATUS, join_msg, call_options,
join_node_state_update_callback, join_from);
}
}

static void
join_node_state_init_callback(xmlNode *msg, int call_id, int rc,
xmlNode *output, void *user_data)
{
xmlNode *join_msg = user_data;
const char *join_from = crm_element_value(join_msg, F_CRM_HOST_FROM);
enum controld_section_e section = controld_section_lrm;
char *xpath = NULL;
cib_t *cib = controld_globals.cib_conn;

if (rc != pcmk_ok) {
fsa_data_t *msg_data = NULL; // for register_fsa_error() macro

crm_crit("join-%d node history update (via CIB call %d) failed: %s",
current_join_id, call_id, pcmk_strerror(rc));
crm_log_xml_debug(msg, "failed");
register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
return;
}

crm_debug("join-%d node history update init (via CIB call %d) succeeded ",
current_join_id, call_id);

if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
section = controld_section_lrm_unlocked;
}
controld_node_state_deletion_strings(join_from, section, &xpath, NULL);

rc = cib->cmds->remove(cib, xpath, NULL,
cib_scope_local
|cib_xpath
|cib_multiple
|cib_transaction);

cib->cmds->register_callback_full(cib, rc, cib_op_timeout(), FALSE,
copy_xml(join_msg),
"join_node_state_delete_callback",
join_node_state_delete_callback,
xml_callback_free_fn);
free(xpath);
}

/* A_DC_JOIN_PROCESS_ACK */
Expand All @@ -701,13 +842,14 @@ do_dc_join_ack(long long action,
{
int join_id = -1;
ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg);
enum controld_section_e section = controld_section_lrm;
const int cib_opts = cib_scope_local|cib_can_create;

const char *op = crm_element_value(join_ack->msg, F_CRM_TASK);
const char *join_from = crm_element_value(join_ack->msg, F_CRM_HOST_FROM);
crm_node_t *peer = NULL;

cib_t *cib = controld_globals.cib_conn;
int rc = 0;

// Sanity checks
if (join_from == NULL) {
crm_warn("Ignoring message received without node identification");
Expand Down Expand Up @@ -751,33 +893,16 @@ do_dc_join_ack(long long action,

/* Update CIB with node's current executor state. A new transition will be
* triggered later, when the CIB notifies us of the change.
*
* A chain of callbacks extends and ultimately commits the transaction.
*/
if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
section = controld_section_lrm_unlocked;
}
controld_delete_node_state(join_from, section, cib_scope_local);
if (pcmk__str_eq(join_from, controld_globals.our_nodename,
pcmk__str_casei)) {
xmlNode *now_dc_lrmd_state = controld_query_executor_state();

if (now_dc_lrmd_state != NULL) {
crm_debug("Updating local node history for join-%d "
"from query result", join_id);
controld_update_cib(XML_CIB_TAG_STATUS, now_dc_lrmd_state, cib_opts,
join_update_complete_callback, NULL);
free_xml(now_dc_lrmd_state);
} else {
crm_warn("Updating local node history from join-%d confirmation "
"because query failed", join_id);
controld_update_cib(XML_CIB_TAG_STATUS, join_ack->xml, cib_opts,
join_update_complete_callback, NULL);
}
} else {
crm_debug("Updating node history for %s from join-%d confirmation",
join_from, join_id);
controld_update_cib(XML_CIB_TAG_STATUS, join_ack->xml, cib_opts,
join_update_complete_callback, NULL);
}
rc = cib->cmds->init_transaction(cib, cib_scope_local);

cib->cmds->register_callback_full(cib, rc, cib_op_timeout(), FALSE,
copy_xml(join_ack->msg),
"join_node_state_init_callback",
join_node_state_init_callback,
xml_callback_free_fn);
}

void
Expand Down

0 comments on commit 5e3b3d1

Please sign in to comment.