Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
Fix: pacemaker-remote: pacemaker_remoted shutdown while unmanaged
Since the introduction of the graceful shutdown of pacemaker_remoted,
the shutdown hangs if the remote-resource is unmanaged.
This happens because pacemaker_remoted waits for all resources
running on the remote-node to be shut down, while pacemaker,
on the other hand, doesn't touch resources on a remote-node
when the remote-resource is unmanaged.

Fixes rhbz#1388102
  • Loading branch information
wenningerk committed Mar 20, 2017
1 parent 43967f0 commit 0113ff6
Show file tree
Hide file tree
Showing 19 changed files with 329 additions and 36 deletions.
2 changes: 2 additions & 0 deletions crmd/crmd_lrm.h
Expand Up @@ -162,5 +162,7 @@ int remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *acti
void remote_ra_cleanup(lrm_state_t * lrm_state);
void remote_ra_fail(const char *node_name);
void remote_ra_process_pseudo(xmlNode *xml);
gboolean remote_ra_is_in_maintenance(lrm_state_t * lrm_state);
void remote_ra_process_maintenance_nodes(xmlNode *xml);

gboolean process_lrm_event(lrm_state_t * lrm_state, lrmd_event_data_t * op, struct recurring_op_s *pending);
20 changes: 14 additions & 6 deletions crmd/lrm_state.c
Expand Up @@ -572,14 +572,21 @@ remote_proxy_cb(lrmd_t *lrmd, void *userdata, xmlNode *msg)
crm_notice("%s requested shutdown of its remote connection",
lrm_state->node_name);

now_s = crm_itoa(now);
update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, TRUE);
free(now_s);
if (!remote_ra_is_in_maintenance(lrm_state)) {
now_s = crm_itoa(now);
update_attrd(lrm_state->node_name, XML_CIB_ATTR_SHUTDOWN, now_s, NULL, TRUE);
free(now_s);

remote_proxy_ack_shutdown(lrmd);
remote_proxy_ack_shutdown(lrmd);

crm_warn("Reconnection attempts to %s may result in failures that must be cleared",
lrm_state->node_name);
crm_warn("Reconnection attempts to %s may result in failures that must be cleared",
lrm_state->node_name);
} else {
remote_proxy_nack_shutdown(lrmd);

crm_notice("Remote resource for %s is not managed so no ordered shutdown happening",
lrm_state->node_name);
}
return;

} else if (safe_str_eq(op, LRMD_IPC_OP_NEW)) {
Expand Down Expand Up @@ -852,3 +859,4 @@ lrm_state_unregister_rsc(lrm_state_t * lrm_state,

return ((lrmd_t *) lrm_state->conn)->cmds->unregister_rsc(lrm_state->conn, rsc_id, options);
}

5 changes: 5 additions & 0 deletions crmd/messages.c
Expand Up @@ -872,6 +872,11 @@ handle_request(xmlNode * stored_msg, enum crmd_fsa_cause cause)
reap_crm_member(id, name);
}

} else if (strcmp(op, CRM_OP_MAINTENANCE_NODES) == 0) {
xmlNode *xml = get_message_xml(stored_msg, F_CRM_DATA);

remote_ra_process_maintenance_nodes(xml);

} else {
crm_err("Unexpected request (%s) sent to %s", op, AM_I_DC ? "the DC" : "non-DC node");
crm_log_xml_err(stored_msg, "Unexpected");
Expand Down
124 changes: 118 additions & 6 deletions crmd/remote_lrmd_ra.c
Expand Up @@ -80,6 +80,10 @@ typedef struct remote_ra_data_s {
enum remote_migration_status migrate_status;

gboolean active;
gboolean is_maintenance; /* kind of complex to determine from crmd-context
* so we have it signalled back with the
* transition from pengine
*/
} remote_ra_data_t;

static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
Expand Down Expand Up @@ -485,6 +489,28 @@ monitor_timeout_cb(gpointer data)
return FALSE;
}

/*!
 * \internal
 * \brief Hand a fabricated "operation succeeded" event to process_lrm_event()
 *
 * The event is of type lrmd_event_exec_complete with rc PCMK_OCF_OK and
 * status PCMK_LRM_OP_DONE. Both timestamps are set to the current time and
 * the call ID comes from generate_callid(). No pending-operation record is
 * passed along (third argument of process_lrm_event() is NULL).
 *
 * \param[in] lrm_state  State object to process the event against; when NULL,
 *                       the one found for fsa_our_uname is used
 * \param[in] rsc_id     Resource ID stored in the event
 * \param[in] op_type    Operation name stored in the event
 */
static void
synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
{
    lrmd_event_data_t fake_event = { 0, };
    lrm_state_t *target = lrm_state;

    if (target == NULL) {
        /* no state supplied by the caller: fall back to the local one */
        target = lrm_state_find(fsa_our_uname);
    }
    CRM_ASSERT(target != NULL);

    /* what happened, and to which resource */
    fake_event.rsc_id = rsc_id;
    fake_event.op_type = op_type;
    fake_event.type = lrmd_event_exec_complete;

    /* outcome: finished successfully */
    fake_event.op_status = PCMK_LRM_OP_DONE;
    fake_event.rc = PCMK_OCF_OK;

    /* run time and last rc change are both "now" */
    fake_event.t_run = time(NULL);
    fake_event.t_rcchange = fake_event.t_run;

    fake_event.call_id = generate_callid();

    process_lrm_event(target, &fake_event, NULL);
}

void
remote_lrm_op_callback(lrmd_event_data_t * op)
{
Expand Down Expand Up @@ -536,9 +562,18 @@ remote_lrm_op_callback(lrmd_event_data_t * op)
(ra_data->cur_cmd == NULL) &&
(ra_data->active == TRUE)) {

crm_err("Unexpected disconnect on remote-node %s", lrm_state->node_name);
ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
if (!remote_ra_is_in_maintenance(lrm_state)) {
crm_err("Unexpected disconnect on remote-node %s", lrm_state->node_name);
ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
} else {
crm_notice("Disconnect on unmanaged remote-node %s", lrm_state->node_name);
/* Do roughly what a 'stop' on the remote-resource would do */
handle_remote_ra_stop(lrm_state, NULL);
remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
/* now fake the reply of a successful 'stop' */
synthesize_lrmd_success(NULL, lrm_state->node_name, "stop");
}
return;
}

Expand Down Expand Up @@ -651,8 +686,6 @@ handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)

ra_data->active = FALSE;
lrm_state_disconnect(lrm_state);
cmd->rc = PCMK_OCF_OK;
cmd->op_status = PCMK_LRM_OP_DONE;

if (ra_data->cmds) {
g_list_free_full(ra_data->cmds, free_cmd);
Expand All @@ -664,7 +697,12 @@ handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
ra_data->recurring_cmds = NULL;
ra_data->cur_cmd = NULL;

report_remote_ra_result(cmd);
if (cmd) {
cmd->rc = PCMK_OCF_OK;
cmd->op_status = PCMK_LRM_OP_DONE;

report_remote_ra_result(cmd);
}
}

static int
Expand Down Expand Up @@ -1140,3 +1178,77 @@ remote_ra_process_pseudo(xmlNode *xml)
}
freeXpathObject(search);
}

/*!
 * \internal
 * \brief Record a remote node's maintenance state in the CIB status section
 *
 * Creates a node state update for the remote peer named by
 * lrm_state->node_name, sets XML_NODE_IS_MAINTENANCE to "1" or "0", and
 * submits it with fsa_cib_update(). The locally cached is_maintenance flag
 * is only changed when the resulting call ID is not negative.
 *
 * \param[in] lrm_state    State object of the remote connection; its
 *                         remote_ra_data is dereferenced without a check
 * \param[in] maintenance  New maintenance state (non-zero means "1")
 */
static void
remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    int cib_options = crmd_cib_smart_opt();
    crm_node_t *peer = crm_remote_peer_get(lrm_state->node_name);
    xmlNode *status_xml = NULL;
    xmlNode *node_state = NULL;
    int call_id = 0;

    CRM_CHECK(peer != NULL, return);

    status_xml = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    node_state = create_node_state_update(peer, node_update_none, status_xml,
                                          __FUNCTION__);
    if (maintenance) {
        crm_xml_add(node_state, XML_NODE_IS_MAINTENANCE, "1");
    } else {
        crm_xml_add(node_state, XML_NODE_IS_MAINTENANCE, "0");
    }

    /* call_id is passed by name and tested afterwards, so fsa_cib_update is
     * presumably a macro that assigns it -- TODO confirm against its definition
     */
    fsa_cib_update(XML_CIB_TAG_STATUS, status_xml, cib_options, call_id, NULL);

    if (call_id >= 0) {
        /* TODO: still not 100% sure that async update will succeed ... */
        ra_data->is_maintenance = maintenance;
    } else {
        crm_perror(LOG_WARNING, "%s CIB node state update failed", lrm_state->node_name);
    }

    free_xml(status_xml);
}

/* XPath expression selecting the XML_GRAPH_TAG_MAINTENANCE child of any
 * XML_GRAPH_TAG_PSEUDO_EVENT element whose XML_LRM_ATTR_TASK attribute
 * equals CRM_OP_MAINTENANCE_NODES
 */
#define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
"[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \
XML_GRAPH_TAG_MAINTENANCE

/*!
 * \internal
 * \brief Apply maintenance-state updates carried by a pseudo-action
 *
 * Searches \p xml with XPATH_PSEUDO_MAINTENANCE. Only when exactly one match
 * is found are its children walked, starting at the first child named
 * XML_CIB_TAG_NODE. For every entry whose ID resolves to a state object with
 * an active remote connection, the entry's XML_NODE_IS_MAINTENANCE value
 * (default "0") is forwarded to remote_ra_maintenance().
 *
 * \param[in] xml  XML of pseudo-action to check
 */

void
remote_ra_process_maintenance_nodes(xmlNode *xml)
{
    xmlXPathObjectPtr matches = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);

    if (numXpathResults(matches) == 1) {
        int total = 0;
        int remotes = 0;
        xmlNode *entry = first_named_child(getXpathResult(matches, 0),
                                           XML_CIB_TAG_NODE);

        while (entry) {
            lrm_state_t *lrm_state = lrm_state_find(ID(entry));
            remote_ra_data_t *ra_data = NULL;

            if (lrm_state) {
                ra_data = lrm_state->remote_ra_data;
            }

            total++;
            if (ra_data && ra_data->active) {
                const char *value = crm_element_value(entry,
                                                      XML_NODE_IS_MAINTENANCE);

                remotes++;
                remote_ra_maintenance(lrm_state, crm_atoi(value, "0"));
            }

            entry = __xml_next(entry);
        }

        crm_trace("Action holds %d nodes (%d remotes found) "
                  "adjusting maintenance-mode", total, remotes);
    }

    freeXpathObject(matches);
}

/*!
 * \internal
 * \brief Report the cached maintenance flag of a remote connection
 *
 * \param[in] lrm_state  State object whose remote_ra_data holds the flag
 *
 * \return Value of the is_maintenance field in the connection's remote_ra_data
 *
 * \note Neither lrm_state nor its remote_ra_data is checked for NULL here.
 */
gboolean
remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
{
    return ((remote_ra_data_t *) lrm_state->remote_ra_data)->is_maintenance;
}
28 changes: 26 additions & 2 deletions crmd/te_actions.c
Expand Up @@ -53,8 +53,32 @@ te_start_action_timer(crm_graph_t * graph, crm_action_t * action)
static gboolean
te_pseudo_action(crm_graph_t * graph, crm_action_t * pseudo)
{
/* Check action for Pacemaker Remote node side effects */
remote_ra_process_pseudo(pseudo->xml);
const char *task = crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK);

/* send to peers as well? */
if (safe_str_eq(task, CRM_OP_MAINTENANCE_NODES)) {
GHashTableIter iter;
crm_node_t *node = NULL;

g_hash_table_iter_init(&iter, crm_peer_cache);
while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
xmlNode *cmd = NULL;

if (safe_str_eq(fsa_our_uname, node->uname)) {
continue;
}

cmd = create_request(task, pseudo->xml, node->uname,
CRM_SYSTEM_CRMD, CRM_SYSTEM_TENGINE, NULL);
send_cluster_message(node, crm_msg_crmd, cmd, FALSE);
free_xml(cmd);
}

remote_ra_process_maintenance_nodes(pseudo->xml);
} else {
/* Check action for Pacemaker Remote node side effects */
remote_ra_process_pseudo(pseudo->xml);
}

crm_debug("Pseudo-action %d (%s) fired and confirmed", pseudo->id,
crm_element_value(pseudo->xml, XML_LRM_ATTR_TASK_KEY));
Expand Down
1 change: 1 addition & 0 deletions include/crm/crm.h
Expand Up @@ -126,6 +126,7 @@ extern char *crm_system_name;
# define CRM_OP_RELAXED_SET "one-or-more"
# define CRM_OP_RELAXED_CLONE "clone-one-or-more"
# define CRM_OP_RM_NODE_CACHE "rm_node_cache"
# define CRM_OP_MAINTENANCE_NODES "maintenance_nodes"

/* @COMPAT: These symbols are deprecated and not used by Pacemaker,
* but they are kept for public API backward compatibility.
Expand Down
1 change: 1 addition & 0 deletions include/crm/lrmd.h
Expand Up @@ -99,6 +99,7 @@ typedef struct lrmd_key_value_s {
#define LRMD_IPC_OP_RESPONSE "response"
#define LRMD_IPC_OP_SHUTDOWN_REQ "shutdown_req"
#define LRMD_IPC_OP_SHUTDOWN_ACK "shutdown_ack"
#define LRMD_IPC_OP_SHUTDOWN_NACK "shutdown_nack"

#define F_LRMD_IPC_OP "lrmd_ipc_op"
#define F_LRMD_IPC_IPC_SERVER "lrmd_ipc_server"
Expand Down
2 changes: 2 additions & 0 deletions include/crm/msg_xml.h
Expand Up @@ -255,6 +255,7 @@
# define XML_NODE_IS_PEER "crmd"
# define XML_NODE_IS_REMOTE "remote_node"
# define XML_NODE_IS_FENCED "node_fenced"
# define XML_NODE_IS_MAINTENANCE "node_in_maintenance"

# define XML_CIB_ATTR_SHUTDOWN "shutdown"
# define XML_CIB_ATTR_STONITH "stonith"
Expand Down Expand Up @@ -297,6 +298,7 @@
# define XML_GRAPH_TAG_PSEUDO_EVENT "pseudo_event"
# define XML_GRAPH_TAG_CRM_EVENT "crm_event"
# define XML_GRAPH_TAG_DOWNED "downed"
# define XML_GRAPH_TAG_MAINTENANCE "maintenance"

# define XML_TAG_RULE "rule"
# define XML_RULE_ATTR_SCORE "score"
Expand Down
1 change: 1 addition & 0 deletions include/crm/pengine/status.h
Expand Up @@ -160,6 +160,7 @@ struct node_shared_s {
gboolean rsc_discovery_enabled;
gboolean remote_requires_reset;
gboolean remote_was_fenced;
gboolean remote_maintenance; /* what the remote-rsc is thinking */
};

struct node_s {
Expand Down
1 change: 1 addition & 0 deletions include/crm_internal.h
Expand Up @@ -371,6 +371,7 @@ typedef struct remote_proxy_s {
} remote_proxy_t;
void remote_proxy_notify_destroy(lrmd_t *lrmd, const char *session_id);
void remote_proxy_ack_shutdown(lrmd_t *lrmd);
void remote_proxy_nack_shutdown(lrmd_t *lrmd);
void remote_proxy_relay_event(lrmd_t *lrmd, const char *session_id, xmlNode *msg);
void remote_proxy_relay_response(lrmd_t *lrmd, const char *session_id, xmlNode *msg, int msg_id);
void remote_proxy_end_session(const char *session);
Expand Down
15 changes: 15 additions & 0 deletions lib/lrmd/proxy_common.c
Expand Up @@ -59,6 +59,21 @@ remote_proxy_ack_shutdown(lrmd_t *lrmd)
free_xml(msg);
}

/*!
 * \brief Decline a shutdown request received from a remote proxy
 *
 * Builds a T_LRMD_IPC_PROXY message whose F_LRMD_IPC_OP is
 * LRMD_IPC_OP_SHUTDOWN_NACK, passes it to lrmd_internal_proxy_send(),
 * and frees the message afterwards.
 *
 * \param[in] lrmd  Connection to proxy
 */
void
remote_proxy_nack_shutdown(lrmd_t *lrmd)
{
    xmlNode *nack = create_xml_node(NULL, T_LRMD_IPC_PROXY);

    crm_xml_add(nack, F_LRMD_IPC_OP, LRMD_IPC_OP_SHUTDOWN_NACK);
    lrmd_internal_proxy_send(lrmd, nack);
    free_xml(nack);
}

void
remote_proxy_relay_event(lrmd_t *lrmd, const char *session_id, xmlNode *msg)
{
Expand Down

0 comments on commit 0113ff6

Please sign in to comment.