Skip to content

Commit

Permalink
Merge pull request #1231 from kgaillot/fail
Browse files Browse the repository at this point in the history
Fix a couple of bugs in fail-count clearing
  • Loading branch information
kgaillot committed Feb 24, 2017
2 parents 199b428 + fc14818 commit f61d0b2
Show file tree
Hide file tree
Showing 25 changed files with 141 additions and 111 deletions.
18 changes: 18 additions & 0 deletions crmd/attrd.c
Original file line number Diff line number Diff line change
Expand Up @@ -112,3 +112,21 @@ update_attrd_remote_node_removed(const char *host, const char *user_name)
crm_trace("Asking attrd to purge Pacemaker Remote node %s", host);
update_attrd_helper(host, NULL, NULL, user_name, TRUE, 'C');
}

/*!
 * \internal
 * \brief Ask attrd to clear a resource's failure attributes on a node
 *
 * Two attribute updates are sent, each with a NULL value: first for the
 * resource's fail-count attribute, then for its last-failure attribute.
 *
 * \param[in] host            Name of the node the updates are sent for
 * \param[in] rsc             Resource ID used to build both attribute names
 * \param[in] is_remote_node  Passed through to update_attrd(); also selects
 *                            the node type shown in the log message
 */
void
update_attrd_clear_failures(const char *host, const char *rsc,
                            gboolean is_remote_node)
{
    const char *node_type = is_remote_node? "Pacemaker Remote" : "cluster";
    char *failcount_attr = NULL;
    char *lastfailure_attr = NULL;

    crm_info("Asking attrd to clear failure of %s on %s node %s",
             rsc, node_type, host);

    /* Each attribute name is freed here as soon as its update has been sent */
    failcount_attr = crm_failcount_name(rsc);
    update_attrd(host, failcount_attr, NULL, NULL, is_remote_node);
    free(failcount_attr);

    lastfailure_attr = crm_lastfailure_name(rsc);
    update_attrd(host, lastfailure_attr, NULL, NULL, is_remote_node);
    free(lastfailure_attr);
}
2 changes: 2 additions & 0 deletions crmd/crmd_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,8 @@ void erase_status_tag(const char *uname, const char *tag, int options);
void init_transient_attrs(const char *uname, const char *start_state, int options);
void update_attrd(const char *host, const char *name, const char *value, const char *user_name, gboolean is_remote_node);
void update_attrd_remote_node_removed(const char *host, const char *user_name);
void update_attrd_clear_failures(const char *host, const char *rsc,
gboolean is_remote_node);

int crmd_join_phase_count(enum crm_join_phase phase);
void crmd_join_phase_log(int level);
Expand Down
37 changes: 15 additions & 22 deletions crmd/messages.c
Original file line number Diff line number Diff line change
Expand Up @@ -623,34 +623,27 @@ handle_failcount_op(xmlNode * stored_msg)
const char *rsc = NULL;
const char *uname = NULL;
gboolean is_remote_node = FALSE;
xmlNode *xml_rsc = get_xpath_object("//" XML_CIB_TAG_RESOURCE, stored_msg, LOG_ERR);
xmlNode *xml_op = get_message_xml(stored_msg, F_CRM_DATA);

if (xml_rsc) {
rsc = ID(xml_rsc);
}
if (xml_op) {
xmlNode *xml_rsc = first_named_child(xml_op, XML_CIB_TAG_RESOURCE);

uname = crm_element_value(stored_msg, XML_LRM_ATTR_TARGET);
if (crm_element_value(stored_msg, XML_LRM_ATTR_ROUTER_NODE)) {
is_remote_node = TRUE;
if (xml_rsc) {
rsc = ID(xml_rsc);
}
}

if (rsc) {
char *attr = NULL;

crm_info("Removing failcount for %s", rsc);

attr = crm_failcount_name(rsc);
update_attrd(uname, attr, NULL, NULL, is_remote_node);
free(attr);

attr = crm_lastfailure_name(rsc);
update_attrd(uname, attr, NULL, NULL, is_remote_node);
free(attr);

lrm_clear_last_failure(rsc, uname);
} else {
if (rsc == NULL) {
crm_log_xml_warn(stored_msg, "invalid failcount op");
return I_NULL;
}

uname = crm_element_value(xml_op, XML_LRM_ATTR_TARGET);
if (crm_element_value(xml_op, XML_LRM_ATTR_ROUTER_NODE)) {
is_remote_node = TRUE;
}
update_attrd_clear_failures(uname, rsc, is_remote_node);
lrm_clear_last_failure(rsc, uname);

return I_NULL;
}
Expand Down
2 changes: 1 addition & 1 deletion cts/CTStests.py
Original file line number Diff line number Diff line change
Expand Up @@ -1824,7 +1824,7 @@ def __call__(self, node):
def errorstoignore(self):
'''Return list of errors which should be ignored'''
return [
r"resources were active at shutdown",
r"resource( was|s were) active at shutdown",
]

def is_applicable(self):
Expand Down
2 changes: 1 addition & 1 deletion cts/patterns.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,7 +255,7 @@ def __init__(self, name):
self.components["common-ignore"] = [
"Pending action:",
"error: crm_log_message_adv:",
"resources were active at shutdown",
r"resource( was|s were) active at shutdown",
"pending LRM operations at shutdown",
"Lost connection to the CIB service",
"Connection to the CIB terminated...",
Expand Down
35 changes: 19 additions & 16 deletions include/crm/crm.h
Original file line number Diff line number Diff line change
Expand Up @@ -96,53 +96,56 @@ extern char *crm_system_name;

/* Valid operations */
# define CRM_OP_NOOP "noop"

# define CRM_OP_JOIN_ANNOUNCE "join_announce"
# define CRM_OP_JOIN_OFFER "join_offer"
# define CRM_OP_JOIN_REQUEST "join_request"
# define CRM_OP_JOIN_ACKNAK "join_ack_nack"
# define CRM_OP_JOIN_CONFIRM "join_confirm"

# define CRM_OP_DIE "die_no_respawn"
# define CRM_OP_RETRIVE_CIB "retrieve_cib"
# define CRM_OP_PING "ping"
# define CRM_OP_THROTTLE "throttle"
# define CRM_OP_VOTE "vote"
# define CRM_OP_NOVOTE "no-vote"
# define CRM_OP_HELLO "hello"
# define CRM_OP_HBEAT "dc_beat"
# define CRM_OP_PECALC "pe_calc"
# define CRM_OP_ABORT "abort"
# define CRM_OP_QUIT "quit"
# define CRM_OP_LOCAL_SHUTDOWN "start_shutdown"
# define CRM_OP_SHUTDOWN_REQ "req_shutdown"
# define CRM_OP_SHUTDOWN "do_shutdown"
# define CRM_OP_FENCE "stonith"
# define CRM_OP_EVENTCC "event_cc"
# define CRM_OP_TEABORT "te_abort"
# define CRM_OP_TEABORTED "te_abort_confirmed" /* we asked */
# define CRM_OP_TE_HALT "te_halt"
# define CRM_OP_TECOMPLETE "te_complete"
# define CRM_OP_TETIMEOUT "te_timeout"
# define CRM_OP_TRANSITION "transition"
# define CRM_OP_REGISTER "register"
# define CRM_OP_IPC_FWD "ipc_fwd"
# define CRM_OP_DEBUG_UP "debug_inc"
# define CRM_OP_DEBUG_DOWN "debug_dec"
# define CRM_OP_INVOKE_LRM "lrm_invoke"
# define CRM_OP_LRM_REFRESH "lrm_refresh" /* Deprecated */
# define CRM_OP_LRM_QUERY "lrm_query"
# define CRM_OP_LRM_DELETE "lrm_delete"
# define CRM_OP_LRM_FAIL "lrm_fail"
# define CRM_OP_PROBED "probe_complete"
# define CRM_OP_NODES_PROBED "probe_nodes_complete"
# define CRM_OP_REPROBE "probe_again"
# define CRM_OP_CLEAR_FAILCOUNT "clear_failcount"
# define CRM_OP_REMOTE_STATE "remote_state"
# define CRM_OP_RELAXED_SET "one-or-more"
# define CRM_OP_RELAXED_CLONE "clone-one-or-more"
# define CRM_OP_RM_NODE_CACHE "rm_node_cache"

/* @COMPAT: These symbols are deprecated and not used by Pacemaker,
* but they are kept for public API backward compatibility.
*/
# define CRM_OP_DIE "die_no_respawn"
# define CRM_OP_RETRIVE_CIB "retrieve_cib"
# define CRM_OP_HBEAT "dc_beat"
# define CRM_OP_ABORT "abort"
# define CRM_OP_EVENTCC "event_cc"
# define CRM_OP_TEABORT "te_abort"
# define CRM_OP_TEABORTED "te_abort_confirmed"
# define CRM_OP_TE_HALT "te_halt"
# define CRM_OP_TECOMPLETE "te_complete"
# define CRM_OP_TETIMEOUT "te_timeout"
# define CRM_OP_TRANSITION "transition"
# define CRM_OP_DEBUG_UP "debug_inc" /* unused since 1.1.8 */
# define CRM_OP_DEBUG_DOWN "debug_dec" /* unused since 1.1.8 */
# define CRM_OP_NODES_PROBED "probe_nodes_complete"

/* Possible cluster membership states */
# define CRMD_JOINSTATE_DOWN "down"
# define CRMD_JOINSTATE_PENDING "pending"
# define CRMD_JOINSTATE_MEMBER "member"
Expand Down
5 changes: 4 additions & 1 deletion include/crm/transition.h
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,9 @@ typedef struct crm_action_s {

} crm_action_t;

/* @COMPAT: This enum is deprecated. It has apparently never been used in a
* Pacemaker release, but it is kept for API backward compatibility.
*/
enum timer_reason {
timeout_action,
timeout_action_warn,
Expand All @@ -71,7 +74,7 @@ enum timer_reason {
struct te_timer_s {
int source_id;
int timeout;
enum timer_reason reason;
enum timer_reason reason; /* @COMPAT: unused, API backward compatibility */
crm_action_t *action;
};

Expand Down
35 changes: 21 additions & 14 deletions lib/common/remote.c
Original file line number Diff line number Diff line change
Expand Up @@ -601,13 +601,16 @@ crm_remote_recv_once(crm_remote_t * remote)

/*!
* \internal
* \brief Read data off the socket until at least one full message is present or timeout occures.
* \retval TRUE message read
* \retval FALSE full message not read
* \brief Read message(s) from a remote connection
*
* \param[in] remote Remote connection to read
* \param[in] total_timeout Fail if message not read in this time (ms)
* \param[out] disconnected Will be set to 1 if disconnect detected
*
* \return TRUE if at least one full message read, FALSE otherwise
*/

gboolean
crm_remote_recv(crm_remote_t * remote, int total_timeout /*ms */ , int *disconnected)
crm_remote_recv(crm_remote_t *remote, int total_timeout, int *disconnected)
{
int rc;
time_t start = time(NULL);
Expand All @@ -623,28 +626,32 @@ crm_remote_recv(crm_remote_t * remote, int total_timeout /*ms */ , int *disconne
remaining_timeout = total_timeout;
while ((remaining_timeout > 0) && !(*disconnected)) {

/* read some more off the tls buffer if we still have time left. */
crm_trace("waiting to receive remote msg, starting timeout %d, remaining_timeout %d",
total_timeout, remaining_timeout);
crm_trace("Waiting for remote data (%d of %d ms timeout remaining)",
remaining_timeout, total_timeout);
rc = crm_remote_ready(remote, remaining_timeout);

if (rc == 0) {
crm_err("poll timed out (%d ms) while waiting to receive msg", remaining_timeout);
crm_err("Timed out (%d ms) while waiting for remote data",
remaining_timeout);
return FALSE;

} else if(rc < 0) {
crm_debug("could not poll: %s (%d)", pcmk_strerror(rc), rc);
} else if (rc < 0) {
crm_debug("Wait for remote data aborted, will try again: %s "
CRM_XS " rc=%d", pcmk_strerror(rc), rc);

} else {
rc = crm_remote_recv_once(remote);
if(rc > 0) {
if (rc > 0) {
return TRUE;
} else if (rc == -EAGAIN) {
crm_trace("Still waiting for remote data");
} else if (rc < 0) {
crm_debug("recv() failed: %s (%d)", pcmk_strerror(rc), rc);
crm_debug("Could not receive remote data: %s " CRM_XS " rc=%d",
pcmk_strerror(rc), rc);
}
}

if(rc == -ENOTCONN) {
if (rc == -ENOTCONN) {
*disconnected = 1;
return FALSE;
}
Expand Down
23 changes: 11 additions & 12 deletions lib/pengine/unpack.c
Original file line number Diff line number Diff line change
Expand Up @@ -2851,11 +2851,11 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
{
bool expired = FALSE;
time_t last_failure = 0;
int clear_failcount = 0;
int interval = 0;
int failure_timeout = rsc->failure_timeout;
const char *key = get_op_key(xml_op);
const char *task = crm_element_value(xml_op, XML_LRM_ATTR_TASK);
const char *clear_reason = NULL;

/* clearing recurring monitor operation failures automatically
* needs to be carefully considered */
Expand Down Expand Up @@ -2903,15 +2903,14 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
int fc = get_failcount_full(node, rsc, &last_failure, FALSE, xml_op, data_set);
if(fc) {
if (get_failcount_full(node, rsc, &last_failure, TRUE, xml_op, data_set) == 0) {
clear_failcount = 1;
crm_notice("Clearing expired failcount for %s on %s", rsc->id, node->details->uname);
clear_reason = "it expired";

} else {
expired = FALSE;
}
} else if (rsc->remote_reconnect_interval && strstr(ID(xml_op), "last_failure")) {
/* always clear last failure when reconnect interval is set */
clear_failcount = 1;
clear_reason = "reconnect interval is set";
}
}

Expand All @@ -2926,19 +2925,19 @@ static bool check_operation_expiry(resource_t *rsc, node_t *node, int rc, xmlNod
crm_trace("rsc op %s/%s on node %s does not have a op digest to compare against", rsc->id,
key, node->details->id);
} else if (digest_data->rc != RSC_DIGEST_MATCH) {
clear_failcount = 1;
crm_info
("Clearing failcount for %s on %s, %s failed and now resource parameters have changed.",
task, rsc->id, node->details->uname);
clear_reason = "resource parameters have changed";
}
}

if (clear_failcount) {
action_t *clear_op = NULL;
if (clear_reason != NULL) {
char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT,
node, FALSE, TRUE, data_set);

clear_op = custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'),
CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);

crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
rsc->id, node->details->uname, clear_reason, clear_op->uuid);
}

crm_element_value_int(xml_op, XML_LRM_ATTR_INTERVAL, &interval);
Expand Down
21 changes: 13 additions & 8 deletions pengine/allocate.c
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,10 @@ check_actions_for(xmlNode * rsc_entry, resource_t * rsc, node_t * node, pe_worki
action_clear =
custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
set_bit(action_clear->flags, pe_action_runnable);

crm_notice("Clearing failure of %s on %s "
"because action definition changed " CRM_XS " %s",
rsc->id, node->details->uname, action_clear->uuid);
}
}

Expand Down Expand Up @@ -596,7 +600,7 @@ static gboolean
failcount_clear_action_exists(node_t * node, resource_t * rsc)
{
gboolean rc = FALSE;
char *key = crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_');
char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
GListPtr list = find_actions_exact(rsc->actions, key, node);

if (list) {
Expand Down Expand Up @@ -1195,15 +1199,16 @@ cleanup_orphans(resource_t * rsc, pe_working_set_t * data_set)
node_t *node = (node_t *) gIter->data;

if (node->details->online && get_failcount(node, rsc, NULL, data_set)) {
action_t *clear_op = NULL;

clear_op = custom_action(rsc, crm_concat(rsc->id, CRM_OP_CLEAR_FAILCOUNT, '_'),
CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE, data_set);
char *key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
action_t *clear_op = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT,
node, FALSE, TRUE, data_set);

add_hash_param(clear_op->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
pe_rsc_info(rsc, "Clearing failcount (%d) for orphaned resource %s on %s (%s)",
get_failcount(node, rsc, NULL, data_set), rsc->id, node->details->uname,
clear_op->uuid);

pe_rsc_info(rsc,
"Clearing failure of %s on %s because it is orphaned "
CRM_XS " %s",
rsc->id, node->details->uname, clear_op->uuid);

custom_action_order(rsc, NULL, clear_op,
rsc, generate_op_key(rsc->id, RSC_STOP, 0), NULL,
Expand Down
2 changes: 1 addition & 1 deletion pengine/test10/bug-5025-4.dot
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
digraph "g" {
"remote-node_clear_failcount 18builder" [ style=bold color="green" fontcolor="black"]
"remote-node_clear_failcount_0 18builder" [ style=bold color="green" fontcolor="black"]
"remote-node_delete_0 18builder" -> "remote-node_start_0 18builder" [ style = bold]
"remote-node_delete_0 18builder" [ style=bold color="green" fontcolor="black"]
"remote-node_monitor_30000 18builder" [ style=bold color="green" fontcolor="black"]
Expand Down
2 changes: 1 addition & 1 deletion pengine/test10/bug-5025-4.exp
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
</synapse>
<synapse id="3">
<action_set>
<crm_event id="1" operation="clear_failcount" operation_key="remote-node_clear_failcount" on_node="18builder" on_node_uuid="4">
<crm_event id="1" operation="clear_failcount" operation_key="remote-node_clear_failcount_0" on_node="18builder" on_node_uuid="4">
<primitive id="remote-node" class="ocf" provider="pacemaker" type="Dummy"/>
<attributes CRM_meta_on_node="18builder" CRM_meta_on_node_uuid="4" CRM_meta_op_no_wait="true" CRM_meta_timeout="20000" port="1984" server="18node1"/>
</crm_event>
Expand Down
2 changes: 1 addition & 1 deletion pengine/test10/bug-lf-2106.dot
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
digraph "g" {
"all_stopped" [ style=bold color="green" fontcolor="orange" ]
"pingd:0_clear_failcount cl-virt-1" [ style=bold color="green" fontcolor="black" ]
"pingd:0_clear_failcount_0 cl-virt-1" [ style=bold color="green" fontcolor="black"]
"pingd:0_clear_failcount_0 cl-virt-2" [ style=bold color="green" fontcolor="black"]
"pingd:0_monitor_30000 cl-virt-1" [ style=bold color="green" fontcolor="black" ]
"pingd:0_monitor_30000 cl-virt-2" [ style=bold color="green" fontcolor="black"]
Expand Down
4 changes: 2 additions & 2 deletions pengine/test10/bug-lf-2106.exp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@
</synapse>
<synapse id="3">
<action_set>
<crm_event id="2" operation="clear_failcount" operation_key="pingd:0_clear_failcount" on_node="cl-virt-1" on_node_uuid="cl-virt-1">
<crm_event id="2" operation="clear_failcount" operation_key="pingd:0_clear_failcount_0" on_node="cl-virt-1" on_node_uuid="cl-virt-1">
<primitive id="pingd" long-id="pingd:0" class="ocf" provider="pacemaker" type="pingd"/>
<attributes CRM_meta_clone="0" CRM_meta_clone_max="2" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_notify="false" CRM_meta_on_node="cl-virt-1" CRM_meta_on_node_uuid="cl-virt-1" CRM_meta_op_no_wait="true" CRM_meta_timeout="20000" dampen="10" host_list="10.2.50.103 10.2.50.11 10.2.50.40 10.2.50.8" interval="5"/>
</crm_event>
Expand Down Expand Up @@ -94,7 +94,7 @@
</synapse>
<synapse id="7">
<action_set>
<crm_event id="10" operation="clear_failcount" operation_key="pingd:0_clear_failcount_0" internal_operation_key="pingd:1_clear_failcount" on_node="cl-virt-2" on_node_uuid="cl-virt-2">
<crm_event id="10" operation="clear_failcount" operation_key="pingd:0_clear_failcount_0" internal_operation_key="pingd:1_clear_failcount_0" on_node="cl-virt-2" on_node_uuid="cl-virt-2">
<primitive id="pingd" long-id="pingd:0" class="ocf" provider="pacemaker" type="pingd"/>
<attributes CRM_meta_clone="1" CRM_meta_clone_max="2" CRM_meta_clone_node_max="1" CRM_meta_globally_unique="false" CRM_meta_notify="false" CRM_meta_on_node="cl-virt-2" CRM_meta_on_node_uuid="cl-virt-2" CRM_meta_op_no_wait="true" CRM_meta_timeout="20000" dampen="10" host_list="10.2.50.103 10.2.50.11 10.2.50.40 10.2.50.8" interval="5"/>
</crm_event>
Expand Down

0 comments on commit f61d0b2

Please sign in to comment.