Skip to content

Commit

Permalink
Merge pull request #1243 from kgaillot/fail
Browse files (browse the repository at this point in the history)
Track resource failures per operation
  • Loading branch information
kgaillot committed Mar 17, 2017
2 parents 00ef0c1 + 4f7420e commit bde602c
Show file tree
Hide file tree
Showing 30 changed files with 1,052 additions and 474 deletions.
10 changes: 8 additions & 2 deletions attrd/attrd_common.c
Expand Up @@ -248,13 +248,16 @@ attrd_expand_value(const char *value, const char *old_value)
*
* \param[out] regex Where to store created regular expression
* \param[in] rsc Name of resource to clear (or NULL for all)
* \param[in] op Operation to clear if rsc is specified (or NULL for all)
* \param[in] interval Interval of operation to clear if op is specified
*
* \return pcmk_ok on success, -EINVAL if arguments are invalid
*
* \note The caller is responsible for freeing the result with regfree().
*/
int
attrd_failure_regex(regex_t *regex, const char *rsc)
attrd_failure_regex(regex_t *regex, const char *rsc, const char *op,
int interval)
{
char *pattern = NULL;
int rc;
Expand All @@ -263,8 +266,11 @@ attrd_failure_regex(regex_t *regex, const char *rsc)

if (rsc == NULL) {
pattern = strdup(ATTRD_RE_CLEAR_ALL);
} else {
} else if (op == NULL) {
pattern = crm_strdup_printf(ATTRD_RE_CLEAR_ONE, rsc);
} else {
pattern = crm_strdup_printf(ATTRD_RE_CLEAR_OP,
rsc, op, interval);
}

/* Compile pattern into regular expression */
Expand Down
21 changes: 17 additions & 4 deletions attrd/attrd_common.h
Expand Up @@ -27,10 +27,23 @@ int attrd_expand_value(const char *value, const char *old_value);
#define ATTRD_RE_CLEAR_ALL \
"^(" CRM_FAIL_COUNT_PREFIX "|" CRM_LAST_FAILURE_PREFIX ")-"

/* regular expression to clear failure of one resource */
/* format takes resource name */
#define ATTRD_RE_CLEAR_ONE ATTRD_RE_CLEAR_ALL "%s$"
/* regular expression to clear failure of all operations for one resource
* (format takes resource name)
*
* @COMPAT attributes set < 1.1.17:
* also match older attributes that do not have the operation part
*/
#define ATTRD_RE_CLEAR_ONE ATTRD_RE_CLEAR_ALL "%s(#.+_[0-9]+)?$"

/* regular expression to clear failure of one operation for one resource
* (format takes resource name, operation name, and interval, in that order)
*
* @COMPAT attributes set < 1.1.17:
* the "#<operation>_<interval>" suffix is optional in the pattern, so older
* attributes that do not have the operation part are matched as well
*/
#define ATTRD_RE_CLEAR_OP ATTRD_RE_CLEAR_ALL "%s(#%s_%d)?$"

int attrd_failure_regex(regex_t *regex, const char *rsc);
int attrd_failure_regex(regex_t *regex, const char *rsc, const char *op,
int interval);

#endif /* PCMK_ATTRD_COMMON__H */
40 changes: 32 additions & 8 deletions attrd/commands.c
Expand Up @@ -332,24 +332,39 @@ attrd_client_clear_failure(xmlNode *xml)
}
#endif

const char *rsc = crm_element_value(xml, F_ATTRD_ATTRIBUTE);
const char *rsc = crm_element_value(xml, F_ATTRD_RESOURCE);
const char *op = crm_element_value(xml, F_ATTRD_OPERATION);
const char *interval_s = crm_element_value(xml, F_ATTRD_INTERVAL);

/* Map this to an update that uses a regular expression */
/* Map this to an update */
crm_xml_add(xml, F_ATTRD_TASK, ATTRD_OP_UPDATE);

/* Add expression matching one or all resources as appropriate */
/* Add regular expression matching desired attributes */

if (rsc) {
char *pattern = crm_strdup_printf(ATTRD_RE_CLEAR_ONE, rsc);
char *pattern;

if (op == NULL) {
pattern = crm_strdup_printf(ATTRD_RE_CLEAR_ONE, rsc);

} else {
int interval = crm_get_interval(interval_s);

pattern = crm_strdup_printf(ATTRD_RE_CLEAR_OP,
rsc, op, interval);
}

crm_xml_add(xml, F_ATTRD_REGEX, pattern);
crm_xml_replace(xml, F_ATTRD_ATTRIBUTE, NULL);
free(pattern);

} else {
crm_xml_add(xml, F_ATTRD_REGEX, ATTRD_RE_CLEAR_ALL);
}

/* Delete the value */
/* Make sure attribute and value are not set, so we delete via regex */
if (crm_element_value(xml, F_ATTRD_ATTRIBUTE)) {
crm_xml_replace(xml, F_ATTRD_ATTRIBUTE, NULL);
}
if (crm_element_value(xml, F_ATTRD_VALUE)) {
crm_xml_replace(xml, F_ATTRD_VALUE, NULL);
}
Expand Down Expand Up @@ -504,20 +519,28 @@ attrd_client_query(crm_client_t *client, uint32_t id, uint32_t flags, xmlNode *q
static void
attrd_peer_clear_failure(crm_node_t *peer, xmlNode *xml)
{
const char *rsc = crm_element_value(xml, F_ATTRD_ATTRIBUTE);
const char *rsc = crm_element_value(xml, F_ATTRD_RESOURCE);
const char *host = crm_element_value(xml, F_ATTRD_HOST);
const char *op = crm_element_value(xml, F_ATTRD_OPERATION);
const char *interval_s = crm_element_value(xml, F_ATTRD_INTERVAL);
int interval = crm_get_interval(interval_s);
char *attr = NULL;
GHashTableIter iter;
regex_t regex;

if (attrd_failure_regex(&regex, rsc) != pcmk_ok) {
if (attrd_failure_regex(&regex, rsc, op, interval) != pcmk_ok) {
crm_info("Ignoring invalid request to clear failures for %s",
(rsc? rsc : "all resources"));
return;
}

crm_xml_add(xml, F_ATTRD_TASK, ATTRD_OP_UPDATE);

/* Make sure value is not set, so we delete */
if (crm_element_value(xml, F_ATTRD_VALUE)) {
crm_xml_replace(xml, F_ATTRD_VALUE, NULL);
}

g_hash_table_iter_init(&iter, attributes);
while (g_hash_table_iter_next(&iter, (gpointer *) &attr, NULL)) {
if (regexec(&regex, attr, 0, NULL, 0) == 0) {
Expand Down Expand Up @@ -1030,6 +1053,7 @@ build_update_element(xmlNode *parent, attribute_t *a, const char *nodeid, const
for (lpc = 0; uuid[lpc] != 0; lpc++) {
switch (uuid[lpc]) {
case ':':
case '#':
uuid[lpc] = '.';
}
}
Expand Down
96 changes: 78 additions & 18 deletions attrd/legacy.c
Expand Up @@ -237,19 +237,27 @@ find_hash_entry(xmlNode * msg)
static void
local_clear_failure(xmlNode *xml)
{
const char *rsc = crm_element_value(xml, F_ATTRD_ATTRIBUTE);
const char *rsc = crm_element_value(xml, F_ATTRD_RESOURCE);
const char *what = rsc? rsc : "all resources";
const char *op = crm_element_value(xml, F_ATTRD_OPERATION);
const char *interval_s = crm_element_value(xml, F_ATTRD_INTERVAL);
int interval = crm_get_interval(interval_s);
regex_t regex;
GHashTableIter iter;
attr_hash_entry_t *hash_entry = NULL;

if (attrd_failure_regex(&regex, rsc) != pcmk_ok) {
if (attrd_failure_regex(&regex, rsc, op, interval) != pcmk_ok) {
crm_info("Ignoring invalid request to clear %s",
(rsc? rsc : "all resources"));
return;
}
crm_debug("Clearing %s locally", what);

/* Make sure value is not set, so we delete */
if (crm_element_value(xml, F_ATTRD_VALUE)) {
crm_xml_replace(xml, F_ATTRD_VALUE, NULL);
}

g_hash_table_iter_init(&iter, attr_hash);
while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &hash_entry)) {
if (regexec(&regex, hash_entry->id, 0, NULL, 0) == 0) {
Expand Down Expand Up @@ -282,15 +290,40 @@ remote_clear_callback(xmlNode *msg, int call_id, int rc, xmlNode *output,
"/" XML_CIB_TAG_STATE "[@" XML_NODE_IS_REMOTE "='true']" x \
"/" XML_TAG_TRANSIENT_NODEATTRS "/" XML_TAG_ATTR_SETS "/" XML_CIB_TAG_NVPAIR

/* xpath component to match an attribute name exactly */
#define XPATH_NAME_IS(x) "@" XML_NVPAIR_ATTR_NAME "='" x "'"

/* xpath component to match an attribute name by prefix */
#define XPATH_NAME_START(x) "starts-with(@" XML_NVPAIR_ATTR_NAME ", '" x "')"

/* xpath ending to clear all resources */
#define XPATH_CLEAR_ALL \
"[starts-with(@" XML_NVPAIR_ATTR_NAME ", '" CRM_FAIL_COUNT_PREFIX "-') " \
"or starts-with(@" XML_NVPAIR_ATTR_NAME ", '" CRM_LAST_FAILURE_PREFIX "-')]"
"[" XPATH_NAME_START(CRM_FAIL_COUNT_PREFIX "-") \
" or " XPATH_NAME_START(CRM_LAST_FAILURE_PREFIX "-") "]"

/* xpath ending to clear one resource (format takes resource name x 2) */
/* xpath ending to clear all operations for one resource
* (format takes resource name x 4)
*
* @COMPAT attributes set < 1.1.17:
* also match older attributes that do not have the operation part
*/
#define XPATH_CLEAR_ONE \
"[@" XML_NVPAIR_ATTR_NAME "='" CRM_FAIL_COUNT_PREFIX "-%s' " \
"or @" XML_NVPAIR_ATTR_NAME "='" CRM_LAST_FAILURE_PREFIX "-%s']"
"[" XPATH_NAME_IS(CRM_FAIL_COUNT_PREFIX "-%s") \
" or " XPATH_NAME_IS(CRM_LAST_FAILURE_PREFIX "-%s") \
" or " XPATH_NAME_START(CRM_FAIL_COUNT_PREFIX "-%s#") \
" or " XPATH_NAME_START(CRM_LAST_FAILURE_PREFIX "-%s#") "]"

/* xpath ending to clear one operation for one resource
* (format takes resource name x 2, then the triple of resource name,
* operation, and interval x 2)
*
* @COMPAT attributes set < 1.1.17:
* the first two name tests also match older attributes that do not have the
* operation part
*/
#define XPATH_CLEAR_OP \
"[" XPATH_NAME_IS(CRM_FAIL_COUNT_PREFIX "-%s") \
" or " XPATH_NAME_IS(CRM_LAST_FAILURE_PREFIX "-%s") \
" or " XPATH_NAME_IS(CRM_FAIL_COUNT_PREFIX "-%s#%s_%d") \
" or " XPATH_NAME_IS(CRM_LAST_FAILURE_PREFIX "-%s#%s_%d") "]"

/*!
* \internal
Expand All @@ -301,8 +334,9 @@ remote_clear_callback(xmlNode *msg, int call_id, int rc, xmlNode *output,
static void
remote_clear_failure(xmlNode *xml)
{
const char *rsc = crm_element_value(xml, F_ATTRD_ATTRIBUTE);
const char *rsc = crm_element_value(xml, F_ATTRD_RESOURCE);
const char *host = crm_element_value(xml, F_ATTRD_HOST);
const char *op = crm_element_value(xml, F_ATTRD_OPERATION);
int rc = pcmk_ok;
char *xpath;

Expand All @@ -313,18 +347,44 @@ remote_clear_failure(xmlNode *xml)
return;
}

if ((rsc == NULL) && (host == NULL)) {
xpath = crm_strdup_printf(XPATH_REMOTE_ATTR("") XPATH_CLEAR_ALL);
/* Build an xpath to clear appropriate attributes */

if (rsc == NULL) {
/* No resource specified, clear all resources */

if (host == NULL) {
xpath = crm_strdup_printf(XPATH_REMOTE_ATTR("") XPATH_CLEAR_ALL);
} else {
xpath = crm_strdup_printf(XPATH_REMOTE_ATTR(XPATH_ID) XPATH_CLEAR_ALL,
host);
}

} else if (op == NULL) {
/* Resource but no operation specified, clear all operations */

if (host == NULL) {
xpath = crm_strdup_printf(XPATH_REMOTE_ATTR("") XPATH_CLEAR_ONE,
rsc, rsc, rsc, rsc);
} else {
xpath = crm_strdup_printf(XPATH_REMOTE_ATTR(XPATH_ID) XPATH_CLEAR_ONE,
host, rsc, rsc, rsc, rsc);
}

} else if (rsc == NULL) {
xpath = crm_strdup_printf(XPATH_REMOTE_ATTR(XPATH_ID) XPATH_CLEAR_ALL,
host);
} else if (host == NULL) {
xpath = crm_strdup_printf(XPATH_REMOTE_ATTR("") XPATH_CLEAR_ONE,
rsc, rsc);
} else {
xpath = crm_strdup_printf(XPATH_REMOTE_ATTR(XPATH_ID) XPATH_CLEAR_ONE,
host, rsc, rsc);
/* Resource and operation specified */

const char *interval_s = crm_element_value(xml, F_ATTRD_INTERVAL);
int interval = crm_get_interval(interval_s);

if (host == NULL) {
xpath = crm_strdup_printf(XPATH_REMOTE_ATTR("") XPATH_CLEAR_OP,
rsc, rsc, rsc, op, interval,
rsc, op, interval);
} else {
xpath = crm_strdup_printf(XPATH_REMOTE_ATTR(XPATH_ID) XPATH_CLEAR_OP,
host, rsc, rsc, rsc, op, interval,
rsc, op, interval);
}
}

crm_trace("Clearing attributes matching %s", xpath);
Expand Down
32 changes: 21 additions & 11 deletions crmd/attrd.c
Expand Up @@ -53,8 +53,8 @@ log_attrd_error(const char *host, const char *name, const char *value,

static void
update_attrd_helper(const char *host, const char *name, const char *value,
const char *user_name, gboolean is_remote_node,
char command)
const char *interval, const char *user_name,
gboolean is_remote_node, char command)
{
int rc;
int max = 5;
Expand All @@ -78,9 +78,16 @@ update_attrd_helper(const char *host, const char *name, const char *value,
}
}

rc = attrd_update_delegate(attrd_ipc, command, host, name, value,
XML_CIB_TAG_STATUS, NULL, NULL, user_name,
attrd_opts);
if (command) {
rc = attrd_update_delegate(attrd_ipc, command, host, name, value,
XML_CIB_TAG_STATUS, NULL, NULL,
user_name, attrd_opts);
} else {
/* command 0 requests a failure clear: name carries the resource and value carries the operation */
rc = attrd_clear_delegate(attrd_ipc, host, name, value, interval,
user_name, attrd_opts);
}

if (rc == pcmk_ok) {
break;

Expand All @@ -103,21 +110,24 @@ void
update_attrd(const char *host, const char *name, const char *value,
const char *user_name, gboolean is_remote_node)
{
update_attrd_helper(host, name, value, user_name, is_remote_node, 'U');
update_attrd_helper(host, name, value, NULL, user_name, is_remote_node,
'U');
}

void
update_attrd_remote_node_removed(const char *host, const char *user_name)
{
crm_trace("Asking attrd to purge Pacemaker Remote node %s", host);
update_attrd_helper(host, NULL, NULL, user_name, TRUE, 'C');
update_attrd_helper(host, NULL, NULL, NULL, user_name, TRUE, 'C');
}

void
update_attrd_clear_failures(const char *host, const char *rsc,
gboolean is_remote_node)
update_attrd_clear_failures(const char *host, const char *rsc, const char *op,
const char *interval, gboolean is_remote_node)
{
crm_info("Asking attrd to clear failure of %s on %s node %s",
crm_info("Asking attrd to clear failure of %s %s for %s on %s node %s",
(op? op : "all operations"),
(interval? interval : "at all intervals"),
rsc, (is_remote_node? "Pacemaker Remote" : "cluster"), host);
update_attrd_helper(host, rsc, NULL, NULL, is_remote_node, 'c');
update_attrd_helper(host, rsc, op, interval, NULL, is_remote_node, 0);
}
3 changes: 2 additions & 1 deletion crmd/crmd_lrm.h
Expand Up @@ -19,7 +19,8 @@
#include <crmd_messages.h>

extern gboolean verify_stopped(enum crmd_fsa_state cur_state, int log_level);
extern void lrm_clear_last_failure(const char *rsc_id, const char *node_name);
extern void lrm_clear_last_failure(const char *rsc_id, const char *node_name,
const char *operation, int interval);
void lrm_op_callback(lrmd_event_data_t * op);

typedef struct resource_history_s {
Expand Down
1 change: 1 addition & 0 deletions crmd/crmd_utils.h
Expand Up @@ -94,6 +94,7 @@ void init_transient_attrs(const char *uname, const char *start_state, int option
void update_attrd(const char *host, const char *name, const char *value, const char *user_name, gboolean is_remote_node);
void update_attrd_remote_node_removed(const char *host, const char *user_name);
void update_attrd_clear_failures(const char *host, const char *rsc,
const char *op, const char *interval,
gboolean is_remote_node);

int crmd_join_phase_count(enum crm_join_phase phase);
Expand Down

0 comments on commit bde602c

Please sign in to comment.