Skip to content

Commit

Permalink
Clean up state machine.
Browse files Browse the repository at this point in the history
  This error happens when "max_request_time" is set VERY low.
i.e. lower than "response_window".  (12s versus 30s).

  The current logic for enforcing the various timers is pretty bad.  There
is one timer per request, and it bounces around between the different
requirements.  At the time it was written, it seemed simpler than trying
to manage 3-4 simultaneous timers per request.

  When the request is proxied, the timer being applied is for
"response_window".  BUT by the time that expires, the "max_request_time"
has expired.  The code *does* notice that it has expired.  BUT it doesn't
notice that there's no child thread processing the request.  So it waits
for the child thread to exit... forever.

  At some point, a timer overflows, and it dies.

  There are a few changes to make:

1) check for "no child" in this situation, and clean up the request rather
  than waiting forever.

2) cap the timer to 5 minutes (this can still happen, for example, when a
   bad DB locks a thread for hours at a time).

3) don't overflow when adding timer values.
  • Loading branch information
alandekok committed Nov 29, 2009
1 parent 7074e78 commit 139c45b
Showing 1 changed file with 29 additions and 35 deletions.
64 changes: 29 additions & 35 deletions src/main/event.c
Expand Up @@ -139,8 +139,8 @@ static void tv_add(struct timeval *tv, int usec_delay)
tv->tv_usec += usec_delay;

if (tv->tv_usec > USEC) {
tv->tv_usec -= USEC;
tv->tv_sec++;
tv->tv_sec += tv->tv_usec / USEC;
tv->tv_usec %= USEC;
}
}

Expand Down Expand Up @@ -485,10 +485,19 @@ static void wait_for_child_to_die(void *ctx)

if ((request->child_state == REQUEST_QUEUED) ||
(request->child_state == REQUEST_RUNNING)) {
request->delay += (request->delay >> 1);
tv_add(&request->when, request->delay);

RDEBUG2("Child is still stuck for request %d", request->number);
/*
* Cap delay at five minutes.
*/
if (request->delay < (USEC * 60 * 5)) {
request->delay += (request->delay >> 1);
radlog(L_INFO, "WARNING: Child is hung for request %d.",
request->number);
} else {
RDEBUG2("Child is still stuck for request %d",
request->number);
}
tv_add(&request->when, request->delay);

INSERT_EVENT(wait_for_child_to_die, request);
return;
Expand Down Expand Up @@ -1086,9 +1095,7 @@ static void no_response_to_proxied_request(void *ctx)

post_proxy_fail_handler(request);
} else {
/*
* Ensure that there is a callback for the request.
*/
rad_assert(request->ev == NULL);
request->child_state = REQUEST_RUNNING;
wait_a_bit(request);
}
Expand Down Expand Up @@ -1224,49 +1231,36 @@ static void wait_a_bit(void *ctx)
break;
}

stop_processing:
#if defined(HAVE_PTHREAD_H) || defined(WITH_PROXY)
/*
* A child thread MAY still be running on the
* request. Ask the thread to stop working on
* the request.
*/
if (have_children) {
/* FIXME: kill unresponsive children? */

/*
* Print this error message ONLY if
* there's a child currently processing
* the request. As we don't have thread
* locks here, there may be race
* conditions on this check. But it's
* just an error message, so that's OK.
*/
if (!pthread_equal(request->child_pid, NO_SUCH_CHILD_PID)) {
radlog(L_ERR, "WARNING: Unresponsive child for request %d, in module %s component %s",
request->number,
request->module ? request->module : "<server core>",
request->component ? request->component : "<server core>");
}

stop_processing:
if (have_children &&
(pthread_equal(request->child_pid, NO_SUCH_CHILD_PID) == 0)) {
request->master_state = REQUEST_STOP_PROCESSING;

radlog(L_ERR, "WARNING: Unresponsive child for request %d, in module %s component %s",
request->number,
request->module ? request->module : "<server core>",
request->component ? request->component : "<server core>");

request->delay = USEC / 4;
tv_add(&request->when, request->delay);
callback = wait_for_child_to_die;
break;
}
#else /* no child threads */
stop_processing:
#endif

/*
* Else there are no child threads. We probably
* should have just marked the request as 'done'
* elsewhere, like in the post-proxy-fail
* handler. But doing that would involve
* checking for max_request_time in multiple
* places, so this may be simplest.
* Else no child thread is processing the
* request. We probably should have just marked
* the request as 'done' elsewhere, like in the
* post-proxy-fail handler. But doing that would
* involve checking for max_request_time in
* multiple places, so this may be simplest.
*/
request->child_state = REQUEST_DONE;
/* FALL-THROUGH */
Expand Down

0 comments on commit 139c45b

Please sign in to comment.