Skip to content

Commit

Permalink
backport: dird: statistic thread crash fixed
Browse files Browse the repository at this point in the history
 - stop statistics thread before reload config and restart afterwards
 - added debug message when old resources table is destroyed within callback
 - cleanup variable names and removed obvious comments

 Fixes #695: director crashes some time after a reload if Collect Statistic is enabled
  • Loading branch information
franku authored and pstorz committed May 9, 2018
1 parent 245425c commit b97f07a
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 41 deletions.
60 changes: 28 additions & 32 deletions src/dird/dird.c
Expand Up @@ -124,12 +124,15 @@ static void reload_job_end_cb(JCR *jcr, void *ctx)

foreach_alist_index(i, table, reload_table) {
if (table == (resource_table_reference *)ctx) {
if (--table->job_count <= 0) {
Dmsg0(100, "Last reference to old configuration, removing saved configuration\n");
free_saved_resources(table);
reload_table->remove(i);
free(table);
break;
if (table->job_count) {
table->job_count--;
if (table->job_count == 0) {
Dmsg0(100, "Last reference to old configuration, removing saved configuration\n");
free_saved_resources(table);
reload_table->remove(i);
free(table);
break;
}
}
}
}
Expand Down Expand Up @@ -455,14 +458,14 @@ static
#endif
void terminate_dird(int sig)
{
static bool already_here = false;
static bool is_reloading = false;

if (already_here) { /* avoid recursive temination problems */
if (is_reloading) { /* avoid recursive temination problems */
bmicrosleep(2, 0); /* yield */
exit(1);
}

already_here = true;
is_reloading = true;
debug_level = 0; /* turn off debug */

destroy_configure_usage_string();
Expand Down Expand Up @@ -510,19 +513,19 @@ void terminate_dird(int sig)
extern "C"
void sighandler_reload_config(int sig, siginfo_t *siginfo, void *ptr)
{
static bool already_here = false;
static bool is_reloading = false;

if (already_here) {
if (is_reloading) {
/*
* Note: don't use Jmsg here, as it could produce a race condition
* on multiple parallel reloads
*/
Qmsg(NULL, M_ERROR, 0, _("Already reloading. Request ignored.\n"));
return;
}
already_here = true;
is_reloading = true;
do_reload_config();
already_here = false;
is_reloading = false;
}
#endif

Expand Down Expand Up @@ -569,34 +572,30 @@ static bool init_sighandler_sighup()
*/
bool do_reload_config()
{
static bool already_here = false;
static bool is_reloading = false;
bool ok = false;
bool reloaded = false;
JCR *jcr;
int njobs = 0; /* Number of running jobs */
int num_running_jobs = 0; /* Number of running jobs */
resource_table_reference prev_config;

if (already_here) {
if (is_reloading) {
/*
* Note: don't use Jmsg here, as it could produce a race condition
* on multiple parallel reloads
*/
Qmsg(NULL, M_ERROR, 0, _("Already reloading. Request ignored.\n"));
return false;
}
already_here = true;
is_reloading = true;

stop_statistics_thread();

lock_jobs();
LockRes();

/*
* Flush the sql connection pools.
*/
db_sql_pool_flush();

/*
* Save the previous config so we can restore it.
*/
prev_config.res_table = my_config->save_resources();
prev_config.job_count = 0;

Expand All @@ -617,17 +616,13 @@ bool do_reload_config()
*/
failed_config.res_table = my_config->save_resources();

/*
* Now restore old resource values,
*/
num = my_config->m_r_last - my_config->m_r_first + 1;
for (int i = 0; i < num; i++) {
// restore original config
my_config->m_res_head[i] = prev_config.res_table[i];
}

/*
* Reset director resource to old config as check_resources() changed it
*/
// me changed above in check_resources()
me = (DIRRES *)GetNextRes(R_DIRECTOR, NULL);

/*
Expand All @@ -647,7 +642,7 @@ bool do_reload_config()
}
new_table->job_count++;
register_job_end_callback(jcr, reload_job_end_cb, (void *)new_table);
njobs++;
num_running_jobs++;
}
}
endeach_jcr(jcr);
Expand All @@ -659,7 +654,7 @@ bool do_reload_config()
set_working_directory(me->working_directory);
Dmsg0(10, "Director's configuration file reread.\n");

if (njobs > 0) {
if (num_running_jobs > 0) {
/*
* See if we already initialized the alist.
*/
Expand All @@ -677,12 +672,13 @@ bool do_reload_config()
*/
free_saved_resources(&prev_config);
}
start_statistics_thread();
}

bail_out:
UnlockRes();
unlock_jobs();
already_here = false;
is_reloading = false;
return reloaded;
}

Expand Down
22 changes: 13 additions & 9 deletions src/dird/stats.c
Expand Up @@ -125,12 +125,14 @@ static inline void wait_for_next_run()
* Entry point for a separate statistics thread.
*/
extern "C"
void *statistics_thread_runner(void *arg)
void *statistics_thread(void *arg)
{
JCR *jcr;
utime_t now;
POOL_MEM current_store(PM_NAME);

Dmsg0(200, "Starting statistics thread\n");

memset(&cached_device, 0, sizeof(struct cached_device));
pm_strcpy(current_store, "");

Expand Down Expand Up @@ -166,21 +168,21 @@ void *statistics_thread_runner(void *arg)
while (!quit) {
now = (utime_t)time(NULL);

Dmsg1(200, "statistics_thread_runner: Doing work at %ld\n", now);
Dmsg1(200, "statistics_thread: Doing work at %ld\n", now);

/*
* Do nothing if no job is running currently.
*/
if (job_count() == 0) {
if (!need_flush) {
Dmsg0(200, "statistics_thread_runner: do nothing as no jobs are running\n");
Dmsg0(200, "statistics_thread: do nothing as no jobs are running\n");
wait_for_next_run();
continue;
} else {
/*
* Flush any pending statistics data one more time and then sleep until new jobs start running.
*/
Dmsg0(200, "statistics_thread_runner: flushing pending statistics\n");
Dmsg0(200, "statistics_thread: flushing pending statistics\n");
need_flush = false;
}
} else {
Expand Down Expand Up @@ -325,16 +327,16 @@ void *statistics_thread_runner(void *arg)
jcr->store_bsock->close();
delete jcr->store_bsock;
jcr->store_bsock = NULL;
}
} // while 1

wait_for_next_run();
}

db_sql_close_pooled_connection(jcr, jcr->db);
} // while(!quit)

bail_out:
free_jcr(jcr);

Dmsg0(200, "Finished statistics thread\n");

return NULL;
}

Expand All @@ -346,7 +348,9 @@ int start_statistics_thread(void)
return 0;
}

if ((status = pthread_create(&statistics_tid, NULL, statistics_thread_runner, NULL)) != 0) {
quit = false;

if ((status = pthread_create(&statistics_tid, NULL, statistics_thread, NULL)) != 0) {
return status;
}

Expand Down

0 comments on commit b97f07a

Please sign in to comment.