Skip to content

Commit

Permalink
bareos-storage-droplet: improve error handling
Browse files Browse the repository at this point in the history
Previos version did not always return an error, if data could not be written.
Especially the load_chunk ignored EIO errors, properly because of a typo.

As the droplet_device in iothread mode relies on asynchronious write-backs,
the new device method flush() has been introduced.
If a droplet_device is configured to use iothreads and unlimited retries,
this will do busy waiting until all data is written to the droplet backend.
In case of a connection problems to the droplet_device, this will be forever.

Note that a bconsole "status storage=..." command will inform about "Pending IO flush requests".

Fixes #892: bareos-storage-droplet: if configured with unreachable S3 system, backup will terminate with OK
  • Loading branch information
joergsteffens committed May 3, 2018
1 parent 11d9845 commit 5b6f33e
Show file tree
Hide file tree
Showing 8 changed files with 246 additions and 18 deletions.
20 changes: 19 additions & 1 deletion src/stored/acquire.c
Expand Up @@ -3,7 +3,7 @@
Copyright (C) 2002-2013 Free Software Foundation Europe e.V.
Copyright (C) 2011-2012 Planets Communications B.V.
Copyright (C) 2013-2013 Bareos GmbH & Co. KG
Copyright (C) 2013-2018 Bareos GmbH & Co. KG
This program is Free Software; you can redistribute it and/or
modify it under the terms of version three of the GNU Affero General Public
Expand Down Expand Up @@ -496,6 +496,24 @@ bool release_device(DCR *dcr)
now = (utime_t)time(NULL);
update_job_statistics(jcr, now);

/*
* Some devices do cache write operations (e.g. droplet_device).
* Therefore flushing the cache is required to determine
* if a job have been written successfully.
* As a flush operation can take quite a long time,
* this must be done before acquiring locks.
* A previous implementation did the flush inside dev->close(),
* which resulted in various locking problems.
*/
if (!job_canceled(jcr)) {
Jmsg(jcr, M_INFO, 0, "Flushing device %s.\n", dev->print_name());
if (!dev->flush(dcr)) {
Jmsg(jcr, M_FATAL, 0, "Failed to flush device %s.\n", dev->print_name());
} else {
Jmsg(jcr, M_INFO, 0, "Device %s flushed.\n", dev->print_name());
}
}

dev->Lock();
if (!dev->is_blocked()) {
block_device(dev, BST_RELEASING);
Expand Down
10 changes: 5 additions & 5 deletions src/stored/backends/Makefile.in
Expand Up @@ -80,27 +80,27 @@ STORED_RESTYPES = autochanger device director ndmp messages storage
$(NO_ECHO)$(LIBTOOL_COMPILE) $(CXX) $(DEFS) $(DEBUG) -c $(WCFLAGS) $(CPPFLAGS) $(INCLUDES) $(DINCLUDE) $(CXXFLAGS) $<
if [ -d "$(@:.lo=.d)" ]; then $(MKDIR) $(CONF_EXTRA_DIR); $(CP) -r $(@:.lo=.d)/. $(CONF_EXTRA_DIR)/.; fi

$(CHEPHFS_LOBJS):
$(CHEPHFS_LOBJS): $(CHEPHFS_SRCS)
@echo "Compiling $(@:.lo=.c)"
$(NO_ECHO)$(LIBTOOL_COMPILE) $(CXX) $(DEFS) $(DEBUG) -c $(WCFLAGS) $(CPPFLAGS) $(INCLUDES) $(CEPHFS_INC) $(DINCLUDE) $(CXXFLAGS) $(@:.lo=.c)
if [ -d "$(@:.lo=.d)" ]; then $(MKDIR) $(CONF_EXTRA_DIR); $(CP) -r $(@:.lo=.d)/. $(CONF_EXTRA_DIR)/.; fi

$(DROPLET_LOBJS):
$(DROPLET_LOBJS): $(DROPLET_SRCS)
@echo "Compiling $(@:.lo=.c)"
$(NO_ECHO)$(LIBTOOL_COMPILE) $(CXX) $(DEFS) $(DEBUG) -c $(WCFLAGS) $(CPPFLAGS) $(INCLUDES) $(DROPLET_INC) $(DINCLUDE) $(CXXFLAGS) $(@:.lo=.c)
if [ -d "$(@:.lo=.d)" ]; then $(MKDIR) $(CONF_EXTRA_DIR); $(CP) -r $(@:.lo=.d)/. $(CONF_EXTRA_DIR)/.; fi

$(ELASTO_LOBJS):
$(ELASTO_LOBJS): $(ELASTO_SRCS)
@echo "Compiling $(@:.lo=.c)"
$(NO_ECHO)$(LIBTOOL_COMPILE) $(CXX) $(DEFS) $(DEBUG) -c $(WCFLAGS) $(CPPFLAGS) $(INCLUDES) $(ELASTO_INC) $(DINCLUDE) $(CXXFLAGS) $(@:.lo=.c)
if [ -d "$(@:.lo=.d)" ]; then $(MKDIR) $(CONF_EXTRA_DIR); $(CP) -r $(@:.lo=.d)/. $(CONF_EXTRA_DIR)/.; fi

$(GFAPI_LOBJS):
$(GFAPI_LOBJS): $(GFAPI_SRCS)
@echo "Compiling $(@:.lo=.c)"
$(NO_ECHO)$(LIBTOOL_COMPILE) $(CXX) $(DEFS) $(DEBUG) -c $(WCFLAGS) $(CPPFLAGS) $(INCLUDES) $(GLUSTER_INC) $(DINCLUDE) $(CXXFLAGS) $(@:.lo=.c)
if [ -d "$(@:.lo=.d)" ]; then $(MKDIR) $(CONF_EXTRA_DIR); $(CP) -r $(@:.lo=.d)/. $(CONF_EXTRA_DIR)/.; fi

$(RADOS_LOBJS):
$(RADOS_LOBJS): $(RADOS_SRCS)
@echo "Compiling $(@:.lo=.c)"
$(NO_ECHO)$(LIBTOOL_COMPILE) $(CXX) $(DEFS) $(DEBUG) -c $(WCFLAGS) $(CPPFLAGS) $(INCLUDES) $(RADOS_INC) $(DINCLUDE) $(CXXFLAGS) $(@:.lo=.c)
if [ -d "$(@:.lo=.d)" ]; then $(MKDIR) $(CONF_EXTRA_DIR); $(CP) -r $(@:.lo=.d)/. $(CONF_EXTRA_DIR)/.; fi
Expand Down
116 changes: 108 additions & 8 deletions src/stored/backends/chunked_device.c
Expand Up @@ -2,6 +2,7 @@
BAREOS® - Backup Archiving REcovery Open Sourced
Copyright (C) 2015-2017 Planets Communications B.V.
Copyright (C) 2017-2018 Bareos GmbH & Co. KG
This program is Free Software; you can redistribute it and/or
modify it under the terms of version three of the GNU Affero General Public
Expand Down Expand Up @@ -71,7 +72,7 @@ static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
*
* flush_remote_chunk() - Flush a chunk to the remote backing store.
* read_remote_chunk() - Read a chunk from the remote backing store.
* chunked_remote_volume_size - Return the current size of a volume.
* chunked_remote_volume_size() - Return the current size of a volume.
* truncate_remote_chunked_volume() - Truncate a chunked volume on the
* remote backing store.
*/
Expand Down Expand Up @@ -657,16 +658,26 @@ bool chunked_device::read_chunk()

/*
* Setup a chunked volume for reading or writing.
* return:
* -1: failure
* 0: success
*/
int chunked_device::setup_chunk(const char *pathname, int flags, int mode)
{
int retval = -1;
/*
* If device is (re)opened and we are put into readonly mode because
* of problems flushing chunks to the backing store we return EROFS
* to the upper layers.
*/
if ((flags & O_RDWR) && m_readonly) {
dev_errno = EROFS;
dev_errno = EROFS; /**< Read-only file system */
return -1;
}

if (!check_remote()) {
Dmsg0(100, "setup_chunk failed, as remote device is not available\n");
dev_errno = EIO; /**< I/O error */
return -1;
}

Expand Down Expand Up @@ -698,7 +709,6 @@ int chunked_device::setup_chunk(const char *pathname, int flags, int mode)
m_current_chunk->writing = true;
}

m_current_chunk->opened = true;
m_current_chunk->chunk_setup = false;

/*
Expand Down Expand Up @@ -727,7 +737,23 @@ int chunked_device::setup_chunk(const char *pathname, int flags, int mode)

m_current_volname = bstrdup(getVolCatName());

return 0;
/*
* in principle it is not required to load_chunk(),
* but we need a secure way to determine,
* if the chunk already exists.
*/
if (load_chunk()) {
m_current_chunk->opened = true;
retval = 0;
} else if (flags & O_CREAT) {
/* create a chunk */
if (flush_chunk(false /* release */, false /* move_to_next_chunk */)) {
m_current_chunk->opened = true;
retval = 0;
}
}

return retval;
}

/*
Expand Down Expand Up @@ -1013,6 +1039,7 @@ int chunked_device::close_chunk()
m_current_chunk->buflen = 0;
m_current_chunk->start_offset = -1;
m_current_chunk->end_offset = -1;

} else {
errno = EBADF;
}
Expand Down Expand Up @@ -1151,6 +1178,62 @@ ssize_t chunked_device::chunked_volume_size()
return chunked_remote_volume_size();
}


bool chunked_device::is_written()
{
/*
* See if we are using io-threads or not and the ordered circbuf is created.
* We try to make sure that nothing of the volume being requested is still inflight as then
* the chunked_remote_volume_size() method will fail to determine the size of the data as
* its not fully stored on the backing store yet.
*/

/*
* Make sure there is also nothing inflight to the backing store anymore.
*/
if (nr_inflight_chunks() > 0) {
Dmsg0(100, "is_written = false, as there are inflight chunks\n");
return false;
}

if (m_io_threads > 0 && m_cb) {

if (!m_cb->empty()) {

chunk_io_request *request;

/*
* Peek on the ordered circular queue if there are any pending IO-requests
* for this volume. If there are use that as the indication of the size of
* the volume and don't contact the remote storage as there is still data
* inflight and as such we need to look at the last chunk that is still not
* uploaded of the volume.
*/
request = (chunk_io_request *)m_cb->peek(PEEK_FIRST, m_current_volname, compare_volume_name);
if (request) {
free(request);
Dmsg0(100, "is_written = false, as there are queued write requests\n");
return false;
}
}
}

return true;
}


/*
* Busy waits until write buffer is empty.
*/
bool chunked_device::wait_until_chunks_written()
{
while (!is_written()) {
bmicrosleep(DEFAULT_RECHECK_INTERVAL_WRITE_BUFFER, 0);
}
return true;
}


static int clone_io_request(void *item1, void *item2)
{
chunk_io_request *src = (chunk_io_request *)item1;
Expand Down Expand Up @@ -1271,6 +1354,7 @@ bool chunked_device::load_chunk()
if (m_current_chunk->writing) {
m_current_chunk->end_offset = start_offset + (m_current_chunk->chunk_size - 1);
}
return false;
break;
default:
return false;
Expand All @@ -1290,7 +1374,7 @@ static int list_io_request(void *request, void *data)
bsdDevStatTrig *dst = (bsdDevStatTrig *)data;
POOL_MEM status(PM_MESSAGE);

status.bsprintf(" /%s/%04d - %ld\n", io_request->volname, io_request->chunk, io_request->wbuflen);
status.bsprintf(" /%s/%04d - %ld (try=%d)\n", io_request->volname, io_request->chunk, io_request->wbuflen, io_request->tries);
dst->status_length = pm_strcat(dst->status, status.c_str());

return 0;
Expand All @@ -1304,20 +1388,36 @@ bool chunked_device::device_status(bsdDevStatTrig *dst)
/*
* See if we are using io-threads or not and the ordered circbuf is created and not empty.
*/
bool pending = false;
POOL_MEM inflights(PM_MESSAGE);

dst->status_length = 0;
if (check_remote()) {
dst->status_length = pm_strcpy(dst->status, _("Backend connection is working.\n"));
} else {
dst->status_length = pm_strcpy(dst->status, _("Backend connection is not working.\n"));
}
if (m_io_threads > 0 && m_cb) {
if (nr_inflight_chunks() > 0) {
pending = true;
inflights.bsprintf("Inflight chunks: %d\n", nr_inflight_chunks());
dst->status_length = pm_strcat(dst->status, inflights.c_str());
}
if (!m_cb->empty()) {
dst->status_length = pm_strcpy(dst->status, _("Pending IO flush requests:\n"));
pending = true;
dst->status_length = pm_strcat(dst->status, _("Pending IO flush requests:\n"));

/*
* Peek on the ordered circular queue and list all pending requests.
*/
m_cb->peek(PEEK_LIST, dst, list_io_request);
} else {
dst->status_length = pm_strcpy(dst->status, _("No Pending IO flush requests\n"));
}
}

if (!pending) {
dst->status_length += pm_strcat(dst->status, _("No Pending IO flush requests.\n"));
}

return (dst->status_length > 0);
}

Expand Down
11 changes: 11 additions & 0 deletions src/stored/backends/chunked_device.h
Expand Up @@ -2,6 +2,7 @@
BAREOS® - Backup Archiving REcovery Open Sourced
Copyright (C) 2015-2017 Planets Communications B.V.
Copyright (C) 2018-2018 Bareos GmbH & Co. KG
This program is Free Software; you can redistribute it and/or
modify it under the terms of version three of the GNU Affero General Public
Expand Down Expand Up @@ -32,6 +33,12 @@
*/
#define DEFAULT_RECHECK_INTERVAL 300

/*
* Recheck interval when waiting that buffer gets written
* (write buffer is empty).
*/
#define DEFAULT_RECHECK_INTERVAL_WRITE_BUFFER 10

/*
* Chunk the volume into chunks of this size.
* This is the lower limit used the exact chunksize is
Expand Down Expand Up @@ -112,6 +119,7 @@ class chunked_device: public DEVICE {
bool enqueue_chunk(chunk_io_request *request);
bool flush_chunk(bool release_chunk, bool move_to_next_chunk);
bool read_chunk();
bool is_written();

protected:
/*
Expand All @@ -138,10 +146,13 @@ class chunked_device: public DEVICE {
bool truncate_chunked_volume(DCR *dcr);
ssize_t chunked_volume_size();
bool load_chunk();
bool wait_until_chunks_written();

/*
* Methods implemented by inheriting class.
*/
virtual bool check_remote() = 0;
virtual bool remote_chunked_volume_exists() = 0;
virtual bool flush_remote_chunk(chunk_io_request *request) = 0;
virtual bool read_remote_chunk(chunk_io_request *request) = 0;
virtual ssize_t chunked_remote_volume_size() = 0;
Expand Down

0 comments on commit 5b6f33e

Please sign in to comment.