long flashcopy operation may block volume service
The Storwize family uses FlashCopy for snapshots and volume clones. A
volume delete has to wait until the FlashCopy finishes or errors out.
_delete_vdisk() polls the volume's FlashCopy status in a loop, which
may block the volume service heartbeat since it runs in the same
greenthread. The solution is to use the OpenStack FixedIntervalLoopingCall
to run the FlashCopy status poll in a timer greenthread.
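
As a rough sketch of the pattern (not the driver code itself; the check
callback and the 300-second interval below are illustrative), the looping
call re-runs the poll in its own timer greenthread and reports its result
through LoopingCallDone:

from cinder.openstack.common import loopingcall

CHECK_FCMAPPING_INTERVAL = 300  # seconds; keep this >= the heartbeat period


def wait_for_fc_mappings(check_fn, *args):
    # check_fn is re-run every CHECK_FCMAPPING_INTERVAL seconds in a timer
    # greenthread; it signals completion by raising
    # loopingcall.LoopingCallDone(retvalue=...), which wait() then returns.
    timer = loopingcall.FixedIntervalLoopingCall(check_fn, *args)
    ret = timer.start(interval=CHECK_FCMAPPING_INTERVAL).wait()
    timer.stop()
    return ret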

During volume service startup, the cinder volume manager resumes the
delete operation for volumes that are in the 'deleting' state. Since a
Storwize volume delete may wait a long time, this can delay the volume
service for a long while before it becomes available. A greenpool is
used to offload those volume delete operations.
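
A minimal sketch of the offload, using a hypothetical
resume_pending_deletes() helper (the real change is in
cinder/volume/manager.py below):

from eventlet.greenpool import GreenPool

_tp = GreenPool()


def resume_pending_deletes(manager, ctxt, volumes, offload=True):
    # Spawn each pending delete on the green pool instead of running it
    # inline, so init_host() returns quickly and the service heartbeat is
    # not starved by long-running Storwize deletes.
    for volume in volumes:
        if volume['status'] != 'deleting':
            continue
        if offload:
            _tp.spawn_n(manager.delete_volume, ctxt, volume['id'])
        else:
            manager.delete_volume(ctxt, volume['id'])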

Change-Id: Ie01a441a327e1e318fa8da0040ae130731b7a686
Closes-Bug: #1203152
Alan Jiang authored and avishay-traeger committed Oct 9, 2013
1 parent b00e307 commit 7aa4f65
Showing 4 changed files with 95 additions and 48 deletions.
5 changes: 4 additions & 1 deletion cinder/tests/test_storwize_svc.py
@@ -39,6 +39,7 @@
from cinder.volume.drivers import storwize_svc
from cinder.volume import volume_types

from eventlet import greenthread

LOG = logging.getLogger(__name__)

@@ -1165,7 +1166,7 @@ def _cmd_lsfcmap(self, **kwargs):
'no'])

for d in to_delete:
del self._fcmappings_list[k]
del self._fcmappings_list[d]

return self._print_info_cmd(rows=rows, **kwargs)

@@ -1485,6 +1486,8 @@ def setUp(self):
self.driver.do_setup(None)
self.driver.check_for_setup_error()
self.stubs.Set(storwize_svc.time, 'sleep', lambda s: None)
self.stubs.Set(greenthread, 'sleep', lambda *x, **y: None)
self.stubs.Set(storwize_svc, 'CHECK_FCMAPPING_INTERVAL', 0)

def _set_flag(self, flag, value):
group = self.driver.configuration.config_group
111 changes: 66 additions & 45 deletions cinder/volume/drivers/storwize_svc.py
@@ -51,6 +51,7 @@
from cinder import exception
from cinder.openstack.common import excutils
from cinder.openstack.common import log as logging
from cinder.openstack.common import loopingcall
from cinder.openstack.common import processutils
from cinder.openstack.common import strutils
from cinder import utils
@@ -112,6 +113,8 @@
CONF = cfg.CONF
CONF.register_opts(storwize_svc_opts)

CHECK_FCMAPPING_INTERVAL = 300


class StorwizeSVCDriver(san.SanDriver):
"""IBM Storwize V7000 and SVC iSCSI/FC volume driver.
@@ -1243,55 +1246,73 @@ def _is_vdisk_defined(self, vdisk_name):
return True

def _ensure_vdisk_no_fc_mappings(self, name, allow_snaps=True):
# Ensure vdisk has no FlashCopy mappings
"""Ensure vdisk has no flashcopy mappings."""
timer = loopingcall.FixedIntervalLoopingCall(
self._check_vdisk_fc_mappings, name, allow_snaps)
# Create a timer greenthread. The default volume service heart
# beat is every 10 seconds. The flashcopy usually takes hours
# before it finishes. Don't set the sleep interval shorter
# than the heartbeat. Otherwise volume service heartbeat
# will not be serviced.
LOG.debug(_('Calling _ensure_vdisk_no_fc_mappings: vdisk %s')
% name)
ret = timer.start(interval=CHECK_FCMAPPING_INTERVAL).wait()
timer.stop()
return ret

def _check_vdisk_fc_mappings(self, name, allow_snaps=True):
"""FlashCopy mapping check helper."""

LOG.debug(_('Loopcall: _check_vdisk_fc_mappings(), vdisk %s') % name)
mapping_ids = self._get_vdisk_fc_mappings(name)
while len(mapping_ids):
wait_for_copy = False
for map_id in mapping_ids:
attrs = self._get_flashcopy_mapping_attributes(map_id)
if not attrs:
continue
source = attrs['source_vdisk_name']
target = attrs['target_vdisk_name']
copy_rate = attrs['copy_rate']
status = attrs['status']

if copy_rate == '0':
# Case #2: A vdisk that has snapshots
if source == name:
if not allow_snaps:
return False
ssh_cmd = ['svctask', 'chfcmap', '-copyrate', '50',
'-autodelete', 'on', map_id]
out, err = self._run_ssh(ssh_cmd)
wait_for_copy = True
# Case #3: A snapshot
else:
msg = (_('Vdisk %(name)s not involved in '
'mapping %(src)s -> %(tgt)s') %
{'name': name, 'src': source, 'tgt': target})
self._driver_assert(target == name, msg)
if status in ['copying', 'prepared']:
self._run_ssh(['svctask', 'stopfcmap', map_id])
elif status in ['stopping', 'preparing']:
wait_for_copy = True
else:
self._run_ssh(['svctask', 'rmfcmap', '-force',
map_id])
# Case 4: Copy in progress - wait and will autodelete
wait_for_copy = False
for map_id in mapping_ids:
attrs = self._get_flashcopy_mapping_attributes(map_id)
if not attrs:
continue
source = attrs['source_vdisk_name']
target = attrs['target_vdisk_name']
copy_rate = attrs['copy_rate']
status = attrs['status']

if copy_rate == '0':
# Case #2: A vdisk that has snapshots. Return
# False if snapshot is not allowed.
if source == name:
if not allow_snaps:
raise loopingcall.LoopingCallDone(retvalue=False)
ssh_cmd = ['svctask', 'chfcmap', '-copyrate', '50',
'-autodelete', 'on', map_id]
out, err = self._run_ssh(ssh_cmd)
wait_for_copy = True
# Case #3: A snapshot
else:
if status == 'prepared':
msg = (_('Vdisk %(name)s not involved in '
'mapping %(src)s -> %(tgt)s') %
{'name': name, 'src': source, 'tgt': target})
self._driver_assert(target == name, msg)
if status in ['copying', 'prepared']:
self._run_ssh(['svctask', 'stopfcmap', map_id])
self._run_ssh(['svctask', 'rmfcmap', '-force', map_id])
elif status == 'idle_or_copied':
# Prepare failed
self._run_ssh(['svctask', 'rmfcmap', '-force', map_id])
else:
# Need to wait for the fcmap to change to
# stopped state before remove fcmap
wait_for_copy = True
elif status in ['stopping', 'preparing']:
wait_for_copy = True
if wait_for_copy:
time.sleep(5)
mapping_ids = self._get_vdisk_fc_mappings(name)
return True
else:
self._run_ssh(['svctask', 'rmfcmap', '-force',
map_id])
# Case 4: Copy in progress - wait and will autodelete
else:
if status == 'prepared':
self._run_ssh(['svctask', 'stopfcmap', map_id])
self._run_ssh(['svctask', 'rmfcmap', '-force', map_id])
elif status == 'idle_or_copied':
# Prepare failed
self._run_ssh(['svctask', 'rmfcmap', '-force', map_id])
else:
wait_for_copy = True
if not wait_for_copy or not len(mapping_ids):
raise loopingcall.LoopingCallDone(retvalue=True)

def _delete_vdisk(self, name, force):
"""Deletes existing vdisks.
21 changes: 20 additions & 1 deletion cinder/volume/manager.py
@@ -64,6 +64,8 @@

from cinder.taskflow import states

from eventlet.greenpool import GreenPool

LOG = logging.getLogger(__name__)

QUOTAS = quota.QUOTAS
@@ -76,6 +78,10 @@
default=300,
help='Timeout for creating the volume to migrate to '
'when performing volume migration (seconds)'),
cfg.BoolOpt('volume_service_inithost_offload',
default=False,
help='Offload pending volume delete during '
'volume service startup'),
]

CONF = cfg.CONF
@@ -144,6 +150,8 @@ def __init__(self, volume_driver=None, service_name=None,
*args, **kwargs)
self.configuration = Configuration(volume_manager_opts,
config_group=service_name)
self._tp = GreenPool()

if not volume_driver:
# Get from configuration, which will get the default
# if its not using the multi backend
@@ -165,6 +173,9 @@ def __init__(self, volume_driver=None, service_name=None,
configuration=self.configuration,
db=self.db)

def _add_to_threadpool(self, func, *args, **kwargs):
self._tp.spawn_n(func, *args, **kwargs)

def init_host(self):
"""Do any initialization that needs to be run if this is a
standalone service.
@@ -208,7 +219,15 @@ def init_host(self):
for volume in volumes:
if volume['status'] == 'deleting':
LOG.info(_('Resuming delete on volume: %s') % volume['id'])
self.delete_volume(ctxt, volume['id'])
if CONF.volume_service_inithost_offload:
# Offload all the pending volume delete operations to the
# threadpool to prevent the main volume service thread
# from being blocked.
self._add_to_threadpool(self.delete_volume, ctxt, volume['id'])
else:
# By default, delete volumes sequentially
self.delete_volume(ctxt, volume['id'])

# collect and publish service capabilities
self.publish_service_capabilities(ctxt)
6 changes: 5 additions & 1 deletion etc/cinder/cinder.conf.sample
@@ -1760,6 +1760,10 @@
# performing volume migration (seconds) (integer value)
#migration_create_volume_timeout_secs=300

# Offload pending volume delete during volume service startup
# (boolean value)
#volume_service_inithost_offload=false


#
# Options defined in cinder.volume.utils
@@ -1770,4 +1774,4 @@
#volume_dd_blocksize=1M


# Total option count: 381
# Total option count: 382
