Skip to content

Commit

Permalink
Merge pull request #2228 from DataDog/yann/wmi-fix-timeout
Browse files Browse the repository at this point in the history
[system][wmi_check] handle `TimeoutException`
  • Loading branch information
yannmh committed Feb 12, 2016
2 parents 2578005 + 6573280 commit 6286faa
Show file tree
Hide file tree
Showing 6 changed files with 127 additions and 35 deletions.
28 changes: 23 additions & 5 deletions checks.d/wmi_check.py
Expand Up @@ -4,6 +4,7 @@
# project
from checks import AgentCheck
from checks.libs.wmi.sampler import WMISampler
from utils.timeout import TimeoutException

WMIMetric = namedtuple('WMIMetric', ['name', 'value', 'tags'])

Expand Down Expand Up @@ -37,6 +38,8 @@ class WMICheck(AgentCheck):
"""
def __init__(self, name, init_config, agentConfig, instances):
AgentCheck.__init__(self, name, init_config, agentConfig, instances)

# Cache
self.wmi_samplers = {}
self.wmi_props = {}

Expand Down Expand Up @@ -69,13 +72,25 @@ def check(self, instance):
wmi_class, properties,
filters=filters,
host=host, namespace=namespace,
username=username, password=password
username=username, password=password,
)

# Sample, extract & submit metrics
wmi_sampler.sample()
metrics = self._extract_metrics(wmi_sampler, tag_by, tag_queries, constant_tags)
self._submit_metrics(metrics, metric_name_and_type_by_property)
metrics = []
try:
wmi_sampler.sample()
metrics = self._extract_metrics(wmi_sampler, tag_by, tag_queries, constant_tags)
except TimeoutException:
self.log.warning(
u"WMI query timed out."
u" class={wmi_class} - properties={wmi_properties} -"
u" filters={filters} - tag_queries={tag_queries}".format(
wmi_class=wmi_class, wmi_properties=properties,
filters=filters, tag_queries=tag_queries
)
)
else:
self._submit_metrics(metrics, metric_name_and_type_by_property)

def _format_tag_query(self, sampler, wmi_obj, tag_query):
"""
Expand Down Expand Up @@ -177,7 +192,10 @@ def _extract_metrics(self, wmi_sampler, tag_by, tag_queries, constant_tags):
"""
Extract and tag metrics from the WMISampler.
Raise when multiple WMIObject were returned by the sampler with no `tag_by` specified.
Raise
* `MissingTagBy` when multiple WMIObjects were returned by the sampler
with no `tag_by` specified.
* `TimeoutException` on WMI query timeouts.
Returns: List of WMIMetric
```
Expand Down
15 changes: 10 additions & 5 deletions checks/libs/wmi/sampler.py
Expand Up @@ -53,7 +53,7 @@ class WMISampler(object):
"""
# Shared resources
_wmi_locators = {}
_wmi_connections = defaultdict(set)
_wmi_connections = defaultdict(list)

def __init__(self, logger, class_name, property_names, filters="", host="localhost",
namespace="root\\cimv2", username="", password="", timeout_duration=10):
Expand Down Expand Up @@ -146,11 +146,12 @@ def sample(self):
self.previous_sample = self.current_sample
self.current_sample = self._query()
except TimeoutException:
self.logger.warning(
self.logger.debug(
u"Query timeout after {timeout}s".format(
timeout=self._timeout_duration
)
)
raise
else:
self._sampling = False
self.logger.debug(u"Sample: {0}".format(self.current_sample))
Expand All @@ -161,7 +162,9 @@ def __len__(self):
"""
# No data is returned while sampling
if self._sampling:
return 0
raise TypeError(
u"Sampling `WMISampler` object has no len()"
)

return len(self.current_sample)

Expand All @@ -171,7 +174,9 @@ def __iter__(self):
"""
# No data is returned while sampling
if self._sampling:
return
raise TypeError(
u"Sampling `WMISampler` object is not iterable"
)

if self.is_raw_perf_class:
# Format required
Expand Down Expand Up @@ -293,7 +298,7 @@ def get_connection(self):
yield connection

# Release it
self._wmi_connections[self.connection_key].add(connection)
self._wmi_connections[self.connection_key].append(connection)

@staticmethod
def _format_filter(filters):
Expand Down
58 changes: 51 additions & 7 deletions checks/system/win32.py
Expand Up @@ -16,6 +16,9 @@ def WMISampler(*args, **kwargs):
"""
return

# datadog
from utils.timeout import TimeoutException


# Device WMI drive types
class DriveType(object):
Expand Down Expand Up @@ -44,7 +47,14 @@ def __init__(self, logger):
self.gauge('system.proc.count')

def check(self, agentConfig):
self.wmi_sampler.sample()
try:
self.wmi_sampler.sample()
except TimeoutException:
self.logger.warning(
u"Timeout while querying Win32_PerfRawData_PerfOS_System WMI class."
u" Processes metrics will be returned at next iteration."
)
return

if not (len(self.wmi_sampler)):
self.logger.info('Missing Win32_PerfRawData_PerfOS_System WMI class.'
Expand Down Expand Up @@ -100,7 +110,14 @@ def __init__(self, logger):
self.gauge('system.mem.pct_usable')

def check(self, agentConfig):
self.os_wmi_sampler.sample()
try:
self.os_wmi_sampler.sample()
except TimeoutException:
self.logger.warning(
u"Timeout while querying Win32_OperatingSystem WMI class."
u" Memory metrics will be returned at next iteration."
)
return

if not (len(self.os_wmi_sampler)):
self.logger.info('Missing Win32_OperatingSystem WMI class.'
Expand All @@ -123,7 +140,14 @@ def check(self, agentConfig):
self.save_sample('system.mem.free', free)
self.save_sample('system.mem.used', total - free)

self.mem_wmi_sampler.sample()
try:
self.mem_wmi_sampler.sample()
except TimeoutException:
self.logger.warning(
u"Timeout while querying Win32_PerfRawData_PerfOS_Memory WMI class."
u" Memory metrics will be returned at next iteration."
)
return

if not (len(self.mem_wmi_sampler)):
self.logger.info('Missing Win32_PerfRawData_PerfOS_Memory WMI class.'
Expand Down Expand Up @@ -173,8 +197,14 @@ def __init__(self, logger):
self.counter('system.cpu.system')

def check(self, agentConfig):

self.wmi_sampler.sample()
try:
self.wmi_sampler.sample()
except TimeoutException:
self.logger.warning(
u"Timeout while querying Win32_PerfRawData_PerfOS_Processor WMI class."
u" CPU metrics will be returned at next iteration."
)
return

if not (len(self.wmi_sampler)):
self.logger.info('Missing Win32_PerfRawData_PerfOS_Processor WMI class.'
Expand Down Expand Up @@ -230,7 +260,14 @@ def __init__(self, logger):
self.gauge('system.net.bytes_sent')

def check(self, agentConfig):
self.wmi_sampler.sample()
try:
self.wmi_sampler.sample()
except TimeoutException:
self.logger.warning(
u"Timeout while querying Win32_PerfRawData_Tcpip_NetworkInterface WMI class."
u" Network metrics will be returned at next iteration."
)
return

if not (len(self.wmi_sampler)):
self.logger.info('Missing Win32_PerfRawData_Tcpip_NetworkInterface WMI class.'
Expand Down Expand Up @@ -271,7 +308,14 @@ def __init__(self, logger):
self.gauge('system.io.avg_q_sz')

def check(self, agentConfig):
self.wmi_sampler.sample()
try:
self.wmi_sampler.sample()
except TimeoutException:
self.logger.warning(
u"Timeout while querying Win32_PerfRawData_PerfDisk_LogicalDiskUnable WMI class."
u" I/O metrics will be returned at next iteration."
)
return

if not (len(self.wmi_sampler)):
self.logger.info('Missing Win32_PerfRawData_PerfDisk_LogicalDiskUnable WMI class.'
Expand Down
31 changes: 30 additions & 1 deletion tests/checks/mock/test_wmi_check.py
Expand Up @@ -3,7 +3,7 @@

# project
from tests.checks.common import AgentCheckTest
from tests.core.test_wmi import TestCommonWMI
from tests.core.test_wmi import SWbemServices, TestCommonWMI


class WMITestCase(AgentCheckTest, TestCommonWMI):
Expand Down Expand Up @@ -188,6 +188,35 @@ def test_missing_property(self):
self.run_check(config, mocks={'log': logger})
self.assertTrue(logger.warning.called)

def test_query_timeouts(self):
"""
Gracefully handle WMI query timeouts.
"""
def __patched_init__(*args, **kwargs):
"""
Force `timeout_duration` value.
"""
kwargs['timeout_duration'] = 0.5
return wmi_constructor(*args, **kwargs)

# Increase WMI queries' runtime
SWbemServices._exec_query_run_time = 0.5

# Patch WMISampler to decrease timeout tolerance
WMISampler = self.load_class("WMISampler")
wmi_constructor = WMISampler.__init__
WMISampler.__init__ = __patched_init__

# Set up the check
config = {
'instances': [self.WMI_CONFIG]
}
logger = Mock()

# No exception is raised but a WARNING is logged
self.run_check(config, mocks={'log': logger})
self.assertTrue(logger.warning.called)

def test_mandatory_tag_by(self):
"""
Exception is raised when the result returned by the WMI query contains multiple rows
Expand Down
28 changes: 12 additions & 16 deletions tests/core/test_wmi.py
Expand Up @@ -10,20 +10,14 @@

# project
from tests.checks.common import Fixtures
from utils.timeout import TimeoutException


log = logging.getLogger(__name__)

WMISampler = None


# Thoughts
# Log WMI activity
# Mechanism to timeout
# Check when pywintypes.com_error are raised
# Check the role of the flags


def load_fixture(f, args=None):
"""
Build a WMI query result from a file and given parameters.
Expand Down Expand Up @@ -100,9 +94,11 @@ def __init__(self, wmi_conn_args):
@classmethod
def reset(cls):
"""
FIXME - Dirty patch to reset `SWbemServices.ExecQuery` to 0.
Dirty patch to reset `SWbemServices.ExecQuery.call_count` and
`SWbemServices._exec_query_run_time` to 0.
"""
cls._exec_query_call_count.reset()
cls._exec_query_run_time = 0

def get_conn_args(self):
"""
Expand Down Expand Up @@ -224,7 +220,7 @@ def tearDown(self):
# Flush cache
from checks.libs.wmi.sampler import WMISampler
WMISampler._wmi_locators = {}
WMISampler._wmi_connections = defaultdict(set)
WMISampler._wmi_connections = defaultdict(list)

def assertWMIConn(self, wmi_sampler, param=None, count=None):
"""
Expand Down Expand Up @@ -451,7 +447,7 @@ def test_wmi_sampler_iterator_getter(self):

def test_wmi_sampler_timeout(self):
"""
Gracefully handle WMI queries' timeouts.
Gracefully handle WMI query timeouts.
"""
from checks.libs.wmi.sampler import WMISampler
logger = Mock()
Expand All @@ -462,14 +458,14 @@ def test_wmi_sampler_timeout(self):
timeout_duration=0.5)
SWbemServices._exec_query_run_time = 0.5

# Gracefully timeout with a warning message but no exception
wmi_sampler.sample()
# `TimeoutException` exception is raised, DEBUG message logged
self.assertRaises(TimeoutException, wmi_sampler.sample)
self.assertTrue(wmi_sampler._sampling)
self.assertTrue(logger.warning.called)
self.assertTrue(logger.debug.called)

# Show no data
self.assertEquals(len(wmi_sampler), 0)
self.assertEquals(sum(1 for _ in wmi_sampler), 0)
# Cannot iterate on data
self.assertRaises(TypeError, lambda: len(wmi_sampler))
self.assertRaises(TypeError, lambda: sum(1 for _ in wmi_sampler))

# Recover from timeout at next iteration
wmi_sampler.sample()
Expand Down
2 changes: 1 addition & 1 deletion tests/core/test_wmi_calculator.py
Expand Up @@ -36,7 +36,7 @@ def test_calculator_decorator(self):
Assign a calculator to a counter_type. Raise when the calculator is missing.
"""
@calculator(123456)
def do_something():
def do_something(*args, **kwargs):
"""A function that does something."""
pass

Expand Down

0 comments on commit 6286faa

Please sign in to comment.