Skip to content
Permalink
Browse files

Add conntrack metrics (#3624)

* add conntrack metrics

* refactor conntrack metrics

* address reviews

* fix argument in readme
  • Loading branch information...
coignetp committed Apr 23, 2019
1 parent d154792 commit 0497a14bbd1c0601f230c3e9f4a81aa2eed5ec7b
@@ -31,6 +31,12 @@ The network check is included in the [Datadog Agent][2] package, so you don't ne

2. [Restart the Agent][5] to effect any configuration changes.

Some conntrack metrics can only be retrieved by running conntrack with privileged access.
Note: The appropriate sudoers rule has to be configured for this to work:
```
dd-agent ALL=NOPASSWD: /usr/sbin/conntrack -S
```

### Validation

[Run the Agent's `status` subcommand][6] and look for `network` under the Checks section.
@@ -49,6 +49,38 @@ instances:
#
# collect_count_metrics: false

## @param conntrack_path - string - optional
## Linux only.
## The location of the conntrack executable in order to get the stats from conntrack -S.
## It will be run with sudo, so an entry needs to be added to the sudoers file.
## By default, these metrics will not be sent.
#
# conntrack_path: /usr/sbin/conntrack

## @param whitelist_conntrack_metrics - list of strings - optional - default: ["max", "count"]
## Linux only.
## Names of the conntrack metrics to whitelist for monitoring. The metric value is in the file
## /${proc}/sys/net/netfilter/nf_conntrack_${metric_name}.
## By default the agent collects only max and count.
## Regular expressions are supported for the metric names.
## Blacklist takes precedence over whitelist in case of overlap.
#
# whitelist_conntrack_metrics:
# - <METRIC_NAME>
# - <METRIC_PREFIX>*

## @param blacklist_conntrack_metrics - list of strings - optional - default: []
## Linux only.
## Names of the conntrack metrics to blacklist for monitoring. The metric value is in the file
## /${proc}/sys/net/netfilter/nf_conntrack_${metric_name}.
## If set, the whitelist default value is reset to [].
## Regular expressions are supported for the metric names.
## Blacklist takes precedence over whitelist in case of overlap.
#
# blacklist_conntrack_metrics:
# - <METRIC_NAME>
# - <METRIC_PREFIX>*

## @param tags - list of key:value elements - optional
## List of tags to attach to every metric, event and service check emitted by this integration.
##
@@ -6,6 +6,7 @@
Collects network metrics.
"""

import os
import re
import socket
from collections import defaultdict
@@ -14,6 +15,7 @@
from six import PY3, iteritems

from datadog_checks.checks import AgentCheck
from datadog_checks.utils.common import pattern_filter
from datadog_checks.utils.platform import Platform
from datadog_checks.utils.subprocess_output import SubprocessOutputEmptyError, get_subprocess_output

@@ -413,29 +415,76 @@ def _check_linux(self, instance):
nstat_metrics_names[k][met], self._parse_value(netstat_data[k][met]), tags=custom_tags
)

proc_conntrack_path = "{}/net/nf_conntrack".format(proc_location)
# Get the conntrack -S information
conntrack_path = instance.get('conntrack_path')
if conntrack_path is not None:
self._add_conntrack_stats_metrics(conntrack_path, custom_tags)

# Get the rest of the metrics by reading the files. Metrics available since kernel 3.6
conntrack_files_location = os.path.join(proc_location, 'sys', 'net', 'netfilter')
# By default, only max and count are reported. However if the blacklist is set,
# the whitelist loses its default value
blacklisted_files = instance.get('blacklist_conntrack_metrics')
whitelisted_files = instance.get('whitelist_conntrack_metrics')
if blacklisted_files is None and whitelisted_files is None:
whitelisted_files = ['max', 'count']

available_files = []

# Get the metrics to read
try:
with open(proc_conntrack_path, 'r') as conntrack_file:
# Starting at 0 as the last line has a line return
conntrack_count = 0
while 1:
# Reading the file by chunks (64k being a randomly chosen buffer size)
conntrack_buffer = conntrack_file.read(65536)
if not conntrack_buffer:
break
conntrack_count += conntrack_buffer.count('\n')
self.gauge('system.net.conntrack.count', conntrack_count, tags=custom_tags)
except IOError:
self.log.debug("Unable to read %s. Skipping conntrack metrics pull.", proc_conntrack_path)

proc_conntrack_max_path = "{}/sys/net/nf_conntrack_max".format(proc_location)
for metric_file in os.listdir(conntrack_files_location):
if (
os.path.isfile(os.path.join(conntrack_files_location, metric_file))
and 'nf_conntrack_' in metric_file
):
available_files.append(metric_file[len('nf_conntrack_') :])
except Exception as e:
self.log.debug("Unable to list the files in {}. {}".format(conntrack_files_location, e))

filtered_available_files = pattern_filter(
available_files, whitelist=whitelisted_files, blacklist=blacklisted_files
)

for metric_name in filtered_available_files:
metric_file_location = os.path.join(conntrack_files_location, 'nf_conntrack_{}'.format(metric_name))
try:
with open(metric_file_location, 'r') as conntrack_file:
# Checking it's an integer
try:
value = int(conntrack_file.read().rstrip())
self.gauge('system.net.conntrack.{}'.format(metric_name), value, tags=custom_tags)
except ValueError:
self.log.debug("{} is not an integer".format(metric_name))
except IOError as e:
self.log.debug("Unable to read {}, skipping {}.".format(metric_file_location, e))

def _add_conntrack_stats_metrics(self, conntrack_path, tags):
    """
    Run ``sudo <conntrack_path> -S`` and submit the counters it reports.

    Each output line is expected to start with a ``cpu=<n>`` cell followed by
    ``<name>=<integer>`` cells. Every such cell is submitted as the monotonic
    count ``system.net.conntrack.<name>``, tagged with ``tags`` plus ``cpu:<n>``.
    Blank lines and cells that are not ``<name>=<integer>`` are skipped so that
    one unexpected token cannot abort the whole collection.

    :param conntrack_path: location of the conntrack executable, run through sudo.
    :param tags: list of tags attached to every submitted metric.
    """
    try:
        output, _, _ = get_subprocess_output(["sudo", conntrack_path, "-S"], self.log)
    except SubprocessOutputEmptyError:
        self.log.debug("Couldn't use {} to get conntrack stats".format(conntrack_path))
        return

    # conntrack -S sample:
    # cpu=0 found=27644 invalid=19060 ignore=485633411 insert=0 insert_failed=1 \
    #       drop=1 early_drop=0 error=0 search_restart=39936711
    # cpu=1 found=21960 invalid=17288 ignore=475938848 insert=0 insert_failed=1 \
    #       drop=1 early_drop=0 error=0 search_restart=36983181
    for line in output.splitlines():
        cols = line.split()
        if not cols:
            # Blank or whitespace-only line: nothing to parse.
            continue

        # The first cell identifies the CPU the remaining counters belong to.
        cpu_num = cols[0].split('=')[-1]
        cpu_tag = ['cpu:{}'.format(cpu_num)]

        for cell in cols[1:]:
            # partition never raises, unlike unpacking the result of split('=').
            metric, separator, value = cell.partition('=')
            if not separator or not metric:
                self.log.debug("Skipping malformed conntrack stat %r", cell)
                continue
            try:
                count = int(value)
            except ValueError:
                self.log.debug("Skipping conntrack stat %r: value is not an integer", cell)
                continue
            self.monotonic_count('system.net.conntrack.{}'.format(metric), count, tags=tags + cpu_tag)

def _parse_linux_cx_state(self, lines, tcp_states, state_col, protocol=None, ip_version=None):
"""
@@ -1,8 +1,43 @@
metric_name,metric_type,interval,unit_name,per_unit_name,description,orientation,integration,short_name
system.net.bytes_rcvd,gauge,,byte,second,The number of bytes received on a device per second.,0,system,bytes rcvd
system.net.bytes_sent,gauge,,byte,second,The number of bytes sent from a device per second.,0,system,bytes sent
system.net.conntrack.acct,gauge,,unit,,Boolean to enable connection tracking flow accounting. 64-bit byte and packet counters per flow are added.,0,system,acct
system.net.conntrack.buckets,gauge,,unit,,Size of the hash table.,0,system,buckets
system.net.conntrack.checksum,gauge,,unit,,Boolean to verify checksum of incoming packets.,0,system,checksum
system.net.conntrack.count,gauge,,connection,connection,The number of connections present in the conntrack table.,0,system,connections
system.net.conntrack.drop,count,,unit,,The number of drops in the conntrack table.,0,system,drop
system.net.conntrack.early_drop,count,,unit,,The number of early drops in the conntrack table.,0,system,early drop
system.net.conntrack.error,count,,unit,,The number of errors in the conntrack table.,0,system,err
system.net.conntrack.events,count,,unit,,Boolean to enable the connection tracking code to provide userspace with connection tracking events via ctnetlink.,0,system,evts
system.net.conntrack.events_retry_timeout,gauge,,unit,,,0,system,evts retry timeout
system.net.conntrack.expect_max,gauge,,unit,,Maximum size of expectation table.,0,system,expect max
system.net.conntrack.found,count,,unit,,The number of currently allocated flow entries.,0,system,connt found
system.net.conntrack.generic_timeout,gauge,,unit,,Default for generic timeout. This refers to layer 4 unknown/unsupported protocols.,0,system,generic timeout
system.net.conntrack.helper,gauge,,unit,,Boolean to enable automatic conntrack helper assignment.,0,system,helper
system.net.conntrack.icmp_timeout,gauge,,second,,Default for ICMP timeout.,0,system,icmp timeout
system.net.conntrack.ignore,count,,unit,,The number of ignored in the conntrack table.,0,system,ignore
system.net.conntrack.invalid,count,,unit,,The number of invalid in the conntrack table.,0,system,invalid
system.net.conntrack.insert,count,,unit,,The number of insertions in the conntrack table.,0,system,insert
system.net.conntrack.insert_failed,count,,unit,,The number of failed insertions in the conntrack table.,0,system,insert failed
system.net.conntrack.log_invalid,gauge,,unit,,Log invalid packets of a type specified by value.,0,system,log invalid
system.net.conntrack.max,gauge,,connection,entry,Conntrack table max capacity.,0,system,table entries
system.net.conntrack.search_restart,count,,unit,,,0,system,search re
system.net.conntrack.tcp_be_liberal,gauge,,unit,,Boolean to mark only out of window RST segments as INVALID.,0,system,liberal
system.net.conntrack.tcp_loose,gauge,,unit,,Boolean to enable picking up already established connections.,0,system,tcp loose
system.net.conntrack.tcp_max_retrans,gauge,,packet,,Maximum number of packets that can be retransmitted without receiving an (acceptable) ACK from the destination.,0,system,max retrans
system.net.conntrack.tcp_timeout_close,gauge,,second,,,0,system,timeout close
system.net.conntrack.tcp_timeout_close_wait,gauge,,second,,,0,system,timeout close wait
system.net.conntrack.tcp_timeout_established,gauge,,second,,,0,system,timeout estab
system.net.conntrack.tcp_timeout_fin_wait,gauge,,second,,,0,system,timeout fin
system.net.conntrack.tcp_timeout_last_ack,gauge,,second,,,0,system,timeout last ack
system.net.conntrack.tcp_timeout_max_retrans,gauge,,second,,,0,system,timeout max retrans
system.net.conntrack.tcp_timeout_syn_recv,gauge,,second,,,0,system,timeout syn recv
system.net.conntrack.tcp_timeout_syn_sent,gauge,,second,,,0,system,timeout syn sent
system.net.conntrack.tcp_timeout_time_wait,gauge,,second,,,0,system,timeout time wait
system.net.conntrack.tcp_timeout_unacknowledged,gauge,,second,,,0,system,timeout unack
system.net.conntrack.tcp_timeout,gauge,,second,,,0,system,timeout
system.net.conntrack.tcp_timeout_stream,gauge,,second,,,0,system,timeout stream
system.net.conntrack.timestamp,gauge,,unit,,Boolean to enable connection tracking flow timestamping.,0,system,timestamp
system.net.packets_in.count,gauge,,packet,second,The number of packets of data received by the interface.,0,system,packets in
system.net.packets_in.error,gauge,,error,second,The number of packet receive errors detected by the device driver.,-1,system,pkts in err
system.net.packets_out.count,gauge,,packet,second,The number of packets of data transmitted by the interface.,0,system,packets out
@@ -8,6 +8,8 @@

# Baseline instance configuration used by the tests.
INSTANCE = {"collect_connection_state": True}

# Same as INSTANCE, plus a conntrack blacklist that contains only the "count" metric.
INSTANCE_BLACKLIST = {"collect_connection_state": True, "blacklist_conntrack_metrics": ["count"]}

EXPECTED_METRICS = [
'system.net.bytes_rcvd',
'system.net.bytes_sent',
@@ -16,3 +18,28 @@
'system.net.packets_out.count',
'system.net.packets_out.error',
]

# Conntrack metrics asserted by test_check_linux, which runs the check with
# the INSTANCE_BLACKLIST configuration.
CONNTRACK_METRICS = [
    'system.net.conntrack.acct',
    'system.net.conntrack.buckets',
    'system.net.conntrack.checksum',
    'system.net.conntrack.events',
    'system.net.conntrack.expect_max',
    'system.net.conntrack.generic_timeout',
    'system.net.conntrack.helper',
    'system.net.conntrack.log_invalid',
    'system.net.conntrack.max',
    'system.net.conntrack.tcp_loose',
    'system.net.conntrack.tcp_max_retrans',
    'system.net.conntrack.tcp_timeout_close',
    'system.net.conntrack.tcp_timeout_close_wait',
    'system.net.conntrack.tcp_timeout_established',
    'system.net.conntrack.tcp_timeout_fin_wait',
    'system.net.conntrack.tcp_timeout_last_ack',
    'system.net.conntrack.tcp_timeout_max_retrans',
    'system.net.conntrack.tcp_timeout_syn_recv',
    'system.net.conntrack.tcp_timeout_syn_sent',
    'system.net.conntrack.tcp_timeout_time_wait',
    'system.net.conntrack.tcp_timeout_unacknowledged',
    'system.net.conntrack.timestamp',
]
@@ -1,6 +1,8 @@
# (C) Datadog, Inc. 2019
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)
from copy import deepcopy

import pytest

from datadog_checks.network import Network
@@ -16,3 +18,13 @@ def dd_environment():
@pytest.fixture
def check():
    """Provide a Network check named after common.SERVICE_CHECK_NAME.

    The two remaining constructor arguments are passed as empty dictionaries.
    """
    network_check = Network(common.SERVICE_CHECK_NAME, {}, {})
    return network_check


@pytest.fixture
def instance():
    """Provide an independent deep copy of common.INSTANCE."""
    instance_copy = deepcopy(common.INSTANCE)
    return instance_copy


@pytest.fixture
def instance_blacklist():
    """Provide an independent deep copy of common.INSTANCE_BLACKLIST."""
    blacklist_copy = deepcopy(common.INSTANCE_BLACKLIST)
    return blacklist_copy
@@ -1,7 +1,7 @@
# (C) Datadog, Inc. 2019
# All rights reserved
# Licensed under Simplified BSD License (see LICENSE)
from copy import deepcopy
import platform

import pytest

@@ -11,8 +11,17 @@


@pytest.mark.usefixtures("dd_environment")
def test_check(aggregator, check):
check.check(deepcopy(common.INSTANCE))
def test_check(aggregator, check, instance):
    """Run the check once and assert that every metric in common.EXPECTED_METRICS was submitted."""
    check.check(instance)

    for expected_metric in common.EXPECTED_METRICS:
        aggregator.assert_metric(expected_metric)


@pytest.mark.skipif(platform.system() != 'Linux', reason="Only runs on Linux systems")
@pytest.mark.usefixtures("dd_environment")
def test_check_linux(aggregator, check, instance_blacklist):
    """Run the check with the blacklist instance and assert every metric in common.CONNTRACK_METRICS.

    Skipped unless platform.system() reports 'Linux'; the skip reason now
    names the platform the condition actually tests for.
    """
    check.check(instance_blacklist)

    for metric in common.CONNTRACK_METRICS:
        aggregator.assert_metric(metric)
@@ -35,6 +35,18 @@
'system.net.tcp6.time_wait': 1,
}

# Expected `conntrack -S` counters keyed by metric name. Each tuple holds the
# value asserted with the cpu:0 tag followed by the value asserted with the
# cpu:1 tag in test_add_conntrack_stats_metrics.
CONNTRACK_STATS = {
    'system.net.conntrack.found': (27644, 21960),
    'system.net.conntrack.invalid': (19060, 17288),
    'system.net.conntrack.ignore': (485633411, 475938848),
    'system.net.conntrack.insert': (0, 0),
    'system.net.conntrack.insert_failed': (1, 1),
    'system.net.conntrack.drop': (1, 1),
    'system.net.conntrack.early_drop': (0, 0),
    'system.net.conntrack.error': (0, 0),
    'system.net.conntrack.search_restart': (39936711, 36983181),
}

if PY3:
ESCAPE_ENCODING = 'unicode-escape'

@@ -91,6 +103,22 @@ def test_cx_state(aggregator, check):
aggregator.assert_metric(metric, value=value)


def test_add_conntrack_stats_metrics(aggregator, check):
    """Feed a canned two-CPU `conntrack -S` output to the check and verify the per-CPU metrics.

    get_subprocess_output is patched, so no conntrack binary is executed.
    """
    cpu0_line = (
        "cpu=0 found=27644 invalid=19060 ignore=485633411 insert=0 insert_failed=1 "
        "drop=1 early_drop=0 error=0 search_restart=39936711\n"
    )
    cpu1_line = (
        "cpu=1 found=21960 invalid=17288 ignore=475938848 insert=0 insert_failed=1 "
        "drop=1 early_drop=0 error=0 search_restart=36983181"
    )
    canned_output = cpu0_line + cpu1_line

    with mock.patch('datadog_checks.network.network.get_subprocess_output') as fake_subprocess:
        fake_subprocess.return_value = (canned_output, None, None)
        check._add_conntrack_stats_metrics(None, ['foo:bar'])

        for metric_name, per_cpu_values in iteritems(CONNTRACK_STATS):
            cpu0_value, cpu1_value = per_cpu_values[0], per_cpu_values[1]
            aggregator.assert_metric(metric_name, value=cpu0_value, tags=['foo:bar', 'cpu:0'])
            aggregator.assert_metric(metric_name, value=cpu1_value, tags=['foo:bar', 'cpu:1'])


@mock.patch('datadog_checks.network.network.Platform.is_linux', return_value=False)
@mock.patch('datadog_checks.network.network.Platform.is_bsd', return_value=False)
@mock.patch('datadog_checks.network.network.Platform.is_solaris', return_value=False)
@@ -2,7 +2,7 @@
minversion = 2.0
basepython = py37
envlist =
py{27,37}-unit
py{27,37}-{integration,unit}

[testenv]
dd_check_style = true
@@ -11,6 +11,10 @@ platform = linux|darwin|win32
deps =
-e../datadog_checks_base[deps]
-rrequirements-dev.txt
passenv =
DOCKER*
COMPOSE*
commands =
pip install -r requirements.in
pytest -v
integration: pytest -m"integration" -v
unit: pytest -m"not integration" -v

0 comments on commit 0497a14

Please sign in to comment.
You can’t perform that action at this time.