[Fixes #4437] [Monitoring] collect_metrics often times out
afabiani committed May 28, 2019
1 parent 70a80a4 commit 7513820
Showing 8 changed files with 81 additions and 65 deletions.
104 changes: 55 additions & 49 deletions geonode/monitoring/collector.py
@@ -105,20 +105,26 @@ def get_network_rate(row, value, metric_defaults,
metric_name, valid_to):
iface_label = get_iface_name(row)
if not iface_label:
print('no label', metric_name, row.get('description'))
try:
log.debug('no label %s %s', metric_name, row.get('description'))
except BaseException:
pass
return
rate = self._calculate_rate(
metric_name, iface_label, value, valid_to)
if rate is None:
print('no rate for', metric_name)
try:
log.debug('no rate for %s', metric_name)
except BaseException:
pass
return
mdata = {'value': rate,
'value_raw': rate,
'value_num': rate,
'label': iface_label,
'metric': '{}.rate'.format(metric_name)}
mdata.update(metric_defaults)
print MetricValue.add(**mdata)
log.debug(MetricValue.add(**mdata))

def get_mem_label(*args):
return 'B'
@@ -179,7 +185,7 @@ def get_mem_label(*args):
'label': label_function(metric_data) if callable(label_function) else None,
'metric': metric_name}
mdata.update(mdefaults)
print MetricValue.add(**mdata)
log.debug(MetricValue.add(**mdata))

if callable(processing_function):
processing_function(
@@ -222,13 +228,13 @@ def process_host_geonode(self, service, data, valid_from, valid_to):
mdata.update(mdefaults)
rate = self._calculate_rate(
mdata['metric'], ifname, tx_value, valid_to)
print MetricValue.add(**mdata)
log.debug(MetricValue.add(**mdata))
if rate:
mdata['metric'] = '{}.rate'.format(mdata['metric'])
mdata['value'] = rate
mdata['value_num'] = rate
mdata['value_raw'] = rate
print MetricValue.add(**mdata)
log.debug(MetricValue.add(**mdata))

ldata = data['data']['load']
llabel = ['1', '5', '15']
@@ -253,7 +259,7 @@ def process_host_geonode(self, service, data, valid_from, valid_to):
label__name='MB',
service=service)\
.delete()
print MetricValue.add(**mdata)
log.debug(MetricValue.add(**mdata))

MetricValue.objects.filter(service_metric__metric__name__in=('storage.total', 'storage.used', 'storage.free',),
valid_from=valid_from,
@@ -279,7 +285,7 @@ def process_host_geonode(self, service, data, valid_from, valid_to):
'label': mount,
}
mdata.update(mdefaults)
print MetricValue.add(**mdata)
log.debug(MetricValue.add(**mdata))

if ldata:
for lidx, l in enumerate(ldata):
@@ -297,7 +303,7 @@ def process_host_geonode(self, service, data, valid_from, valid_to):
label__name='Value',
service=service)\
.delete()
print MetricValue.add(**mdata)
log.debug(MetricValue.add(**mdata))

uptime = data['data'].get('uptime')
if uptime is not None:
@@ -313,7 +319,7 @@ def process_host_geonode(self, service, data, valid_from, valid_to):
label__name=mdata['label'],
service=service)\
.delete()
print MetricValue.add(**mdata)
log.debug(MetricValue.add(**mdata))

if data['data'].get('cpu'):
_l = data['data']['cpu']['usage']
@@ -332,7 +338,7 @@ def process_host_geonode(self, service, data, valid_from, valid_to):
label__name=mdata['label'],
service=service)\
.delete()
print MetricValue.add(**mdata)
log.debug(MetricValue.add(**mdata))
rate = self._calculate_rate(
mdata['metric'],
mdata['label'],
@@ -344,7 +350,7 @@ def process_host_geonode(self, service, data, valid_from, valid_to):
rate_data['value'] = rate
rate_data['value_num'] = rate
rate_data['value_raw'] = rate
print MetricValue.add(**rate_data)
log.debug(MetricValue.add(**rate_data))

percent = self._calculate_percent(
mdata['metric'],
@@ -358,10 +364,10 @@ def process_host_geonode(self, service, data, valid_from, valid_to):
percent_data['value_num'] = percent
percent_data['value_raw'] = percent
percent_data['label'] = 'Value'
print MetricValue.add(**percent_data)
log.debug(MetricValue.add(**percent_data))

mdata.update(mdefaults)
print MetricValue.add(**mdata)
log.debug(MetricValue.add(**mdata))

def get_labels_for_metric(self, metric_name, resource=None):
mt = ServiceTypeMetric.objects.filter(metric__name=metric_name)
@@ -492,7 +498,7 @@ def _key(v):
'samples_count': samples,
'value_raw': value or 0,
'value_num': value if isinstance(value, (int, float, long, Decimal,)) else None})
print MetricValue.add(**metric_values)
log.debug(MetricValue.add(**metric_values))

def process(self, service, data, valid_from, valid_to, *args, **kwargs):
if service.is_hostgeonode:
@@ -535,7 +541,7 @@ def set_error_values(self, requests, valid_from, valid_to,
'label': 'count',
'service': service}
cnt = with_errors.count()
print MetricValue.add(value=cnt, value_num=cnt, value_raw=cnt, **defaults)
log.debug(MetricValue.add(value=cnt, value_num=cnt, value_raw=cnt, **defaults))

defaults['metric'] = 'response.error.types'
for label in labels:
@@ -544,16 +550,16 @@ def process_requests_batch(self, service, requests, valid_from, valid_to):
defaults['label'] = label

defaults['samples_count'] = cnt
print MetricValue.add(value=cnt, value_num=cnt, value_raw=cnt, **defaults)
log.debug(MetricValue.add(value=cnt, value_num=cnt, value_raw=cnt, **defaults))

def process_requests_batch(self, service, requests, valid_from, valid_to):
"""
Processes requests information into metric values
"""
if not requests.count():
return
log.info("Processing batch of %s requests from %s to %s",
requests.count(), valid_from, valid_to)
if not requests.count():
return
metric_defaults = {'valid_from': valid_from,
'valid_to': valid_to,
'requests': requests,
@@ -567,20 +573,20 @@ def process_requests_batch(self, service, requests, valid_from, valid_to):
count = requests.count()
paths = requests.distinct('request_path').values_list(
'request_path', flat=True)
print MetricValue.add('request.count', valid_from, valid_to, service, 'Count',
value=count,
value_num=count,
value_raw=count,
samples_count=count,
resource=None)
for path in paths:
count = requests.filter(request_path=path).count()
print MetricValue.add('request.path', valid_from, valid_to, service, path,
log.debug(MetricValue.add('request.count', valid_from, valid_to, service, 'Count',
value=count,
value_num=count,
value_raw=count,
samples_count=count,
resource=None)
resource=None))
for path in paths:
count = requests.filter(request_path=path).count()
log.debug(MetricValue.add('request.path', valid_from, valid_to, service, path,
value=count,
value_num=count,
value_raw=count,
samples_count=count,
resource=None))

# calculate overall stats
self.set_metric_values('request.ip', 'client_ip', **metric_defaults)
@@ -686,13 +692,13 @@ def process_requests_batch(self, service, requests, valid_from, valid_to):
metric_defaults['requests'] = ows_requests
metric_defaults['ows_service'] = ows_all

print(MetricValue.add('request.count', valid_from,
valid_to, service, 'Count',
value=count, value_num=count,
samples_count=count,
value_raw=count,
resource=resource,
ows_service=ows_all))
log.debug(MetricValue.add('request.count', valid_from,
valid_to, service, 'Count',
value=count, value_num=count,
samples_count=count,
value_raw=count,
resource=resource,
ows_service=ows_all))
self.set_metric_values(
'request.ip', 'client_ip', **metric_defaults)
self.set_metric_values(
@@ -730,23 +736,23 @@ def process_requests_batch(self, service, requests, valid_from, valid_to):
'request_path').values_list('request_path', flat=True)
for path in paths:
count = ows_requests.filter(request_path=path).count()
print MetricValue.add('request.path', valid_from, valid_to, service, path,
value=count,
value_num=count,
value_raw=count,
samples_count=count,
resource=resource)
log.debug(MetricValue.add('request.path', valid_from, valid_to, service, path,
value=count,
value_num=count,
value_raw=count,
samples_count=count,
resource=resource))

count = ows_requests.count()
metric_defaults['ows_service'] = ows_service
metric_defaults['requests'] = ows_requests
print(MetricValue.add('request.count', valid_from, valid_to, service, 'Count',
value=count,
value_num=count,
samples_count=count,
value_raw=count,
resource=resource,
ows_service=ows_service))
log.debug(MetricValue.add('request.count', valid_from, valid_to, service, 'Count',
value=count,
value_num=count,
samples_count=count,
value_raw=count,
resource=resource,
ows_service=ows_service))
self.set_metric_values(
'request.ip', 'client_ip', **metric_defaults)
self.set_metric_values(
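The collector.py changes above all follow one pattern: every Python 2 print statement around MetricValue.add is replaced with a log.debug call on the module logger, so metric insertion no longer writes to stdout and the output can be silenced by log level. Below is a minimal, runnable sketch of that pattern; the add_callable indirection stands in for MetricValue.add so the example needs no Django setup, and the names are illustrative rather than GeoNode code.

import logging

log = logging.getLogger(__name__)

def add_and_log(add_callable, **mdata):
    # Store the metric value, then report it only at DEBUG level.
    # Lazy %-formatting means the message is not even built unless
    # DEBUG is enabled for this logger.
    value = add_callable(**mdata)
    log.debug("stored metric value: %s", value)
    return value

if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)
    add_and_log(lambda **kw: kw, metric="cpu.usage.rate", value=12.5)
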
13 changes: 7 additions & 6 deletions geonode/monitoring/management/commands/collect_metrics.py
@@ -23,7 +23,7 @@
import logging
import timeout_decorator

from datetime import datetime
from datetime import datetime, timedelta
from dateutil.tz import tzlocal

from django.conf import settings
@@ -36,7 +36,7 @@
from geonode.monitoring.collector import CollectorAPI
from geonode.monitoring.utils import TypeChecks

LOCAL_TIMEOUT = 300
LOCAL_TIMEOUT = 8600

log = logging.getLogger(__name__)

@@ -125,16 +125,17 @@ def run_check(self, service, collector, since=None, until=None, force_check=None
service.last_check = service.last_check.astimezone(utc)
except:
service.last_check = service.last_check.replace(tzinfo=utc) if service.last_check else now
last_check = local_tz.localize(since).astimezone(utc).replace(tzinfo=utc) if since else service.last_check
if not last_check or last_check > now:
last_check = (now - service.check_interval)
service.last_check = last_check

if not until:
until = now
else:
until = local_tz.localize(until).astimezone(utc).replace(tzinfo=utc)

last_check = local_tz.localize(since).astimezone(utc).replace(tzinfo=utc) if since else service.last_check
if not last_check or last_check > until or (until - last_check) > settings.MONITORING_DATA_TTL:
last_check = (until - settings.MONITORING_DATA_TTL)
service.last_check = last_check

print('[',now ,'] checking', service.name, 'since', last_check, 'until', until)
data_in = None
h = Handler(service, force_check=force_check)
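The run_check change in collect_metrics.py is the core of the timeout fix: the collection window is now computed against until and clamped to settings.MONITORING_DATA_TTL, so a missing, future, or very old last_check can no longer force a single run to aggregate an unbounded backlog. A small sketch of that clamping logic under the same assumptions; the 7-day default mirrors the settings.py default shown further down.

from datetime import datetime, timedelta

def clamp_window(last_check, until, data_ttl=timedelta(days=7)):
    # Mirrors the new run_check logic: fall back to `until - data_ttl`
    # whenever last_check is missing, in the future, or older than the TTL.
    if not last_check or last_check > until or (until - last_check) > data_ttl:
        last_check = until - data_ttl
    return last_check

# A check stamp from a month ago is pulled forward to the 7-day TTL boundary.
now = datetime(2019, 5, 28, 12, 0)
print(clamp_window(datetime(2019, 4, 20), now))   # 2019-05-21 12:00:00
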
2 changes: 1 addition & 1 deletion geonode/monitoring/management/commands/render_metrics.py
@@ -36,7 +36,7 @@
from geonode.monitoring.collector import CollectorAPI
from geonode.monitoring.utils import TypeChecks

LOCAL_TIMEOUT = 300
LOCAL_TIMEOUT = 8600

TIMESTAMP_OUTPUT = '%Y-%m-%d %H:%M:%S'

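Both management commands bump LOCAL_TIMEOUT from 300 to 8600 seconds. Judging by the import timeout_decorator shown in collect_metrics.py, the constant is presumably applied through the third-party timeout-decorator package, roughly as in the sketch below; the collect() body is a placeholder, not GeoNode code.

import time
import timeout_decorator

LOCAL_TIMEOUT = 8600  # seconds; this commit raises it from 300

@timeout_decorator.timeout(LOCAL_TIMEOUT)
def collect():
    # Placeholder workload: timeout_decorator.TimeoutError is raised if the
    # decorated function runs longer than LOCAL_TIMEOUT seconds.
    time.sleep(1)

if __name__ == "__main__":
    collect()
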
4 changes: 2 additions & 2 deletions geonode/monitoring/middleware.py
@@ -74,12 +74,12 @@ def add_resource(request, resource_type, name):

def register_request(self, request, response):
if self.service:
self.log.info('request', extra={'request': request, 'response': response})
self.log.debug('request', extra={'request': request, 'response': response})

def register_exception(self, request, exception):
if self.service:
response = HttpResponse('')
self.log.info('request', exc_info=exception, extra={'request': request, 'response': response})
self.log.debug('request', exc_info=exception, extra={'request': request, 'response': response})

def process_view(self, request, view_func, view_args, view_kwargs):
m = request.resolver_match
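In middleware.py the per-request and per-exception records are demoted from INFO to DEBUG, so an INFO-level production configuration no longer emits a log line for every request. If those records are needed again, the level can be raised per logger in the Django LOGGING dict, as in the hedged settings.py fragment below; the logger name 'geonode.monitoring' is an assumption and should be matched to whatever logger the monitoring middleware actually uses.

LOGGING = {
    "version": 1,
    "disable_existing_loggers": False,
    "handlers": {
        "console": {"class": "logging.StreamHandler"},
    },
    "loggers": {
        # Switch "INFO" to "DEBUG" to see the per-request records that this
        # commit demoted from INFO to DEBUG.
        "geonode.monitoring": {"handlers": ["console"], "level": "INFO"},
    },
}
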
5 changes: 4 additions & 1 deletion geonode/monitoring/models.py
@@ -64,7 +64,10 @@ def get_geoip():
# otherwise, some cli commands may fail (like updating geoip)
global GEOIP_DB
if GEOIP_DB is None:
GEOIP_DB = GeoIP()
try:
GEOIP_DB = GeoIP()
except BaseException as e:
log.exception(e)
return GEOIP_DB


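models.py now wraps the lazy GeoIP() initialisation in a try/except and logs the failure instead of letting it propagate, which means get_geoip() can return None when the GeoIP database is missing or unreadable. Callers therefore have to tolerate a missing resolver; a hedged usage sketch follows, where the helper name and the country_code lookup are illustrative, not taken from the diff.

from geonode.monitoring.models import get_geoip  # import path taken from the file shown above

def lookup_country_code(ip):
    geoip = get_geoip()
    if geoip is None:
        # GeoIP database unavailable; skip geolocation rather than fail.
        return None
    try:
        return geoip.country_code(ip)
    except Exception:
        return None
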
8 changes: 5 additions & 3 deletions geonode/monitoring/utils.py
@@ -121,6 +121,7 @@ def get_href(self, link, format=None):
base_url = urlsplit(self.base_url)
if href and href.netloc != base_url.netloc:
href = href._replace(netloc=base_url.netloc)
href = href._replace(scheme=base_url.scheme)
if format is None:
return href.geturl()
if format in self.REPORT_FORMATS:
@@ -143,26 +144,27 @@ def get_requests(self, format=None, since=None, until=None):
if qargs:
rest_url = '{}?{}'.format(rest_url, urlencode(qargs))

print('checking', rest_url)
log.debug('checking %s', rest_url)
username = settings.OGC_SERVER['default']['USER']
password = settings.OGC_SERVER['default']['PASSWORD']
resp = requests.get(
rest_url,
auth=HTTPBasicAuth(username, password),
timeout=30)
doc = bs(resp.content)
doc = bs(resp.content, features="lxml")
links = doc.find_all('a')
for l in links:
href = self.get_href(l, format)
data = self.get_request(href, format=format)
if data:
yield data
else:
print("Skipping payload for {}".format(href))
log.debug("Skipping payload for {}".format(href))

def get_request(self, href, format=format):
username = settings.OGC_SERVER['default']['USER']
password = settings.OGC_SERVER['default']['PASSWORD']
log.debug(" href: %s " % href)
r = requests.get(
href,
auth=HTTPBasicAuth(username, password),
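The utils.py changes route the report scraping through the module logger and, notably, pin BeautifulSoup to the lxml parser via features="lxml". That makes parsing deterministic across machines but adds lxml as a runtime requirement next to beautifulsoup4. A minimal standalone example of the call is below; the sample HTML is made up, and bs in the diff is assumed to be an alias for BeautifulSoup.

from bs4 import BeautifulSoup

html = "<html><body><a href='/monitoring/1.html'>1</a></body></html>"

# Naming the parser explicitly avoids bs4's "no parser was explicitly
# specified" warning and requires the lxml package to be installed.
doc = BeautifulSoup(html, features="lxml")
print([a.get("href") for a in doc.find_all("a")])   # ['/monitoring/1.html']
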
2 changes: 1 addition & 1 deletion geonode/settings.py
@@ -1243,7 +1243,7 @@
MONITORING_SERVICE_NAME = os.getenv("MONITORING_SERVICE_NAME", 'local-geonode')

# how long monitoring data should be stored
MONITORING_DATA_TTL = timedelta(days=7)
MONITORING_DATA_TTL = timedelta(days=int(os.getenv("MONITORING_DATA_TTL", 7)))

# this will disable csrf check for notification config views,
# use with caution - for dev purpose only
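Finally, settings.py makes the retention window configurable: MONITORING_DATA_TTL is read from the environment as a whole number of days, and the same setting now bounds the maximum backfill window in collect_metrics.py. A tiny sketch of how the override behaves:

import os
from datetime import timedelta

os.environ["MONITORING_DATA_TTL"] = "14"   # e.g. exported in the deployment environment
MONITORING_DATA_TTL = timedelta(days=int(os.getenv("MONITORING_DATA_TTL", 7)))
print(MONITORING_DATA_TTL)   # 14 days, 0:00:00
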
