Skip to content

Commit

Permalink
Merge pull request #30 from melor/statsd_exceptions
Browse files (browse the repository at this point in the history)
statsd: unexpected exception stats

#30
  • Loading branch information
saaros committed May 27, 2016
2 parents 345ced6 + 2315941 commit bfa976d
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 18 deletions.
1 change: 0 additions & 1 deletion .pylintrc
@@ -1,6 +1,5 @@
[MESSAGES CONTROL]
disable=
bare-except,
duplicate-code,
invalid-name,
line-too-long,
Expand Down
10 changes: 7 additions & 3 deletions pglookout/cluster_monitor.py
Expand Up @@ -54,9 +54,11 @@ def wait_select(conn, timeout=5.0):


class ClusterMonitor(Thread):
def __init__(self, config, cluster_state, observer_state, create_alert_file, trigger_check_queue):
def __init__(self, config, cluster_state, observer_state, create_alert_file, trigger_check_queue,
stats):
Thread.__init__(self)
self.log = logging.getLogger("ClusterMonitor")
self.stats = stats
self.running = True
self.cluster_state = cluster_state
self.observer_state = observer_state
Expand Down Expand Up @@ -91,9 +93,10 @@ def _connect_to_db(self, instance, dsn):
if "password authentication" in getattr(ex, "message", ""):
self.create_alert_file("authentication_error")
conn = None # make sure we don't try to use the connection if we timed out
except:
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Failed to connect to %s (%s)",
instance, inst_info_str)
self.stats.unexpected_exception(ex, where="_connect_to_db")
conn = None
self.db_conns[instance] = conn
return conn
Expand All @@ -117,8 +120,9 @@ def _fetch_observer_state(self, instance, uri):
self.log.warning("%s (%s) fetching state from observer: %r, %r",
ex.__class__.__name__, ex, instance, fetch_uri)
result['connection'] = False
except:
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Problem in fetching state from observer: %r, %r", instance, fetch_uri)
self.stats.unexpected_exception(ex, where="_fetch_observer_state")
result['connection'] = False
return result

Expand Down
2 changes: 1 addition & 1 deletion pglookout/current_master.py
Expand Up @@ -44,7 +44,7 @@ def main(args=None):
state_dict = json.load(fp)
current_master = state_dict['current_master']
print(current_master)
except:
except: # pylint: disable=bare-except
return -1


Expand Down
33 changes: 22 additions & 11 deletions pglookout/pglookout.py
Expand Up @@ -37,14 +37,14 @@

try:
from systemd import daemon # pylint: disable=import-error
except:
except ImportError:
daemon = None


class PgLookout(object):
def __init__(self, config_path):
self.log = logging.getLogger("pglookout")
self.statsd = None
self.stats = None
self.running = True
self.replication_lag_over_warning_limit = False

Expand Down Expand Up @@ -81,9 +81,13 @@ def __init__(self, config_path):
"current_master": self.current_master,
"replication_lag_over_warning": self.replication_lag_over_warning_limit}

self.cluster_monitor = ClusterMonitor(self.config, self.cluster_state,
self.observer_state, self.create_alert_file,
trigger_check_queue=self.trigger_check_queue)
self.cluster_monitor = ClusterMonitor(
config=self.config,
cluster_state=self.cluster_state,
observer_state=self.observer_state,
create_alert_file=self.create_alert_file,
trigger_check_queue=self.trigger_check_queue,
stats=self.stats)
# cluster_monitor doesn't exist at the time of reading the config initially
self.cluster_monitor.log.setLevel(self.log_level)
self.webserver = WebServer(self.config, self.cluster_state)
Expand All @@ -108,8 +112,9 @@ def load_config(self, _signal=None, _frame=None):
try:
with open(self.config_path) as fp:
self.config = json.load(fp)
except:
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Invalid JSON config, exiting")
self.stats.unexpected_exception(ex, where="load_config")
sys.exit(1)

# statsd settings may have changed
Expand Down Expand Up @@ -169,9 +174,10 @@ def write_cluster_state_to_json_file(self):
fp.write(json_to_dump)
os.rename(state_file_path + ".tmp", state_file_path)
self.log.debug("Wrote JSON state file to disk, took %.4fs", time.time() - start_time)
except:
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Problem in writing JSON: %r file to disk, took %.4fs",
self.overall_state, time.time() - start_time)
self.stats.unexpected_exception(ex, where="write_cluster_state_to_json_file")

def create_node_map(self, cluster_state, observer_state):
standby_nodes, master_node, master_instance = {}, None, None
Expand Down Expand Up @@ -546,6 +552,7 @@ def execute_external_command(self, command):
except subprocess.CalledProcessError as err:
self.log.exception("Problem with executing: %r, return_code: %r, output: %r",
command, err.returncode, err.output)
self.stats.unexpected_exception(err, where="execute_external_command")
return_code = err.returncode # pylint: disable=no-member
self.log.warning("Executed external command: %r, output: %r", return_code, output)
return return_code
Expand All @@ -559,17 +566,19 @@ def create_alert_file(self, filename):
self.log.debug("Creating alert file: %r", filepath)
with open(filepath, "w") as fp:
fp.write("alert")
except:
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Problem writing alert file: %r", filepath)
self.stats.unexpected_exception(ex, where="create_alert_file")

def delete_alert_file(self, filename):
try:
filepath = os.path.join(self.config.get("alert_file_dir", os.getcwd()), filename)
if os.path.exists(filepath):
self.log.debug("Deleting alert file: %r", filepath)
os.unlink(filepath)
except:
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Problem unlinking: %r", filepath)
self.stats.unexpected_exception(ex, where="delete_alert_file")

def main_loop(self):
while self.running:
Expand All @@ -578,12 +587,14 @@ def main_loop(self):
try:
sleep_time = float(self.config.get("replication_state_check_interval", 5.0))
self.check_cluster_state()
except:
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Failed to check cluster state")
self.stats.unexpected_exception(ex, where="main_loop1")
try:
self.write_cluster_state_to_json_file()
except:
except Exception as ex: # pylint: disable=broad-except
self.log.exception("Failed to write cluster state")
self.stats.unexpected_exception(ex, where="main_loop2")
time.sleep(sleep_time)

def run(self):
Expand Down
4 changes: 3 additions & 1 deletion test/test_cluster_monitor.py
Expand Up @@ -5,6 +5,7 @@
See LICENSE for details
"""

from pglookout import statsd
from pglookout.common import LOG_FORMAT
from pglookout.cluster_monitor import ClusterMonitor
from psycopg2.extensions import POLL_OK
Expand Down Expand Up @@ -79,7 +80,8 @@ def poll(self): # pylint: disable=no-self-use
cluster_state=cluster_state,
observer_state=observer_state,
create_alert_file=create_alert_file,
trigger_check_queue=trigger_check_queue)
trigger_check_queue=trigger_check_queue,
stats=statsd.StatsClient(host=None))
cm.main_monitoring_loop()

assert len(cm.cluster_state) == 2
Expand Down
2 changes: 1 addition & 1 deletion test/test_lookout.py
Expand Up @@ -14,7 +14,7 @@
from pglookout.pglookout import PgLookout
try:
from mock import Mock # pylint: disable=import-error
except: # py3k import location
except ImportError: # py3k import location
from unittest.mock import Mock # pylint: disable=import-error,no-name-in-module
from unittest import TestCase
import datetime
Expand Down

0 comments on commit bfa976d

Please sign in to comment.