diff --git a/.travis.yml b/.travis.yml index 0b6e21cc1..7caf6ce10 100644 --- a/.travis.yml +++ b/.travis.yml @@ -8,6 +8,7 @@ script: - make test - make reltest - make systests + - make coverage - rebar3 as test coveralls send - make dialyzer - make lint diff --git a/Makefile b/Makefile index a1cc41d5c..17aa84b3d 100644 --- a/Makefile +++ b/Makefile @@ -1,5 +1,6 @@ REBAR = $(shell pwd)/rebar3 -.PHONY: rel test relgentlerain +COVERPATH = $(shell pwd)/_build/test/cover +.PHONY: rel test relgentlerain docker-build docker-run all: compile @@ -18,15 +19,11 @@ cleantests: rm -f test/multidc/*.beam rm -rf logs/ -shell: - $(REBAR) shell --name='antidote@127.0.0.1' --setcookie antidote --config config/sys-debug.config - -# same as shell, but automatically reloads code when changed -# to install add `{plugins, [rebar3_auto]}.` to ~/.config/rebar3/rebar.config -# the tool requires inotifywait (sudo apt install inotify-tools) -# see https://github.com/vans163/rebar3_auto or http://blog.erlware.org/rebar3-auto-comile-and-load-plugin/ -auto: - $(REBAR) auto --name='antidote@127.0.0.1' --setcookie antidote --config config/sys-debug.config +shell: rel + export NODE_NAME=antidote@127.0.0.1 ; \ + export COOKIE=antidote ; \ + export ROOT_DIR_PREFIX=$$NODE_NAME/ ; \ + _build/default/rel/antidote/bin/antidote console ${ARGS} rel: $(REBAR) release @@ -58,10 +55,13 @@ compile-utils: compile done test: - mkdir -p eunit_logs ${REBAR} eunit skip_deps=true coverage: + # copy the coverdata files with a wildcard filter + # won't work if there are multiple folders (multiple systests) + cp logs/*/*singledc*/../all.coverdata ${COVERPATH}/singledc.coverdata ; \ + cp logs/*/*multidc*/../all.coverdata ${COVERPATH}/multidc.coverdata ; \ ${REBAR} cover --verbose singledc: compile-utils rel @@ -94,12 +94,9 @@ dialyzer: ${REBAR} dialyzer docker-build: - DOCKERTMPDIR="$(shell mktemp -d ./docker-tmpdir.XXXXXXXX)" ; \ - wget "https://raw.githubusercontent.com/AntidoteDB/docker-antidote/master/local-build/Dockerfile" -O "$$DOCKERTMPDIR/Dockerfile" ; \ - wget "https://raw.githubusercontent.com/AntidoteDB/docker-antidote/master/local-build/entrypoint.sh" -O "$$DOCKERTMPDIR/entrypoint.sh" ; \ - wget "https://raw.githubusercontent.com/AntidoteDB/docker-antidote/master/local-build/start_and_attach.sh" -O "$$DOCKERTMPDIR/start_and_attach.sh" ; \ - docker build -f $$DOCKERTMPDIR/Dockerfile --build-arg DOCKERFILES=$$DOCKERTMPDIR -t antidotedb:local-build . ; \ - [ ! -d $$DOCKERTMPDIR ] || rm -r $$DOCKERTMPDIR + tmpdir=`mktemp -d` ; \ + wget "https://raw.githubusercontent.com/AntidoteDB/docker-antidote/v0.2.1/local-build/Dockerfile" -O "$$tmpdir/Dockerfile" ; \ + docker build -f $$tmpdir/Dockerfile -t antidotedb:local-build . docker-run: docker-build docker run -d --name antidote -p "8087:8087" antidotedb:local-build @@ -108,4 +105,3 @@ docker-clean: ifneq ($(docker images -q antidotedb:local-build 2> /dev/null), "") docker image rm -f antidotedb:local-build endif - [ ! 
-d docker-tmpdir* ] || rm -r docker-tmpdir* diff --git a/README.md b/README.md index 44341f6ba..12b0dffa9 100644 --- a/README.md +++ b/README.md @@ -1,57 +1,75 @@ -Antidote +AntidoteDB ============ + [![Build Status](https://travis-ci.org/AntidoteDB/antidote.svg?branch=master)](https://travis-ci.org/AntidoteDB/antidote) [![Coverage Status](https://coveralls.io/repos/github/AntidoteDB/antidote/badge.svg?branch=master)](https://coveralls.io/github/AntidoteDB/antidote?branch=master) -Welcome to the Antidote repository, the reference platform of the [SyncFree European Project](https://syncfree.lip6.fr/) and the [LightKone European Project](https://www.lightkone.eu/) - -You will find all information in the documentation at [http://antidotedb.eu](http://antidotedb.eu). +Welcome to the Antidote repository, the reference platform of the [SyncFree European Project](https://syncfree.lip6.fr/) and the [LightKone European Project](https://www.lightkone.eu/). -For benchmarking Antidote deployments, checkout the [Antidote Benchmarks](https://github.com/AntidoteDB/Benchmarks) repository. +Description +=========== +AntidoteDB is a highly available geo-replicated key-value database. +AntidoteDB provides features that help programmers write correct applications while having the same performance and horizontal scalability as AP/NoSQL databases. +Furthermore, AntidoteDB operations are based on the principle of synchronization-free execution by using conflict-free replicated data types (*CRDTs*). -Development ------------ +Features +========= -Antidote requires Erlang 21 or greater. +**CRDTs** -Use the following `Makefile` targets to build and test antidote: +High-level replicated data types that are designed to work correctly in the presence of concurrent updates and partial failures. - # compile the project: - make compile +**Highly Available Transactions** - # run the unit tests: - make test +Traditional ACID transactions were built for single-machine deployments and are expensive to implement in distributed deployments. +Highly available transactions (HATs) are a compromise: +they provide strong consistency within a data center, +but still perform well in geo-replicated deployments. - # run the system tests: - make systests +**Geo-replication** - # Run dialyzer to check types: - make dialyzer +Designed to run on multiple servers in locations distributed world-wide. +It continues to function even in the presence of failures or network partitions. - # Open a shell: - make shell - # Build a release: - make rel +How to Use +========== +You will find all information on the [project website](http://antidotedb.eu) or the [usage documentation](https://antidotedb.gitbook.io/documentation/). -### Code style +Small tutorials on how to use Antidote can be found for [Java](https://github.com/AntidoteDB/antidote-java-tutorial) +and [Jupyter Notebook](https://github.com/AntidoteDB/antidote-jupyter-notebook). -Before commiting code run `make lint` to check the code style.
+Topics: -In addition there are the following rules which are not checked automatically: +* [Configuring Features of Antidote](https://antidotedb.gitbook.io/documentation/architecture/configuration) +* [Benchmarking Antidote](https://github.com/AntidoteDB/Benchmarks) +* Deploying Antidote + * [Natively](https://antidotedb.gitbook.io/documentation/deployment/native) + * [Local Docker setup](https://antidotedb.gitbook.io/documentation/deployment/docker) + * [Docker compose setups](https://antidotedb.gitbook.io/documentation/deployment/docker-compose-setup) + * [Docker Swarm](https://antidotedb.gitbook.io/documentation/deployment/dockerswarm) + * [Kubernetes](https://antidotedb.gitbook.io/documentation/deployment/kubernetes) +* [Monitoring an Antidote instance or data center](https://github.com/AntidoteDB/antidote_stats) +* [Protocol Buffer API](https://antidotedb.gitbook.io/documentation/api/protocol-buffer-api) + * [Erlang Client Repository](https://github.com/AntidoteDB/antidote-erlang-client) + * [Java Client Repository](https://github.com/AntidoteDB/antidote-java-client) + * [JavaScript Client Repository](https://github.com/AntidoteDB/antidote_ts_client) + * [Go Client Repository](https://github.com/AntidoteDB/antidote-go-client) + * [Python Client Repository](https://github.com/AntidoteDB/antidote-python-client) + * [REST Client Repository](https://github.com/LightKone/antidote-rest-server) -- Indentation should use 4 spaces (no tabs) -- Exported functions must be documented and have a type specification +Applications that use AntidoteDB: -### Working on dependencies +* [Calendar App](https://github.com/AntidoteDB/calender-app) +* [Antidote Web Shell](https://github.com/AntidoteDB/antidote-web-shell) -When working on dependencies of Antidote it can be helpful to use them as [Checkout Dependencies](https://www.rebar3.org/docs/dependencies#section-checkout-dependencies): -- Create a folder named `_checkouts` in your `antidote` folder (next to the `_build` folder) -- Clone the dependency into that folder. The folder name in `_checkouts` must be the name of the dependency in `rebar.config`. - Note that symbolic links in the `_checkouts` folder are ignored. -- When running a rebar3 task on Antidote, it will always use the latest version from the dependencies. It will also recompile all other dependencies, which can be avoided [by patching rebar3](https://github.com/erlang/rebar3/issues/2152) +Contributing & Development +============== +Antidote encourages open-source development. +If you want to contribute, you can find all necessary information in the [developer documentation](https://antidotedb.gitbook.io/documentation/development/setup). +To make yourself familiar with AntidoteDB, you can start by looking at [good first issues](https://github.com/AntidoteDB/antidote/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). diff --git a/TESTING.md b/TESTING.md deleted file mode 100644 index 3d8114f0c..000000000 --- a/TESTING.md +++ /dev/null @@ -1,109 +0,0 @@ -Antidote Testing -================ - -* [Unit Tests](#unit-tests) -* [System Tests](#system-tests) -* [Release Tests](#release-tests) -* [Test Logging](#test-logging) -* [Utility Functions](#utility-functions) - -Unit Tests ---------- - -EUnit is used for unit testing. -Unit tests reside in the module they are testing and test the module in isolation. -The basic test skeleton should be encapsulated in a TEST block. - - -``` --ifdef(TEST). -main_test_() -> - {foreach, - fun setup/0, fun cleanup/1, - [ fun my_test/1, ... ] - }.
- -% Setup and Cleanup -setup() -> io:format(user,"setup",[]), ... - -cleanup(Args_from_setup_return) -> ... - -... --endif. -``` - -* All tests can then be executed via `make test`. -* A single module can be tested via `rebar3 eunit --module=MODULE_NAME` -* Log output is consumed; use `io:format/3` for debugging purposes - - -System Tests ----------------------------------- - -System test suite tests a certain aspect of the system and can be either a single data center or a multi-datacenter suite. - -A suite should be name `..._SUITE.erl` by convention. - -To execute system tests, execute one of the following commands - - make singledc - make singledc SUITE=... - make multidc - make multidc SUITE=... - make systests - - -#### Bucket Namespace - -Every system test should have its own bucket namespace defined at the top of the suite. -Unique bucket namespaces ensure that test suites do not interfere with each other. -The convention is filename with SUITE replaced by bucket. -E.g., the bucket for `antidote_SUITE.erl` should be defined as - - -define(BUCKET, antidote_bucket). - -and the macro should be used *once* at the start of a test case, defining the bucket to be used for that test case. -If a unique bucket *per test case* is needed (e.g. for parallel tests), then - - -define(BUCKET, test_utils:bucket(antidote_bucket)). - -can be used instead. - - -#### Multiple Data Center System Test Setup - -System tests which use multiple data centers are located in the test/multidc folder. -The fixed test setup is as follows: - -* 3 interconnected Data center -* Data center one has two physical nodes - - - -Release Tests -------------- - -`make reltest` tests the antidote release executable. - - -Test Logging -------------- - -To log messages in a test suite, use the function `ct:log`. If a message should be visibile in the printed log (e.g. in travis), then `ct:pal` or `ct:print` can be used. - - -Utility Functions ------------------ - -There are multiple help functions already implemented for common functionality in the `test/utils` folder. They are separated by functionality into the following files: - -* `antidote_utils` - * Functions to manipulate antidote data types -* `riak_utils` - * Riak cluster management. Useful for multi-dc tests. -* `time_utils` - * Time-based helper functions -* `test_utils` - * Test initialization and other helper functions which do not fit into the other categories - - diff --git a/bin/env b/bin/env deleted file mode 100755 index c2351af3e..000000000 --- a/bin/env +++ /dev/null @@ -1,117 +0,0 @@ -#!/bin/bash - -# Handoff port. -if [ -z "$HANDOFF_PORT" ]; then - export HANDOFF_PORT=8099 -fi - -# Protocol buffer port. -if [ -z "$PB_PORT" ]; then - export PB_PORT=8087 -fi - -# Protocol buffer ip. -if [ -z "$PB_IP" ]; then - export PB_IP="127.0.0.1" -fi - -# Pub/sub port. -if [ -z "$PUBSUB_PORT" ]; then - export PUBSUB_PORT=8086 -fi - -# Logeader port. -if [ -z "$LOGREADER_PORT" ]; then - export LOGREADER_PORT=8085 -fi - -# Metrics port. -if [ -z "$METRICS_PORT" ]; then - export METRICS_PORT=3001 -fi - -# Ring State directory. -if [ -z "$RING_STATE_DIR" ]; then - export RING_STATE_DIR="data/ring" -fi - -# Platform data directory. -if [ -z "$PLATFORM_DATA_DIR" ]; then - export PLATFORM_DATA_DIR="data" -fi - -# If we're running in Mesos... -if [ ! -z "$MESOS_TASK_ID" ]; then - # Choose publicly routable IP. 
- if [ -z "$IP" ]; then - export IP=$(ip route get 8.8.8.8 | head -1 | cut -d' ' -f8) - fi - - # Caveat: there's only one antidote supported per node because of - # EPMD, so we will address it by name@IP. - - # Choose the hostname for the epmd long name if the hostname exists - # and if it resolves through the resolver; using a resolvable name - # that's only resolvable with resolv.conf won't work for long names. - if [ ! -z "$HOSTNAME" ]; then - if /usr/bin/dig ${HOSTNAME} | grep -q 'NXDOMAIN' - export NODE_NAME=antidote@${HOSTNAME} - then - export NODE_NAME=antidote@${IP} - fi - fi - - # Else, default to IP. - if [ -z "$NODE_NAME" ]; then - export NODE_NAME=andtidote@${IP} - fi - - # Handoff port. - export HANDOFF_PORT=${PORT1} - - # Protocol buffer port. - export PB_PORT=${PORT2} - - # Pub/sub port. - export PUBSUB_PORT=${PORT3} - - # Logeader port. - export LOGREADER_PORT=${PORT4} - - # Metrics port. - export METRICS_PORT=${PORT5} -fi - -# Assume 127.0.0.1 as bind host. -if [ -z "$IP" ]; then - echo "IP address not set; defaulting to 127.0.0.1." - export IP=127.0.0.1 -fi - -if [ -z "$NODE_NAME" ]; then - export NODE_NAME=antidote@${IP} -fi - -if [ -z "$COOKIE" ]; then - export COOKIE=antidote -fi - -export RELX_REPLACE_OS_VARS=true - -# Print task id and ports only if assigned -[[ -z "${MESOS_TASK_ID}" ]] || echo "MESOS_TASK_ID: ${MESOS_TASK_ID}" -[[ -z "${PORT0}" ]] || echo "PORT0: ${PORT0}" -[[ -z "${PORT1}" ]] || echo "PORT0: ${PORT1}" -[[ -z "${PORT2}" ]] || echo "PORT0: ${PORT2}" -[[ -z "${PORT3}" ]] || echo "PORT0: ${PORT3}" -[[ -z "${PORT4}" ]] || echo "PORT0: ${PORT4}" -[[ -z "${PORT5}" ]] || echo "PORT0: ${PORT5}" - -echo "NODE_NAME: ${NODE_NAME}" -echo "COOKIE: ${COOKIE}" -echo "IP: ${IP}" -echo "HOSTNAME: ${HOSTNAME}" - -# Execute antidote release script -RELNAME="`dirname \"$0\"`"/antidote -exec ${RELNAME} "$@" diff --git a/bin/launch-nodes.sh b/bin/launch-nodes.sh deleted file mode 100755 index a5ae61b00..000000000 --- a/bin/launch-nodes.sh +++ /dev/null @@ -1,9 +0,0 @@ -#!/bin/bash - -export RELX_REPLACE_OS_VARS=true - -for i in `seq 1 10`; -do - PLATFORM_DATA_DIR="data/${i}" RING_STATE_DIR="data/ring/${i}" HANDOFF_PORT=8${i}99 PB_PORT=8${i}87 PUBSUB_PORT=8${i}86 LOGREADER_PORT=8${i}85 METRICS_PORT=300${i} NODE_NAME=antidote-${i} _build/default/rel/antidote/bin/antidote foreground & - sleep 1 -done diff --git a/config/lager.config b/config/lager.config new file mode 100644 index 000000000..7858afa48 --- /dev/null +++ b/config/lager.config @@ -0,0 +1,12 @@ +[ + %% stop lager completely + {lager, [ + {error_logger_whitelist, [error_logger]}, + % stop redirecting error_logger messages + {error_logger_redirect, false}, + % stop writing crash log + {crash_log, false}, + % use our redirection backend + {handlers, [{antidote_lager_backend, [debug]}]} + ]} +]. diff --git a/config/legacy/riak_core.schema b/config/legacy/riak_core.schema new file mode 100644 index 000000000..414cf96a2 --- /dev/null +++ b/config/legacy/riak_core.schema @@ -0,0 +1,729 @@ +%%-*- mode: erlang -*- + +%% @doc enable active anti-entropy subsystem +{mapping, "anti_entropy", "riak_core.anti_entropy", [ + {datatype, {enum, [on, off, debug]}}, + {default, on} +]}. + +{ translation, + "riak_core.anti_entropy", + fun(Conf) -> + Setting = cuttlefish:conf_get("anti_entropy", Conf), + case Setting of + on -> {on, []}; + debug -> {on, [debug]}; + off -> {off, []}; + _Default -> {on, []} + end + end +}. + +%% @doc Restrict how fast AAE can build hash trees. 
Building the tree +%% for a given partition requires a full scan over that partition's +%% data. Once built, trees stay built until they are expired. +%% Config is of the form: +%% {num-builds, per-timespan} +%% Default is 1 build per hour. +{mapping, "anti_entropy.build_limit.number", "riak_core.anti_entropy_build_limit", [ + {default, 1}, + {datatype, integer} +]}. + +{mapping, "anti_entropy.build_limit.per_timespan", "riak_core.anti_entropy_build_limit", [ + {default, "1h"}, + {datatype, {duration, ms}} +]}. + +{translation, + "riak_core.anti_entropy_build_limit", + fun(Conf) -> + {cuttlefish:conf_get("anti_entropy.build_limit.number", Conf), + cuttlefish:conf_get("anti_entropy.build_limit.per_timespan", Conf)} + end}. + +%% @doc Determine how often hash trees are expired after being built. +%% Periodically expiring a hash tree ensures the on-disk hash tree +%% data stays consistent with the actual k/v backend data. It also +%% helps Riak identify silent disk failures and bit rot. However, +%% expiration is not needed for normal AAE operation and should be +%% infrequent for performance reasons. The time is specified in +%% milliseconds. The default is 1 week. +{mapping, "anti_entropy.expire", "riak_core.anti_entropy_expire", [ + {default, "1w"}, + {datatype, {duration, ms}} +]}. + +%% @doc Limit how many AAE exchanges/builds can happen concurrently. +{mapping, "anti_entropy.concurrency", "riak_core.anti_entropy_concurrency", [ + {default, 2}, + {datatype, integer} +]}. + +%% @doc The tick determines how often the AAE manager looks for work +%% to do (building/expiring trees, triggering exchanges, etc). +%% The default is every 15 seconds. Lowering this value will +%% speedup the rate that all replicas are synced across the cluster. +%% Increasing the value is not recommended. +{mapping, "anti_entropy.tick", "riak_core.anti_entropy_tick", [ + {default, "15s"}, + {datatype, {duration, ms}} +]}. + +%% @doc The directory where AAE hash trees are stored. +{mapping, "anti_entropy.data_dir", "riak_core.anti_entropy_data_dir", [ + {default, "/anti_entropy"} +]}. + + +%% @doc This parameter defines the percentage, 1 to 100, of total +%% server memory to assign to leveldb. leveldb will dynamically +%% adjust it internal cache sizs as Riak activates / inactivates +%% vnodes on this server to stay within this size. The memory size +%% can alternately be assigned as a byte count via total_leveldb_mem instead. +{mapping, "anti_entropy.total_leveldb_mem_percent", "riak_core.aae_total_leveldb_mem_percent", + [{default, "80"}, + {datatype, integer}]}. + + +%% @doc This parameter defines the number of bytes of +%% server memory to assign to leveldb. leveldb will dynamically +%% adjust it internal cache sizes as Riak activates / inactivates +%% vnodes on this server to stay within this size. The memory size +%% can alternately be assigned as percentage of total server memory +%% via total_leveldb_mem_percent instead. +{mapping, "anti_entropy.total_leveldb_mem", "riak_core.aae_total_leveldb_mem", + [{datatype, bytesize}, + {level, advanced}]}. + + +%% @doc The 'sync' parameter defines how new key/value data is placed in the +%% recovery log. The recovery log is only used if the Riak program crashes or +%% the server loses power unexpectedly. The parameter's original intent was +%% to guarantee that each new key / value was written to the physical disk +%% before leveldb responded with “write good”. 
The reality in modern servers +%% is that many layers of data caching exist between the database program and +%% the physical disks. This flag influences only one of the layers. +{mapping, "anti_entropy.sync", "riak_core.aae_sync", + [{default, false}, + {datatype, {enum, [true, false]}}, + {level, advanced}]}. + +%% @doc limited_developer_mem is a Riak specific option that is used when +%% a developer is testing a high number of vnodes and/or several VMs +%% on a machine with limited physical memory. Do NOT use this option +%% if making performance measurements. This option overwrites values +%% given to write_buffer_size_min and write_buffer_size_max. +{mapping, "anti_entropy.limited_developer_mem", "riak_core.aae_limited_developer_mem", + [{default, false}, + {datatype, {enum, [true, false]}}, + {level, advanced}]}. + + +%% @doc Each vnode first stores new key/value data in a memory based write +%% buffer. This write buffer is in parallel to the recovery log mentioned +%% in the “sync” parameter. Riak creates each vnode with a randomly sized +%% write buffer for performance reasons. The random size is somewhere +%% between write_buffer_size_min and write_buffer_size_max. +{mapping, "anti_entropy.write_buffer_size_min", "riak_core.aae_write_buffer_size_min", + [{default, "30MB"}, + {datatype, bytesize}, + {level, advanced}]}. + +{mapping, "anti_entropy.write_buffer_size_max", "riak_core.aae_write_buffer_size_max", + [{default, "60MB"}, + {datatype, bytesize}, + {level, advanced}]}. + +%% @doc Whether the distributed throttle for active anti-entropy is +%% enabled. +{mapping, "anti_entropy.throttle", "riak_core.aae_throttle_kill_switch", [ + {default, on}, + {datatype, {flag, off, on}}, + hidden +]}. + +%% @doc Sets the throttling tiers for active anti-entropy. Each tier +%% is a minimum vnode mailbox size and a time-delay that the throttle +%% should observe at that size and above. For example: +%% +%% anti_entropy.throttle.tier1.mailbox_size = 0 +%% anti_entropy.throttle.tier1.delay = 0ms +%% anti_entropy.throttle.tier2.mailbox_size = 40 +%% anti_entropy.throttle.tier2.delay = 5ms +%% +%% If configured, there must be a tier which includes a mailbox size +%% of 0. Both .mailbox_size and .delay must be set for each tier. +%% @see anti_entropy.throttle +{mapping, + "anti_entropy.throttle.$tier.mailbox_size", + "riak_core.aae_throttle_limits", [ + {datatype, integer}, + hidden, + {validators, ["non_negative"]} +]}. + +%% @see anti_entropy.throttle.$tier.mailbox_size +{mapping, + "anti_entropy.throttle.$tier.delay", + "riak_core.aae_throttle_limits", [ + {datatype, {duration, ms}}, + hidden +]}. + +{validator, + "non_negative", + "must be greater than or equal to 0", + fun(Value) -> Value >= 0 end}. + +{translation, + "riak_core.aae_throttle_limits", + fun(Conf) -> + %% Grab all of the possible names of tiers so we can ensure that + %% both mailbox_size and delay are included for each tier. 
+ TierNamesM = cuttlefish_variable:fuzzy_matches(["anti_entropy", "throttle", "$tier", "mailbox_size"], Conf), + TierNamesD = cuttlefish_variable:fuzzy_matches(["anti_entropy", "throttle", "$tier", "delay"], Conf), + TierNames = lists:usort(TierNamesM ++ TierNamesD), + Throttles = lists:sort(lists:foldl( + fun({"$tier", Tier}, Settings) -> + Mbox = cuttlefish:conf_get(["anti_entropy", "throttle", Tier, "mailbox_size"], Conf), + Delay = cuttlefish:conf_get(["anti_entropy", "throttle", Tier, "delay"], Conf), + [{Mbox - 1, Delay}|Settings] + end, [], TierNames)), + case Throttles of + %% -1 is a magic "minimum" bound and must be included, so if it + %% isn't present we call it invalid + [{-1,_}|_] -> Throttles; + _ -> cuttlefish:invalid("anti_entropy.throttle tiers must include a tier with mailbox_size 0") + end +end +}. + + +%% @doc Each database .sst table file can include an optional "bloom filter" +%% that is highly effective in shortcutting data queries that are destined +%% to not find the requested key. The bloom_filter typically increases the +%% size of an .sst table file by about 2%. This option must be set to true +%% in the riak.conf to take effect. +{mapping, "anti_entropy.bloomfilter", "riak_core.aae_use_bloomfilter", + [{default, on}, + {datatype, {enum, [on, off]}}]}. + +{translation, + "riak_core.aae_use_bloomfilter", + fun(Conf) -> + case cuttlefish:conf_get("anti_entropy.bloomfilter", Conf) of + on -> true; + off -> false; + _ -> true + end + end +}. + + +%% @doc sst_block_size defines the size threshold for a block / chunk of data +%% within one .sst table file. Each new block gets an index entry in the .sst +%% table file's master index. +{mapping, "anti_entropy.block_size", "riak_core.aae_sst_block_size", +[{default, "4KB"}, + {datatype, bytesize}, + {level, advanced}]}. + + +%% @doc block_restart_interval defines the key count threshold for a new key +%% entry in the key index for a block. +%% Most clients should leave this parameter alone. +{mapping, "anti_entropy.block_restart_interval", "riak_core.aae_block_restart_interval", + [{default, 16}, + {datatype, integer}, + {level, advanced}]}. + + +%% @doc verify_checksums controls whether or not validation occurs when Riak +%% requests data from the leveldb database on behalf of the user. +{mapping, "anti_entropy.verify_checksums", "riak_core.aae_verify_checksums", + [{default, true}, + {datatype, {enum, [true, false]}}, + {level, advanced}]}. + + +%% @doc verify_compaction controls whether or not validation occurs when +%% leveldb reads data as part of its background compaction operations. +{mapping, "anti_entropy.verify_compaction", "riak_core.aae_verify_compaction", + [{default, true}, + {datatype, {enum, [true, false]}}, + {level, advanced}]}. + +%% @doc The number of worker threads performing LevelDB operations. +{mapping, "anti_entropy.threads", "riak_core.aae_eleveldb_threads", + [{default, 71}, + {datatype, integer}, + {level, advanced}]}. + +%% @doc Option to override LevelDB's use of fadvise(DONTNEED) with +%% fadvise(WILLNEED) instead. WILLNEED can reduce disk activity on +%% systems where physical memory exceeds the database size. +{mapping, "anti_entropy.fadvise_willneed", "riak_core.aae_fadvise_willneed", + [{default, false}, + {datatype, {enum, [true, false]}}, + {level, advanced}]}. + +%% Default Bucket Properties + +%% @doc The number of replicas stored. Note: See Replication +%% Properties for further discussion. 
+%% http://docs.basho.com/riak/latest/dev/advanced/cap-controls/ +{mapping, "buckets.default.n_val", "riak_core.default_bucket_props.n_val", [ + {datatype, integer}, + {default, 3}, + hidden +]}. + +%% @doc Number of partitions in the cluster (only valid when first +%% creating the cluster). Must be a power of 2, minimum 8 and maximum +%% 1024. +{mapping, "ring_size", "riak_core.ring_creation_size", [ + {datatype, integer}, + {default, 64}, + {validators, ["ring_size^2", "ring_size_max", "ring_size_min"]}, + {commented, 64} +]}. + +%% ring_size validators +{validator, "ring_size_max", + "2048 and larger are supported, but considered advanced config", + fun(Size) -> + Size =< 1024 + end}. + +{mapping, "buckets.default.pr", "riak_core.default_bucket_props.pr", [ + {default, "0"}, + {level, advanced} +]}. + +%% Cut and paste translation screams to be rewritten as a datatype, but that's a +%% "nice to have" +{translation, + "riak_core.default_bucket_props.pr", + fun(Conf) -> + Setting = cuttlefish:conf_get("buckets.default.pr", Conf), + case Setting of + "quorum" -> quorum; + "all" -> all; + X -> + try list_to_integer(Setting) of + Int -> Int + catch + E:R -> error + end + end + end +}. + +{mapping, "buckets.default.r", "riak_core.default_bucket_props.r", [ + {default, "quorum"}, + {level, advanced} +]}. +{translation, + "riak_core.default_bucket_props.r", + fun(Conf) -> + Setting = cuttlefish:conf_get("buckets.default.r", Conf), + case Setting of + "quorum" -> quorum; + "all" -> all; + X -> + try list_to_integer(Setting) of + Int -> Int + catch + E:R -> error + end + end + end +}. + +{mapping, "buckets.default.w", "riak_core.default_bucket_props.w", [ + {default, "quorum"}, + {level, advanced} +]}. +{translation, + "riak_core.default_bucket_props.w", + fun(Conf) -> + Setting = cuttlefish:conf_get("buckets.default.w", Conf), + case Setting of + "quorum" -> quorum; + "all" -> all; + X -> + try list_to_integer(Setting) of + Int -> Int + catch + E:R -> error + end + end + end +}. + +{mapping, "buckets.default.pw", "riak_core.default_bucket_props.pw", [ + {default, "0"}, + {level, advanced} +]}. +{translation, + "riak_core.default_bucket_props.pw", + fun(Conf) -> + Setting = cuttlefish:conf_get("buckets.default.pw", Conf), + case Setting of + "quorum" -> quorum; + "all" -> all; + X -> + try list_to_integer(Setting) of + Int -> Int + catch + E:R -> error + end + end + end +}. + +{mapping, "buckets.default.dw", "riak_core.default_bucket_props.dw", [ + {default, "quorum"}, + {level, advanced} +]}. +{translation, + "riak_core.default_bucket_props.dw", + fun(Conf) -> + Setting = cuttlefish:conf_get("buckets.default.dw", Conf), + case Setting of + "quorum" -> quorum; + "all" -> all; + X -> + try list_to_integer(Setting) of + Int -> Int + catch + E:R -> error + end + end + end +}. + +{mapping, "buckets.default.rw", "riak_core.default_bucket_props.rw", [ + {default, "quorum"}, + {level, advanced} +]}. +{translation, + "riak_core.default_bucket_props.rw", + fun(Conf) -> + Setting = cuttlefish:conf_get("buckets.default.rw", Conf), + case Setting of + "quorum" -> quorum; + "all" -> all; + X -> + try list_to_integer(Setting) of + Int -> Int + catch + E:R -> error + end + end + end +}. + +%% {mapping, "buckets.default.basic_quorum", "riak_core.default_bucket_props.basic_quorum", false}, +%% {mapping, "buckets.default.notfound_ok", "riak_core.default_bucket_props.notfound_ok", true} + +%% @doc whether or not siblings are allowed. +%% Note: See Vector Clocks for a discussion of sibling resolution. 
+{mapping, "buckets.default.siblings", "riak_core.default_bucket_props.allow_mult", [ + {datatype, {enum, [on, off]}}, + {default, on}, + {level, advanced} +]}. + +{translation, + "riak_core.default_bucket_props.allow_mult", + fun(Conf) -> + Setting = cuttlefish:conf_get("buckets.default.siblings", Conf), + case Setting of + on -> true; + off -> false; + _Default -> true + end + end}. + +{validator, "ring_size^2", "not a power of 2", + fun(Size) -> + (Size band (Size-1) =:= 0) + end}. + +{validator, "ring_size_min", "must be at least 8", + fun(Size) -> + Size >= 8 + end}. + +%% @doc Number of concurrent node-to-node transfers allowed. +{mapping, "transfer_limit", "riak_core.handoff_concurrency", [ + {datatype, integer}, + {default, 2}, + {commented, 2} +]}. + +%% @doc Default location of ringstate +{mapping, "ring.state_dir", "riak_core.ring_state_dir", [ + {datatype, directory}, + {default, "$(platform_data_dir)/ring"}, + hidden +]}. + +%% @doc Default cert location for https can be overridden +%% with the ssl config variable, for example: +{mapping, "ssl.certfile", "riak_core.ssl.certfile", [ + {datatype, file}, + {commented, "$(platform_etc_dir)/cert.pem"} +]}. + +%% @doc Default key location for https can be overridden with the ssl +%% config variable, for example: +{mapping, "ssl.keyfile", "riak_core.ssl.keyfile", [ + {datatype, file}, + {commented, "$(platform_etc_dir)/key.pem"} +]}. + +%% @doc Default signing authority location for https can be overridden +%% with the ssl config variable, for example: +{mapping, "ssl.cacertfile", "riak_core.ssl.cacertfile", [ + {datatype, file}, + {commented, "$(platform_etc_dir)/cacertfile.pem"} +]}. + +%% @doc handoff.ip is the network address that Riak binds to for +%% intra-cluster data handoff. +{mapping, "handoff.ip", "riak_core.handoff_ip", [ + {default, "" }, + {datatype, string}, + {validators, ["valid_ipaddr", "not_localhost"]}, + hidden +]}. + +{validator, + "valid_ipaddr", + "must be a valid IP address", + fun(AddrString) -> + case inet_parse:address(AddrString) of + {ok, _} -> true; + {error, _} -> false + end + end}. + +{validator, + "not_localhost", + "can't be a local ip", + fun(AddrString) -> + case inet_parse:address(AddrString) of + {ok, {127, 0, _, _}} -> false; + {ok, _} -> true; + {error, _} -> false + end + end}. + +%% @doc handoff.port is the TCP port that Riak uses for +%% intra-cluster data handoff. +{mapping, "handoff.port", "riak_core.handoff_port", [ + {default, 9999}, + {datatype, integer}, + hidden +]}. + +%% @doc To encrypt riak_core intra-cluster data handoff traffic, +%% uncomment the following line and edit its path to an appropriate +%% certfile and keyfile. (This example uses a single file with both +%% items concatenated together.) +{mapping, "handoff.ssl.certfile", "riak_core.handoff_ssl_options.certfile", [ +%% {commented, "/tmp/erlserver.pem"}, + {datatype, file}, + hidden +]}. + +%% @doc if you need a seperate keyfile for handoff +{mapping, "handoff.ssl.keyfile", "riak_core.handoff_ssl_options.keyfile", [ + {datatype, file}, + hidden +]}. + +%% @doc Enables/disables outbound handoff transfers for this node. If you +%% turn this setting off at runtime with riak-admin, it will kill any +%% outbound handoffs currently running. +{mapping, "handoff.outbound", "riak_core.disable_outbound_handoff", [ + {default, on}, + {datatype, {flag, off, on}}, + hidden +]}. + +%% @doc Enables/disables inbound handoff transfers for this node. 
If you +%% turn this setting off at runtime with riak-admin, it will kill any +%% inbound handoffs currently running. +{mapping, "handoff.inbound", "riak_core.disable_inbound_handoff", [ + {default, on}, + {datatype, {flag, off, on}}, + hidden +]}. + +%% @doc The time a vnode has to be idle for a handoff to occour. (I think) +{mapping, "handoff.inactivity_timeout", "riak_core.vnode_inactivity_timeout", [ + {default, "1m"}, + {datatype, {duration, ms}} +]}. + +%% @doc DTrace support Do not enable 'dtrace' unless your Erlang/OTP +%% runtime is compiled to support DTrace. DTrace is available in +%% R15B01 (supported by the Erlang/OTP official source package) and in +%% R14B04 via a custom source repository & branch. +{mapping, "dtrace", "riak_core.dtrace_support", [ + {default, off}, + {datatype, flag} +]}. + +%% consistent on/off (in lieu of enabled/disabled, true/false) +{ translation, + "riak_core.dtrace_support", + fun(Conf) -> + Setting = cuttlefish:conf_get("dtrace", Conf), + case Setting of + on -> true; + off -> false; + _Default -> false + end + end +}. + +%% @doc Platform-specific installation paths (substituted by rebar) +{mapping, "platform_bin_dir", "riak_core.platform_bin_dir", [ + {datatype, directory}, + {default, ""} +]}. + +%% @see platform_bin_dir +{mapping, "platform_data_dir", "riak_core.platform_data_dir", [ + {datatype, directory}, + {default, ""} +]}. + +%% @see platform_bin_dir +{mapping, "platform_etc_dir", "riak_core.platform_etc_dir", [ + {datatype, directory}, + {default, ""} +]}. + +%% @see platform_bin_dir +{mapping, "platform_lib_dir", "riak_core.platform_lib_dir", [ + {datatype, directory}, + {default, ""} +]}. + +%% @see platform_bin_dir +{mapping, "platform_log_dir", "riak_core.platform_log_dir", [ + {datatype, directory}, + {default, ""} +]}. + +%% @doc Enable consensus subsystem. Set to 'on' to enable the +%% consensus subsystem used for strongly consistent Riak operations. +{mapping, "strong_consistency", "riak_core.enable_consensus", [ + {datatype, flag}, + {default, off}, + {commented, on} +]}. + +%% @doc Whether to enable the background manager globally. When +%% enabled, participating Riak subsystems will coordinate access to +%% shared resources. This will help to prevent system response +%% degradation under times of heavy load from multiple background +%% tasks. Specific subsystems may also have their own controls over +%% use of the background manager. +{mapping, "background_manager", "riak_core.use_background_manager", [ + {datatype, flag}, + {default, off}, + hidden +]}. + +%% @doc Interval of time between vnode management +%% activities. Modifying this will change the amount of time between +%% attemps to trigger handoff between this node and any other member +%% of the cluster. +{mapping, "vnode_management_timer", "riak_core.vnode_management_timer", [ + {default, "10s"}, + {datatype, {duration, ms}}, + hidden +]}. + +%% @doc Home directory for the run user +{mapping, "run_user_home", "setup.home", + [{default, ""}, + hidden, + {datatype, string}]}. + +%% Async Job Management +%% +%% This is a translation for mappings that appear in other schema files. +%% Mappings are from "cluster.job.$namespace.$operation"* to +%% "riak_core.job_accept_class" with required attributes +%% [merge, {datatype, {flag, enabled, disabled}}].** +%% * Mappings are only performed on elements with exactly the number of +%% segments shown - any other number of elements, even with a matching +%% prefix, is ignored. 
+%% ** The 'datatype' should be 'flag', and 'enabled'/'disabled' are our +%% conventions, but any OnFlag/OffFlag pair can be used as long as they map +%% to boolean values. +%% Other attributes, such as 'hidden' or {default, X} are fine, since they +%% don't make it down the stack to here. +%% Job classes that should be enabled by default MUST have a {default, enabled} +%% attribute, as the runtime filter only defaults to accept when no values have +%% been set from ANY schema file. +%% +%% Example: +%% {mapping, "cluster.job.harry.fold", "riak_core.job_accept_class", [ +%% merge, +%% {datatype, {flag, enabled, disabled}}, +%% {default, enabled} +%% ]}. +%% {mapping, "cluster.job.alice.list", "riak_core.job_accept_class", [ +%% merge, +%% {datatype, {flag, enabled, disabled}}, +%% {default, disabled} +%% ]}. +%% Results in: +%% {riak_core, [ +%% ... +%% {job_accept_class, [{harry, fold}]} +%% ... +%% ]}. +%% +{translation, + "riak_core.job_accept_class", + fun(Conf) -> + Fold = + fun({[_, _, Mod, Op], true}, Result) -> + [{erlang:list_to_atom(Mod), erlang:list_to_atom(Op)} | Result]; + ({[_, _, _, _], false}, Result) -> + Result; + ({[_, _, _, _], _} = Setting, _) -> + cuttlefish:invalid(io_lib:format("~p", [Setting])); + (_, Result) -> + Result + end, + lists:sort(lists:foldl(Fold, [], + cuttlefish_variable:filter_by_prefix(["cluster", "job"], Conf))) + end}. + + +%% @doc Some requests to the vnodes are handled by an asyncronous worker pool. +%% This parameter allows for tuning this pools behaviour when it comes dealing +%% with requests that are queued. +%% The default (fifo) will serve requests in the order they arrive at the worker +%% pool. The alternative is to serve the requests in the reverse order, dealing +%% with the most recent request first. +%% There are pro's and con's for both aproaches, it is best to test out what +%% works best for the desired characteristics. +%% +%% As a very rought rule of thumb: +%% - fifo will lead to lower extremes +%% - filo will lead to lower medians/mediums +{mapping, "worker.queue_strategy", "riak_core.queue_worker_strategy", + [{default, fifo}, + {datatype, {enum, [fifo, filo]}}]}. diff --git a/config/network.config b/config/network.config new file mode 100644 index 000000000..8e600606d --- /dev/null +++ b/config/network.config @@ -0,0 +1,29 @@ +[ + {riak_core, [ + %% riak_handoff_port is the TCP port used for intra-cluster data handoff. + {handoff_port, 8099} + ]}, + + {ranch, [ + %% IP where antidote will listen to for connection requests + {pb_ip, "127.0.0.1"}, + + %% Port for antidote client requests + {pb_port, 8087} + ]}, + + {antidote, [ + {pubsub_port, 8086}, + {logreader_port, 8085} + ]}, + + {antidote_stats, [ + {metrics_port, 3001} + ]} + +%% possible to restrict distributed Erlang ports +%% {kernel, [ +%% {inet_dist_listen_min, 9100}, +%% {inet_dist_listen_max, 9100} +%% ]} +]. diff --git a/config/sys-debug.config b/config/sys-debug.config deleted file mode 100644 index f7dc3e9ee..000000000 --- a/config/sys-debug.config +++ /dev/null @@ -1,45 +0,0 @@ -[ - %% Riak Core config - {riak_core, [ - %% Default location of ringstate - {ring_state_dir, "data.antidote@127.0.0.1/ring"}, - {platform_data_dir, "data.antidote@127.0.0.1"}, - {schema_dirs, [ - "_build/default/lib/cuttlefish/priv/", - "_build/default/lib/riak_core/priv/", - "_build/default/lib/riak_sysmon/priv/", - "_build/default/lib/eleveldb/priv/" - ]}, - - %% riak_handoff_port is the TCP port that Riak uses for - %% intra-cluster data handoff. 
- {handoff_port, 8099} - ]}, - -{kernel, [ - {logger, [ - {handler, default, logger_std_h, - #{level => debug, - formatter => {logger_formatter, #{single_line => false, max_size => 2048}}, - config => #{type => standard_io}}}, - {handler, errors, logger_std_h, - #{level => error, - formatter => {logger_formatter, #{single_line => false, max_size => 2048}}, - config => #{type => {file, "logger_logs/errors.log"}}}} - ]}, - {logger_level, all} - ]}, - - {ranch, [ - {pb_ip, "127.0.0.1"}, - {pb_port, 8087} - ]}, - - {prometheus, [{collectors, [prometheus_process_collector, default]}]}, - - {antidote, [ - {pubsub_port, 8086}, - {logreader_port, 8085}, - {metrics_port, 3001} - ]} -]. diff --git a/config/sys.config b/config/sys.config deleted file mode 100644 index 166073807..000000000 --- a/config/sys.config +++ /dev/null @@ -1,40 +0,0 @@ -[ - %% Riak Core config - {riak_core, [ - %% Default location of ringstate - {ring_state_dir, "${RING_STATE_DIR}"}, - {platform_data_dir, "${PLATFORM_DATA_DIR}"}, - - %% riak_handoff_port is the TCP port that Riak uses for - %% intra-cluster data handoff. - {handoff_port, 8099} - ]}, - - %% logger config -{kernel, [ - {logger, [ - {handler, default, logger_std_h, - #{level => info, - formatter => {logger_formatter, #{single_line => false, max_size => 2048}}, - config => #{type => standard_io}}}, - {handler, errors, logger_std_h, - #{level => error, - formatter => {logger_formatter, #{single_line => false, max_size => 2048}}, - config => #{type => {file, "logger_logs/errors.log"}}}} - ]}, - {logger_level, info} - ]}, - - {ranch, [ - {pb_ip, "${PB_IP}"}, - {pb_port, 8087} - ]}, - - {prometheus, [{collectors, [prometheus_process_collector, default]}]}, - - {antidote, [ - {pubsub_port, 8086}, - {logreader_port, 8085}, - {metrics_port, 3001} - ]} -]. diff --git a/config/sys.config.src b/config/sys.config.src new file mode 100644 index 000000000..69e6317c0 --- /dev/null +++ b/config/sys.config.src @@ -0,0 +1,98 @@ +[ + "network.config", + "lager.config", + + %% logger config + {kernel, [ + {logger, [ + + {handler, default, logger_std_h, + #{level => info, + formatter => {logger_formatter, #{single_line => false, max_size => 2048}}, + config => #{type => standard_io}}}, + + {handler, debug, logger_std_h, + #{level => info, + formatter => {logger_formatter, #{single_line => true, max_size => 2048}}, + config => #{type => {file, "${ROOT_DIR_PREFIX}${LOGGER_DIR_PREFIX}logger_logs/info.log"}}}}, + + {handler, errors, logger_std_h, + #{level => error, + formatter => {logger_formatter, #{single_line => false, max_size => 2048}}, + config => #{type => {file, "${ROOT_DIR_PREFIX}${LOGGER_DIR_PREFIX}logger_logs/errors.log"}}}} + ]}, + {logger_level, info} + ]}, + + + {riak_core, [ + %% riak directories + {ring_state_dir, "${ROOT_DIR_PREFIX}${DATA_DIR_PREFIX}data_riak_core"}, + {platform_data_dir, "${ROOT_DIR_PREFIX}${DATA_DIR_PREFIX}data_riak_core"} + ]}, + + + {setup, [ + %% stops setup from creating strange folders (data@node() and log@node()) + {verify_directories, false} + ]}, + + + {antidote, [ + %% antidote data directory + {data_dir, "${ROOT_DIR_PREFIX}${DATA_DIR_PREFIX}data_antidote"} + + %% stats: + %% Enables or disables metrics collection for this node. Can be disabled on slow nodes to improve performance. + %% true -> enables collecting statistics via the antidote_stats module + %% false -> disables all statistics collection + %{stats, true}, + + %% extended_stats: + %% Expensive metric collection for all erlang processes. 
+ %% true -> process queues and reductions are monitored for busy processes + %% false -> single process monitoring is disabled + %{extended_stats, true}, + + %% txn_cert: + %% true -> write operations are certified during commit, aborting the transaction if a write conflict is detected (i.e. snapshot isolation + %% is ensured for the updates within a single DC, updates across DCs are not checked) + %% false -> transactions perform no certification and always commit (outside of crashes/errors) + %{txn_cert, true}, + + %% txn_prot: + %% clocksi -> uses "Cure" protocol to define snapshots and causal dependencies (https://pages.lip6.fr/Marc.Shapiro/papers/Cure-final-ICDCS16.pdf) + %% gr -> uses "Gentle-rain like" protocol to define snapshots and causal dependencies (https://infoscience.epfl.ch/record/202079) + %{txn_prot, clocksi}, + + %% recover_from_log: + %% true -> on node start will load any operations stored on the disk log to the in memory cache of the key-value store + %% false -> on node start the state of the key-value store will be empty + %{recover_from_log, true}, + + %% recover_meta_data_on_start: + %% true -> meta-data state will be loaded from disk on restart including connection state between other DCs and node names and configurations, + %% nodes will automatically reconnect to other dcs on restart + %% false -> meta-data concerning node names and connections to other dcs will not be loaded on restart + %{recover_meta_data_on_start, true}, + + %% sync_log: + %% true -> local transactions will be stored on log synchronously, i.e. when the reply is sent the updates are guaranteed to be + %% stored to disk (this is very slow in the current logging setup) + %% false -> all updates are sent to the operating system to be stored to disk (eventually), but are not guaranteed to be stored durably on disk + %% when the reply is sent + %{sync_log, false}, + + %% %% enable_logging: + %% true -> writes to disk done by the logging_vnode are enabled + %% false -> writes to disk are disabled, this improves performance when benchmarking. + %% WARNING: disabling logging makes updates non-recoverable after shutting down antidote or under failures. + %{enable_logging, true}, + + %% %% auto_start_read_servers: + %% true -> read servers will start automatically. It should be set to true when Antidote will be run in a single node/machine. + %% false -> read servers will not start automatically. It should be set to false when many Antidote instances will be run in a cluster. + %% In this case, an inter_dc_manager:start_bg_processes(stable) needs to be issued per antidote instance after joining the cluster. + %{auto_start_read_servers, true} + ]} +]. diff --git a/config/vars.config b/config/vars.config deleted file mode 100644 index 791fde94e..000000000 --- a/config/vars.config +++ /dev/null @@ -1,21 +0,0 @@ -%% -%% etc/app.config -%% -{ring_state_dir, "data/ring"}. -{ring_creation_size, "16"}. -{platform_data_dir, "./data"}. -{web_ip, "127.0.0.1"}. -{web_port, "8098"}. -{handoff_port, "8099"}. -{pb_ip, "127.0.0.1"}. -{pb_port, "8087"}. -{pubsub_port, "8086"}. -{logreader_port, "8085"}. -{metrics_port, "3001"}. - - -%% -%% etc/vm.args -%% -{node, "antidote@127.0.0.1"}. -{cookie, "antidote"}.
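The new `config/sys.config.src` above resolves its data and log directories from environment variables (`ROOT_DIR_PREFIX`, `DATA_DIR_PREFIX`, `LOGGER_DIR_PREFIX`), and the node name and cookie are taken from `NODE_NAME` and `COOKIE` instead of the deleted static config files. A minimal sketch of starting a local node by hand the same way the updated `make shell` target does, assuming the release has already been built with `make rel` and using the default values from the Makefile:

```bash
# Start a single local Antidote node from the rebar3 release.
# Values mirror the updated `shell:` target in the Makefile; adjust as needed.
export NODE_NAME=antidote@127.0.0.1
export COOKIE=antidote
# Per-node prefix consumed by sys.config.src for the data_* and logger_logs directories.
export ROOT_DIR_PREFIX=$NODE_NAME/
_build/default/rel/antidote/bin/antidote console
```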
diff --git a/config/vm.args b/config/vm.args deleted file mode 100644 index 72141c452..000000000 --- a/config/vm.args +++ /dev/null @@ -1,42 +0,0 @@ -## Name of the node --name ${NODE_NAME} - -## Cookie for distributed erlang --setcookie ${COOKIE} - -## Heartbeat management; auto-restarts VM if it dies or becomes unresponsive -## (Disabled by default..use with caution!) -##-heart - -## Enable kernel poll and a few async threads -+K true -+A 5 - -## Increase number of concurrent ports/sockets --env ERL_MAX_PORTS 4096 - -## Tweak GC to run more often --env ERL_FULLSWEEP_AFTER 10 - -## Increase max ETS tables. --env ERL_MAX_ETS_TABLES 50000 - -## Increase distribution port buffer size. -+zdbbl 32768 - -## Disable time_warp, because Antidote is not time-warp safe at the moment (see https://github.com/SyncFree/antidote/issues/226) -+C no_time_warp - -## The following section is required beacuse variable replacement can -## only occur with string, not integers. - -## Antidote configuration values coming from the command line. --antidote pubsub_port ${PUBSUB_PORT} --antidote logreader_port ${LOGREADER_PORT} --antidote metrics_port ${METRICS_PORT} - -## Ranch configuration values coming from the command line. --ranch pb_port ${PB_PORT} - -## Riak Core configuration values coming from the command line. --riak_core handoff_port ${HANDOFF_PORT} diff --git a/config/vm.args.src b/config/vm.args.src new file mode 100644 index 000000000..934aac637 --- /dev/null +++ b/config/vm.args.src @@ -0,0 +1,18 @@ +## Name of the node +-name ${NODE_NAME} + +## Cookie for distributed erlang +-setcookie ${COOKIE} + +## Enable kernel poll and a few async threads ++K true ++A 5 + +## Tweak GC to run more often +-env ERL_FULLSWEEP_AFTER 10 + +## Increase distribution port buffer size. ++zdbbl 32768 + +## Enable time warp once Issue #226 is fixed +##+C multi_time_warp diff --git a/include/antidote.hrl b/include/antidote.hrl index b2f48b8ba..4ba5f1ac3 100644 --- a/include/antidote.hrl +++ b/include/antidote.hrl @@ -268,4 +268,4 @@ }). -type snapshot_get_response() :: #snapshot_get_response{}. --define(STATS(Type), gen_server:cast(antidote_stats_collector, Type)). +-define(STATS(Type), case application:get_env(antidote, stats, true) of true -> gen_server:cast(antidote_stats_collector, Type); _ -> ok end). 
diff --git a/monitoring/Antidote-Dashboard.json b/monitoring/Antidote-Dashboard.json deleted file mode 100644 index b64e4427e..000000000 --- a/monitoring/Antidote-Dashboard.json +++ /dev/null @@ -1,1537 +0,0 @@ -{ - "__inputs": [ - { - "name": "DS_PROMETHEUS", - "label": "prometheus", - "description": "", - "type": "datasource", - "pluginId": "prometheus", - "pluginName": "Prometheus" - } - ], - "__requires": [ - { - "type": "grafana", - "id": "grafana", - "name": "Grafana", - "version": "4.4.3" - }, - { - "type": "panel", - "id": "graph", - "name": "Graph", - "version": "" - }, - { - "type": "datasource", - "id": "prometheus", - "name": "Prometheus", - "version": "1.0.0" - }, - { - "type": "panel", - "id": "singlestat", - "name": "Singlestat", - "version": "" - } - ], - "annotations": { - "list": [] - }, - "editable": true, - "gnetId": null, - "graphTooltip": 0, - "hideControls": false, - "id": null, - "links": [], - "refresh": "5s", - "rows": [ - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": { - "ETS Limit": "#508642" - }, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 12, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "DETS Tables", - "yaxis": 1 - } - ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "erlang_vm_ets_limit{instance=\"$node\"}", - "intervalFactor": 2, - "legendFormat": "ETS Limit", - "refId": "A", - "step": 2 - }, - { - "expr": "erlang_vm_ets_tables{instance=\"$node\"}", - "intervalFactor": 2, - "legendFormat": "ETS Tables", - "refId": "B", - "step": 2 - }, - { - "expr": "erlang_vm_dets_tables{instance=\"$node\"}", - "intervalFactor": 2, - "legendFormat": "DETS Tables", - "refId": "C", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "ETS/DETS", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 14, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "erlang_vm_process_limit{instance=\"$node\"}", - "hide": true, - "intervalFactor": 2, - "legendFormat": "Process Limit", - "refId": "A", - "step": 2 - }, - { - "expr": "erlang_vm_process_count{instance=\"$node\"}", - "intervalFactor": 2, - "legendFormat": "Processes", - "refId": "B", - "step": 2 - }, - { - "expr": 
"erlang_vm_statistics_run_queues_length_total{instance=\"$node\"}", - "intervalFactor": 2, - "legendFormat": "Run Queues Length", - "refId": "C", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Processes", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 15, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "Reductions", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 3, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "irate(erlang_vm_statistics_context_switches{instance=\"$node\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Context Switches", - "refId": "B", - "step": 2 - }, - { - "expr": "irate(erlang_vm_statistics_reductions_total{instance=\"$node\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Reductions", - "refId": "C", - "step": 2 - }, - { - "expr": "irate(erlang_vm_statistics_runtime_milliseconds{instance=\"$node\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Runtime", - "refId": "D", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Load", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "cacheTimeout": null, - "colorBackground": false, - "colorValue": false, - "colors": [ - "rgba(245, 54, 54, 0.9)", - "rgba(237, 129, 40, 0.89)", - "rgba(50, 172, 45, 0.97)" - ], - "datasource": "${DS_PROMETHEUS}", - "decimals": 1, - "format": "dtdurations", - "gauge": { - "maxValue": 100, - "minValue": 0, - "show": false, - "thresholdLabels": false, - "thresholdMarkers": true - }, - "id": 16, - "interval": null, - "links": [], - "mappingType": 1, - "mappingTypes": [ - { - "name": "value to text", - "value": 1 - }, - { - "name": "range to text", - "value": 2 - } - ], - "maxDataPoints": 100, - "nullPointMode": "connected", - "nullText": null, - "postfix": "", - "postfixFontSize": "50%", - "prefix": "", - "prefixFontSize": "50%", - "rangeMaps": [ - { - "from": "null", - "text": "N/A", - "to": "null" - } - ], - "span": 3, - "sparkline": { - "fillColor": "rgba(31, 118, 189, 0.18)", - "full": false, - "lineColor": "rgb(31, 120, 193)", - "show": false - }, - "tableColumn": "", - "targets": [ - { - "expr": "process_uptime_seconds{instance=\"$node\"}", - "intervalFactor": 2, - "legendFormat": "", - "refId": 
"A", - "step": 4 - } - ], - "thresholds": "", - "title": "Uptime", - "type": "singlestat", - "valueFontSize": "80%", - "valueMaps": [ - { - "op": "=", - "text": "N/A", - "value": "null" - } - ], - "valueName": "avg" - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 6, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "irate(erlang_vm_statistics_bytes_output_total{instance=\"$node\"}[5m])", - "intervalFactor": 2, - "legendFormat": "Output Bytes", - "metric": "erlang_vm_statistics_bytes_output_total", - "refId": "A", - "step": 2 - }, - { - "expr": "irate(erlang_vm_statistics_bytes_received_total{instance=\"$node\"}[5m])", - "intervalFactor": 2, - "legendFormat": "Received Bytes", - "metric": "erlang_vm_statistics_bytes_received_total", - "refId": "B", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "VM IO", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "decbytes", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "GBs", - "logBase": 1, - "max": null, - "min": null, - "show": false - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 7, - "legend": { - "alignAsTable": false, - "avg": false, - "current": false, - "max": false, - "min": false, - "rightSide": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "Words Reclaimed", - "yaxis": 2 - }, - { - "alias": "Bytes Reclaimed", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "irate(erlang_vm_statistics_garbage_collection_number_of_gcs{instance=\"$node\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Number of GCs", - "metric": "erlang_vm_statistics_garbage_collection_number_of_gcs", - "refId": "A", - "step": 2 - }, - { - "expr": "irate(erlang_vm_statistics_garbage_collection_bytes_reclaimed{instance=\"$node\"}[5m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Bytes Reclaimed", - "metric": "erlang_vm_statistics_garbage_collection_words_reclaimed", - "refId": "B", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "VM GC", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": 
"cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "decbytes", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 3, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "erlang_vm_memory_bytes_total{instance=\"$node\", kind=\"processes\"}", - "intervalFactor": 2, - "legendFormat": "Processes Memory", - "refId": "B", - "step": 2 - }, - { - "expr": "erlang_vm_memory_system_bytes_total{instance=\"$node\", usage=\"atom\"}", - "intervalFactor": 2, - "legendFormat": "Atoms", - "refId": "C", - "step": 2 - }, - { - "expr": "erlang_vm_memory_system_bytes_total{instance=\"$node\", usage=\"binary\"}", - "intervalFactor": 2, - "legendFormat": "Binary", - "refId": "D", - "step": 2 - }, - { - "expr": "erlang_vm_memory_system_bytes_total{instance=\"$node\", usage=\"code\"}", - "intervalFactor": 2, - "legendFormat": "Code", - "refId": "E", - "step": 2 - }, - { - "expr": "erlang_vm_memory_system_bytes_total{instance=\"$node\", usage=\"ets\"}", - "intervalFactor": 2, - "legendFormat": "ETS", - "refId": "F", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "VM Memory", - "tooltip": { - "msResolution": false, - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": "250px", - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 11, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_virtual_memory_bytes{instance=\"$node\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Virtual Memory", - "refId": "A", - "step": 2 - }, - { - "expr": "process_resident_memory_bytes{instance=\"$node\"}", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Resident Memory", - "refId": "B", - "step": 2 - } - ], - 
"thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "OS Process Memory", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "bytes", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 8, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "Max Ports", - "yaxis": 2 - }, - { - "alias": "Ports", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_open_fds{instance=\"$node\"}", - "intervalFactor": 2, - "legendFormat": "Open FDs", - "metric": "", - "refId": "A", - "step": 2 - }, - { - "expr": "process_max_fds{instance=\"$node\"}", - "intervalFactor": 2, - "legendFormat": "Max FDs", - "refId": "B", - "step": 2 - }, - { - "expr": "erlang_vm_port_limit{instance=\"$node\"}", - "intervalFactor": 2, - "legendFormat": "Max Ports", - "refId": "C", - "step": 2 - }, - { - "expr": "erlang_vm_port_count{instance=\"$node\"}", - "intervalFactor": 2, - "legendFormat": "Ports", - "refId": "D", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "File Descriptors & Ports", - "tooltip": { - "msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "none", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "decimals": null, - "editable": true, - "error": false, - "fill": 1, - "grid": {}, - "id": 10, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 2, - "links": [], - "nullPointMode": "connected", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "Threads", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "process_threads_total{instance=\"$node\"}", - "intervalFactor": 2, - "legendFormat": "Threads", - "metric": "", - "refId": "A", - "step": 2 - }, - { - "expr": "sum(irate(process_cpu_seconds_total{instance=\"$node\"}[30s])) without (kind) * 100", - "intervalFactor": 2, - "legendFormat": "CPU", - "refId": "B", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Native Threads & CPU", - "tooltip": { - 
"msResolution": true, - "shared": true, - "sort": 0, - "value_type": "cumulative" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "percent", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "New row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 246, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 20, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": true, - "steppedLine": false, - "targets": [ - { - "expr": "irate(antidote_operations_total{instance=\"$node\", type=\"update\"}[1m])", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Updates", - "metric": "", - "refId": "A", - "step": 2 - }, - { - "expr": "irate(antidote_operations_total{instance=\"$node\", type=\"read_async\"}[1m])", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Async Reads", - "metric": "", - "refId": "B", - "step": 2 - }, - { - "expr": "irate(antidote_operations_total{instance=\"$node\", type=\"read\"}[1m])", - "format": "time_series", - "hide": false, - "interval": "", - "intervalFactor": 2, - "legendFormat": "Reads", - "refId": "C", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Antidote Operations", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "ops", - "label": null, - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 18, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 6, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "antidote_open_transactions", - "format": "time_series", - "intervalFactor": 2, - "legendFormat": "Open Transactions", - "metric": "antidote_open_transactions", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Open Transactions", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": null, - 
"logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - }, - { - "collapse": false, - "height": 261, - "panels": [ - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 17, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": " rate(antidote_staleness_sum{instance=\"$node\"}[5m])\n/\n rate(antidote_staleness_count{instance=\"$node\"}[5m])", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Staleness", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Staleness", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "ms", - "label": "", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": "", - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 19, - "legend": { - "avg": false, - "current": false, - "max": false, - "min": false, - "show": false, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [ - { - "alias": "Aborted Transactions Total", - "yaxis": 2 - } - ], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(antidote_aborted_transactions_total{instance=\"$node\"}[1m])", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Aborted Transaction Rate", - "metric": "", - "refId": "A", - "step": 2 - }, - { - "expr": "antidote_aborted_transactions_total{instance=\"$node\"}", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Aborted Transactions Total", - "refId": "B", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Aborted Transactions", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [ - "total" - ] - }, - "yaxes": [ - { - "format": "short", - "label": "txn/sec", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": "txn", - "logBase": 1, - "max": null, - "min": "0", - "show": true - } - ] - }, - { - "aliasColors": {}, - "bars": false, - "dashLength": 10, - "dashes": false, - "datasource": "${DS_PROMETHEUS}", - "fill": 1, - "id": 21, - "legend": { - "avg": false, - "current": 
false, - "max": false, - "min": false, - "show": true, - "total": false, - "values": false - }, - "lines": true, - "linewidth": 1, - "links": [], - "nullPointMode": "null", - "percentage": false, - "pointradius": 5, - "points": false, - "renderer": "flot", - "seriesOverrides": [], - "spaceLength": 10, - "span": 4, - "stack": false, - "steppedLine": false, - "targets": [ - { - "expr": "rate(antidote_error_count{instance=\"$node\"}[1m])", - "format": "time_series", - "interval": "", - "intervalFactor": 2, - "legendFormat": "Error Rate", - "refId": "A", - "step": 2 - } - ], - "thresholds": [], - "timeFrom": null, - "timeShift": null, - "title": "Errors", - "tooltip": { - "shared": true, - "sort": 0, - "value_type": "individual" - }, - "type": "graph", - "xaxis": { - "buckets": null, - "mode": "time", - "name": null, - "show": true, - "values": [] - }, - "yaxes": [ - { - "format": "short", - "label": "errors/sec", - "logBase": 1, - "max": null, - "min": "0", - "show": true - }, - { - "format": "short", - "label": null, - "logBase": 1, - "max": null, - "min": null, - "show": true - } - ] - } - ], - "repeat": null, - "repeatIteration": null, - "repeatRowId": null, - "showTitle": false, - "title": "Dashboard Row", - "titleSize": "h6" - } - ], - "schemaVersion": 14, - "style": "dark", - "tags": [], - "templating": { - "list": [ - { - "allValue": null, - "current": {}, - "datasource": "${DS_PROMETHEUS}", - "hide": 0, - "includeAll": false, - "label": "Node", - "multi": false, - "name": "node", - "options": [], - "query": "label_values(antidote_open_transactions, instance)", - "refresh": 1, - "regex": "", - "sort": 1, - "tagValuesQuery": "label_values({job=\"$tag\"},instance)", - "tags": [ - "antidote", - "prometheus" - ], - "tagsQuery": "label_values(job)", - "type": "query", - "useTags": true - } - ] - }, - "time": { - "from": "now-5m", - "to": "now" - }, - "timepicker": { - "now": true, - "refresh_intervals": [ - "5s", - "10s", - "30s", - "1m", - "5m", - "15m", - "30m", - "1h", - "2h", - "1d" - ], - "time_options": [ - "5m", - "15m", - "1h", - "6h", - "12h", - "24h", - "2d", - "7d", - "30d" - ] - }, - "timezone": "browser", - "title": "Antidote", - "version": 3 -} \ No newline at end of file diff --git a/monitoring/docker-compose.yml b/monitoring/docker-compose.yml deleted file mode 100644 index 5a5572d8d..000000000 --- a/monitoring/docker-compose.yml +++ /dev/null @@ -1,24 +0,0 @@ -version: '2' -services: - antidote: - image: antidotedb/antidote:latest - environment: - NODE_NAME: "antidote@antidote" - SHORT_NAME: "true" - ports: - - "8087:8087" - prometheus: - image: prom/prometheus - ports: - - "9090:9090" - volumes: - - ".:/data" - command: "--config.file=/data/prometheus.yml" - depends_on: - - antidote - grafana: - image: grafana/grafana - ports: - - "3000:3000" - depends_on: - - prometheus diff --git a/monitoring/prometheus.yml b/monitoring/prometheus.yml deleted file mode 100644 index 49781459e..000000000 --- a/monitoring/prometheus.yml +++ /dev/null @@ -1,28 +0,0 @@ -global: - scrape_interval: 15s # By default, scrape targets every 15 seconds. - - # Attach these labels to any time series or alerts when communicating with - # external systems (federation, remote storage, Alertmanager). - external_labels: - monitor: 'codelab-monitor' - -# A scrape configuration containing exactly one endpoint to scrape: -# Here it's Prometheus itself. -scrape_configs: - # The job name is added as a label `job=` to any timeseries scraped from this config. 
- - job_name: 'prometheus' - - # Override the global default and scrape targets from this job every 5 seconds. - scrape_interval: 5s - - static_configs: - - targets: ['localhost:9090'] - - # Job for scraping Antidote - - job_name: 'antidote' - - # Override the global default and scrape targets from this job every 10 seconds. - scrape_interval: 10s - - static_configs: - - targets: ['antidote:3001'] diff --git a/rebar.config b/rebar.config index a5b4eed69..60f0adf29 100644 --- a/rebar.config +++ b/rebar.config @@ -4,7 +4,7 @@ %% riak framework {riak_core, "3.1.1", {pkg,riak_core_ng}}, % ranch socket acceptor pool for managing protocol buffer sockets - ranch, + {ranch, "1.7.1"}, %% efficient inter-dc messaging {erlzmq, {git, "https://github.com/zeromq/erlzmq2", {ref, "573d583"}}}, @@ -16,7 +16,7 @@ vectorclock, %% antidote stats module; expose metrics for prometheus as HTTP-API - {antidote_stats, {git, "https://github.com/AntidoteDB/antidote_stats", {tag, "v2"}}} + {antidote_stats, {git, "https://github.com/AntidoteDB/antidote_stats", {tag, "v12"}}} ]}. @@ -28,8 +28,6 @@ ]}. -{eunit_opts, [verbose, {report, {eunit_surefire, [{dir, "eunit_logs/"}]}}]}. - {dialyzer, [{warnings, [ error_handling, race_conditions, @@ -67,9 +65,9 @@ {provider_hooks, [{post, [{compile, {pc, compile}}, {clean, {pc, clean}}]}]} ]}]}, - - %% Normal + %% disable setup post hooks {override, setup, [{post_hooks, []}]}, + {override, eleveldb, [ {plugins, [pc]}, {pre_hooks, [ @@ -102,22 +100,26 @@ ]} ]}. -{relx, [{release, {antidote, "0.0.2"}, [antidote]}, +{relx, [{release, {antidote, "0.2.1"}, [antidote]}, {dev_mode, false}, + % do not expect Erlang runtime at deployment site {include_erts, true}, - {sys_config, "config/sys.config"}, - {vm_args, "config/vm.args"}, + % application environment + {sys_config_src, "config/sys.config.src"}, + % vm arguments passed when starting the Erlang VM + {vm_args_src, "config/vm.args.src"}, {overlay, [ - {mkdir, "data/ring"}, - {copy, "bin/env", "bin/env"}, - {template, "_build/default/lib/cuttlefish/priv/erlang_vm.schema", "lib/11-erlang_vm.schema"}, - {template, "_build/default/lib/riak_core/priv/riak_core.schema", "lib/12-riak_core.schema"}, - {template, "_build/default/lib/riak_sysmon/priv/riak_sysmon.schema", "lib/15-riak_sysmon.schema"}, - {template, "_build/default/lib/eleveldb/priv/eleveldb.schema", "lib/21-leveldb.schema"} + % copy nested configuration file(s) + {copy, "config/network.config", "releases/{{release_version}}/network.config"}, + % legacy riak_core schema expected at that location at runtime + {copy, "config/legacy/riak_core.schema", "lib/12-riak_core.schema"}, + % redirect lager output to logger + {copy, "config/lager.config", "releases/{{release_version}}/lager.config"} ]}, - {overlay_vars, "config/vars.config"}, + + % create start script with additional features {extended_start_script, true} ]}. @@ -177,5 +179,5 @@ {cover_enabled, true}. {cover_export_enabled, true}. -{coveralls_coverdata, "logs/*/all.coverdata"}. +{coveralls_coverdata, "_build/test/cover/*.coverdata"}. {coveralls_service_name, "travis-ci"}. 
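
Note on the relx changes in rebar.config above: the static `config/sys.config` and `config/vm.args` are replaced by `sys_config_src` / `vm_args_src` templates, and the extended start script expands `${...}` placeholders from OS environment variables when the release boots; this is why `test/release_test.sh` later in this diff exports `NODE_NAME` and `ROOT_DIR_PREFIX` before starting the node. Below is a minimal sketch of what such a template could look like; the actual `config/sys.config.src` is not part of this diff, and the keys shown are illustrative only.

    %% config/sys.config.src -- illustrative sketch, not the file shipped with
    %% this patch. ${VAR} placeholders are substituted from OS environment
    %% variables by the generated start script at boot time.
    [
     {antidote, [
       %% per-node data directory, e.g. "antidote@127.0.0.1/data_antidote"
       {data_dir, "${ROOT_DIR_PREFIX}data_antidote"}
     ]},
     {kernel, [
       {logger_level, info}
     ]}
    ].
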
diff --git a/rebar.lock b/rebar.lock index c63af128d..0417bbcb5 100644 --- a/rebar.lock +++ b/rebar.lock @@ -1,10 +1,10 @@ {"1.1.0", -[{<<"accept">>,{pkg,<<"accept">>,<<"0.3.0">>},1}, +[{<<"accept">>,{pkg,<<"accept">>,<<"0.3.0">>},2}, {<<"antidote_crdt">>,{pkg,<<"antidote_crdt">>,<<"0.1.2">>},0}, {<<"antidote_pb_codec">>,{pkg,<<"antidote_pb_codec">>,<<"0.1.2">>},0}, {<<"antidote_stats">>, {git,"https://github.com/AntidoteDB/antidote_stats", - {ref,"f5fbe7db132b2a113c067c63662239fa5c0a52fb"}}, + {ref,"50fe89a4fc36985a6a5ac4079a92bb9861c2f351"}}, 0}, {<<"antidotec_pb">>,{pkg,<<"antidotec_pb">>,<<"0.2.9">>},0}, {<<"basho_stats">>,{pkg,<<"basho_stats">>,<<"1.0.3">>},1}, @@ -14,8 +14,8 @@ {<<"clique">>,{pkg,<<"clique">>,<<"0.3.12">>},1}, {<<"cuttlefish">>,{pkg,<<"cuttlefish">>,<<"2.1.4">>},1}, {<<"eleveldb">>,{pkg,<<"eleveldb">>,<<"2.2.20">>},1}, - {<<"elli">>,{pkg,<<"elli">>,<<"3.2.0">>},0}, - {<<"elli_prometheus">>,{pkg,<<"elli_prometheus">>,<<"0.1.1">>},0}, + {<<"elli">>,{pkg,<<"elli">>,<<"3.2.0">>},1}, + {<<"elli_prometheus">>,{pkg,<<"elli_prometheus">>,<<"0.1.1">>},1}, {<<"erlzmq">>, {git,"https://github.com/zeromq/erlzmq2", {ref,"573d583930c4b1134e504bec83926d188112b401"}}, @@ -32,10 +32,10 @@ {<<"parse_trans">>,{pkg,<<"parse_trans">>,<<"3.3.0">>},2}, {<<"pbkdf2">>,{pkg,<<"pbkdf2">>,<<"2.0.0">>},1}, {<<"poolboy">>,{pkg,<<"basho_poolboy">>,<<"0.8.4">>},1}, - {<<"prometheus">>,{pkg,<<"prometheus">>,<<"4.4.1">>},0}, + {<<"prometheus">>,{pkg,<<"prometheus">>,<<"4.4.1">>},1}, {<<"prometheus_process_collector">>, - {pkg,<<"prometheus_process_collector">>,<<"1.4.5">>}, - 0}, + {pkg,<<"prometheus_process_collector">>,<<"1.4.3">>}, + 1}, {<<"ranch">>,{pkg,<<"ranch">>,<<"1.7.1">>},0}, {<<"riak_core">>,{pkg,<<"riak_core_ng">>,<<"3.1.1">>},0}, {<<"riak_ensemble">>,{pkg,<<"riak_ensemble_ng">>,<<"2.4.4">>},1}, @@ -66,7 +66,7 @@ {<<"pbkdf2">>, <<"11C23279FDED5C0027AB3996CFAE77805521D7EF4BABDE2BD7EC04A9086CF499">>}, {<<"poolboy">>, <<"45C306FF1C9F6451730DD21642EDF55FA72EBD5E2FE4A38D8D8A56B8EA21A256">>}, {<<"prometheus">>, <<"1E96073B3ED7788053768FEA779CBC896DDC3BDD9BA60687F2AD50B252AC87D6">>}, - {<<"prometheus_process_collector">>, <<"9BAEA93F5D8C2758DBAD0DE021EF74438D2F81A01D1F24F5EF0BB949A7B4191D">>}, + {<<"prometheus_process_collector">>, <<"657386E8F142FC817347D95C1F3A05AB08710F7DF9E7F86DB6FACAED107ED929">>}, {<<"ranch">>, <<"6B1FAB51B49196860B733A49C07604465A47BDB78AA10C1C16A3D199F7F8C881">>}, {<<"riak_core">>, <<"098CEEF293F9F232E724DD8745916F2E4CE0F14CB8EDD1653E8425A14F0DD0B6">>}, {<<"riak_ensemble">>, <<"F9E04052F4A7FAAD20F008DFF18D34D3552513000410CE9C5941B4F7361741E8">>}, diff --git a/src/antidote.app.src b/src/antidote.app.src index 2ab113e6f..170489063 100644 --- a/src/antidote.app.src +++ b/src/antidote.app.src @@ -1,19 +1,18 @@ %% -*- erlang -*- -{application, antidote, - [ - {description, "A transactional CRDT database"}, - {vsn, "0.2.1"}, - {registered, []}, - {applications, [ - kernel, - stdlib, - riak_core, - cuttlefish, - erlzmq, - runtime_tools, - tools, - antidote_stats - ]}, +{application, antidote, [ + {description, "A transactional CRDT database"}, + {vsn, "0.2.1"}, + {applications, [ + kernel, + stdlib, + riak_core, + cuttlefish, + erlzmq, + runtime_tools, + tools, + antidote_stats, + lager + ]}, {included_applications, [ vectorclock, antidote_pb_codec, @@ -21,39 +20,15 @@ ranch ]}, - {mod, { antidote_app, []}}, - %% Options for environmental variables - %% txn_cert: - %% true -> write operations are certified during commit, aborting the transaction if a write conflict is detected 
(i.e. snapshot isolation - %% is ensured for the updates within a single DC, updates across DCs are not checked) - %% false -> transactions perform no certification and always commit (outside of crashes/errors) - %% txn_prot: - %% clocksi -> uses "Cure" protocol to define snapshots and causal dependencies (https://pages.lip6.fr/Marc.Shapiro/papers/Cure-final-ICDCS16.pdf) - %% gr -> uses "Gentle-rain like" protocol to define snapshots and causal dependencies (https://infoscience.epfl.ch/record/202079) - %% recover_from_log: - %% true -> on node start will load any operations stored on the disk log to the in memory cache of the key-value store - %% false -> on node start the state of the key-value store will be empty - %% recover_meta_data_on_start: - %% true -> meta-data state will be loaded from disk on restart including connection state between other DCs and node names and configurations, - %% nodes will automatically reconnect to other dcs on restart - %% false -> meta-data concerning node names and connections to other dcs will not be loaded on restart - %% sync_log: - %% true -> local transactions will be stored on log synchronously, i.e. when the reply is sent the updates are guaranteed to be - %% stored to disk (this is very slow in the current logging setup) - %% false -> all updates are sent to the operating system to be stored to disk (eventually), but are not guaranteed to be stored durably on disk - %% when the reply is sent - %% %% auto_start_read_servers: - %% true -> read servers will start automatically. It should be set to true when Antidote will be run in a single node/machine. - %% false -> read servers will not start automatically. It should be set to false when many Antidote instances will be run in a cluster. - %% In this case, a inter_dc_manager:start_bg_processes(stable) needs to be issued per antidote instance after joining the cluster. - %% %% enable_logging: - %% true -> writes to disk done by the logging_vnode are enabled - %% false -> writes to disk are disabled, this improves performance when benchmarking. - %% WARNING: disabling logging makes updates non-recoverable after shutting down antidote or under failures. + {mod, {antidote_app, []}}, - {env, [{txn_cert, true}, {txn_prot, clocksi}, {recover_from_log, true}, - {recover_meta_data_on_start, true}, {sync_log, false}, - {enable_logging, true}, - {auto_start_read_servers, true} - ]} - ]}. + % Default values for sys.config options used for system tests which do not load config files + {env, [{txn_cert, true}, {txn_prot, clocksi}, {recover_from_log, true}, + {recover_meta_data_on_start, true}, {sync_log, false}, + {enable_logging, true}, + {auto_start_read_servers, true}, + {data_dir, "data_antidote"}, + {stats, true}, + {extended_stats, true} + ]} +]}. diff --git a/src/antidote_app.erl b/src/antidote_app.erl index fb156ff57..6bcd51aa0 100644 --- a/src/antidote_app.erl +++ b/src/antidote_app.erl @@ -27,6 +27,8 @@ %% ------------------------------------------------------------------- -module(antidote_app). +-include_lib("kernel/include/logger.hrl"). + -behaviour(application). 
%% Application callbacks @@ -37,6 +39,14 @@ %% =================================================================== start(_StartType, _StartArgs) -> + ok = validate_data_dir(), + + % set the error logger counting the number of errors during operation + ok = logger:add_handler(count_errors, antidote_error_monitor, #{level => error}), + + % set the warning logger counting the number of warnings during operation + ok = logger:add_handler(count_warnings, antidote_warning_monitor, #{level => warning}), + case antidote_sup:start_link() of {ok, Pid} -> ok = riak_core:register([{vnode_module, logging_vnode}]), @@ -75,5 +85,33 @@ start(_StartType, _StartArgs) -> {error, Reason} end. +validate_data_dir() -> + {ok, DataDir} = application:get_env(antidote, data_dir), + case filelib:ensure_dir(filename:join(DataDir, "dummy")) of + ok -> ok; + {error, Reason} -> + ?LOG_CRITICAL("Data directory ~p does not exist, and could not be created: ~p", [DataDir, Reason]), + throw({error, invalid_data_dir}) + end. + stop(_State) -> ok. + + +%% =================================================================== +%% Unit Tests +%% =================================================================== + +-ifdef(TEST). +-include_lib("eunit/include/eunit.hrl"). + +%% Throw error if data dir is a file +data_dir_is_a_file_test() -> + {ok, Level} = maps:find(level, logger:get_primary_config()), + logger:set_primary_config(level, emergency), + application:set_env(antidote, data_dir, "tmpfile"), + ok = file:write_file("tmpfile", <<"hello">>), + {error, invalid_data_dir} = (catch validate_data_dir()), + logger:set_primary_config(level, Level), + ok = file:delete("tmpfile"). +-endif. diff --git a/src/antidote_lager_backend.erl b/src/antidote_lager_backend.erl new file mode 100644 index 000000000..2c934f0c4 --- /dev/null +++ b/src/antidote_lager_backend.erl @@ -0,0 +1,42 @@ +-module(antidote_lager_backend). + +%% This module can be removed once lager is removed from all dependencies of antidote + +-behaviour(gen_event). + +-export([init/1, handle_call/2, handle_event/2, handle_info/2, terminate/2, + code_change/3]). + +%% @private +init([Level]) when is_atom(Level) -> {ok, no_state}; +init([_, true]) -> {ok, no_state}; +init([_, false]) -> {ok, no_state}; +init([_, {Formatter, _}]) when is_atom(Formatter) -> {ok, no_state}; +init(_) -> {ok, no_state}. + +%% @private +handle_call(get_loglevel, State) -> {ok, {mask, 255}, State}; +handle_call({set_loglevel, _}, State) -> {ok, ok, State}; +handle_call(_Request, State) -> {ok, ok, State}. + +%% @private +handle_event({log, Message}, State) -> + {lager_msg, _, _, Level, _, _, InternalMessage} = Message, + ToLog = binary_to_list(iolist_to_binary(InternalMessage)), + case Level of + emergency -> logger:emergency("~p", [ToLog]); + alert -> logger:alert("~p", [ToLog]); + critical -> logger:critical("~p", [ToLog]); + error -> logger:error("~p", [ToLog]); + warning -> logger:warning("~p", [ToLog]); + notice -> logger:notice("~p", [ToLog]); + info -> logger:info("~p", [ToLog]); + debug -> logger:debug("~p", [ToLog]) + end, + {ok, State}; + +handle_event(_Event, State) -> {ok, State}. +handle_info(_Info, State) -> {ok, State}. +terminate(_Reason, _State) -> ok. +code_change(_OldVsn, State, _Extra) -> {ok, State}. + diff --git a/src/antidote_ring_event_handler.erl b/src/antidote_ring_event_handler.erl index ee662e582..88cb03e14 100644 --- a/src/antidote_ring_event_handler.erl +++ b/src/antidote_ring_event_handler.erl @@ -29,15 +29,21 @@ -module(antidote_ring_event_handler). 
-behaviour(gen_event). +-include("antidote.hrl"). +-include_lib("kernel/include/logger.hrl"). + %% gen_event callbacks -export([init/1, handle_event/2, handle_call/2, handle_info/2, terminate/2, code_change/3]). -record(state, {}). init([]) -> + update_status(), {ok, #state{}}. handle_event({ring_update, _Ring}, State) -> + ?LOG_INFO("Ring update~n~p", [riak_core_cluster_cli:status(ok, [], [])]), + update_status(), {ok, State}. handle_call(_Event, State) -> @@ -51,3 +57,54 @@ terminate(_Reason, _State) -> code_change(_OldVsn, State, _Extra) -> {ok, State}. + + +update_status() -> + %% ring status + {_Claimant, RingReady, Down, MarkedDown, Changes} = riak_core_status:ring_status(), + ?STATS({ring_ready, RingReady}), + + + {ok, Ring} = riak_core_ring_manager:get_my_ring(), + Members = riak_core_ring:all_members(Ring), + + %% node availability and ring state for every member as seen by this node + lists:foreach(fun(Node) -> + %% member status + NodeState = riak_core_ring:member_status(Ring, Node), + ?STATS({node_state, Node, NodeState}), + + %% ring claimed + RingClaimed = claim_percent(Ring, Node), + ?STATS({ring_claimed, Node, RingClaimed}), + + %% ring pending + RingPending = future_claim_percentage(Changes, Ring, Node), + ?STATS({ring_pending, Node, RingPending}), + + ?STATS({ring_member_availability, Node, node_availability(Node, Down, MarkedDown)}) + end, Members), + + ok. + + +claim_percent(Ring, Node) -> + RingSize = riak_core_ring:num_partitions(Ring), + Indices = riak_core_ring:indices(Ring, Node), + length(Indices) * 100 / RingSize. + +future_claim_percentage([], _Ring, _Node) -> + 0.0; +future_claim_percentage(_Changes, Ring, Node) -> + FutureRingSize = riak_core_ring:future_num_partitions(Ring), + NextIndices = riak_core_ring:future_indices(Ring, Node), + length(NextIndices) * 100 / FutureRingSize. + + +node_availability(Node, Down, MarkedDown) -> + case {lists:member(Node, Down), lists:member(Node, MarkedDown)} of + {false, false} -> 1; + {true, true } -> -1; + {true, false} -> -2; + {false, true } -> 2 + end. diff --git a/src/antidote_stats.erl b/src/antidote_stats.erl index ea2bc324a..a97838f8d 100644 --- a/src/antidote_stats.erl +++ b/src/antidote_stats.erl @@ -35,8 +35,14 @@ -behaviour(gen_server). %% Interval to collect metrics -define(INTERVAL, 10000). %% 10 sec +%% Interval to collect expensive metrics +-define(INTERVAL_LONG, 60000). %% 60 seconds %% Metrics collection will be started after INIT_INTERVAL after application startup. -define(INIT_INTERVAL, 10000). %% 10 seconds +%% What message queue length should log a warning +-define(QUEUE_LENGTH_THRESHOLD, 10). +%% If process collection takes too long, turn it off and alert user +-define(TIME_METRIC_COLLECTION_THRESHOLD_MS, 40000). %% API -export([start_link/0]). @@ -45,16 +51,28 @@ -export([init/1, handle_call/3, handle_cast/2, handle_info/2, terminate/2, code_change/3]). +-record(state, { + timer :: any(), + timer_expensive :: any(), + monitored_processes :: list() +}). + start_link() -> gen_server:start_link({local, ?MODULE}, ?MODULE, [], []). init([]) -> - % set the error logger counting the number of errors during operation - ok = logger:add_handler(count_errors, antidote_error_monitor, #{level => error}), - % start the timer for updating the calculated metrics - Timer = erlang:send_after(?INIT_INTERVAL, self(), periodic_update), - {ok, Timer}. 
+ TimerCheap = erlang:send_after(?INIT_INTERVAL, self(), periodic_update), + + % start the timer for updating the calculated expensive metrics + TimerExpensive = erlang:send_after(?INIT_INTERVAL, self(), periodic_expensive_update), + + {ok, #state{ + timer = TimerCheap, + timer_expensive = TimerExpensive, + % start only monitoring inter_dc_query and processes without registered names (undefined) + monitored_processes = [inter_dc_query, undefined] + }}. handle_call(_Req, _From, State) -> {reply, ok, State}. @@ -62,16 +80,40 @@ handle_call(_Req, _From, State) -> handle_cast(_Req, State) -> {noreply, State}. -handle_info(periodic_update, OldTimer) -> +handle_info(periodic_update, State = #state{timer = CheapTimer}) -> %% ? - _ = erlang:cancel_timer(OldTimer), + _ = erlang:cancel_timer(CheapTimer), %% update all known stats _ = update_staleness(), - %% schedule tick + update_dc_count(), + + %% schedule tick if continue Timer = erlang:send_after(?INTERVAL, self(), periodic_update), - {noreply, Timer}. + {noreply, State#state{timer = Timer}}; + +handle_info(periodic_expensive_update, State = #state{timer_expensive = ExpensiveTimer, monitored_processes = Monitored}) -> + %% ? + _ = erlang:cancel_timer(ExpensiveTimer), + + %% only collect extended stats if enabled + case application:get_env(antidote, extended_stats) of + {ok, true} -> + %% update process infos + {Continue, NewMonitored} = update_processes_info(Monitored), + + %% schedule tick if continue + case Continue of + true -> Timer = erlang:send_after(?INTERVAL_LONG, self(), periodic_expensive_update); + _ -> Timer = undefined + end, + {noreply, State#state{timer_expensive = Timer, monitored_processes = NewMonitored}}; + + _ -> + {noreply, State#state{timer_expensive = undefined}} + end. + terminate(_Reason, _State) -> ok. @@ -99,3 +141,78 @@ calculate_staleness() -> to_microsec({MegaSecs, Secs, MicroSecs}) -> (MegaSecs * 1000000 + Secs) * 1000000 + MicroSecs. + + +update_dc_count() -> + DCs = dc_meta_data_utilities:get_dc_ids(true), + ?STATS({dc_count, length(DCs)}). 
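
As a quick illustration (not part of the patch) of the per-name aggregation performed by `update_processes_info/1` below: message queue lengths and reductions are summed per registered name, and processes without a registered name collapse under the key `undefined`. The sample values here are made up.

    %% Illustrative only: same fold as in update_processes_info/1, applied to
    %% three fake {Name, MessageQueueLen, Reductions} entries.
    aggregation_example() ->
        KeyValueList = [{inter_dc_query, 3, 120}, {undefined, 1, 40}, {inter_dc_query, 2, 60}],
        {QueueMap, ReductionsMap} =
            lists:foldl(
                fun({Name, Messages, Reductions}, {QMap, RMap}) ->
                    {maps:put(Name, maps:get(Name, QMap, 0) + Messages, QMap),
                     maps:put(Name, maps:get(Name, RMap, 0) + Reductions, RMap)}
                end,
                {maps:new(), maps:new()},
                KeyValueList),
        %% QueueMap      =:= #{inter_dc_query => 5, undefined => 1}
        %% ReductionsMap =:= #{inter_dc_query => 180, undefined => 40}
        {QueueMap, ReductionsMap}.
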
+ + +update_processes_info(Monitored) -> + TimeStart = erlang:system_time(millisecond), + + %% get all processes + Processes = erlang:processes(), + + %% collect info of each process + Infos = [erlang:process_info(P) || P <- Processes], + + + %% get only name, queue, and reductions + KeyValueList = [ + { + proplists:get_value(registered_name, ProcessInfo), + proplists:get_value(message_queue_len, ProcessInfo), + proplists:get_value(reductions, ProcessInfo) + } || ProcessInfo <- Infos, ProcessInfo /= undefined], + + + %% fold over list, group by name + {QueueMap, ReductionsMap} = lists:foldl( + fun({Name, Messages, Reductions}, {QMap, RMap}) -> + { + maps:put(Name, maps:get(Name, QMap, 0) + Messages, QMap), + maps:put(Name, maps:get(Name, RMap, 0) + Reductions, RMap) + } + end, + {maps:new(), maps:new()}, + KeyValueList + ), + + %% for each process, update queue length and reductions only if monitored + NewMonitored = maps:fold(fun(Name, Messages, MonitorAcc) -> + case lists:any(fun(E) -> E == Name end, MonitorAcc) of + true -> + ?STATS({process_message_queue_length, Name, Messages}), + ?STATS({process_reductions, Name, maps:get(Name, ReductionsMap)}), + MonitorAcc; + _ -> + %% if a process is not monitored and the threshold is reached, add to monitored list + case Messages > ?QUEUE_LENGTH_THRESHOLD of + true -> + logger:warning("New process has a message queue and is now being monitored ~p: ~p", [Name, Messages]), + ?STATS({process_message_queue_length, Name, Messages}), + ?STATS({process_reductions, Name, maps:get(Name, ReductionsMap)}), + MonitorAcc ++ [Name]; + false -> + MonitorAcc + end + end + + end, + Monitored, + QueueMap + ), + + + %% measure time to scrape and report + TimeMs = erlang:system_time(millisecond) - TimeStart, + ?STATS({process_scrape_time, TimeMs}), + case TimeMs > ?TIME_METRIC_COLLECTION_THRESHOLD_MS of + true -> + logger:alert("System metric process collection took too long (~p ms over ~p ms threshold), turning process info collection off", [TimeMs, ?TIME_METRIC_COLLECTION_THRESHOLD_MS]), + {false, NewMonitored}; + _ -> + logger:debug("Took ~p ms to scrape processes", [TimeMs]), + {true, NewMonitored} + end . diff --git a/src/antidote_sup.erl b/src/antidote_sup.erl index 4a7fa5e6f..43ad727f8 100644 --- a/src/antidote_sup.erl +++ b/src/antidote_sup.erl @@ -114,6 +114,8 @@ init(_Args) -> type => supervisor, modules => [antidote_pb_sup]}, + AntidoteStats = ?CHILD(antidote_stats, worker, []), + {ok, {{one_for_one, 5, 10}, @@ -136,5 +138,6 @@ init(_Args) -> MetaDataSenderSup, BCounterManager, LogResponseReaderSup, - PbSup + PbSup, + AntidoteStats ]}}. diff --git a/src/antidote_warning_monitor.erl b/src/antidote_warning_monitor.erl new file mode 100644 index 000000000..7eef61997 --- /dev/null +++ b/src/antidote_warning_monitor.erl @@ -0,0 +1,36 @@ +%% ------------------------------------------------------------------- +%% +%% Copyright <2013-2018> < +%% Technische Universität Kaiserslautern, Germany +%% Université Pierre et Marie Curie / Sorbonne-Université, France +%% Universidade NOVA de Lisboa, Portugal +%% Université catholique de Louvain (UCL), Belgique +%% INESC TEC, Portugal +%% > +%% +%% This file is provided to you under the Apache License, +%% Version 2.0 (the "License"); you may not use this file +%% except in compliance with the License. 
You may obtain +%% a copy of the License at +%% +%% http://www.apache.org/licenses/LICENSE-2.0 +%% +%% Unless required by applicable law or agreed to in writing, +%% software distributed under the License is distributed on an +%% "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +%% KIND, either expressed or implied. See the License for the +%% specific language governing permissions and limitations +%% under the License. +%% +%% List of the contributors to the development of Antidote: see AUTHORS file. +%% Description and complete License: see LICENSE file. +%% ------------------------------------------------------------------- + +-module(antidote_warning_monitor). + +-include("antidote.hrl"). + +-export([log/2]). + +log(_Event, _Config) -> + ?STATS(log_warning). diff --git a/src/clocksi_readitem_server.erl b/src/clocksi_readitem_server.erl index 473e39c5c..b2552ade3 100644 --- a/src/clocksi_readitem_server.erl +++ b/src/clocksi_readitem_server.erl @@ -203,6 +203,7 @@ init([Partition, Id]) -> prepared_cache=PreparedCache, self=Self}}. handle_call({perform_read, Key, Type, Transaction, PropertyList}, Coordinator, SD0) -> + ?STATS(operation_read_internal), ok = perform_read_internal(Coordinator, Key, Type, Transaction, PropertyList, SD0), {noreply, SD0}; @@ -210,6 +211,7 @@ handle_call({go_down}, _Sender, SD0) -> {stop, shutdown, ok, SD0}. handle_cast({perform_read_cast, Coordinator, Key, Type, Transaction, PropertyList}, SD0) -> + ?STATS(operation_read_async_internal), ok = perform_read_internal(Coordinator, Key, Type, Transaction, PropertyList, SD0), {noreply, SD0}. diff --git a/src/inter_dc_dep_vnode.erl b/src/inter_dc_dep_vnode.erl index f831aa96f..deefc42c1 100644 --- a/src/inter_dc_dep_vnode.erl +++ b/src/inter_dc_dep_vnode.erl @@ -146,9 +146,12 @@ try_store(State, Txn=#interdc_txn{dcid = DCID, partition = Partition, timestamp {ok, _} = logging_vnode:append_group({Partition, node()}, [Partition], Ops, false), - %% Update the materializer (send only the update operations) ClockSiOps = updates_to_clocksi_payloads(Txn), + ?STATS({dc_ops_received, length(ClockSiOps)}), + ?STATS({dc_ops_received_size, byte_size(term_to_binary(ClockSiOps))}), + + %% Update the materializer (send only the update operations) ok = lists:foreach(fun(Op) -> materializer_vnode:update(Op#clocksi_payload.key, Op) end, ClockSiOps), {update_clock(State, DCID, Timestamp), true} end. 
@@ -157,6 +160,7 @@ handle_command({set_dependency_clock, Vector}, _Sender, State) -> {reply, ok, State#state{vectorclock = Vector}}; handle_command({txn, Txn}, _Sender, State) -> + NewState = process_all_queues(push_txn(State, Txn)), {reply, ok, NewState}; diff --git a/src/logging_vnode.erl b/src/logging_vnode.erl index c50140d48..37d64cb7f 100644 --- a/src/logging_vnode.erl +++ b/src/logging_vnode.erl @@ -360,6 +360,7 @@ handle_command({read, LogId}, _Sender, %% handle_command({read_from, LogId, _From}, _Sender, #state{partition=Partition, logs_map=Map, last_read=Lastread}=State) -> + ?STATS(log_read_from), case get_log_from_map(Map, Partition, LogId) of {ok, Log} -> ok = disk_log:sync(Log), @@ -394,6 +395,7 @@ handle_command({append, LogId, LogOperation, Sync}, _Sender, op_id_table=OpIdTable, partition=Partition, enable_log_to_disk=EnableLog}=State) -> + ?STATS(operation_update_internal), case get_log_from_map(Map, Partition, LogId) of {ok, Log} -> MyDCID = dc_meta_data_utilities:get_my_dc_id(), @@ -579,6 +581,7 @@ read_internal(_Log, error, Ops) -> read_internal(_Log, eof, Ops) -> {eof, Ops}; read_internal(Log, Continuation, Ops) -> + ?STATS(log_read_read), {NewContinuation, NewOps} = case disk_log:chunk(Log, Continuation) of {C, O} -> {C, O}; @@ -905,8 +908,9 @@ open_logs(LogFile, [Next|Rest], Map, ClockTable, MaxVector)-> PreflistString = string:join( lists:map(fun erlang:integer_to_list/1, PartitionList), "-"), LogId = LogFile ++ "--" ++ PreflistString, - LogPath = filename:join( - application:get_env(riak_core, platform_data_dir, undefined), LogId), + {ok, DataDir} = application:get_env(antidote, data_dir), + LogPath = filename:join(DataDir, LogId), + ?STATS({log_append, LogPath, filelib:file_size(LogPath ++ ".LOG")}), case disk_log:open([{name, LogPath}]) of {ok, Log} -> {eof, NewMaxVector} = get_last_op_from_log(Log, start, ClockTable, MaxVector), @@ -975,7 +979,10 @@ fold_log(Log, Continuation, F, Acc) -> insert_log_record(Log, LogId, LogRecord, EnableLogging) -> Result = case EnableLogging of true -> - disk_log:log(Log, {LogId, LogRecord}); + BinaryRecord = term_to_binary({LogId, LogRecord}), + ?STATS({log_append, Log, erlang:byte_size(BinaryRecord)}), + ?LOG_DEBUG("Appending ~p bytes", [erlang:byte_size(BinaryRecord)]), + disk_log:blog(Log, term_to_binary({LogId, LogRecord})); false -> ok end, diff --git a/src/meta_data_sender.erl b/src/meta_data_sender.erl index 577e1a1b3..d0c1f17a2 100644 --- a/src/meta_data_sender.erl +++ b/src/meta_data_sender.erl @@ -176,7 +176,7 @@ send_meta_data(cast, timeout, State = #state{last_result = LastResult, Store = case HasChanged of true -> %% update changed counter for this metadata type - ?STATS({metadata_updated, Name}), + %?STATS({metadata_updated, Name}), true = ets:insert(get_table_name(Name, ?META_TABLE_STABLE_NAME), {merged_data, NewResult}), NewResult; false -> diff --git a/src/stable_meta_data_server.erl b/src/stable_meta_data_server.erl index 3a84d259c..b55d826f7 100644 --- a/src/stable_meta_data_server.erl +++ b/src/stable_meta_data_server.erl @@ -140,8 +140,8 @@ broadcast_meta_data_merge(Key, Value, MergeFunc, InitFunc) -> %% -------------------------------------------------------------------+ init([]) -> - Path = filename:join( - application:get_env(riak_core, platform_data_dir, undefined), ?TABLE_NAME), + {ok, DataDir} = application:get_env(antidote, data_dir), + Path = filename:join(DataDir, ?TABLE_NAME), {ok, DetsTable} = dets:open_file(Path, [{type, set}]), Table = ets:new(?TABLE_NAME, [set, named_table, protected, 
?META_TABLE_STABLE_CONCURRENCY]), @@ -174,7 +174,7 @@ handle_call({update_meta_data, KeyValueList, IsEnv}, _Sender, State = #state{tab true = ets:insert(Table, KeyValueList), ok = dets:insert(DetsTable, KeyValueList), ok = dets:sync(DetsTable), - ?STATS(metadata_update_stable), + %?STATS(metadata_update_stable), {reply, ok, State}; handle_call({merge_meta_data, Key, Value, MergeFunc, InitFunc}, _Sender, State = #state{table = Table, dets_table = DetsTable}) -> diff --git a/test/release_test.sh b/test/release_test.sh index 97efb774f..2557d9233 100755 --- a/test/release_test.sh +++ b/test/release_test.sh @@ -2,15 +2,19 @@ # This builds a release, starts it and tries to do simple transaction; exits immediately upon error set -e +# set test node name +export NODE_NAME=antidote@127.0.0.1 +export ROOT_DIR_PREFIX=antidote@127.0.0.1/ + # cd to root project directory SCRIPTDIR=`dirname $0` cd "$SCRIPTDIR/.." # Start Antidote -./_build/default/rel/antidote/bin/env start +./_build/default/rel/antidote/bin/antidote start # Execute test transaction ./test/release_test.escript # Stop Antidote -./_build/default/rel/antidote/bin/env stop +./_build/default/rel/antidote/bin/antidote stop diff --git a/test/utils/ct_redirect_handler.erl b/test/utils/ct_redirect_handler.erl new file mode 100644 index 000000000..23f522662 --- /dev/null +++ b/test/utils/ct_redirect_handler.erl @@ -0,0 +1,14 @@ +%% redirects log messages to ct:log + +-module(ct_redirect_handler). + +%% API +-export([log/2]). + +log(LogEvent, _Config) -> + CtMaster = application:get_env(antidote, ct_master, undefined), + #{msg := Message} = LogEvent, + case Message of + {Msg, Format} -> _ = rpc:call(CtMaster, ct, log, [Msg, Format]); + _ -> _ = rpc:call(CtMaster, ct, log, ["~p", [Message]]) + end. diff --git a/test/utils/ct_slave_ext.erl b/test/utils/ct_slave_ext.erl deleted file mode 100644 index 5a9b77903..000000000 --- a/test/utils/ct_slave_ext.erl +++ /dev/null @@ -1,376 +0,0 @@ -%%-------------------------------------------------------------------- -%% %CopyrightBegin% -%% -%% Copyright Ericsson AB 2010-2018. All Rights Reserved. -%% -%% Licensed under the Apache License, Version 2.0 (the "License"); -%% you may not use this file except in compliance with the License. -%% You may obtain a copy of the License at -%% -%% http://www.apache.org/licenses/LICENSE-2.0 -%% -%% Unless required by applicable law or agreed to in writing, software -%% distributed under the License is distributed on an "AS IS" BASIS, -%% WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -%% See the License for the specific language governing permissions and -%% limitations under the License. -%% -%% %CopyrightEnd% - -%%---------------------------------------------------------------------- -%% This is an adaptation of ct_slave from the Erlang standard library. -%% It adds the following features: -%% - Configuration to set the working directory of a slave -%%---------------------------------------------------------------------- --module(ct_slave_ext). - --export([start/1, start/2, start/3, stop/1, stop/2]). - --export([slave_started/2, slave_ready/2, monitor_master/1]). - --record(options, {username, password, boot_timeout, init_timeout, - startup_timeout, startup_functions, monitor_master, - kill_if_fail, erl_flags, env, ssh_port, ssh_opts, - stop_timeout, working_dir}). - -start(Node) -> - start(gethostname(), Node). 
- -start(_HostOrNode = Node, _NodeOrOpts = Opts) %% match to satiate edoc - when is_list(Opts) -> - start(gethostname(), Node, Opts); - -start(Host, Node) -> - start(Host, Node, []). - -start(Host, Node, Opts) -> - ENode = enodename(Host, Node), - case erlang:is_alive() of - false-> - {error, not_alive, node()}; - true-> - case is_started(ENode) of - false-> - OptionsRec = fetch_options(Opts), - do_start(Host, Node, OptionsRec); - {true, not_connected}-> - {error, started_not_connected, ENode}; - {true, connected}-> - {error, already_started, ENode} - end - end. - -stop(Node) -> - stop(gethostname(), Node). - -stop(_HostOrNode = Node, _NodeOrOpts = Opts) %% match to satiate edoc - when is_list(Opts) -> - stop(gethostname(), Node, Opts); - -stop(Host, Node) -> - stop(Host, Node, []). - -stop(Host, Node, Opts) -> - ENode = enodename(Host, Node), - case is_started(ENode) of - {true, connected}-> - OptionsRec = fetch_options(Opts), - do_stop(ENode, OptionsRec); - {true, not_connected}-> - {error, not_connected, ENode}; - false-> - {error, not_started, ENode} - end. - -%%% fetch an option value from the tagged tuple list with default -get_option_value(Key, OptionList, Default) -> - case lists:keyfind(Key, 1, OptionList) of - false-> - Default; - {Key, Value}-> - Value - end. - -%%% convert option list to the option record, fill all defaults -fetch_options(Options) -> - UserName = get_option_value(username, Options, []), - Password = get_option_value(password, Options, []), - BootTimeout = get_option_value(boot_timeout, Options, 3), - InitTimeout = get_option_value(init_timeout, Options, 1), - StartupTimeout = get_option_value(startup_timeout, Options, 1), - StartupFunctions = get_option_value(startup_functions, Options, []), - Monitor = get_option_value(monitor_master, Options, false), - KillIfFail = get_option_value(kill_if_fail, Options, true), - ErlFlags = get_option_value(erl_flags, Options, []), - EnvVars = get_option_value(env, Options, []), - SSHPort = get_option_value(ssh_port, Options, []), - SSHOpts = get_option_value(ssh_opts, Options, []), - StopTimeout = get_option_value(stop_timeout, Options, 5), - WorkingDir = get_option_value(cd, Options ,false), - #options{username=UserName, password=Password, - boot_timeout=BootTimeout, init_timeout=InitTimeout, - startup_timeout=StartupTimeout, startup_functions=StartupFunctions, - monitor_master=Monitor, kill_if_fail=KillIfFail, - erl_flags=ErlFlags, env=EnvVars, ssh_port=SSHPort, ssh_opts=SSHOpts, - stop_timeout=StopTimeout, working_dir=WorkingDir}. - -% send a message when slave node is started -slave_started(ENode, MasterPid) -> - MasterPid ! {node_started, ENode}, - ok. - -% send a message when slave node has finished startup -slave_ready(ENode, MasterPid) -> - MasterPid ! {node_ready, ENode}, - ok. - -% start monitoring of the master node -monitor_master(MasterNode) -> - spawn(fun() -> monitor_master_int(MasterNode) end). - -% code of the masterdeath-waiter process -monitor_master_int(MasterNode) -> - ct_util:mark_process(), - erlang:monitor_node(MasterNode, true), - receive - {nodedown, MasterNode}-> - init:stop() - end. - -% check if node is listed in the nodes() -is_connected(ENode) -> - [N||N<-nodes(), N==ENode] == [ENode]. - -% check if node is alive (ping and disconnect if pingable) -is_started(ENode) -> - case is_connected(ENode) of - true-> - {true, connected}; - false-> - case net_adm:ping(ENode) of - pang-> - false; - pong-> - erlang:disconnect_node(ENode), - {true, not_connected} - end - end. 
- -% make a Erlang node name from name and hostname -enodename(Host, Node) -> - case lists:member($@, atom_to_list(Node)) of - true -> - Node; - false -> - list_to_atom(atom_to_list(Node)++"@"++atom_to_list(Host)) - end. - -% performs actual start of the "slave" node -do_start(Host, Node, Options) -> - ENode = enodename(Host, Node), - Functions = - lists:append([[{ct_slave, slave_started, [ENode, self()]}], - Options#options.startup_functions, - [{ct_slave, slave_ready, [ENode, self()]}]]), - Functions2 = if - Options#options.monitor_master-> - [{ct_slave, monitor_master, [node()]}|Functions]; - true-> - Functions - end, - MasterHost = gethostname(), - _ = if - MasterHost == Host -> - spawn_local_node(Node, Options); - true-> - spawn_remote_node(Host, Node, Options) - end, - - BootTimeout = Options#options.boot_timeout, - InitTimeout = Options#options.init_timeout, - StartupTimeout = Options#options.startup_timeout, - Result = case wait_for_node_alive(ENode, BootTimeout) of - pong-> - case test_server:is_cover() of - true -> - MainCoverNode = cover:get_main_node(), - rpc:call(MainCoverNode,cover,start,[ENode]); - false -> - ok - end, - call_functions(ENode, Functions2), - receive - {node_started, ENode}-> - receive - {node_ready, ENode}-> - {ok, ENode} - after StartupTimeout*1000-> - {error, startup_timeout, ENode} - end - after InitTimeout*1000 -> - {error, init_timeout, ENode} - end; - pang-> - {error, boot_timeout, ENode} - end, - _ = case Result of - {ok, ENode}-> - ok; - {error, Timeout, ENode} - when ((Timeout==init_timeout) or (Timeout==startup_timeout)) and - Options#options.kill_if_fail-> - do_stop(ENode); - _-> ok - end, - Result. - -% are we using fully qualified hostnames -long_or_short() -> - case net_kernel:longnames() of - true-> - " -name "; - false-> - " -sname " - end. - -% get the localhost's name, depending on the using name policy -gethostname() -> - Hostname = case net_kernel:longnames() of - true-> - net_adm:localhost(); - _-> - {ok, Name}=inet:gethostname(), - Name - end, - list_to_atom(Hostname). - -% get cmd for starting Erlang -get_cmd(Node, Flags) -> - Cookie = erlang:get_cookie(), - "erl -setcookie "++ atom_to_list(Cookie) ++ - long_or_short() ++ atom_to_list(Node) ++ " " ++ Flags. - -% spawn node locally -spawn_local_node(Node, Options) -> - #options{env=Env,erl_flags=ErlFlags} = Options, - Cmd = get_cmd(Node, ErlFlags), - PortOpds = - [stream, {env, Env}] ++ - case Options#options.working_dir of - false -> []; - Dir -> [{cd, Dir}] - end, - spawn(fun() -> - Port = open_port({spawn, Cmd}, PortOpds), - Cwd = case Options#options.working_dir of - false -> - {ok, D} = file:get_cwd(), - D; - D -> D - end, - log_output(Node, Port, Cwd ++ "/stdio.log") - end). - -log_output(Node, Port, File) -> - receive - {Port, {data, Data}} -> - ct:log("Node ~p output:~n~s", [Node, Data]), - {ok, F} = file:open(File, [append]), - file:write(F, Data), - file:close(F), - log_output(Node, Port, File) - end. 
- - -% spawn node remotely -spawn_remote_node(Host, Node, Options) -> - #options{username=Username, - password=Password, - erl_flags=ErlFlags, - env=Env, - ssh_port=MaybeSSHPort, - ssh_opts=SSHOpts} = Options, - SSHPort = case MaybeSSHPort of - [] -> 22; % Use default SSH port - A -> A - end, - SSHOptions = case {Username, Password} of - {[], []}-> - []; - {_, []}-> - [{user, Username}]; - {_, _}-> - [{user, Username}, {password, Password}] - end ++ [{silently_accept_hosts, true}] ++ SSHOpts, - {ok, _} = application:ensure_all_started(ssh), - {ok, SSHConnRef} = ssh:connect(atom_to_list(Host), SSHPort, SSHOptions), - {ok, SSHChannelId} = ssh_connection:session_channel(SSHConnRef, infinity), - ssh_setenv(SSHConnRef, SSHChannelId, Env), - ssh_connection:exec(SSHConnRef, SSHChannelId, get_cmd(Node, ErlFlags), infinity). - -ssh_setenv(SSHConnRef, SSHChannelId, [{Var, Value} | Vars]) - when is_list(Var), is_list(Value) -> - success = ssh_connection:setenv(SSHConnRef, SSHChannelId, - Var, Value, infinity), - ssh_setenv(SSHConnRef, SSHChannelId, Vars); -ssh_setenv(_SSHConnRef, _SSHChannelId, []) -> ok. - -% call functions on a remote Erlang node -call_functions(_Node, []) -> - ok; -call_functions(Node, [{M, F, A}|Functions]) -> - rpc:call(Node, M, F, A), - call_functions(Node, Functions). - -% wait N seconds until node is pingable -wait_for_node_alive(_Node, 0) -> - pang; -wait_for_node_alive(Node, N) -> - timer:sleep(1000), - case net_adm:ping(Node) of - pong-> - pong; - pang-> - wait_for_node_alive(Node, N-1) - end. - -% call init:stop on a remote node -do_stop(ENode) -> - do_stop(ENode, fetch_options([])). -do_stop(ENode, Options) -> - {Cover,MainCoverNode} = - case test_server:is_cover() of - true -> - Main = cover:get_main_node(), - rpc:call(Main,cover,flush,[ENode]), - {true,Main}; - false -> - {false,undefined} - end, - spawn(ENode, init, stop, []), - StopTimeout = Options#options.stop_timeout, - case wait_for_node_dead(ENode, StopTimeout) of - {ok,ENode} -> - if Cover -> - %% To avoid that cover is started again if a node - %% with the same name is started later. - rpc:call(MainCoverNode,cover,stop,[ENode]); - true -> - ok - end, - {ok,ENode}; - Error -> - Error - end. - -% wait N seconds until node is disconnected -wait_for_node_dead(Node, 0) -> - {error, stop_timeout, Node}; -wait_for_node_dead(Node, N) -> - timer:sleep(1000), - case lists:member(Node, nodes()) of - true-> - wait_for_node_dead(Node, N-1); - false-> - {ok, Node} - end. 
diff --git a/test/utils/test_utils.erl b/test/utils/test_utils.erl index f06c94091..5ebe1ded3 100644 --- a/test/utils/test_utils.erl +++ b/test/utils/test_utils.erl @@ -108,6 +108,7 @@ at_init_testsuite() -> %% =========================================== start_node(Name, Config) -> + %% code path for compiled dependencies (ebin folders) CodePath = lists:filter(fun filelib:is_dir/1, code:get_path()), ct:log("Starting node ~p", [Name]), @@ -119,46 +120,73 @@ start_node(Name, Config) -> %% have the slave nodes monitor the runner node, so they can't outlive it NodeConfig = [ + %% have the slave nodes monitor the runner node, so they can't outlive it {monitor_master, true}, + {erl_flags, "-smp"}, %% smp for the eleveldb god - {cd, NodeDir}, - {startup_functions, [ - {code, set_path, [CodePath]} - ]}], - case ct_slave_ext:start(Name, NodeConfig) of + + %% set code path for dependencies + {startup_functions, [ {code, set_path, [CodePath]} ]}], + case ct_slave:start(Name, NodeConfig) of {ok, Node} -> % load application to allow for configuring the environment before starting ok = rpc:call(Node, application, load, [riak_core]), ok = rpc:call(Node, application, load, [antidote_stats]), ok = rpc:call(Node, application, load, [ranch]), ok = rpc:call(Node, application, load, [antidote]), + ok = rpc:call(Node, application, load, [lager]), + %% get remote working dir of node + {ok, NodeWorkingDir} = rpc:call(Node, file, get_cwd, []), - ct:log("Setting environment for riak"), - PlatformDir = NodeDir ++ "/data/", - RingDir = PlatformDir ++ "/ring/", - NumberOfVNodes = 4, - filelib:ensure_dir(PlatformDir), - filelib:ensure_dir(RingDir), - ok = rpc:call(Node, application, set_env, [riak_core, riak_state_dir, RingDir]), - ok = rpc:call(Node, application, set_env, [riak_core, ring_creation_size, NumberOfVNodes]), - ok = rpc:call(Node, application, set_env, [riak_core, platform_data_dir, PlatformDir]), - ok = rpc:call(Node, application, set_env, [riak_core, handoff_port, web_ports(Name) + 3]), + %% DATA DIRS + ok = rpc:call(Node, application, set_env, [antidote, data_dir, filename:join([NodeWorkingDir, Node, "antidote-data"])]), + ok = rpc:call(Node, application, set_env, [riak_core, ring_state_dir, filename:join([NodeWorkingDir, Node, "data"])]), + ok = rpc:call(Node, application, set_env, [riak_core, platform_data_dir, filename:join([NodeWorkingDir, Node, "data"])]), ok = rpc:call(Node, application, set_env, [riak_core, schema_dirs, [AntidoteFolder ++ "/_build/default/rel/antidote/lib/"]]), - ct:log("Setting environment for ranch"), - ok = rpc:call(Node, application, set_env, [ranch, pb_port, web_ports(Name) + 2]), - - ct:log("Setting environment for antidote_stats"), - ok = rpc:call(Node, application, set_env, [antidote_stats, metrics_port, web_ports(Name) + 4]), - ct:log("Setting environment for antidote"), - ok = rpc:call(Node, application, set_env, [antidote, pubsub_port, web_ports(Name) + 1]), - ok = rpc:call(Node, application, set_env, [antidote, logreader_port, web_ports(Name)]), + %% PORTS + Port = web_ports(Name), + ok = rpc:call(Node, application, set_env, [antidote, logreader_port, Port]), + ok = rpc:call(Node, application, set_env, [antidote, pubsub_port, Port + 1]), + ok = rpc:call(Node, application, set_env, [ranch, pb_port, Port + 2]), + ok = rpc:call(Node, application, set_env, [riak_core, handoff_port, Port + 3]), + ok = rpc:call(Node, application, set_env, [antidote_stats, metrics_port, Port + 4]), + + + %% LOGGING Configuration + %% add additional logging handlers to ensure easy access to 
remote node logs + %% for each logging level + LogRoot = filename:join([NodeWorkingDir, Node, "logs"]), + %% set the logger configuration + ok = rpc:call(Node, application, set_env, [antidote, logger, log_config(LogRoot)]), + %% set primary output level, no filter + rpc:call(Node, logger, set_primary_config, [level, all]), + %% load additional logger handlers at remote node + rpc:call(Node, logger, add_handlers, [antidote]), + + %% legacy lager folder until lager is removed + LagerRoot = filename:join([NodeWorkingDir, Node, "lager"]), + ok = rpc:call(Node, application, set_env, [lager, log_root, LagerRoot]), + ok = rpc:call(Node, application, set_env, [lager, error_logger_whitelist, [error_logger]]), + ok = rpc:call(Node, application, set_env, [lager, error_logger_redirect, false]), + ok = rpc:call(Node, application, set_env, [lager, crash_log, false]), + ok = rpc:call(Node, application, set_env, [lager, handlers, [{antidote_lager_backend, [debug]}]]), + ok = rpc:call(Node, application, set_env, [lager, logger, log_config(LogRoot)]), + + %% redirect slave logs to ct_master logs + ok = rpc:call(Node, application, set_env, [antidote, ct_master, node()]), + ConfLog = #{level => debug, formatter => {logger_formatter, #{single_line => true, max_size => 2048}}, config => #{type => standard_io}}, + _ = rpc:call(Node, logger, add_handler, [antidote_redirect_ct, ct_redirect_handler, ConfLog]), + + + %% ANTIDOTE Configuration + %% reduce number of actual log files created to 4, reduces start-up time of node + ok = rpc:call(Node, application, set_env, [riak_core, ring_creation_size, 4]), - ct:log("Start antidote"), {ok, _} = rpc:call(Node, application, ensure_all_started, [antidote]), - ct:pal("Node ~p started with ports ~p-~p", [Node, web_ports(Name), web_ports(Name)+4]), + ct:pal("Node ~p started with ports ~p-~p", [Node, Port, Port + 4]), Node; {error, already_started, Node} -> @@ -166,7 +194,7 @@ start_node(Name, Config) -> Node; {error, Reason, Node} -> ct:pal("Error starting node ~w, reason ~w, will retry", [Node, Reason]), - ct_slave_ext:stop(Name), + ct_slave:stop(Name), time_utils:wait_until_offline(Node), start_node(Name, Config) end. @@ -182,7 +210,7 @@ kill_and_restart_nodes(NodeList, Config) -> %% @doc Kills all given nodes, crashes if one node cannot be stopped -spec kill_nodes([node()]) -> [node()]. kill_nodes(NodeList) -> - lists:map(fun(Node) -> {ok, Name} = ct_slave_ext:stop(get_node_name(Node)), Name end, NodeList). + lists:map(fun(Node) -> {ok, Name} = ct_slave:stop(get_node_name(Node)), Name end, NodeList). %% @doc Send force kill signals to all given nodes @@ -195,7 +223,7 @@ brutal_kill_nodes(NodeList) -> %% kill -9 after X seconds just in case %% rpc:cast(Node, timer, apply_after, %% [?FORCE_KILL_TIMER, os, cmd, [io_lib:format("kill -9 ~s", [OSPidToKill])]]), - ct_slave_ext:stop(get_node_name(Node)), + ct_slave:stop(get_node_name(Node)), rpc:cast(Node, os, cmd, [io_lib:format("kill -15 ~s", [OSPidToKill])]), Node end, NodeList). @@ -467,3 +495,36 @@ bucket(BucketBaseAtom) -> Bucket = list_to_atom(atom_to_list(BucketBaseAtom) ++ BucketRandomSuffix), ct:log("Using random bucket: ~p", [Bucket]), Bucket. 
+ + +%% logger configuration for each level +%% see http://erlang.org/doc/man/logger.html +log_config(LogDir) -> + DebugConfig = #{level => debug, + formatter => {logger_formatter, #{single_line => true, max_size => 2048}}, + config => #{type => {file, filename:join(LogDir, "debug.log")}}}, + + InfoConfig = #{level => info, + formatter => {logger_formatter, #{single_line => true, max_size => 2048}}, + config => #{type => {file, filename:join(LogDir, "info.log")}} + }, + + NoticeConfig = #{level => notice, + formatter => {logger_formatter, #{single_line => true, max_size => 2048}}, + config => #{type => {file, filename:join(LogDir, "notice.log")}}}, + + WarningConfig = #{level => warning, + formatter => {logger_formatter, #{single_line => true, max_size => 2048}}, + config => #{type => {file, filename:join(LogDir, "warning.log")}}}, + + ErrorConfig = #{level => error, + formatter => {logger_formatter, #{single_line => true, max_size => 2048}}, + config => #{type => {file, filename:join(LogDir, "error.log")}}}, + + [ + {handler, debug_antidote, logger_std_h, DebugConfig}, + {handler, info_antidote, logger_std_h, InfoConfig}, + {handler, notice_antidote, logger_std_h, NoticeConfig}, + {handler, warning_antidote, logger_std_h, WarningConfig}, + {handler, error_antidote, logger_std_h, ErrorConfig} + ].
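
Note: the ct_redirect_handler module installed above via logger:add_handler/3 is not shown in this diff. A hypothetical sketch of such a handler (assumption: it formats each log event and forwards it to ct:log/2 on the ct_master node recorded in the antidote application environment) could look like this:

-module(ct_redirect_handler).   %% sketch only; the real module may differ
-export([log/2]).

%% logger handler callback: format the event with the configured formatter
%% and forward the resulting line to the common_test master node
log(LogEvent, #{formatter := {FModule, FConfig}}) ->
    {ok, Master} = application:get_env(antidote, ct_master),
    Line = FModule:format(LogEvent, FConfig),
    rpc:cast(Master, ct, log, ["~ts", [Line]]).

Using rpc:cast/4 rather than rpc:call/4 keeps logging on the slave non-blocking if the master is slow or unreachable.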