Skip to content

Commit

Permalink
Resiliency: Master election should demotes nodes which try to join th…
Browse files Browse the repository at this point in the history
…e cluster for the first time

With the change in elastic#7493,  we introduced a pinging round when a master nodes goes down. That pinging round helps validating the current state of the cluster and takes, by default, 3 seconds. It may be that during that window, a new node tries to join the cluster and starts pinging (this is typical when you quickly restart the current master).  If this node gets elected as the new master it will force recovery from the gateway (it has no in memory cluster state), which in turn will cause a full cluster shard synchronisation. While this is not a problem on it's own, it's a shame. This commit demotes "new" nodes during master election so the will only be elected if really needed.

Closes elastic#7558
  • Loading branch information
bleskes committed Sep 16, 2014
1 parent 0254360 commit 1f39d43
Show file tree
Hide file tree
Showing 10 changed files with 233 additions and 86 deletions.
62 changes: 48 additions & 14 deletions src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
Expand Up @@ -52,6 +52,7 @@
import org.elasticsearch.discovery.zen.fd.MasterFaultDetection;
import org.elasticsearch.discovery.zen.fd.NodesFaultDetection;
import org.elasticsearch.discovery.zen.membership.MembershipAction;
import org.elasticsearch.discovery.zen.ping.PingContextProvider;
import org.elasticsearch.discovery.zen.ping.ZenPing;
import org.elasticsearch.discovery.zen.ping.ZenPingService;
import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction;
Expand All @@ -69,14 +70,15 @@
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import static com.google.common.collect.Lists.newArrayList;
import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;

/**
*
*/
public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implements Discovery, DiscoveryNodesProvider {
public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implements Discovery, PingContextProvider {

public final static String SETTING_REJOIN_ON_MASTER_GONE = "discovery.zen.rejoin_on_master_gone";
public final static String SETTING_PING_TIMEOUT = "discovery.zen.ping.timeout";
Expand Down Expand Up @@ -139,6 +141,9 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen

private volatile boolean rejoinOnMasterGone;

/** counts the time this node has joined the cluster or have elected it self as master */
private final AtomicLong clusterJoinsCounter = new AtomicLong();

@Nullable
private NodeService nodeService;

Expand Down Expand Up @@ -194,7 +199,7 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
this.nodesFD.addListener(new NodeFaultDetectionListener());

this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewClusterStateListener(), discoverySettings, clusterName);
this.pingService.setNodesProvider(this);
this.pingService.setPingContextProvider(this);
this.membership = new MembershipAction(settings, clusterService, transportService, this, new MembershipListener());

transportService.registerHandler(DISCOVERY_REJOIN_ACTION_NAME, new RejoinClusterRequestHandler());
Expand Down Expand Up @@ -290,6 +295,7 @@ public String nodeDescription() {
return clusterName.value() + "/" + localNode.id();
}

/** start of {@link org.elasticsearch.discovery.zen.ping.PingContextProvider } implementation */
@Override
public DiscoveryNodes nodes() {
DiscoveryNodes latestNodes = this.latestDiscoNodes;
Expand All @@ -305,6 +311,14 @@ public NodeService nodeService() {
return this.nodeService;
}

@Override
public boolean nodeHasJoinedClusterOnce() {
return clusterJoinsCounter.get() > 0;
}

/** end of {@link org.elasticsearch.discovery.zen.ping.PingContextProvider } implementation */


@Override
public void publish(ClusterState clusterState, AckListener ackListener) {
if (!master) {
Expand Down Expand Up @@ -387,6 +401,8 @@ public void onFailure(String source, Throwable t) {
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
sendInitialStateEventIfNeeded();
long count = clusterJoinsCounter.incrementAndGet();
logger.trace("cluster joins counter set to [{}] (elected as master)", count);
}
});
} else {
Expand All @@ -404,8 +420,8 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
}

masterFD.start(masterNode, "initial_join");
// no need to submit the received cluster state, we will get it from the master when it publishes
// the fact that we joined
long count = clusterJoinsCounter.incrementAndGet();
logger.trace("cluster joins counter set to [{}] (joined master)", count);
}
}
}
Expand Down Expand Up @@ -922,7 +938,7 @@ private DiscoveryNode findMaster() {
sb.append(" {none}");
} else {
for (ZenPing.PingResponse pingResponse : fullPingResponses) {
sb.append("\n\t--> ").append("target [").append(pingResponse.target()).append("], master [").append(pingResponse.master()).append("]");
sb.append("\n\t--> ").append(pingResponse);
}
}
logger.trace(sb.toString());
Expand All @@ -931,7 +947,7 @@ private DiscoveryNode findMaster() {
// filter responses
List<ZenPing.PingResponse> pingResponses = Lists.newArrayList();
for (ZenPing.PingResponse pingResponse : fullPingResponses) {
DiscoveryNode node = pingResponse.target();
DiscoveryNode node = pingResponse.node();
if (masterElectionFilterClientNodes && (node.clientNode() || (!node.masterNode() && !node.dataNode()))) {
// filter out the client node, which is a client node, or also one that is not data and not master (effectively, client)
} else if (masterElectionFilterDataNodes && (!node.masterNode() && node.dataNode())) {
Expand All @@ -947,7 +963,7 @@ private DiscoveryNode findMaster() {
sb.append(" {none}");
} else {
for (ZenPing.PingResponse pingResponse : pingResponses) {
sb.append("\n\t--> ").append("target [").append(pingResponse.target()).append("], master [").append(pingResponse.master()).append("]");
sb.append("\n\t--> ").append(pingResponse);
}
}
logger.debug(sb.toString());
Expand All @@ -963,20 +979,38 @@ private DiscoveryNode findMaster() {
}
}

Set<DiscoveryNode> possibleMasterNodes = Sets.newHashSet();
// nodes discovered during pinging
Set<DiscoveryNode> activeNodes = Sets.newHashSet();
// nodes discovered who has previously been part of the cluster and do not ping for the very first time
Set<DiscoveryNode> joinedOnceActiveNodes = Sets.newHashSet();
if (localNode.masterNode()) {
possibleMasterNodes.add(localNode);
activeNodes.add(localNode);
long joinsCounter = clusterJoinsCounter.get();
if (joinsCounter > 0) {
logger.trace("adding local node to the list of active nodes who has previously joined the cluster (joins counter is [{}})", joinsCounter);
joinedOnceActiveNodes.add(localNode);
}
}
for (ZenPing.PingResponse pingResponse : pingResponses) {
possibleMasterNodes.add(pingResponse.target());
activeNodes.add(pingResponse.node());
if (pingResponse.hasJoinedOnce()) {
joinedOnceActiveNodes.add(pingResponse.node());
}
}

if (pingMasters.isEmpty()) {
// if we don't have enough master nodes, we bail, because there are not enough master to elect from
if (electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
return electMaster.electMaster(possibleMasterNodes);
if (electMaster.hasEnoughMasterNodes(activeNodes)) {
// we give preference to nodes who have previously already joined the cluster. Those will
// have a cluster state in memory, including an up to date routing table (which is not persistent to disk
// by the gateway)
DiscoveryNode master = electMaster.electMaster(joinedOnceActiveNodes);
if (master != null) {
return master;
}
return electMaster.electMaster(activeNodes);
} else {
logger.trace("not enough master nodes [{}]", possibleMasterNodes);
// if we don't have enough master nodes, we bail, because there are not enough master to elect from
logger.trace("not enough master nodes [{}]", activeNodes);
return null;
}
} else {
Expand Down
@@ -0,0 +1,32 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.discovery.zen.ping;

import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;

/**
*
*/
public interface PingContextProvider extends DiscoveryNodesProvider {

/** return true if this node has previously joined the cluster at least once. False if this is first join */
boolean nodeHasJoinedClusterOnce();

}
51 changes: 40 additions & 11 deletions src/main/java/org/elasticsearch/discovery/zen/ping/ZenPing.java
Expand Up @@ -20,14 +20,14 @@
package org.elasticsearch.discovery.zen.ping;

import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.component.LifecycleComponent;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Streamable;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;

import java.io.IOException;

Expand All @@ -39,7 +39,7 @@
*/
public interface ZenPing extends LifecycleComponent<ZenPing> {

void setNodesProvider(DiscoveryNodesProvider nodesProvider);
void setPingContextProvider(PingContextProvider contextProvider);

void ping(PingListener listener, TimeValue timeout) throws ElasticsearchException;

Expand All @@ -49,36 +49,52 @@ public interface PingListener {
}

public static class PingResponse implements Streamable {

public static final PingResponse[] EMPTY = new PingResponse[0];

private ClusterName clusterName;

private DiscoveryNode target;
private DiscoveryNode node;

private DiscoveryNode master;

private boolean hasJoinedOnce;

private PingResponse() {
}

public PingResponse(DiscoveryNode target, DiscoveryNode master, ClusterName clusterName) {
this.target = target;
/**
* @param node the node which this ping describes
* @param master the current master of the node
* @param clusterName the cluster name of the node
* @param hasJoinedOnce true if the joined has successfully joined the cluster before
*/
public PingResponse(DiscoveryNode node, DiscoveryNode master, ClusterName clusterName, boolean hasJoinedOnce) {
this.node = node;
this.master = master;
this.clusterName = clusterName;
this.hasJoinedOnce = hasJoinedOnce;
}

public ClusterName clusterName() {
return this.clusterName;
}

public DiscoveryNode target() {
return target;
/** the node which this ping describes */
public DiscoveryNode node() {
return node;
}

/** the current master of the node */
public DiscoveryNode master() {
return master;
}

/** true if the joined has successfully joined the cluster before */
public boolean hasJoinedOnce() {
return hasJoinedOnce;
}

public static PingResponse readPingResponse(StreamInput in) throws IOException {
PingResponse response = new PingResponse();
response.readFrom(in);
Expand All @@ -88,27 +104,40 @@ public static PingResponse readPingResponse(StreamInput in) throws IOException {
@Override
public void readFrom(StreamInput in) throws IOException {
clusterName = readClusterName(in);
target = readNode(in);
node = readNode(in);
if (in.readBoolean()) {
master = readNode(in);
}
if (in.getVersion().onOrAfter(Version.V_1_4_0)) {
this.hasJoinedOnce = in.readBoolean();
} else {
// As of 1.4.0 we prefer to elect nodes which have previously successfully joined the cluster.
// Nodes before 1.4.0 do not take this into consideration. If pre<1.4.0 node elects it self as master
// based on the pings, we need to make sure we do the same. We therefore can not demote it
// and thus mark it as if it has previously joined.
this.hasJoinedOnce = true;
}

}

@Override
public void writeTo(StreamOutput out) throws IOException {
clusterName.writeTo(out);
target.writeTo(out);
node.writeTo(out);
if (master == null) {
out.writeBoolean(false);
} else {
out.writeBoolean(true);
master.writeTo(out);
}
if (out.getVersion().onOrAfter(Version.V_1_4_0)) {
out.writeBoolean(hasJoinedOnce);
}
}

@Override
public String toString() {
return "ping_response{target [" + target + "], master [" + master + "], cluster_name[" + clusterName.value() + "]}";
return "ping_response{node [" + node + "], master [" + master + "], hasJoinedOnce [" + hasJoinedOnce + "], cluster_name[" + clusterName.value() + "]}";
}
}
}
Expand Up @@ -33,7 +33,6 @@
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
import org.elasticsearch.common.util.concurrent.EsRejectedExecutionException;
import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;
import org.elasticsearch.discovery.zen.elect.ElectMasterService;
import org.elasticsearch.discovery.zen.ping.multicast.MulticastZenPing;
import org.elasticsearch.discovery.zen.ping.unicast.UnicastHostsProvider;
Expand Down Expand Up @@ -92,12 +91,12 @@ public void zenPings(ImmutableList<? extends ZenPing> pings) {
}

@Override
public void setNodesProvider(DiscoveryNodesProvider nodesProvider) {
public void setPingContextProvider(PingContextProvider contextProvider) {
if (lifecycle.started()) {
throw new ElasticsearchIllegalStateException("Can't set nodes provider when started");
}
for (ZenPing zenPing : zenPings) {
zenPing.setNodesProvider(nodesProvider);
zenPing.setPingContextProvider(contextProvider);
}
}

Expand Down Expand Up @@ -172,7 +171,7 @@ private CompoundPingListener(PingListener listener, ImmutableList<? extends ZenP
public void onPing(PingResponse[] pings) {
if (pings != null) {
for (PingResponse pingResponse : pings) {
responses.put(pingResponse.target(), pingResponse);
responses.put(pingResponse.node(), pingResponse);
}
}
if (counter.decrementAndGet() == 0) {
Expand Down

0 comments on commit 1f39d43

Please sign in to comment.