Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Discovery: back port #7558 to 1.x and add bwc protections of the new ping on master gone introduced in #7493 #7694

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
9 changes: 8 additions & 1 deletion src/main/java/org/elasticsearch/Version.java
Expand Up @@ -402,6 +402,13 @@ public static Version smallest(Version version1, Version version2) {
return version1.id < version2.id ? version1 : version2;
}

/**
* Returns the largest version between the 2.
*/
public static Version largest(Version version1, Version version2) {
return version1.id > version2.id ? version1 : version2;
}

/**
* Returns the version given its string representation, current version if the argument is null or empty
*/
Expand Down Expand Up @@ -434,7 +441,7 @@ public static Version fromString(String version) {

return fromId(major + minor + revision + build);

} catch(NumberFormatException e) {
} catch (NumberFormatException e) {
throw new IllegalArgumentException("unable to parse version " + version, e);
}
}
Expand Down
82 changes: 67 additions & 15 deletions src/main/java/org/elasticsearch/discovery/zen/ZenDiscovery.java
Expand Up @@ -52,6 +52,7 @@
import org.elasticsearch.discovery.zen.fd.MasterFaultDetection;
import org.elasticsearch.discovery.zen.fd.NodesFaultDetection;
import org.elasticsearch.discovery.zen.membership.MembershipAction;
import org.elasticsearch.discovery.zen.ping.PingContextProvider;
import org.elasticsearch.discovery.zen.ping.ZenPing;
import org.elasticsearch.discovery.zen.ping.ZenPingService;
import org.elasticsearch.discovery.zen.publish.PublishClusterStateAction;
Expand All @@ -69,14 +70,15 @@
import java.util.concurrent.CopyOnWriteArrayList;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.atomic.AtomicLong;

import static com.google.common.collect.Lists.newArrayList;
import static org.elasticsearch.common.unit.TimeValue.timeValueSeconds;

/**
*
*/
public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implements Discovery, DiscoveryNodesProvider {
public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implements Discovery, PingContextProvider {

public final static String SETTING_REJOIN_ON_MASTER_GONE = "discovery.zen.rejoin_on_master_gone";
public final static String SETTING_PING_TIMEOUT = "discovery.zen.ping.timeout";
Expand Down Expand Up @@ -139,6 +141,9 @@ public class ZenDiscovery extends AbstractLifecycleComponent<Discovery> implemen

private volatile boolean rejoinOnMasterGone;

/** counts the time this node has joined the cluster or have elected it self as master */
private final AtomicLong clusterJoinsCounter = new AtomicLong();

@Nullable
private NodeService nodeService;

Expand Down Expand Up @@ -194,7 +199,7 @@ public ZenDiscovery(Settings settings, ClusterName clusterName, ThreadPool threa
this.nodesFD.addListener(new NodeFaultDetectionListener());

this.publishClusterState = new PublishClusterStateAction(settings, transportService, this, new NewClusterStateListener(), discoverySettings, clusterName);
this.pingService.setNodesProvider(this);
this.pingService.setPingContextProvider(this);
this.membership = new MembershipAction(settings, clusterService, transportService, this, new MembershipListener());

transportService.registerHandler(DISCOVERY_REJOIN_ACTION_NAME, new RejoinClusterRequestHandler());
Expand Down Expand Up @@ -290,6 +295,7 @@ public String nodeDescription() {
return clusterName.value() + "/" + localNode.id();
}

/** start of {@link org.elasticsearch.discovery.zen.ping.PingContextProvider } implementation */
@Override
public DiscoveryNodes nodes() {
DiscoveryNodes latestNodes = this.latestDiscoNodes;
Expand All @@ -305,6 +311,14 @@ public NodeService nodeService() {
return this.nodeService;
}

@Override
public boolean nodeHasJoinedClusterOnce() {
return clusterJoinsCounter.get() > 0;
}

/** end of {@link org.elasticsearch.discovery.zen.ping.PingContextProvider } implementation */


@Override
public void publish(ClusterState clusterState, AckListener ackListener) {
if (!master) {
Expand Down Expand Up @@ -387,6 +401,8 @@ public void onFailure(String source, Throwable t) {
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
sendInitialStateEventIfNeeded();
long count = clusterJoinsCounter.incrementAndGet();
logger.trace("cluster joins counter set to [{}] (elected as master)", count);
}
});
} else {
Expand All @@ -404,8 +420,8 @@ public void clusterStateProcessed(String source, ClusterState oldState, ClusterS
}

masterFD.start(masterNode, "initial_join");
// no need to submit the received cluster state, we will get it from the master when it publishes
// the fact that we joined
long count = clusterJoinsCounter.incrementAndGet();
logger.trace("cluster joins counter set to [{}] (joined master)", count);
}
}
}
Expand Down Expand Up @@ -604,7 +620,16 @@ public ClusterState execute(ClusterState currentState) {
processNewClusterStates.drainTo(pendingNewClusterStates);
logger.trace("removed [{}] pending cluster states", pendingNewClusterStates.size());

if (rejoinOnMasterGone) {
boolean rejoin = true;
if (!rejoinOnMasterGone) {
logger.debug("not rejoining cluster due to rejoinOnMasterGone [{}]", rejoinOnMasterGone);
rejoin = false;
} else if (discoveryNodes.smallestNonClientNodeVersion().before(Version.V_1_4_0_Beta)) {
logger.debug("not rejoining cluster due a minimum node version of [{}]", discoveryNodes.smallestNonClientNodeVersion());
rejoin = false;
}

if (rejoin) {
return rejoin(ClusterState.builder(currentState).nodes(discoveryNodes).build(), "master left (reason = " + reason + ")");
}

Expand Down Expand Up @@ -922,7 +947,7 @@ private DiscoveryNode findMaster() {
sb.append(" {none}");
} else {
for (ZenPing.PingResponse pingResponse : fullPingResponses) {
sb.append("\n\t--> ").append("target [").append(pingResponse.target()).append("], master [").append(pingResponse.master()).append("]");
sb.append("\n\t--> ").append(pingResponse);
}
}
logger.trace(sb.toString());
Expand All @@ -931,7 +956,7 @@ private DiscoveryNode findMaster() {
// filter responses
List<ZenPing.PingResponse> pingResponses = Lists.newArrayList();
for (ZenPing.PingResponse pingResponse : fullPingResponses) {
DiscoveryNode node = pingResponse.target();
DiscoveryNode node = pingResponse.node();
if (masterElectionFilterClientNodes && (node.clientNode() || (!node.masterNode() && !node.dataNode()))) {
// filter out the client node, which is a client node, or also one that is not data and not master (effectively, client)
} else if (masterElectionFilterDataNodes && (!node.masterNode() && node.dataNode())) {
Expand All @@ -947,7 +972,7 @@ private DiscoveryNode findMaster() {
sb.append(" {none}");
} else {
for (ZenPing.PingResponse pingResponse : pingResponses) {
sb.append("\n\t--> ").append("target [").append(pingResponse.target()).append("], master [").append(pingResponse.master()).append("]");
sb.append("\n\t--> ").append(pingResponse);
}
}
logger.debug(sb.toString());
Expand All @@ -963,20 +988,47 @@ private DiscoveryNode findMaster() {
}
}

Set<DiscoveryNode> possibleMasterNodes = Sets.newHashSet();
// nodes discovered during pinging
Set<DiscoveryNode> activeNodes = Sets.newHashSet();
// nodes discovered who has previously been part of the cluster and do not ping for the very first time
Set<DiscoveryNode> joinedOnceActiveNodes = Sets.newHashSet();
if (localNode.masterNode()) {
possibleMasterNodes.add(localNode);
activeNodes.add(localNode);
long joinsCounter = clusterJoinsCounter.get();
if (joinsCounter > 0) {
logger.trace("adding local node to the list of active nodes who has previously joined the cluster (joins counter is [{}})", joinsCounter);
joinedOnceActiveNodes.add(localNode);
}
}

Version minimumPingVersion = localNode.version();
for (ZenPing.PingResponse pingResponse : pingResponses) {
possibleMasterNodes.add(pingResponse.target());
activeNodes.add(pingResponse.node());
minimumPingVersion = Version.smallest(pingResponse.node().version(), minimumPingVersion);
if (pingResponse.hasJoinedOnce() != null && pingResponse.hasJoinedOnce()) {
assert pingResponse.node().getVersion().onOrAfter(Version.V_1_4_0_Beta) : "ping version [" + pingResponse.node().version() + "]< 1.4.0 while having hasJoinedOnce == true";
joinedOnceActiveNodes.add(pingResponse.node());
}
}

if (minimumPingVersion.before(Version.V_1_4_0_Beta)) {
logger.trace("ignoring joined once flags in ping responses, minimum ping version [{}]", minimumPingVersion);
joinedOnceActiveNodes.clear();
}

if (pingMasters.isEmpty()) {
// if we don't have enough master nodes, we bail, because there are not enough master to elect from
if (electMaster.hasEnoughMasterNodes(possibleMasterNodes)) {
return electMaster.electMaster(possibleMasterNodes);
if (electMaster.hasEnoughMasterNodes(activeNodes)) {
// we give preference to nodes who have previously already joined the cluster. Those will
// have a cluster state in memory, including an up to date routing table (which is not persistent to disk
// by the gateway)
DiscoveryNode master = electMaster.electMaster(joinedOnceActiveNodes);
if (master != null) {
return master;
}
return electMaster.electMaster(activeNodes);
} else {
logger.trace("not enough master nodes [{}]", possibleMasterNodes);
// if we don't have enough master nodes, we bail, because there are not enough master to elect from
logger.trace("not enough master nodes [{}]", activeNodes);
return null;
}
} else {
Expand Down
@@ -0,0 +1,32 @@
/*
* Licensed to Elasticsearch under one or more contributor
* license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright
* ownership. Elasticsearch licenses this file to you under
* the Apache License, Version 2.0 (the "License"); you may
* not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.elasticsearch.discovery.zen.ping;

import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;

/**
*
*/
public interface PingContextProvider extends DiscoveryNodesProvider {

/** return true if this node has previously joined the cluster at least once. False if this is first join */
boolean nodeHasJoinedClusterOnce();

}
57 changes: 46 additions & 11 deletions src/main/java/org/elasticsearch/discovery/zen/ping/ZenPing.java
Expand Up @@ -20,14 +20,15 @@
package org.elasticsearch.discovery.zen.ping;

import org.elasticsearch.ElasticsearchException;
import org.elasticsearch.Version;
import org.elasticsearch.cluster.ClusterName;
import org.elasticsearch.cluster.node.DiscoveryNode;
import org.elasticsearch.common.Nullable;
import org.elasticsearch.common.component.LifecycleComponent;
import org.elasticsearch.common.io.stream.StreamInput;
import org.elasticsearch.common.io.stream.StreamOutput;
import org.elasticsearch.common.io.stream.Streamable;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.discovery.zen.DiscoveryNodesProvider;

import java.io.IOException;

Expand All @@ -39,7 +40,7 @@
*/
public interface ZenPing extends LifecycleComponent<ZenPing> {

void setNodesProvider(DiscoveryNodesProvider nodesProvider);
void setPingContextProvider(PingContextProvider contextProvider);

void ping(PingListener listener, TimeValue timeout) throws ElasticsearchException;

Expand All @@ -49,36 +50,54 @@ public interface PingListener {
}

public static class PingResponse implements Streamable {

public static final PingResponse[] EMPTY = new PingResponse[0];

private ClusterName clusterName;

private DiscoveryNode target;
private DiscoveryNode node;

private DiscoveryNode master;

@Nullable
private Boolean hasJoinedOnce;

private PingResponse() {
}

public PingResponse(DiscoveryNode target, DiscoveryNode master, ClusterName clusterName) {
this.target = target;
/**
* @param node the node which this ping describes
* @param master the current master of the node
* @param clusterName the cluster name of the node
* @param hasJoinedOnce true if the joined has successfully joined the cluster before
*/
public PingResponse(DiscoveryNode node, DiscoveryNode master, ClusterName clusterName, boolean hasJoinedOnce) {
this.node = node;
this.master = master;
this.clusterName = clusterName;
this.hasJoinedOnce = hasJoinedOnce;
}

public ClusterName clusterName() {
return this.clusterName;
}

public DiscoveryNode target() {
return target;
/** the node which this ping describes */
public DiscoveryNode node() {
return node;
}

/** the current master of the node */
public DiscoveryNode master() {
return master;
}

/** true if the node has successfully joined the cluster before, null for nodes with a <1.4.0 version */
@Nullable
public Boolean hasJoinedOnce() {
return hasJoinedOnce;
}

public static PingResponse readPingResponse(StreamInput in) throws IOException {
PingResponse response = new PingResponse();
response.readFrom(in);
Expand All @@ -88,27 +107,43 @@ public static PingResponse readPingResponse(StreamInput in) throws IOException {
@Override
public void readFrom(StreamInput in) throws IOException {
clusterName = readClusterName(in);
target = readNode(in);
node = readNode(in);
if (in.readBoolean()) {
master = readNode(in);
}
if (in.getVersion().onOrAfter(Version.V_1_4_0_Beta)) {
if (in.readBoolean()) {
this.hasJoinedOnce = in.readBoolean();
}
} else {
this.hasJoinedOnce = null;
}

}

@Override
public void writeTo(StreamOutput out) throws IOException {
clusterName.writeTo(out);
target.writeTo(out);
node.writeTo(out);
if (master == null) {
out.writeBoolean(false);
} else {
out.writeBoolean(true);
master.writeTo(out);
}
if (out.getVersion().onOrAfter(Version.V_1_4_0_Beta)) {
if (hasJoinedOnce != null) {
out.writeBoolean(true);
out.writeBoolean(hasJoinedOnce);
} else {
out.writeBoolean(false);
}
}
}

@Override
public String toString() {
return "ping_response{target [" + target + "], master [" + master + "], cluster_name[" + clusterName.value() + "]}";
return "ping_response{node [" + node + "], master [" + master + "], hasJoinedOnce [" + hasJoinedOnce + "], cluster_name[" + clusterName.value() + "]}";
}
}
}