Skip to content

Commit

Permalink
[CORE] Don't update indexShard if it has been removed before
Browse files Browse the repository at this point in the history
Today we have logic that removes a shard from the indexservice if
the shard has changed ie. from replica to primary or if it's recovery
source vanished etc. This can cause shards from been not allocated at
all on a nodes causeing delete requests to timeout since we were waiting
for shards on nodes that got dropped due to a IndexShardMissingException

Closes #7509
  • Loading branch information
s1monw authored and areek committed Sep 8, 2014
1 parent 4856c58 commit 961d0c9
Show file tree
Hide file tree
Showing 2 changed files with 8 additions and 4 deletions.
Expand Up @@ -43,6 +43,7 @@
import org.elasticsearch.common.component.AbstractLifecycleComponent;
import org.elasticsearch.common.compress.CompressedString;
import org.elasticsearch.common.inject.Inject;
import org.elasticsearch.common.inject.Injector;
import org.elasticsearch.common.settings.Settings;
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
Expand Down Expand Up @@ -548,11 +549,13 @@ private void applyNewOrUpdatedShards(final ClusterChangedEvent event) throws Ela
// if the current and global routing are initializing, but are still not the same, its a different "shard" being allocated
// for example: a shard that recovers from one node and now needs to recover to another node,
// or a replica allocated and then allocating a primary because the primary failed on another node
boolean shardHasBeenRemoved = false;
if (currentRoutingEntry.initializing() && shardRouting.initializing() && !currentRoutingEntry.equals(shardRouting)) {
logger.debug("[{}][{}] removing shard (different instance of it allocated on this node, current [{}], global [{}])", shardRouting.index(), shardRouting.id(), currentRoutingEntry, shardRouting);
// cancel recovery just in case we are in recovery (its fine if we are not in recovery, it will be a noop).
recoveryTarget.cancelRecovery(indexShard);
indexService.removeShard(shardRouting.id(), "removing shard (different instance of it allocated on this node)");
shardHasBeenRemoved = true;
} else if (isPeerRecovery(shardRouting)) {
// check if there is an existing recovery going, and if so, and the source node is not the same, cancel the recovery to restart it
RecoveryStatus recoveryStatus = recoveryTarget.recoveryStatus(indexShard);
Expand All @@ -563,11 +566,12 @@ private void applyNewOrUpdatedShards(final ClusterChangedEvent event) throws Ela
logger.debug("[{}][{}] removing shard (recovery source changed), current [{}], global [{}])", shardRouting.index(), shardRouting.id(), currentRoutingEntry, shardRouting);
recoveryTarget.cancelRecovery(indexShard);
indexService.removeShard(shardRouting.id(), "removing shard (recovery source node changed)");
shardHasBeenRemoved = true;
}
}
}

if (!shardRouting.equals(indexShard.routingEntry())) {
if (shardHasBeenRemoved == false && !shardRouting.equals(indexShard.routingEntry())) {
// if we happen to remove the shardRouting by id above we don't need to jump in here!
indexShard.routingEntry(shardRouting);
indexService.shardInjectorSafe(shardId).getInstance(IndexShardGatewayService.class).routingStateChanged();
}
Expand Down
Expand Up @@ -99,9 +99,9 @@ public void testRandomDirectoryIOExceptions() throws IOException, InterruptedExc
.setSettings(settings)
.addMapping("type", mapping).execute().actionGet();
numInitialDocs = between(10, 100);
ensureYellow();
ensureGreen();
for (int i = 0; i < numInitialDocs ; i++) {
client().prepareIndex("test", "initial", "" + i).setTimeout(TimeValue.timeValueSeconds(1)).setSource("test", "init").get();
client().prepareIndex("test", "initial", "" + i).setSource("test", "init").get();
}
client().admin().indices().prepareRefresh("test").execute().get();
client().admin().indices().prepareFlush("test").setWaitIfOngoing(true).execute().get();
Expand Down

0 comments on commit 961d0c9

Please sign in to comment.