[CORE] Don't update indexShard if it has been removed before

Today we have logic that removes a shard from the indexservice if the shard has changed ie. from replica to primary or if it's recovery source vanished etc. This can cause shards from been not allocated at all on a nodes causeing delete requests to timeout since we were waiting for shards on nodes that got dropped due to a IndexShardMissingException Closes #7509
elastic · Sep 8, 2014 · 961d0c9 · 961d0c9
1 parent 4856c58
commit 961d0c9
Show file tree

Hide file tree

Showing 2 changed files with 8 additions and 4 deletions.
diff --git a/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java b/src/main/java/org/elasticsearch/indices/cluster/IndicesClusterStateService.java
@@ -43,6 +43,7 @@
 import org.elasticsearch.common.component.AbstractLifecycleComponent;
 import org.elasticsearch.common.compress.CompressedString;
 import org.elasticsearch.common.inject.Inject;
+import org.elasticsearch.common.inject.Injector;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
@@ -548,11 +549,13 @@ private void applyNewOrUpdatedShards(final ClusterChangedEvent event) throws Ela
                 // if the current and global routing are initializing, but are still not the same, its a different "shard" being allocated
                 // for example: a shard that recovers from one node and now needs to recover to another node,
                 //              or a replica allocated and then allocating a primary because the primary failed on another node
+                boolean shardHasBeenRemoved = false;
                 if (currentRoutingEntry.initializing() && shardRouting.initializing() && !currentRoutingEntry.equals(shardRouting)) {
                     logger.debug("[{}][{}] removing shard (different instance of it allocated on this node, current [{}], global [{}])", shardRouting.index(), shardRouting.id(), currentRoutingEntry, shardRouting);
                     // cancel recovery just in case we are in recovery (its fine if we are not in recovery, it will be a noop).
                     recoveryTarget.cancelRecovery(indexShard);
                     indexService.removeShard(shardRouting.id(), "removing shard (different instance of it allocated on this node)");
+                    shardHasBeenRemoved = true;
                 } else if (isPeerRecovery(shardRouting)) {
                     // check if there is an existing recovery going, and if so, and the source node is not the same, cancel the recovery to restart it
                     RecoveryStatus recoveryStatus = recoveryTarget.recoveryStatus(indexShard);
@@ -563,11 +566,12 @@ private void applyNewOrUpdatedShards(final ClusterChangedEvent event) throws Ela
                             logger.debug("[{}][{}] removing shard (recovery source changed), current [{}], global [{}])", shardRouting.index(), shardRouting.id(), currentRoutingEntry, shardRouting);
                             recoveryTarget.cancelRecovery(indexShard);
                             indexService.removeShard(shardRouting.id(), "removing shard (recovery source node changed)");
+                            shardHasBeenRemoved = true;
                         }
                     }
                 }
-
-                if (!shardRouting.equals(indexShard.routingEntry())) {
+                if (shardHasBeenRemoved == false && !shardRouting.equals(indexShard.routingEntry())) {
+                    // if we happen to remove the shardRouting by id above we don't need to jump in here!
                     indexShard.routingEntry(shardRouting);
                     indexService.shardInjectorSafe(shardId).getInstance(IndexShardGatewayService.class).routingStateChanged();
                 }

diff --git a/src/test/java/org/elasticsearch/search/basic/SearchWithRandomExceptionsTests.java b/src/test/java/org/elasticsearch/search/basic/SearchWithRandomExceptionsTests.java
@@ -99,9 +99,9 @@ public void testRandomDirectoryIOExceptions() throws IOException, InterruptedExc
                     .setSettings(settings)
                     .addMapping("type", mapping).execute().actionGet();
             numInitialDocs = between(10, 100);
-            ensureYellow();
+            ensureGreen();
             for (int i = 0; i < numInitialDocs ; i++) {
-                client().prepareIndex("test", "initial", "" + i).setTimeout(TimeValue.timeValueSeconds(1)).setSource("test", "init").get();
+                client().prepareIndex("test", "initial", "" + i).setSource("test", "init").get();
             }
             client().admin().indices().prepareRefresh("test").execute().get();
             client().admin().indices().prepareFlush("test").setWaitIfOngoing(true).execute().get();