From 5286c186c844c750e1658c2a8d83dd209cd514eb Mon Sep 17 00:00:00 2001
From: Igor Motov <igor@motovs.org>
Date: Mon, 1 Jun 2015 13:54:31 -1000
Subject: [PATCH] Snapshot/Restore: sync up snapshot shard status on a master
 restart

When a snapshot operation on a particular shard finishes, the data node where this shard resides sends an update shard status request to the master node to indicate that the operation on the shard is done. When the master node receives the command it queues cluster state update task and acknowledges the receipt of the command to the data node.

The update snapshot shard status tasks have relatively low priority, so during cluster instability they tend to get stuck at the end of the queue. If the master node gets restarted before processing these tasks the information about the shards can be lost and the new master assumes that they are still in process while the data node thinks that these shards are already done.

 This commit add a retry mechanism that checks compares cluster state of a newly elected master and the current state of snapshot shards and updates the cluster state on the master again if needed.

Closes #11314
---
 .../snapshots/SnapshotsService.java           |  65 ++++++++-
 .../snapshots/AbstractSnapshotTests.java      | 129 ++++++++++++++++++
 .../DedicatedClusterSnapshotRestoreTests.java |  80 +++++++++++
 .../SharedClusterSnapshotRestoreTests.java    | 121 ----------------
 4 files changed, 269 insertions(+), 126 deletions(-)

diff --git a/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
index cdf068ceceea9..ae22fbfa055e1 100644
--- a/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
+++ b/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
@@ -549,13 +549,18 @@ public void clusterChanged(ClusterChangedEvent event) {
 
             if (prev == null) {
                 if (curr != null) {
-                    processIndexShardSnapshots(curr);
+                    processIndexShardSnapshots(event);
                 }
             } else {
                 if (!prev.equals(curr)) {
-                    processIndexShardSnapshots(curr);
+                    processIndexShardSnapshots(event);
                 }
             }
+            if (event.state().nodes().masterNodeId() != null &&
+                    event.state().nodes().masterNodeId().equals(event.previousState().nodes().masterNodeId()) == false) {
+                syncShardStatsOnNewMaster(event);
+            }
+
         } catch (Throwable t) {
             logger.warn("Failed to update snapshot state ", t);
         }
@@ -778,9 +783,10 @@ private boolean removedNodesCleanupNeeded(ClusterChangedEvent event) {
     /**
      * Checks if any new shards should be snapshotted on this node
      *
-     * @param snapshotMetaData snapshot metadata to be processed
+     * @param event cluster state changed event
      */
-    private void processIndexShardSnapshots(SnapshotMetaData snapshotMetaData) {
+    private void processIndexShardSnapshots(ClusterChangedEvent event) {
+        SnapshotMetaData snapshotMetaData = event.state().metaData().custom(SnapshotMetaData.TYPE);
         Map<SnapshotId, SnapshotShards> survivors = newHashMap();
         // First, remove snapshots that are no longer there
         for (Map.Entry<SnapshotId, SnapshotShards> entry : shardSnapshots.entrySet()) {
@@ -830,7 +836,17 @@ private void processIndexShardSnapshots(SnapshotMetaData snapshotMetaData) {
                         for (Map.Entry<ShardId, SnapshotMetaData.ShardSnapshotStatus> shard : entry.shards().entrySet()) {
                             IndexShardSnapshotStatus snapshotStatus = snapshotShards.shards.get(shard.getKey());
                             if (snapshotStatus != null) {
-                                snapshotStatus.abort();
+                                if (snapshotStatus.stage() == IndexShardSnapshotStatus.Stage.STARTED) {
+                                    snapshotStatus.abort();
+                                } else if (snapshotStatus.stage() == IndexShardSnapshotStatus.Stage.DONE) {
+                                    logger.debug("[{}] trying to cancel snapshot on the shard [{}] that is already done, updating status on the master", entry.snapshotId(), shard.getKey());
+                                    updateIndexShardSnapshotStatus(new UpdateIndexShardSnapshotStatusRequest(entry.snapshotId(), shard.getKey(),
+                                            new ShardSnapshotStatus(event.state().nodes().localNodeId(), SnapshotMetaData.State.SUCCESS)));
+                                } else if (snapshotStatus.stage() == IndexShardSnapshotStatus.Stage.FAILURE) {
+                                    logger.debug("[{}] trying to cancel snapshot on the shard [{}] that has already failed, updating status on the master", entry.snapshotId(), shard.getKey());
+                                    updateIndexShardSnapshotStatus(new UpdateIndexShardSnapshotStatusRequest(entry.snapshotId(), shard.getKey(),
+                                            new ShardSnapshotStatus(event.state().nodes().localNodeId(), State.FAILED, snapshotStatus.failure())));
+                                }
                             }
                         }
                     }
@@ -878,6 +894,45 @@ public void run() {
         }
     }
 
+    /**
+     * Checks if any shards were processed that the new master doesn't know about
+     * @param event
+     */
+    private void syncShardStatsOnNewMaster(ClusterChangedEvent event) {
+        SnapshotMetaData snapshotMetaData = event.state().getMetaData().custom(SnapshotMetaData.TYPE);
+        if (snapshotMetaData == null) {
+            return;
+        }
+        for (SnapshotMetaData.Entry snapshot : snapshotMetaData.entries()) {
+            if (snapshot.state() == State.STARTED || snapshot.state() == State.ABORTED) {
+                ImmutableMap<ShardId, IndexShardSnapshotStatus> localShards = currentSnapshotShards(snapshot.snapshotId());
+                if (localShards != null) {
+                    ImmutableMap<ShardId, ShardSnapshotStatus> masterShards = snapshot.shards();
+                    for(Map.Entry<ShardId, IndexShardSnapshotStatus> localShard : localShards.entrySet()) {
+                        ShardId shardId = localShard.getKey();
+                        IndexShardSnapshotStatus localShardStatus = localShard.getValue();
+                        ShardSnapshotStatus masterShard = masterShards.get(shardId);
+                        if (masterShard != null && masterShard.state().completed() == false) {
+                            // Master knows about the shard and thinks it has not completed
+                            if (localShardStatus.stage() == IndexShardSnapshotStatus.Stage.DONE) {
+                                // but we think the shard is done - we need to make new master know that the shard is done
+                                logger.debug("[{}] new master thinks the shard [{}] is not completed but the shard is done locally, updating status on the master", snapshot.snapshotId(), shardId);
+                                updateIndexShardSnapshotStatus(new UpdateIndexShardSnapshotStatusRequest(snapshot.snapshotId(), shardId,
+                                        new ShardSnapshotStatus(event.state().nodes().localNodeId(), SnapshotMetaData.State.SUCCESS)));
+                            } else if (localShard.getValue().stage() == IndexShardSnapshotStatus.Stage.FAILURE) {
+                                // but we think the shard failed - we need to make new master know that the shard failed
+                                logger.debug("[{}] new master thinks the shard [{}] is not completed but the shard failed locally, updating status on master", snapshot.snapshotId(), shardId);
+                                updateIndexShardSnapshotStatus(new UpdateIndexShardSnapshotStatusRequest(snapshot.snapshotId(), shardId,
+                                        new ShardSnapshotStatus(event.state().nodes().localNodeId(), State.FAILED, localShardStatus.failure())));
+
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
     /**
      * Updates the shard status
      *
diff --git a/src/test/java/org/elasticsearch/snapshots/AbstractSnapshotTests.java b/src/test/java/org/elasticsearch/snapshots/AbstractSnapshotTests.java
index 04ff667b3f3b0..233afe32dba04 100644
--- a/src/test/java/org/elasticsearch/snapshots/AbstractSnapshotTests.java
+++ b/src/test/java/org/elasticsearch/snapshots/AbstractSnapshotTests.java
@@ -22,8 +22,12 @@
 import com.google.common.collect.ImmutableList;
 
 import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
+import org.elasticsearch.action.admin.cluster.tasks.PendingClusterTasksResponse;
+import org.elasticsearch.cluster.*;
 import org.elasticsearch.cluster.metadata.SnapshotId;
 import org.elasticsearch.cluster.metadata.SnapshotMetaData;
+import org.elasticsearch.cluster.service.PendingClusterTask;
+import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.repositories.RepositoriesService;
@@ -37,9 +41,12 @@
 import java.nio.file.Path;
 import java.nio.file.SimpleFileVisitor;
 import java.nio.file.attribute.BasicFileAttributes;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 
 /**
  */
@@ -121,4 +128,126 @@ public static String blockNodeWithIndex(String index) {
     public static void unblockNode(String node) {
         ((MockRepository)internalCluster().getInstance(RepositoriesService.class, node).repository("test-repo")).unblock();
     }
+
+    protected void assertBusyPendingTasks(final String taskPrefix, final int expectedCount) throws Exception {
+        assertBusy(new Runnable() {
+            @Override
+            public void run() {
+                PendingClusterTasksResponse tasks = client().admin().cluster().preparePendingClusterTasks().get();
+                int count = 0;
+                for(PendingClusterTask task : tasks) {
+                    if (task.getSource().toString().startsWith(taskPrefix)) {
+                        count++;
+                    }
+                }
+                assertThat(count, greaterThanOrEqualTo(expectedCount));
+            }
+        }, 1, TimeUnit.MINUTES);
+    }
+
+    /**
+     * Cluster state task that blocks waits for the blockOn task to show up and then blocks execution not letting
+     * any cluster state update task to be performed unless they have priority higher then passThroughPriority.
+     *
+     * This class is useful to testing of cluster state update task batching for lower priority tasks.
+     */
+    protected class BlockingClusterStateListener implements ClusterStateListener {
+
+        private final Predicate<ClusterChangedEvent> blockOn;
+        private final Predicate<ClusterChangedEvent> countOn;
+        private final ClusterService clusterService;
+        private final CountDownLatch latch;
+        private final Priority passThroughPriority;
+        private int count;
+        private boolean timedOut;
+        private final TimeValue timeout;
+        private long stopWaitingAt = -1;
+
+        public BlockingClusterStateListener(ClusterService clusterService, String blockOn, String countOn, Priority passThroughPriority) {
+            this(clusterService, blockOn, countOn, passThroughPriority, TimeValue.timeValueSeconds(10));
+        }
+
+        public BlockingClusterStateListener(ClusterService clusterService, final String blockOn, final String countOn, Priority passThroughPriority, TimeValue timeout) {
+            this.clusterService = clusterService;
+            this.blockOn = new Predicate<ClusterChangedEvent>() {
+                @Override
+                public boolean apply(ClusterChangedEvent clusterChangedEvent) {
+                    return clusterChangedEvent.source().startsWith(blockOn);
+                }
+            };
+            this.countOn = new Predicate<ClusterChangedEvent>() {
+                @Override
+                public boolean apply(ClusterChangedEvent clusterChangedEvent) {
+                    return clusterChangedEvent.source().startsWith(countOn);
+                }
+            };
+            this.latch = new CountDownLatch(1);
+            this.passThroughPriority = passThroughPriority;
+            this.timeout = timeout;
+
+        }
+
+        public void unblock() {
+            latch.countDown();
+        }
+
+        @Override
+        public void clusterChanged(ClusterChangedEvent event) {
+            if (blockOn.apply(event)) {
+                logger.info("blocking cluster state tasks on [{}]", event.source());
+                assert stopWaitingAt < 0; // Make sure we are the first time here
+                stopWaitingAt = System.currentTimeMillis() + timeout.getMillis();
+                addBlock();
+            }
+            if (countOn.apply(event)) {
+                count++;
+            }
+        }
+
+        private void addBlock() {
+            // We should block after this task - add blocking cluster state update task
+            clusterService.submitStateUpdateTask("test_block", passThroughPriority, new ClusterStateUpdateTask() {
+                @Override
+                public ClusterState execute(ClusterState currentState) throws Exception {
+                    while(System.currentTimeMillis() < stopWaitingAt) {
+                        for (PendingClusterTask task : clusterService.pendingTasks()) {
+                            if (task.getSource().string().equals("test_block") == false && passThroughPriority.sameOrAfter(task.getPriority())) {
+                                // There are other higher priority tasks in the queue and let them pass through and then set the block again
+                                logger.info("passing through cluster state task {}", task.getSource());
+                                addBlock();
+                                return currentState;
+                            }
+                        }
+                        try {
+                            logger.info("wating....");
+                            if (latch.await(Math.min(100, timeout.millis()), TimeUnit.MILLISECONDS)){
+                                // Done waiting - unblock
+                                logger.info("unblocked");
+                                return currentState;
+                            }
+                            logger.info("done wating....");
+                        } catch (InterruptedException ex) {
+                            Thread.currentThread().interrupt();
+                        }
+                    }
+                    timedOut = true;
+                    return currentState;
+                }
+
+                @Override
+                public void onFailure(String source, Throwable t) {
+                    logger.warn("failed to execute [{}]", t, source);
+                }
+            });
+
+        }
+
+        public int count() {
+            return count;
+        }
+
+        public boolean timedOut() {
+            return timedOut;
+        }
+    }
 }
diff --git a/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreTests.java b/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreTests.java
index d36bd5231544d..2ab085039edaa 100644
--- a/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreTests.java
+++ b/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreTests.java
@@ -34,6 +34,7 @@
 import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;
 import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
 import org.elasticsearch.action.admin.indices.recovery.ShardRecoveryResponse;
+import org.elasticsearch.action.index.IndexRequestBuilder;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterService;
 import org.elasticsearch.cluster.ClusterState;
@@ -41,6 +42,7 @@
 import org.elasticsearch.cluster.AbstractDiffable;
 import org.elasticsearch.cluster.metadata.MetaData;
 import org.elasticsearch.cluster.metadata.MetaData.Custom;
+import org.elasticsearch.cluster.metadata.SnapshotMetaData;
 import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider;
 import org.elasticsearch.cluster.metadata.MetaDataIndexStateService;
 import org.elasticsearch.common.Nullable;
@@ -64,6 +66,7 @@
 import org.elasticsearch.snapshots.mockstore.MockRepositoryModule;
 import org.elasticsearch.snapshots.mockstore.MockRepositoryPlugin;
 import org.elasticsearch.test.InternalTestCluster;
+import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.test.rest.FakeRestRequest;
 import org.junit.Ignore;
 import org.junit.Test;
@@ -791,6 +794,83 @@ public void run() {
         logger.info("--> done");
     }
 
+    @Test
+    public void masterShutdownDuringSnapshotTest() throws Exception {
+
+        Settings masterSettings = settingsBuilder().put("node.data", false).build();
+        Settings dataSettings = settingsBuilder().put("node.master", false).build();
+
+        logger.info("-->  starting two master nodes and two data nodes");
+        internalCluster().startNode(masterSettings);
+        internalCluster().startNode(masterSettings);
+        internalCluster().startNode(dataSettings);
+        internalCluster().startNode(dataSettings);
+
+        final Client client = client();
+
+        logger.info("-->  creating repository");
+        assertAcked(client.admin().cluster().preparePutRepository("test-repo")
+                .setType("fs").setSettings(Settings.settingsBuilder()
+                        .put("location", randomRepoPath())
+                        .put("compress", randomBoolean())
+                        .put("chunk_size", randomIntBetween(100, 1000))));
+
+        assertAcked(prepareCreate("test-idx", 0, settingsBuilder().put("number_of_shards", between(1, 20))
+                .put("number_of_replicas", 0)));
+        ensureGreen();
+
+        logger.info("--> indexing some data");
+        final int numdocs = randomIntBetween(10, 100);
+        IndexRequestBuilder[] builders = new IndexRequestBuilder[numdocs];
+        for (int i = 0; i < builders.length; i++) {
+            builders[i] = client().prepareIndex("test-idx", "type1", Integer.toString(i)).setSource("field1", "bar " + i);
+        }
+        indexRandom(true, builders);
+        flushAndRefresh();
+
+        final int numberOfShards = getNumShards("test-idx").numPrimaries;
+        logger.info("number of shards: {}", numberOfShards);
+
+        final ClusterService clusterService = internalCluster().clusterService(internalCluster().getMasterName());
+        BlockingClusterStateListener snapshotListener = new BlockingClusterStateListener(clusterService, "update_snapshot [", "update snapshot state", Priority.HIGH);
+        try {
+            clusterService.addFirst(snapshotListener);
+            logger.info("--> snapshot");
+            dataNodeClient().admin().cluster().prepareCreateSnapshot("test-repo", "test-snap").setWaitForCompletion(false).setIndices("test-idx").get();
+
+            // Await until some updates are in pending state.
+            assertBusyPendingTasks("update snapshot state", 1);
+
+            logger.info("--> stopping master node");
+            internalCluster().stopCurrentMasterNode();
+
+            logger.info("--> stopping master node");
+            snapshotListener.unblock();
+
+            logger.info("--> wait until the snapshot is done");
+
+        } finally {
+            clusterService.remove(snapshotListener);
+        }
+
+        assertBusy(new Runnable() {
+            @Override
+            public void run() {
+                SnapshotsStatusResponse snapshotsStatusResponse = client().admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap").get();
+                ImmutableList<SnapshotStatus> snapshotStatuses = snapshotsStatusResponse.getSnapshots();
+                assertEquals(1, snapshotStatuses.size());
+                assertTrue(snapshotStatuses.get(0).getState().completed());
+            }
+        });
+
+        GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster().prepareGetSnapshots("test-repo").setSnapshots("test-snap").get();
+        SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
+        assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
+        assertEquals(snapshotInfo.totalShards(), snapshotInfo.successfulShards());
+        assertEquals(0, snapshotInfo.failedShards());
+    }
+
+
     private boolean snapshotIsDone(String repository, String snapshot) {
         try {
             SnapshotsStatusResponse snapshotsStatusResponse = client().admin().cluster().prepareSnapshotStatus(repository).setSnapshots(snapshot).get();
diff --git a/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreTests.java b/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreTests.java
index 1e8d45d369967..510402d78f4e4 100644
--- a/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreTests.java
+++ b/src/test/java/org/elasticsearch/snapshots/SharedClusterSnapshotRestoreTests.java
@@ -1894,125 +1894,4 @@ public void batchingShardUpdateTaskTest() throws Exception {
         assertEquals(1, restoreListener.count());
     }
 
-    private void assertBusyPendingTasks(final String taskPrefix, final int expectedCount) throws Exception {
-        assertBusy(new Runnable() {
-            @Override
-            public void run() {
-                PendingClusterTasksResponse tasks = client().admin().cluster().preparePendingClusterTasks().get();
-                int count = 0;
-                for(PendingClusterTask task : tasks) {
-                    if (task.getSource().toString().startsWith(taskPrefix)) {
-                        count++;
-                    }
-                }
-                assertThat(count, equalTo(expectedCount));
-            }
-        }, 1, TimeUnit.MINUTES);
-    }
-
-    /**
-     * Cluster state task that blocks waits for the blockOn task to show up and then blocks execution not letting
-     * any cluster state update task to be performed unless they have priority higher then passThroughPriority.
-     *
-     * This class is useful to testing of cluster state update task batching for lower priority tasks.
-     */
-    public class BlockingClusterStateListener implements ClusterStateListener {
-
-        private final Predicate<ClusterChangedEvent> blockOn;
-        private final Predicate<ClusterChangedEvent> countOn;
-        private final ClusterService clusterService;
-        private final CountDownLatch latch;
-        private final Priority passThroughPriority;
-        private int count;
-        private boolean timedOut;
-        private final TimeValue timeout;
-        private long stopWaitingAt = -1;
-
-        public BlockingClusterStateListener(ClusterService clusterService, String blockOn, String countOn, Priority passThroughPriority) {
-            this(clusterService, blockOn, countOn, passThroughPriority, TimeValue.timeValueSeconds(10));
-        }
-
-        public BlockingClusterStateListener(ClusterService clusterService, final String blockOn, final String countOn, Priority passThroughPriority, TimeValue timeout) {
-            this.clusterService = clusterService;
-            this.blockOn = new Predicate<ClusterChangedEvent>() {
-                @Override
-                public boolean apply(ClusterChangedEvent clusterChangedEvent) {
-                    return clusterChangedEvent.source().startsWith(blockOn);
-                }
-            };
-            this.countOn = new Predicate<ClusterChangedEvent>() {
-                @Override
-                public boolean apply(ClusterChangedEvent clusterChangedEvent) {
-                    return clusterChangedEvent.source().startsWith(countOn);
-                }
-            };
-            this.latch = new CountDownLatch(1);
-            this.passThroughPriority = passThroughPriority;
-            this.timeout = timeout;
-
-        }
-
-        public void unblock() {
-            latch.countDown();
-        }
-
-        @Override
-        public void clusterChanged(ClusterChangedEvent event) {
-            if (blockOn.apply(event)) {
-                logger.info("blocking cluster state tasks on [{}]", event.source());
-                assert stopWaitingAt < 0; // Make sure we are the first time here
-                stopWaitingAt = System.currentTimeMillis() + timeout.getMillis();
-                addBlock();
-            }
-            if (countOn.apply(event)) {
-                count++;
-            }
-        }
-
-        private void addBlock() {
-            // We should block after this task - add blocking cluster state update task
-            clusterService.submitStateUpdateTask("test_block", passThroughPriority, new ClusterStateUpdateTask() {
-                @Override
-                public ClusterState execute(ClusterState currentState) throws Exception {
-                    while(System.currentTimeMillis() < stopWaitingAt) {
-                        for (PendingClusterTask task : clusterService.pendingTasks()) {
-                            if (task.getSource().string().equals("test_block") == false && passThroughPriority.sameOrAfter(task.getPriority())) {
-                                // There are other higher priority tasks in the queue and let them pass through and then set the block again
-                                logger.info("passing through cluster state task {}", task.getSource());
-                                addBlock();
-                                return currentState;
-                            }
-                        }
-                        try {
-                            logger.info("wating....");
-                            if (latch.await(Math.min(100, timeout.millis()), TimeUnit.MILLISECONDS)){
-                                // Done waiting - unblock
-                                logger.info("unblocked");
-                                return currentState;
-                            }
-                            logger.info("done wating....");
-                        } catch (InterruptedException ex) {
-                            Thread.currentThread().interrupt();
-                        }
-                    }
-                    timedOut = true;
-                    return currentState;
-                }
-
-                @Override
-                public void onFailure(String source, Throwable t) {
-                    logger.warn("failed to execute [{}]", t, source);
-                }
-            });
-
-        }
-
-        public int count() {
-            return count;
-        }
-
-        public boolean timedOut() {
-            return timedOut;
-        }
-    }
 }