elastic · imotov · Jun 3, 2015 · Jun 1, 2015 · s1monw · Jun 3, 2015
diff --git a/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java b/src/main/java/org/elasticsearch/snapshots/SnapshotsService.java
@@ -549,13 +549,18 @@ public void clusterChanged(ClusterChangedEvent event) {
 
             if (prev == null) {
                 if (curr != null) {
-                    processIndexShardSnapshots(curr);
+                    processIndexShardSnapshots(event);
                 }
             } else {
                 if (!prev.equals(curr)) {
-                    processIndexShardSnapshots(curr);
+                    processIndexShardSnapshots(event);
                 }
             }
+            if (event.state().nodes().masterNodeId() != null &&
+                    event.state().nodes().masterNodeId().equals(event.previousState().nodes().masterNodeId()) == false) {
+                syncShardStatsOnNewMaster(event);
+            }
+
         } catch (Throwable t) {
             logger.warn("Failed to update snapshot state ", t);
         }
@@ -778,9 +783,10 @@ private boolean removedNodesCleanupNeeded(ClusterChangedEvent event) {
     /**
      * Checks if any new shards should be snapshotted on this node
      *
-     * @param snapshotMetaData snapshot metadata to be processed
+     * @param event cluster state changed event
      */
-    private void processIndexShardSnapshots(SnapshotMetaData snapshotMetaData) {
+    private void processIndexShardSnapshots(ClusterChangedEvent event) {
+        SnapshotMetaData snapshotMetaData = event.state().metaData().custom(SnapshotMetaData.TYPE);
         Map<SnapshotId, SnapshotShards> survivors = newHashMap();
         // First, remove snapshots that are no longer there
         for (Map.Entry<SnapshotId, SnapshotShards> entry : shardSnapshots.entrySet()) {
@@ -830,7 +836,17 @@ private void processIndexShardSnapshots(SnapshotMetaData snapshotMetaData) {
                         for (Map.Entry<ShardId, SnapshotMetaData.ShardSnapshotStatus> shard : entry.shards().entrySet()) {
                             IndexShardSnapshotStatus snapshotStatus = snapshotShards.shards.get(shard.getKey());
                             if (snapshotStatus != null) {
-                                snapshotStatus.abort();
+                                if (snapshotStatus.stage() == IndexShardSnapshotStatus.Stage.STARTED) {
+                                    snapshotStatus.abort();
+                                } else if (snapshotStatus.stage() == IndexShardSnapshotStatus.Stage.DONE) {
+                                    logger.debug("[{}] trying to cancel snapshot on the shard [{}] that is already done, updating status on the master", entry.snapshotId(), shard.getKey());
+                                    updateIndexShardSnapshotStatus(new UpdateIndexShardSnapshotStatusRequest(entry.snapshotId(), shard.getKey(),
+                                            new ShardSnapshotStatus(event.state().nodes().localNodeId(), SnapshotMetaData.State.SUCCESS)));
+                                } else if (snapshotStatus.stage() == IndexShardSnapshotStatus.Stage.FAILURE) {
+                                    logger.debug("[{}] trying to cancel snapshot on the shard [{}] that has already failed, updating status on the master", entry.snapshotId(), shard.getKey());
+                                    updateIndexShardSnapshotStatus(new UpdateIndexShardSnapshotStatusRequest(entry.snapshotId(), shard.getKey(),
+                                            new ShardSnapshotStatus(event.state().nodes().localNodeId(), State.FAILED, snapshotStatus.failure())));
+                                }
                             }
                         }
                     }
@@ -878,6 +894,45 @@ public void run() {
         }
     }
 
+    /**
+     * Checks if any shards were processed that the new master doesn't know about
+     * @param event
+     */
+    private void syncShardStatsOnNewMaster(ClusterChangedEvent event) {
+        SnapshotMetaData snapshotMetaData = event.state().getMetaData().custom(SnapshotMetaData.TYPE);
+        if (snapshotMetaData == null) {
+            return;
+        }
+        for (SnapshotMetaData.Entry snapshot : snapshotMetaData.entries()) {
+            if (snapshot.state() == State.STARTED || snapshot.state() == State.ABORTED) {
+                ImmutableMap<ShardId, IndexShardSnapshotStatus> localShards = currentSnapshotShards(snapshot.snapshotId());
+                if (localShards != null) {
+                    ImmutableMap<ShardId, ShardSnapshotStatus> masterShards = snapshot.shards();
+                    for(Map.Entry<ShardId, IndexShardSnapshotStatus> localShard : localShards.entrySet()) {
+                        ShardId shardId = localShard.getKey();
+                        IndexShardSnapshotStatus localShardStatus = localShard.getValue();
+                        ShardSnapshotStatus masterShard = masterShards.get(shardId);
+                        if (masterShard != null && masterShard.state().completed() == false) {
+                            // Master knows about the shard and thinks it has not completed
+                            if (localShardStatus.stage() == IndexShardSnapshotStatus.Stage.DONE) {
+                                // but we think the shard is done - we need to make new master know that the shard is done
+                                logger.debug("[{}] new master thinks the shard [{}] is not completed but the shard is done locally, updating status on the master", snapshot.snapshotId(), shardId);
+                                updateIndexShardSnapshotStatus(new UpdateIndexShardSnapshotStatusRequest(snapshot.snapshotId(), shardId,
+                                        new ShardSnapshotStatus(event.state().nodes().localNodeId(), SnapshotMetaData.State.SUCCESS)));
+                            } else if (localShard.getValue().stage() == IndexShardSnapshotStatus.Stage.FAILURE) {
+                                // but we think the shard failed - we need to make new master know that the shard failed
+                                logger.debug("[{}] new master thinks the shard [{}] is not completed but the shard failed locally, updating status on master", snapshot.snapshotId(), shardId);
+                                updateIndexShardSnapshotStatus(new UpdateIndexShardSnapshotStatusRequest(snapshot.snapshotId(), shardId,
+                                        new ShardSnapshotStatus(event.state().nodes().localNodeId(), State.FAILED, localShardStatus.failure())));
+
+                            }
+                        }
+                    }
+                }
+            }
+        }
+    }
+
     /**
      * Updates the shard status
      *

diff --git a/src/test/java/org/elasticsearch/snapshots/AbstractSnapshotTests.java b/src/test/java/org/elasticsearch/snapshots/AbstractSnapshotTests.java
@@ -22,8 +22,12 @@
 import com.google.common.collect.ImmutableList;
 
 import org.elasticsearch.action.admin.cluster.state.ClusterStateResponse;
+import org.elasticsearch.action.admin.cluster.tasks.PendingClusterTasksResponse;
+import org.elasticsearch.cluster.*;
 import org.elasticsearch.cluster.metadata.SnapshotId;
 import org.elasticsearch.cluster.metadata.SnapshotMetaData;
+import org.elasticsearch.cluster.service.PendingClusterTask;
+import org.elasticsearch.common.Priority;
 import org.elasticsearch.common.settings.Settings;
 import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.repositories.RepositoriesService;
@@ -37,9 +41,12 @@
 import java.nio.file.Path;
 import java.nio.file.SimpleFileVisitor;
 import java.nio.file.attribute.BasicFileAttributes;
+import java.util.concurrent.CountDownLatch;
+import java.util.concurrent.TimeUnit;
 import java.util.concurrent.atomic.AtomicInteger;
 
 import static org.hamcrest.Matchers.equalTo;
+import static org.hamcrest.Matchers.greaterThanOrEqualTo;
 
 /**
  */
@@ -121,4 +128,128 @@ public static String blockNodeWithIndex(String index) {
     public static void unblockNode(String node) {
         ((MockRepository)internalCluster().getInstance(RepositoriesService.class, node).repository("test-repo")).unblock();
     }
+
+    protected void assertBusyPendingTasks(final String taskPrefix, final int expectedCount) throws Exception {
+        assertBusy(new Runnable() {
+            @Override
+            public void run() {
+                PendingClusterTasksResponse tasks = client().admin().cluster().preparePendingClusterTasks().get();
+                int count = 0;
+                for(PendingClusterTask task : tasks) {
+                    if (task.getSource().toString().startsWith(taskPrefix)) {
+                        count++;
+                    }
+                }
+                assertThat(count, greaterThanOrEqualTo(expectedCount));
+            }
+        }, 1, TimeUnit.MINUTES);
+    }
+
+    /**
+     * Cluster state task that blocks waits for the blockOn task to show up and then blocks execution not letting
+     * any cluster state update task to be performed unless they have priority higher then passThroughPriority.
+     *
+     * This class is useful to testing of cluster state update task batching for lower priority tasks.
+     */
+    protected class BlockingClusterStateListener implements ClusterStateListener {
+
+        private final Predicate<ClusterChangedEvent> blockOn;
+        private final Predicate<ClusterChangedEvent> countOn;
+        private final ClusterService clusterService;
+        private final CountDownLatch latch;
+        private final Priority passThroughPriority;
+        private int count;
+        private boolean timedOut;
+        private final TimeValue timeout;
+        private long stopWaitingAt = -1;
+
+        public BlockingClusterStateListener(ClusterService clusterService, String blockOn, String countOn, Priority passThroughPriority) {
+            this(clusterService, blockOn, countOn, passThroughPriority, TimeValue.timeValueMinutes(1));
+        }
+
+        public BlockingClusterStateListener(ClusterService clusterService, final String blockOn, final String countOn, Priority passThroughPriority, TimeValue timeout) {
+            this.clusterService = clusterService;
+            this.blockOn = new Predicate<ClusterChangedEvent>() {
+                @Override
+                public boolean apply(ClusterChangedEvent clusterChangedEvent) {
+                    return clusterChangedEvent.source().startsWith(blockOn);
+                }
+            };
+            this.countOn = new Predicate<ClusterChangedEvent>() {
+                @Override
+                public boolean apply(ClusterChangedEvent clusterChangedEvent) {
+                    return clusterChangedEvent.source().startsWith(countOn);
+                }
+            };
+            this.latch = new CountDownLatch(1);
+            this.passThroughPriority = passThroughPriority;
+            this.timeout = timeout;
+
+        }
+
+        public void unblock() {
+            latch.countDown();
+        }
+
+        @Override
+        public void clusterChanged(ClusterChangedEvent event) {
+            if (blockOn.apply(event)) {
+                logger.info("blocking cluster state tasks on [{}]", event.source());
+                assert stopWaitingAt < 0; // Make sure we are the first time here
+                stopWaitingAt = System.currentTimeMillis() + timeout.getMillis();
+                addBlock();
+            }
+            if (countOn.apply(event)) {
+                count++;
+            }
+        }
+
+        private void addBlock() {
+            // We should block after this task - add blocking cluster state update task
+            clusterService.submitStateUpdateTask("test_block", passThroughPriority, new ClusterStateUpdateTask() {
+                @Override
+                public ClusterState execute(ClusterState currentState) throws Exception {
+                    while(System.currentTimeMillis() < stopWaitingAt) {
+                        for (PendingClusterTask task : clusterService.pendingTasks()) {
+                            if (task.getSource().string().equals("test_block") == false && passThroughPriority.sameOrAfter(task.getPriority())) {
+                                // There are other higher priority tasks in the queue and let them pass through and then set the block again
+                                logger.info("passing through cluster state task {}", task.getSource());
+                                addBlock();
+                                return currentState;
+                            }
+                        }
+                        try {
+                            logger.info("waiting....");
+                            if (latch.await(Math.min(100, timeout.millis()), TimeUnit.MILLISECONDS)){
+                                // Done waiting - unblock
+                                logger.info("unblocked");
+                                return currentState;
+                            }
+                            logger.info("done waiting....");
+                        } catch (InterruptedException ex) {
+                            logger.info("interrupted....");
+                            Thread.currentThread().interrupt();
+                            return currentState;
+                        }
+                    }
+                    timedOut = true;
+                    return currentState;
+                }
+
+                @Override
+                public void onFailure(String source, Throwable t) {
+                    logger.warn("failed to execute [{}]", t, source);
+                }
+            });
+
+        }
+
+        public int count() {
+            return count;
+        }
+
+        public boolean timedOut() {
+            return timedOut;
+        }
+    }
 }
diff --git a/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreTests.java b/src/test/java/org/elasticsearch/snapshots/DedicatedClusterSnapshotRestoreTests.java
@@ -34,13 +34,15 @@
 import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;
 import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
 import org.elasticsearch.action.admin.indices.recovery.ShardRecoveryResponse;
+import org.elasticsearch.action.index.IndexRequestBuilder;
 import org.elasticsearch.client.Client;
 import org.elasticsearch.cluster.ClusterService;
 import org.elasticsearch.cluster.ClusterState;
 import org.elasticsearch.cluster.ProcessedClusterStateUpdateTask;
 import org.elasticsearch.cluster.AbstractDiffable;
 import org.elasticsearch.cluster.metadata.MetaData;
 import org.elasticsearch.cluster.metadata.MetaData.Custom;
+import org.elasticsearch.cluster.metadata.SnapshotMetaData;
 import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider;
 import org.elasticsearch.cluster.metadata.MetaDataIndexStateService;
 import org.elasticsearch.common.Nullable;
@@ -64,6 +66,7 @@
 import org.elasticsearch.snapshots.mockstore.MockRepositoryModule;
 import org.elasticsearch.snapshots.mockstore.MockRepositoryPlugin;
 import org.elasticsearch.test.InternalTestCluster;
+import org.elasticsearch.test.junit.annotations.TestLogging;
 import org.elasticsearch.test.rest.FakeRestRequest;
 import org.junit.Ignore;
 import org.junit.Test;
@@ -791,6 +794,83 @@ public void run() {
         logger.info("--> done");
     }
 
+    @Test
+    public void masterShutdownDuringSnapshotTest() throws Exception {
+
+        Settings masterSettings = settingsBuilder().put("node.data", false).build();
+        Settings dataSettings = settingsBuilder().put("node.master", false).build();
+
+        logger.info("-->  starting two master nodes and two data nodes");
+        internalCluster().startNode(masterSettings);
+        internalCluster().startNode(masterSettings);
+        internalCluster().startNode(dataSettings);
+        internalCluster().startNode(dataSettings);
+
+        final Client client = client();
+
+        logger.info("-->  creating repository");
+        assertAcked(client.admin().cluster().preparePutRepository("test-repo")
+                .setType("fs").setSettings(Settings.settingsBuilder()
+                        .put("location", randomRepoPath())
+                        .put("compress", randomBoolean())
+                        .put("chunk_size", randomIntBetween(100, 1000))));
+
+        assertAcked(prepareCreate("test-idx", 0, settingsBuilder().put("number_of_shards", between(1, 20))
+                .put("number_of_replicas", 0)));
+        ensureGreen();
+
+        logger.info("--> indexing some data");
+        final int numdocs = randomIntBetween(10, 100);
+        IndexRequestBuilder[] builders = new IndexRequestBuilder[numdocs];
+        for (int i = 0; i < builders.length; i++) {
+            builders[i] = client().prepareIndex("test-idx", "type1", Integer.toString(i)).setSource("field1", "bar " + i);
+        }
+        indexRandom(true, builders);
+        flushAndRefresh();
+
+        final int numberOfShards = getNumShards("test-idx").numPrimaries;
+        logger.info("number of shards: {}", numberOfShards);
+
+        final ClusterService clusterService = internalCluster().clusterService(internalCluster().getMasterName());
+        BlockingClusterStateListener snapshotListener = new BlockingClusterStateListener(clusterService, "update_snapshot [", "update snapshot state", Priority.HIGH);
+        try {
+            clusterService.addFirst(snapshotListener);
+            logger.info("--> snapshot");
+            dataNodeClient().admin().cluster().prepareCreateSnapshot("test-repo", "test-snap").setWaitForCompletion(false).setIndices("test-idx").get();
+
+            // Await until some updates are in pending state.
+            assertBusyPendingTasks("update snapshot state", 1);
+
+            logger.info("--> stopping master node");
+            internalCluster().stopCurrentMasterNode();
+
+            logger.info("--> unblocking snapshot execution");
+            snapshotListener.unblock();
+
+            logger.info("--> wait until the snapshot is done");
+
+        } finally {
+            clusterService.remove(snapshotListener);
+        }
+
+        assertBusy(new Runnable() {
+            @Override
+            public void run() {
+                SnapshotsStatusResponse snapshotsStatusResponse = client().admin().cluster().prepareSnapshotStatus("test-repo").setSnapshots("test-snap").get();
+                ImmutableList<SnapshotStatus> snapshotStatuses = snapshotsStatusResponse.getSnapshots();
+                assertEquals(1, snapshotStatuses.size());
+                assertTrue(snapshotStatuses.get(0).getState().completed());
+            }
+        });
+
+        GetSnapshotsResponse snapshotsStatusResponse = client().admin().cluster().prepareGetSnapshots("test-repo").setSnapshots("test-snap").get();
+        SnapshotInfo snapshotInfo = snapshotsStatusResponse.getSnapshots().get(0);
+        assertEquals(SnapshotState.SUCCESS, snapshotInfo.state());
+        assertEquals(snapshotInfo.totalShards(), snapshotInfo.successfulShards());
+        assertEquals(0, snapshotInfo.failedShards());
+    }
+
+
     private boolean snapshotIsDone(String repository, String snapshot) {
         try {
             SnapshotsStatusResponse snapshotsStatusResponse = client().admin().cluster().prepareSnapshotStatus(repository).setSnapshots(snapshot).get();