Fixes snapshot status on failed snapshots
If a snapshot is taken on multiple indices, some of which are "good"
indices that contain no corruption or failures and some of which are
"bad" indices that contain missing or corrupted shards, and the
snapshot request is set to partial=false (meaning do not take a
snapshot if there are any failures), then the good indices will not be
snapshotted either.  Previously, getting the status of such a snapshot
returned a 500 error, because the snap-*.dat blob for the shards of
the good indices could not be found.

This commit fixes the problem by reporting shards of good indices as
failed due to a failed snapshot, instead of throwing a
NoSuchFileException.

Closes elastic#23716
Ali Beyad committed Mar 30, 2017
1 parent a4b37bf commit 1a48dc3
Showing 2 changed files with 111 additions and 4 deletions.
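
For context, the failure mode is straightforward to reproduce from the Java client: snapshot one healthy index together with an index whose primary cannot be allocated, leave the request at partial=false, then ask for the snapshot status. The sketch below is illustrative only and is not part of the commit; the repository, index, and snapshot names are hypothetical, and it assumes a connected org.elasticsearch.client.Client plus a filesystem location permitted by path.repo.

// Minimal reproduction sketch (not part of this commit); names are hypothetical
// and a connected org.elasticsearch.client.Client is assumed.
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotsStatusResponse;
import org.elasticsearch.client.Client;
import org.elasticsearch.common.settings.Settings;

public class SnapshotStatusRepro {

    public static void reproduce(Client client) {
        // Register a filesystem repository (the location must be allowed by path.repo).
        client.admin().cluster()
            .preparePutRepository("repro-repo")
            .setType("fs")
            .setSettings(Settings.builder().put("location", "/tmp/repro-repo"))
            .get();

        // Snapshot a healthy index plus an index whose primary is unassigned,
        // with partial=false, so the whole snapshot fails.
        client.admin().cluster()
            .prepareCreateSnapshot("repro-repo", "repro-snap")
            .setIndices("good-idx", "bad-idx")
            .setPartial(false)
            .setWaitForCompletion(true)
            .get();

        // Before this commit, the status call below could fail with a 500
        // (NoSuchFileException for the healthy index's snap-*.dat blob);
        // after it, the healthy index's shards are reported as FAILED instead.
        SnapshotsStatusResponse response = client.admin().cluster()
            .prepareSnapshotStatus("repro-repo")
            .setSnapshots("repro-snap")
            .get();
        for (SnapshotStatus status : response.getSnapshots()) {
            System.out.println(status.getSnapshot() + " -> " + status.getState());
        }
    }
}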
@@ -60,6 +60,7 @@
import org.elasticsearch.common.unit.TimeValue;
import org.elasticsearch.index.Index;
import org.elasticsearch.index.shard.ShardId;
import org.elasticsearch.index.snapshots.IndexShardRestoreFailedException;
import org.elasticsearch.index.snapshots.IndexShardSnapshotStatus;
import org.elasticsearch.repositories.IndexId;
import org.elasticsearch.repositories.RepositoriesService;
@@ -70,6 +71,7 @@
import org.elasticsearch.threadpool.ThreadPool;

import java.io.IOException;
import java.nio.file.NoSuchFileException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
@@ -550,7 +552,8 @@ public List<SnapshotsInProgress.Entry> currentSnapshots(final String repository,
/**
* Returns the status of shards of currently finished snapshots
* <p>
* This method is executed on the master node and is complementary to the
* {@link SnapshotShardsService#currentSnapshotShards(Snapshot)} because it
* returns similar information, but for already finished snapshots.
* </p>
*
@@ -564,6 +567,8 @@ public Map<ShardId, IndexShardSnapshotStatus> snapshotShards(final String reposi
Repository repository = repositoriesService.repository(repositoryName);
RepositoryData repositoryData = repository.getRepositoryData();
MetaData metaData = repository.getSnapshotMetaData(snapshotInfo, repositoryData.resolveIndices(snapshotInfo.indices()));
Map<ShardId, IndexId> nonFailedShards = new HashMap<>();
boolean hasFailedShards = false;
for (String index : snapshotInfo.indices()) {
IndexId indexId = repositoryData.resolveIndexId(index);
IndexMetaData indexMetaData = metaData.indices().get(index);
@@ -577,14 +582,42 @@ public Map<ShardId, IndexShardSnapshotStatus> snapshotShards(final String reposi
shardSnapshotStatus.updateStage(IndexShardSnapshotStatus.Stage.FAILURE);
shardSnapshotStatus.failure(shardFailure.reason());
shardStatus.put(shardId, shardSnapshotStatus);
hasFailedShards = true;
} else {
nonFailedShards.put(shardId, indexId);
}
}
}
}
for (Map.Entry<ShardId, IndexId> shard : nonFailedShards.entrySet()) {
final ShardId shardId = shard.getKey();
final IndexId indexId = shard.getValue();
IndexShardSnapshotStatus shardSnapshotStatus;
try {
shardSnapshotStatus = repository.getShardSnapshotStatus(
snapshotInfo.snapshotId(), snapshotInfo.version(), indexId, shardId);
} catch (IndexShardRestoreFailedException e) {
if (e.getCause() != null
&& e.getCause() instanceof NoSuchFileException
&& hasFailedShards) {
// The snap-*.dat file could not be found for the index even though
// there was no shard failure for it. This can happen when the
// snapshot is *not* set to partial (it won't take partial snapshots)
// and includes indices with missing or corrupted shards; in that
// case the snapshot does not proceed for any of the indices, even
// the ones that have no shard failures.
shardSnapshotStatus = new IndexShardSnapshotStatus();
shardSnapshotStatus.updateStage(IndexShardSnapshotStatus.Stage.FAILURE);
String msg = "snapshot not taken because partial snapshots are not " +
"configured for this snapshot and another shard caused the " +
"snapshot to fail";
shardSnapshotStatus.failure(msg);
} else {
throw e;
}
}
shardStatus.put(shardId, shardSnapshotStatus);
}
return unmodifiableMap(shardStatus);
}

@@ -54,6 +54,7 @@
import org.elasticsearch.cluster.metadata.MappingMetaData;
import org.elasticsearch.cluster.metadata.MetaDataIndexStateService;
import org.elasticsearch.cluster.routing.IndexRoutingTable;
import org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider;
import org.elasticsearch.cluster.service.ClusterService;
import org.elasticsearch.common.Priority;
import org.elasticsearch.common.Strings;
@@ -2737,4 +2738,77 @@ public void testSnapshotSucceedsAfterSnapshotFailure() throws Exception {
assertEquals(SnapshotState.SUCCESS, getSnapshotsResponse.getSnapshots().get(0).state());
}

public void testSnapshotStatusOnFailedIndex() throws Exception {
logger.info("--> creating repository");
final Path repoPath = randomRepoPath();
final Client client = client();
assertAcked(client.admin().cluster()
.preparePutRepository("test-repo")
.setType("fs")
.setVerify(false)
.setSettings(Settings.builder().put("location", repoPath)));

logger.info("--> creating good index");
assertAcked(prepareCreate("test-idx-good")
.setSettings(Settings.builder()
.put(SETTING_NUMBER_OF_SHARDS, 1)
.put(SETTING_NUMBER_OF_REPLICAS, 0)));
ensureGreen();
final int numDocs = randomIntBetween(1, 5);
for (int i = 0; i < numDocs; i++) {
index("test-idx-good", "doc", Integer.toString(i), "foo", "bar" + i);
}
refresh();

logger.info("--> creating bad index");
assertAcked(prepareCreate("test-idx-bad")
.setWaitForActiveShards(ActiveShardCount.NONE)
.setSettings(Settings.builder()
.put(SETTING_NUMBER_OF_SHARDS, 1)
.put(SETTING_NUMBER_OF_REPLICAS, 0)
// set shard allocation to none so the primary cannot be
// allocated - simulates a "bad" index that fails to snapshot
.put(EnableAllocationDecider.INDEX_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(),
"none")));

logger.info("--> snapshot bad index and get status");
client.admin().cluster()
.prepareCreateSnapshot("test-repo", "test-snap1")
.setWaitForCompletion(true)
.setIndices("test-idx-bad")
.get();
SnapshotsStatusResponse snapshotsStatusResponse = client.admin().cluster()
.prepareSnapshotStatus("test-repo")
.setSnapshots("test-snap1")
.get();
assertEquals(1, snapshotsStatusResponse.getSnapshots().size());
assertEquals(State.FAILED, snapshotsStatusResponse.getSnapshots().get(0).getState());

logger.info("--> snapshot both good and bad index and get status");
client.admin().cluster()
.prepareCreateSnapshot("test-repo", "test-snap2")
.setWaitForCompletion(true)
.setIndices("test-idx-good", "test-idx-bad")
.get();
snapshotsStatusResponse = client.admin().cluster()
.prepareSnapshotStatus("test-repo")
.setSnapshots("test-snap2")
.get();
assertEquals(1, snapshotsStatusResponse.getSnapshots().size());
// verify a FAILED status is returned instead of a 500 status code
// see https://github.com/elastic/elasticsearch/issues/23716
SnapshotStatus snapshotStatus = snapshotsStatusResponse.getSnapshots().get(0);
assertEquals(State.FAILED, snapshotStatus.getState());
for (SnapshotIndexShardStatus shardStatus : snapshotStatus.getShards()) {
assertEquals(SnapshotIndexShardStage.FAILURE, shardStatus.getStage());
if (shardStatus.getIndex().equals("test-idx-good")) {
String msg = "snapshot not taken because partial snapshots are not " +
"configured for this snapshot and another shard caused the " +
"snapshot to fail";
assertEquals(msg, shardStatus.getFailure());
} else {
assertEquals("primary shard is not allocated", shardStatus.getFailure());
}
}
}
}
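
After this change, callers of the snapshot status API can treat the shards of the good indices uniformly as failures. The sketch below, which reuses the status classes exercised in the test above, shows one way consuming code might separate shards that genuinely failed from shards that were only skipped because another shard forced a non-partial snapshot to fail; matching on the failure message is an assumption made for illustration, not a stable API contract.

// Sketch only: distinguishes shards that genuinely failed from shards that were
// skipped because another shard forced a non-partial snapshot to fail.
// Matching on the failure message is a heuristic, not a stable API contract.
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotIndexShardStage;
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotIndexShardStatus;
import org.elasticsearch.action.admin.cluster.snapshots.status.SnapshotStatus;

public final class ShardFailureReport {

    public static void print(SnapshotStatus snapshotStatus) {
        for (SnapshotIndexShardStatus shard : snapshotStatus.getShards()) {
            if (shard.getStage() != SnapshotIndexShardStage.FAILURE) {
                continue;
            }
            String failure = shard.getFailure();
            boolean skippedByOtherFailure = failure != null
                && failure.contains("partial snapshots are not configured");
            System.out.println(shard.getShardId() + " failed"
                + (skippedByOtherFailure ? " (skipped: non-partial snapshot)" : ": " + failure));
        }
    }
}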
