Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BitFlip fault injection workload #11264

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
10 changes: 6 additions & 4 deletions fdbclient/include/fdbclient/CommitTransaction.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,8 @@
// The versioned message has wire format : -1, version, messages
static const int32_t VERSION_HEADER = -1;

extern Severity getBitFlipSeverityType();

static const char* typeString[] = { "SetValue",
"ClearRange",
"AddValue",
Expand Down Expand Up @@ -200,7 +202,7 @@ struct MutationRef {
// This operation must happen after the acs index, if one exists, has been removed
void offloadChecksum() {
if (this->checksum.present()) {
TraceEvent(SevError, "MutationRefUnexpectedError")
TraceEvent(getBitFlipSeverityType(), "MutationRefUnexpectedError")
.setMaxFieldLength(-1)
.setMaxEventLength(-1)
.detail("Reason", "Internal checksum has been set when offloading checksum")
Expand Down Expand Up @@ -306,7 +308,7 @@ struct MutationRef {
// Calculate crc based on type and param1 and param2 and compare the crc with this->checksum
bool validateChecksum() const {
if (this->corrupted) {
TraceEvent(SevError, "MutationRefUnexpectedError")
TraceEvent(getBitFlipSeverityType(), "MutationRefUnexpectedError")
.setMaxFieldLength(-1)
.setMaxEventLength(-1)
.detail("Reason", "Mutation has been marked as corrupted")
Expand All @@ -320,7 +322,7 @@ struct MutationRef {
crc = crc32c_append(crc, this->param1.begin(), this->param1.size());
crc = crc32c_append(crc, this->param2.begin(), this->param2.size());
if (crc != static_cast<uint32_t>(this->checksum.get())) {
TraceEvent(SevError, "MutationRefUnexpectedError")
TraceEvent(getBitFlipSeverityType(), "MutationRefUnexpectedError")
.setMaxFieldLength(-1)
.setMaxEventLength(-1)
.detail("Reason", "Mutation checksum mismatch")
Expand Down Expand Up @@ -394,7 +396,7 @@ struct MutationRef {
param1 = param2.substr(0, param2.size() - 1);
}
if (!validateChecksum()) {
TraceEvent(SevError, "MutationRefCorruptionDetected")
TraceEvent(getBitFlipSeverityType(), "MutationRefCorruptionDetected")
.setMaxFieldLength(-1)
.setMaxEventLength(-1)
.detail("Mutation", this->toString());
Expand Down
16 changes: 16 additions & 0 deletions fdbrpc/include/fdbrpc/simulator.h
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,13 @@ struct MachineInfo;

constexpr double DISABLE_CONNECTION_FAILURE_FOREVER = 1e6;

// Flip a random bit in the data for error injection ONLY in simulation.
// `file` and `line` identify the injecting call site; use INJECT_BIT_FLIP rather than
// calling this directly so they are filled in automatically.
extern void flip_bit(StringRef data, const char* file, int line);

// Severity to use for checksum/corruption trace events. The definition visible in
// sim2.actor.cpp returns SevWarn once the simulator has recorded an injected bit flip
// and SevError otherwise.
extern Severity getBitFlipSeverityType();

// Convenience wrapper around flip_bit() that passes the caller's __FILE__ and __LINE__.
#define INJECT_BIT_FLIP(data) flip_bit(data, __FILE__, __LINE__)

class ISimulator : public INetwork {

public:
Expand Down Expand Up @@ -434,6 +441,15 @@ class ISimulator : public INetwork {
ISimulator();
virtual ~ISimulator();

// --- Bit-flip fault injection state and accessors ---
// Switch toggled by enableBitFlipInjection() / disableBitFlipInjection().
bool allowBitFlipInjection = false;
// Number of injections recorded per call site; addBitFlipInjectionStats() keys entries as "file:line".
std::map<std::string, int> bitFlipInjections;
void enableBitFlipInjection() { allowBitFlipInjection = true; }
// Injection is considered enabled only while it is allowed AND speedUpSimulation is false.
bool isBitFlipInjectionEnabled() { return allowBitFlipInjection && !speedUpSimulation; }
void disableBitFlipInjection() { allowBitFlipInjection = false; }
// Records one injection for the call site file:line.
void addBitFlipInjectionStats(const char* file, int line);
// True if an injection has been recorded for the call site file:line.
bool isBitFlipInjected(const char* file, int line);
// True if any injection has been recorded at all, at any call site.
bool isBitFlipInjected() { return !bitFlipInjections.empty(); }

protected:
Mutex mutex;

Expand Down
38 changes: 38 additions & 0 deletions fdbrpc/sim2.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <utility>

#include "flow/MkCert.h"
#include "flow/Trace.h"
#include "fmt/format.h"
#include "fdbrpc/simulator.h"
#include "flow/Arena.h"
Expand Down Expand Up @@ -142,6 +143,43 @@ bool simulator_should_inject_blob_fault(const char* context, const char* file, i
return false;
}

// Simulation-only fault injection: flips one randomly selected bit of `data` in place.
// `file` and `line` identify the call site (INJECT_BIT_FLIP passes __FILE__ and __LINE__).
// A call site for which an injection has already been recorded is skipped.
void flip_bit(StringRef data, const char* file, int line) {
// An empty buffer has no bit to flip.
if (data.size() == 0)
return;
// The caller is expected to have checked that injection is enabled under simulation.
ASSERT(g_network->isSimulated() && g_simulator->isBitFlipInjectionEnabled());
// Inject at most once per file:line.
if (g_simulator->isBitFlipInjected(file, line))
return;

// Pick the byte and the bit within it from the deterministic random source.
// NOTE(review): presumably randomInt's upper bound is exclusive (index < data.size(), bit in [0, 8)) - confirm.
int index = deterministicRandom()->randomInt(0, data.size());
int bit = deterministicRandom()->randomInt(0, 8);
// data.begin() is cast to a mutable pointer so the referenced bytes are modified in place.
uint8_t* p = const_cast<uint8_t*>(data.begin()) + index;
uint8_t original = *p;
*p ^= 1 << bit;
// Log where the flip happened and the byte value before and after.
TraceEvent(SevWarn, "BitFlipped")
.detail("File", file)
.detail("Line", line)
.detail("BufferOffset", index)
.detail("FlippedBit", bit)
.detail("OriginalByte", original)
.detail("NewByte", *p);
// Record the call site: later calls from it return early above, and
// getBitFlipSeverityType() starts returning SevWarn.
g_simulator->addBitFlipInjectionStats(file, line);
// NOTE(review): the non-code lines below look like review-thread text captured with this diff; kept verbatim.
// ASSERT(false);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Open this assert when running

}

// Picks the severity for checksum/corruption trace events.
// Returns SevError when g_simulator is unset or no bit flip has been recorded yet,
// and SevWarn once the simulator reports at least one recorded bit flip.
Severity getBitFlipSeverityType() {
    if (!g_simulator) {
        return SevError;
    }
    if (g_simulator->isBitFlipInjected()) {
        return SevWarn;
    }
    return SevError;
}

// Counts one more injection for the call site `file`:`line`.
// The key is the string "file:line"; a first-time key starts from zero before the increment.
void ISimulator::addBitFlipInjectionStats(const char* file, int line) {
    ++bitFlipInjections[format("%s:%d", file, line)];
}

bool ISimulator::isBitFlipInjected(const char* file, int line) {
std::string fileLine = format("%s:%d", file, line);
return g_simulator->bitFlipInjections.count(fileLine) > 0;
}

// Stores `time` in disabledMap under the key `desc`.
// NOTE(review): presumably `time` is the simulation time until which `desc` stays disabled - confirm against the readers of disabledMap.
void ISimulator::disableFor(const std::string& desc, double time) {
disabledMap[desc] = time;
}
Expand Down
4 changes: 4 additions & 0 deletions fdbserver/TLogServer.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2110,6 +2110,10 @@ Future<Void> tLogPeekMessages(PromiseType replyPromise,
reply.end = endVersion;
reply.onlySpilled = onlySpilled;

if (g_network->isSimulated() && g_simulator->isBitFlipInjectionEnabled() && BUGGIFY_WITH_PROB(0.001)) {
INJECT_BIT_FLIP(reply.messages);
}

DebugLogTraceEvent("TLogPeekMessages4", self->dbgid)
.detail("LogId", logData->logId)
.detail("Tag", reqTag.toString())
Expand Down
4 changes: 3 additions & 1 deletion fdbserver/storageserver.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include "fdbclient/BlobCipher.h"
#include "fdbclient/BlobGranuleCommon.h"
#include "fdbrpc/TenantInfo.h"
#include "fdbrpc/simulator.h"
#include "flow/ApiVersion.h"
#include "flow/network.h"
#include "fmt/format.h"
Expand Down Expand Up @@ -11259,7 +11260,8 @@ ACTOR Future<Void> update(StorageServer* data, bool* pReceivedUpdate) {
.setMaxFieldLength(-1)
.setMaxEventLength(-1)
.detail("Mutation", msg);
ASSERT(false);
// ASSERT(false);
throw please_reboot();
}
}
// TraceEvent(SevDebug, "SSReadingLog", data->thisServerID).detail("Mutation", msg);
Expand Down
90 changes: 90 additions & 0 deletions fdbserver/workloads/BitFlip.actor.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* BitFlip.actor.cpp
*
* This source file is part of the FoundationDB open source project
*
* Copyright 2013-2024 Apple Inc. and the FoundationDB project authors
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "fdbclient/Knobs.h"
#include "fdbrpc/simulator.h"
#include "fdbserver/TesterInterface.actor.h"
#include "fdbserver/QuietDatabase.h"
#include "fdbserver/ServerDBInfo.h"
#include "fdbserver/workloads/workloads.actor.h"
#include "flow/DeterministicRandom.h"

#include "flow/actorcompiler.h" // This must be the last #include.

// A simulation workload that flips random memory bit of the data in the system.
struct BitFlipWorkload : FailureInjectionWorkload {
static constexpr auto NAME = "BitFlip";
bool enabled;
bool success = true;

// How long to run the workload before starting
double initialDelay = 0.0;

// How long the workload should be run; if <= 0 then it will run until the workload's check function is called
double duration = 10.0;

BitFlipWorkload(WorkloadContext const& wcx, NoOptions) : FailureInjectionWorkload(wcx) {
enabled = !clientId && g_network->isSimulated() && CLIENT_KNOBS->ENABLE_ACCUMULATIVE_CHECKSUM &&
CLIENT_KNOBS->ENABLE_MUTATION_CHECKSUM;
}

BitFlipWorkload(WorkloadContext const& wcx) : FailureInjectionWorkload(wcx) {
// only do this on the "first" client in simulation
enabled = !clientId && g_network->isSimulated() && CLIENT_KNOBS->ENABLE_ACCUMULATIVE_CHECKSUM &&
CLIENT_KNOBS->ENABLE_MUTATION_CHECKSUM;
initialDelay = getOption(options, "initialDelay"_sr, 0.0);
duration = getOption(options, "testDuration"_sr, 20.0);
}

bool shouldInject(DeterministicRandom& random,
const WorkloadRequest& work,
const unsigned alreadyAdded) const override {
return alreadyAdded < 1 && work.useDatabase && 0.1 / (1 + alreadyAdded) > random.random01();
}
Future<Void> setup(Database const& cx) override { return Void(); }

Future<Void> start(Database const& cx) override { return _start(cx, this); }

ACTOR Future<Void> _start(Database cx, BitFlipWorkload* self) {
if (!self->enabled) {
return Void();
}

wait(delay(self->initialDelay));
TraceEvent("BitFlipOn").log();
g_simulator->enableBitFlipInjection();

// If a duration was given, let the duration elapse and then shut the profiler off
if (self->duration > 0) {
wait(delay(self->duration));
}
g_simulator->disableBitFlipInjection();
TraceEvent("BitFlipOff").log();

return Void();
}

Future<bool> check(Database const& cx) override { return success; }

void getMetrics(std::vector<PerfMetric>& m) override {}
};

// File-scope factory objects for BitFlipWorkload.
// NOTE(review): presumably the first makes the workload selectable by its NAME ("BitFlip") from a test spec,
// and the second makes it a candidate for automatic failure injection (see shouldInject) - confirm against the factory templates.
WorkloadFactory<BitFlipWorkload> BitFlipWorkloadFactory;
FailureInjectorFactory<BitFlipWorkload> BitFlipFailureInjectorFactory;
4 changes: 0 additions & 4 deletions fdbserver/workloads/Rollback.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,6 @@
#include "fdbserver/ServerDBInfo.h"
#include "flow/actorcompiler.h" // This must be the last #include.

// Choose a random proxy and a random tLog, represented as unclogTlog.
// The workload first clogs network link between the chosen proxy and all tLogs but the unclogTlog;
// While the network is still clogged, the workload kills the proxy and clogs the unclogged tlog's interface.
// Note: The clogged network link's latency will become "clogDuration".
struct RollbackWorkload : FailureInjectionWorkload {
static constexpr auto NAME = "Rollback";

Expand Down