Skip to content

Commit

Permalink
Don't remove team when total team count is within threshold
Browse files Browse the repository at this point in the history
  • Loading branch information
yao-xiao-github committed Apr 11, 2024
1 parent f97430b commit b8945ba
Show file tree
Hide file tree
Showing 3 changed files with 13 additions and 6 deletions.
1 change: 1 addition & 0 deletions fdbclient/ServerKnobs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( TR_FLAG_DISABLE_SERVER_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_SERVER_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
init( TR_REMOVE_SERVER_TEAM_DELAY, 60.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_DELAY = deterministicRandom()->random01() * 60.0;
init( TR_REMOVE_SERVER_TEAM_EXTRA_DELAY, 5.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_EXTRA_DELAY = deterministicRandom()->random01() * 10.0;
init( TR_REDUNDANT_TEAM_PERCENTAGE_THRESHOLD, .01 ); if (randomize && BUGGIFY) TR_REDUNDANT_TEAM_PERCENTAGE_THRESHOLD = deterministicRandom()->random01() * 0.1;

init( DD_REMOVE_STORE_ENGINE_DELAY, 60.0 ); if( randomize && BUGGIFY ) DD_REMOVE_STORE_ENGINE_DELAY = deterministicRandom()->random01() * 60.0;

Expand Down
2 changes: 2 additions & 0 deletions fdbclient/include/fdbclient/ServerKnobs.h
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,8 @@ class ServerKnobs : public KnobsImpl<ServerKnobs> {
double TR_REMOVE_SERVER_TEAM_DELAY; // wait for the specified time before trying to remove the next server team
double TR_REMOVE_SERVER_TEAM_EXTRA_DELAY; // serverTeamRemover waits for this delay and checks DD healthiness again
// to ensure it runs after machineTeamRemover
double TR_REDUNDANT_TEAM_PERCENTAGE_THRESHOLD; // fraction (e.g. 0.01 means 1%), not a whole-number percentage:
// serverTeamRemover only removes a team while the current
// server team count is greater than
// desired team count * (1 + this threshold)

// Remove wrong storage engines
double DD_REMOVE_STORE_ENGINE_DELAY; // wait for the specified time before remove the next batch
Expand Down
16 changes: 10 additions & 6 deletions fdbserver/DDTeamCollection.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -732,8 +732,6 @@ class DDTeamCollectionImpl {
.detail("Primary", self->primary)
.detail("StorageTeamSize", self->configuration.storageTeamSize);

// If there are too few machines to even build teams or there are too few represented datacenters, can't build
// any team.
if (uniqueMachines >= self->configuration.storageTeamSize) {
desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * serverCount;
int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * serverCount;
Expand Down Expand Up @@ -832,6 +830,9 @@ class DDTeamCollectionImpl {
.trackLatest(self->teamCollectionInfoEventHolder->trackingKey);
}
} else {
// If there are too few machines to even build teams, or there are too few represented datacenters, we can't
// build any team.
self->lastBuildTeamsFailed = true;
TraceEvent(SevWarnAlways, "BuildTeamsNotEnoughUniqueMachines", self->distributorId)
.detail("Primary", self->primary)
Expand Down Expand Up @@ -2031,7 +2032,8 @@ class DDTeamCollectionImpl {
// Pick the server team whose members are on the most number of server teams, and mark it undesired
std::pair<Reference<TCTeamInfo>, int> foundSTInfo = self->getServerTeamWithMostProcessTeams();

if (totalSTCount > desiredServerTeams && foundSTInfo.first.isValid()) {
if (totalSTCount > desiredServerTeams * (1 + SERVER_KNOBS->TR_REDUNDANT_TEAM_PERCENTAGE_THRESHOLD) &&
foundSTInfo.first.isValid()) {
ASSERT(foundSTInfo.first.isValid());
Reference<TCTeamInfo> st = foundSTInfo.first;
int maxNumProcessTeams = foundSTInfo.second;
Expand All @@ -2053,8 +2055,9 @@ class DDTeamCollectionImpl {
.detail("ServerTeamToRemove", st->getServerIDsStr())
.detail("ServerTeamID", st->getTeamID())
.detail("NumProcessTeamsOnTheServerTeam", maxNumProcessTeams)
.detail("CurrentServerTeams", self->teams.size())
.detail("DesiredServerTeams", desiredServerTeams);
.detail("CurrentServerTeams", totalSTCount)
.detail("DesiredServerTeams", desiredServerTeams)
.detail("Primary", self->primary);

numServerTeamRemoved++;
} else {
Expand All @@ -2063,7 +2066,8 @@ class DDTeamCollectionImpl {
TraceEvent("ServerTeamRemoverDone", self->distributorId)
.detail("CurrentServerTeams", self->teams.size())
.detail("DesiredServerTeams", desiredServerTeams)
.detail("NumServerTeamRemoved", numServerTeamRemoved);
.detail("NumServerTeamRemoved", numServerTeamRemoved)
.detail("Primary", self->primary);
self->traceTeamCollectionInfo();
numServerTeamRemoved = 0; // Reset the counter to avoid keep printing the message
}
Expand Down

0 comments on commit b8945ba

Please sign in to comment.