Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[release-7.3] Don't remove teams when total team count is within threshold #11294

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
1 change: 1 addition & 0 deletions fdbclient/ServerKnobs.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -366,6 +366,7 @@ void ServerKnobs::initialize(Randomize randomize, ClientKnobs* clientKnobs, IsSi
init( TR_FLAG_DISABLE_SERVER_TEAM_REMOVER, false ); if( randomize && BUGGIFY ) TR_FLAG_DISABLE_SERVER_TEAM_REMOVER = deterministicRandom()->random01() < 0.1 ? true : false; // false by default. disable the consistency check when it's true
init( TR_REMOVE_SERVER_TEAM_DELAY, 60.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_DELAY = deterministicRandom()->random01() * 60.0;
init( TR_REMOVE_SERVER_TEAM_EXTRA_DELAY, 5.0 ); if( randomize && BUGGIFY ) TR_REMOVE_SERVER_TEAM_EXTRA_DELAY = deterministicRandom()->random01() * 10.0;
init( TR_REDUNDANT_TEAM_PERCENTAGE_THRESHOLD, .01 ); if (randomize && BUGGIFY) TR_REDUNDANT_TEAM_PERCENTAGE_THRESHOLD = deterministicRandom()->random01() * 0.1;

init( DD_REMOVE_STORE_ENGINE_DELAY, 60.0 ); if( randomize && BUGGIFY ) DD_REMOVE_STORE_ENGINE_DELAY = deterministicRandom()->random01() * 60.0;

Expand Down
2 changes: 2 additions & 0 deletions fdbclient/include/fdbclient/ServerKnobs.h
Original file line number Diff line number Diff line change
Expand Up @@ -345,6 +345,8 @@ class ServerKnobs : public KnobsImpl<ServerKnobs> {
double TR_REMOVE_SERVER_TEAM_DELAY; // wait for the specified time before try to remove next server team
double TR_REMOVE_SERVER_TEAM_EXTRA_DELAY; // serverTeamRemover waits for the delay and check DD healthyness again to
// ensure it runs after machineTeamRemover
double TR_REDUNDANT_TEAM_PERCENTAGE_THRESHOLD; // serverTeamRemover will only remove teams if existing team number
// is p% more than the desired team number.

// Remove wrong storage engines
double DD_REMOVE_STORE_ENGINE_DELAY; // wait for the specified time before remove the next batch
Expand Down
15 changes: 9 additions & 6 deletions fdbserver/DDTeamCollection.actor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -732,8 +732,6 @@ class DDTeamCollectionImpl {
.detail("Primary", self->primary)
.detail("StorageTeamSize", self->configuration.storageTeamSize);

// If there are too few machines to even build teams or there are too few represented datacenters, can't build
// any team.
if (uniqueMachines >= self->configuration.storageTeamSize) {
desiredTeams = SERVER_KNOBS->DESIRED_TEAMS_PER_SERVER * serverCount;
int maxTeams = SERVER_KNOBS->MAX_TEAMS_PER_SERVER * serverCount;
Expand Down Expand Up @@ -832,6 +830,8 @@ class DDTeamCollectionImpl {
.trackLatest(self->teamCollectionInfoEventHolder->trackingKey);
}
} else {
// If there are too few machines to even build teams or there are too few represented datacenters, can't
// build any team.
self->lastBuildTeamsFailed = true;
TraceEvent(SevWarnAlways, "BuildTeamsNotEnoughUniqueMachines", self->distributorId)
.detail("Primary", self->primary)
Expand Down Expand Up @@ -2031,7 +2031,8 @@ class DDTeamCollectionImpl {
// Pick the server team whose members are on the most number of server teams, and mark it undesired
std::pair<Reference<TCTeamInfo>, int> foundSTInfo = self->getServerTeamWithMostProcessTeams();

if (totalSTCount > desiredServerTeams && foundSTInfo.first.isValid()) {
if (totalSTCount > desiredServerTeams * (1 + SERVER_KNOBS->TR_REDUNDANT_TEAM_PERCENTAGE_THRESHOLD) &&
foundSTInfo.first.isValid()) {
ASSERT(foundSTInfo.first.isValid());
Reference<TCTeamInfo> st = foundSTInfo.first;
int maxNumProcessTeams = foundSTInfo.second;
Expand All @@ -2053,8 +2054,9 @@ class DDTeamCollectionImpl {
.detail("ServerTeamToRemove", st->getServerIDsStr())
.detail("ServerTeamID", st->getTeamID())
.detail("NumProcessTeamsOnTheServerTeam", maxNumProcessTeams)
.detail("CurrentServerTeams", self->teams.size())
.detail("DesiredServerTeams", desiredServerTeams);
.detail("CurrentServerTeams", totalSTCount)
.detail("DesiredServerTeams", desiredServerTeams)
.detail("Primary", self->primary);

numServerTeamRemoved++;
} else {
Expand All @@ -2063,7 +2065,8 @@ class DDTeamCollectionImpl {
TraceEvent("ServerTeamRemoverDone", self->distributorId)
.detail("CurrentServerTeams", self->teams.size())
.detail("DesiredServerTeams", desiredServerTeams)
.detail("NumServerTeamRemoved", numServerTeamRemoved);
.detail("NumServerTeamRemoved", numServerTeamRemoved)
.detail("Primary", self->primary);
self->traceTeamCollectionInfo();
numServerTeamRemoved = 0; // Reset the counter to avoid keep printing the message
}
Expand Down