Skip to content

Commit efe444d

Browse files
committed
[MERGE #5377 @dilijev] Added -timeoutRetries to rl (and some drive-by cleanup)
Merge pull request #5377 from dilijev:rl-timeout-retry This is an effort to reduce false-positive timeout signals due to busy test machines when running nightlies. Due to some formatting changes, recommend reviewing with hide whitespace diffs to reduce the noise.
2 parents 4826e8e + 1e18c50 commit efe444d

File tree

7 files changed

+442
-232
lines changed

7 files changed

+442
-232
lines changed

bin/rl/rl.cpp

Lines changed: 85 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -273,6 +273,7 @@ const char * const TestInfoKindName[] =
273273
"env",
274274
"command",
275275
"timeout",
276+
"timeoutRetries",
276277
"sourcepath",
277278
"eol-normalization",
278279
"custom-config-file",
@@ -315,34 +316,34 @@ char *CL, *_CL_;
315316
const char *JCBinary = "jshost.exe";
316317

317318
BOOL FStatus = TRUE;
318-
char *StatusPrefix;
319-
const char *StatusFormat;
319+
char *StatusPrefix = nullptr;
320+
const char *StatusFormat = nullptr;
320321

321-
BOOL FVerbose;
322-
BOOL FQuiet;
323-
BOOL FNoWarn;
324-
BOOL FTest;
322+
BOOL FVerbose = FALSE;
323+
BOOL FQuiet = FALSE;
324+
BOOL FNoWarn = FALSE;
325+
BOOL FTest = FALSE;
325326
BOOL FStopOnError = FALSE;
326327
BOOL GStopDueToError = FALSE;
327-
BOOL FLow;
328-
BOOL FNoDelete;
329-
BOOL FCopyOnFail;
328+
BOOL FLow = FALSE;
329+
BOOL FNoDelete = FALSE;
330+
BOOL FCopyOnFail = FALSE;
330331
BOOL FSummary = TRUE;
331-
BOOL FMoveDiffs;
332-
BOOL FNoMoveDiffsSwitch;
333-
BOOL FRelativeLogPath;
332+
BOOL FMoveDiffs = FALSE;
333+
BOOL FNoMoveDiffsSwitch = FALSE;
334+
BOOL FRelativeLogPath = FALSE;
334335
BOOL FNoDirName = TRUE;
335-
BOOL FBaseline;
336+
BOOL FBaseline = FALSE;
336337
BOOL FRebase = FALSE;
337-
BOOL FDiff;
338-
BOOL FBaseDiff;
339-
BOOL FSyncEnumDirs;
338+
BOOL FDiff = FALSE;
339+
BOOL FBaseDiff = FALSE;
340+
BOOL FSyncEnumDirs = FALSE;
340341
BOOL FNogpfnt = TRUE;
341-
BOOL FAppend;
342+
BOOL FAppend = FALSE;
342343
BOOL FAppendTestNameToExtraCCFlags = FALSE;
343344

344345
#ifndef NODEBUG
345-
BOOL FDebug;
346+
BOOL FDebug = FALSE;
346347
#endif
347348

348349
// Output synchronization options
@@ -364,7 +365,6 @@ RLMODE Mode = DEFAULT_RLMODE;
364365
char *DCFGfile = NULL;
365366
char const *CFGfile = NULL;
366367
char const *CMDfile = NULL;
367-
int CFGline;
368368

369369
#define MAX_ALLOWED_THREADS 10 // must be <= MAXIMUM_WAIT_OBJECTS (64)
370370
unsigned NumberOfThreads = 0;
@@ -376,15 +376,18 @@ BOOL FNoProgramOutput = FALSE;
376376
BOOL FOnlyAssertOutput = FALSE;
377377
BOOL FExcludeDirs = FALSE;
378378
BOOL FGenLst = FALSE;
379-
char *ResumeDir, *MatchDir;
379+
char *ResumeDir = nullptr;
380+
char *MatchDir = nullptr;
380381

381382
TIME_OPTION Timing = TIME_DIR | TIME_TEST; // Default to report times at test and directory level
382383

383-
static const char *ProgramName;
384-
static const char *LogName;
385-
static const char *FullLogName;
386-
static const char *ResultsLogName;
387-
static const char *TestTimeout; // Stores timeout in seconds for all tests
384+
static const char *ProgramName = nullptr;
385+
static const char *LogName = nullptr;
386+
static const char *FullLogName = nullptr;
387+
static const char *ResultsLogName = nullptr;
388+
static const char *TestTimeout = nullptr; // Stores timeout in seconds for all tests
389+
static const char *TestTimeoutRetries = nullptr; // Number of timeout retries for all tests
390+
#define MAX_ALLOWED_TIMEOUT_RETRIES 100 // Arbitrary max to avoid accidentally specifying too many retries
388391

389392
// NOTE: this might be unused now
390393
static char TempPath[MAX_PATH] = ""; // Path for temporary files
@@ -2474,6 +2477,7 @@ WriteEnvLst
24742477
NULL,
24752478
NULL,
24762479
NULL,
2480+
NULL,
24772481
NULL
24782482
};
24792483

@@ -2994,6 +2998,11 @@ ParseArg(
29942998
break;
29952999
}
29963000

3001+
if (!_stricmp(&arg[1], "timeoutRetries")) {
3002+
TestTimeoutRetries = ComplainIfNoArg(arg, s);
3003+
break;
3004+
}
3005+
29973006
#ifndef NODEBUG
29983007
if (!_stricmp(&arg[1], "debug")) {
29993008
FDebug = FVerbose = TRUE;
@@ -3540,6 +3549,32 @@ IsTimeoutStringValid(const char *strTimeout)
35403549
return TRUE;
35413550
}
35423551

3552+
BOOL
3553+
IsTimeoutRetriesStringValid(const char *strTimeoutRetries)
3554+
{
3555+
char *end;
3556+
_set_errno(0);
3557+
3558+
uint32 numRetries = strtoul(strTimeoutRetries, &end, 10);
3559+
3560+
if (errno != 0 || *end != 0)
3561+
{
3562+
return FALSE;
3563+
}
3564+
3565+
// We will not be doing any math with this value, so no need to check for overflow.
3566+
// However, large values will possibly result in an unacceptably long retry loop,
3567+
// (especially with the default timeout being multiple minutes long),
3568+
// so limit the number of retries to some arbitrary max.
3569+
3570+
if (numRetries > MAX_ALLOWED_TIMEOUT_RETRIES)
3571+
{
3572+
return FALSE;
3573+
}
3574+
3575+
return TRUE;
3576+
}
3577+
35433578
uint32 GetTimeoutValue(const char *strTimeout)
35443579
{
35453580
if (strTimeout == nullptr)
@@ -3606,13 +3641,26 @@ GetTestInfoFromNode
36063641
if (i == TIK_TIMEOUT)
36073642
{
36083643
// Validate the timeout string now to fail early so we don't run any tests when there is an error.
3609-
if (!IsTimeoutStringValid(testInfo->data[i])) {
3644+
if (!IsTimeoutStringValid(testInfo->data[i]))
3645+
{
36103646
CFG_ERROR_EX(fileName, node->LineNumber,
36113647
"Invalid timeout specified. Cannot parse or too large.\n", nullptr);
36123648
childNode->Dump();
36133649
return FALSE;
36143650
}
36153651
}
3652+
3653+
if (i == TIK_TIMEOUT_RETRIES)
3654+
{
3655+
// Validate the timeoutRetries string now to fail early so we don't run any tests when there is an error.
3656+
if (!IsTimeoutRetriesStringValid(testInfo->data[i]))
3657+
{
3658+
CFG_ERROR_EX(fileName, node->LineNumber,
3659+
"Invalid number of timeout retries specified. Value must be numeric and <= %d.\n", MAX_ALLOWED_TIMEOUT_RETRIES);
3660+
childNode->Dump();
3661+
return FALSE;
3662+
}
3663+
}
36163664
}
36173665
}
36183666

@@ -3626,6 +3674,17 @@ GetTestInfoFromNode
36263674
testInfo->data[i] = TestTimeout;
36273675
}
36283676
}
3677+
3678+
if (i == TIK_TIMEOUT_RETRIES && TestTimeoutRetries != nullptr)
3679+
{
3680+
// Overriding the timeoutRetries value with the command line value (if the command line value is larger)
3681+
uint32 xmlTimeoutRetriesValue = GetTimeoutValue(testInfo->data[i]);
3682+
uint32 testTimeoutRetriesValue = GetTimeoutValue(TestTimeoutRetries);
3683+
if (xmlTimeoutRetriesValue < testTimeoutRetriesValue)
3684+
{
3685+
testInfo->data[i] = TestTimeoutRetries;
3686+
}
3687+
}
36293688
}
36303689

36313690
return TRUE;

bin/rl/rl.h

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,7 @@ extern RLMODE Mode;
167167
#define DEFAULT_EXEC_TESTS_FLAGS ";"
168168

169169
#define DEFAULT_TEST_TIMEOUT 60000
170+
#define DEFAULT_TEST_TIMEOUT_RETRIES 0
170171

171172
#define DIR_LOCKFILE "regrlock.txt"
172173

@@ -203,6 +204,7 @@ enum TestInfoKind
203204
TIK_ENV,
204205
TIK_COMMAND,
205206
TIK_TIMEOUT,
207+
TIK_TIMEOUT_RETRIES,
206208
TIK_SOURCE_PATH,
207209
TIK_EOL_NORMALIZATION,
208210
TIK_CUSTOM_CONFIG_FILE,
@@ -998,7 +1000,12 @@ extern int ExecTest(CDirectory* pDir, Test * pTest, TestVariant * pTestVariant);
9981000

9991001
// rlmp.cpp
10001002

1001-
extern int ExecuteCommand(const char* path, const char* CommandLine, DWORD millisecTimeout = INFINITE, void* localEnvVars = NULL);
1003+
extern int ExecuteCommand(
1004+
const char* path,
1005+
const char* CommandLine,
1006+
DWORD millisecTimeout = INFINITE,
1007+
uint32 timeoutRetries = 0,
1008+
void* envFlags = NULL);
10021009

10031010
extern int DoOneExternalTest(
10041011
CDirectory* pDir,
@@ -1012,7 +1019,9 @@ extern int DoOneExternalTest(
10121019
BOOL fCleanBefore,
10131020
BOOL fCleanAfter,
10141021
BOOL fSuppressNoGPF,
1015-
void *localEnvVars = NULL
1022+
DWORD millisecTimeout = INFINITE,
1023+
uint32 timeoutRetries = 0,
1024+
void *envFlags = NULL
10161025
);
10171026

10181027
extern void

bin/rl/rlmp.cpp

Lines changed: 53 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -557,48 +557,74 @@ int
557557
ExecuteCommand(
558558
const char* path,
559559
const char* CommandLine,
560-
DWORD millisecTimeout,
561-
void* envFlags)
560+
DWORD millisecTimeout /* = INFINITE */,
561+
uint32 timeoutRetries /* = 0 */,
562+
void* envFlags /* = NULL */)
562563
{
563-
int rc;
564+
int rc = 0;
564565
FILE* childOutput = NULL;
565566
char putEnvStr[BUFFER_SIZE];
566567
char ExecuteProgramCmdLine[BUFFER_SIZE];
567568
UINT prevmode;
568569

569570
prevmode = SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOGPFAULTERRORBOX);
570571

571-
// Always flush output buffers before executing a command. Note: this
572-
// shouldn't really be necessary because all output should be buffered
573-
// by the COutputBuffer class on a per-thread basis.
574-
fflush(stdout);
575-
fflush(stderr);
576-
577572
strcpy_s(ExecuteProgramCmdLine, "cmd.exe /c "); // for .cmd/.bat scripts
578573
strcat_s(ExecuteProgramCmdLine, CommandLine);
579574

580-
EnterCriticalSection(&csCurrentDirectory);
575+
for (uint32 numTimeouts = 0; numTimeouts <= timeoutRetries; numTimeouts++)
576+
{
577+
if (numTimeouts != 0)
578+
{
579+
LogOut("Timeout retry (attempt #%u)...\n", numTimeouts);
580+
}
581581

582-
// If we're doing executable tests, set the TARGET_VM environment variable.
583-
// We must do this inside a critical section since the environment is
584-
// not thread specific.
582+
// Always flush output buffers before executing a command. Note: this
583+
// shouldn't really be necessary because all output should be buffered
584+
// by the COutputBuffer class on a per-thread basis.
585+
fflush(stdout);
586+
fflush(stderr);
585587

586-
if ((Mode == RM_EXE) && TargetVM) {
587-
sprintf_s(putEnvStr, "TARGET_VM=%s", TargetVM);
588-
_putenv(putEnvStr);
589-
}
588+
EnterCriticalSection(&csCurrentDirectory);
590589

591-
rc = _chdir(path);
592-
if (rc == 0) {
593-
childOutput = PipeSpawn(ExecuteProgramCmdLine, envFlags);
594-
}
595-
LeaveCriticalSection(&csCurrentDirectory);
590+
// If we're doing executable tests, set the TARGET_VM environment variable.
591+
// We must do this inside a critical section since the environment is
592+
// not thread specific.
593+
if ((Mode == RM_EXE) && TargetVM)
594+
{
595+
sprintf_s(putEnvStr, "TARGET_VM=%s", TargetVM);
596+
_putenv(putEnvStr);
597+
}
596598

597-
if (rc != 0) {
598-
LogError("Could not change directory to '%s' - errno == %d\n", path, errno);
599-
} else if (childOutput != NULL) {
600-
rc = FilterThread(childOutput, millisecTimeout);
601-
rc = PipeSpawnClose(childOutput, rc == WAIT_TIMEOUT);
599+
// Change directory and then spawn child process inside a critical section
600+
// because the environment (including CWD) is not thread-specific
601+
// and the child process depends on CWD.
602+
rc = _chdir(path);
603+
if (rc == 0)
604+
{
605+
childOutput = PipeSpawn(ExecuteProgramCmdLine, envFlags);
606+
}
607+
608+
LeaveCriticalSection(&csCurrentDirectory);
609+
610+
if (rc != 0)
611+
{
612+
LogError("Could not change directory to '%s' - errno == %d\n", path, errno);
613+
}
614+
else if (childOutput != NULL)
615+
{
616+
rc = FilterThread(childOutput, millisecTimeout);
617+
bool timedOut = (rc == WAIT_TIMEOUT);
618+
rc = PipeSpawnClose(childOutput, timedOut);
619+
if (timedOut)
620+
{
621+
continue; // retry timed-out command
622+
}
623+
else
624+
{
625+
break; // did not timeout, do not retry
626+
}
627+
}
602628
}
603629

604630
SetErrorMode(prevmode);

0 commit comments

Comments
 (0)