Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
87918: nightlies,logictestccl: probabilistically run backup+restore during logic tests r=stevendanna a=adityamaru

This change introduces a nightly script that run the logictests
in logictestccl to run with `COCKROACH_LOGIC_TEST_BACKUP_RESTORE_PROBABILITY`
set to a non-zero value. This environment variable controls the probability of us
running a backup+restore between lines of a logic test. Egs: 1.0 means that
every line of the logic test will be run after a cluster backup and restore.

The motivation for this change is to increase the coverage of backup+restore
testing by piggybacking on our large logictest corpus. To begin with we only
run tests in logictestccl with config local, but this will be gradually expanded
as we work through the failure modes.

Informs: cockroachdb#77130

Release note: None

Co-authored-by: adityamaru <adityamaru@gmail.com>
  • Loading branch information
craig[bot] and adityamaru committed Sep 22, 2022
2 parents a18db70 + 211981b commit 87bd307
Show file tree
Hide file tree
Showing 8 changed files with 149 additions and 20 deletions.
13 changes: 13 additions & 0 deletions build/teamcity/cockroach/nightlies/sqllogic_backup_restore.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#!/usr/bin/env bash

set -euo pipefail

dir="$(dirname $(dirname $(dirname $(dirname "${0}"))))"

source "$dir/teamcity-support.sh" # For $root
source "$dir/teamcity-bazel-support.sh" # For run_bazel

tc_start_block "Run SQL Logic Test Backup Restore"
BAZEL_SUPPORT_EXTRA_DOCKER_ARGS="-e BUILD_VCS_NUMBER -e GITHUB_API_TOKEN -e GITHUB_REPO -e GOOGLE_EPHEMERAL_CREDENTIALS -e TC_BUILDTYPE_ID -e TC_BUILD_BRANCH -e TC_BUILD_ID -e TC_SERVER_URL" \
run_bazel build/teamcity/cockroach/nightlies/sqllogic_backup_restore_impl.sh
tc_end_block "Run SQL Logic Test Backup Restore"
50 changes: 50 additions & 0 deletions build/teamcity/cockroach/nightlies/sqllogic_backup_restore_impl.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
#!/usr/bin/env bash

set -xeuo pipefail

dir="$(dirname $(dirname $(dirname $(dirname "${0}"))))"
source "$dir/teamcity-bazel-support.sh" # For process_test_json
source "$dir/teamcity-support.sh"

bazel build //pkg/cmd/bazci //pkg/cmd/github-post //pkg/cmd/testfilter --config=ci
BAZEL_BIN=$(bazel info bazel-bin --config=ci)
google_credentials="$GOOGLE_EPHEMERAL_CREDENTIALS"

log_into_gcloud
export GOOGLE_APPLICATION_CREDENTIALS="$PWD/.google-credentials.json"

ARTIFACTS_DIR=/artifacts
GO_TEST_JSON_OUTPUT_FILE=$ARTIFACTS_DIR/test.json.txt
exit_status=0

configs=(
local
multiregion-9node-3region-3azs
5node
multiregion-9node-3region-3azs-no-los
multiregion-9node-3region-3azs-tenant
multiregion-9node-3region-3azs-vec-off
multiregion-15node-5region-3azs
3node-tenant
3node-tenant-multiregion
)

for config in "${configs[@]}"; do
$BAZEL_BIN/pkg/cmd/bazci/bazci_/bazci test -- --config=ci \
//pkg/ccl/logictestccl/tests/$config/... \
--test_arg=-show-sql \
--test_env=COCKROACH_LOGIC_TEST_BACKUP_RESTORE_PROBABILITY=0.5 \
--test_env=GO_TEST_WRAP_TESTV=1 \
--test_env=GO_TEST_WRAP=1 \
--test_env=GO_TEST_JSON_OUTPUT_FILE=$GO_TEST_JSON_OUTPUT_FILE.$config \
--test_timeout=28800 \
--test_env=GOOGLE_APPLICATION_CREDENTIALS="$GOOGLE_APPLICATION_CREDENTIALS" \
|| exit_status=$?

process_test_json \
$BAZEL_BIN/pkg/cmd/testfilter/testfilter_/testfilter \
$BAZEL_BIN/pkg/cmd/github-post/github-post_/github-post \
$ARTIFACTS_DIR \
$GO_TEST_JSON_OUTPUT_FILE.$config \
$exit_status
done
1 change: 1 addition & 0 deletions pkg/ccl/logictestccl/testdata/logic_test/as_of
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
# LogicTest: local
# BackupRestoreProbability: 0.0

statement ok
CREATE TABLE t (
Expand Down
2 changes: 2 additions & 0 deletions pkg/ccl/logictestccl/testdata/logic_test/crdb_internal
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ CREATE table t2 (a STRING PRIMARY KEY) PARTITION BY LIST (a) (

# Since there are no zone configurations on any of these partitions, tables,
# or databases, these partitions inherit directly from the default config.
skipif backup-restore
query IITTITTTII
SELECT * FROM crdb_internal.partitions ORDER BY table_id, index_id, name
----
Expand Down Expand Up @@ -64,6 +65,7 @@ CREATE TABLE t4 (a INT PRIMARY KEY, b INT); CREATE INDEX myt4index ON t4 (b); AL

user testuser

skipif backup-restore
query IT
SELECT zone_id, target FROM crdb_internal.zones ORDER BY 1
----
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,7 @@ ALTER INDEX t_implicit_idx@t_a_idx PARTITION BY LIST (a) (
PARTITION "one" VALUES IN (1)
)

skipif backup-restore
query TTTTTT colnames
SELECT
indexrelid, indrelid, indkey, indclass, indoption, indcollation
Expand Down
5 changes: 5 additions & 0 deletions pkg/ccl/logictestccl/testdata/logic_test/schema_change_in_txn
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
# Backing up and restoring a descriptor will increment the version of the
# descriptor before restoring it so we cannot achieve the expected behaviour in
# this test.
# BackupRestoreProbability: 0.0

# Regression test for a situation involving creating a table in a transaction
# and altering the index when referenced by name.
subtest index_resolution_does_not_lead_to_new_version
Expand Down
49 changes: 30 additions & 19 deletions pkg/sql/logictest/logic.go
Original file line number Diff line number Diff line change
Expand Up @@ -550,6 +550,9 @@ var (
// writes not holding appropriate latches for range key stats update.
useMVCCRangeTombstonesForPointDeletes = util.ConstantWithMetamorphicTestBool(
"logictest-use-mvcc-range-tombstones-for-point-deletes", false)

// BackupRestoreProbability is the environment variable for `3node-backup` config.
backupRestoreProbability = envutil.EnvOrDefaultFloat64("COCKROACH_LOGIC_TEST_BACKUP_RESTORE_PROBABILITY", 0.0)
)

const queryRewritePlaceholderPrefix = "__async_query_rewrite_placeholder"
Expand Down Expand Up @@ -619,6 +622,10 @@ type pendingQuery struct {
func (ls *logicStatement) readSQL(
t *logicTest, s *logictestbase.LineScanner, allowSeparator bool,
) (separator bool, _ error) {
if err := t.maybeBackupRestore(t.rng, t.cfg); err != nil {
return false, err
}

var buf bytes.Buffer
hasVars := false
for s.Scan() {
Expand Down Expand Up @@ -1666,6 +1673,7 @@ func (t *logicTest) shutdownCluster() {

// resetCluster cleans up the current cluster, and creates a fresh one.
func (t *logicTest) resetCluster() {
t.traceStop()
t.shutdownCluster()
if t.serverArgs == nil {
// We expect the server args to be persisted to the test during test
Expand Down Expand Up @@ -1981,9 +1989,6 @@ type subtestDetails struct {
}

func (t *logicTest) processTestFile(path string, config logictestbase.TestClusterConfig) error {
rng, seed := randutil.NewPseudoRand()
t.outf("rng seed: %d\n", seed)

subtests, err := fetchSubtests(path)
if err != nil {
return err
Expand All @@ -2001,7 +2006,7 @@ func (t *logicTest) processTestFile(path string, config logictestbase.TestCluste
// If subtest has no name, then it is not a subtest, so just run the lines
// in the overall test. Note that this can only happen in the first subtest.
if len(subtest.name) == 0 {
if err := t.processSubtest(subtest, path, config, rng); err != nil {
if err := t.processSubtest(subtest, path, config); err != nil {
return err
}
} else {
Expand All @@ -2011,7 +2016,7 @@ func (t *logicTest) processTestFile(path string, config logictestbase.TestCluste
defer func() {
t.subtestT = nil
}()
if err := t.processSubtest(subtest, path, config, rng); err != nil {
if err := t.processSubtest(subtest, path, config); err != nil {
t.Error(err)
}
})
Expand Down Expand Up @@ -2042,9 +2047,10 @@ func (t *logicTest) hasOpenTxns(ctx context.Context) bool {
existingTxnPriority := "NORMAL"
err := user.QueryRow("SHOW TRANSACTION PRIORITY").Scan(&existingTxnPriority)
if err != nil {
// Ignore an error if we are unable to see transaction priority.
// If we are unable to see transaction priority assume we're in the middle
// of an explicit txn.
log.Warningf(ctx, "failed to check txn priority with %v", err)
continue
return true
}
if _, err := user.Exec("SET TRANSACTION PRIORITY NORMAL;"); !testutils.IsError(err, "there is no transaction in progress") {
// Reset the txn priority to what it was before we checked for open txns.
Expand All @@ -2065,11 +2071,6 @@ func (t *logicTest) hasOpenTxns(ctx context.Context) bool {
func (t *logicTest) maybeBackupRestore(
rng *rand.Rand, config logictestbase.TestClusterConfig,
) error {
if config.BackupRestoreProbability != 0 && !config.IsCCLConfig {
return errors.Newf("logic test config %s specifies a backup restore probability but is not CCL",
config.Name)
}

// We decide if we want to take a backup here based on a probability
// specified in the logic test config.
if rng.Float64() > config.BackupRestoreProbability {
Expand All @@ -2091,6 +2092,8 @@ func (t *logicTest) maybeBackupRestore(
t.setUser(oldUser, 0 /* nodeIdxOverride */)
}()

log.Info(context.Background(), "Running cluster backup and restore")

// To restore the same state in for the logic test, we need to restore the
// data and the session state. The session state includes things like session
// variables that are set for every session that is open.
Expand Down Expand Up @@ -2139,13 +2142,13 @@ func (t *logicTest) maybeBackupRestore(
userToSessionVars[user] = userSessionVars
}

backupLocation := fmt.Sprintf("nodelocal://1/logic-test-backup-%s",
backupLocation := fmt.Sprintf("gs://cockroachdb-backup-testing/logic-test-backup-restore-nightly/%s?AUTH=implicit",
strconv.FormatInt(timeutil.Now().UnixNano(), 10))

// Perform the backup and restore as root.
t.setUser(username.RootUser, 0 /* nodeIdxOverride */)

if _, err := t.db.Exec(fmt.Sprintf("BACKUP TO '%s'", backupLocation)); err != nil {
if _, err := t.db.Exec(fmt.Sprintf("BACKUP INTO '%s'", backupLocation)); err != nil {
return errors.Wrap(err, "backing up cluster")
}

Expand All @@ -2155,7 +2158,7 @@ func (t *logicTest) maybeBackupRestore(

// Run the restore as root.
t.setUser(username.RootUser, 0 /* nodeIdxOverride */)
if _, err := t.db.Exec(fmt.Sprintf("RESTORE FROM '%s'", backupLocation)); err != nil {
if _, err := t.db.Exec(fmt.Sprintf("RESTORE FROM LATEST IN '%s'", backupLocation)); err != nil {
return errors.Wrap(err, "restoring cluster")
}

Expand Down Expand Up @@ -2252,7 +2255,7 @@ func (t *logicTest) purgeZoneConfig() {
}

func (t *logicTest) processSubtest(
subtest subtestDetails, path string, config logictestbase.TestClusterConfig, rng *rand.Rand,
subtest subtestDetails, path string, config logictestbase.TestClusterConfig,
) error {
defer t.traceStop()

Expand Down Expand Up @@ -2283,9 +2286,6 @@ func (t *logicTest) processSubtest(
path, s.Line+subtest.lineLineIndexIntoFile,
)
}
if err := t.maybeBackupRestore(rng, config); err != nil {
return err
}
switch cmd {
case "repeat":
// A line "repeat X" makes the test repeat the following statement or query X times.
Expand Down Expand Up @@ -2878,6 +2878,10 @@ func (t *logicTest) processSubtest(
}
s.SetSkip(fmt.Sprintf("unsupported configuration %s (%s)", configName, issue))
}
case "backup-restore":
if config.BackupRestoreProbability > 0.0 {
s.SetSkip("backup-restore interferes with this check")
}
continue
default:
return errors.Errorf("unimplemented test statement: %s", s.Text())
Expand Down Expand Up @@ -3938,9 +3942,16 @@ func RunLogicTest(
panic(errors.Wrapf(err, "could not set batch size for test"))
}
}
hasOverride, overriddenBackupRestoreProbability := logictestbase.ReadBackupRestoreProbabilityOverride(t, path)
config.BackupRestoreProbability = backupRestoreProbability
if hasOverride {
config.BackupRestoreProbability = overriddenBackupRestoreProbability
}

lt.setup(
config, serverArgsCopy, readClusterOptions(t, path), readKnobOptions(t, path), readTenantClusterSettingOverrideArgs(t, path),
)

lt.runFile(path, config)

progress.total += lt.progress
Expand Down
48 changes: 47 additions & 1 deletion pkg/sql/logictest/logictestbase/logictestbase.go
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ type TestClusterConfig struct {
// localities is set if nodes should be set to a particular locality.
// Nodes are 1-indexed.
Localities map[int]roachpb.Locality
// backupRestoreProbability will periodically backup the cluster and restore
// BackupRestoreProbability will periodically backup the cluster and restore
// it's state to a new cluster at random points during a logic test.
BackupRestoreProbability float64
// disableDeclarativeSchemaChanger will disable the declarative schema changer
Expand Down Expand Up @@ -576,6 +576,52 @@ func (l stdlogger) Logf(format string, args ...interface{}) {
fmt.Printf(format, args...)
}

// ReadBackupRestoreProbabilityOverride reads any LogicTest directive at the
// beginning of a test file. A line that starts with "#
// BackupRestoreProbability:" specifies the probability with which we should run
// a cluster backup + restore between lines of the test file.
//
// Example:
//
// # BackupRestoreProbability: 0.8
//
// If the file doesn't contain a directive, the value of the environment
// variable COCKROACH_LOGIC_TEST_BACKUP_RESTORE_PROBABILITY is used.
func ReadBackupRestoreProbabilityOverride(
t logger, path string,
) (hasOverride bool, probability float64) {
file, err := os.Open(path)
if err != nil {
t.Fatalf("failed open file %s", path)
}
defer file.Close()

s := NewLineScanner(file)
for s.Scan() {
fields := strings.Fields(s.Text())
if len(fields) == 0 {
continue
}
cmd := fields[0]
if !strings.HasPrefix(cmd, "#") {
// Stop at the first line that's not a comment (or empty).
break
}
if len(fields) > 1 && cmd == "#" && fields[1] == "BackupRestoreProbability:" {
if len(fields) == 2 {
t.Fatalf("%s: empty LogicTest directive", path)
}
probability, err := strconv.ParseFloat(fields[2], 64)
if err != nil {
t.Fatalf("failed to parse backup+restore probability: %+v", err)
}
return true, probability
}
}

return false, 0
}

// ReadTestFileConfigs reads any LogicTest directive at the beginning of a
// test file. A line that starts with "# LogicTest:" specifies a list of
// configuration names. The test file is run against each of those
Expand Down

0 comments on commit 87bd307

Please sign in to comment.