From 1b996634516cb0b29074dd120c187f322f12729b Mon Sep 17 00:00:00 2001 From: Ben Blattberg Date: Fri, 4 Nov 2022 10:24:31 -0500 Subject: [PATCH 1/3] Log errors when the PostgreSQL data directory is wrong The postgres-startup container now reports when it finds the installed PostgreSQL binaries do not match the specified PostgreSQL version. Some storage providers do not mount the PostgreSQL data volume with correct ownership or permissions. The postgres-startup container now prints those attributes of parent directories when it cannot create or modify a needed file or directory. Issue: [sc-11804] Issue: CrunchyData/postgres-operator#2870 Co-authored-by: @cbandy --- internal/postgres/config.go | 33 ++++++++-- internal/postgres/config_test.go | 99 ++++++++++++++++++++++++++--- internal/postgres/reconcile_test.go | 17 +++-- 3 files changed, 131 insertions(+), 18 deletions(-) diff --git a/internal/postgres/config.go b/internal/postgres/config.go index 9032c0deaa..e54215cb78 100644 --- a/internal/postgres/config.go +++ b/internal/postgres/config.go @@ -26,6 +26,18 @@ import ( ) const ( + // bashHalt is a Bash function that prints its arguments to stderr then + // exits with a non-zero status. It uses the exit status of the prior + // command if that was not zero. + bashHalt = `halt() { local rc=$?; >&2 echo "$@"; exit "${rc/#0/1}"; }` + + // bashPermissions is a Bash function that prints the permissions of a file + // or directory and all its parent directories, except the root directory. + bashPermissions = `permissions() {` + + ` while [[ -n "$1" ]]; do set "${1%/*}" "$@"; done; shift;` + + ` stat -Lc '%A %4u %4g %n' "$@";` + + ` }` + // bashSafeLink is a Bash function that moves an existing file or directory // and replaces it with a symbolic link. bashSafeLink = ` @@ -184,6 +196,12 @@ func startupCommand( script := strings.Join([]string{ `declare -r expected_major_version="$1" pgwal_directory="$2" pgbrLog_directory="$3"`, + // Function to print the permissions of a file or directory and its parents. + bashPermissions, + + // Function to print a message to stderr then exit non-zero. + bashHalt, + // Function to log values in a basic structured format. `results() { printf '::postgres-operator: %s::%s\n' "$@"; }`, @@ -198,14 +216,16 @@ func startupCommand( // match the cluster spec. `results 'postgres path' "$(command -v postgres)"`, `results 'postgres version' "${postgres_version:=$(postgres --version)}"`, - `[[ "${postgres_version}" == *") ${expected_major_version}."* ]]`, + `[[ "${postgres_version}" =~ ") ${expected_major_version}"($|[^0-9]) ]] ||`, + `halt Expected PostgreSQL version "${expected_major_version}"`, // Abort when the configured data directory is not $PGDATA. // - https://www.postgresql.org/docs/current/runtime-config-file-locations.html `results 'config directory' "${PGDATA:?}"`, `postgres_data_directory=$([ -d "${PGDATA}" ] && postgres -C data_directory || echo "${PGDATA}")`, `results 'data directory' "${postgres_data_directory}"`, - `[ "${postgres_data_directory}" = "${PGDATA}" ]`, + `[[ "${postgres_data_directory}" == "${PGDATA}" ]] ||`, + `halt Expected matching config and data directories`, // Determine if the data directory has been prepared for bootstrapping the cluster `bootstrap_dir="${postgres_data_directory}_bootstrap"`, @@ -218,11 +238,13 @@ func startupCommand( // - https://www.postgresql.org/docs/current/creating-cluster.html // - https://git.postgresql.org/gitweb/?p=postgresql.git;f=src/backend/utils/init/miscinit.c;hb=REL_13_0#l319 // - https://issue.k8s.io/93802#issuecomment-717646167 - `install --directory --mode=0700 "${postgres_data_directory}"`, + `install --directory --mode=0700 "${postgres_data_directory}" ||`, + `halt "$(permissions "${postgres_data_directory}" ||:)"`, // Create the pgBackRest log directory. `results 'pgBackRest log directory' "${pgbrLog_directory}"`, - `install --directory --mode=0775 "${pgbrLog_directory}"`, + `install --directory --mode=0775 "${pgbrLog_directory}" ||`, + `halt "$(permissions "${pgbrLog_directory}" ||:)"`, // Copy replication client certificate files // from the /pgconf/tls/replication directory to the /tmp/replication directory in order @@ -243,7 +265,8 @@ func startupCommand( // Abort when the data directory is not empty and its version does not // match the cluster spec. `results 'data version' "${postgres_data_version:=$(< "${postgres_data_directory}/PG_VERSION")}"`, - `[ "${postgres_data_version}" = "${expected_major_version}" ]`, + `[[ "${postgres_data_version}" == "${expected_major_version}" ]] ||`, + `halt Expected PostgreSQL data version "${expected_major_version}"`, // Safely move the WAL directory onto the intended volume. PostgreSQL // always writes WAL files in the "pg_wal" directory inside the data diff --git a/internal/postgres/config_test.go b/internal/postgres/config_test.go index bea53ec522..db2b5fd158 100644 --- a/internal/postgres/config_test.go +++ b/internal/postgres/config_test.go @@ -16,6 +16,8 @@ package postgres import ( + "bytes" + "errors" "os" "os/exec" "path/filepath" @@ -26,6 +28,7 @@ import ( corev1 "k8s.io/api/core/v1" "sigs.k8s.io/yaml" + "github.com/crunchydata/postgres-operator/internal/testing/cmp" "github.com/crunchydata/postgres-operator/internal/testing/require" "github.com/crunchydata/postgres-operator/pkg/apis/postgres-operator.crunchydata.com/v1beta1" ) @@ -57,6 +60,85 @@ func TestWALDirectory(t *testing.T) { assert.Equal(t, WALDirectory(cluster, instance), "/pgwal/pg13_wal") } +func TestBashHalt(t *testing.T) { + t.Run("NoPipeline", func(t *testing.T) { + cmd := exec.Command("bash") + cmd.Args = append(cmd.Args, "-c", "--", bashHalt+`; halt ab cd e`) + + var exit *exec.ExitError + stdout, err := cmd.Output() + assert.Assert(t, errors.As(err, &exit)) + assert.Equal(t, string(stdout), "", "expected no stdout") + assert.Equal(t, string(exit.Stderr), "ab cd e\n") + assert.Equal(t, exit.ExitCode(), 1) + }) + + t.Run("PipelineZeroStatus", func(t *testing.T) { + cmd := exec.Command("bash") + cmd.Args = append(cmd.Args, "-c", "--", bashHalt+`; true && halt message`) + + var exit *exec.ExitError + stdout, err := cmd.Output() + assert.Assert(t, errors.As(err, &exit)) + assert.Equal(t, string(stdout), "", "expected no stdout") + assert.Equal(t, string(exit.Stderr), "message\n") + assert.Equal(t, exit.ExitCode(), 1) + }) + + t.Run("PipelineNonZeroStatus", func(t *testing.T) { + cmd := exec.Command("bash") + cmd.Args = append(cmd.Args, "-c", "--", bashHalt+`; (exit 99) || halt $'multi\nline'`) + + var exit *exec.ExitError + stdout, err := cmd.Output() + assert.Assert(t, errors.As(err, &exit)) + assert.Equal(t, string(stdout), "", "expected no stdout") + assert.Equal(t, string(exit.Stderr), "multi\nline\n") + assert.Equal(t, exit.ExitCode(), 99) + }) + + t.Run("Subshell", func(t *testing.T) { + cmd := exec.Command("bash") + cmd.Args = append(cmd.Args, "-c", "--", bashHalt+`; (halt 'err') || echo 'after'`) + + stderr := new(bytes.Buffer) + cmd.Stderr = stderr + + stdout, err := cmd.Output() + assert.NilError(t, err) + assert.Equal(t, string(stdout), "after\n") + assert.Equal(t, stderr.String(), "err\n") + assert.Equal(t, cmd.ProcessState.ExitCode(), 0) + }) +} + +func TestBashPermissions(t *testing.T) { + // macOS `stat` takes different arguments than BusyBox and GNU coreutils. + if output, err := exec.Command("stat", "--help").CombinedOutput(); err != nil { + t.Skip(`requires "stat" executable`) + } else if !strings.Contains(string(output), "%A") { + t.Skip(`requires "stat" with access format sequence`) + } + + dir := t.TempDir() + assert.NilError(t, os.Mkdir(filepath.Join(dir, "sub"), 0o751)) + assert.NilError(t, os.Chmod(filepath.Join(dir, "sub"), 0o751)) + assert.NilError(t, os.WriteFile(filepath.Join(dir, "sub", "fn"), nil, 0o624)) // #nosec G306 OK permissions for a temp dir in a test + assert.NilError(t, os.Chmod(filepath.Join(dir, "sub", "fn"), 0o624)) + + cmd := exec.Command("bash") + cmd.Args = append(cmd.Args, "-c", "--", + bashPermissions+`; permissions "$@"`, "-", + filepath.Join(dir, "sub", "fn")) + + stdout, err := cmd.Output() + assert.NilError(t, err) + assert.Assert(t, cmp.Regexp(``+ + `drwxr-x--x \d+ \d+ [^ ]+/sub\n`+ + `-rw--w-r-- \d+ \d+ [^ ]+/sub/fn\n`+ + `$`, string(stdout))) +} + func TestBashSafeLink(t *testing.T) { // macOS lacks `realpath` which is part of GNU coreutils. if _, err := exec.LookPath("realpath"); err != nil { @@ -310,13 +392,6 @@ func TestBashSafeLink(t *testing.T) { }) } -func TestBashSafeLinkPrettyYAML(t *testing.T) { - b, err := yaml.Marshal(bashSafeLink) - assert.NilError(t, err) - assert.Assert(t, strings.HasPrefix(string(b), `|`), - "expected literal block scalar, got:\n%s", b) -} - func TestStartupCommand(t *testing.T) { shellcheck := require.ShellCheck(t) @@ -329,14 +404,22 @@ func TestStartupCommand(t *testing.T) { // Expect a bash command with an inline script. assert.DeepEqual(t, command[:3], []string{"bash", "-ceu", "--"}) assert.Assert(t, len(command) > 3) + script := command[3] // Write out that inline script. dir := t.TempDir() file := filepath.Join(dir, "script.bash") - assert.NilError(t, os.WriteFile(file, []byte(command[3]), 0o600)) + assert.NilError(t, os.WriteFile(file, []byte(script), 0o600)) // Expect shellcheck to be happy. cmd := exec.Command(shellcheck, "--enable=all", file) output, err := cmd.CombinedOutput() assert.NilError(t, err, "%q\n%s", cmd.Args, output) + + t.Run("PrettyYAML", func(t *testing.T) { + b, err := yaml.Marshal(script) + assert.NilError(t, err) + assert.Assert(t, strings.HasPrefix(string(b), `|`), + "expected literal block scalar, got:\n%s", b) + }) } diff --git a/internal/postgres/reconcile_test.go b/internal/postgres/reconcile_test.go index 56b11d251b..3cd548dec9 100644 --- a/internal/postgres/reconcile_test.go +++ b/internal/postgres/reconcile_test.go @@ -202,6 +202,8 @@ initContainers: - -- - |- declare -r expected_major_version="$1" pgwal_directory="$2" pgbrLog_directory="$3" + permissions() { while [[ -n "$1" ]]; do set "${1%/*}" "$@"; done; shift; stat -Lc '%A %4u %4g %n' "$@"; } + halt() { local rc=$?; >&2 echo "$@"; exit "${rc/#0/1}"; } results() { printf '::postgres-operator: %s::%s\n' "$@"; } safelink() ( local desired="$1" name="$2" current @@ -214,21 +216,26 @@ initContainers: results 'uid' "$(id -u)" 'gid' "$(id -G)" results 'postgres path' "$(command -v postgres)" results 'postgres version' "${postgres_version:=$(postgres --version)}" - [[ "${postgres_version}" == *") ${expected_major_version}."* ]] + [[ "${postgres_version}" =~ ") ${expected_major_version}"($|[^0-9]) ]] || + halt Expected PostgreSQL version "${expected_major_version}" results 'config directory' "${PGDATA:?}" postgres_data_directory=$([ -d "${PGDATA}" ] && postgres -C data_directory || echo "${PGDATA}") results 'data directory' "${postgres_data_directory}" - [ "${postgres_data_directory}" = "${PGDATA}" ] + [[ "${postgres_data_directory}" == "${PGDATA}" ]] || + halt Expected matching config and data directories bootstrap_dir="${postgres_data_directory}_bootstrap" [ -d "${bootstrap_dir}" ] && results 'bootstrap directory' "${bootstrap_dir}" [ -d "${bootstrap_dir}" ] && postgres_data_directory="${bootstrap_dir}" - install --directory --mode=0700 "${postgres_data_directory}" + install --directory --mode=0700 "${postgres_data_directory}" || + halt "$(permissions "${postgres_data_directory}" ||:)" results 'pgBackRest log directory' "${pgbrLog_directory}" - install --directory --mode=0775 "${pgbrLog_directory}" + install --directory --mode=0775 "${pgbrLog_directory}" || + halt "$(permissions "${pgbrLog_directory}" ||:)" install -D --mode=0600 -t "/tmp/replication" "/pgconf/tls/replication"/{tls.crt,tls.key,ca.crt} [ -f "${postgres_data_directory}/PG_VERSION" ] || exit 0 results 'data version' "${postgres_data_version:=$(< "${postgres_data_directory}/PG_VERSION")}" - [ "${postgres_data_version}" = "${expected_major_version}" ] + [[ "${postgres_data_version}" == "${expected_major_version}" ]] || + halt Expected PostgreSQL data version "${expected_major_version}" safelink "${pgwal_directory}" "${postgres_data_directory}/pg_wal" results 'wal directory' "$(realpath "${postgres_data_directory}/pg_wal")" rm -f "${postgres_data_directory}/recovery.signal" From c718876c458edd546cab1e31db1a484c5c6a52cb Mon Sep 17 00:00:00 2001 From: Ben Blattberg Date: Fri, 4 Nov 2022 10:27:53 -0500 Subject: [PATCH 2/3] Change owner of the PostgreSQL directory at startup PostgreSQL won't to start unless it owns the data directory. Kubernetes sets the group according to fsGroup but not the owner. The postgres-startup container now recreates the data directory to give it a new owner when permissions are sufficient to do so. It now raises an error when the owner is incorrect and cannot be changed. Issue: [sc-15909] See: https://docs.k8s.io/tasks/configure-pod-container/security-context/ Co-authored-by: @cbandy --- internal/postgres/config.go | 36 +++++++++++++-- internal/postgres/config_test.go | 71 ++++++++++++++++++++++++++++- internal/postgres/reconcile_test.go | 10 +++- 3 files changed, 110 insertions(+), 7 deletions(-) diff --git a/internal/postgres/config.go b/internal/postgres/config.go index e54215cb78..e9cfcae362 100644 --- a/internal/postgres/config.go +++ b/internal/postgres/config.go @@ -38,6 +38,15 @@ const ( ` stat -Lc '%A %4u %4g %n' "$@";` + ` }` + // bashRecreateDirectory is a Bash function that moves the contents of an + // existing directory into a newly created directory of the same name. + bashRecreateDirectory = ` +recreate() ( + local tmp; tmp=$(mktemp -d -p "${1%/*}"); GLOBIGNORE='.:..'; set -x + chmod "$2" "${tmp}"; mv "$1"/* "${tmp}"; rmdir "$1"; mv "${tmp}" "$1" +) +` + // bashSafeLink is a Bash function that moves an existing file or directory // and replaces it with a symbolic link. bashSafeLink = ` @@ -205,7 +214,10 @@ func startupCommand( // Function to log values in a basic structured format. `results() { printf '::postgres-operator: %s::%s\n' "$@"; }`, - // Function to change a directory symlink while keeping the directory content. + // Function to change the owner of an existing directory. + strings.TrimSpace(bashRecreateDirectory), + + // Function to change a directory symlink while keeping the directory contents. strings.TrimSpace(bashSafeLink), // Log the effective user ID and all the group IDs. @@ -234,11 +246,27 @@ func startupCommand( // PostgreSQL requires its directory to be writable by only itself. // Pod "securityContext.fsGroup" sets g+w on directories for *some* - // storage providers. + // storage providers. Ensure the current user owns the directory, and + // remove group permissions. // - https://www.postgresql.org/docs/current/creating-cluster.html - // - https://git.postgresql.org/gitweb/?p=postgresql.git;f=src/backend/utils/init/miscinit.c;hb=REL_13_0#l319 + // - https://git.postgresql.org/gitweb/?p=postgresql.git;f=src/backend/postmaster/postmaster.c;hb=REL_10_0#l1522 + // - https://git.postgresql.org/gitweb/?p=postgresql.git;f=src/backend/utils/init/miscinit.c;hb=REL_14_0#l349 // - https://issue.k8s.io/93802#issuecomment-717646167 - `install --directory --mode=0700 "${postgres_data_directory}" ||`, + // + // When the directory does not exist, create it with the correct permissions. + // When the directory has the correct owner, set the correct permissions. + `if [[ ! -e "${postgres_data_directory}" || -O "${postgres_data_directory}" ]]; then`, + `install --directory --mode=0700 "${postgres_data_directory}"`, + // + // The directory exists but its owner is wrong. When it is writable, + // the set-group-ID bit indicates that "fsGroup" probably ran on its + // contents making them safe to use. In this case, we can make a new + // directory (owned by this user) and refill it. + `elif [[ -w "${postgres_data_directory}" && -g "${postgres_data_directory}" ]]; then`, + `recreate "${postgres_data_directory}" '0700'`, + // + // The directory exists, its owner is wrong, and it is not writable. + `else (halt Permissions!); fi ||`, `halt "$(permissions "${postgres_data_directory}" ||:)"`, // Create the pgBackRest log directory. diff --git a/internal/postgres/config_test.go b/internal/postgres/config_test.go index db2b5fd158..4f1f1c857e 100644 --- a/internal/postgres/config_test.go +++ b/internal/postgres/config_test.go @@ -18,6 +18,7 @@ package postgres import ( "bytes" "errors" + "fmt" "os" "os/exec" "path/filepath" @@ -134,11 +135,77 @@ func TestBashPermissions(t *testing.T) { stdout, err := cmd.Output() assert.NilError(t, err) assert.Assert(t, cmp.Regexp(``+ - `drwxr-x--x \d+ \d+ [^ ]+/sub\n`+ - `-rw--w-r-- \d+ \d+ [^ ]+/sub/fn\n`+ + `drwxr-x--x\s+\d+\s+\d+\s+[^ ]+/sub\n`+ + `-rw--w-r--\s+\d+\s+\d+\s+[^ ]+/sub/fn\n`+ `$`, string(stdout))) } +func TestBashRecreateDirectory(t *testing.T) { + // macOS `stat` takes different arguments than BusyBox and GNU coreutils. + if output, err := exec.Command("stat", "--help").CombinedOutput(); err != nil { + t.Skip(`requires "stat" executable`) + } else if !strings.Contains(string(output), "%a") { + t.Skip(`requires "stat" with access format sequence`) + } + + dir := t.TempDir() + assert.NilError(t, os.Mkdir(filepath.Join(dir, "d"), 0o755)) + assert.NilError(t, os.WriteFile(filepath.Join(dir, "d", ".hidden"), nil, 0o644)) // #nosec G306 OK permissions for a temp dir in a test + assert.NilError(t, os.WriteFile(filepath.Join(dir, "d", "file"), nil, 0o644)) // #nosec G306 OK permissions for a temp dir in a test + + stat := func(args ...string) string { + cmd := exec.Command("stat", "-c", "%i %#a %N") + cmd.Args = append(cmd.Args, args...) + out, err := cmd.CombinedOutput() + + t.Helper() + assert.NilError(t, err, string(out)) + return string(out) + } + + var before, after struct{ d, f, dInode, dPerms string } + + before.d = stat(filepath.Join(dir, "d")) + before.f = stat( + filepath.Join(dir, "d", ".hidden"), + filepath.Join(dir, "d", "file"), + ) + + cmd := exec.Command("bash") + cmd.Args = append(cmd.Args, "-ceu", "--", + bashRecreateDirectory+` recreate "$@"`, "-", + filepath.Join(dir, "d"), "0740") + + output, err := cmd.CombinedOutput() + assert.NilError(t, err, string(output)) + assert.Assert(t, cmp.Regexp(`^`+ + `[+] chmod 0740 [^ ]+/tmp.[^ /]+\n`+ + `[+] mv [^ ]+/d/.hidden [^ ]+/d/file [^ ]+/tmp.[^ /]+\n`+ + `[+] rmdir [^ ]+/d\n`+ + `[+] mv [^ ]+/tmp.[^ /]+ [^ ]+/d\n`+ + `$`, string(output))) + + after.d = stat(filepath.Join(dir, "d")) + after.f = stat( + filepath.Join(dir, "d", ".hidden"), + filepath.Join(dir, "d", "file"), + ) + + _, err = fmt.Sscan(before.d, &before.dInode, &before.dPerms) + assert.NilError(t, err) + _, err = fmt.Sscan(after.d, &after.dInode, &after.dPerms) + assert.NilError(t, err) + + // New directory is new. + assert.Assert(t, after.dInode != before.dInode) + + // New directory has the requested permissions. + assert.Equal(t, after.dPerms, "0740") + + // Files are in the new directory and unchanged. + assert.DeepEqual(t, after.f, before.f) +} + func TestBashSafeLink(t *testing.T) { // macOS lacks `realpath` which is part of GNU coreutils. if _, err := exec.LookPath("realpath"); err != nil { diff --git a/internal/postgres/reconcile_test.go b/internal/postgres/reconcile_test.go index 3cd548dec9..2cdb1794eb 100644 --- a/internal/postgres/reconcile_test.go +++ b/internal/postgres/reconcile_test.go @@ -205,6 +205,10 @@ initContainers: permissions() { while [[ -n "$1" ]]; do set "${1%/*}" "$@"; done; shift; stat -Lc '%A %4u %4g %n' "$@"; } halt() { local rc=$?; >&2 echo "$@"; exit "${rc/#0/1}"; } results() { printf '::postgres-operator: %s::%s\n' "$@"; } + recreate() ( + local tmp; tmp=$(mktemp -d -p "${1%/*}"); GLOBIGNORE='.:..'; set -x + chmod "$2" "${tmp}"; mv "$1"/* "${tmp}"; rmdir "$1"; mv "${tmp}" "$1" + ) safelink() ( local desired="$1" name="$2" current current=$(realpath "${name}") @@ -226,7 +230,11 @@ initContainers: bootstrap_dir="${postgres_data_directory}_bootstrap" [ -d "${bootstrap_dir}" ] && results 'bootstrap directory' "${bootstrap_dir}" [ -d "${bootstrap_dir}" ] && postgres_data_directory="${bootstrap_dir}" - install --directory --mode=0700 "${postgres_data_directory}" || + if [[ ! -e "${postgres_data_directory}" || -O "${postgres_data_directory}" ]]; then + install --directory --mode=0700 "${postgres_data_directory}" + elif [[ -w "${postgres_data_directory}" && -g "${postgres_data_directory}" ]]; then + recreate "${postgres_data_directory}" '0700' + else (halt Permissions!); fi || halt "$(permissions "${postgres_data_directory}" ||:)" results 'pgBackRest log directory' "${pgbrLog_directory}" install --directory --mode=0775 "${pgbrLog_directory}" || From 534765545009312db7ff61334c69a2880a3ad7e1 Mon Sep 17 00:00:00 2001 From: Ben Blattberg Date: Fri, 4 Nov 2022 10:42:55 -0500 Subject: [PATCH 3/3] Add KUTTL test for migration from third-party PGSQL Issue: [sc-15909] --- .gitignore | 3 +- Makefile | 9 +- internal/postgres/config.go | 7 + internal/postgres/reconcile_test.go | 2 + .../01--non-crunchy-cluster.yaml | 193 ++++++++++++++++++ .../e2e-other/cluster-migrate/01-assert.yaml | 8 + .../cluster-migrate/02--create-data.yaml | 30 +++ .../e2e-other/cluster-migrate/02-assert.yaml | 7 + .../cluster-migrate/03--alter-pv.yaml | 23 +++ .../e2e-other/cluster-migrate/04--delete.yaml | 15 ++ .../e2e-other/cluster-migrate/04-errors.yaml | 4 + .../cluster-migrate/05--cluster.yaml | 30 +++ .../e2e-other/cluster-migrate/06-assert.yaml | 21 ++ .../cluster-migrate/07--set-collation.yaml | 23 +++ .../cluster-migrate/08--alter-pv.yaml | 16 ++ .../cluster-migrate/09--check-data.yaml | 23 +++ .../kuttl/e2e-other/cluster-migrate/README.md | 45 ++++ 17 files changed, 456 insertions(+), 3 deletions(-) create mode 100644 testing/kuttl/e2e-other/cluster-migrate/01--non-crunchy-cluster.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/01-assert.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/02--create-data.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/02-assert.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/03--alter-pv.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/04--delete.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/04-errors.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/05--cluster.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/06-assert.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/07--set-collation.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/08--alter-pv.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/09--check-data.yaml create mode 100644 testing/kuttl/e2e-other/cluster-migrate/README.md diff --git a/.gitignore b/.gitignore index 7a95ef977a..64ec9b3ace 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,4 @@ .DS_Store /vendor/ tools -/testing/kuttl/e2e-generated/ -/testing/kuttl/e2e-generated-other/ +/testing/kuttl/e2e-generated*/ diff --git a/Makefile b/Makefile index 5a8e46d579..5ff0050f9c 100644 --- a/Makefile +++ b/Makefile @@ -226,7 +226,14 @@ generate-kuttl: [ ! -d testing/kuttl/e2e-generated ] || rm -r testing/kuttl/e2e-generated [ ! -d testing/kuttl/e2e-generated-other ] || rm -r testing/kuttl/e2e-generated-other bash -ceu ' \ - render() { envsubst '"'"'$$KUTTL_PG_VERSION $$KUTTL_POSTGIS_VERSION $$KUTTL_PSQL_IMAGE'"'"'; }; \ + case $(KUTTL_PG_VERSION) in \ + 15 ) export KUTTL_BITNAMI_IMAGE_TAG=15.0.0-debian-11-r4 ;; \ + 14 ) export KUTTL_BITNAMI_IMAGE_TAG=14.5.0-debian-11-r37 ;; \ + 13 ) export KUTTL_BITNAMI_IMAGE_TAG=13.8.0-debian-11-r39 ;; \ + 12 ) export KUTTL_BITNAMI_IMAGE_TAG=12.12.0-debian-11-r40 ;; \ + 11 ) export KUTTL_BITNAMI_IMAGE_TAG=11.17.0-debian-11-r39 ;; \ + esac; \ + render() { envsubst '"'"'$$KUTTL_PG_VERSION $$KUTTL_POSTGIS_VERSION $$KUTTL_PSQL_IMAGE $$KUTTL_BITNAMI_IMAGE_TAG'"'"'; }; \ while [ $$# -gt 0 ]; do \ source="$${1}" target="$${1/e2e/e2e-generated}"; \ mkdir -p "$${target%/*}"; render < "$${source}" > "$${target}"; \ diff --git a/internal/postgres/config.go b/internal/postgres/config.go index e9cfcae362..487047287e 100644 --- a/internal/postgres/config.go +++ b/internal/postgres/config.go @@ -296,6 +296,13 @@ func startupCommand( `[[ "${postgres_data_version}" == "${expected_major_version}" ]] ||`, `halt Expected PostgreSQL data version "${expected_major_version}"`, + // For a restore from datasource: + // Patroni will complain if there's no `postgresql.conf` file + // and PGDATA may be missing that file if this is a restored database + // where the conf file was kept elsewhere. + `[[ ! -f "${postgres_data_directory}/postgresql.conf" ]] &&`, + `touch "${postgres_data_directory}/postgresql.conf"`, + // Safely move the WAL directory onto the intended volume. PostgreSQL // always writes WAL files in the "pg_wal" directory inside the data // directory. The recommended way to relocate it is with a symbolic diff --git a/internal/postgres/reconcile_test.go b/internal/postgres/reconcile_test.go index 2cdb1794eb..72c90c6ac8 100644 --- a/internal/postgres/reconcile_test.go +++ b/internal/postgres/reconcile_test.go @@ -244,6 +244,8 @@ initContainers: results 'data version' "${postgres_data_version:=$(< "${postgres_data_directory}/PG_VERSION")}" [[ "${postgres_data_version}" == "${expected_major_version}" ]] || halt Expected PostgreSQL data version "${expected_major_version}" + [[ ! -f "${postgres_data_directory}/postgresql.conf" ]] && + touch "${postgres_data_directory}/postgresql.conf" safelink "${pgwal_directory}" "${postgres_data_directory}/pg_wal" results 'wal directory' "$(realpath "${postgres_data_directory}/pg_wal")" rm -f "${postgres_data_directory}/recovery.signal" diff --git a/testing/kuttl/e2e-other/cluster-migrate/01--non-crunchy-cluster.yaml b/testing/kuttl/e2e-other/cluster-migrate/01--non-crunchy-cluster.yaml new file mode 100644 index 0000000000..1ccceb7098 --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/01--non-crunchy-cluster.yaml @@ -0,0 +1,193 @@ +apiVersion: v1 +kind: Secret +metadata: + name: non-crunchy-cluster + labels: + postgres-operator-test: kuttl + app.kubernetes.io/name: postgresql + app.kubernetes.io/instance: non-crunchy-cluster +type: Opaque +stringData: + postgres-password: "SR6kNAFXvX" +--- +apiVersion: v1 +kind: Service +metadata: + name: non-crunchy-cluster-hl + labels: + postgres-operator-test: kuttl + app.kubernetes.io/name: postgresql + app.kubernetes.io/instance: non-crunchy-cluster + app.kubernetes.io/component: primary + service.alpha.kubernetes.io/tolerate-unready-endpoints: "true" +spec: + type: ClusterIP + clusterIP: None + publishNotReadyAddresses: true + ports: + - name: tcp-postgresql + port: 5432 + targetPort: tcp-postgresql + selector: + app.kubernetes.io/name: postgresql + app.kubernetes.io/instance: non-crunchy-cluster + app.kubernetes.io/component: primary +--- +apiVersion: v1 +kind: Service +metadata: + name: non-crunchy-cluster + labels: + postgres-operator-test: kuttl + app.kubernetes.io/name: postgresql + app.kubernetes.io/instance: non-crunchy-cluster + app.kubernetes.io/component: primary +spec: + type: ClusterIP + sessionAffinity: None + ports: + - name: tcp-postgresql + port: 5432 + targetPort: tcp-postgresql + nodePort: null + selector: + app.kubernetes.io/name: postgresql + app.kubernetes.io/instance: non-crunchy-cluster + app.kubernetes.io/component: primary +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: non-crunchy-cluster + labels: + postgres-operator-test: kuttl + app.kubernetes.io/name: postgresql + app.kubernetes.io/instance: non-crunchy-cluster + app.kubernetes.io/component: primary +spec: + replicas: 1 + serviceName: non-crunchy-cluster-hl + updateStrategy: + rollingUpdate: {} + type: RollingUpdate + selector: + matchLabels: + postgres-operator-test: kuttl + app.kubernetes.io/name: postgresql + app.kubernetes.io/instance: non-crunchy-cluster + app.kubernetes.io/component: primary + template: + metadata: + name: non-crunchy-cluster + labels: + postgres-operator-test: kuttl + app.kubernetes.io/name: postgresql + app.kubernetes.io/instance: non-crunchy-cluster + app.kubernetes.io/component: primary + spec: + serviceAccountName: default + affinity: + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchLabels: + postgres-operator-test: kuttl + app.kubernetes.io/name: postgresql + app.kubernetes.io/instance: non-crunchy-cluster + app.kubernetes.io/component: primary + namespaces: + - "default" + topologyKey: kubernetes.io/hostname + weight: 1 + securityContext: + fsGroup: 1001 + hostNetwork: false + hostIPC: false + containers: + - name: postgresql + image: docker.io/bitnami/postgresql:${KUTTL_BITNAMI_IMAGE_TAG} + imagePullPolicy: "IfNotPresent" + securityContext: + runAsUser: 1001 + env: + - name: BITNAMI_DEBUG + value: "false" + - name: POSTGRESQL_PORT_NUMBER + value: "5432" + - name: POSTGRESQL_VOLUME_DIR + value: "/bitnami/postgresql" + - name: PGDATA + value: "/bitnami/postgresql/data" + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: non-crunchy-cluster + key: postgres-password + - name: POSTGRESQL_ENABLE_LDAP + value: "no" + - name: POSTGRESQL_ENABLE_TLS + value: "no" + - name: POSTGRESQL_LOG_HOSTNAME + value: "false" + - name: POSTGRESQL_LOG_CONNECTIONS + value: "false" + - name: POSTGRESQL_LOG_DISCONNECTIONS + value: "false" + - name: POSTGRESQL_PGAUDIT_LOG_CATALOG + value: "off" + - name: POSTGRESQL_CLIENT_MIN_MESSAGES + value: "error" + - name: POSTGRESQL_SHARED_PRELOAD_LIBRARIES + value: "pgaudit" + ports: + - name: tcp-postgresql + containerPort: 5432 + livenessProbe: + failureThreshold: 6 + initialDelaySeconds: 30 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + exec: + command: + - /bin/sh + - -c + - exec pg_isready -U "postgres" -h localhost -p 5432 + readinessProbe: + failureThreshold: 6 + initialDelaySeconds: 5 + periodSeconds: 10 + successThreshold: 1 + timeoutSeconds: 5 + exec: + command: + - /bin/sh + - -c + - -e + - | + exec pg_isready -U "postgres" -h localhost -p 5432 + [ -f /opt/bitnami/postgresql/tmp/.initialized ] || [ -f /bitnami/postgresql/.initialized ] + resources: + limits: {} + requests: + cpu: 250m + memory: 256Mi + volumeMounts: + - name: dshm + mountPath: /dev/shm + - name: data + mountPath: /bitnami/postgresql + volumes: + - name: dshm + emptyDir: + medium: Memory + volumeClaimTemplates: + - metadata: + name: data + spec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: "1Gi" diff --git a/testing/kuttl/e2e-other/cluster-migrate/01-assert.yaml b/testing/kuttl/e2e-other/cluster-migrate/01-assert.yaml new file mode 100644 index 0000000000..c45fe79261 --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/01-assert.yaml @@ -0,0 +1,8 @@ +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: non-crunchy-cluster +status: + readyReplicas: 1 + replicas: 1 + updatedReplicas: 1 diff --git a/testing/kuttl/e2e-other/cluster-migrate/02--create-data.yaml b/testing/kuttl/e2e-other/cluster-migrate/02--create-data.yaml new file mode 100644 index 0000000000..a9b7ebf152 --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/02--create-data.yaml @@ -0,0 +1,30 @@ +--- +# Create some data that will be preserved after migration. +apiVersion: batch/v1 +kind: Job +metadata: + name: original-data + labels: { postgres-operator-test: kuttl } +spec: + backoffLimit: 3 + template: + metadata: + labels: { postgres-operator-test: kuttl } + spec: + restartPolicy: Never + containers: + - name: psql + image: ${KUTTL_PSQL_IMAGE} + env: + - { name: PGHOST, value: "non-crunchy-cluster" } + # Do not wait indefinitely. + - { name: PGCONNECT_TIMEOUT, value: '5' } + - { name: PGPASSWORD, valueFrom: { secretKeyRef: { name: non-crunchy-cluster, key: postgres-password } } } + command: + - psql + - --username=postgres + - --dbname=postgres + - --set=ON_ERROR_STOP=1 + - --command + - | + CREATE TABLE IF NOT EXISTS important (data) AS VALUES ('treasure'); diff --git a/testing/kuttl/e2e-other/cluster-migrate/02-assert.yaml b/testing/kuttl/e2e-other/cluster-migrate/02-assert.yaml new file mode 100644 index 0000000000..5115ba97c9 --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/02-assert.yaml @@ -0,0 +1,7 @@ +--- +apiVersion: batch/v1 +kind: Job +metadata: + name: original-data +status: + succeeded: 1 diff --git a/testing/kuttl/e2e-other/cluster-migrate/03--alter-pv.yaml b/testing/kuttl/e2e-other/cluster-migrate/03--alter-pv.yaml new file mode 100644 index 0000000000..64fa700297 --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/03--alter-pv.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: | + set -e + VOLUME_NAME=$( + kubectl get pvc --namespace "${NAMESPACE}" \ + --output=jsonpath={.items..spec.volumeName} + ) + + ORIGINAL_POLICY=$( + kubectl get pv "${VOLUME_NAME}" \ + --output=jsonpath={.spec.persistentVolumeReclaimPolicy} + ) + + kubectl create configmap persistent-volume-reclaim-policy --namespace "${NAMESPACE}" \ + --from-literal=ORIGINAL_POLICY="${ORIGINAL_POLICY}" \ + --from-literal=VOLUME_NAME="${VOLUME_NAME}" + + kubectl patch pv "${VOLUME_NAME}" -p '{"spec":{"persistentVolumeReclaimPolicy":"Retain"}}' + + kubectl label pv "${VOLUME_NAME}" postgres-operator-test=kuttl app.kubernetes.io/name=postgresql app.kubernetes.io/instance=non-crunchy-cluster test-namespace="${NAMESPACE}" diff --git a/testing/kuttl/e2e-other/cluster-migrate/04--delete.yaml b/testing/kuttl/e2e-other/cluster-migrate/04--delete.yaml new file mode 100644 index 0000000000..ed38b23d9f --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/04--delete.yaml @@ -0,0 +1,15 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +delete: +- apiVersion: apps/v1 + kind: StatefulSet + name: non-crunchy-cluster +- apiVersion: v1 + kind: Service + name: non-crunchy-cluster +- apiVersion: v1 + kind: Service + name: non-crunchy-cluster-hl +- apiVersion: v1 + kind: Secret + name: non-crunchy-cluster diff --git a/testing/kuttl/e2e-other/cluster-migrate/04-errors.yaml b/testing/kuttl/e2e-other/cluster-migrate/04-errors.yaml new file mode 100644 index 0000000000..1767e8040f --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/04-errors.yaml @@ -0,0 +1,4 @@ +apiVersion: v1 +kind: Pod +metadata: + name: non-crunchy-cluster-0 diff --git a/testing/kuttl/e2e-other/cluster-migrate/05--cluster.yaml b/testing/kuttl/e2e-other/cluster-migrate/05--cluster.yaml new file mode 100644 index 0000000000..a81666ed01 --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/05--cluster.yaml @@ -0,0 +1,30 @@ +apiVersion: postgres-operator.crunchydata.com/v1beta1 +kind: PostgresCluster +metadata: + name: cluster-migrate +spec: + dataSource: + volumes: + pgDataVolume: + pvcName: data-non-crunchy-cluster-0 + directory: data + postgresVersion: ${KUTTL_PG_VERSION} + instances: + - name: instance1 + dataVolumeClaimSpec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: 1Gi + backups: + pgbackrest: + repos: + - name: repo1 + volume: + volumeClaimSpec: + accessModes: + - "ReadWriteOnce" + resources: + requests: + storage: 1Gi diff --git a/testing/kuttl/e2e-other/cluster-migrate/06-assert.yaml b/testing/kuttl/e2e-other/cluster-migrate/06-assert.yaml new file mode 100644 index 0000000000..1a25966abb --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/06-assert.yaml @@ -0,0 +1,21 @@ +apiVersion: postgres-operator.crunchydata.com/v1beta1 +kind: PostgresCluster +metadata: + name: cluster-migrate +status: + instances: + - name: instance1 + readyReplicas: 1 + replicas: 1 + updatedReplicas: 1 +--- +apiVersion: v1 +kind: Pod +metadata: + labels: + postgres-operator.crunchydata.com/cluster: cluster-migrate + postgres-operator.crunchydata.com/data: postgres + postgres-operator.crunchydata.com/instance-set: instance1 + postgres-operator.crunchydata.com/role: master +status: + phase: Running diff --git a/testing/kuttl/e2e-other/cluster-migrate/07--set-collation.yaml b/testing/kuttl/e2e-other/cluster-migrate/07--set-collation.yaml new file mode 100644 index 0000000000..00eb741f80 --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/07--set-collation.yaml @@ -0,0 +1,23 @@ +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: | + set -e + if [[ ${KUTTL_PG_VERSION} -ge 15 ]]; then + PRIMARY= + while [[ -z "${PRIMARY}" ]]; do + PRIMARY=$( + kubectl get pod --namespace "${NAMESPACE}" \ + --output name --selector ' + postgres-operator.crunchydata.com/cluster=cluster-migrate, + postgres-operator.crunchydata.com/role=master' + ) + done + + # Ignore warnings about collation changes. This is DANGEROUS on real data! + # Only do this automatic step in test conditions; with real data, this may cause + # more problems as you may need to reindex. + kubectl exec --namespace "${NAMESPACE}" "${PRIMARY}" -c database \ + -- psql -qAt --command \ + 'ALTER DATABASE postgres REFRESH COLLATION VERSION; ALTER DATABASE template1 REFRESH COLLATION VERSION;' + fi diff --git a/testing/kuttl/e2e-other/cluster-migrate/08--alter-pv.yaml b/testing/kuttl/e2e-other/cluster-migrate/08--alter-pv.yaml new file mode 100644 index 0000000000..c5edfb4c99 --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/08--alter-pv.yaml @@ -0,0 +1,16 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: | + set -e + SAVED_DATA=$( + kubectl get configmap persistent-volume-reclaim-policy --namespace "${NAMESPACE}" \ + --output=jsonpath="{.data..['ORIGINAL_POLICY','VOLUME_NAME']}" + ) + + IFS=' ' + read ORIGINAL_POLICY VOLUME_NAME <<< "${SAVED_DATA}" + + kubectl patch pv "${VOLUME_NAME}" -p '{"spec":{"persistentVolumeReclaimPolicy":"'${ORIGINAL_POLICY}'"}}' + diff --git a/testing/kuttl/e2e-other/cluster-migrate/09--check-data.yaml b/testing/kuttl/e2e-other/cluster-migrate/09--check-data.yaml new file mode 100644 index 0000000000..6a46bd8e9a --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/09--check-data.yaml @@ -0,0 +1,23 @@ +--- +apiVersion: kuttl.dev/v1beta1 +kind: TestStep +commands: + - script: | + set -e + PRIMARY=$( + kubectl get pod --namespace "${NAMESPACE}" \ + --output name --selector ' + postgres-operator.crunchydata.com/cluster=cluster-migrate, + postgres-operator.crunchydata.com/role=master' + ) + + TREASURE=$( + kubectl exec "${PRIMARY}" --namespace "${NAMESPACE}" \ + --container database \ + -- psql -U postgres -qt -c "select data from important" + ) + + if [[ "${TREASURE}" != " treasure" ]]; then + echo "Migration from 3rd-party PG pod failed, result from query: ${TREASURE}" + exit 1 + fi diff --git a/testing/kuttl/e2e-other/cluster-migrate/README.md b/testing/kuttl/e2e-other/cluster-migrate/README.md new file mode 100644 index 0000000000..b2becc9ffb --- /dev/null +++ b/testing/kuttl/e2e-other/cluster-migrate/README.md @@ -0,0 +1,45 @@ +## Cluster Migrate + +This test was developed to check that users could bypass some known problems when +migrating from a non-Crunchy PostgreSQL image to a Crunchy PostgreSQL image: + +1) it changes the ownership of the data directory (which depends on fsGroup +behavior to change group ownership which is not available in all providers); +2) it makes sure a postgresql.conf file is available, as required by Patroni. + +Important note on *environment*: +As noted above, this work relies on fsGroup, so this test will not work in the current +form in all environments. For instance, this creates a PG cluster with fsGroup set, +which will result in an error in OpenShift. + +Important note on *PV permissions*: +This test involves changing permissions on PersistentVolumes, which may not be available +in all environments to all users (since this is a cluster-wide permission). + +Important note on migrating between different builds of *Postgres 15*: +PG 15 introduced new behavior around database collation versions, which result in errors like: + +``` +WARNING: database \"postgres\" has a collation version mismatch +DETAIL: The database was created using collation version 2.31, but the operating system provides version 2.28 +``` + +This error occured in `reconcilePostgresDatabases` and prevented PGO from finishing the reconcile +loop. For _testing purposes_, this problem is worked around in steps 06 and 07, which wait for +the PG pod to be ready and then send a command to `REFRESH COLLATION VERSION` on the `postgres` +and `template1` databases (which were the only databases where this error was observed during +testing). + +This solution is fine for testing purposes, but is not a solution that should be done in production +as an automatic step. User intervention and supervision is recommended in that case. + +### Steps + +* 01: Create a non-Crunchy PostgreSQL cluster and wait for it to be ready +* 02: Create data on that cluster +* 03: Alter the Reclaim policy of the PV so that it will survive deletion of the cluster +* 04: Delete the original cluster, leaving the PV +* 05: Create a PGO-managed `postgrescluster` with the remaing PV as the datasource +* 06-07: Wait for the PG pod to be ready and alter the collation (PG 15 only, see above) +* 08: Alter the PV to the original Reclaim policy +* 09: Check that the data successfully migrated