From dbc715be20df2b692d2648d2ce3281733b98bd43 Mon Sep 17 00:00:00 2001 From: mrdrivingduck Date: Mon, 31 Jul 2023 16:31:26 +0800 Subject: [PATCH] feat: support flashback table Patch provided by: mrhemingway --- .../polar_monitor_flashback_log.c | 4 +- polardb_build.sh | 3 +- src/backend/access/heap/hio.c | 8 + src/backend/access/logindex/polar_logindex.c | 19 + .../access/logindex/polar_logindex_redo.c | 2 +- src/backend/access/logindex/polar_ringbuf.c | 2 +- src/backend/access/transam/clog.c | 40 + src/backend/access/transam/parallel.c | 6 + src/backend/access/transam/slru.c | 115 +- src/backend/access/transam/xlog.c | 119 +- src/backend/commands/cluster.c | 19 +- src/backend/commands/tablecmds.c | 6 +- src/backend/parser/analyze.c | 25 + src/backend/parser/gram.y | 26 +- src/backend/parser/parse_agg.c | 7 + src/backend/parser/parse_expr.c | 4 + src/backend/parser/parse_func.c | 3 + src/backend/polar_flashback/Makefile | 7 +- .../polar_fast_recovery_area.c | 336 +++++ src/backend/polar_flashback/polar_flashback.c | 229 ++++ .../polar_flashback/polar_flashback_clog.c | 275 +++++ .../polar_flashback/polar_flashback_log.c | 830 +++++++++++-- .../polar_flashback_log_decoder.c | 379 ------ .../polar_flashback_log_file.c | 167 +-- .../polar_flashback_log_index.c | 74 +- .../polar_flashback_log_index_queue.c | 24 +- .../polar_flashback_log_insert.c | 326 ----- .../polar_flashback_log_list.c | 140 +-- .../polar_flashback/polar_flashback_log_mem.c | 64 +- .../polar_flashback_log_reader.c | 172 ++- .../polar_flashback_log_repair_page.c | 161 --- .../polar_flashback_log_worker.c | 43 +- .../polar_flashback/polar_flashback_point.c | 633 +++++++++- .../polar_flashback_rel_filenode.c | 213 ++++ .../polar_flashback_snapshot.c | 853 +++++++++++++ .../polar_flashback/polar_flashback_table.c | 1084 +++++++++++++++++ src/backend/postmaster/pgstat.c | 18 + src/backend/replication/walreceiver.c | 15 +- src/backend/storage/buffer/bufmgr.c | 1 - src/backend/storage/buffer/polar_bufmgr.c 
| 1 - src/backend/storage/buffer/polar_copybuf.c | 2 + src/backend/storage/file/polar_fd.c | 8 + src/backend/storage/ipc/ipci.c | 6 +- src/backend/storage/ipc/procarray.c | 31 +- src/backend/storage/page/bufpage.c | 2 +- src/backend/tcop/utility.c | 21 + src/backend/utils/cache/relcache.c | 7 +- src/backend/utils/misc/guc.c | 54 +- src/bin/polar_tools/Makefile | 5 +- .../polar_tools/flashback_log_control_dump.c | 6 +- src/bin/polar_tools/flashback_log_file_dump.c | 174 +-- .../polar_tools/flashback_point_file_dump.c | 268 ++++ src/bin/polar_tools/flashback_snapshot_dump.c | 248 ++++ src/bin/polar_tools/fra_control_dump.c | 109 ++ src/bin/polar_tools/polar_tools.c | 9 + src/bin/polar_tools/polar_tools.h | 5 + src/fe_utils/Makefile | 2 +- src/fe_utils/timestamp.c | 62 + src/include/access/clog.h | 3 + src/include/access/polar_logindex.h | 2 + src/include/access/polar_logindex_internal.h | 1 + src/include/access/polar_ringbuf.h | 4 +- src/include/access/slru.h | 2 + src/include/catalog/pg_control.h | 1 - src/include/commands/cluster.h | 6 +- src/include/fe_utils/timestamp.h | 20 + src/include/nodes/nodes.h | 6 +- src/include/nodes/parsenodes.h | 14 + src/include/parser/kwlist.h | 2 + src/include/parser/parse_node.h | 3 +- src/include/pgstat.h | 6 + .../polar_fast_recovery_area.h | 74 ++ src/include/polar_flashback/polar_flashback.h | 26 + .../polar_flashback/polar_flashback_clog.h | 46 + .../polar_flashback/polar_flashback_log.h | 112 +- .../polar_flashback_log_decoder.h | 35 - .../polar_flashback_log_file.h | 31 +- .../polar_flashback_log_index.h | 1 + .../polar_flashback_log_index_queue.h | 1 + .../polar_flashback_log_insert.h | 44 - .../polar_flashback_log_internal.h | 27 +- .../polar_flashback_log_list.h | 14 +- .../polar_flashback/polar_flashback_log_mem.h | 12 +- .../polar_flashback_log_reader.h | 14 + .../polar_flashback_log_record.h | 16 +- .../polar_flashback_log_repair_page.h | 32 - .../polar_flashback_log_worker.h | 7 +- 
.../polar_flashback/polar_flashback_point.h | 121 +- .../polar_flashback_rel_filenode.h | 42 + .../polar_flashback_snapshot.h | 103 ++ .../polar_flashback/polar_flashback_table.h | 58 + src/include/storage/lwlock.h | 2 + src/include/storage/polar_fd.h | 2 + src/include/storage/procarray.h | 5 +- src/include/storage/relfilenode.h | 8 + src/include/utils/px_unsync_guc_name.h | 4 +- src/test/Makefile | 6 - src/test/modules/Makefile | 3 +- .../test_flashback_log/test_flashback_log.c | 82 +- .../test_flashback_log.conf | 3 +- .../modules/test_flashback_table/.gitignore | 4 + .../modules/test_flashback_table/Makefile | 52 + .../expected/flashback_table_isolation.out | 361 ++++++ .../expected/test_flashback_table.out | 381 ++++++ .../specs/flashback_table_isolation.spec | 99 ++ .../sql/test_flashback_table.sql | 204 ++++ .../test_flashback_table--1.0.sql | 65 + .../test_flashback_table.c | 1036 ++++++++++++++++ .../test_flashback_table.conf | 28 + .../test_flashback_table.control | 4 + src/test/perl/PolarRegression.pm | 134 +- .../t/009_flashback_random_table.pl | 62 + src/test/polar_flog_repair_partial/.gitignore | 2 - src/test/polar_flog_repair_partial/Makefile | 25 - src/test/polar_flog_repair_partial/README | 25 - src/test/polar_pl/Makefile | 1 + src/test/polar_pl/README | 7 +- .../t/017_flog_solve_torn_page.pl} | 0 .../t/018_flog_in_online_promote.pl} | 0 src/tools/pgindent/typedefs.list | 1 + 120 files changed, 9182 insertions(+), 1782 deletions(-) create mode 100644 src/backend/polar_flashback/polar_fast_recovery_area.c create mode 100644 src/backend/polar_flashback/polar_flashback.c create mode 100644 src/backend/polar_flashback/polar_flashback_clog.c delete mode 100644 src/backend/polar_flashback/polar_flashback_log_decoder.c delete mode 100644 src/backend/polar_flashback/polar_flashback_log_insert.c delete mode 100644 src/backend/polar_flashback/polar_flashback_log_repair_page.c create mode 100644 src/backend/polar_flashback/polar_flashback_rel_filenode.c create 
mode 100644 src/backend/polar_flashback/polar_flashback_snapshot.c create mode 100644 src/backend/polar_flashback/polar_flashback_table.c create mode 100644 src/bin/polar_tools/flashback_point_file_dump.c create mode 100644 src/bin/polar_tools/flashback_snapshot_dump.c create mode 100644 src/bin/polar_tools/fra_control_dump.c create mode 100644 src/fe_utils/timestamp.c create mode 100644 src/include/fe_utils/timestamp.h create mode 100644 src/include/polar_flashback/polar_fast_recovery_area.h create mode 100644 src/include/polar_flashback/polar_flashback.h create mode 100644 src/include/polar_flashback/polar_flashback_clog.h delete mode 100644 src/include/polar_flashback/polar_flashback_log_decoder.h delete mode 100644 src/include/polar_flashback/polar_flashback_log_insert.h delete mode 100644 src/include/polar_flashback/polar_flashback_log_repair_page.h create mode 100644 src/include/polar_flashback/polar_flashback_rel_filenode.h create mode 100644 src/include/polar_flashback/polar_flashback_snapshot.h create mode 100644 src/include/polar_flashback/polar_flashback_table.h create mode 100644 src/test/modules/test_flashback_table/.gitignore create mode 100644 src/test/modules/test_flashback_table/Makefile create mode 100644 src/test/modules/test_flashback_table/expected/flashback_table_isolation.out create mode 100644 src/test/modules/test_flashback_table/expected/test_flashback_table.out create mode 100644 src/test/modules/test_flashback_table/specs/flashback_table_isolation.spec create mode 100644 src/test/modules/test_flashback_table/sql/test_flashback_table.sql create mode 100644 src/test/modules/test_flashback_table/test_flashback_table--1.0.sql create mode 100644 src/test/modules/test_flashback_table/test_flashback_table.c create mode 100644 src/test/modules/test_flashback_table/test_flashback_table.conf create mode 100644 src/test/modules/test_flashback_table/test_flashback_table.control create mode 100644 
src/test/polar_consistency/t/009_flashback_random_table.pl delete mode 100644 src/test/polar_flog_repair_partial/.gitignore delete mode 100644 src/test/polar_flog_repair_partial/Makefile delete mode 100644 src/test/polar_flog_repair_partial/README rename src/test/{polar_flog_repair_partial/t/001_flog_solve_torn_page.pl => polar_pl/t/017_flog_solve_torn_page.pl} (100%) rename src/test/{polar_flog_repair_partial/t/002_flog_in_online_promote.pl => polar_pl/t/018_flog_in_online_promote.pl} (100%) diff --git a/external/polar_monitor/polar_monitor_flashback_log.c b/external/polar_monitor/polar_monitor_flashback_log.c index 9d088c34805..2a1f554938f 100644 --- a/external/polar_monitor/polar_monitor_flashback_log.c +++ b/external/polar_monitor/polar_monitor_flashback_log.c @@ -132,8 +132,8 @@ polar_stat_flashback_log_buf(PG_FUNCTION_ARGS) ptr = polar_get_curr_flog_ptr(flog_instance->buf_ctl, &prev_ptr); initalized_upto = polar_get_flog_buf_initalized_upto(flog_instance->buf_ctl); - polar_flog_get_keep_wal_lsn(flog_instance->buf_ctl, &keep_wal_lsn); - is_ready = polar_is_flog_buf_ready(flog_instance->buf_ctl); + keep_wal_lsn = flog_instance->buf_ctl->redo_lsn; + is_ready = POLAR_IS_FLOG_BUF_READY(flog_instance->buf_ctl); values[i++] = LSNGetDatum(ptr); values[i++] = LSNGetDatum(prev_ptr); diff --git a/polardb_build.sh b/polardb_build.sh index 8344b407c72..295ebe5a726 100755 --- a/polardb_build.sh +++ b/polardb_build.sh @@ -598,7 +598,8 @@ then if [[ $enable_flashback_log == "on" ]]; then - echo "polar_enable_flashback_log = on" >> $pg_bld_master_dir/postgresql.conf + echo "polar_enable_flashback_log = on + polar_enable_fast_recovery_area = on" >> $pg_bld_master_dir/postgresql.conf fi # echo "max_wal_size = 16GB diff --git a/src/backend/access/heap/hio.c b/src/backend/access/heap/hio.c index 8b531c7f619..700ba062efa 100644 --- a/src/backend/access/heap/hio.c +++ b/src/backend/access/heap/hio.c @@ -25,6 +25,7 @@ #include "storage/smgr.h" /* POLAR */ +#include 
"polar_flashback/polar_flashback.h" #include "utils/guc.h" static Buffer polar_relation_add_extra_blocks_and_return_last_buffer(Relation relation, BulkInsertState bistate); @@ -674,6 +675,9 @@ polar_relation_add_extra_blocks_and_return_last_buffer(Relation relation, BulkIn int index = 0; char *bulk_buf_block = NULL; BufferAccessStrategy strategy = NULL; + bool need_flog; + + need_flog = polar_enable_fra(fra_instance); if (bistate != NULL) { @@ -768,6 +772,10 @@ polar_relation_add_extra_blocks_and_return_last_buffer(Relation relation, BulkIn PageInit(page, BufferGetPageSize(buffer), 0); + /* Insert the flashback log record for relation bulk extend */ + if (need_flog) + polar_flog_rel_bulk_extend(flog_instance, buffer); + /* * We mark all the new buffers dirty, but do nothing to write them * out; they'll probably get used soon, and even if they are not, a diff --git a/src/backend/access/logindex/polar_logindex.c b/src/backend/access/logindex/polar_logindex.c index e8f268c5edd..d7cabce51b3 100644 --- a/src/backend/access/logindex/polar_logindex.c +++ b/src/backend/access/logindex/polar_logindex.c @@ -2360,3 +2360,22 @@ polar_logindex_update_promoted_info(logindex_snapshot_t logindex_snapshot, XLogR info->old_rw_max_inserted_lsn = last_replayed_lsn; info->old_rw_max_tid = logindex_snapshot->max_idx_table_id; } + +XLogRecPtr +polar_get_logindex_max_parsed_lsn(logindex_snapshot_t logindex_snapshot) +{ + XLogRecPtr max_parsed_lsn = InvalidXLogRecPtr; + + SpinLockAcquire(LOG_INDEX_SNAPSHOT_LOCK); + max_parsed_lsn = logindex_snapshot->max_parsed_lsn; + SpinLockRelease(LOG_INDEX_SNAPSHOT_LOCK); + return max_parsed_lsn; +} + +void +polar_set_logindex_max_parsed_lsn(logindex_snapshot_t logindex_snapshot, XLogRecPtr lsn) +{ + SpinLockAcquire(LOG_INDEX_SNAPSHOT_LOCK); + logindex_snapshot->max_parsed_lsn = lsn; + SpinLockRelease(LOG_INDEX_SNAPSHOT_LOCK); +} diff --git a/src/backend/access/logindex/polar_logindex_redo.c b/src/backend/access/logindex/polar_logindex_redo.c index 
a303d6c8b8f..0a451aaa8c3 100644 --- a/src/backend/access/logindex/polar_logindex_redo.c +++ b/src/backend/access/logindex/polar_logindex_redo.c @@ -2610,7 +2610,7 @@ polar_logindex_find_first_fpi(polar_logindex_redo_ctl_t instance, XLogRecPtr sta } else elog(WARNING, "The first WAL record of " POLAR_LOG_BUFFER_TAG_FORMAT "" - "from %lx to %lx is not a full page image", POLAR_LOG_BUFFER_TAG(tag), + " from %lx to %lx is not a full page image", POLAR_LOG_BUFFER_TAG(tag), start_lsn, end_lsn); } diff --git a/src/backend/access/logindex/polar_ringbuf.c b/src/backend/access/logindex/polar_ringbuf.c index 214ce4d1dc4..1b19500b370 100644 --- a/src/backend/access/logindex/polar_ringbuf.c +++ b/src/backend/access/logindex/polar_ringbuf.c @@ -441,7 +441,7 @@ polar_ringbuf_free_up(polar_ringbuf_t rbuf, size_t len, polar_interrupt_callback CHECK_FOR_INTERRUPTS(); if (callback != NULL) - callback(); + callback(rbuf); pg_usleep(10); continue; diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 6c8d2e4afbe..cd00c6bd65d 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -1176,3 +1176,43 @@ polar_remove_clog_local_cache_file(void) { polar_slru_remove_local_cache_file(ClogCtl); } + +/* POLAR */ + +/* + * POLAR: Get the minimal segment no in clog + */ +int +polar_get_clog_min_seg_no(void) +{ + int min_seg_no = INT_MAX; + + SlruScanDirectory(ClogCtl, polar_slru_find_min_seg, &min_seg_no); + return min_seg_no; +} + +bool +polar_xid_in_clog_dir(TransactionId xid, const char *clog_dir) +{ + int pageno = TransactionIdToPage(xid); + SlruCtlData ctl; + + StrNCpy(ctl.Dir, clog_dir, sizeof(ctl.Dir)); + return polar_slru_page_physical_exists(&ctl, pageno); +} + +XidStatus +polar_get_xid_status(TransactionId xid, const char *clog_dir) +{ + int pageno = TransactionIdToPage(xid); + int byteno = TransactionIdToByte(xid); + int bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT; + char *byteptr; + XidStatus status; + 
PGAlignedBlock clog_page; + + polar_physical_read_fra_slru(clog_dir, pageno, clog_page.data); + byteptr = clog_page.data + byteno; + status = (*byteptr >> bshift) & CLOG_XACT_BITMASK; + return status; +} diff --git a/src/backend/access/transam/parallel.c b/src/backend/access/transam/parallel.c index 35820633e8a..b0cbaaaaeb6 100644 --- a/src/backend/access/transam/parallel.c +++ b/src/backend/access/transam/parallel.c @@ -45,6 +45,9 @@ /* POLAR px */ #include "access/px_btbuild.h" +/* POLAR: flashback table */ +#include "polar_flashback/polar_flashback_table.h" + /* * We don't want to waste a lot of memory on an error queue which, most of * the time, will process only a handful of small messages. However, it is @@ -143,6 +146,9 @@ static const struct { "polar_px_bt_build_main", polar_px_bt_build_main }, + { + "polar_flashback_pages_woker_main", polar_flashback_pages_woker_main + } /* POLAR End*/ }; diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 1e8074f7c8b..c674d4a6b17 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -63,6 +63,7 @@ /* POLAR end */ /* POLAR */ +#include "polar_flashback/polar_fast_recovery_area.h" #include "utils/guc.h" #include "storage/polar_fd.h" @@ -155,6 +156,7 @@ static void polar_slru_file_name_by_name(SlruCtl ctl, char *path, char *filename static void polar_slru_file_dir(SlruCtl ctl, char *path); static bool polar_slru_local_cache_read_page(SlruCtl ctl, int pageno, int slotno); static bool polar_slru_local_cache_write_page(SlruCtl ctl, int pageno, int slotno); +static bool polar_slru_scan_dir_internal(SlruCtl ctl, SlruScanCallback callback, void *data, const char *path); #define SlruFileName(a,b,c) polar_slru_file_name_by_seg(a,b,c) @@ -1666,6 +1668,8 @@ restart:; * * NB: This does not touch the SLRU buffers themselves, callers have to ensure * they either can't yet contain anything, or have already been cleaned out. 
+ * + * POLAR: Add rename action for flashback table/database. */ static void SlruInternalDeleteSegment(SlruCtl ctl, char *filename) @@ -1689,7 +1693,11 @@ SlruInternalDeleteSegment(SlruCtl ctl, char *filename) polar_slru_file_name_by_name(ctl, path, filename); ereport(LOG, (errmsg("removing file \"%s\"", path))); - polar_unlink(path); + + if (polar_slru_seg_need_mv(fra_instance, ctl)) + polar_mv_slru_seg_to_fra(fra_instance, filename, path); + else + polar_unlink(path); } } @@ -1831,20 +1839,43 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) { bool retval = false; SlruShared shared = ctl->shared; - DIR *cldir; - struct dirent *clde; - int segno; - int segpage; char path[MAXPGPATH]; + /* + * POLAR: We add a local cache for slru files, so scan it first. + * Scan the shared storage while the return value is false. + * + * NB: If you want to scan the local cache and shared storage all files, + * the return value must be false. + */ if (shared->polar_cache != NULL) - cldir = polar_allocate_dir(shared->polar_cache->dir_name); - else + { + snprintf(path, MAXPGPATH, "%s", shared->polar_cache->dir_name); + retval = polar_slru_scan_dir_internal(ctl, callback, data, path); + } + + if (!retval) { polar_slru_file_dir(ctl, path); - cldir = polar_allocate_dir(path); + retval = polar_slru_scan_dir_internal(ctl, callback, data, path); } + return retval; +} + +/* + * POLAR: Slru scan directory internal function like SlruScanDirectory old version. 
+ */ +static bool +polar_slru_scan_dir_internal(SlruCtl ctl, SlruScanCallback callback, void *data, const char *path) +{ + bool retval = false; + DIR *cldir; + struct dirent *clde; + int segno; + int segpage; + + cldir = polar_allocate_dir(path); while ((clde = ReadDir(cldir, path)) != NULL) { size_t len; @@ -1858,7 +1889,7 @@ SlruScanDirectory(SlruCtl ctl, SlruScanCallback callback, void *data) segpage = segno * SLRU_PAGES_PER_SEGMENT; elog(DEBUG2, "SlruScanDirectory invoking callback on %s/%s", - ctl->Dir, clde->d_name); + path, clde->d_name); retval = callback(ctl, clde->d_name, segpage, data); if (retval) break; @@ -2169,3 +2200,69 @@ polar_slru_remove_local_cache_file(SlruCtl ctl) if (shared && shared->polar_cache) polar_local_cache_move_trash(shared->polar_cache->dir_name); } + +/* + * POLAR: SlruScanDirectory callback. + * This callback finds the minimal segment number among all segments. + */ +bool +polar_slru_find_min_seg(SlruCtl ctl, char *filename, int segpage, void *data) +{ + int seg_no; + int *min_seg_no = (int *) data; + + seg_no = (int) strtol(filename, NULL, 16); + *min_seg_no = Min(seg_no, *min_seg_no); + return false; /* keep going */ +} + +/* + * POLAR: Physically read a page from a fast recovery area slru file. + * + * Like SlruPhysicalReadPage, but we report an error when we can't find the file, + * while SlruPhysicalReadPage just returns an empty page.
+ */ +void +polar_physical_read_fra_slru(const char *slru_dir, int page_no, char *page) +{ + SlruCtlData ctl; + int seg_no = page_no / SLRU_PAGES_PER_SEGMENT; + int rpageno = page_no % SLRU_PAGES_PER_SEGMENT; + int offset = rpageno * BLCKSZ; + char path[MAXPGPATH]; + int fd; + + StrNCpy(ctl.Dir, slru_dir, sizeof(ctl.Dir)); + SlruFileName(&ctl, path, seg_no); + + fd = polar_open_transient_file(path, O_RDWR | PG_BINARY); + + if (fd < 0) + /*no cover line*/ + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + + pgstat_report_wait_start(WAIT_EVENT_SLRU_READ); + + if (polar_pread(fd, page, BLCKSZ, offset) != BLCKSZ) + { + /*no cover begin*/ + pgstat_report_wait_end(); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read from file \"%s\": %m", path))); + /*no cover end*/ + } + + pgstat_report_wait_end(); + + if (CloseTransientFile(fd)) + { + /*no cover begin*/ + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not close fast recovery area slru file %s: %m", path))); + /*no cover end*/ + } +} diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 13e8e9fb84a..1d5b3124e87 100755 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -91,7 +91,7 @@ #include "commands/tablespace.h" #include "polar_dma/polar_dma.h" #include "polar_datamax/polar_datamax.h" -#include "polar_flashback/polar_flashback_point.h" +#include "polar_flashback/polar_flashback.h" #include "portability/instr_time.h" #include "replication/polar_cluster_info.h" #include "storage/polar_fd.h" @@ -7852,8 +7852,8 @@ StartupXLOG(void) pg_atomic_write_u32(&polar_shmem_csn_mvcc_var_cache->polar_oldest_active_xid, checkPoint.nextXid); /* POLAR end */ - /* Startup all about flashback log */ - polar_startup_flog(&checkPoint, flog_instance); + /* Startup all about flashback */ + polar_startup_flashback(&checkPoint); /* * Initialize replication slots, before there's a chance to 
remove @@ -9065,7 +9065,7 @@ StartupXLOG(void) */ if (!ArchiveRecoveryRequested && polar_lazy_end_of_recovery_checkpoint_enabled() && - !fast_promoted && !bgwriterLaunched) + !fast_promoted && !bgwriterLaunched && !flog_instance) polar_lazy_end_of_recovery_checkpoint = true; /* @@ -9380,7 +9380,7 @@ StartupXLOG(void) if (enable_logindex_online_promote) { /* Start up the flashback log before reset background replayed lsn */ - polar_startup_flog(&ControlFile->checkPointCopy, flog_instance); + polar_startup_flashback(&ControlFile->checkPointCopy); /* POLAR: Wake up background process to start marking buffer dirty from replayed lsn */ polar_reset_bg_replayed_lsn(polar_logindex_redo_instance); } @@ -10248,6 +10248,7 @@ CreateCheckPoint(int flags) polar_flog_rec_ptr flog_ptr_ckp_start = POLAR_INVALID_FLOG_REC_PTR; bool is_flashback_point = false; bool is_online_promote = false; + flashback_snapshot_header_t fbpoint_snapshot = NULL; /* * POLAR: Don't do checkpoint during online promote when background process @@ -10311,6 +10312,20 @@ CreateCheckPoint(int flags) polar_is_flashback_point(flog_instance, GetXLogInsertRecPtr(), InvalidXLogRecPtr, &flags, false); + /* + * POLAR: get current flashback log ptr of the flashback point begining. + * + * NB: We don't need a precise value, but a little earlier value. + * When we search a right flashback log in logindex, sometime we will get + * two right flashback log. + */ + if (is_flashback_point) + { + flog_ptr_ckp_start = polar_get_flog_write_result(flog_instance->buf_ctl); + /* Get the current snapshot for flashback table */ + fbpoint_snapshot = polar_get_flashback_snapshot_data(fra_instance, GetXLogInsertRecPtr()); + } + /* * Use a critical section to force system panic if we have trouble. 
*/ @@ -10357,16 +10372,6 @@ CreateCheckPoint(int flags) if (polar_csn_enable) oldest_active_xid = pg_atomic_read_u32(&polar_shmem_csn_mvcc_var_cache->polar_oldest_active_xid); - /* - * POLAR: get current flashback log ptr of the flashback point begining. - * - * NB: We don't need a precise value, but a little earlier value. - * When we search a right flashback log in logindex, sometime we will get - * two right flashback log. - */ - if (is_flashback_point) - flog_ptr_ckp_start = polar_get_flog_write_result(flog_instance->buf_ctl); - /* * Get location of last important record before acquiring insert locks (as * GetLastImportantRecPtr() also locks WAL locks). @@ -10687,7 +10692,7 @@ CreateCheckPoint(int flags) * 2. remove the old flashback data. */ if (is_flashback_point) - polar_flog_do_fbpoint(flog_instance, flog_ptr_ckp_start, shutdown); + polar_do_flashback_point(flog_ptr_ckp_start, fbpoint_snapshot, shutdown); /* POLAR: record checkpoint ptr when checkpoint finished */ polar_buffer_pool_ctl_set_last_checkpoint_lsn(ControlFile->checkPoint); @@ -10863,6 +10868,12 @@ CheckPointGuts(XLogRecPtr checkPointRedo, int flags) CheckPointReplicationOrigin(); /* We deliberately delay 2PC checkpointing as long as possible */ CheckPointTwoPhase(checkPointRedo); + + /* + * POLAR: Flashback log do checkpoint. + * Now just set flog_instance->buf_ctl->redo_lsn to checkPointRedo. + */ + POLAR_CHECK_POINT_FLOG(flog_instance, checkPointRedo); } /* @@ -10951,20 +10962,9 @@ CreateRestartPoint(int flags) bg_replayed_lsn = polar_bg_redo_get_replayed_lsn(polar_logindex_redo_instance); /* POLAR: Is it a flashback point? */ is_flashback_point = polar_is_flashback_point(flog_instance, - polar_get_replay_end_rec_ptr(&replayTLI), bg_replayed_lsn, + polar_get_replay_end_rec_ptr(NULL), bg_replayed_lsn, &flags, true); - flashback_point_time = (pg_time_t) time(NULL); - /* - * POLAR: get current flashback log ptr of the flashback point begining. 
- * - * NB: We don't need a precise value, but a little earlier value. - * When we search a right flashback log in logindex, sometime we will get - * two right flashback log. - */ - if (is_flashback_point) - flog_ptr_ckp_start = polar_get_flog_write_result(flog_instance->buf_ctl); - /* * Acquire CheckpointLock to ensure only one restartpoint or checkpoint * happens at a time. @@ -10991,19 +10991,6 @@ CreateRestartPoint(int flags) SpinLockRelease(&XLogCtl->info_lck); } - /* - * POLAR: Set the flashback point lsn to replayEndRecPtr. - * - * NB: Must hold the XLogCtl->info_lck to protect XLogCtl->replayEndRecPtr - * not change. - */ - if (is_flashback_point) - { - SpinLockAcquire(&XLogCtl->info_lck); - polar_set_fbpoint_wal_info(flog_instance->buf_ctl, XLogCtl->replayEndRecPtr, flashback_point_time, bg_replayed_lsn, true); - SpinLockRelease(&XLogCtl->info_lck); - } - /* * Check that we're still in recovery mode. It's ok if we exit recovery * mode after this check, the restart point is valid anyway. @@ -11034,6 +11021,17 @@ CreateRestartPoint(int flags) } /* POLAR end */ + flashback_point_time = (pg_time_t) time(NULL); + /* + * POLAR: get current flashback log ptr of the flashback point begining. + * + * NB: We don't need a precise value, but a little earlier value. + * When we search a right flashback log in logindex, sometime we will get + * two right flashback log. + */ + if (is_flashback_point) + flog_ptr_ckp_start = polar_get_flog_write_result(flog_instance->buf_ctl); + /* * If the last checkpoint record we've replayed is already our last * restartpoint, we can't perform a new restart point. We still update @@ -11068,14 +11066,29 @@ CreateRestartPoint(int flags) /* * When hot standby is shutdown in this case, do the shutdown checkpoint * things about flashback log. + * + * NB: Now the fast recovery area is disable for standby, so we don't + * need the next xid and snapshot. 
+ */ if (is_flashback_point) - polar_flog_do_fbpoint(flog_instance, flog_ptr_ckp_start, true); + { + polar_set_fbpoint_wal_info(flog_instance->buf_ctl, polar_get_replay_end_rec_ptr(NULL), flashback_point_time, bg_replayed_lsn, true); + polar_do_flashback_point(flog_ptr_ckp_start, NULL, true); + } } LWLockRelease(CheckpointLock); return false; } + /* + * POLAR: Set the flashback point lsn to replayEndRecPtr. + * + * NB: replayEndRecPtr is read via polar_get_replay_end_rec_ptr() here; this code + * no longer takes XLogCtl->info_lck itself (TODO confirm the helper locks). + */ + if (is_flashback_point) + polar_set_fbpoint_wal_info(flog_instance->buf_ctl, polar_get_replay_end_rec_ptr(NULL), flashback_point_time, bg_replayed_lsn, true); + /* * Update the shared RedoRecPtr so that the startup process can calculate * the number of segments replayed since last restartpoint, and request a @@ -11170,9 +11183,12 @@ CreateRestartPoint(int flags) * POLAR: Do something after the flashback point is done: * 1. Flush the flashback data. * 2. remove the old flashback data. + * + * NB: Now the fast recovery area is disabled for standby, so we don't + * need the next xid and snapshot.
*/ if (is_flashback_point) - polar_flog_do_fbpoint(flog_instance, flog_ptr_ckp_start, is_shutdown); + polar_do_flashback_point(flog_ptr_ckp_start, NULL, is_shutdown); /* POLAR: record checkpoint ptr when checkpoint finished */ polar_buffer_pool_ctl_set_last_checkpoint_lsn(ControlFile->checkPoint); @@ -15334,10 +15350,7 @@ polar_calc_min_used_lsn(bool is_contain_replication_slot) min_lsn = Min(min_lsn, logindex_start_lsn); /* Keep the wal for flashback */ - if (polar_is_flog_enabled(flog_instance)) - polar_flog_get_keep_wal_lsn(flog_instance->buf_ctl, &min_lsn); - - return min_lsn; + return polar_get_flashback_keep_wal(min_lsn); } void @@ -17643,24 +17656,28 @@ polar_fill_segment_file_zero(int fd, char *tmppath, int segment_size, int nbytes = 0; instr_time polar_init_start; instr_time polar_init_end; + int each_size = POALR_FILL_ZERO_EACH_SIZE; + + if (each_size > segment_size) + each_size = segment_size; if (IsUnderPostmaster) INSTR_TIME_SET_CURRENT(polar_init_start); MemSet(data, 0, sizeof(data)); - for (nbytes = 0; nbytes < segment_size; nbytes += POALR_FILL_ZERO_EACH_SIZE) + for (nbytes = 0; nbytes < segment_size; nbytes += each_size) { int rc = 0; errno = 0; pgstat_report_wait_start(init_write_event_info); if (POLAR_ENABLE_PWRITE()) - rc = (int)polar_pwrite(fd, data, POALR_FILL_ZERO_EACH_SIZE, nbytes); + rc = (int)polar_pwrite(fd, data, each_size, nbytes); else /*no cover line*/ - rc = (int)polar_write(fd, data, POALR_FILL_ZERO_EACH_SIZE); + rc = (int)polar_write(fd, data, each_size); - if (rc != POALR_FILL_ZERO_EACH_SIZE) + if (rc != each_size) { /*no cover begin*/ int save_errno = errno; diff --git a/src/backend/commands/cluster.c b/src/backend/commands/cluster.c index 0a425e10186..c12ae8e4bc2 100644 --- a/src/backend/commands/cluster.c +++ b/src/backend/commands/cluster.c @@ -54,6 +54,9 @@ #include "utils/tqual.h" #include "utils/tuplesort.h" +/* POLAR: start */ +#include "polar_flashback/polar_flashback_rel_filenode.h" +/* POLAR: end */ /* * This struct is 
used to pass around the information on tables to be @@ -619,8 +622,8 @@ rebuild_relation(Relation OldHeap, Oid indexOid, bool verbose) * data, then call finish_heap_swap to complete the operation. */ Oid -make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence, - LOCKMODE lockmode) +polar_make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence, + LOCKMODE lockmode, const char * NewHeapNameGiven) { TupleDesc OldHeapDesc; char NewHeapName[NAMEDATALEN]; @@ -670,7 +673,10 @@ make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence, * mapped. This simplifies swap_relation_files, and is absolutely * necessary for rebuilding pg_class, for reasons explained there. */ - snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap); + if (NewHeapNameGiven) + StrNCpy(NewHeapName, NewHeapNameGiven, sizeof(NewHeapName)); + else + snprintf(NewHeapName, sizeof(NewHeapName), "pg_temp_%u", OIDOldHeap); OIDNewHeap = heap_create_with_catalog(NewHeapName, namespaceid, @@ -1189,6 +1195,7 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, relfilenode2; Oid swaptemp; char swptmpchr; + bool change_persistence; /* We need writable copies of both pg_class tuples.
*/ relRelation = heap_open(RelationRelationId, RowExclusiveLock); @@ -1206,6 +1213,9 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, relfilenode1 = relform1->relfilenode; relfilenode2 = relform2->relfilenode; + /* When change the relation persistence, we need to flog the relation file node change */ + change_persistence = (relform1->relpersistence != relform2->relpersistence); + if (OidIsValid(relfilenode1) && OidIsValid(relfilenode2)) { /* @@ -1348,6 +1358,9 @@ swap_relation_files(Oid r1, Oid r2, bool target_is_pg_class, CacheInvalidateRelcacheByTuple(reltup2); } + /* POLAR: Log the relation file node update after swapping the two heaps */ + polar_flog_filenode_update(flog_instance, fra_instance, r1, relfilenode2, InvalidOid, change_persistence, false); + /* * Post alter hook for modified relations. The change to r2 is always * internal, but r1 depends on the invocation context. diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c index 32e59b784dd..8c55843da75 100644 --- a/src/backend/commands/tablecmds.c +++ b/src/backend/commands/tablecmds.c @@ -102,6 +102,7 @@ #include "utils/typcache.h" /* POLAR */ +#include "polar_flashback/polar_flashback_rel_filenode.h" #include "utils/guc.h" #include "storage/procarray.h" /* POLAR end */ @@ -11843,6 +11844,9 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode) rd_rel->relfilenode = newrelfilenode; CatalogTupleUpdate(pg_class, &tuple->t_self, tuple); + /* Log the relation file node update after pg_class changed */ + polar_flog_filenode_update(flog_instance, fra_instance, rel->rd_id, newrelfilenode, newTableSpace, false, true); + InvokeObjectPostAlterHook(RelationRelationId, RelationGetRelid(rel), 0); heap_freetuple(tuple); @@ -16218,4 +16222,4 @@ polar_px_btbuild_update_pg_class(Relation heap, Relation index) opt->arg = (Node *)makeString("finish"); options = lappend(options, opt); ATExecSetRelOptions(index, options, AT_SetRelOptions, 
ShareUpdateExclusiveLock); -} \ No newline at end of file +} diff --git a/src/backend/parser/analyze.c b/src/backend/parser/analyze.c index 8d4cadc9b31..bda8efe1e1e 100644 --- a/src/backend/parser/analyze.c +++ b/src/backend/parser/analyze.c @@ -86,6 +86,9 @@ static void transformLockingClause(ParseState *pstate, Query *qry, static bool test_raw_expression_coverage(Node *node, void *context); #endif +/* POLAR: transform flashback table statement */ +static Query *transformPolarFlashbackTableStmt(ParseState *pstate, + PolarFlashbackTableStmt *stmt); /* * parse_analyze @@ -341,6 +344,11 @@ transformStmt(ParseState *pstate, Node *parseTree) (CallStmt *) parseTree); break; + case T_PolarFlashbackTableStmt: + result = transformPolarFlashbackTableStmt(pstate, + (PolarFlashbackTableStmt *) parseTree); + break; + default: /* @@ -2994,3 +3002,20 @@ test_raw_expression_coverage(Node *node, void *context) } #endif /* RAW_EXPRESSION_COVERAGE_TEST */ + +/* + * POLAR: transform a PolarFlashbackTableStmt + * + * We need to do parse analysis on timestamp expression. 
+ */ +static Query * +transformPolarFlashbackTableStmt(ParseState *pstate, PolarFlashbackTableStmt *stmt) +{ + Query *result; + Node* node = transformExpr(pstate, stmt->target_timestamp, EXPR_KIND_FLASHBACK_TABLE); + stmt->time_expr = coerce_to_specific_type(pstate, node, TIMESTAMPTZOID, "FLASHBACK TABLE"); + result = makeNode(Query); + result->commandType = CMD_UTILITY; + result->utilityStmt = (Node *) stmt; + return result; +} diff --git a/src/backend/parser/gram.y b/src/backend/parser/gram.y index 044d339e8c4..bfe11e9e716 100644 --- a/src/backend/parser/gram.y +++ b/src/backend/parser/gram.y @@ -288,6 +288,8 @@ static bool polar_is_ignore_user_defined_tablespace(char *tablespace_name); CreateMatViewStmt RefreshMatViewStmt CreateAmStmt CreatePublicationStmt AlterPublicationStmt CreateSubscriptionStmt AlterSubscriptionStmt DropSubscriptionStmt + /* POLAR: flashback table stmt */ + PolarFlashbackTableStmt %type select_no_parens select_with_parens select_clause simple_select values_clause @@ -644,7 +646,7 @@ static bool polar_is_ignore_user_defined_tablespace(char *tablespace_name); EXCLUDE EXCLUDING EXCLUSIVE EXECUTE EXISTS EXPLAIN EXTENSION EXTERNAL EXTRACT - FALSE_P FAMILY FETCH FILTER FIRST_P FLASHBACK FLOAT_P FOLLOWER FOLLOWING FOR + FALSE_P FAMILY FETCH FILTER FIRST_P FLOAT_P FOLLOWER FOLLOWING FOR FORCE FOREIGN FORWARD FREEZE FROM FULL FUNCTION FUNCTIONS GENERATED GLOBAL GRANT GRANTED GREATEST GROUP_P GROUPING GROUPS @@ -710,6 +712,10 @@ static bool polar_is_ignore_user_defined_tablespace(char *tablespace_name); YEAR_P YES_P ZONE + + /* POLAR: FLASHBACK */ + FLASHBACK + /* POLAR: End */ /* * The grammar thinks these are keywords, but they are not in the kwlist.h @@ -955,6 +961,7 @@ stmt : | VariableSetStmt | VariableShowStmt | ViewStmt + | PolarFlashbackTableStmt | /*EMPTY*/ { $$ = NULL; } ; @@ -15328,6 +15335,23 @@ ColLabel: IDENT { $$ = $1; } ; +/***************************************************************************** + * + * POLAR: + * FLASHBACK TABLE 
relation_expr + * TO TIMESTAMP a_expr + * + *****************************************************************************/ +PolarFlashbackTableStmt: + FLASHBACK TABLE relation_expr TO TIMESTAMP a_expr + { + PolarFlashbackTableStmt *n = makeNode(PolarFlashbackTableStmt); + n->relation = $3; + n->target_timestamp = $6; + $$ = (Node *)n; + } + ; + /* * Keyword category lists. Generally, every keyword present in * the Postgres grammar should appear in exactly one of these lists. diff --git a/src/backend/parser/parse_agg.c b/src/backend/parser/parse_agg.c index 61727e1d71a..b3b21db1e1d 100644 --- a/src/backend/parser/parse_agg.c +++ b/src/backend/parser/parse_agg.c @@ -523,6 +523,10 @@ check_agglevels_and_constraints(ParseState *pstate, Node *expr) break; + case EXPR_KIND_FLASHBACK_TABLE: + errkind = true; + break; + /* * There is intentionally no default: case here, so that the * compiler will warn if we add a new ParseExprKind without @@ -902,6 +906,9 @@ transformWindowFuncCall(ParseState *pstate, WindowFunc *wfunc, case EXPR_KIND_CALL_ARGUMENT: err = _("window functions are not allowed in CALL arguments"); break; + case EXPR_KIND_FLASHBACK_TABLE: + errkind = true; + break; /* * There is intentionally no default: case here, so that the diff --git a/src/backend/parser/parse_expr.c b/src/backend/parser/parse_expr.c index cd2338d659b..092c60c848b 100644 --- a/src/backend/parser/parse_expr.c +++ b/src/backend/parser/parse_expr.c @@ -1818,6 +1818,7 @@ transformSubLink(ParseState *pstate, SubLink *sublink) case EXPR_KIND_RETURNING: case EXPR_KIND_VALUES: case EXPR_KIND_VALUES_SINGLE: + case EXPR_KIND_FLASHBACK_TABLE: /* okay */ break; case EXPR_KIND_CHECK_CONSTRAINT: @@ -3475,6 +3476,9 @@ ParseExprKindName(ParseExprKind exprKind) return "PARTITION BY"; case EXPR_KIND_CALL_ARGUMENT: return "CALL"; + /* POLAR: add expr for flashback table */ + case EXPR_KIND_FLASHBACK_TABLE: + return "FLASHBACK TABLE"; /* * There is intentionally no default: case here, so that the diff --git 
a/src/backend/parser/parse_func.c b/src/backend/parser/parse_func.c index 07b437cd5ec..bf206e4f2a7 100644 --- a/src/backend/parser/parse_func.c +++ b/src/backend/parser/parse_func.c @@ -2400,6 +2400,9 @@ check_srf_call_placement(ParseState *pstate, Node *last_srf, int location) case EXPR_KIND_CALL_ARGUMENT: err = _("set-returning functions are not allowed in CALL arguments"); break; + case EXPR_KIND_FLASHBACK_TABLE: + errkind = true; + break; /* * There is intentionally no default: case here, so that the diff --git a/src/backend/polar_flashback/Makefile b/src/backend/polar_flashback/Makefile index 1774eedc403..d28dcdfdd49 100644 --- a/src/backend/polar_flashback/Makefile +++ b/src/backend/polar_flashback/Makefile @@ -12,8 +12,9 @@ subdir = src/backend/polar_flashback top_builddir = ../../.. include $(top_builddir)/src/Makefile.global -OBJS = polar_flashback_log.o polar_flashback_log_file.o polar_flashback_log_index.o polar_flashback_log_mem.o \ - polar_flashback_log_reader.o polar_flashback_log_decoder.o polar_flashback_log_insert.o polar_flashback_log_worker.o \ - polar_flashback_point.o polar_flashback_log_list.o polar_flashback_log_index_queue.o polar_flashback_log_repair_page.o polar_flashback_drop.o +OBJS = polar_flashback_log.o polar_flashback_log_file.o polar_flashback_log_index.o polar_flashback_log_mem.o \ + polar_flashback_log_reader.o polar_flashback_log_worker.o polar_flashback_log_list.o polar_flashback_log_index_queue.o \ + polar_flashback_point.o polar_flashback_table.o polar_flashback.o polar_fast_recovery_area.o polar_flashback_clog.o \ + polar_flashback_snapshot.o polar_flashback_rel_filenode.o polar_flashback_drop.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/polar_flashback/polar_fast_recovery_area.c b/src/backend/polar_flashback/polar_fast_recovery_area.c new file mode 100644 index 00000000000..b28c85f9314 --- /dev/null +++ b/src/backend/polar_flashback/polar_fast_recovery_area.c @@ -0,0 +1,336 @@ 
+/*------------------------------------------------------------------------- + * + * polar_fast_recovery_area.c + * + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2021, Alibaba Group Holding limited + * + * Fast recovery area store data for flashback table. + * Now there are three sub directories: + * 1. pg_xact which contains each truncated clog. + * 2. fbpoint which contains the flashback point files (fbpoint records and snapshot data). + * + * IDENTIFICATION + * src/backend/polar_flashback/polar_fast_recovery_area.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "miscadmin.h" +#include "polar_flashback/polar_fast_recovery_area.h" +#include "polar_flashback/polar_flashback.h" +#include "polar_flashback/polar_flashback_snapshot.h" + +fra_ctl_t fra_instance = NULL; +bool polar_enable_fast_recovery_area; +int polar_fast_recovery_area_rotation; + +/* + * POLAR: Read fast recovery area control file data + */ +static bool +read_fra_ctl_file(fra_ctl_t ctl, fra_ctl_file_data_t *ctl_file_data) +{ + char ctl_file_path[MAXPGPATH]; + int fd; + pg_crc32c crc; + + polar_make_file_path_level3(ctl_file_path, ctl->dir, FRA_CTL_FILE_NAME); + + /* The control file may be non-exist */ + if (!polar_file_exists(ctl_file_path)) + { + elog(WARNING, "Can't find %s", ctl_file_path); + return false; + } + + fd = polar_open_transient_file(ctl_file_path, O_RDONLY | PG_BINARY); + + if (fd < 0) + /*no cover line*/ + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", ctl_file_path))); + + pgstat_report_wait_start(WAIT_EVENT_FRA_CTL_FILE_READ); + + if (polar_read(fd, ctl_file_data, sizeof(fra_ctl_file_data_t)) != sizeof(fra_ctl_file_data_t)) + /*no cover line*/ + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not read from file \"%s\": %m", 
ctl_file_path))); + + pgstat_report_wait_end(); + + if (CloseTransientFile(fd)) + /*no cover line*/ + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", ctl_file_path))); + + /* Verify CRC */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, (char *) ctl_file_data, offsetof(fra_ctl_file_data_t, crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(crc, ctl_file_data->crc)) + /*no cover line*/ + ereport(FATAL, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("calculated CRC checksum does not match value stored in file \"%s\"", + ctl_file_path))); + + /* Check the version */ + if (ctl_file_data->version_no != FRA_CTL_FILE_VERSION) + /*no cover line*/ + ereport(FATAL, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("Expected version %d does not match value stored in file %d", + FRA_CTL_FILE_VERSION, ctl_file_data->version_no))); + + return true; +} + +/* + * POLAR: write fast recovery area control file by checkpoint. + * Caller must hold the file lock. + */ +static void +write_fra_ctl_file(fra_ctl_t ctl) +{ + fra_ctl_file_data_t ctl_file; + char ctl_file_path[MAXPGPATH]; + + /* Update the data */ + ctl_file.version_no = FRA_CTL_FILE_VERSION; + ctl_file.next_fbpoint_rec_no = ctl->next_fbpoint_rec_no; + ctl_file.min_keep_lsn = ctl->min_keep_lsn; + ctl_file.min_clog_seg_no = ctl->clog_ctl->min_clog_seg_no; + ctl_file.next_clog_subdir_no = pg_atomic_read_u32(&(ctl->clog_ctl->next_clog_subdir_no)); + ctl_file.snapshot_end_pos.seg_no = ctl->snapshot_end_pos.seg_no; + ctl_file.snapshot_end_pos.offset = ctl->snapshot_end_pos.offset; + + /* Compute the CRC */ + INIT_CRC32C(ctl_file.crc); + COMP_CRC32C(ctl_file.crc, &ctl_file, + offsetof(fra_ctl_file_data_t, crc)); + FIN_CRC32C(ctl_file.crc); + + /* Write the control file */ + polar_make_file_path_level3(ctl_file_path, ctl->dir, FRA_CTL_FILE_NAME); + polar_write_ctl_file_atomic(ctl_file_path, &ctl_file, sizeof(fra_ctl_file_data_t), + WAIT_EVENT_FRA_CTL_FILE_WRITE, WAIT_EVENT_FRA_CTL_FILE_SYNC); + + elog(DEBUG2, "The 
fast recovery control info now is: " + "the next flashback point record number: %X/%X, " + "the min flashback clog segment number in the first sub directory: %04X" + "the next flashback log sub directory number: %08X", + (uint32)(ctl_file.next_fbpoint_rec_no >> 32), (uint32)(ctl_file.next_fbpoint_rec_no), + ctl_file.min_clog_seg_no, ctl_file.next_clog_subdir_no); +} + +static void +validate_fra_subdir(const char *fra_dir, const char *sub_dir) +{ + char path[MAXPGPATH]; + + FRA_GET_SUBDIR_PATH(fra_dir, sub_dir, path); + polar_validate_dir(path); +} + +inline bool +polar_enable_fra(fra_ctl_t ctl) +{ + return ctl && !polar_in_replica_mode() && !polar_in_standby_mode(); +} + +Size +polar_fra_shmem_size(void) +{ + Size size = 0; + + Assert(polar_enable_flashback_log && polar_enable_fast_recovery_area); + /* Fast recovery area control data */ + size = sizeof(fra_ctl_data_t); + /* Add flashback point shared memory size */ + size += polar_flashback_point_shmem_size(); + /* Add flashback clog shared memory size */ + size += FLASHBACK_CLOG_SHMEM_SIZE; + return size; +} + +void +fra_shmem_init_data(fra_ctl_t ctl, const char *name) +{ + char fra_ctl_file_lock[FL_OBJ_MAX_NAME_LEN]; + + FLOG_GET_OBJ_NAME(fra_ctl_file_lock, name, FRA_CTL_FILE_LOCK_NAME_SUFFIX); + + MemSet(ctl, 0, sizeof(fra_ctl_data_t)); + ctl->clog_ctl = polar_flashback_clog_shmem_init(name); + ctl->point_ctl = polar_flashback_point_shmem_init(name); + StrNCpy(ctl->dir, name, FL_INS_MAX_NAME_LEN); + LWLockRegisterTranche(LWTRANCHE_POLAR_FRA_CTL_FILE, fra_ctl_file_lock); + LWLockInitialize(&ctl->ctl_file_lock, LWTRANCHE_POLAR_FRA_CTL_FILE); +} + +fra_ctl_t +fra_shmem_init_internal(const char *name) +{ + fra_ctl_t ctl; + bool found; + char ctl_name[FL_OBJ_MAX_NAME_LEN]; + + FLOG_GET_OBJ_NAME(ctl_name, name, FRA_CTL_NAME_SUFFIX); + + ctl = (fra_ctl_t) ShmemInitStruct(ctl_name, sizeof(fra_ctl_data_t), &found); + + if (!IsUnderPostmaster) + { + Assert(!found); + fra_shmem_init_data(ctl, name); + } + else + 
Assert(found); + + return ctl; +} + +void +polar_startup_fra(fra_ctl_t ctl) +{ + char path[MAXPGPATH]; + fra_ctl_file_data_t ctl_file_data; + int min_clog_seg_no = FLASHBACK_CLOG_INVALID_SEG; + uint32 next_clog_subdir_no = 0; + + /* validate directory */ + polar_make_file_path_level2(path, ctl->dir); + polar_validate_dir(path); + + /* validate sub directory */ + validate_fra_subdir(ctl->dir, FLASHBACK_CLOG_DIR); + validate_fra_subdir(ctl->dir, FBPOINT_DIR); + + /* Read control file to init */ + if (read_fra_ctl_file(ctl, &ctl_file_data)) + { + min_clog_seg_no = ctl_file_data.min_clog_seg_no; + next_clog_subdir_no = ctl_file_data.next_clog_subdir_no; + ctl->next_fbpoint_rec_no = ctl_file_data.next_fbpoint_rec_no; + ctl->min_keep_lsn = ctl_file_data.min_keep_lsn; + ctl->snapshot_end_pos.seg_no = ctl_file_data.snapshot_end_pos.seg_no; + ctl->snapshot_end_pos.offset = ctl_file_data.snapshot_end_pos.offset; + } + + if (XLogRecPtrIsInvalid(ctl->min_keep_lsn)) + ctl->min_keep_lsn = GetRedoRecPtr(); + + if (FBPOINT_POS_IS_INVALID(ctl->snapshot_end_pos)) + ctl->snapshot_end_pos.offset = FBPOINT_SEG_SIZE; + + polar_startup_flashback_clog(ctl->clog_ctl, min_clog_seg_no, next_clog_subdir_no); + polar_startup_flashback_point(ctl->point_ctl, ctl->dir, ctl->next_fbpoint_rec_no); +} + +bool +polar_slru_seg_need_mv(fra_ctl_t fra_ctl, SlruCtl slru_ctl) +{ + if (strcmp(slru_ctl->Dir, FLASHBACK_CLOG_DIR) == 0 && polar_enable_fra(fra_ctl)) + return true; + else + return false; +} + +/* + * POLAR: Move SLRU segment to fra, now just clog.
+ */ +void +polar_mv_slru_seg_to_fra(fra_ctl_t ctl, const char *fname, const char *old_path) +{ + bool need_update_ctl = false; + + polar_mv_clog_seg_to_fra(ctl->clog_ctl, ctl->dir, fname, old_path, &need_update_ctl); + + /* Update the control file when it is necessary */ + if (need_update_ctl) + { + LWLockAcquire(&ctl->ctl_file_lock, LW_EXCLUSIVE); + write_fra_ctl_file(ctl); + LWLockRelease(&ctl->ctl_file_lock); + } +} + +/* + * POLAR: Remove the old fast recovery data + * + * NB: The fbpoint_seg_no is the kept segment number + 1. + * When fbpoint_seg_no is zero, there is no flashback point segment to be removed. + * + */ +static void +polar_remove_fra_data(fra_ctl_t ctl, fbpoint_rec_data_t *fbpoint_rec, uint32 fbpoint_seg_no) +{ + if (fbpoint_seg_no > 0) + { + /* The snapshot data and flashback point record may be not in the same segment, we keep the minimal one */ + fbpoint_seg_no = Min(fbpoint_rec->snapshot_pos.seg_no, fbpoint_seg_no); + polar_truncate_fbpoint_files(ctl->dir, fbpoint_seg_no); + } + + /* Truncate the flashback clog subdir */ + polar_truncate_flashback_clog_subdir(ctl->dir, fbpoint_rec->next_clog_subdir_no); +} + +/* + * Fast recovery area do flashback point. + * + * NB: Now the fast recovery area is disabled for standby.
+ */ +void +polar_fra_do_fbpoint(fra_ctl_t ctl, fbpoint_wal_info_data_t *wal_info, + polar_flog_rec_ptr *keep_ptr, flashback_snapshot_header_t snapshot) +{ + fbpoint_rec_data_t rec_data; + uint32 fbpoint_seg_no = 0; + fbpoint_pos_t snapshot_pos; + fbpoint_pos_t snapshot_end_pos; + XLogRecPtr min_keep_lsn; + + if (!polar_enable_fra(ctl)) + return; + + snapshot_end_pos = ctl->snapshot_end_pos; + Assert(snapshot); + /* Backup the snapshot to fast recovery area */ + snapshot_pos = polar_backup_snapshot_to_fra(snapshot, &snapshot_end_pos, ctl->dir); + + /* The size of fbpoint_rec_data_t can not be larger than FBPOINT_REC_SIZE */ + StaticAssertStmt(sizeof(fbpoint_rec_data_t) < FBPOINT_REC_SIZE, + "fbpoint_rec_data_t is larger than 64"); + /* Flush the flashback point record */ + rec_data.flog_ptr = *keep_ptr; + rec_data.redo_lsn = wal_info->fbpoint_lsn; + rec_data.time = wal_info->fbpoint_time; + rec_data.next_clog_subdir_no = FLSHBAK_GET_SNAPSHOT_DATA(snapshot)->next_clog_subdir_no; + rec_data.snapshot_pos = snapshot_pos; + polar_flush_fbpoint_rec(ctl->point_ctl, ctl->dir, &rec_data); + + /* Get the keep flashback point record */ + fbpoint_seg_no = polar_get_keep_fbpoint(ctl->point_ctl, ctl->dir, &rec_data, keep_ptr, &min_keep_lsn); + + /* Write the control file */ + LWLockAcquire(&ctl->ctl_file_lock, LW_EXCLUSIVE); + /* Update the var about flashback point */ + ctl->next_fbpoint_rec_no = ctl->point_ctl->next_fbpoint_rec_no; + ctl->snapshot_end_pos = snapshot_end_pos; + ctl->min_keep_lsn = min_keep_lsn; + write_fra_ctl_file(ctl); + LWLockRelease(&ctl->ctl_file_lock); + + /* Remove the old data */ + polar_remove_fra_data(ctl, &rec_data, fbpoint_seg_no); +} diff --git a/src/backend/polar_flashback/polar_flashback.c b/src/backend/polar_flashback/polar_flashback.c new file mode 100644 index 00000000000..271cb1f3c1b --- /dev/null +++ b/src/backend/polar_flashback/polar_flashback.c @@ -0,0 +1,229 @@ +/*------------------------------------------------------------------------- 
+ * + * polar_flashback.c + * + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2021, Alibaba Group Holding limited + * + * IDENTIFICATION + * src/backend/polar_flashback/polar_flashback.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "polar_flashback/polar_flashback.h" +#include "polar_flashback/polar_flashback_log.h" + +/* + * POLAR: Write a small (less than 512B) control info file atomically. + * + * When the control file doesn't exist, creating and writing it is not atomic. + * So we create a tmp one, write it and rename it to make the operation atomic. + * However, sometimes the tmp one will be left behind; we think it is harmless + * because it is unlinked and recreated the next time a tmp file is needed. + * + * NB: We think the control file is very important, so error report with PANIC. + */ +void +polar_write_ctl_file_atomic(const char *path, void *data, size_t size, + uint32 write_event_info, uint32 fsync_event_info) +{ +#define TMP_FILE_SUFFIX "tmp" + int fd; + int file_flags = O_RDWR | PG_BINARY; + bool need_rename = false; + char tmp_path[MAXPGPATH]; + + fd = BasicOpenFile(path, file_flags, true); + + /* Create a tmp file */ + if (fd < 0) + { + if (errno == ENOENT) + { + /* The ctl file doesn't exist, so create a tmp one and write */ + need_rename = true; + file_flags |= (O_CREAT | O_EXCL); + /* Write to a tmp file so that a crash cannot leave an empty control file */ + snprintf(tmp_path, MAXPGPATH, "%s%s", path, TMP_FILE_SUFFIX); + polar_unlink(tmp_path); + + fd = BasicOpenFile(tmp_path, file_flags, true); + + if (fd < 0) + { + /*no cover begin*/ + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", tmp_path))); + /*no cover end*/ + } + } + else + /*no cover line*/ + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", path))); + } + + /* Write it */ +
pgstat_report_wait_start(write_event_info); + + /* Log error */ + if (polar_write(fd, data, size) != size) + { + /*no cover begin*/ + int save_errno = errno; + + /* + * If we fail to write the file, delete it to release disk + * space. + */ + if (need_rename) + polar_unlink(tmp_path); + + /* if write didn't set errno, assume problem is no disk space */ + errno = save_errno ? save_errno : ENOSPC; + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not write to file \"%s\": %m", (need_rename ? tmp_path : path)))); + /*no cover end*/ + } + + pgstat_report_wait_end(); + + /* Fsync it */ + pgstat_report_wait_start(fsync_event_info); + + if (polar_fsync(fd) != 0) + { + /*no cover line*/ + pgstat_report_wait_end(); + ereport(PANIC, + (errcode_for_file_access(), + (errmsg("could not sync file \"%s\": %m", (need_rename ? tmp_path : path))))); + } + + pgstat_report_wait_end(); + + if (polar_close(fd)) + /*no cover line*/ + ereport(PANIC, + (errcode_for_file_access(), + errmsg("could not close file \"%s\": %m", (need_rename ? tmp_path : path)))); + + /* Rename the tmp one to real one */ + if (unlikely(need_rename)) + polar_durable_rename(tmp_path, path, PANIC); +} + +/* + * POLAR: Get all flashback relative memory. 
+ */ +Size +polar_flashback_shmem_size(void) +{ + Size size = 0; + + /* Check the guc */ + if (polar_enable_flashback_log) + { + /*no cover begin*/ + if (polar_flashback_logindex_mem_size == 0) + elog(FATAL, "Cannot enable flashback log when \"polar_flashback_logindex_mem_size\" is zero."); + + if (polar_logindex_mem_size == 0) + elog(FATAL, "Cannot enable flashback log when \"polar_logindex_mem_size\" is zero."); + + /*no cover end*/ + + size = POLAR_FLOG_SHMEM_SIZE(); + + if (polar_enable_fast_recovery_area) + return add_size(size, polar_fra_shmem_size()); + } + + return size; +} + +/* + * POLAR: Initialization of shared memory for flashback log internal function + */ +void +polar_flashback_shmem_init(void) +{ + if (polar_is_flog_mem_enabled()) + { + POLAR_FLOG_SHMEM_INIT(); + + if (polar_enable_fast_recovery_area) + FRA_SHMEM_INIT(); + } +} + +/* + * POLAR: startup all about flashback. + */ +void +polar_startup_flashback(CheckPoint *checkpoint) +{ + /* Just start up in non-replica node */ + if (!polar_in_replica_mode()) + { + if (flog_instance == NULL) + { + polar_remove_all_flog_data(flog_instance); + Assert(fra_instance == NULL); + FRA_REMOVE_ALL_DATA(); + return; + } + else + polar_startup_flog(checkpoint, flog_instance); + + if (fra_instance == NULL) + FRA_REMOVE_ALL_DATA(); + else + polar_startup_fra(fra_instance); + } +} + +/* + * POLAR: Do something after the checkpoint is done. + * + * 1. Flush the fast recovery data and remove the old data. + * 2. Flush the flashback log data and remove the old data. 
+ */ +void +polar_do_flashback_point(polar_flog_rec_ptr ckp_start, flashback_snapshot_header_t snapshot, bool shutdown) +{ + polar_flog_rec_ptr keep_ptr = ckp_start; + flog_buf_ctl_t buf_ctl = flog_instance->buf_ctl; + + polar_fra_do_fbpoint(fra_instance, &(buf_ctl->wal_info), &keep_ptr, snapshot); + polar_flog_do_fbpoint(flog_instance, ckp_start, keep_ptr, shutdown); +} + +XLogRecPtr +polar_get_flashback_keep_wal(XLogRecPtr keep) +{ + XLogRecPtr flashback_keep; + + if (!polar_is_flog_enabled(flog_instance)) + return keep; + + flashback_keep = flog_instance->buf_ctl->redo_lsn; + + if (polar_enable_fra(fra_instance)) + flashback_keep = Min(flashback_keep, fra_instance->min_keep_lsn); + + /* Get the minimal of keep and flashback keep */ + if (XLogRecPtrIsInvalid(flashback_keep)) + return keep; + else if (XLogRecPtrIsInvalid(keep)) + return flashback_keep; + else + return Min(keep, flashback_keep); +} diff --git a/src/backend/polar_flashback/polar_flashback_clog.c b/src/backend/polar_flashback/polar_flashback_clog.c new file mode 100644 index 00000000000..a9dac87264f --- /dev/null +++ b/src/backend/polar_flashback/polar_flashback_clog.c @@ -0,0 +1,275 @@ +/*------------------------------------------------------------------------- + * + * polar_flashback_clog.c + * + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2021, Alibaba Group Holding limited + * + * Move these clog files to flashback log dir instead of truncate. + * These clog files will be segmented into some sub directory named by + * create time. 
+ * + * IDENTIFICATION + * src/backend/polar_flashback/polar_flashback_clog.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/clog.h" +#include "miscadmin.h" +#include "polar_flashback/polar_fast_recovery_area.h" +#include "polar_flashback/polar_flashback_clog.h" +#include "polar_flashback/polar_flashback_log_internal.h" +#include "storage/fd.h" +#include "storage/polar_fd.h" +#include "storage/shmem.h" + +#define GET_CLOG_SUBDIR_FULL_PATH(dir_path, subdir_no, path) \ + snprintf(path, MAXPGPATH, "%s/%08X", dir_path, subdir_no) + +static void +get_clog_file_full_path(const char *clog_dir_path, uint32 clog_subdir_no, const char *fname, char *path) +{ + char clog_subdir_path[MAXPGPATH]; + + GET_CLOG_SUBDIR_FULL_PATH(clog_dir_path, clog_subdir_no, clog_subdir_path); + polar_make_file_path_level3(path, clog_subdir_path, fname); +} + +/* POLAR: Validate the flashback clog subdir */ +static void +validate_flashback_clog_subdir(const char *clog_dir_path, uint32 clog_subdir_no) +{ + char path[MAXPGPATH]; + + GET_CLOG_SUBDIR_FULL_PATH(clog_dir_path, clog_subdir_no, path); + polar_validate_dir(path); +} + +static bool +clog_file_in_subdir(const char *clog_subdir_path, const char *fname) +{ + struct stat st; + char path[MAXPGPATH]; + + snprintf(path, MAXPGPATH, "%s/%s", clog_subdir_path, fname); + + if ((polar_stat(path, &st) == 0) && S_ISREG(st.st_mode)) + return true; + else + return false; +} + +static bool +is_clog_subdir_right(flashback_clog_ctl_t clog_ctl, const char *fname, uint32 clog_subdir_no, + const char *clog_dir_path) +{ + int seg_no; + char clog_subdir_path[MAXPGPATH]; + + seg_no = (int) strtol(fname, NULL, 16); + GET_CLOG_SUBDIR_FULL_PATH(clog_dir_path, clog_subdir_no, clog_subdir_path); + + /* The segment file is in subdir already, so we need to create a new dir */ + if (clog_file_in_subdir(clog_subdir_path, fname)) + return false; + else if (clog_subdir_no == 
FLASHBACK_CLOG_MIN_SUBDIR && seg_no < clog_ctl->min_clog_seg_no) + return false; + + return true; +} + +static uint32 +get_right_clog_subdir_by_fname(flashback_clog_ctl_t clog_ctl, const char *clog_dir_path, + const char *fname, bool *need_update_ctl) +{ + bool need_new_subdir = false; + uint32 next_clog_subdir_no; + uint32 clog_subdir_no = FLASHBACK_CLOG_MIN_SUBDIR; + + next_clog_subdir_no = pg_atomic_read_u32(&clog_ctl->next_clog_subdir_no); + + /* There are no clog subdir. */ + if (next_clog_subdir_no == FLASHBACK_CLOG_MIN_NEXT_SUBDIR) + need_new_subdir = !is_clog_subdir_right(clog_ctl, fname, clog_subdir_no, clog_dir_path); + else if (FLASHBACK_CLOG_IS_EMPTY(next_clog_subdir_no)) + need_new_subdir = true; + else + { + Assert(!FLASHBACK_CLOG_IS_EMPTY(next_clog_subdir_no)); + clog_subdir_no = next_clog_subdir_no - 2; + + /* First check the previous sub directory, then check the latest sub directory */ + if (!is_clog_subdir_right(clog_ctl, fname, clog_subdir_no, clog_dir_path)) + { + clog_subdir_no++; + need_new_subdir = !is_clog_subdir_right(clog_ctl, fname, clog_subdir_no, clog_dir_path); + } + } + + if (need_new_subdir) + { + if (!FLASHBACK_CLOG_IS_EMPTY(next_clog_subdir_no)) + clog_subdir_no++; + + validate_flashback_clog_subdir(clog_dir_path, clog_subdir_no); + + if (pg_atomic_compare_exchange_u32(&clog_ctl->next_clog_subdir_no, &next_clog_subdir_no, + next_clog_subdir_no + 1)) + *need_update_ctl = true; + } + + return clog_subdir_no; +} + +void +polar_flashback_clog_shmem_init_data(flashback_clog_ctl_t ctl) +{ + MemSet(ctl, 0, sizeof(flashback_clog_ctl_data_t)); + pg_atomic_init_u32(&ctl->next_clog_subdir_no, FLASHBACK_CLOG_MIN_SUBDIR); + ctl->min_clog_seg_no = FLASHBACK_CLOG_INVALID_SEG; +} + +flashback_clog_ctl_t +polar_flashback_clog_shmem_init(const char *name) +{ + flashback_clog_ctl_t clog_ctl; + bool found; + char ctl_name[FL_OBJ_MAX_NAME_LEN]; + + FLOG_GET_OBJ_NAME(ctl_name, name, FLASHBACK_CLOG_CTL_NAME_SUFFIX); + + clog_ctl = (flashback_clog_ctl_t)
ShmemInitStruct(ctl_name, FLASHBACK_CLOG_SHMEM_SIZE, &found); + + if (!IsUnderPostmaster) + { + Assert(!found); + polar_flashback_clog_shmem_init_data(clog_ctl); + } + else + Assert(found); + + return clog_ctl; +} + +void +polar_startup_flashback_clog(flashback_clog_ctl_t clog_ctl, int min_clog_seg_no, + uint32 next_clog_subdir_no) +{ + /* Update the minimal clog segment no after polar_enable_fast_recovery_area is on. */ + if (min_clog_seg_no == FLASHBACK_CLOG_INVALID_SEG) + clog_ctl->min_clog_seg_no = polar_get_clog_min_seg_no(); + else + clog_ctl->min_clog_seg_no = min_clog_seg_no; + + pg_atomic_write_u32(&clog_ctl->next_clog_subdir_no, next_clog_subdir_no); +} + +void +polar_mv_clog_seg_to_fra(flashback_clog_ctl_t clog_ctl, const char *fra_dir, + const char *fname, const char *old_path, bool *need_update_ctl) +{ + char new_path[MAXPGPATH]; + char clog_dir_path[MAXPGPATH]; + uint32 clog_subdir_no; + + /* Find right clog sub directory */ + FRA_GET_SUBDIR_PATH(fra_dir, FLASHBACK_CLOG_DIR, clog_dir_path); + clog_subdir_no = get_right_clog_subdir_by_fname(clog_ctl, clog_dir_path, fname, need_update_ctl); + /* Move the clog file to fast recovery area */ + get_clog_file_full_path(clog_dir_path, clog_subdir_no, fname, new_path); + polar_durable_rename(old_path, new_path, ERROR); + + elog(LOG, "The clog file %s is renamed to %s", old_path, new_path); +} + +void +polar_truncate_flashback_clog_subdir(const char *fra_dir, uint32 next_clog_subdir_no) +{ + DIR *cldir; + struct dirent *clde; + char clog_dir_path[MAXPGPATH]; + + /* Empty or just one sub directory, return */ + if (next_clog_subdir_no < FLASHBACK_CLOG_MIN_NEXT_SUBDIR) + return; + + next_clog_subdir_no--; + FRA_GET_SUBDIR_PATH(fra_dir, FLASHBACK_CLOG_DIR, clog_dir_path); + cldir = polar_allocate_dir(clog_dir_path); + + while ((clde = ReadDir(cldir, clog_dir_path)) != NULL) + { + size_t len; + uint32 subdir_no; + + len = strlen(clde->d_name); + + if ((len == FLASHBACK_CLOG_SUBDIR_NAME_LEN) && + strspn(clde->d_name, 
"0123456789ABCDEF") == len) + { + subdir_no = (uint32) strtoul(clde->d_name, NULL, 16); + + if (subdir_no < next_clog_subdir_no) + { + char clog_subdir_path[MAXPGPATH]; + + GET_CLOG_SUBDIR_FULL_PATH(clog_dir_path, subdir_no, clog_subdir_path); + rmtree(clog_subdir_path, true); + } + } + } + + FreeDir(cldir); +} + +/* + * POLAR: Get the status of the xid. + * + * If it is in fast recovery area, find the right clog sub directory and read the page from disk. + * Otherwise, get the status form slru shared buffer. + */ +XidStatus +polar_flashback_get_xid_status(TransactionId xid, TransactionId max_xid, uint32 next_clog_subdir_no, + const char *fra_dir) +{ + bool must_found = false; + XLogRecPtr lsn; + + if (!FLASHBACK_CLOG_IS_EMPTY(next_clog_subdir_no)) + { + char clog_dir_path[MAXPGPATH]; + char clog_subdir_path[MAXPGPATH]; + + /* Get the real clog subdir no */ + next_clog_subdir_no--; + + /* The transaction in the last clog sub directory */ + if (xid >= max_xid) + { + if (next_clog_subdir_no == FLASHBACK_CLOG_MIN_SUBDIR) + elog(ERROR, "There is no clog subdir but xid %u is larger than or equal to max_xid %u", xid, max_xid); + + /* Must in the last clog sub directory */ + next_clog_subdir_no--; + must_found = true; + } + + /* Find clog sub directory */ + FRA_GET_SUBDIR_PATH(fra_dir, FLASHBACK_CLOG_DIR, clog_dir_path); + GET_CLOG_SUBDIR_FULL_PATH(clog_dir_path, next_clog_subdir_no, clog_subdir_path); + + if (polar_xid_in_clog_dir(xid, clog_subdir_path)) + return polar_get_xid_status(xid, clog_subdir_path); + /*no cover begin*/ + else if (unlikely(must_found)) + elog(ERROR, "We can't find the xid %u in %s", xid, clog_subdir_path); + /*no cover end*/ + } + + /* It is not in fast recovery area */ + return TransactionIdGetStatus(xid, &lsn); +} diff --git a/src/backend/polar_flashback/polar_flashback_log.c b/src/backend/polar_flashback/polar_flashback_log.c index ecc33775ffc..350a0d2aa69 100644 --- a/src/backend/polar_flashback/polar_flashback_log.c +++ 
b/src/backend/polar_flashback/polar_flashback_log.c @@ -16,20 +16,28 @@ #include "access/xlog.h" #include "access/xlogdefs.h" +#include "common/pg_lzcompress.h" #include "miscadmin.h" #include "polar_flashback/polar_flashback_log.h" #include "polar_flashback/polar_flashback_log_file.h" #include "polar_flashback/polar_flashback_log_index.h" -#include "polar_flashback/polar_flashback_log_insert.h" #include "polar_flashback/polar_flashback_log_worker.h" #include "polar_flashback/polar_flashback_point.h" +#include "polar_flashback/polar_flashback_rel_filenode.h" +#include "postmaster/startup.h" #include "storage/buf_internals.h" #include "storage/bufpage.h" #include "storage/bufmgr.h" +#include "storage/checksum.h" #include "utils/guc.h" +/* Buffer size required to store a compressed version of origin page */ +#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) + /* GUCs */ bool polar_enable_flashback_log; +bool polar_has_partial_write; +bool polar_flashback_log_debug; /* For the logindex */ int polar_flashback_logindex_mem_size; int polar_flashback_logindex_bloom_blocks; @@ -40,18 +48,12 @@ int polar_flashback_log_insert_locks; flog_ctl_t flog_instance = NULL; -static bool -flog_data_need_remove_all(void) -{ - return !polar_enable_flashback_log && !polar_in_replica_mode(); -} - /* * POLAR: Remove all the flashback relative files contain log files and * logindex files. And keep the flashback logindex dir. */ -static void -flog_data_remove_all(flog_ctl_t instance) +void +polar_remove_all_flog_data(flog_ctl_t instance) { logindex_snapshot_t snapshot = NULL; flog_buf_ctl_t buf_ctl = NULL; @@ -66,7 +68,7 @@ flog_data_remove_all(flog_ctl_t instance) polar_flog_remove_all(buf_ctl); } -static Size +static inline Size flog_ctl_size(void) { return MAXALIGN(sizeof(flog_ctl_data_t)); @@ -77,18 +79,17 @@ flog_ctl_size(void) * * NB: Only startup process will call the function, so it is lock free. 
*/ -static void -polar_set_flog_state(flog_ctl_t instance, flashback_state state) +static inline void +polar_set_flog_state(flog_ctl_t instance, uint32 state) { - instance->state = state; - pg_write_barrier(); + pg_atomic_write_u32(&instance->state, state); } -void +inline void polar_flog_ctl_init_data(flog_ctl_t ctl) { MemSet(ctl, 0, sizeof(flog_ctl_data_t)); - ctl->state = FLOG_INIT; + pg_atomic_init_u32(&ctl->state, FLOG_INIT); } static flog_ctl_t @@ -165,7 +166,7 @@ polar_flush_flog_data(flog_buf_ctl_t ctl, polar_flog_rec_ptr ckp_start, bool shu * while the buffer isn't always flushed in shutdown restartpoint of standby. */ polar_flog_flush(ctl, ckp_end); - polar_set_flog_buf_state(ctl, FLOG_BUF_SHUTDOWNED); + ctl->buf_state = FLOG_BUF_SHUTDOWNED; } else ckp_end = polar_get_flog_write_result(ctl); @@ -199,7 +200,7 @@ clean_buf_flog_state(BufferDesc *buf_hdr, uint32 flog_state) polar_unlock_redo_state(buf_hdr, state); } -bool +inline bool polar_is_flog_mem_enabled(void) { return polar_enable_flashback_log && !polar_is_datamax() && !IsBootstrapProcessingMode(); @@ -208,17 +209,16 @@ polar_is_flog_mem_enabled(void) /* * POLAR: Is the flashback log enabled? */ -bool +inline bool polar_is_flog_enabled(flog_ctl_t instance) { return instance && !polar_in_replica_mode(); } -bool +inline bool polar_is_flog_ready(flog_ctl_t instance) { - pg_read_barrier(); - return instance->state == FLOG_READY; + return pg_atomic_read_u32(&instance->state) == FLOG_READY; } /* @@ -229,11 +229,10 @@ polar_is_flog_ready(flog_ctl_t instance) * 1. read the control file and fill the flashback log buf_ctl info. * 2. call polar_logindex_snapshot_init to do logindex snapshot init. 
*/ -bool +inline bool polar_has_flog_startup(flog_ctl_t instance) { - pg_read_barrier(); - return instance->state > FLOG_INIT; + return pg_atomic_read_u32(&instance->state) > FLOG_INIT; } /* @@ -253,33 +252,6 @@ polar_flog_shmem_size_internal(int insert_locks_num, int log_buffers, int logind return size; } -/* - * POLAR: Get the flashback log relative memory. - */ -Size -polar_flog_shmem_size(void) -{ - Size size = 0; - - if (polar_enable_flashback_log) - { - /*no cover begin*/ - if (polar_flashback_logindex_mem_size == 0) - elog(FATAL, "Cannot enable flashback log when \"polar_flashback_logindex_mem_size\" is zero."); - - if (polar_logindex_mem_size == 0) - elog(FATAL, "Cannot enable flashback log when \"polar_logindex_mem_size\" is zero."); - - /*no cover end*/ - } - else - return size; - - return polar_flog_shmem_size_internal(polar_flashback_log_insert_locks, - polar_flashback_log_buffers, polar_flashback_logindex_mem_size, - polar_flashback_logindex_bloom_blocks, polar_flashback_logindex_queue_buffers); -} - /* * POLAR: Initialization of shared memory for flashback log internal function */ @@ -298,16 +270,6 @@ polar_flog_shmem_init_internal(const char *name, int insert_locks_num, return ctl; } -void -polar_flog_shmem_init(void) -{ - if (polar_is_flog_mem_enabled()) - flog_instance = polar_flog_shmem_init_internal(POLAR_FL_DEFAULT_DIR, - polar_flashback_log_insert_locks, polar_flashback_log_buffers, - polar_flashback_logindex_mem_size, polar_flashback_logindex_bloom_blocks, - polar_flashback_logindex_queue_buffers); -} - /* * POLAR: Do something after the flashback point is done. * @@ -315,12 +277,13 @@ polar_flog_shmem_init(void) * 2. remove the old flashback log. 
*/ void -polar_flog_do_fbpoint(flog_ctl_t instance, polar_flog_rec_ptr ckp_start, bool shutdown) +polar_flog_do_fbpoint(flog_ctl_t instance, polar_flog_rec_ptr ckp_start, + polar_flog_rec_ptr keep_ptr, bool shutdown) { flog_buf_ctl_t buf_ctl = instance->buf_ctl; polar_flush_flog_data(buf_ctl, ckp_start, shutdown); - polar_remove_flog_data(instance, ckp_start); + polar_remove_flog_data(instance, keep_ptr); } /* @@ -336,7 +299,7 @@ polar_is_buf_flog_enabled(flog_ctl_t instance, Buffer buf) buf_hdr = GetBufferDescriptor(buf - 1); return polar_is_flog_enabled(flog_instance) && - !polar_check_buf_flog_state(buf_hdr, POLAR_BUF_FLOG_DISABLE); + !POLAR_CHECK_BUF_FLOG_STATE(buf_hdr, POLAR_BUF_FLOG_DISABLE); } /* @@ -356,7 +319,7 @@ polar_is_flog_needed(flog_ctl_t flog_ins, polar_logindex_redo_ctl_t redo_ins, { Assert(flog_ins); - if (!is_need_flog(forkno) || !is_permanent) + if (!POLAR_IS_NEED_FLOG(forkno) || !is_permanent) return false; if (redo_lsn != InvalidXLogRecPtr) @@ -421,30 +384,30 @@ polar_check_fpi_origin_page(RelFileNode rnode, ForkNumber forkno, BlockNumber bl if (!PageIsNew(page_tmp.data) && !polar_page_is_just_inited(page_tmp.data)) /*no cover line*/ - elog(PANIC, "The page [%u, %u, %u], %d, %u has a full page image wal record, " - "but its origin page is not a empty page", rnode.spcNode, rnode.dbNode, rnode.relNode, - forkno, block); + elog(PANIC, "The page " POLAR_LOG_BUFFER_TAG_FORMAT " has a full page image wal record, " + "but its origin page is not a empty page", rnode.spcNode, rnode.dbNode, + rnode.relNode, forkno, block); } } /* * POLAR: Flush the flashback log record of the buffer. * - * is_invalidate: Is the buffer invalidate? + * invalidate: Invalidate the buffer? * * When we drop the relation or database, the buffer is invalidate and its flashback log is * unnecessary. 
*/ void -polar_flush_buf_flog_rec(BufferDesc *buf_hdr, flog_ctl_t instance, bool is_invalidate) +polar_flush_buf_flog_rec(BufferDesc *buf_hdr, flog_ctl_t instance, bool invalidate) { if (!polar_is_flog_enabled(instance)) return; - polar_insert_buf_flog_rec_sync(instance->list_ctl, instance->buf_ctl, - instance->queue_ctl, buf_hdr, is_invalidate); + if (IS_BUF_IN_FLOG_LIST(buf_hdr)) + polar_process_buf_flog_list(instance, buf_hdr, false, invalidate); - polar_flush_buf_flog(instance->list_ctl, instance->buf_ctl, buf_hdr, is_invalidate); + polar_flush_buf_flog(instance->list_ctl, instance->buf_ctl, buf_hdr, invalidate); } /* @@ -453,51 +416,36 @@ polar_flush_buf_flog_rec(BufferDesc *buf_hdr, flog_ctl_t instance, bool is_inval * state: the database state. * instance: the flashback log instance. * - * NB: It just fill some control info. The flashback log - * buffer recovery and logindex recovery will be - * done by the flashback log background writer. + * NB: It will switch to a new file when the flashback log is in crash recovery. 
*/ void polar_startup_flog(CheckPoint *checkpoint, flog_ctl_t instance) { - if (!IsUnderPostmaster) - return; - - /* Remove all when flashback log is unenable */ - if (flog_data_need_remove_all()) - { - flog_data_remove_all(instance); - return; - } - /* Validate directory and startup the flashback log and flashback logindex in the rw and standby */ - if (polar_is_flog_enabled(instance)) - { - logindex_snapshot_t snapshot = instance->logindex_snapshot; - flog_buf_ctl_t buf_ctl = instance->buf_ctl; + logindex_snapshot_t snapshot = instance->logindex_snapshot; + flog_buf_ctl_t buf_ctl = instance->buf_ctl; - Assert(instance); - polar_validate_flog_dir(buf_ctl); - polar_validate_flog_index_dir(snapshot); + Assert(polar_is_flog_enabled(instance)); + polar_validate_flog_dir(buf_ctl); + polar_validate_flog_index_dir(snapshot); - /* Remove all the temporary flashback log files */ - polar_remove_tmp_flog_file(buf_ctl->dir); + /* Remove all the temporary flashback log files */ + polar_remove_tmp_flog_file(buf_ctl->dir); - polar_startup_flog_buf(buf_ctl, checkpoint); - polar_startup_flog_index(snapshot, - VALID_FLOG_PTR(polar_get_fbpoint_start_ptr(buf_ctl))); + polar_startup_flog_buf(buf_ctl, checkpoint); + polar_startup_flog_index(snapshot, + VALID_FLOG_PTR(polar_get_fbpoint_start_ptr(buf_ctl))); - if (buf_ctl->buf_state == FLOG_BUF_READY) - polar_set_flog_state(instance, FLOG_READY); - else - { - /* - * Recover the flashback log buffer, so we can insert flashback log - * in the startup process without wait. - */ - polar_recover_flog_buf(instance); - polar_set_flog_state(instance, FLOG_STARTUP); - } + if (buf_ctl->buf_state == FLOG_BUF_READY) + polar_set_flog_state(instance, FLOG_READY); + else + { + /* + * Recover the flashback log buffer, so we can insert flashback log + * in the startup process without wait. 
+ */ + polar_recover_flog_buf(instance); + polar_set_flog_state(instance, FLOG_STARTUP); } } @@ -521,7 +469,6 @@ polar_recover_flog_buf(flog_ctl_t instance) polar_flog_rec_ptr prev_pos; polar_flog_rec_ptr block_end_ptr; uint64 seg_no = 0; - flog_buf_state state; LWLockAcquire(&buf_ctl->ctl_file_lock, LW_SHARED); ckp_end_ptr = buf_ctl->fbpoint_info.flog_end_ptr; @@ -534,15 +481,14 @@ polar_recover_flog_buf(flog_ctl_t instance) /* If there is no flashback log record, just return */ if (seg_no == POLAR_INVALID_FLOG_SEGNO) { - polar_set_flog_buf_state(buf_ctl, FLOG_BUF_READY); + buf_ctl->buf_state = FLOG_BUF_READY; polar_set_flog_state(instance, FLOG_READY); elog(LOG, "There is no flashback log data, so just skip the recovery of" " the flashback log and logindex"); return; } - state = polar_get_flog_buf_state(buf_ctl); - polar_log_flog_buf_state(state); + polar_log_flog_buf_state(buf_ctl->buf_state); /* * Note the previous pointer is not correct. Its expected value @@ -550,9 +496,9 @@ polar_recover_flog_buf(flog_ctl_t instance) * than ckp_end_ptr. It is nothing serious, we just process this case * in the reader function. 
*/ - if (state == FLOG_BUF_SHUTDOWN_RECOVERY) + if (buf_ctl->buf_state == FLOG_BUF_SHUTDOWN_RECOVERY) ptr = ckp_end_ptr; - else if (state == FLOG_BUF_CRASH_RECOVERY) + else if (buf_ctl->buf_state == FLOG_BUF_CRASH_RECOVERY) { ptr = (seg_no + 1) * POLAR_FLOG_SEG_SIZE; prev_ptr = POLAR_INVALID_FLOG_REC_PTR; @@ -587,7 +533,7 @@ polar_recover_flog_buf(flog_ctl_t instance) else /*no cover line*/ ereport(PANIC, (errcode(ERRCODE_INTERNAL_ERROR), - errmsg("The invalid flashback log buffer recovery state: %u", state))); + errmsg("The invalid flashback log buffer recovery state: %u", buf_ctl->buf_state))); /* Write control file, update the flashback state area */ LWLockAcquire(&buf_ctl->ctl_file_lock, LW_EXCLUSIVE); @@ -640,9 +586,8 @@ polar_recover_flog_buf(flog_ctl_t instance) insert->curr_pos = pos; insert->prev_pos = prev_pos; SpinLockRelease(&insert->insertpos_lck); - polar_set_flog_buf_state(buf_ctl, FLOG_BUF_READY); - - polar_set_flog_min_recover_lsn(buf_ctl, ptr); + buf_ctl->buf_state = FLOG_BUF_READY; + buf_ctl->min_recover_lsn = ptr; elog(LOG, "The flashback log shared buffer is ready now, the current point(position)" " is %X/%X(%X/%X), previous point(position) is %X/%X(%X/%X), initalized upto point is" @@ -685,7 +630,7 @@ polar_set_buf_flog_lost_checked(flog_ctl_t flog_ins, * flushed to disk already. 
*/ bool -polar_may_buf_lost_flog(flog_ctl_t flog_ins, polar_logindex_redo_ctl_t redo_instance, +polar_may_buf_lost_flog(flog_ctl_t flog_ins, polar_logindex_redo_ctl_t redo_ins, BufferDesc *buf_desc) { static XLogRecPtr max_page_lsn_in_disk = InvalidXLogRecPtr; @@ -696,11 +641,11 @@ polar_may_buf_lost_flog(flog_ctl_t flog_ins, polar_logindex_redo_ctl_t redo_inst return false; /* Only used by online promote */ - if (polar_get_bg_redo_state(redo_instance) != POLAR_BG_ONLINE_PROMOTE) + if (polar_get_bg_redo_state(redo_ins) != POLAR_BG_ONLINE_PROMOTE) return false; /* The buffer have been checked already */ - if (polar_check_buf_flog_state(buf_desc, POLAR_BUF_FLOG_LOST_CHECKED)) + if (POLAR_CHECK_BUF_FLOG_STATE(buf_desc, POLAR_BUF_FLOG_LOST_CHECKED)) return false; page_lsn = BufferGetLSN(buf_desc); @@ -736,13 +681,12 @@ polar_make_true_no_flog(flog_ctl_t instance, BufferDesc *buf) if (!polar_is_flog_enabled(instance)) return; - if (is_buf_in_flog_list(buf)) + if (IS_BUF_IN_FLOG_LIST(buf)) { /*no cover line*/ - elog(PANIC, "The buffer %d [%u, %u, %u], %u, %u will be evicted " - "but there is flashback log record of it not flushed", - buf->buf_id, buf->tag.rnode.spcNode, buf->tag.rnode.dbNode, - buf->tag.rnode.relNode, buf->tag.forkNum, buf->tag.blockNum); + elog(PANIC, "The buffer %d " POLAR_LOG_BUFFER_TAG_FORMAT " will be evicted " + "but there is flashback log record of it not flushed", buf->buf_id, + POLAR_LOG_BUFFER_TAG(&(buf->tag))); } } @@ -753,3 +697,633 @@ polar_recover_flog(flog_ctl_t instance) polar_recover_flog_index(instance->logindex_snapshot, instance->queue_ctl, instance->buf_ctl); polar_set_flog_state(instance, FLOG_READY); } + +void +polar_get_buffer_tag_in_flog_rec(flog_record *rec, BufferTag *tag) +{ + switch (rec->xl_rmid) + { + case ORIGIN_PAGE_ID: + INIT_BUFFERTAG(*tag, FL_GET_ORIGIN_PAGE_REC_DATA(rec)->tag.rnode, + FL_GET_ORIGIN_PAGE_REC_DATA(rec)->tag.forkNum, + FL_GET_ORIGIN_PAGE_REC_DATA(rec)->tag.blockNum); + break; + + case REL_FILENODE_ID: /* xl_rmid of a relation file node record, not an xl_info flag */ + 
INIT_BUFFERTAG(*tag, FL_GET_FILENODE_REC_DATA(rec)->new_filenode, FILENODE_FORK, 0); + break; + + default: + /*no cover line*/ + elog(ERROR, "Unknown flashback log record rmid %d", rec->xl_rmid); + } +} + +/* POLAR: Only a crash can cause a partial write */ +static inline bool +may_be_partial_write(void) +{ + return AmStartupProcess() && !reachedConsistency; +} + +/* + * POLAR: Write the repaired buffer. + * + * NB: The buffer must be invalid, so writing it without + * any lock is safe. + */ +static void +write_repaired_buf(BufferDesc *buf) +{ + SMgrRelation reln; + Block buf_block; + char *buf_write; + + Assert(pg_atomic_read_u32(&buf->polar_redo_state) & POLAR_BUF_FLOG_DISABLE); + + buf_block = BufHdrGetBlock(buf); + buf_write = PageEncryptCopy((Page) buf_block, buf->tag.forkNum, + buf->tag.blockNum); + buf_write = PageSetChecksumCopy((Page) buf_write, buf->tag.blockNum); + reln = smgropen(buf->tag.rnode, InvalidBackendId); + smgrwrite(reln, + buf->tag.forkNum, + buf->tag.blockNum, + buf_write, + false); +} + +/* + * POLAR: Get origin page to solve partial write problem. + * + * instance: The flashback log instance. + * buf: The target buffer. + * tag: The buffer tag. + */ +static bool +get_origin_page_for_partial_write(flog_ctl_t instance, Buffer *buf, BufferTag *tag) +{ + flshbak_buf_context_t context; + bool found = false; + flog_reader_state * reader; + + /* Allocate a flashback log reader */ + FLOG_ALLOC_PAGE_READER(reader, instance->buf_ctl, ERROR); + + INIT_FLSHBAK_BUF_CONTEXT(context, polar_get_fbpoint_start_ptr(instance->buf_ctl), + polar_get_flog_write_result(instance->buf_ctl), + polar_get_curr_fbpoint_lsn(instance->buf_ctl), GetRedoRecPtr(), + instance->logindex_snapshot, reader, tag, *buf, ERROR, true); + + found = polar_flashback_buffer(&context); + + polar_flog_reader_free(reader); + return found; +} + +/* + * POLAR: The flashback log can repair the PERMANENT buffer + * when it meets a partial write. 
+ */ +bool +polar_can_flog_repair(flog_ctl_t instance, BufferDesc *buf_hdr, bool has_redo_action) +{ + uint32 buf_state; + + if (!polar_is_flog_enabled(instance)) + return false; + + if (may_be_partial_write() || has_redo_action) + { + buf_state = pg_atomic_read_u32(&buf_hdr->state); + return (buf_state & BM_PERMANENT) != 0; /* explicit test so a one-byte bool cannot truncate the flag bit */ + } + + return false; +} + +/* + * To repair the partial write problem. + * Partial write problem will occur in three scenarios: + * 1. RW crash recovery. + * 2. Standby crash recovery. + * 3. RO to RW online promote. + */ +void +polar_repair_partial_write(flog_ctl_t instance, BufferDesc *bufHdr) +{ + BufferTag *tag = &bufHdr->tag; + Buffer buf; + + Assert((pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0); + buf = bufHdr->buf_id + 1; + + /* Wait until the flashback log state is FLOG_READY */ + while (!polar_is_flog_ready(instance)) + { + /* Handle interrupt signals of startup process to avoid hang */ + if (AmStartupProcess()) + HandleStartupProcInterrupts(); + else + CHECK_FOR_INTERRUPTS(); + + pg_usleep(1000L); + } + + if (!get_origin_page_for_partial_write(instance, &buf, tag)) + { + /*no cover line*/ + elog(ERROR, "Can't find a valid origin page for " POLAR_LOG_BUFFER_TAG_FORMAT " from flashback log", + POLAR_LOG_BUFFER_TAG(tag)); + } + else + { + /* Flush the buffer to protect the next first modify after checkpoint. */ + write_repaired_buf(bufHdr); + + elog(LOG, "The page " POLAR_LOG_BUFFER_TAG_FORMAT " has been repaired by flashback log", + POLAR_LOG_BUFFER_TAG(tag)); + } +} + +static void +log_flog_rec(flog_record *record, polar_flog_rec_ptr ptr) +{ +#define MAX_EXTRA_INFO_SIZE 512 + + char extra_info[MAX_EXTRA_INFO_SIZE]; + fl_origin_page_rec_data *origin_page_rec; + fl_filenode_rec_data_t *filenode_rec; + + switch (record->xl_rmid) + { + case ORIGIN_PAGE_ID: + origin_page_rec = FL_GET_ORIGIN_PAGE_REC_DATA(record); + snprintf(extra_info, MAX_EXTRA_INFO_SIZE, "It is a origin page record, " + "the origin page is %s page. 
The redo lsn of the origin page is %X/%X, " + "the page tag is " POLAR_LOG_BUFFER_TAG_FORMAT, + (record->xl_info == ORIGIN_PAGE_EMPTY ? "empty" : "not empty"), + (uint32)(origin_page_rec->redo_lsn >> 32), (uint32)(origin_page_rec->redo_lsn), + POLAR_LOG_BUFFER_TAG(&(origin_page_rec->tag))); + break; + case REL_FILENODE_ID: + filenode_rec = FL_GET_FILENODE_REC_DATA(record); + snprintf(extra_info, MAX_EXTRA_INFO_SIZE, "It is a relation file node change record, " + "the origin relation file node for [%u, %u, %u] is [%u, %u, %u] before %s", + filenode_rec->new_filenode.spcNode, filenode_rec->new_filenode.dbNode, + filenode_rec->new_filenode.relNode, + filenode_rec->old_filenode.spcNode, filenode_rec->old_filenode.dbNode, + filenode_rec->old_filenode.relNode, timestamptz_to_str(filenode_rec->time)); + break; + default: + /*no cover begin*/ + elog(ERROR, "The type of the record %X/%08X is wrong\n", + (uint32)(ptr >> 32), (uint32)ptr); + break; + /*no cover end*/ + } + + elog(LOG, "Insert a flashback log record at %X/%X: total length is %u, " + "the previous pointer is %X/%X. 
%s", + (uint32)(ptr >> 32), (uint32)ptr, record->xl_tot_len, + (uint32)(record->xl_prev >> 32), (uint32)(record->xl_prev), extra_info); +} + +static uint32 +get_origin_page_rec_len(uint32 xl_tot_len, flog_insert_context *insert_context, + fl_rec_img_header *b_img, fl_rec_img_comp_header *cb_img, char *data) +{ + Page page; + uint32 result = xl_tot_len; + + page = (Page) insert_context->data; + result += FL_ORIGIN_PAGE_REC_INFO_SIZE; + + /* Process the unempty origin page record */ + if (page && !PageIsNew(page) && !polar_page_is_just_inited(page)) + { + BlockNumber block_num; + bool need_checksum_again; + uint16 data_len = BLCKSZ; + bool from_origin_buf = false; + uint16 lower; + uint16 upper; + + need_checksum_again = from_origin_buf = insert_context->info & FROM_ORIGIN_BUF; + block_num = insert_context->buf_tag->blockNum; + + /* Assume we can omit data between pd_lower and pd_upper */ + lower = ((PageHeader) page)->pd_lower; + upper = ((PageHeader) page)->pd_upper; + + if (lower >= SizeOfPageHeaderData && + upper > lower && + upper <= BLCKSZ) + { + b_img->hole_offset = lower; + b_img->bimg_info |= IMAGE_HAS_HOLE; + cb_img->hole_length = upper - lower; + /* + * Check the checksum before compute it again, so we will + * not change the rightness of checksum. + * + * When it is from origin buffer, the checksum may be wrong, + * so we don't check the pages from origin page buffer. + * + * We do nothing while the checksum is wrong here, but + * the decoder will verify the page. 
+ */ + if ((!from_origin_buf) && DataChecksumsEnabled()) + need_checksum_again = + (pg_checksum_page((char *) page, block_num) == ((PageHeader) page)->pd_checksum); + } + else + { + /* No "hole" to compress out */ + b_img->hole_offset = 0; + cb_img->hole_length = 0; + } + + /* Clean the hole */ + MemSet((char *)page + b_img->hole_offset, 0, cb_img->hole_length); + + /* Compute checksum again */ + if (need_checksum_again && DataChecksumsEnabled()) + ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, block_num); + + /* Try to compress flashback log */ + if (polar_compress_block_in_log(page, b_img->hole_offset, cb_img->hole_length, + data, &data_len, FL_REC_IMG_COMP_HEADER_SIZE)) + { + b_img->bimg_info |= IMAGE_IS_COMPRESSED; + b_img->length = data_len; + + if (cb_img->hole_length != 0) + result += FL_REC_IMG_COMP_HEADER_SIZE; + } + else + b_img->length = BLCKSZ - cb_img->hole_length; + + result += FL_REC_IMG_HEADER_SIZE + b_img->length; + } + + /* The empty origin page record just a fl_origin_page_rec_data */ + return result; +} + +static flog_record * +assemble_origin_page_rec(flog_insert_context *insert_context, uint32 xl_tot_len) +{ + fl_rec_img_header b_img = {0, 0, 0}; + fl_rec_img_comp_header cb_img = {0}; + char data[PGLZ_MAX_BLCKSZ]; + fl_origin_page_rec_data rec_data; + flog_record *rec; + char *scratch; + + Assert(insert_context->rmgr == ORIGIN_PAGE_ID); + xl_tot_len = get_origin_page_rec_len(xl_tot_len, insert_context, &b_img, &cb_img, data); + + /* Construct the flashback log record */ + rec = polar_palloc_in_crit(xl_tot_len); + rec->xl_tot_len = xl_tot_len; + rec->xl_info = insert_context->info; + + /* Copy the record data for the origin page. 
*/ + rec_data.redo_lsn = insert_context->redo_lsn; + INIT_BUFFERTAG(rec_data.tag, insert_context->buf_tag->rnode, + insert_context->buf_tag->forkNum, insert_context->buf_tag->blockNum); + scratch = (char *)rec + FLOG_REC_HEADER_SIZE; + memcpy(scratch, &rec_data, FL_ORIGIN_PAGE_REC_INFO_SIZE); + scratch += FL_ORIGIN_PAGE_REC_INFO_SIZE; + + /* An empty origin page record */ + if (b_img.length == 0) + { + Assert(xl_tot_len == (FLOG_REC_HEADER_SIZE + FL_ORIGIN_PAGE_REC_INFO_SIZE)); + rec->xl_info = ORIGIN_PAGE_EMPTY; + } + else + { + Assert(xl_tot_len >= FLOG_REC_HEADER_SIZE + FL_ORIGIN_PAGE_REC_INFO_SIZE + + FL_REC_IMG_HEADER_SIZE + b_img.length); + + rec->xl_info = ORIGIN_PAGE_FULL; + memcpy(scratch, &b_img, FL_REC_IMG_HEADER_SIZE); + scratch += FL_REC_IMG_HEADER_SIZE; + + /* Process compressed one */ + if (b_img.bimg_info & IMAGE_IS_COMPRESSED) + { + if (cb_img.hole_length != 0) + { + memcpy(scratch, &cb_img, FL_REC_IMG_COMP_HEADER_SIZE); + scratch += FL_REC_IMG_COMP_HEADER_SIZE; + Assert(xl_tot_len == FLOG_REC_HEADER_SIZE + FL_ORIGIN_PAGE_REC_INFO_SIZE + + FL_REC_IMG_HEADER_SIZE + FL_REC_IMG_COMP_HEADER_SIZE + b_img.length); + } + + memcpy(scratch, data, b_img.length); + } + else + { + Assert(xl_tot_len == FLOG_REC_HEADER_SIZE + FL_ORIGIN_PAGE_REC_INFO_SIZE + + FL_REC_IMG_HEADER_SIZE + b_img.length); + + if (cb_img.hole_length != 0) + { + Assert(b_img.length < BLCKSZ); + Assert(b_img.hole_offset >= SizeOfPageHeaderData); + + memcpy(scratch, (char *) (insert_context->data), b_img.hole_offset); + scratch += b_img.hole_offset; + memcpy(scratch, (char *) (insert_context->data) + b_img.hole_offset + cb_img.hole_length, + b_img.length - b_img.hole_offset); + } + else + { + Assert(b_img.length == BLCKSZ); + memcpy(scratch, (char *) (insert_context->data), b_img.length); + } + } + } + + return rec; +} + +/* + * Assemble a flashback log record from the buffers into an + * polar_flashback_log_record, ready for insertion with + * polar_flashback_log_insert_record(). 
+ * + * tag is the buffer tag of buffer which is a origin page. + * redo_ptr is the last checkpoint XLOG record pointer. + * + * Return the flashback log record which palloc in this function. + * The caller can switch the memory context and pfree. + * + * The record will be contain the (compressed) block data. + * The record header fields are filled in, except for the CRC field and + * previous pointer. + * + */ +static flog_record * +flog_rec_assemble(flog_insert_context *insert_context) +{ + flog_record *rec; + uint32 xl_tot_len = FLOG_REC_HEADER_SIZE; + + if (insert_context->rmgr == ORIGIN_PAGE_ID) + rec = assemble_origin_page_rec(insert_context, xl_tot_len); + else if (insert_context->rmgr == REL_FILENODE_ID) + rec = polar_assemble_filenode_rec(insert_context, xl_tot_len); + else + /*no cover line*/ + elog(PANIC, "Unknown flashback log record rmid %d", insert_context->rmgr); + + /* Set the common part */ + rec->xl_prev = POLAR_INVALID_FLOG_REC_PTR; + rec->xl_rmid = insert_context->rmgr; + rec->xl_xid = 0; + + return rec; +} + +/* + * POLAR: Insert a flashback log record to flashback log shared buffer. + * + * insert_context: Everything about the insertion. + */ +polar_flog_rec_ptr +polar_flog_insert_into_buffer(flog_ctl_t instance, flog_insert_context *insert_context) +{ + polar_flog_rec_ptr start_ptr = POLAR_INVALID_FLOG_REC_PTR; + polar_flog_rec_ptr end_ptr = POLAR_INVALID_FLOG_REC_PTR; + flog_record *rec = NULL; + + /* Assemble the flashback log record without the previous pointer and the CRC field */ + rec = flog_rec_assemble(insert_context); + /* + * Fill the previous pointer and the CRC field and insert the flashback log record to flashback log + * shared buffers. 
+ */ + end_ptr = polar_flog_rec_insert(instance->buf_ctl, instance->queue_ctl, rec, &start_ptr); + + if (unlikely(polar_flashback_log_debug)) + log_flog_rec(rec, start_ptr); + + pfree(rec); + + return end_ptr; +} + +polar_flog_rec_ptr +polar_insert_buf_flog_rec(flog_ctl_t instance, BufferTag *tag, XLogRecPtr redo_lsn, + XLogRecPtr fbpoint_lsn, uint8 info, Page origin_page, bool from_origin_buf) +{ + flog_insert_context insert_context; + + /* + * It is candidate, check it in here. + */ + if ((info & FLOG_LIST_SLOT_CANDIDATE) && PageGetLSN(origin_page) > fbpoint_lsn) + return POLAR_INVALID_FLOG_REC_PTR; + + Assert(PageGetLSN(origin_page) <= fbpoint_lsn); + + /* Construct the insert context */ + insert_context.buf_tag = tag; + insert_context.data = origin_page; + insert_context.redo_lsn = redo_lsn; + insert_context.rmgr = ORIGIN_PAGE_ID; + /* This will be update in the flashback log record */ + insert_context.info = ORIGIN_PAGE_FULL; + + if (from_origin_buf) + insert_context.info |= FROM_ORIGIN_BUF; + + return polar_flog_insert_into_buffer(instance, &insert_context); +} + +static polar_flog_rec_ptr +polar_insert_rel_extend_flog_rec(flog_ctl_t instance, Buffer buffer) +{ + flog_insert_context insert_context; + XLogRecPtr redo_lsn; + + /* The buffer is just extended, so the redo_lsn of the record is just the wal lsn in the disk */ + redo_lsn = GetFlushRecPtr(); + + /* Construct the insert context */ + insert_context.buf_tag = &(GetBufferDescriptor(buffer - 1)->tag); + insert_context.data = NULL; + insert_context.redo_lsn = redo_lsn; + insert_context.rmgr = ORIGIN_PAGE_ID; + /* This will be update in the flashback log record */ + insert_context.info = ORIGIN_PAGE_EMPTY; + + return polar_flog_insert_into_buffer(instance, &insert_context); +} + +void +polar_flog_rel_bulk_extend(flog_ctl_t instance, Buffer buffer) +{ + polar_flog_rec_ptr ptr; + + Assert(polar_is_flog_enabled(instance)); + + ptr = polar_insert_rel_extend_flog_rec(instance, buffer); + /* Flush it right now to 
avoid to miss it */ + polar_flog_flush(instance->buf_ctl, ptr); +} + +/* + * POLAR: Get the origin page from flashback log. + * Return true when we get a right origin page. + * + * context: contains the start point and end point of flashback log to search and the target buffer tag. + * page: The target page. + * replay_start_lsn: The replay start WAL lsn. + * + * NB: Please make sure the context->end_ptr is larger than context->start_ptr. + * + */ +bool +polar_get_origin_page(flshbak_buf_context_t *context, Page page, XLogRecPtr *replay_start_lsn) +{ + + log_index_page_iter_t originpage_iter; + log_index_lsn_t *lsn_info = NULL; + polar_flog_rec_ptr ptr; + bool found = false; + + Assert(context->start_ptr < context->end_ptr); + + originpage_iter = + polar_logindex_create_page_iterator(context->logindex_snapshot, context->tag, context->start_ptr, context->end_ptr - 1, false); + + if (polar_logindex_page_iterator_state(originpage_iter) != ITERATE_STATE_FINISHED) + { + /*no cover begin*/ + polar_logindex_release_page_iterator(originpage_iter); + elog(ERROR, "Failed to iterate data for flashback log of " POLAR_LOG_BUFFER_TAG_FORMAT + ", which start pointer =%X/%X and end pointer =%X/%X", + POLAR_LOG_BUFFER_TAG(context->tag), + (uint32)((context->start_ptr) >> 32), (uint32)(context->start_ptr), + (uint32)((context->end_ptr - 1) >> 32), (uint32)(context->end_ptr - 1)); + return false; + /*no cover end*/ + } + + if ((lsn_info = polar_logindex_page_iterator_next(originpage_iter)) != NULL) + { + ptr = (polar_flog_rec_ptr) lsn_info->lsn; + Assert(BUFFERTAGS_EQUAL(*(lsn_info->tag), *(context->tag))); + found = polar_decode_origin_page_rec(context->reader, ptr, page, replay_start_lsn, context->tag); + } + + polar_logindex_release_page_iterator(originpage_iter); + + if (!found) + { + /*no cover line*/ + elog(LOG, "Can't find a valid origin page for page " POLAR_LOG_BUFFER_TAG_FORMAT + " with flashback log start location %X/%X and end location %X/%X", + 
POLAR_LOG_BUFFER_TAG(context->tag), + (uint32)((context->start_ptr) >> 32), (uint32)(context->start_ptr), + (uint32)((context->end_ptr - 1) >> 32), (uint32)(context->end_ptr - 1)); + } + else if (unlikely(polar_flashback_log_debug)) + { + *replay_start_lsn = Max(*replay_start_lsn, PageGetLSN(page)); /* NOTE(review): this adjustment only runs when polar_flashback_log_debug is on - confirm that is intended */ + elog(LOG, "We find a valid origin page for page " POLAR_LOG_BUFFER_TAG_FORMAT + " with flashback log start location %X/%X and end location %X/%X, its " + "WAL replay start lsn is %X/%X", POLAR_LOG_BUFFER_TAG(context->tag), + (uint32)((context->start_ptr) >> 32), (uint32)(context->start_ptr), + (uint32)((context->end_ptr - 1) >> 32), (uint32)(context->end_ptr - 1), + (uint32)(*replay_start_lsn >> 32), (uint32)(*replay_start_lsn)); + } + + return found; +} + +/* + * POLAR: Flashback the buffer. + * + * context: everything we need. You can see the detail in its definition. + * + * NB: Please make sure context->end_ptr >= context->start_ptr. + */ +bool +polar_flashback_buffer(flshbak_buf_context_t *context) +{ + Page page; + XLogRecPtr lsn = InvalidXLogRecPtr; + BufferDesc *buf_desc; + + if (unlikely(context->start_ptr == context->end_ptr)) + { + elog(WARNING, "The page " POLAR_LOG_BUFFER_TAG_FORMAT " has no origin page between " + "flashback log %X/%X and %X/%X", POLAR_LOG_BUFFER_TAG(context->tag), + (uint32) (context->start_ptr >> 32), (uint32) (context->start_ptr), + (uint32) (context->end_ptr >> 32), (uint32) context->end_ptr); + return false; + } + else if (unlikely(context->start_ptr > context->end_ptr)) + { + /*no cover begin*/ + elog(ERROR, "The range to flashback page " POLAR_LOG_BUFFER_TAG_FORMAT " is wrong, " + "the flashback log start pointer %X/%X, the flashback log end pointer %X/%X", + POLAR_LOG_BUFFER_TAG(context->tag), + (uint32) (context->start_ptr >> 32), (uint32) (context->start_ptr), + (uint32) (context->end_ptr >> 32), (uint32) (context->end_ptr)); + /*no cover end*/ + } + + /* Disable the flashback log for the buffer in the flashback. 
*/ + buf_desc = GetBufferDescriptor(context->buf - 1); + set_buf_flog_state(buf_desc, POLAR_BUF_FLOG_DISABLE); + + page = BufferGetPage(context->buf); + + if (!polar_get_origin_page(context, page, &lsn)) + { + /* If not found, check its first modify is a XLOG_FPI_MULTI/XLOG_FPI/XLOG_FPI_FOR_HINT record? */ + lsn = polar_logindex_find_first_fpi(polar_logindex_redo_instance, + context->start_lsn, context->end_lsn, context->tag, &(context->buf), context->apply_fpi); + + if (!XLogRecPtrIsInvalid(lsn)) + { + elog(LOG, "The first modify of " POLAR_LOG_BUFFER_TAG_FORMAT " after %X/%X " + "is a new full page image, its origin page is a empty page or the image", + POLAR_LOG_BUFFER_TAG(context->tag), + (uint32) (context->start_lsn >> 32), (uint32) (context->start_lsn)); + + if (context->apply_fpi) + lsn = PageGetLSN(page); + else + MemSet(page, 0, BLCKSZ); + } + else + { + elog(context->elevel, "Can't find a valid origin page for " POLAR_LOG_BUFFER_TAG_FORMAT " from flashback log", + POLAR_LOG_BUFFER_TAG(context->tag)); + return false; + } + } + + Assert(!XLogRecPtrIsInvalid(lsn)); + + if (unlikely(polar_flashback_log_debug)) + { + elog(LOG, "The origin page " POLAR_LOG_BUFFER_TAG_FORMAT " need to replay from %X/%X to %X/%X", + POLAR_LOG_BUFFER_TAG(context->tag), + (uint32) (lsn >> 32), (uint32) lsn, + (uint32) (context->end_lsn >> 32), (uint32) (context->end_lsn)); + } + + /* The lsn can be larger than or equal to context->end_lsn, it means no WAL record to replay */ + polar_logindex_apply_page(polar_logindex_redo_instance, lsn, context->end_lsn, context->tag, &(context->buf)); + return true; +} + diff --git a/src/backend/polar_flashback/polar_flashback_log_decoder.c b/src/backend/polar_flashback/polar_flashback_log_decoder.c deleted file mode 100644 index f496d5dfa01..00000000000 --- a/src/backend/polar_flashback/polar_flashback_log_decoder.c +++ /dev/null @@ -1,379 +0,0 @@ -/*------------------------------------------------------------------------- - * - * 
polar_flashback_log_decoder.c - * Implementation of flashback log decoder - * - * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * Portions Copyright (c) 2021, Alibaba Group Holding limited - * - * IDENTIFICATION - * src/backend/polar_flashback/polar_flashback_log_decoder.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "access/polar_logindex_redo.h" -#include "access/xlog.h" -#include "common/pg_lzcompress.h" -#include "polar_flashback/polar_flashback_log.h" -#include "polar_flashback/polar_flashback_log_decoder.h" -#include "polar_flashback/polar_flashback_log_reader.h" -#include "polar_flashback/polar_flashback_log_record.h" -#include "storage/bufpage.h" -#include "storage/checksum.h" - -static bool -fl_decode_unempty_origin_page(flog_record *rec, - Page page, polar_flog_rec_ptr ptr) -{ - fl_origin_page_rec_data *rec_data; - fl_rec_img_header *img; - fl_rec_img_comp_header *c_img; - char *origin_page; - int record_data_len; - PGAlignedBlock tmp; - uint16 hole_length = 0; - - rec_data = FL_GET_ORIGIN_PAGE_REC_DATA(rec); - img = FL_GET_ORIGIN_PAGE_IMG_HEADER(rec); - origin_page = (char *)img + FL_REC_IMG_HEADER_SIZE; - record_data_len = img->length; - - if (img->bimg_info & IMAGE_IS_COMPRESSED) - { - if (img->bimg_info & IMAGE_HAS_HOLE) - { - c_img = (fl_rec_img_comp_header *) origin_page; - origin_page += FL_REC_IMG_COMP_HEADER_SIZE; - hole_length = c_img->hole_length; - } - - if (pglz_decompress(origin_page, record_data_len, tmp.data, - BLCKSZ - hole_length) < 0) - { - /*no cover line*/ - elog(ERROR, "Invalid compressed origin page ([%u, %u, %u]), %u, %u, " - "from flashback log at %X/%X", - rec_data->tag.rnode.spcNode, - rec_data->tag.rnode.dbNode, - rec_data->tag.rnode.relNode, - rec_data->tag.forkNum, - rec_data->tag.blockNum, - (uint32)(ptr >> 32), - (uint32) ptr); - } - - origin_page = 
tmp.data; - } - else if (img->bimg_info & IMAGE_HAS_HOLE) - hole_length = BLCKSZ - img->length; - - /* generate page, taking into account hole if necessary */ - if (hole_length == 0) - memcpy((char *)page, origin_page, BLCKSZ); - else - { - memcpy((char *)page, origin_page, img->hole_offset); - /* must zero-fill the hole */ - MemSet((char *)page + img->hole_offset, 0, hole_length); - memcpy((char *)page + (img->hole_offset + hole_length), - origin_page + img->hole_offset, - BLCKSZ - (img->hole_offset + hole_length)); - } - - /* Checksum again */ - if (!PageIsVerified(page, rec_data->tag.forkNum, rec_data->tag.blockNum, NULL)) - /*no cover line*/ - elog(ERROR, "The checksum of origin page ([%u, %u, %u]), %u, %u, " - "from flashback log at %X/%X is wrong", - rec_data->tag.rnode.spcNode, - rec_data->tag.rnode.dbNode, - rec_data->tag.rnode.relNode, - rec_data->tag.forkNum, - rec_data->tag.blockNum, - (uint32)(ptr >> 32), (uint32) ptr); - - return true; -} - -/* - * POLAR: Decode the origin page flashback log record. - * Check the checkpoint lsn and crc field. 
- */ -static bool -decode_origin_page_rec(polar_flog_rec_ptr ptr, Page page, XLogRecPtr *checkpoint_lsn, - BufferTag *tag, flog_buf_ctl_t buf_ctl) -{ - uint8 info; - bool is_valid = false; - flog_record *rec; - flog_reader_state *reader; - char *errormsg; - - reader = polar_flog_reader_allocate(POLAR_FLOG_SEG_SIZE, - &polar_flog_page_read, NULL, buf_ctl); - - if (reader == NULL) - /*no cover line*/ - ereport(ERROR, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("Can not allocate the flashback log reader memory"))); - - /* Read the flashback log record until the flashback log is invalid */ - rec = polar_read_flog_record(reader, ptr, &errormsg); - - if (rec != NULL && rec->xl_rmid == ORIGIN_PAGE_ID) - { - BufferTag tag_in_rec; - - tag_in_rec = FL_GET_ORIGIN_PAGE_REC_DATA(rec)->tag; - - if (!BUFFERTAGS_EQUAL(tag_in_rec, *tag)) - /*no cover line*/ - elog(ERROR, "The buffer tag flashback log record at %X/%X is " - "([%u, %u, %u]), %u, %u not ([%u, %u, %u]), %u, %u", - (uint32)(ptr >> 32), (uint32) ptr, - tag_in_rec.rnode.spcNode, tag_in_rec.rnode.dbNode, - tag_in_rec.rnode.relNode, tag_in_rec.forkNum, - tag_in_rec.blockNum, tag->rnode.spcNode, tag->rnode.dbNode, - tag->rnode.relNode, tag->forkNum, tag->blockNum); - - info = rec->xl_info; - - switch (info & ORIGIN_PAGE_TYPE_MASK) - { - case ORIGIN_PAGE_EMPTY: - is_valid = true; - PageInit(page, BLCKSZ, 0); - break; - - case ORIGIN_PAGE_FULL: - is_valid = fl_decode_unempty_origin_page(rec, page, ptr); - break; - - default: - /*no cover line*/ - elog(ERROR, "Parse flashback log rec: unknown op code %u", info); - } - } - else if (rec == NULL) - /*no cover line*/ - elog(ERROR, "The flashback log record at %X/%X is invaid with error: %s", - (uint32)(ptr >> 32), - (uint32) ptr, errormsg); - else - /*no cover line*/ - elog(ERROR, "The flashback log record at %X/%X not a origin page record, its rmid is " - "%d", - (uint32)(ptr >> 32), - (uint32) ptr, rec->xl_rmid); - - if (is_valid) - *checkpoint_lsn = 
FL_GET_ORIGIN_PAGE_REC_DATA(rec)->redo_lsn; - - polar_flog_reader_free(reader); - return is_valid; -} - -/* - * POLAR: Get the origin page in the time of checkpoint_lsn to page. - * - * tag: The bufferTag of the target page. - * Page: The target page. - * start_ptr: The start pointer of the flashback logindex to search. - * end_ptr: The end pointer of the flashback logindex to search. - * replay_start_lsn: The replay start WAL lsn. - * - * Return true when we get a right origin page. - * - * When is_partial_write is true, we will get the origin page whose checkpoint_lsn - * is equal to the checkpoint_lsn. It means that the origin page is the origin version - * before the partial write. So it can be used to repair the partial written page. - * - * When is_partial_write is false, we will get the origin page whose checkpoint_lsn - * is larger than the checkpoint_lsn. It means that the origin page is the origin version - * before the checkpoint_lsn. So it can be used to get a certain version at a certain time. - * - * The two scenarios above are not the same. Partial write is a known issue, we can repair it. - * And now we don't like to repair issue unknown which may cause something unexpected. - * - * Please make true the end_ptr larger than start_ptr. 
- * - */ -bool -polar_get_origin_page(flog_ctl_t instance, BufferTag *tag, Page page, polar_flog_rec_ptr start_ptr, - polar_flog_rec_ptr end_ptr, XLogRecPtr *replay_start_lsn) -{ - - log_index_page_iter_t originpage_iter; - log_index_lsn_t *lsn_info = NULL; - polar_flog_rec_ptr ptr; - bool found = false; - logindex_snapshot_t snapshot = instance->logindex_snapshot; - - Assert(start_ptr < end_ptr); - - originpage_iter = - polar_logindex_create_page_iterator(snapshot, tag, start_ptr, end_ptr - 1, false); - - if (polar_logindex_page_iterator_state(originpage_iter) != ITERATE_STATE_FINISHED) - { - /*no cover begin*/ - elog(ERROR, "Failed to iterate data for ([%u, %u, %u]), %u, %u flashback log, " - "which start pointer =%X/%X and end pointer =%X/%X", - tag->rnode.spcNode, - tag->rnode.dbNode, - tag->rnode.relNode, - tag->forkNum, - tag->blockNum, - (uint32)((start_ptr) >> 32), (uint32)start_ptr, - (uint32)((end_ptr - 1) >> 32), (uint32)(end_ptr - 1)); - return false; - /*no cover end*/ - } - - if ((lsn_info = polar_logindex_page_iterator_next(originpage_iter)) != NULL) - { - ptr = (polar_flog_rec_ptr) lsn_info->lsn; - Assert(BUFFERTAGS_EQUAL(*(lsn_info->tag), *tag)); - found = decode_origin_page_rec(ptr, page, replay_start_lsn, tag, instance->buf_ctl); - } - - polar_logindex_release_page_iterator(originpage_iter); - - if (!found) - { - /*no cover line*/ - elog(LOG, "Can't find a valid origin page for page ([%u, %u, %u]), %u, %u " - "with flashback log start location %X/%X and end location %X/%X", - tag->rnode.spcNode, - tag->rnode.dbNode, - tag->rnode.relNode, - tag->forkNum, - tag->blockNum, - (uint32)((start_ptr) >> 32), (uint32)start_ptr, - (uint32)((end_ptr - 1) >> 32), (uint32)(end_ptr - 1)); - } - else if (unlikely(polar_flashback_log_debug)) - { - *replay_start_lsn = Max(*replay_start_lsn, PageGetLSN(page)); - elog(LOG, "We find a valid origin page for page ([%u, %u, %u]), %u, %u " - "with flashback log start location %X/%X and end location %X/%X, its " - "WAL 
replay start lsn is %X/%X", - tag->rnode.spcNode, - tag->rnode.dbNode, - tag->rnode.relNode, - tag->forkNum, - tag->blockNum, - (uint32)((start_ptr) >> 32), (uint32)start_ptr, - (uint32)((end_ptr - 1) >> 32), (uint32)(end_ptr - 1), - (uint32)(*replay_start_lsn >> 32), (uint32)(*replay_start_lsn)); - } - - return found; -} - -/* - * POLAR: Flashback the buffer. - * instance: flashback log instance. - * buf: The buffer to flashback. - * tag: The origin page buffer tag. - * start_ptr: The flashback log start point to search origin page. - * end_ptr: The flashback log end point to search origin page. - * start_lsn: The WAL start lsn to replay. - * end_lsn: The WAL end lsn to replay. - * elevel: Log level. - * apply_fpi: apply full page image or not. - * - * NB: Please make sure end_ptr >= start_ptr. - */ -bool -polar_flashback_buffer(flog_ctl_t instance, Buffer *buf, BufferTag *tag, - polar_flog_rec_ptr start_ptr, polar_flog_rec_ptr end_ptr, - XLogRecPtr start_lsn, XLogRecPtr end_lsn, int elevel, - bool apply_fpi) -{ - Page page; - XLogRecPtr lsn = InvalidXLogRecPtr; - BufferDesc *buf_desc; - - if (start_ptr == end_ptr) - { - elog(WARNING, "The page ([%u, %u, %u]), %u, %u has no origin page between flashback log " - "%X/%X and %X/%X", - tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->forkNum, tag->blockNum, - (uint32)(start_ptr >> 32), (uint32) start_ptr, - (uint32)(end_ptr >> 32), (uint32) end_ptr); - return false; - } - else if (start_ptr > end_ptr) - { - /*no cover begin*/ - elog(ERROR, "The range to flashback page ([%u, %u, %u]), %u, %u is wrong, " - "the flashback log start pointer %X/%X, the flashback log end pointer %X/%X", - tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, tag->forkNum, tag->blockNum, - (uint32)(start_ptr >> 32), (uint32) start_ptr, - (uint32)(end_ptr >> 32), (uint32) end_ptr); - /*no cover end*/ - } - - /* Disable the flashback log for the buffer in the flashback. 
*/ - buf_desc = GetBufferDescriptor(*buf - 1); - set_buf_flog_state(buf_desc, POLAR_BUF_FLOG_DISABLE); - - page = BufferGetPage(*buf); - - if (!polar_get_origin_page(instance, tag, page, start_ptr, end_ptr, &lsn)) - { - /* If not found, check its first modify is a XLOG_FPI_MULTI/XLOG_FPI/XLOG_FPI_FOR_HINT record? */ - lsn = polar_logindex_find_first_fpi(polar_logindex_redo_instance, - start_lsn, end_lsn, tag, buf, apply_fpi); - - if (!XLogRecPtrIsInvalid(lsn)) - { - elog(LOG, "The first modify of ([%u, %u, %u]), %u, %u after %X/%X " - "is a new full page image, its origin page is a empty page or the image", - tag->rnode.spcNode, - tag->rnode.dbNode, - tag->rnode.relNode, - tag->forkNum, - tag->blockNum, - (uint32)(start_lsn >> 32), (uint32) start_lsn); - - if (apply_fpi) - lsn = PageGetLSN(page); - else - MemSet(page, 0, BLCKSZ); - } - else - { - elog(elevel, "Can't find a valid origin page for ([%u, %u, %u]), %u, %u from flashback log", - tag->rnode.spcNode, - tag->rnode.dbNode, - tag->rnode.relNode, - tag->forkNum, - tag->blockNum); - return false; - } - } - - Assert(!XLogRecPtrIsInvalid(lsn)); - - if (unlikely(polar_flashback_log_debug)) - { - elog(LOG, "The origin page ([%u, %u, %u]), %u, %u need to replay from %X/%X to %X/%X", - tag->rnode.spcNode, - tag->rnode.dbNode, - tag->rnode.relNode, - tag->forkNum, - tag->blockNum, - (uint32) (lsn >> 32), (uint32) lsn, - (uint32) (end_lsn >> 32), (uint32) end_lsn); - } - - /* The lsn can be larger than or equal to end_lsn, it means no WAL record to replay */ - polar_logindex_apply_page(polar_logindex_redo_instance, lsn, end_lsn, tag, buf); - return true; -} diff --git a/src/backend/polar_flashback/polar_flashback_log_file.c b/src/backend/polar_flashback/polar_flashback_log_file.c index fa2a96d82b5..84072eef4b2 100644 --- a/src/backend/polar_flashback/polar_flashback_log_file.c +++ b/src/backend/polar_flashback/polar_flashback_log_file.c @@ -20,6 +20,7 @@ #include "access/xlog.h" #include "miscadmin.h" #include 
"pgstat.h" +#include "polar_flashback/polar_flashback.h" #include "polar_flashback/polar_flashback_log_file.h" #include "port.h" #include "storage/fd.h" @@ -28,33 +29,22 @@ #define FLOG_TMP_FNAME "flogtmp." -#define flog_dir_full_path(ctl, path) \ +#define FLOG_GET_DIR_FULL_PATH(ctl, path) \ polar_make_file_path_level2(path, polar_get_flog_dir(ctl)) -#define flog_file_full_path(ctl, path, file_name) \ +#define FLOG_GET_FILE_FULL_PATH(ctl, path, file_name) \ polar_make_file_path_level3(path, polar_get_flog_dir(ctl), file_name) /* GUCs */ int polar_flashback_log_keep_segments; -static bool -is_file_exist(const char *path) -{ - struct stat st; - - if ((polar_stat(path, &st) == 0) && S_ISREG(st.st_mode)) - return true; - else - return false; -} - /* Get flashback log file path. */ static void flog_file_path_from_seg(char *path, const char *dir, uint64 seg_no, int segsz_bytes) { char seg_file_name[FLOG_MAX_FNAME_LEN]; - get_flog_fname(seg_file_name, seg_no, segsz_bytes, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(seg_file_name, seg_no, segsz_bytes, FLOG_DEFAULT_TIMELINE); polar_make_file_path_level3(path, dir, seg_file_name); } @@ -124,26 +114,25 @@ static void remove_old_flog_file(flog_buf_ctl_t ctl, const char *segname, polar_flog_rec_ptr endptr) { char path[MAXPGPATH]; - struct stat statbuf; uint64 endlogSegNo; uint64 recycleSegNo; /* * Initialize info about where to try to recycle to. */ - endlogSegNo = flog_ptr_to_seg(endptr, POLAR_FLOG_SEG_SIZE); + endlogSegNo = FLOG_PTR_TO_SEG(endptr, POLAR_FLOG_SEG_SIZE); /* POLAR: Just reuse polar_flashback_log_keep_segments segments */ recycleSegNo = endlogSegNo + polar_flashback_log_keep_segments; - flog_file_full_path(ctl, path, segname); + FLOG_GET_FILE_FULL_PATH(ctl, path, segname); /* * Before deleting the file, see if it can be recycled as a future log * segment. Only recycle normal files. 
*/ if (endlogSegNo <= recycleSegNo && - polar_lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) && + polar_file_exists(path) && install_flog_seg(ctl, &endlogSegNo, path, true, recycleSegNo)) { @@ -194,14 +183,14 @@ truncate_flog_files(flog_buf_ctl_t ctl, uint64 segno, polar_flog_rec_ptr endptr) char polar_path[MAXPGPATH]; Assert(!polar_in_replica_mode()); - flog_dir_full_path(ctl, polar_path); + FLOG_GET_DIR_FULL_PATH(ctl, polar_path); /* * Construct a filename of the last segment to be kept. The timeline ID * doesn't matter, we ignore that in the comparison. (During recovery, * ThisTimeLineID isn't set, so we can't use that.) */ - get_flog_fname(lastoff, segno, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(lastoff, segno, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); elog(LOG, "attempting to remove flashback log segments older than log file %s", lastoff); @@ -211,7 +200,7 @@ truncate_flog_files(flog_buf_ctl_t ctl, uint64 segno, polar_flog_rec_ptr endptr) while ((xlde = ReadDir(xldir, polar_path)) != NULL) { /* Ignore files that are not flashback log segments */ - if (!is_flashback_log_file(xlde->d_name)) + if (!FLOG_IS_LOG_FILE(xlde->d_name)) continue; if (strcmp(xlde->d_name, lastoff) <= 0) @@ -233,7 +222,7 @@ polar_truncate_flog_before(flog_buf_ctl_t ctl, polar_flog_rec_ptr ptr) /* Get the current flashback log write result (end of flashback log in disk) */ recptr = polar_get_flog_write_result(ctl); - seg_no = flog_ptr_to_seg(ptr, POLAR_FLOG_SEG_SIZE); + seg_no = FLOG_PTR_TO_SEG(ptr, POLAR_FLOG_SEG_SIZE); /* There is only one file, just return */ if (seg_no == 0) @@ -280,14 +269,14 @@ polar_flog_read(char *buf, int segsize, polar_flog_rec_ptr startptr, startoff = FLOG_SEGMENT_OFFSET(recptr, segsize); /* Do we need to switch to a different xlog segment? 
*/ - if (open_file < 0 || !ptr_in_flog_seg(recptr, open_segno, segsize)) + if (open_file < 0 || !FLOG_PTR_IN_SEG(recptr, open_segno, segsize)) { char path[MAXPGPATH]; if (open_file >= 0) polar_close(open_file); - open_segno = flog_ptr_to_seg(recptr, segsize); + open_segno = FLOG_PTR_TO_SEG(recptr, segsize); flog_file_path_from_seg(path, dir, open_segno, segsize); @@ -376,7 +365,7 @@ polar_flog_pos2ptr(uint64 bytepos) seg_offset += fullpages * POLAR_FLOG_BLCKSZ + bytesleft + FLOG_SHORT_PHD_SIZE; } - flog_seg_offset_to_ptr(fullsegs, seg_offset, POLAR_FLOG_SEG_SIZE, result); + FLOG_SEG_OFFSET_TO_PTR(fullsegs, seg_offset, POLAR_FLOG_SEG_SIZE, result); return result; } @@ -421,7 +410,7 @@ polar_flog_pos2endptr(uint64 bytepos) seg_offset += fullpages * POLAR_FLOG_BLCKSZ + bytesleft + FLOG_SHORT_PHD_SIZE; } - flog_seg_offset_to_ptr(fullsegs, seg_offset, POLAR_FLOG_SEG_SIZE, result); + FLOG_SEG_OFFSET_TO_PTR(fullsegs, seg_offset, POLAR_FLOG_SEG_SIZE, result); return result; } @@ -436,7 +425,7 @@ polar_flog_ptr2pos(polar_flog_rec_ptr ptr) uint32 offset; uint64 result; - fullsegs = flog_ptr_to_seg(ptr, POLAR_FLOG_SEG_SIZE); + fullsegs = FLOG_PTR_TO_SEG(ptr, POLAR_FLOG_SEG_SIZE); fullpages = (FLOG_SEGMENT_OFFSET(ptr, POLAR_FLOG_SEG_SIZE)) / POLAR_FLOG_BLCKSZ; offset = ptr % POLAR_FLOG_BLCKSZ; @@ -479,45 +468,40 @@ polar_read_flog_ctl_file(flog_buf_ctl_t ctl, flog_ctl_file_data_t *ctl_file_data { char ctl_file_path[MAXPGPATH]; int fd; - int read_len; pg_crc32c crc; - int rc; - flog_file_full_path(ctl, ctl_file_path, FLOG_CTL_FILE); + FLOG_GET_FILE_FULL_PATH(ctl, ctl_file_path, FLOG_CTL_FILE); /* The control file may be non-exist */ - if (!is_file_exist(ctl_file_path)) + if (!polar_file_exists(ctl_file_path)) { elog(WARNING, "Can't find %s", ctl_file_path); return false; } - fd = BasicOpenFile(ctl_file_path, O_RDWR | PG_BINARY, true); + fd = polar_open_transient_file(ctl_file_path, O_RDONLY | PG_BINARY); if (fd < 0) /*no cover line*/ ereport(FATAL, (errcode_for_file_access(), - 
(errmsg("could not open file \"%s\": %m", ctl_file_path)))); + errmsg("could not open file \"%s\": %m", ctl_file_path))); - /* Read data */ - pgstat_report_wait_start(WAIT_EVENT_FLASHBACK_LOG_CTL_FILE_READ); - read_len = polar_read(fd, ctl_file_data, sizeof(flog_ctl_file_data_t)); - pgstat_report_wait_end(); + pgstat_report_wait_start(WAIT_EVENT_FRA_CTL_FILE_READ); - if (read_len != sizeof(flog_ctl_file_data_t)) + if (polar_read(fd, ctl_file_data, sizeof(flog_ctl_file_data_t)) != sizeof(flog_ctl_file_data_t)) /*no cover line*/ ereport(FATAL, (errcode_for_file_access(), - (errmsg("could not read from file \"%s\": %m", ctl_file_path)))); + errmsg("could not read from file \"%s\": %m", ctl_file_path))); - rc = polar_close(fd); + pgstat_report_wait_end(); - if (rc) + if (CloseTransientFile(fd)) /*no cover line*/ ereport(FATAL, (errcode_for_file_access(), - errmsg("could not close file \"%s\": %m", ctl_file_path))); + errmsg("could not close file \"%s\": %m", ctl_file_path))); /* Verify CRC */ INIT_CRC32C(crc); @@ -543,7 +527,6 @@ polar_write_flog_ctl_file(flog_buf_ctl_t ctl) { flog_ctl_file_data_t flashback_log_ctl_file; char ctl_file_path[MAXPGPATH]; - int fd; flashback_log_ctl_file.version_no = FLOG_CTL_FILE_VERSION; @@ -559,46 +542,10 @@ polar_write_flog_ctl_file(flog_buf_ctl_t ctl) offsetof(flog_ctl_file_data_t, crc)); FIN_CRC32C(flashback_log_ctl_file.crc); - flog_file_full_path(ctl, ctl_file_path, FLOG_CTL_FILE); - fd = BasicOpenFile(ctl_file_path, O_RDWR | O_CREAT | PG_BINARY, true); - - if (fd < 0) - { - /*no cover line*/ - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - ctl_file_path))); - return; - } - - pgstat_report_wait_start(WAIT_EVENT_FLASHBACK_LOG_CTL_FILE_WRITE); - - if (polar_write(fd, &flashback_log_ctl_file, sizeof(flog_ctl_file_data_t)) != sizeof(flog_ctl_file_data_t)) - /*no cover line*/ - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not write file \"%s\": %m", - ctl_file_path))); - - 
pgstat_report_wait_end(); - - pgstat_report_wait_start(WAIT_EVENT_FLASHBACK_LOG_CTL_FILE_SYNC); + FLOG_GET_FILE_FULL_PATH(ctl, ctl_file_path, FLOG_CTL_FILE); - if (polar_fsync(fd) != 0) - /*no cover line*/ - ereport(PANIC, - (errcode_for_file_access(), - (errmsg("could not sync file \"%s\": %m", - ctl_file_path)))); - - pgstat_report_wait_end(); - - if (polar_close(fd)) - /*no cover line*/ - ereport(PANIC, - (errcode_for_file_access(), - errmsg("could not close file \"%s\": %m", ctl_file_path))); + polar_write_ctl_file_atomic(ctl_file_path, &flashback_log_ctl_file, sizeof(flog_ctl_file_data_t), + WAIT_EVENT_FLASHBACK_LOG_CTL_FILE_WRITE, WAIT_EVENT_FLASHBACK_LOG_CTL_FILE_SYNC); if (unlikely(polar_flashback_log_debug)) { @@ -678,7 +625,7 @@ polar_validate_flog_dir(flog_buf_ctl_t ctl) { char path[MAXPGPATH]; - flog_dir_full_path(ctl, path); + FLOG_GET_DIR_FULL_PATH(ctl, path); polar_validate_dir(path); } @@ -762,7 +709,7 @@ polar_flog_file_init(flog_buf_ctl_t ctl, uint64 logsegno, bool *use_existent) elog(DEBUG2, "creating and filling new flashback log file"); snprintf(tmpfile_name, FLOG_MAX_FNAME_LEN, FLOG_TMP_FNAME "%d", (int) getpid()); - flog_file_full_path(ctl, tmppath, tmpfile_name); + FLOG_GET_FILE_FULL_PATH(ctl, tmppath, tmpfile_name); polar_unlink(tmppath); fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, true); @@ -846,7 +793,7 @@ polar_prealloc_flog_files(flog_buf_ctl_t ctl, int num) bool use_existent; int count = 0; - _log_seg_no = flog_ptr_to_seg(request_ptr, POLAR_FLOG_SEG_SIZE); + _log_seg_no = FLOG_PTR_TO_SEG(request_ptr, POLAR_FLOG_SEG_SIZE); while (count < num) { @@ -860,15 +807,15 @@ polar_prealloc_flog_files(flog_buf_ctl_t ctl, int num) /* POLAR: Check the flashback log of the segment no exists */ bool -polar_is_flog_file_exist(const char *dir, polar_flog_rec_ptr ptr, int elevel) +polar_flog_file_exists(const char *dir, polar_flog_rec_ptr ptr, int elevel) { char path[MAXPGPATH]; uint64 segno; - segno = flog_ptr_to_seg(ptr, 
POLAR_FLOG_SEG_SIZE); + segno = FLOG_PTR_TO_SEG(ptr, POLAR_FLOG_SEG_SIZE); flog_file_path_from_seg(path, dir, segno, POLAR_FLOG_SEG_SIZE); - if (is_file_exist(path)) + if (polar_file_exists(path)) return true; /*no cover begin*/ @@ -896,7 +843,7 @@ polar_flog_remove_all(flog_buf_ctl_t ctl) { char path[MAXPGPATH]; - flog_dir_full_path(ctl, path); + FLOG_GET_DIR_FULL_PATH(ctl, path); polar_flog_clean_dir_internal(path); } @@ -924,20 +871,25 @@ polar_write_flog_history_file(const char *dir, TimeLineID tli, polar_flog_rec_pt tmp_fd = polar_open_transient_file(tmp_path, O_RDWR | O_CREAT | O_TRUNC); if (tmp_fd < 0) + /*no cover line*/ ereport(ERROR, (errcode_for_file_access(), errmsg("could not create file \"%s\": %m", tmp_path))); - if (is_file_exist(path)) + if (polar_file_exists(path)) { - fd = BasicOpenFile(path, O_RDONLY, true); + fd = polar_open_transient_file(path, O_RDONLY); if (fd < 0) - /*no cover line*/ + { + /*no cover begin*/ + CloseTransientFile(tmp_fd); ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open file \"%s\": %m", - path))); + errmsg("could not open flashback log history file \"%s\": %m", + path))); + /*no cover end*/ + } for (;;) { @@ -980,7 +932,7 @@ polar_write_flog_history_file(const char *dir, TimeLineID tli, polar_flog_rec_pt pgstat_report_wait_end(); } - polar_close(fd); + CloseTransientFile(fd); } snprintf(ptr_info, sizeof(ptr_info), @@ -1000,11 +952,21 @@ polar_write_flog_history_file(const char *dir, TimeLineID tli, polar_flog_rec_pt pgstat_report_wait_end(); - if (polar_close(tmp_fd)) + pgstat_report_wait_start(WAIT_EVENT_FLASHBACK_LOG_HISTORY_FILE_SYNC); + + if (polar_fsync(tmp_fd) != 0) + /*no cover line*/ + ereport(data_sync_elevel(ERROR), + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", tmp_path))); + + pgstat_report_wait_end(); + + if (CloseTransientFile(tmp_fd)) /*no cover line*/ ereport(ERROR, (errcode_for_file_access(), - errmsg("could not close file \"%s\": %m", path))); + errmsg("could 
not close file \"%s\": %m", tmp_path))); /* * As the rename is atomic operation, if any problem occurs after this @@ -1030,14 +992,15 @@ polar_read_flog_history_file(const char *dir) polar_make_file_path_level3(path, dir, FLOG_HISTORY_FILE); - if (!is_file_exist(path)) + if (!polar_file_exists(path)) return result; - history_file_fd = BasicOpenFile(path, O_RDWR, true); + history_file_fd = polar_open_transient_file(path, O_RDONLY); if (history_file_fd < 0) - /*no cover line*/ - ereport(ERROR, (errcode_for_file_access(), + /*no cover line */ + ereport(FATAL, + (errcode_for_file_access(), errmsg("could not open file \"%s\": %m", path))); pgstat_report_wait_start(WAIT_EVENT_FLASHBACK_LOG_HISTORY_FILE_READ); @@ -1104,6 +1067,8 @@ polar_read_flog_history_file(const char *dir) pgstat_report_wait_end(); + CloseTransientFile(history_file_fd); + return result; } diff --git a/src/backend/polar_flashback/polar_flashback_log_index.c b/src/backend/polar_flashback/polar_flashback_log_index.c index a01e92a55bf..a0db160532b 100644 --- a/src/backend/polar_flashback/polar_flashback_log_index.c +++ b/src/backend/polar_flashback/polar_flashback_log_index.c @@ -18,7 +18,6 @@ #include "access/xlog.h" #include "miscadmin.h" #include "polar_flashback/polar_flashback_log.h" -#include "polar_flashback/polar_flashback_log_decoder.h" #include "polar_flashback/polar_flashback_log_file.h" #include "polar_flashback/polar_flashback_log_index.h" #include "polar_flashback/polar_flashback_log_mem.h" @@ -27,7 +26,7 @@ #include "postmaster/startup.h" #include "storage/lwlock.h" -#define flog_index_dir_full_path(path, snapshot) \ +#define FLOG_GET_INDEX_DIR_FULL_PATH(path, snapshot) \ polar_make_file_path_level2(path, get_flog_index_dir(snapshot)) /* @@ -59,11 +58,9 @@ polar_flog_index_add_lsn(logindex_snapshot_t snapshot, BufferTag *tag, polar_flo if (unlikely(polar_flashback_log_debug)) { - elog(LOG, "Add flashback logindex: tag is '[%u, %u, %u], %d, %u', " - "lsn is %X/%X, previous lsn is %X/%X", - 
tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, - tag->forkNum, tag->blockNum, - (uint32)(ptr >> 32), (uint32)(ptr), (uint32)(prev >> 32), (uint32)(prev)); + elog(LOG, "Add flashback logindex: tag is " POLAR_LOG_BUFFER_TAG_FORMAT " , lsn is %X/%X, " + "previous lsn is %X/%X", POLAR_LOG_BUFFER_TAG(tag), (uint32)(ptr >> 32), + (uint32)(ptr), (uint32)(prev >> 32), (uint32)(prev)); } } @@ -82,26 +79,20 @@ insert_flog_index_from_file(logindex_snapshot_t snapshot, flog_reader_state *sta if (record != NULL) { - /* Now just insert the origin page record */ - if (record->xl_rmid == ORIGIN_PAGE_ID) - { - fl_origin_page_rec_data *rec_data; - - rec_data = FL_GET_ORIGIN_PAGE_REC_DATA(record); + BufferTag tag; - if (unlikely(polar_flashback_log_debug)) - elog(LOG, "We found the flashback log record at %X/%X from file, " - "total length is %u, the tag is '[%u, %u, %u], %d, %u'", - (uint32)(*ptr_expected >> 32), (uint32)(*ptr_expected), record->xl_tot_len, - rec_data->tag.rnode.spcNode, rec_data->tag.rnode.dbNode, rec_data->tag.rnode.relNode, - rec_data->tag.forkNum, rec_data->tag.blockNum); + polar_get_buffer_tag_in_flog_rec(record, &tag); - /* The add lsn can ignore the first insert which is inserted already */ - polar_flog_index_add_lsn(snapshot, &(rec_data->tag), POLAR_INVALID_FLOG_REC_PTR, state->read_rec_ptr); - } + /* The add lsn can ignore the first insert which is inserted already */ + polar_flog_index_add_lsn(snapshot, &tag, POLAR_INVALID_FLOG_REC_PTR, state->read_rec_ptr); *ptr_expected = convert_to_first_valid_ptr(state->end_rec_ptr); - return true; + + if (unlikely(polar_flashback_log_debug)) + elog(LOG, "We found the flashback log record at %X/%X from file, " + "total length is %u, the tag is " POLAR_LOG_BUFFER_TAG_FORMAT, + (uint32)(*ptr_expected >> 32), (uint32)(*ptr_expected), record->xl_tot_len, + POLAR_LOG_BUFFER_TAG(&tag)); } /* * Ignore the switch point invalid flashback log and skip to next ptr. 
@@ -110,10 +101,7 @@ insert_flog_index_from_file(logindex_snapshot_t snapshot, flog_reader_state *sta * insert to logidnex. */ else if (state->in_switch_region) - { *ptr_expected = state->end_rec_ptr; - return true; - } /* Ignore the error when read the record which is over the write result. */ else if (strncmp(errormsg, REC_UNFLUSHED_ERROR_MSG, strlen(REC_UNFLUSHED_ERROR_MSG)) == 0) return false; @@ -122,7 +110,9 @@ insert_flog_index_from_file(logindex_snapshot_t snapshot, flog_reader_state *sta elog(PANIC, "Failed to read record %X/%08X from flashback log file with error: %s", (uint32)(*ptr_expected >> 32), (uint32)*ptr_expected, errormsg); - return false; + /* The max end lsn is the next one */ + polar_set_logindex_max_parsed_lsn(snapshot, *ptr_expected); + return true; } static bool @@ -135,10 +125,17 @@ insert_flog_index_from_queue(logindex_snapshot_t snapshot, polar_ringbuf_ref_t * if (polar_flog_read_info_from_queue(ref, *ptr_expected, &tag, &log_len, max_ptr)) { + polar_flog_rec_ptr next_ptr; + + next_ptr = polar_get_next_flog_ptr(*ptr_expected, log_len); polar_flog_index_add_lsn(snapshot, &tag, POLAR_INVALID_FLOG_REC_PTR, *ptr_expected); - *ptr_expected = polar_get_next_flog_ptr(*ptr_expected, log_len); + polar_set_logindex_max_parsed_lsn(snapshot, next_ptr); + *ptr_expected = next_ptr; return true; } + /* The last record is contrecord, set max end lsn to max_ptr */ + else if (log_len) + polar_set_logindex_max_parsed_lsn(snapshot, max_ptr); return false; } @@ -187,7 +184,7 @@ polar_validate_flog_index_dir(logindex_snapshot_t snapshot) { char path[MAXPGPATH]; - flog_index_dir_full_path(path, snapshot); + FLOG_GET_INDEX_DIR_FULL_PATH(path, snapshot); polar_validate_dir(path); } @@ -197,7 +194,7 @@ polar_flog_index_remove_all(logindex_snapshot_t snapshot) { char path[MAXPGPATH]; - flog_index_dir_full_path(path, snapshot); + FLOG_GET_INDEX_DIR_FULL_PATH(path, snapshot); polar_flog_clean_dir_internal(path); } @@ -300,15 +297,7 @@ 
polar_flog_index_insert(logindex_snapshot_t snapshot, flog_index_queue_ctl_t que } if (NEED_READER(source) && state == NULL) - { - state = polar_flog_reader_allocate(POLAR_FLOG_SEG_SIZE, &polar_flog_page_read, NULL, buf_ctl); - - if (state == NULL) - /*no cover line*/ - ereport(PANIC, - (errcode(ERRCODE_OUT_OF_MEMORY), - errmsg("Can not allocate the flashback log reader memory"))); - } + FLOG_ALLOC_PAGE_READER(state, buf_ctl, PANIC); /* Pop the flashback log lsn_info from logindex queue or read it from log files */ while (insert_ptr < max_ptr && inserted) @@ -393,12 +382,9 @@ polar_get_flog_index_meta_max_ptr(logindex_snapshot_t snapshot) void polar_recover_flog_index(logindex_snapshot_t snapshot, flog_index_queue_ctl_t queue_ctl, flog_buf_ctl_t buf_ctl) { - polar_flog_rec_ptr min_recover_lsn; - Assert(snapshot); - min_recover_lsn = polar_get_flog_min_recover_lsn(buf_ctl); - polar_flog_index_insert(snapshot, queue_ctl, buf_ctl, min_recover_lsn, LOG_FILE); + polar_flog_index_insert(snapshot, queue_ctl, buf_ctl, buf_ctl->min_recover_lsn, LOG_FILE); elog(LOG, "Recover the flashback logindex to %X/%X", - (uint32) (min_recover_lsn >> 32), (uint32) min_recover_lsn); + (uint32) (buf_ctl->min_recover_lsn >> 32), (uint32) buf_ctl->min_recover_lsn); } diff --git a/src/backend/polar_flashback/polar_flashback_log_index_queue.c b/src/backend/polar_flashback/polar_flashback_log_index_queue.c index 845892db867..3d35ebace84 100644 --- a/src/backend/polar_flashback/polar_flashback_log_index_queue.c +++ b/src/backend/polar_flashback/polar_flashback_log_index_queue.c @@ -81,10 +81,9 @@ polar_flog_index_queue_ref_pop(polar_ringbuf_ref_t *ref, flog_index_queue_lsn_in polar_ringbuf_update_ref(ref); if (unlikely(polar_flashback_log_debug)) - elog(LOG, "%s ptr: %x/%x, total length: %u, the tag: '[%u, %u, %u], %d, %u'", __func__, + elog(LOG, "%s ptr: %x/%x, total length: %u, the tag: " POLAR_LOG_BUFFER_TAG_FORMAT, __func__, (uint32)(lsn_info->ptr >> 32), (uint32)lsn_info->ptr, 
lsn_info->log_len, - lsn_info->tag.rnode.spcNode, lsn_info->tag.rnode.dbNode, lsn_info->tag.rnode.relNode, - lsn_info->tag.forkNum, lsn_info->tag.blockNum); + POLAR_LOG_BUFFER_TAG(&(lsn_info->tag))); return true; } @@ -96,10 +95,9 @@ polar_flog_index_queue_ref_pop(polar_ringbuf_ref_t *ref, flog_index_queue_lsn_in * NB: It is a callback, so we don't use the polar_ringbuf_t as a parameter. */ static void -flog_index_queue_keep_data(void) +flog_index_queue_keep_data(polar_ringbuf_t queue) { polar_ringbuf_ref_t ref = { .slot = -1 }; - polar_ringbuf_t queue = flog_instance->queue_ctl->queue; if (!polar_ringbuf_new_ref(queue, true, &ref, "flashback_logindex_queue_data_keep")) @@ -227,19 +225,21 @@ polar_flog_index_queue_push(flog_index_queue_ctl_t ctl, size_t rbuf_pos, flog_re int offset = 0; ssize_t copy_size; polar_ringbuf_t ringbuf = ctl->queue; + BufferTag tag; if (rbuf_pos >= ringbuf->size) /*no cover line*/ ereport(PANIC, (errmsg("rbuf_pos=%ld is incorrect for flashback logindex queue, " "queue_size=%ld", rbuf_pos, ringbuf->size))); + polar_get_buffer_tag_in_flog_rec(record, &tag); offset += polar_ringbuf_pkt_write(ringbuf, rbuf_pos, offset, (uint8 *)&start_lsn, sizeof(start_lsn)); offset += polar_ringbuf_pkt_write(ringbuf, rbuf_pos, offset, (uint8 *)&log_len, sizeof(log_len)); copy_size = polar_ringbuf_pkt_write(ringbuf, rbuf_pos, offset, - (uint8 *)(&(FL_GET_ORIGIN_PAGE_REC_DATA(record)->tag)), copy_len); + (uint8 *)(&tag), copy_len); if (copy_size != copy_len) /*no cover line*/ @@ -277,6 +277,8 @@ polar_flog_read_info_from_queue(polar_ringbuf_ref_t *ref, polar_flog_rec_ptr ptr Assert(!FLOG_REC_PTR_IS_INVAILD(ptr_expected)); + lsn_info.log_len = 0; + do { memset(&lsn_info, 0, sizeof(flog_index_queue_lsn_info)); @@ -284,7 +286,11 @@ polar_flog_read_info_from_queue(polar_ringbuf_ref_t *ref, polar_flog_rec_ptr ptr if (polar_flog_index_queue_ref_pop(ref, &lsn_info, max_ptr)) ptr = lsn_info.ptr; else + { + /* The log_len may be zero */ + *log_len = lsn_info.log_len; 
return false; + } if (ptr > ptr_expected) { @@ -313,10 +319,8 @@ polar_flog_read_info_from_queue(polar_ringbuf_ref_t *ref, polar_flog_rec_ptr ptr if (unlikely(polar_flashback_log_debug)) elog(LOG, "We found the flashback log record at %X/%X from logindex queue, " - "total length is %u, the tag is '[%u, %u, %u], %d, %u'", - (uint32)(ptr >> 32), (uint32)ptr, *log_len, - tag->rnode.spcNode, tag->rnode.dbNode, tag->rnode.relNode, - tag->forkNum, tag->blockNum); + "total length is %u, the tag is " POLAR_LOG_BUFFER_TAG_FORMAT, + (uint32)(ptr >> 32), (uint32)ptr, *log_len, POLAR_LOG_BUFFER_TAG(tag)); return true; } diff --git a/src/backend/polar_flashback/polar_flashback_log_insert.c b/src/backend/polar_flashback/polar_flashback_log_insert.c deleted file mode 100644 index 34ad1cb5048..00000000000 --- a/src/backend/polar_flashback/polar_flashback_log_insert.c +++ /dev/null @@ -1,326 +0,0 @@ -/*------------------------------------------------------------------------- - * - * polar_flashback_log_insert.c - * - * -<<<<<<< HEAD - * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * Portions Copyright (c) 2021, Alibaba Group Holding limited -======= - * Copyright (c) 2020, Alibaba Group Holding Limited - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
->>>>>>> logindex_flashback_datamax_opensource - * - * IDENTIFICATION - * src/backend/polar_flashback/polar_flashback_log_insert.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "access/xlog.h" -#include "access/xloginsert.h" -#include "common/pg_lzcompress.h" -#include "miscadmin.h" -#include "polar_flashback/polar_flashback_log_insert.h" -#include "polar_flashback/polar_flashback_log_list.h" -#include "polar_flashback/polar_flashback_point.h" -#include "storage/buf_internals.h" -#include "storage/bufpage.h" -#include "storage/checksum.h" -#include "utils/guc.h" -#include "utils/memutils.h" - -/* Buffer size required to store a compressed version of backup block image */ -#define PGLZ_MAX_BLCKSZ PGLZ_MAX_OUTPUT(BLCKSZ) - -/* GUCs */ -bool polar_flashback_log_debug; - -static void -log_flog_rec(flog_record *record, polar_flog_rec_ptr ptr) -{ -#define MAX_EXTRA_INFO_SIZE 512 - - char extra_info[MAX_EXTRA_INFO_SIZE]; - fl_origin_page_rec_data *info; - - switch (record->xl_rmid) - { - case ORIGIN_PAGE_ID: - info = FL_GET_ORIGIN_PAGE_REC_DATA(record); - snprintf(extra_info, MAX_EXTRA_INFO_SIZE, "It is a origin page record, " - "the origin page is %s page. The redo lsn of the origin page is %X/%X, " - "the page tag is [%u, %u, %u], %d, %u", - (record->xl_info == ORIGIN_PAGE_EMPTY ? "empty" : "not empty"), - (uint32)(info->redo_lsn >> 32), (uint32)(info->redo_lsn), - info->tag.rnode.spcNode, info->tag.rnode.dbNode, info->tag.rnode.relNode, - info->tag.forkNum, info->tag.blockNum); - break; - - default: - /*no cover begin*/ - elog(ERROR, "The type of the record %X/%08X is wrong\n", - (uint32)(ptr >> 32), (uint32)ptr); - break; - /*no cover end*/ - } - - elog(LOG, "Insert a flashback log record at %X/%X: total length is %u, " - "the previous pointer is %X/%X. 
%s", - (uint32)(ptr >> 32), (uint32)ptr, record->xl_tot_len, - (uint32)(record->xl_prev >> 32), (uint32)(record->xl_prev), extra_info); -} - -/* - * Assemble a flashback log record from the buffers into an - * polar_flashback_log_record, ready for insertion with - * polar_flashback_log_insert_record(). - * - * tag is the buffer tag of buffer which is a origin page. - * redo_ptr is the last checkpoint XLOG record pointer. - * - * Return the flashback log record which palloc in this function. - * The caller can switch the memory context and pfree. - * - * The record will be contain the (compressed) block data. - * The record header fields are filled in, except for the CRC field and - * previous pointer. - * - */ -static flog_record * -flog_rec_assemble(flog_insert_context insert_context) -{ - bool include_origin_page = false; - Page page = NULL; - fl_rec_img_header b_img = {0, 0, 0}; - fl_rec_img_comp_header cb_img = {0}; - flog_record *rec; - char data[PGLZ_MAX_BLCKSZ]; - uint16 data_len = BLCKSZ; - uint32 xl_tot_len = FLOG_REC_HEADER_SIZE; - bool is_empty_page = true; - bool is_compressed = false; - bool from_origin_buf = false; - - if (insert_context.rmgr == ORIGIN_PAGE_ID) - include_origin_page = true; - - if (include_origin_page) - { - BufferTag *tag; - bool need_checksum_again; - - need_checksum_again = from_origin_buf = insert_context.info & FROM_ORIGIN_BUF; - tag = insert_context.buf_tag; - page = insert_context.origin_page; - xl_tot_len += FL_ORIGIN_PAGE_REC_INFO_SIZE; - - if (!PageIsNew(page) && !polar_page_is_just_inited(page)) - { - uint16 lower; - uint16 upper; - - /* Assume we can omit data between pd_lower and pd_upper */ - lower = ((PageHeader) page)->pd_lower; - upper = ((PageHeader) page)->pd_upper; - - if (lower >= SizeOfPageHeaderData && - upper > lower && - upper <= BLCKSZ) - { - b_img.hole_offset = lower; - b_img.bimg_info |= IMAGE_HAS_HOLE; - cb_img.hole_length = upper - lower; - /* - * Check the checksum before compute it again, so we will - * 
not change the rightness of checksum. - * - * When it is from origin buffer, the checksum may be wrong, - * so we don't check the pages from origin page buffer. - * - * We do nothing while the checksum is wrong here, but - * the decoder will verify the page. - */ - if (!from_origin_buf) - need_checksum_again = - (pg_checksum_page((char *) page, tag->blockNum) == ((PageHeader) page)->pd_checksum); - } - else - { - /* No "hole" to compress out */ - b_img.hole_offset = 0; - cb_img.hole_length = 0; - } - - /* Compute checksum again */ - if (need_checksum_again && DataChecksumsEnabled()) - { - MemSet((char *)page + b_img.hole_offset, 0, cb_img.hole_length); - ((PageHeader) page)->pd_checksum = pg_checksum_page((char *) page, tag->blockNum); - } - - /* Try to compress flashback log */ - is_compressed = polar_compress_block_in_log(page, b_img.hole_offset, cb_img.hole_length, - data, &data_len, FL_REC_IMG_COMP_HEADER_SIZE); - - if (is_compressed) - { - b_img.bimg_info |= IMAGE_IS_COMPRESSED; - b_img.length = data_len; - - if (cb_img.hole_length != 0) - xl_tot_len += FL_REC_IMG_COMP_HEADER_SIZE; - } - else - b_img.length = BLCKSZ - cb_img.hole_length; - - xl_tot_len += FL_REC_IMG_HEADER_SIZE + b_img.length; - is_empty_page = false; - } - } - - /* Construct the flashback log record */ - rec = polar_palloc_in_crit(xl_tot_len); - rec->xl_tot_len = xl_tot_len; - rec->xl_prev = POLAR_INVALID_FLOG_REC_PTR; - rec->xl_rmid = insert_context.rmgr; - rec->xl_info = insert_context.info; - rec->xl_xid = 0; - - if (include_origin_page) - { - fl_origin_page_rec_data rec_data; - char *scratch = (char *)rec + FLOG_REC_HEADER_SIZE; - - /* Copy the record data for the origin page. 
*/ - rec_data.redo_lsn = insert_context.redo_lsn; - INIT_BUFFERTAG(rec_data.tag, insert_context.buf_tag->rnode, - insert_context.buf_tag->forkNum, insert_context.buf_tag->blockNum); - memcpy(scratch, &rec_data, FL_ORIGIN_PAGE_REC_INFO_SIZE); - scratch += FL_ORIGIN_PAGE_REC_INFO_SIZE; - - if (is_empty_page) - { - Assert(xl_tot_len == (FLOG_REC_HEADER_SIZE + FL_ORIGIN_PAGE_REC_INFO_SIZE)); - rec->xl_info = ORIGIN_PAGE_EMPTY; - } - else - { - Assert(xl_tot_len >= FLOG_REC_HEADER_SIZE + FL_ORIGIN_PAGE_REC_INFO_SIZE + - FL_REC_IMG_HEADER_SIZE + b_img.length); - - rec->xl_info = ORIGIN_PAGE_FULL; - memcpy(scratch, &b_img, FL_REC_IMG_HEADER_SIZE); - scratch += FL_REC_IMG_HEADER_SIZE; - - if (is_compressed) - { - if (cb_img.hole_length != 0) - { - memcpy(scratch, &cb_img, FL_REC_IMG_COMP_HEADER_SIZE); - scratch += FL_REC_IMG_COMP_HEADER_SIZE; - Assert(xl_tot_len == FLOG_REC_HEADER_SIZE + FL_ORIGIN_PAGE_REC_INFO_SIZE + - FL_REC_IMG_HEADER_SIZE + FL_REC_IMG_COMP_HEADER_SIZE + b_img.length); - } - - memcpy(scratch, data, b_img.length); - } - else - { - Assert(xl_tot_len == FLOG_REC_HEADER_SIZE + FL_ORIGIN_PAGE_REC_INFO_SIZE + - FL_REC_IMG_HEADER_SIZE + b_img.length); - - if (cb_img.hole_length != 0) - { - Assert(b_img.length < BLCKSZ); - Assert(b_img.hole_offset >= SizeOfPageHeaderData); - - memcpy(scratch, (char *)page, b_img.hole_offset); - scratch += b_img.hole_offset; - memcpy(scratch, (char *)page + b_img.hole_offset + cb_img.hole_length, - b_img.length - b_img.hole_offset); - } - else - { - Assert(b_img.length == BLCKSZ); - memcpy(scratch, (char *)page, b_img.length); - } - } - } - - /* Record the page come from origin buffer */ - if (from_origin_buf) - rec->xl_info |= FROM_ORIGIN_BUF; - } - - return rec; -} - -/* - * POLAR: Insert a flashback log record to flashback log shared buffer. - * - * insert_context: Everything about the insertion. 
- */ -polar_flog_rec_ptr -polar_flog_insert_into_buffer(flog_buf_ctl_t buf_ctl, flog_index_queue_ctl_t queue_ctl, - flog_insert_context insert_context) -{ - polar_flog_rec_ptr start_ptr = POLAR_INVALID_FLOG_REC_PTR; - polar_flog_rec_ptr end_ptr = POLAR_INVALID_FLOG_REC_PTR; - flog_record *rec = NULL; - - /* Assemble the flashback log record without the previous pointer and the CRC field */ - rec = flog_rec_assemble(insert_context); - /* - * Fill the previous pointer and the CRC field and insert the flashback log record to flashback log - * shared buffers. - */ - end_ptr = polar_flog_rec_insert(buf_ctl, queue_ctl, rec, &start_ptr); - - if (unlikely(polar_flashback_log_debug)) - log_flog_rec(rec, start_ptr); - - pfree(rec); - - return end_ptr; -} - -polar_flog_rec_ptr -polar_insert_buf_flog_rec(flog_buf_ctl_t buf_ctl, flog_index_queue_ctl_t queue_ctl, BufferTag *tag, - XLogRecPtr redo_lsn, XLogRecPtr fbpoint_lsn, uint8 info, Page origin_page, bool from_origin_buf) -{ - flog_insert_context insert_context; - - /* - * It is candidate, check it in here. 
- */ - if ((info & FLOG_LIST_SLOT_CANDIDATE) && PageGetLSN(origin_page) > fbpoint_lsn) - return POLAR_INVALID_FLOG_REC_PTR; - - Assert(PageGetLSN(origin_page) <= fbpoint_lsn); - - /* Construct the insert context */ - insert_context.buf_tag = tag; - insert_context.origin_page = (Page)origin_page; - insert_context.redo_lsn = redo_lsn; - insert_context.rmgr = ORIGIN_PAGE_ID; - /* This will be update in the flashback log record */ - insert_context.info = ORIGIN_PAGE_FULL; - - if (from_origin_buf) - insert_context.info |= FROM_ORIGIN_BUF; - - return polar_flog_insert_into_buffer(buf_ctl, queue_ctl, insert_context); -} diff --git a/src/backend/polar_flashback/polar_flashback_log_list.c b/src/backend/polar_flashback/polar_flashback_log_list.c index 2fd06d01077..52180f2e48a 100644 --- a/src/backend/polar_flashback/polar_flashback_log_list.c +++ b/src/backend/polar_flashback/polar_flashback_log_list.c @@ -12,16 +12,15 @@ * *------------------------------------------------------------------------- */ - #include "postgres.h" +#include "access/polar_log.h" #include "access/polar_logindex_redo.h" #include "access/xlog.h" #include "access/xlog_internal.h" #include "miscadmin.h" #include "pgstat.h" -#include "polar_flashback/polar_flashback_log_insert.h" -#include "polar_flashback/polar_flashback_log_list.h" +#include "polar_flashback/polar_flashback_log.h" #include "polar_flashback/polar_flashback_point.h" #include "port/atomics.h" #include "postmaster/startup.h" @@ -39,14 +38,14 @@ int polar_flashback_log_sync_buf_timeout; void polar_clean_origin_buf_bit(flog_list_ctl_t ctl, int buf_id, int8 origin_buf_index) { - int array_id = get_origin_buf_array_id(origin_buf_index); + int array_id = GET_ORIGIN_BUF_ARRAY_ID(origin_buf_index); + int8 bit_index = GET_ORIGIN_BUF_INDEX(origin_buf_index); /* Clean the buffer and release */ MemSet(ctl->origin_buf + origin_buf_index * BLCKSZ, 0, BLCKSZ); CLEAR_BUFFERTAG(ctl->buf_tag[origin_buf_index]); - Assert(((get_origin_buf_bit(ctl, 
origin_buf_index)) & 1) == 1); - pg_atomic_fetch_and_u32(&ctl->origin_buf_bitmap[array_id], ~((uint32) 1 << origin_buf_index)); - Assert(((get_origin_buf_bit(ctl, origin_buf_index)) & 1) == 0); + Assert(((GET_ORIGIN_BUF_BIT(ctl, origin_buf_index)) & 1) == 1); + pg_atomic_fetch_and_u32(&ctl->origin_buf_bitmap[array_id], ~((uint32) 1 << bit_index)); Assert(ctl->flashback_list[buf_id].origin_buf_index != INVAILD_ORIGIN_BUF_INDEX); ctl->flashback_list[buf_id].origin_buf_index = INVAILD_ORIGIN_BUF_INDEX; @@ -67,7 +66,7 @@ add_buf_to_list(int buf_id, uint8 info, flog_list_ctl_t list_ctl, flog_buf_ctl_t Assert(buf_id < NBuffers && buf_id >= 0); Assert(list_ctl->flashback_list[buf_id].info == FLOG_LIST_SLOT_EMPTY); - lsn = polar_get_prior_fbpoint_lsn(buf_ctl); + lsn = buf_ctl->redo_lsn; Assert(!XLogRecPtrIsInvalid(lsn)); list_ctl->flashback_list[buf_id].redo_lsn = lsn; list_ctl->flashback_list[buf_id].fbpoint_lsn = polar_get_local_fbpoint_lsn(buf_ctl, InvalidXLogRecPtr, InvalidXLogRecPtr); @@ -103,13 +102,8 @@ add_buf_to_list(int buf_id, uint8 info, flog_list_ctl_t list_ctl, flog_buf_ctl_t { BufferDesc *buf_hdr = GetBufferDescriptor(buf_id); - elog(LOG, "Insert the page [%u, %u, %u], %u, %u in buffer %d into flashback log async list, " - "its prev is %d, its redo lsn is %ld", - buf_hdr->tag.rnode.spcNode, - buf_hdr->tag.rnode.dbNode, - buf_hdr->tag.rnode.relNode, - buf_hdr->tag.forkNum, - buf_hdr->tag.blockNum, + elog(LOG, "Insert the page " POLAR_LOG_BUFFER_TAG_FORMAT " in buffer %d into flashback log async list, " + "its prev is %d, its redo lsn is %ld", POLAR_LOG_BUFFER_TAG(&(buf_hdr->tag)), buf_id, prev, lsn); } } @@ -147,11 +141,10 @@ remove_buf_from_list(int buf_id, flog_list_ctl_t ctl) SpinLockRelease(&ctl->info_lck); + /* Read the prev_buf and next buf first and set */ + pg_read_barrier(); ctl->flashback_list[buf_id].prev_buf = NOT_IN_FLOG_LIST; ctl->flashback_list[buf_id].next_buf = NOT_IN_FLOG_LIST; - - /* Make sure the insert of the slot becomes visible to 
others. */ - pg_write_barrier(); pg_atomic_fetch_add_u64(&ctl->remove_total_num, 1); if (unlikely(polar_flashback_log_debug)) @@ -237,7 +230,7 @@ polar_wait_buf_flog_rec_insert(BufferDesc *buf_hdr) GetCurrentTimestamp(), polar_flashback_log_sync_buf_timeout)) return false; } - while (is_buf_in_flog_list(buf_hdr)); + while (IS_BUF_IN_FLOG_LIST(buf_hdr)); return true; } @@ -278,12 +271,8 @@ read_origin_page_from_file(BufferTag *tag, char *origin_page) { if (unlikely(polar_flashback_log_debug)) { - elog(LOG, "The origin page of [%u, %u, %u], %u, %u is a empty page", - rnode.spcNode, - rnode.dbNode, - rnode.relNode, - fork_num, - blkno); + elog(LOG, "The origin page of " POLAR_LOG_BUFFER_TAG_FORMAT " is a empty page", + POLAR_LOG_BUFFER_TAG(tag)); } return; @@ -296,12 +285,8 @@ read_origin_page_from_file(BufferTag *tag, char *origin_page) * record. */ if (unlikely(polar_flashback_log_debug)) - elog(LOG, "Read the origin page [%u, %u, %u], %u, %u from file", - rnode.spcNode, - rnode.dbNode, - rnode.relNode, - fork_num, - blkno); + elog(LOG, "Read the origin page " POLAR_LOG_BUFFER_TAG_FORMAT " from file", + POLAR_LOG_BUFFER_TAG(tag)); } /* @@ -313,14 +298,12 @@ read_origin_page_from_file(BufferTag *tag, char *origin_page) * The background worker will insert the flashback log record of async list * head buffer, but the buffer can be removed by backend in very small cases. * And in very small cases, the normal backend check the buffer is - * in the list by is_buf_in_flog_list, but it may remove by the background. + * in the list by IS_BUF_IN_FLOG_LIST, but it may remove by the background. * * The background worker will report the error when the result is false. 
*/ -static bool -insert_buf_flog_rec_from_list(flog_list_ctl_t list_ctl, flog_buf_ctl_t buf_ctl, - flog_index_queue_ctl_t queue_ctl, BufferDesc *buf_hdr, - bool is_background, bool is_validate) +bool +polar_process_buf_flog_list(flog_ctl_t instance, BufferDesc *buf_hdr, bool is_background, bool invalidate) { XLogRecPtr redo_lsn; XLogRecPtr fbpoint_lsn; @@ -331,6 +314,7 @@ insert_buf_flog_rec_from_list(flog_list_ctl_t list_ctl, flog_buf_ctl_t buf_ctl, bool result = false; int8 origin_buf_index; PGAlignedBlock origin_page; + flog_list_ctl_t list_ctl = instance->list_ctl; buf_id = buf_hdr->buf_id; /* Wait the buffer inserting flashback log finished */ @@ -348,8 +332,8 @@ insert_buf_flog_rec_from_list(flog_list_ctl_t list_ctl, flog_buf_ctl_t buf_ctl, /* Remove the buffer to let others go */ remove_buf_from_list(buf_id, list_ctl); - /* When the buffer is validate, just clean everything */ - if (!is_validate) + /* When invalidate the buffer, just clean everything */ + if (!invalidate) { bool from_origin_buf = false; @@ -364,7 +348,7 @@ insert_buf_flog_rec_from_list(flog_list_ctl_t list_ctl, flog_buf_ctl_t buf_ctl, read_origin_page_from_file(&buf_hdr->tag, origin_page.data); /* Insert the flashback log for the buffer */ - ptr = polar_insert_buf_flog_rec(buf_ctl, queue_ctl, &buf_hdr->tag, redo_lsn, + ptr = polar_insert_buf_flog_rec(instance, &buf_hdr->tag, redo_lsn, fbpoint_lsn, info, origin_page.data, from_origin_buf); list_ctl->flashback_list[buf_id].flashback_ptr = ptr; } @@ -380,34 +364,24 @@ insert_buf_flog_rec_from_list(flog_list_ctl_t list_ctl, flog_buf_ctl_t buf_ctl, if (unlikely(polar_flashback_log_debug)) elog(LOG, "Insert flashback log record from async list: " - "page [%u, %u, %u], %u, %u buffer(%d) by %s.", - buf_hdr->tag.rnode.spcNode, - buf_hdr->tag.rnode.dbNode, - buf_hdr->tag.rnode.relNode, - buf_hdr->tag.forkNum, - buf_hdr->tag.blockNum, buf_id, is_background ? 
"background worker" : "backend"); + "page " POLAR_LOG_BUFFER_TAG_FORMAT " buffer(%d) by %s.", + POLAR_LOG_BUFFER_TAG(&(buf_hdr->tag)), buf_id, + is_background ? "background worker" : "backend"); result = true; } else if (inserting_already) { if (unlikely(polar_flashback_log_debug)) - elog(LOG, "Wait to insert the page [%u, %u, %u], %u, %u flashback log record " - "by %s.", - buf_hdr->tag.rnode.spcNode, - buf_hdr->tag.rnode.dbNode, - buf_hdr->tag.rnode.relNode, - buf_hdr->tag.forkNum, - buf_hdr->tag.blockNum, is_background ? "backend" : "background worker"); + elog(LOG, "Wait to insert the page " POLAR_LOG_BUFFER_TAG_FORMAT " flashback log record " + "by %s.", POLAR_LOG_BUFFER_TAG(&(buf_hdr->tag)), + is_background ? "backend" : "background worker"); /* The background worker just return */ if (!is_background && !polar_wait_buf_flog_rec_insert(buf_hdr)) elog(ERROR, "Cancel the process due to wait the flashback log of the page " "([%u, %u, %u]), %u, %u inserting timeout. Please enlarge the " - "guc polar_flashback_log_sync_buf_timeout", - buf_hdr->tag.rnode.spcNode, buf_hdr->tag.rnode.dbNode, - buf_hdr->tag.rnode.relNode, buf_hdr->tag.forkNum, - buf_hdr->tag.blockNum); + "guc polar_flashback_log_sync_buf_timeout", POLAR_LOG_BUFFER_TAG(&(buf_hdr->tag))); result = true; } @@ -484,7 +458,7 @@ polar_push_buf_to_flog_list(flog_list_ctl_t ctl, flog_buf_ctl_t buf_ctl, Buffer * We are here means that the replay of the buffer has been done, * clean the POLAR_BUF_FLOG_DISABLE state and go on. 
*/ - if (polar_check_buf_flog_state(buf_hdr, POLAR_BUF_FLOG_DISABLE)) + if (POLAR_CHECK_BUF_FLOG_STATE(buf_hdr, POLAR_BUF_FLOG_DISABLE)) clean_buf_flog_state(buf_hdr, POLAR_BUF_FLOG_DISABLE); info = FLOG_LIST_SLOT_READY; @@ -497,31 +471,23 @@ polar_push_buf_to_flog_list(flog_list_ctl_t ctl, flog_buf_ctl_t buf_ctl, Buffer add_buf_to_list(buf_id, info, ctl, buf_ctl); } else if (unlikely(polar_flashback_log_debug)) - elog(LOG, "The page [%u, %u, %u], %u, %u buffer %d " - "is in flashback log list already.", - buf_hdr->tag.rnode.spcNode, - buf_hdr->tag.rnode.dbNode, - buf_hdr->tag.rnode.relNode, - buf_hdr->tag.forkNum, - buf_hdr->tag.blockNum, - buf_id); + elog(LOG, "The page " POLAR_LOG_BUFFER_TAG_FORMAT " buffer %d " + "is in flashback log list already.", POLAR_LOG_BUFFER_TAG(&(buf_hdr->tag)), buf_id); } /* * Insert a flashback log record from list head. * - * NB: Just one process to insert record from one flashback ring. - * Maybe the flashback ring can be many. + * NB: Just one process to insert record from list. */ void -polar_insert_flog_rec_from_list_bg(flog_list_ctl_t list_ctl, flog_buf_ctl_t buf_ctl, - flog_index_queue_ctl_t queue_ctl) +polar_process_flog_list_bg(flog_ctl_t instance) { BufferDesc *buf_hdr; int buf; /* Get the head with spin lock */ - buf = get_flog_list_head_with_lock(list_ctl); + buf = get_flog_list_head_with_lock(instance->list_ctl); Assert(buf < NBuffers); if (buf == NOT_IN_FLOG_LIST) @@ -535,32 +501,16 @@ polar_insert_flog_rec_from_list_bg(flog_list_ctl_t list_ctl, flog_buf_ctl_t buf_ * If it is failed, we check whether someone else has inserted it. * If not, just report a PANIC. 
*/ - if (!insert_buf_flog_rec_from_list(list_ctl, buf_ctl, queue_ctl, - buf_hdr, true, false) && buf == get_flog_list_head_with_lock(list_ctl)) + if (!polar_process_buf_flog_list(instance, buf_hdr, true, false) && + buf == get_flog_list_head_with_lock(instance->list_ctl)) { /* Check someone remove it or a memory PANIC */ - elog(PANIC, "The page [%u, %u, %u], %u, %u buffer(%d) is list head but not in list", - buf_hdr->tag.rnode.spcNode, - buf_hdr->tag.rnode.dbNode, - buf_hdr->tag.rnode.relNode, - buf_hdr->tag.forkNum, - buf_hdr->tag.blockNum, buf); + elog(PANIC, "The page " POLAR_LOG_BUFFER_TAG_FORMAT " buffer(%d) is list head but not in list", + POLAR_LOG_BUFFER_TAG(&(buf_hdr->tag)), buf); } /*no cover end*/ } -/* - * POLAR: Insert the flashback log record of the buffer by myself or - * wait the background to insert it. - */ -void -polar_insert_buf_flog_rec_sync(flog_list_ctl_t list_ctl, flog_buf_ctl_t buf_ctl, - flog_index_queue_ctl_t queue_ctl, BufferDesc *buf_hdr, bool is_validate) -{ - if (is_buf_in_flog_list(buf_hdr)) - insert_buf_flog_rec_from_list(list_ctl, buf_ctl, queue_ctl, buf_hdr, false, is_validate); -} - Size polar_flog_async_list_shmem_size(void) { @@ -705,7 +655,7 @@ polar_add_origin_buf(flog_list_ctl_t ctl, BufferDesc *buf_desc) /* * In some cases, like the buffer not flushed, keep the old one. 
*/ - if (is_buf_in_flog_list(buf_desc)) + if (IS_BUF_IN_FLOG_LIST(buf_desc)) /*no cover line*/ return INVAILD_ORIGIN_BUF_INDEX; @@ -715,12 +665,8 @@ polar_add_origin_buf(flog_list_ctl_t ctl, BufferDesc *buf_desc) ctl->flashback_list[buf_desc->buf_id].origin_buf_index = buf_index; if (unlikely(polar_flashback_log_debug)) - { - elog(LOG, "Add page [%u, %u, %u], %u, %u buffer(%d) in the origin buffer %d", - buf_desc->tag.rnode.spcNode, buf_desc->tag.rnode.dbNode, - buf_desc->tag.rnode.relNode, buf_desc->tag.forkNum, - buf_desc->tag.blockNum, buf_desc->buf_id, buf_index); - } + elog(LOG, "Add page " POLAR_LOG_BUFFER_TAG_FORMAT " buffer(%d) in the origin buffer %d", + POLAR_LOG_BUFFER_TAG(&(buf_desc->tag)), buf_desc->buf_id, buf_index); } return buf_index; diff --git a/src/backend/polar_flashback/polar_flashback_log_mem.c b/src/backend/polar_flashback/polar_flashback_log_mem.c index a783578769d..b0e628718f9 100644 --- a/src/backend/polar_flashback/polar_flashback_log_mem.c +++ b/src/backend/polar_flashback/polar_flashback_log_mem.c @@ -47,7 +47,7 @@ static LWLock *flog_write_lock = NULL; /* Macro to advance to next buffer index. */ #define FLOG_NEXT_BUF_IDX(idx, ctl) \ - (((idx) == ctl->cache_blck) ? 0 : ((idx) + 1)) + (((idx) == (ctl)->cache_blck) ? 0 : ((idx) + 1)) #define INSERT_FREESPACE(endptr) \ (((endptr) % POLAR_FLOG_BLCKSZ == 0) ? 
0 : (POLAR_FLOG_BLCKSZ - (endptr) % POLAR_FLOG_BLCKSZ)) @@ -328,7 +328,7 @@ flog_file_close(void) { char name[FLOG_MAX_FNAME_LEN]; /*no cover begin*/ - get_flog_fname(name, open_log_segno, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(name, open_log_segno, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); ereport(PANIC, (errcode_for_file_access(), errmsg("could not close log file %s: %m", name))); @@ -425,7 +425,7 @@ flog_write(flog_buf_ctl_t ctl, polar_flog_rec_ptr write_request, bool flexible) write_result = EndPtr; is_partial_page = write_request < write_result; - if (!ptr_prev_in_flog_seg(write_result, open_log_segno, + if (!FLOG_PTR_PREV_IN_SEG(write_result, open_log_segno, POLAR_FLOG_SEG_SIZE)) { /* @@ -437,7 +437,7 @@ flog_write(flog_buf_ctl_t ctl, polar_flog_rec_ptr write_request, bool flexible) if (open_log_file >= 0) flog_file_close(); - open_log_segno = flog_ptr_prev_to_seg(write_result, + open_log_segno = FLOG_PTR_PREV_TO_SEG(write_result, POLAR_FLOG_SEG_SIZE); /* create/use new log file */ @@ -453,7 +453,7 @@ flog_write(flog_buf_ctl_t ctl, polar_flog_rec_ptr write_request, bool flexible) /* Make sure we have the current logfile open */ if (open_log_file < 0) { - open_log_segno = flog_ptr_prev_to_seg(write_result, + open_log_segno = FLOG_PTR_PREV_TO_SEG(write_result, POLAR_FLOG_SEG_SIZE); /* create/use new log file */ use_existent = true; @@ -517,7 +517,7 @@ flog_write(flog_buf_ctl_t ctl, polar_flog_rec_ptr write_request, bool flexible) if (errno == EINTR) continue; - get_flog_fname(name, open_log_segno, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(name, open_log_segno, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); ereport(PANIC, (errcode_for_file_access(), errmsg("could not write to flashback log file %s " @@ -552,7 +552,7 @@ flog_write(flog_buf_ctl_t ctl, polar_flog_rec_ptr write_request, bool flexible) /*no cover begin*/ int save_errno = errno; char name[FLOG_MAX_FNAME_LEN]; - get_flog_fname(name, open_log_segno, 
POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(name, open_log_segno, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); polar_close(open_log_file); errno = save_errno; ereport(PANIC, @@ -1219,36 +1219,11 @@ polar_startup_flog_buf(flog_buf_ctl_t ctl, CheckPoint *checkpoint) polar_set_fbpoint_wal_info(ctl, checkpoint->redo, checkpoint->time, InvalidXLogRecPtr, false); ctl->fbpoint_info.wal_info = ctl->wal_info; /* There no flashback, so just set buffer ready */ - polar_set_flog_buf_state(ctl, FLOG_BUF_READY); + ctl->buf_state = FLOG_BUF_READY; } -} - -flog_buf_state -polar_get_flog_buf_state(flog_buf_ctl_t ctl) -{ - pg_read_barrier(); - return ctl->buf_state; -} - -void -polar_set_flog_buf_state(flog_buf_ctl_t ctl, flog_buf_state buf_state) -{ - ctl->buf_state = buf_state; - pg_write_barrier(); -} - -polar_flog_rec_ptr -polar_get_flog_min_recover_lsn(flog_buf_ctl_t ctl) -{ - pg_read_barrier(); - return ctl->min_recover_lsn; -} -void -polar_set_flog_min_recover_lsn(flog_buf_ctl_t ctl, polar_flog_rec_ptr ptr) -{ - ctl->min_recover_lsn = ptr; - pg_write_barrier(); + /* Set flashback log redo lsn to checkpoint redo lsn */ + ctl->redo_lsn = checkpoint->redo; } /* POLAR: Track and log flashback log service state. 
*/ @@ -1313,7 +1288,7 @@ polar_flog_rec_insert(flog_buf_ctl_t buf_ctl, flog_index_queue_ctl_t queue_ctl, pgstat_report_wait_start(WAIT_EVENT_FLASHBACK_LOG_BUF_READY); /* Check the flashback log shared buffer ready */ - if (!polar_is_flog_buf_ready(buf_ctl)) + if (!POLAR_IS_FLOG_BUF_READY(buf_ctl)) /*no cover line*/ elog(PANIC, "The flashback log buffer must be ready before insert flashback log record."); @@ -1527,7 +1502,7 @@ polar_flog_flush_bg(flog_buf_ctl_t ctl) { if (open_log_file >= 0) { - if (!ptr_prev_in_flog_seg(write_result, open_log_segno, + if (!FLOG_PTR_PREV_IN_SEG(write_result, open_log_segno, POLAR_FLOG_SEG_SIZE)) flog_file_close(); } @@ -1593,21 +1568,6 @@ polar_flog_flush(flog_buf_ctl_t ctl, polar_flog_rec_ptr end_ptr) } } -void -polar_flog_get_keep_wal_lsn(flog_buf_ctl_t ctl, XLogRecPtr *keep) -{ - XLogRecPtr fl_keep_wal_lsn; - - fl_keep_wal_lsn = ctl->keep_wal_lsn; - - if (fl_keep_wal_lsn == InvalidXLogRecPtr) - return; - else if (*keep != InvalidXLogRecPtr) - *keep = Min(*keep, fl_keep_wal_lsn); - else - *keep = fl_keep_wal_lsn; -} - char * polar_get_flog_dir(flog_buf_ctl_t ctl) { diff --git a/src/backend/polar_flashback/polar_flashback_log_reader.c b/src/backend/polar_flashback/polar_flashback_log_reader.c index 56f526c9100..ef4efc7c2cf 100644 --- a/src/backend/polar_flashback/polar_flashback_log_reader.c +++ b/src/backend/polar_flashback/polar_flashback_log_reader.c @@ -15,6 +15,7 @@ #include "postgres.h" #include "access/xlog.h" +#include "common/pg_lzcompress.h" #include "miscadmin.h" #include "pgstat.h" #include "polar_flashback/polar_flashback_log_file.h" @@ -26,12 +27,13 @@ static int read_file = -1; static uint64 read_segno = 0; static uint32 read_off = 0; static uint32 read_len = 0; +const char *flog_record_types[FLOG_REC_TYPES + 1] = FLOG_RECORD_TYPES; /* size of the buffer allocated for error message. */ #define MAX_ERRORMSG_LEN 1000 /* load the flashback log switch ptrs */ -#define load_switch_ptrs(dir, ptrs) \ - (ptrs != NIL? 
ptrs : polar_read_flog_history_file(dir)) +#define FLOG_LOAD_SWITCH_PTRS(dir, ptrs) \ + ((ptrs) != NIL? (ptrs) : polar_read_flog_history_file(dir)) static void report_invalid_flog_record(flog_reader_state *state, const char *fmt, ...) pg_attribute_printf(2, 3); @@ -117,7 +119,7 @@ is_flog_ptr_switch(polar_flog_rec_ptr ptr, polar_flog_rec_ptr prev_ptr, flog_rea return false; /* Check strictly */ - reader->switch_ptrs = load_switch_ptrs(reader->flog_buf_ctl->dir, reader->switch_ptrs); + reader->switch_ptrs = FLOG_LOAD_SWITCH_PTRS(reader->flog_buf_ctl->dir, reader->switch_ptrs); foreach (cell, reader->switch_ptrs) { @@ -213,17 +215,17 @@ flog_page_header_validate(flog_reader_state *state, Assert((recptr % POLAR_FLOG_BLCKSZ) == 0); - segno = flog_ptr_to_seg(recptr, state->segment_size); + segno = FLOG_PTR_TO_SEG(recptr, state->segment_size); offset = FLOG_SEGMENT_OFFSET(recptr, state->segment_size); - flog_seg_offset_to_ptr(segno, offset, state->segment_size, recaddr); + FLOG_SEG_OFFSET_TO_PTR(segno, offset, state->segment_size, recaddr); if (hdr->xlp_magic != FLOG_PAGE_MAGIC) { /*no cover begin*/ char fname[FLOG_MAX_FNAME_LEN]; - get_flog_fname(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); report_invalid_flog_record(state, "invalid magic number %04X in flashback log segment %s, offset %u", @@ -239,7 +241,7 @@ flog_page_header_validate(flog_reader_state *state, /*no cover begin*/ char fname[FLOG_MAX_FNAME_LEN]; - get_flog_fname(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); report_invalid_flog_record(state, "invalid version %04X in flashback log segment %s, offset %u", hdr->xlp_version, @@ -254,7 +256,7 @@ flog_page_header_validate(flog_reader_state *state, /*no cover begin*/ char fname[FLOG_MAX_FNAME_LEN]; - get_flog_fname(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); + 
FLOG_GET_FNAME(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); report_invalid_flog_record(state, "invalid info bits %04X in flashback log segment %s, offset %u", @@ -316,7 +318,7 @@ flog_page_header_validate(flog_reader_state *state, /*no cover begin*/ char fname[FLOG_MAX_FNAME_LEN]; - get_flog_fname(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); /* hmm, first page of file doesn't have a long header? */ report_invalid_flog_record(state, @@ -338,7 +340,7 @@ flog_page_header_validate(flog_reader_state *state, /*no cover begin*/ char fname[FLOG_MAX_FNAME_LEN]; - get_flog_fname(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); report_invalid_flog_record(state, "unexpected pageaddr %X/%X in flashback log segment %s, offset %u", @@ -374,7 +376,7 @@ read_flog_page_internal(flog_reader_state *state, flog_page_header hdr; Assert((page_ptr % POLAR_FLOG_BLCKSZ) == 0); - target_segno = flog_ptr_to_seg(page_ptr, state->segment_size); + target_segno = FLOG_PTR_TO_SEG(page_ptr, state->segment_size); target_pageoff = FLOG_SEGMENT_OFFSET(page_ptr, state->segment_size); /* check whether we have all the requested data already */ @@ -981,7 +983,7 @@ polar_flog_page_read(flog_reader_state *state, read_len = read_upto - target_page_ptr; } - target_seg_no = flog_ptr_to_seg(target_page_ptr, POLAR_FLOG_SEG_SIZE); + target_seg_no = FLOG_PTR_TO_SEG(target_page_ptr, POLAR_FLOG_SEG_SIZE); target_page_off = FLOG_SEGMENT_OFFSET(target_page_ptr, POLAR_FLOG_SEG_SIZE); /* @@ -989,22 +991,22 @@ polar_flog_page_read(flog_reader_state *state, * is not in the currently open one. 
*/ if (read_file >= 0 && - !ptr_in_flog_seg(target_page_ptr, read_segno, POLAR_FLOG_SEG_SIZE)) + !FLOG_PTR_IN_SEG(target_page_ptr, read_segno, POLAR_FLOG_SEG_SIZE)) { polar_close(read_file); read_file = -1; } - read_segno = flog_ptr_to_seg(target_page_ptr, POLAR_FLOG_SEG_SIZE); + read_segno = FLOG_PTR_TO_SEG(target_page_ptr, POLAR_FLOG_SEG_SIZE); if (read_file < 0) { - if (polar_is_flog_file_exist(state->flog_buf_ctl->dir, target_page_ptr, WARNING)) + if (polar_flog_file_exists(state->flog_buf_ctl->dir, target_page_ptr, WARNING)) read_file = polar_flog_file_open(read_segno, state->flog_buf_ctl->dir); else { /*no cover begin*/ - get_flog_fname(file_name, read_segno, + FLOG_GET_FNAME(file_name, read_segno, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); report_invalid_flog_record(state, "Can't find the flashback log segno file %s", file_name); @@ -1033,7 +1035,7 @@ polar_flog_page_read(flog_reader_state *state, /*no cover begin*/ pgstat_report_wait_end(); - get_flog_fname(fname, read_segno, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, read_segno, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); errno = save_errno; ereport(WARNING, (errcode_for_file_access(), @@ -1094,7 +1096,7 @@ polar_is_flog_rec_ignore(polar_flog_rec_ptr *ptr, uint32 log_len, flog_reader_st else end_ptr = polar_get_next_flog_ptr(*ptr, log_len); - reader->switch_ptrs = load_switch_ptrs(reader->flog_buf_ctl->dir, reader->switch_ptrs); + reader->switch_ptrs = FLOG_LOAD_SWITCH_PTRS(reader->flog_buf_ctl->dir, reader->switch_ptrs); foreach (cell, reader->switch_ptrs) { flog_history_entry *tle = (flog_history_entry *) lfirst(cell); @@ -1116,3 +1118,137 @@ polar_is_flog_rec_ignore(polar_flog_rec_ptr *ptr, uint32 log_len, flog_reader_st return false; } + +static bool +decode_origin_page(flog_record *rec, Page page, polar_flog_rec_ptr ptr) +{ + fl_origin_page_rec_data *rec_data; + fl_rec_img_header *img; + fl_rec_img_comp_header *c_img; + char *origin_page; + int record_data_len; + 
PGAlignedBlock tmp; + uint16 hole_length = 0; + + rec_data = FL_GET_ORIGIN_PAGE_REC_DATA(rec); + img = FL_GET_ORIGIN_PAGE_IMG_HEADER(rec); + origin_page = (char *)img + FL_REC_IMG_HEADER_SIZE; + record_data_len = img->length; + + if (img->bimg_info & IMAGE_IS_COMPRESSED) + { + if (img->bimg_info & IMAGE_HAS_HOLE) + { + c_img = (fl_rec_img_comp_header *) origin_page; + origin_page += FL_REC_IMG_COMP_HEADER_SIZE; + hole_length = c_img->hole_length; + } + + if (pglz_decompress(origin_page, record_data_len, tmp.data, + BLCKSZ - hole_length) < 0)/* POLAR Ganos: external detoast slice */ + { + /*no cover line*/ + elog(ERROR, "Invalid compressed origin page " POLAR_LOG_BUFFER_TAG_FORMAT + ", from flashback log at %X/%X", POLAR_LOG_BUFFER_TAG(&(rec_data->tag)), + (uint32)(ptr >> 32), (uint32) ptr); + } + + origin_page = tmp.data; + } + else if (img->bimg_info & IMAGE_HAS_HOLE) + hole_length = BLCKSZ - img->length; + + /* generate page, taking into account hole if necessary */ + if (hole_length == 0) + memcpy((char *)page, origin_page, BLCKSZ); + else + { + memcpy((char *)page, origin_page, img->hole_offset); + /* must zero-fill the hole */ + MemSet((char *)page + img->hole_offset, 0, hole_length); + memcpy((char *)page + (img->hole_offset + hole_length), + origin_page + img->hole_offset, + BLCKSZ - (img->hole_offset + hole_length)); + } + + /* Checksum again */ + if (!PageIsVerified(page, rec_data->tag.forkNum, rec_data->tag.blockNum, NULL)) + /*no cover line*/ + elog(ERROR, "The checksum of origin page " POLAR_LOG_BUFFER_TAG_FORMAT + ", from flashback log at %X/%X is wrong", POLAR_LOG_BUFFER_TAG(&(rec_data->tag)), + (uint32)(ptr >> 32), (uint32) ptr); + + return true; +} + +/* + * POLAR: Decode the flashback log record + */ +flog_record * +polar_decode_flog_rec_common(flog_reader_state *reader, polar_flog_rec_ptr ptr, RmgrId rm_id) +{ + flog_record *rec; + char *errormsg = NULL; + + Assert(reader); + /* Read the flashback log record until the flashback log is invalid */ + 
rec = polar_read_flog_record(reader, ptr, &errormsg); + + if (rec == NULL) + /*no cover line*/ + elog(ERROR, "The flashback log record at %X/%X is invaid with error: %s", + (uint32)(ptr >> 32), (uint32) ptr, errormsg); + else if (rec->xl_rmid != rm_id) + /*no cover line*/ + elog(ERROR, "The flashback log record at %X/%X expected is %s, but its rmid is %d now", + (uint32)(ptr >> 32), (uint32) ptr, flog_record_types[rm_id], rec->xl_rmid); + + return rec; +} + +/* + * POLAR: Decode the origin page flashback log record. + * Check the checkpoint lsn and crc field. + */ +bool +polar_decode_origin_page_rec(flog_reader_state *reader, polar_flog_rec_ptr ptr, Page page, + XLogRecPtr *redo_lsn, BufferTag *tag) +{ + uint8 info; + bool is_valid = false; + flog_record *rec; + + rec = polar_decode_flog_rec_common(reader, ptr, ORIGIN_PAGE_ID); + + Assert(rec->xl_rmid == ORIGIN_PAGE_ID); + + if (!BUFFERTAGS_EQUAL(FL_GET_ORIGIN_PAGE_REC_DATA(rec)->tag, *tag)) + /*no cover line*/ + elog(ERROR, "The buffer tag flashback log record at %X/%X is " POLAR_LOG_BUFFER_TAG_FORMAT + "not " POLAR_LOG_BUFFER_TAG_FORMAT, + (uint32)(ptr >> 32), (uint32) ptr, + POLAR_LOG_BUFFER_TAG(&(FL_GET_ORIGIN_PAGE_REC_DATA(rec)->tag)), POLAR_LOG_BUFFER_TAG(tag)); + + info = rec->xl_info; + + switch (info & ORIGIN_PAGE_TYPE_MASK) + { + case ORIGIN_PAGE_EMPTY: + is_valid = true; + PageInit(page, BLCKSZ, 0); + break; + + case ORIGIN_PAGE_FULL: + is_valid = decode_origin_page(rec, page, ptr); + break; + + default: + /*no cover line*/ + elog(ERROR, "Parse flashback log origin page rec: unknown information %u", info); + } + + if (is_valid) + *redo_lsn = FL_GET_ORIGIN_PAGE_REC_DATA(rec)->redo_lsn; + + return is_valid; +} diff --git a/src/backend/polar_flashback/polar_flashback_log_repair_page.c b/src/backend/polar_flashback/polar_flashback_log_repair_page.c deleted file mode 100644 index cfd333cc23b..00000000000 --- a/src/backend/polar_flashback/polar_flashback_log_repair_page.c +++ /dev/null @@ -1,161 +0,0 @@ 
-/*------------------------------------------------------------------------- - * - * polar_flashback_log_repair_page.c - * - * - * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group - * Portions Copyright (c) 1994, Regents of the University of California - * Portions Copyright (c) 2021, Alibaba Group Holding limited - * - * IDENTIFICATION - * src/backend/polar_flashback/polar_flashback_log_repair_page.c - * - *------------------------------------------------------------------------- - */ -#include "postgres.h" - -#include "access/polar_logindex_redo.h" -#include "miscadmin.h" -#include "polar_flashback/polar_flashback_log.h" -#include "polar_flashback/polar_flashback_log_decoder.h" -#include "polar_flashback/polar_flashback_log_index.h" -#include "polar_flashback/polar_flashback_log_repair_page.h" -#include "polar_flashback/polar_flashback_point.h" -#include "postmaster/startup.h" - -bool polar_has_partial_write; -/* POLAR: Just crash will cause partial write */ -static bool -may_be_partial_write(void) -{ - return AmStartupProcess() && !reachedConsistency; -} - -/* - * POLAR: Write the repaired buffer. - * - * NB: The buffer must be invaild, so write it without - * any lock is safe. - */ -static void -write_repaired_buf(BufferDesc *buf) -{ - SMgrRelation reln; - Block buf_block; - char *buf_write; - - Assert(pg_atomic_read_u32(&buf->polar_redo_state) & POLAR_BUF_FLOG_DISABLE); - - buf_block = BufHdrGetBlock(buf); - buf_write = PageEncryptCopy((Page) buf_block, buf->tag.forkNum, - buf->tag.blockNum); - buf_write = PageSetChecksumCopy((Page) buf_write, buf->tag.blockNum); - reln = smgropen(buf->tag.rnode, InvalidBackendId); - smgrwrite(reln, - buf->tag.forkNum, - buf->tag.blockNum, - buf_write, - false); -} - -/* - * POLAR: Get origin page to solve partial write problem. - * - * tag: The buffer tag. - * page: The origin page. - * replay_start_lsn: The replay start LSN. 
- */ -static bool -get_origin_page_for_partial_write(flog_ctl_t instance, Buffer *buf, BufferTag *tag) -{ - polar_flog_rec_ptr start_ptr; - polar_flog_rec_ptr end_ptr; - XLogRecPtr start_lsn; - XLogRecPtr end_lsn; - flog_buf_ctl_t buf_ctl = instance->buf_ctl; - - start_ptr = polar_get_fbpoint_start_ptr(buf_ctl); - end_ptr = polar_get_flog_write_result(buf_ctl); - Assert(end_ptr >= start_ptr); - - /* There is no flashback log */ - if (FLOG_REC_PTR_IS_INVAILD(end_ptr)) - return false; - - start_lsn = polar_get_curr_fbpoint_lsn(buf_ctl); - end_lsn = GetRedoRecPtr(); - - return polar_flashback_buffer(instance, buf, tag, start_ptr, end_ptr, start_lsn, end_lsn, ERROR, true); -} - -/* - * POLAR: The flashback log can repair the PERMANENT buffer - * when it meet a partial write. - */ -bool -polar_can_flog_repair(flog_ctl_t instance, BufferDesc *buf_hdr, bool has_redo_action) -{ - uint32 buf_state; - - if (!polar_is_flog_enabled(instance)) - return false; - - if (may_be_partial_write() || has_redo_action) - { - buf_state = pg_atomic_read_u32(&buf_hdr->state); - return buf_state & BM_PERMANENT; - } - - return false; -} - -/* - * To repair the partial write problem. - * Partial write problem will occur in three scenarios: - * 1. RW crash recovery. - * 2. Standby crash recovery. - * 3. RO to RW online promote. 
- */ -void -polar_repair_partial_write(flog_ctl_t instance, BufferDesc *bufHdr) -{ - BufferTag *tag = &bufHdr->tag; - Buffer buf; - - Assert((pg_atomic_read_u32(&bufHdr->state) & BM_VALID) == 0); - buf = bufHdr->buf_id + 1; - - /* Wait for the flashback logindex ready */ - while (!polar_is_flog_ready(instance)) - { - /* Handle interrupt signals of startup process to avoid hang */ - if (AmStartupProcess()) - HandleStartupProcInterrupts(); - else - CHECK_FOR_INTERRUPTS(); - - pg_usleep(1000L); - } - - if (!get_origin_page_for_partial_write(instance, &buf, tag)) - { - elog(ERROR, "Can't find a valid origin page for ([%u, %u, %u]), %u, %u from flashback log", - tag->rnode.spcNode, - tag->rnode.dbNode, - tag->rnode.relNode, - tag->forkNum, - tag->blockNum); - } - else - { - /* Flush the buffer to protect the next first modify after checkpoint. */ - write_repaired_buf(bufHdr); - - elog(LOG, "The page ([%u, %u, %u]), %u, %u has been repaired by flashback log", - tag->rnode.spcNode, - tag->rnode.dbNode, - tag->rnode.relNode, - tag->forkNum, - tag->blockNum); - } -} diff --git a/src/backend/polar_flashback/polar_flashback_log_worker.c b/src/backend/polar_flashback/polar_flashback_log_worker.c index 795d7e30c91..d249b7922eb 100644 --- a/src/backend/polar_flashback/polar_flashback_log_worker.c +++ b/src/backend/polar_flashback/polar_flashback_log_worker.c @@ -34,6 +34,12 @@ #include "utils/polar_coredump.h" #include "utils/resowner.h" +#define WAKEUP_FLOG_BG_WORKER(latch) \ + do { \ + if (latch) \ + SetLatch(latch); \ + } while (0) + /* GUCs */ int polar_flashback_log_bgwrite_delay; int polar_flashback_log_insert_list_delay; @@ -71,6 +77,14 @@ bg_shutdown_handler(SIGNAL_ARGS) errno = save_errno; } +static void +set_flog_bgwriter_latch(flog_ctl_t instance) +{ + Assert(MyLatch); + + instance->bgwriter_latch = MyLatch; +} + void polar_flog_bgwriter_main(void) { @@ -134,6 +148,8 @@ polar_flog_bgwriter_main(void) ALLOCSET_DEFAULT_SIZES); MemoryContextSwitchTo(bgworker_context); + 
set_flog_bgwriter_latch(flog_instance); + /* * If an exception is encountered, processing resumes here. * @@ -248,6 +264,9 @@ polar_flog_bgwriter_main(void) if (!polar_is_flog_ready(flog_instance)) polar_recover_flog(flog_instance); + /* insert the logindex first */ + polar_flog_index_insert(snapshot, queue_ctl, buf_ctl, polar_get_flog_write_result(buf_ctl), ANY); + ptr_expected = polar_flog_flush_bg(buf_ctl); if (!FLOG_REC_PTR_IS_INVAILD(ptr_expected)) @@ -275,9 +294,7 @@ polar_flog_bginserter_main(void) { sigjmp_buf local_sigjmp_buf; MemoryContext bgworker_context; - flog_buf_ctl_t buf_ctl = flog_instance->buf_ctl; flog_list_ctl_t list_ctl = flog_instance->list_ctl; - flog_index_queue_ctl_t queue_ctl = flog_instance->queue_ctl; /* * Properly accept or ignore signals the postmaster might send us. @@ -445,7 +462,7 @@ polar_flog_bginserter_main(void) insert_num = 0; do { - polar_insert_flog_rec_from_list_bg(list_ctl, buf_ctl, queue_ctl); + polar_process_flog_list_bg(flog_instance); polar_flog_get_async_list_info(list_ctl, &head, &tail); insert_num++; } @@ -465,3 +482,23 @@ polar_flog_bginserter_main(void) exit(1); } } +/* Completion check usable with polar_wait_flog_bgworker(): data points to a polar_flog_rec_ptr; true once the logindex max parsed lsn has reached it. */ +bool +polar_is_flog_index_inserted(flog_ctl_t instance, void *data) +{ + polar_flog_rec_ptr *ptr = (polar_flog_rec_ptr *) data; + + return polar_get_logindex_max_parsed_lsn(instance->logindex_snapshot) >= *ptr; +} +/* Wake the flashback log bgwriter once through its latch, then poll is_done(instance, extra_data) every 1000 microseconds, checking for interrupts between polls. */ +void +polar_wait_flog_bgworker(flog_ctl_t instance, flog_bg_worker_done is_done, void *extra_data) +{ + WAKEUP_FLOG_BG_WORKER(instance->bgwriter_latch); + + while (!is_done(instance, extra_data)) + { + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000); + } +} diff --git a/src/backend/polar_flashback/polar_flashback_point.c b/src/backend/polar_flashback/polar_flashback_point.c index 56f2756a784..cf72a8e2a21 100644 --- a/src/backend/polar_flashback/polar_flashback_point.c +++ b/src/backend/polar_flashback/polar_flashback_point.c @@ -16,6 +16,8 @@ #include "access/xlog.h" #include "access/xlog_internal.h" +#include "miscadmin.h" +#include 
"polar_flashback/polar_fast_recovery_area.h" #include "polar_flashback/polar_flashback_log_file.h" #include "polar_flashback/polar_flashback_log_index.h" #include "polar_flashback/polar_flashback_point.h" @@ -48,7 +50,7 @@ polar_set_fbpoint_wal_info(flog_buf_ctl_t buf_ctl, XLogRecPtr fbpoint_lsn, if ((fbpoint_lsn > prior_fbpoint_lsn) && (!is_restart_point || (bg_replayed_lsn >= prior_fbpoint_lsn))) { - buf_ctl->keep_wal_lsn = buf_ctl->wal_info.prior_fbpoint_lsn = prior_fbpoint_lsn; + buf_ctl->wal_info.prior_fbpoint_lsn = prior_fbpoint_lsn; buf_ctl->wal_info.fbpoint_lsn = fbpoint_lsn; buf_ctl->wal_info.fbpoint_time = fbpoint_time; } @@ -105,6 +107,402 @@ polar_get_local_fbpoint_lsn(flog_buf_ctl_t buf_ctl, XLogRecPtr page_lsn, XLogRec return local_fbpoint_lsn; } +void +polar_get_fbpoint_file_path(uint32 seg_no, const char *fra_dir, char *path) +{ + char relative_path[MAXPGPATH]; + /* Relative name is <fra_dir>/<FBPOINT_DIR>/<seg_no as 8 hex digits>; polar_make_file_path_level2() turns it into path */ + snprintf(relative_path, MAXPGPATH, "%s/%s/%08X", fra_dir, FBPOINT_DIR, seg_no); + + polar_make_file_path_level2(path, relative_path); +} +/* Open the flashback point segment file read-write, creating and zero-filling it when it does not exist; returns the fd, or a negative value with io_error->errcause and save_errno filled in. */ +static int +fbpoint_file_init(const char *fra_dir, uint32 seg_no, fbpoint_io_error_t *io_error) +{ + char path[MAXPGPATH]; + int fd; + + polar_get_fbpoint_file_path(seg_no, fra_dir, path); + fd = BasicOpenFile(path, O_RDWR | PG_BINARY, true); + + if (fd < 0) + { + if (errno == ENOENT) + { + fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, true); + + if (fd < 0) + { + io_error->errcause = FBPOINT_FILE_CREATE_FAILED; + io_error->save_errno = errno; + } + else + { + polar_fill_segment_file_zero(fd, path, FBPOINT_SEG_SIZE, + WAIT_EVENT_FLASHBACK_POINT_FILE_WRITE, WAIT_EVENT_FLASHBACK_POINT_FILE_SYNC, + "checkpoint info"); + elog(DEBUG2, "done creating and filling new flashback point file %s", path); + } + } + else + { + io_error->errcause = FBPOINT_FILE_OPEN_FAILED; + io_error->save_errno = errno; + } + } + + return fd; +} +/* CRC32C of one flashback point page: the bytes after the page header first, then the header bytes that precede its crc field. */ +static pg_crc32c +fbpoint_page_comp_crc(fbpoint_page_header_t *header) +{ + pg_crc32c crc; + + INIT_CRC32C(crc); + 
COMP_CRC32C(crc, (char *) header + FBPOINT_PAGE_HEADER_SIZE, FBPOINT_PAGE_SIZE - FBPOINT_PAGE_HEADER_SIZE); + COMP_CRC32C(crc, (char *) header, offsetof(fbpoint_page_header_t, crc)); + FIN_CRC32C(crc); + + return crc; +} +/* Copy rec_data into the slot of record ctl->next_fbpoint_rec_no in the record buffer, zeroing the buffer first when that record starts a new segment. */ +static void +insert_fbpoint_rec(fpoint_ctl_t ctl, fbpoint_rec_data_t *rec_data) +{ + fbpoint_rec_data_t *start_pos; + uint64 fbpoint_rec_no; + + fbpoint_rec_no = ctl->next_fbpoint_rec_no; + + /* Update the flashback point record, hold the exclusive lock to protect the memory */ + LWLockAcquire(&ctl->fbpoint_rec_buf_lock, LW_EXCLUSIVE); + + /* A new segment, so reset the buffer */ + if (fbpoint_rec_no % FBPOINT_REC_PER_SEG == 0) + MemSet(ctl->fbpoint_rec_buf, 0, FBPOINT_REC_END_POS); + + start_pos = (fbpoint_rec_data_t *) FBPOINT_GET_OFFSET_BY_REC_NO(ctl->fbpoint_rec_buf, fbpoint_rec_no); + memcpy(start_pos, rec_data, sizeof(fbpoint_rec_data_t)); + + LWLockRelease(&ctl->fbpoint_rec_buf_lock); +} + +/* + * Search for the newest flashback point record whose time is older than or equal to + * the target time within a segment. + * + * NB: The caller must make sure the minimal flashback point record is older than or equal to + * the target time. 
+ */ +static uint64 +search_right_fbpoint_rec(char *start_ptr, pg_time_t target_time, uint32 seg_no, + uint64 max_rec_no) +{ + uint64 l = 0; + uint64 r = 0; + fbpoint_rec_data_t *first_fbpoint_rec; + + l = seg_no * FBPOINT_REC_PER_SEG; + first_fbpoint_rec = (fbpoint_rec_data_t *) FBPOINT_GET_OFFSET_BY_REC_NO(start_ptr, l); + /* The time of the minal record will be older than or equals to the target time */ + Assert(first_fbpoint_rec->time <= target_time); + r = Min(max_rec_no, (seg_no + 1) * FBPOINT_REC_PER_SEG - 1); + + /* The first is which we want */ + if (unlikely(first_fbpoint_rec->time == target_time)) + return l; + + while (l < r) + { + uint64 mid = (l + r + 1) / 2; + fbpoint_rec_data_t *fbpoint_rec; + + fbpoint_rec = (fbpoint_rec_data_t *) FBPOINT_GET_OFFSET_BY_REC_NO(start_ptr, mid); + + if (fbpoint_rec->time < target_time) + l = mid; + else if (unlikely(fbpoint_rec->time == target_time)) + return mid; + else + { + /* Can't overflow in here */ + Assert(mid > 0); + r = mid - 1; + } + } + + return l; +} + +void +polar_fbpoint_report_io_error(const char *fra_dir, fbpoint_io_error_t *io_error, int log_level) +{ + char path[MAXPGPATH]; + + polar_get_fbpoint_file_path(io_error->segno, fra_dir, path); + errno = io_error->save_errno; + + switch (io_error->errcause) + { + case FBPOINT_FILE_CREATE_FAILED: + ereport(log_level, + (errcode_for_file_access(), + "Could not create file \"%s\": %m.", path)); + break; + + case FBPOINT_FILE_OPEN_FAILED: + ereport(log_level, + (errcode_for_file_access(), + "Could not open file \"%s\": %m.", path)); + break; + + case FBPOINT_FILE_READ_FAILED: + ereport(log_level, + (errcode_for_file_access(), + "Could not read from file \"%s\" at offset %u: %m.", + path, io_error->offset)); + break; + + case FBPOINT_FILE_WRITE_FAILED: + ereport(log_level, + (errcode_for_file_access(), + "Could not write to file \"%s\" at offset %u: %m.", + path, io_error->offset)); + break; + + case FBPOINT_FILE_FSYNC_FAILED: + 
ereport(data_sync_elevel(log_level), + (errcode_for_file_access(), + "Could not fsync file \"%s\": %m.", path)); + break; + + case FBPOINT_FILE_CLOSE_FAILED: + ereport(log_level, + (errcode_for_file_access(), + "Could not close file \"%s\": %m.", path)); + break; + + default: + /* can't get here, we trust */ + elog(PANIC, "unrecognized flashback point file io error cause: %d", (int) io_error->errcause); + break; + } +} + +bool +polar_read_fbpoint_file(const char *fra_dir, char *data, uint32 seg_no, uint32 offset, uint32 size, fbpoint_io_error_t *io_error) +{ + char path[MAXPGPATH]; + int fd; + ssize_t read_len; + + Assert(size <= FBPOINT_SEG_SIZE); + /* Init the io error */ + io_error->segno = seg_no; + io_error->size = size; + io_error->offset = offset; + + polar_get_fbpoint_file_path(seg_no, fra_dir, path); + fd = polar_open_transient_file(path, O_RDONLY | PG_BINARY); + + if (fd < 0) + { + /*no cover begin*/ + io_error->save_errno = errno; + io_error->errcause = FBPOINT_FILE_OPEN_FAILED; + return false; + /*no cover end*/ + } + + pgstat_report_wait_start(WAIT_EVENT_FLASHBACK_POINT_FILE_READ); + read_len = polar_pread(fd, data, size, offset); + + if (read_len != size) + { + /*no cover begin*/ + io_error->save_errno = errno; + io_error->errcause = FBPOINT_FILE_READ_FAILED; + io_error->io_return = read_len; + CloseTransientFile(fd); + return false; + /*no cover end*/ + } + + pgstat_report_wait_end(); + + if (CloseTransientFile(fd)) + { + /*no cover begin*/ + io_error->save_errno = errno; + io_error->errcause = FBPOINT_FILE_CLOSE_FAILED; + return false; + /*no cover end*/ + } + + return true; +} + +/* + * POLAR: Write and sync the flashback point file. + * + * NB: we use static fd and seg_no_open and use BasicOpenFile to avoid to be closed by vfd. 
+ */ +bool +polar_write_fbpoint_file(const char *fra_dir, char *data, uint32 seg_no, uint32 offset, uint32 size, fbpoint_io_error_t *io_error) +{ + static int fd = -1; + static uint32 seg_no_open = 0; + uint32 write_len; + + /* Init the io error */ + io_error->segno = seg_no_open; + io_error->size = size; + io_error->offset = offset; + + if (fd < 0 || seg_no_open != seg_no) + { + if (fd >= 0 && polar_close(fd)) + { + /*no cover begin*/ + io_error->errcause = FBPOINT_FILE_CLOSE_FAILED; + io_error->save_errno = errno; + /*no cover end*/ + } + + seg_no_open = seg_no; + io_error->segno = seg_no; + fd = fbpoint_file_init(fra_dir, seg_no, io_error); + + if (fd < 0) + return false; + } + + /* Write data */ + pgstat_report_wait_start(WAIT_EVENT_FLASHBACK_POINT_FILE_WRITE); + write_len = (uint32) polar_pwrite(fd, data, size, offset); + + if (write_len != size) + { + /*no cover begin*/ + pgstat_report_wait_end(); + io_error->errcause = FBPOINT_FILE_WRITE_FAILED; + io_error->save_errno = errno ? errno : ENOSPC; + io_error->io_return = write_len; + return false; + /*no cover end*/ + } + + pgstat_report_wait_end(); + + /* Sync file */ + pgstat_report_wait_start(WAIT_EVENT_FLASHBACK_POINT_FILE_SYNC); + + if (polar_fsync(fd) != 0) + { + /*no cover begin*/ + pgstat_report_wait_end(); + io_error->errcause = FBPOINT_FILE_FSYNC_FAILED; + io_error->save_errno = errno; + return false; + /*no cover end*/ + } + + pgstat_report_wait_end(); + + return true; +} + +static bool +read_fbpoint_records(const char *fra_dir, char *buf, uint32 seg_no, uint64 max_rec_no, bool ignore_noent) +{ + fbpoint_io_error_t io_error; + + if (polar_read_fbpoint_file(fra_dir, buf, seg_no, 0, FBPOINT_REC_END_POS, &io_error)) + { + uint64 page_no; + uint64 max_page_no; + fbpoint_page_header_t *header; + + /* Get the max page no has record */ + if (FBPOINT_REC_IN_SEG(max_rec_no, seg_no)) + max_page_no = FBPOINT_GET_PAGE_NO_BY_REC_NO(max_rec_no); + else + max_page_no = FBPOINT_PAGE_PER_SEG - 1; + + /* Check the crc 
of each page */ + header = (fbpoint_page_header_t *) buf; + + for (page_no = 0; page_no <= max_page_no; page_no++) + { + pg_crc32c crc; + + /* Verify CRC */ + crc = fbpoint_page_comp_crc(header); + + if (!EQ_CRC32C(crc, header->crc)) + { + /*no cover begin*/ + char path[MAXPGPATH]; + + polar_get_fbpoint_file_path(seg_no, fra_dir, path); + ereport(FATAL, + (errcode(ERRCODE_DATA_CORRUPTED), + errmsg("calculated CRC checksum does not match value stored in file \"%s\" page %lu", + path, page_no))); + /*no cover end*/ + } + + header = (fbpoint_page_header_t *)((char *) header + FBPOINT_PAGE_SIZE); + } + } + /* Ignore report the io error when the file is non-existence */ + else if (ignore_noent && io_error.errcause == FBPOINT_FILE_OPEN_FAILED && io_error.save_errno == ENOENT) + return false; + else + polar_fbpoint_report_io_error(fra_dir, &io_error, ERROR); + + return true; +} + +static void +startup_fbpoint_rec_buf(fpoint_ctl_t ctl, const char *fra_dir) +{ + uint64 next_fbpoint_rec_no = ctl->next_fbpoint_rec_no; + + if (next_fbpoint_rec_no % FBPOINT_REC_PER_SEG != 0) + { + uint32 seg_no = FBPOINT_GET_SEG_NO_BY_REC_NO(next_fbpoint_rec_no); + + read_fbpoint_records(fra_dir, ctl->fbpoint_rec_buf, seg_no, next_fbpoint_rec_no - 1, false); + } +} + +static void +write_fbpoint_rec(fpoint_ctl_t ctl, const char *fra_dir) +{ + uint64 rec_no; + uint32 start_pos; + uint32 seg_no; + char *start_ptr; + fbpoint_page_header_t *header; + fbpoint_io_error_t io_error; + + /* Fill the info page header */ + rec_no = ctl->next_fbpoint_rec_no; + start_pos = FBPOINT_GET_PAGE_NO_BY_REC_NO(rec_no) * FBPOINT_PAGE_SIZE; + seg_no = FBPOINT_GET_SEG_NO_BY_REC_NO(rec_no); + start_ptr = ctl->fbpoint_rec_buf + start_pos; + header = (fbpoint_page_header_t *) start_ptr; + header->version = FBPOINT_REC_VERSION; + header->crc = fbpoint_page_comp_crc(header); + + if (!polar_write_fbpoint_file(fra_dir, start_ptr, seg_no, start_pos, FBPOINT_PAGE_SIZE, &io_error)) + /*no cover line*/ + 
polar_fbpoint_report_io_error(fra_dir, &io_error, ERROR); +} + /* POLAR: Is the page first modified after the redo_lsn (or flashback point lsn) */ bool polar_is_page_first_modified(flog_buf_ctl_t buf_ctl, XLogRecPtr page_lsn, XLogRecPtr redo_lsn) @@ -175,13 +573,12 @@ polar_is_flashback_point(flog_ctl_t instance, XLogRecPtr checkpoint_lsn, XLogRec XLogSegNo old_segno; XLogSegNo new_segno; bool is_shutdown = *flags & CHECKPOINT_IS_SHUTDOWN; - bool result = false; if (!polar_is_flog_enabled(instance)) return false; buf_ctl = instance->buf_ctl; - Assert(polar_is_flog_buf_ready(buf_ctl)); + Assert(POLAR_IS_FLOG_BUF_READY(buf_ctl)); SpinLockAcquire(&buf_ctl->info_lck); fbpoint_lsn = buf_ctl->wal_info.fbpoint_lsn; @@ -189,18 +586,27 @@ polar_is_flashback_point(flog_ctl_t instance, XLogRecPtr checkpoint_lsn, XLogRec prior_fbpoint_lsn = buf_ctl->wal_info.prior_fbpoint_lsn; SpinLockRelease(&buf_ctl->info_lck); + /* + * Something has not init in end of recovery checkpoint, + * but flashback point need it (like ShmemVariableCache->latestCompletedXid), so skip it. 
+ */ + if (unlikely(*flags & CHECKPOINT_END_OF_RECOVERY)) + return false; + /* First checkpoint after flashback log enable is a flashback point */ if (unlikely(prior_fbpoint_lsn == InvalidXLogRecPtr)) { elog(LOG, "It is a first checkpoint after flashback log enable, treat it as a flashback point."); - result = true; + *flags = *flags | CHECKPOINT_FLASHBACK; + return true; } /* If it is a shutdown checkpoint, it will be treated as a flashback point */ if (unlikely(is_shutdown)) { elog(LOG, "It is a shutdown checkpoint, treat it as a flashback point."); - result = true; + *flags = *flags | CHECKPOINT_FLASHBACK; + return true; } /* Process the parallel standby mode */ @@ -243,7 +649,8 @@ polar_is_flashback_point(flog_ctl_t instance, XLogRecPtr checkpoint_lsn, XLogRec if (now - fbpoint_time >= polar_flashback_point_timeout) { elog(LOG, "The checkpoint is treated as a flashback point cause to timeout"); - result = true; + *flags = *flags | CHECKPOINT_FLASHBACK; + return true; } /* Check the WAL segments */ @@ -253,11 +660,217 @@ polar_is_flashback_point(flog_ctl_t instance, XLogRecPtr checkpoint_lsn, XLogRec if ((new_segno - old_segno) >= polar_flashback_point_segments) { elog(LOG, "The checkpoint is treated as a flashback point cause to WAL distance"); - result = true; + *flags = *flags | CHECKPOINT_FLASHBACK; + return true; } - if (result) - *flags = *flags | CHECKPOINT_FLASHBACK; + return false; +} +/* Shared memory needed for the flashback point: the control struct, FBPOINT_PAGE_SIZE bytes of alignment padding and a record buffer of FBPOINT_REC_END_POS bytes. */ +Size +polar_flashback_point_shmem_size(void) +{ + Size size = 0; + + /* flashback point control data */ + size = sizeof(fbpoint_ctl_data_t); + /* extra alignment padding for checkpoint info I/O buffers */ + size = add_size(size, FBPOINT_PAGE_SIZE); + /* The flashback point record file is so small, so we can load it in the buffer */ + size = add_size(size, FBPOINT_REC_END_POS); + + return size; +} +/* Zero the control struct, point fbpoint_rec_buf at the FBPOINT_PAGE_SIZE-aligned area behind it and set up the record buffer lock. */ +void +polar_flashback_point_shmem_init_data(fpoint_ctl_t ctl, const char *name) +{ + char *allocptr; + static char buf_lock_name[FL_OBJ_MAX_NAME_LEN]; + /* static: LWLockRegisterTranche() keeps the pointer to the tranche name, so the buffer must outlive this call */ + MemSet(ctl, 0, 
sizeof(fbpoint_ctl_data_t)); + allocptr = (char *) ctl + sizeof(fbpoint_ctl_data_t); + allocptr = (char *) TYPEALIGN(FBPOINT_PAGE_SIZE, allocptr); + ctl->fbpoint_rec_buf = allocptr; + MemSet(ctl->fbpoint_rec_buf, 0, FBPOINT_REC_END_POS); + + FLOG_GET_OBJ_NAME(buf_lock_name, name, FBPOINT_REC_BUF_LOCK_NAME_SUFFIX); + LWLockRegisterTranche(LWTRANCHE_POLAR_FLASHBACK_POINT_REC_BUF, buf_lock_name); + LWLockInitialize(&ctl->fbpoint_rec_buf_lock, LWTRANCHE_POLAR_FLASHBACK_POINT_REC_BUF); +} +/* Look up or create the flashback point control area in shared memory under a name derived from name; its contents are initialized only when not running under the postmaster. */ +fpoint_ctl_t +polar_flashback_point_shmem_init(const char *name) +{ + fpoint_ctl_t ctl; + bool found; + char ctl_name[FL_OBJ_MAX_NAME_LEN]; + + FLOG_GET_OBJ_NAME(ctl_name, name, FBPOINT_CTL_NAME_SUFFIX); + + ctl = (fpoint_ctl_t)ShmemInitStruct(ctl_name, polar_flashback_point_shmem_size(), &found); + + if (!IsUnderPostmaster) + { + Assert(!found); + polar_flashback_point_shmem_init_data(ctl, name); + } + else + Assert(found); + + return ctl; +} +/* Record the next flashback point record number and reload the record buffer of its segment from file when needed. */ +void +polar_startup_flashback_point(fpoint_ctl_t ctl, const char *fra_dir, uint64 next_fbpoint_rec_no) +{ + ctl->next_fbpoint_rec_no = next_fbpoint_rec_no; + startup_fbpoint_rec_buf(ctl, fra_dir); +} + +/* + * POLAR: Get the right flashback point record + * whose time is last one less than or equal to target_time. + * Return true when found, otherwise return false. + * + * NB: When keep_seg_no isn't NULL, we will get the oldest flashback point record + * while we can't find a right flashback point and the keep_seg_no will be set to + * the minimal seg_no. 
+ */ +bool +polar_get_right_fbpoint(fpoint_ctl_t ctl, const char *fra_dir, + pg_time_t target_time, fbpoint_rec_data_t *result, uint32 *keep_seg_no) +{ + fbpoint_rec_data_t *fbpoint_rec; + uint64 max_rec_no = ctl->next_fbpoint_rec_no; + uint32 seg_no; + char *start_ptr; + bool found = false; + bool is_in_memory = true; + + if (max_rec_no == 0) + return false; + else + max_rec_no--; + + seg_no = FBPOINT_GET_SEG_NO_BY_REC_NO(max_rec_no); + start_ptr = ctl->fbpoint_rec_buf; + + /* Search from memory need buf lock */ + LWLockAcquire(&ctl->fbpoint_rec_buf_lock, LW_SHARED); + + do + { + fbpoint_rec = FBPOINT_GET_FIRST_REC_IN_SEG(start_ptr); + + /* Find the segment no */ + if (fbpoint_rec->time <= target_time) + { + uint64 rec_no; + + rec_no = search_right_fbpoint_rec(start_ptr, target_time, seg_no, max_rec_no); + fbpoint_rec = (fbpoint_rec_data_t *) FBPOINT_GET_OFFSET_BY_REC_NO(start_ptr, rec_no); + Assert(fbpoint_rec->time <= target_time); + memcpy(result, fbpoint_rec, sizeof(fbpoint_rec_data_t)); + + if (keep_seg_no) + *keep_seg_no = seg_no; + + found = true; + break; + } + + /* We can't find the right one so we will keep the minimal one */ + if (keep_seg_no) + { + memcpy(result, fbpoint_rec, sizeof(fbpoint_rec_data_t)); + *keep_seg_no = seg_no; + found = true; + } + + if (seg_no == 0) + break; + + if (is_in_memory) + { + LWLockRelease(&ctl->fbpoint_rec_buf_lock); + is_in_memory = false; + start_ptr = palloc(FBPOINT_REC_END_POS); + } + + Assert(seg_no > 0); + seg_no--; + } + while (read_fbpoint_records(fra_dir, start_ptr, seg_no, max_rec_no, true)); + + if (is_in_memory) + LWLockRelease(&ctl->fbpoint_rec_buf_lock); + else + pfree(start_ptr); + + return found; +} + +void +polar_flush_fbpoint_rec(fpoint_ctl_t point_ctl, const char *fra_dir, + fbpoint_rec_data_t *rec_data) +{ + insert_fbpoint_rec(point_ctl, rec_data); + write_fbpoint_rec(point_ctl, fra_dir); + point_ctl->next_fbpoint_rec_no++; +} + +void +polar_truncate_fbpoint_files(const char *fra_dir, uint32 
keep_seg_no) +{ + char path[MAXPGPATH]; + + /* There is only one file, just return */ + if (keep_seg_no == 0) + return; + else + keep_seg_no--; + + polar_get_fbpoint_file_path(keep_seg_no, fra_dir, path); + + while (polar_file_exists(path)) + { + durable_unlink(path, ERROR); + + /* There is nothing */ + if (keep_seg_no == 0) + break; + + keep_seg_no--; + polar_get_fbpoint_file_path(keep_seg_no, fra_dir, path); + } +} + +/* + * POLAR: get the flashback point keep record. + * + * Return segment no of flashback point keep record. + * + * TODO: Protect the flashback point used by flashback table. + */ +uint32 +polar_get_keep_fbpoint(fpoint_ctl_t ctl, const char *fra_dir, fbpoint_rec_data_t *fbpoint_rec, + polar_flog_rec_ptr *keep_ptr, XLogRecPtr *keep_lsn) +{ + pg_time_t now; + pg_time_t keep_time; + uint32 seg_no = 0; + + /* The fbpoint record must be valid */ + Assert(fbpoint_rec->time); + now = (pg_time_t) time(NULL); + keep_time = now - polar_fast_recovery_area_rotation * SECS_PER_MINUTE; + + polar_get_right_fbpoint(ctl, fra_dir, keep_time, fbpoint_rec, &seg_no); + + /* Update the flashback log keep pointer and wal keep lsn */ + *keep_ptr = Min(*keep_ptr, fbpoint_rec->flog_ptr); + *keep_lsn = fbpoint_rec->redo_lsn; - return result; + return seg_no; } diff --git a/src/backend/polar_flashback/polar_flashback_rel_filenode.c b/src/backend/polar_flashback/polar_flashback_rel_filenode.c new file mode 100644 index 00000000000..83bffd415ee --- /dev/null +++ b/src/backend/polar_flashback/polar_flashback_rel_filenode.c @@ -0,0 +1,213 @@ +/*------------------------------------------------------------------------- + * + * polar_flashback_rel_filenode.c + * + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2021, Alibaba Group Holding limited + * + * IDENTIFICATION + * src/backend/polar_flashback/polar_flashback_rel_filenode.c + * + 
*------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/heapam.h" +#include "catalog/catalog.h" +#include "common/relpath.h" +#include "polar_flashback/polar_fast_recovery_area.h" +#include "polar_flashback/polar_flashback_log.h" +#include "polar_flashback/polar_flashback_log_reader.h" +#include "polar_flashback/polar_flashback_rel_filenode.h" +#include "utils/palloc.h" + +#define NEED_LOG_RELNODE_UPDATE(rel, change_persistence) (polar_can_rel_flashback((rel)->rd_rel, (rel)->rd_id, (change_persistence))) + +static void +decode_rel_filenode_rec(flog_buf_ctl_t buf_ctl, polar_flog_rec_ptr ptr, fl_filenode_rec_data_t *result, + bool *can_flashback, flog_reader_state *reader) +{ + flog_record *rec; + fl_filenode_rec_data_t *rec_data; + + rec = polar_decode_flog_rec_common(reader, ptr, REL_FILENODE_ID); + + Assert(rec->xl_rmid == REL_FILENODE_ID); + + rec_data = FL_GET_FILENODE_REC_DATA(rec); + memcpy(result, rec_data, FL_FILENODE_REC_SIZE); + *can_flashback = ((rec->xl_info) & REL_CAN_FLASHBACK); +} + +/* + * POLAR: Insert the rel filenode record in the flashback log. 
+ */ +static polar_flog_rec_ptr +polar_insert_filenode_rec(flog_ctl_t ins, RelFileNode *old, RelFileNode *new, uint8 info) +{ + BufferTag new_tag; + flog_insert_context insert_context; + + INIT_BUFFERTAG(new_tag, *new, FILENODE_FORK, 0); + /* Construct the insert context */ + insert_context.buf_tag = &new_tag; + insert_context.data = old; + insert_context.rmgr = REL_FILENODE_ID; + /* This will be update in the flashback log record */ + insert_context.info = info; + + return polar_flog_insert_into_buffer(ins, &insert_context); +} + +flog_record * +polar_assemble_filenode_rec(flog_insert_context *insert_context, uint32 xl_tot_len) +{ + + flog_record *rec; + fl_filenode_rec_data_t *rec_data; + + xl_tot_len = POLAR_GET_FILENODE_REC_LEN(xl_tot_len); + + /* Construct the flashback log record */ + rec = polar_palloc_in_crit(xl_tot_len); + rec->xl_tot_len = xl_tot_len; + rec->xl_info = insert_context->info; + + rec_data = FL_GET_FILENODE_REC_DATA(rec); + COPY_REL_FILENODE(*(RelFileNode *)(insert_context->data), rec_data->old_filenode); + COPY_REL_FILENODE(insert_context->buf_tag->rnode, rec_data->new_filenode); + rec_data->time = GetCurrentTimestamp(); + return rec; +} + +/* + * POLAR: Log the filenode update in the flashback log. + * + * It is used by rewrite the table or alter table set tablespace. + * + * NB: We think rewrite the table by vacuum full or alter table + * change the page content, so we can't use it as origin page to flashback. + */ +void +polar_flog_filenode_update(flog_ctl_t flog_ins, fra_ctl_t fra_ins, Oid relid, Oid new_rnode, + Oid new_tablespace, bool change_persistence, bool can_flashback) +{ + Relation rel = NULL; + uint8 info = 0; + + /* The fast recovery area is disable, so just skip */ + if (!polar_enable_fra(fra_ins)) + return; + + Assert(OidIsValid(new_rnode) || OidIsValid(new_tablespace)); + + rel = relation_open(relid, AccessShareLock); + + /* When change the relation persistence, don't need to check the relation persistence. 
*/ + if (NEED_LOG_RELNODE_UPDATE(rel, change_persistence)) + { + RelFileNode old_filenode; + RelFileNode new_filenode; + polar_flog_rec_ptr ptr; + + old_filenode = rel->rd_node; + new_filenode = old_filenode; + + if (OidIsValid(new_rnode)) + new_filenode.relNode = new_rnode; + + if (OidIsValid(new_tablespace)) + new_filenode.spcNode = new_tablespace; + + if (can_flashback) + info |= REL_CAN_FLASHBACK; + + ptr = polar_insert_filenode_rec(flog_ins, &old_filenode, &new_filenode, info); + /* Flush it right now to avoid to miss it */ + polar_flog_flush(flog_ins->buf_ctl, ptr); + } + + relation_close(rel, AccessShareLock); +} + +/* + * POLAR: Get the original relation file node. + */ +bool +polar_find_origin_filenode(flog_ctl_t ins, RelFileNode *filenode, TimestampTz target_time, + polar_flog_rec_ptr start_ptr, polar_flog_rec_ptr end_ptr, flog_reader_state *reader) +{ + log_index_page_iter_t filenode_iter; + logindex_snapshot_t snapshot = ins->logindex_snapshot; + BufferTag tag; + log_index_lsn_t *lsn_info; + polar_flog_rec_ptr ptr; + bool result = false; + + INIT_BUFFERTAG(tag, *filenode, FILENODE_FORK, 0); + + filenode_iter = + polar_logindex_create_page_iterator(snapshot, &tag, start_ptr, end_ptr, false); + + if (polar_logindex_page_iterator_state(filenode_iter) != ITERATE_STATE_FINISHED) + { + /*no cover begin*/ + polar_logindex_release_page_iterator(filenode_iter); + elog(ERROR, "Failed to iterate data for [%u, %u, %u] relation file node, " + "which start pointer =%X/%X and end pointer =%X/%X", + tag.rnode.spcNode, + tag.rnode.dbNode, + tag.rnode.relNode, + (uint32)((start_ptr) >> 32), (uint32)start_ptr, + (uint32)((end_ptr - 1) >> 32), (uint32)(end_ptr - 1)); + return false; + /*no cover end*/ + } + + /* It is just one record */ + if ((lsn_info = polar_logindex_page_iterator_next(filenode_iter)) != NULL) + { + fl_filenode_rec_data_t rec_data; + bool can_flashback = false; + + ptr = (polar_flog_rec_ptr) lsn_info->lsn; + Assert(BUFFERTAGS_EQUAL(*(lsn_info->tag), 
tag)); + + decode_rel_filenode_rec(ins->buf_ctl, ptr, &rec_data, &can_flashback, reader); + + if (!RelFileNodeEquals(*filenode, rec_data.new_filenode)) + /*no cover line*/ + elog(ERROR, "The relation file node in flashback log record %X/%X is " + "([%u, %u, %u]) not ([%u, %u, %u])", + (uint32)(ptr >> 32), (uint32) ptr, + rec_data.new_filenode.spcNode, rec_data.new_filenode.dbNode, + rec_data.new_filenode.relNode, filenode->spcNode, filenode->dbNode, + filenode->relNode); + + elog(DEBUG2, "The original relation file node for [%u, %u, %u] is [%u, %u, %u] before %s", + filenode->spcNode, filenode->dbNode, filenode->relNode, + rec_data.old_filenode.spcNode, rec_data.old_filenode.dbNode, + rec_data.old_filenode.relNode, timestamptz_to_str(rec_data.time)); + + /* It is what we want */ + if (timestamptz_cmp_internal(rec_data.time, target_time) > 0) + { + if (!can_flashback) + { + polar_logindex_release_page_iterator(filenode_iter); + elog(ERROR, "The relation file node has been changed by vacuum full " + "or alter table or truncate table in the past, " + "we can't flashback the relation."); + } + + result = true; + COPY_REL_FILENODE(rec_data.old_filenode, *filenode); + } + } + + polar_logindex_release_page_iterator(filenode_iter); + return result; +} + diff --git a/src/backend/polar_flashback/polar_flashback_snapshot.c b/src/backend/polar_flashback/polar_flashback_snapshot.c new file mode 100644 index 00000000000..cedda9a009f --- /dev/null +++ b/src/backend/polar_flashback/polar_flashback_snapshot.c @@ -0,0 +1,853 @@ +/*------------------------------------------------------------------------- + * + * polar_flashback_snapshot.c + * + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2021, Alibaba Group Holding limited + * + * IDENTIFICATION + * src/backend/polar_flashback/polar_flashback_snapshot.c + * + 
*------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/xact.h" +#include "polar_flashback/polar_fast_recovery_area.h" +#include "polar_flashback/polar_flashback_snapshot.h" +#include "utils/builtins.h" +#include "utils/snapmgr.h" +#include "storage/procarray.h" + +#define COMPUTE_SNAPSHOT_DATA_CRC(result, header, data, size) \ + do{ \ + INIT_CRC32C(result); \ + COMP_CRC32C((result), (char *) (data), (size)); \ + COMP_CRC32C((result), (char *) (header), offsetof(flashback_snapshot_data_header_t, crc)); \ + FIN_CRC32C(result); \ + } while (0) + +inline Size +polar_get_snapshot_size(Size size, uint32 xcnt) +{ + size = add_size(size, mul_size(xcnt, sizeof(TransactionId))); + + return size; +} + +/* + * flashback_xid_comparator + * qsort comparison function for XIDs + * + * NB: We assume that the xid can't be wraparound twice in one snapshot, so + * we can use TransactionIdPrecedes to compare. + */ +static int +flashback_xid_comparator(const void *arg1, const void *arg2) +{ + TransactionId xid1 = *(const TransactionId *) arg1; + TransactionId xid2 = *(const TransactionId *) arg2; + + if (xid1 == xid2) + return 0; + + if (TransactionIdPrecedes(xid1, xid2)) + return -1; + else + return 1; +} + +static Snapshot +restore_flashback_snapshot(flashback_snapshot_t flashback_snapshot) +{ + Snapshot snapshot; + TransactionId *serialize_xip; + TransactionId *xip; + + snapshot = palloc0(sizeof(SnapshotData)); + snapshot->lsn = flashback_snapshot->lsn; + snapshot->takenDuringRecovery = false; + snapshot->xcnt = flashback_snapshot->xcnt; + snapshot->xmax = flashback_snapshot->xmax; + snapshot->xmin = flashback_snapshot->xmin; + /* It means nothing, but just to make us to use EstimateSnapshotSpace */ + snapshot->satisfies = HeapTupleSatisfiesMVCC; + + serialize_xip = (TransactionId *) POLAR_GET_FLSHBAK_SNAPSHOT_XIP(flashback_snapshot); + /* sort so we can bsearch() */ + qsort(serialize_xip, snapshot->xcnt, 
sizeof(TransactionId), flashback_xid_comparator);
+	xip = palloc(snapshot->xcnt * sizeof(TransactionId));
+	memcpy(xip, serialize_xip, mul_size(snapshot->xcnt, sizeof(TransactionId)));
+	snapshot->xip = xip;
+
+	return snapshot;
+}
+
+/*
+ * POLAR: Read the flashback snapshot payload into data and verify its CRC.
+ *
+ * fra_dir: directory handed to the flashback point file helpers.
+ * seg_no:  segment file in which the payload starts.
+ * offset:  position inside that segment where the payload starts.
+ * header:  already-read snapshot header; supplies data_size, the end
+ *          position inside the first segment (encoded in info) and the
+ *          expected crc.
+ * data:    output buffer, must hold at least header->data_size bytes.
+ *
+ * The payload may continue in the following segment files. In each of them
+ * it ends at FBPOINT_SEG_SIZE and starts either at FBPOINT_REC_END_POS (more
+ * data follows in the next segment) or at FBPOINT_SEG_SIZE - remaining bytes.
+ *
+ * I/O failures are reported through polar_fbpoint_report_io_error at ERROR,
+ * a CRC mismatch is reported at FATAL.
+ */
+static void
+read_flashback_snapshot_data(const char *fra_dir, uint32 seg_no, uint32 offset,
+							 flashback_snapshot_header_t header, char *data)
+{
+	uint32		start_seg_no = seg_no;
+	uint32		read_size;
+	uint32		end_pos;
+	uint32		data_size;
+	pg_crc32c	crc;
+	char	   *start = data;
+
+	/* We read the header first, so normally we can use the same fd */
+	data_size = header->data_size;
+	end_pos = GET_FLSHBAK_SNAPSHOT_END_POS(header->info);
+	Assert(end_pos <= FBPOINT_SEG_SIZE);
+
+	do
+	{
+		fbpoint_io_error_t io_error;
+
+		read_size = end_pos - offset;
+		Assert(read_size < FBPOINT_SEG_SIZE);
+
+		if (read_size && !polar_read_fbpoint_file(fra_dir, data, seg_no, offset, read_size, &io_error))
+			/*no cover line*/
+			polar_fbpoint_report_io_error(fra_dir, &io_error, ERROR);
+
+		data_size -= read_size;
+
+		/* Can break the loop only in here */
+		if (data_size == 0)
+			break;
+
+		/* The rest of the payload sits at the tail of the next segment */
+		end_pos = FBPOINT_SEG_SIZE;
+
+		if (data_size > (FBPOINT_SEG_SIZE - FBPOINT_REC_END_POS))
+			offset = FBPOINT_REC_END_POS;
+		else
+			offset = FBPOINT_SEG_SIZE - data_size;
+
+		seg_no++;
+		data += read_size;
+	}
+	while (data_size > 0);
+
+	Assert(!data_size);
+
+	/* Check the crc */
+	COMPUTE_SNAPSHOT_DATA_CRC(crc, header, start, header->data_size);
+
+	if (!EQ_CRC32C(crc, header->crc))
+	{
+		/*no cover begin*/
+		char		path[MAXPGPATH];
+
+		/*
+		 * Resolve the file name only on the error path. Report the segment
+		 * the read started in; the buffer used to be passed to the message
+		 * without ever being filled.
+		 */
+		polar_get_fbpoint_file_path(start_seg_no, fra_dir, path);
+		ereport(FATAL, (errcode(ERRCODE_DATA_CORRUPTED),
+						errmsg("calculated CRC checksum does not match value stored in file \"%s\"", path)));
+		/*no cover end*/
+	}
+}
+
+/* Like XidInMVCCSnapshot, but ignore the sub transaction in the RW */
+static bool
+xid_in_flashback_snapshot(Snapshot snapshot, TransactionId xid)
+{
+	/* Any xid < xmin is not in-progress */
+	if (TransactionIdPrecedes(xid, snapshot->xmin))
+		return false;
+
+	/* Any xid >= xmax is in-progress */
+	if
(TransactionIdFollowsOrEquals(xid, snapshot->xmax)) + return true; + + return bsearch(&xid, snapshot->xip, snapshot->xcnt, sizeof(TransactionId), flashback_xid_comparator) != NULL; +} + +/* true if given transaction committed */ +static bool +polar_flashback_did_xid_commit(Snapshot snapshot, TransactionId xid, TransactionId max_xid, + uint32 next_clog_subdir_no, const char *fra_dir) +{ + XidStatus xidstatus; + + if (!TransactionIdIsNormal(xid)) + { + /*no cover begin*/ + if (TransactionIdEquals(xid, BootstrapTransactionId)) + return true; + + if (TransactionIdEquals(xid, FrozenTransactionId)) + return true; + + return false; + /*no cover end*/ + } + + if (xid_in_flashback_snapshot(snapshot, xid)) + return false; + + xidstatus = polar_flashback_get_xid_status(xid, max_xid, next_clog_subdir_no, fra_dir); + + return xidstatus == TRANSACTION_STATUS_COMMITTED; +} + +static TransactionId +get_next_valid_xid(TransactionId *xip, uint32 *xip_pos, uint32 xcnt, + uint32 *removed_xid_pos, uint32 removed_size, uint8 *removed_pos) +{ + int i; + TransactionId next = InvalidTransactionId; + + for (i = *xip_pos + 1; i < xcnt + removed_size; i++) + { + /* Next is removed, so skip */ + if (removed_size && + *removed_pos < removed_size - 1 && + i == removed_xid_pos[*removed_pos + 1]) + { + *removed_pos = *removed_pos + 1; + continue; + } + else + { + next = xip[i]; + *xip_pos = i; + break; + } + } + + return next; +} + +/* + * POLAR: Compact the removed xids from xip. 
+ */ +static void +compact_removed_xids(TransactionId *xip, uint32 xcnt, flshbak_rel_snpsht_t rsnapshot) +{ + uint8 i = 0; + uint32 new_pos = 0; + uint32 old_pos = 0; + + Assert(xcnt); + + /* No removed xid kept */ + if (rsnapshot->removed_size == 0) + return; + + /* Removed size is larger than zero */ + old_pos = new_pos = rsnapshot->removed_xid_pos[0]; + + for (; new_pos < xcnt; new_pos++) + xip[new_pos] = get_next_valid_xid(xip, &old_pos, xcnt, rsnapshot->removed_xid_pos, rsnapshot->removed_size, &i); + + /* Reset the removed size */ + rsnapshot->removed_size = 0; +} + +static TransactionId * +bsearch_xip(TransactionId *xid, TransactionId *xip, uint32 xcnt, uint32 *removed_xid_pos, + uint32 removed_size, uint8 *removed_pos) +{ + uint32 start = 0; + uint32 end; + + Assert(xcnt); + + for (*removed_pos = 0; *removed_pos < removed_size; + *removed_pos = *removed_pos + 1) + { + end = removed_xid_pos[(*removed_pos)]; + + if (end == 0) + continue; + + if (TransactionIdPrecedes(*xid, xip[end])) + return bsearch(xid, xip + start, end - start, + sizeof(TransactionId), flashback_xid_comparator); + + /* The start has been searched */ + start = end; + } + + /* In the last partion */ + return bsearch(xid, xip + start, xcnt + removed_size - start, sizeof(TransactionId), flashback_xid_comparator); +} + +/* + * Remove the xid from running xids. + * + * NB: We just insert the xid pos to removed_xid_pos array and compact the xip while the + * removed_xid_pos is full. 
+ */ +static void +remove_xid_from_running_xids(flshbak_rel_snpsht_t rsnapshot, TransactionId xid) +{ + TransactionId *xip; + TransactionId *target_xid; + uint32 xcnt; + uint32 i = 0; + uint8 removed_pos; + uint32 xip_pos; + + /* Sometimes we get some xids less than xmin, just skip */ + if (unlikely(TransactionIdPrecedes(xid, rsnapshot->snapshot->xmin))) + return; + + /* When we come here, xmin <= xid < xmax */ + Assert(TransactionIdPrecedes(rsnapshot->snapshot->xmin, rsnapshot->snapshot->xmax)); + xip = rsnapshot->snapshot->xip; + xcnt = rsnapshot->snapshot->xcnt; + Assert(xcnt > 0); + + /* Compact the xids when it is full */ + if (rsnapshot->removed_size == MAX_KEEP_REMOVED_XIDS) + compact_removed_xids(xip, xcnt, rsnapshot); + + target_xid = bsearch_xip(&xid, xip, xcnt, rsnapshot->removed_xid_pos, rsnapshot->removed_size, &removed_pos); + + /* + * Because the snapshot taken lsn is before snapshot taken, + * so sometimes the xid has been committed, ignore it. + */ + if (unlikely(target_xid == NULL)) + return; + + Assert(target_xid && (*target_xid == xid)); + + /* update the xcnt */ + if (rsnapshot->snapshot->xcnt) + rsnapshot->snapshot->xcnt--; + + /* Insert into the removed xip */ + if (rsnapshot->removed_size) + { + uint32 prev = rsnapshot->removed_xid_pos[removed_pos]; + uint32 next; + + Assert(rsnapshot->removed_size < MAX_KEEP_REMOVED_XIDS); + + /* The length is less than MAX_KEEP_REMOVED_XIDS, so removed_xid_pos[i + 1] is safe to process */ + for (i = removed_pos; i < rsnapshot->removed_size; i++) + { + next = rsnapshot->removed_xid_pos[i + 1]; + rsnapshot->removed_xid_pos[i + 1] = prev; + prev = next; + } + } + + /* Update the removed xids */ + xip_pos = target_xid - xip; + rsnapshot->removed_xid_pos[removed_pos] = xip_pos; + rsnapshot->removed_size++; + Assert(rsnapshot->removed_size <= MAX_KEEP_REMOVED_XIDS); + + /* Update the xmin to next valid xip */ + if (xid == rsnapshot->snapshot->xmin) + { + TransactionId next_xid; + + next_xid = 
get_next_valid_xid(xip, &xip_pos, xcnt, rsnapshot->removed_xid_pos, + rsnapshot->removed_size, &removed_pos); + + if (TransactionIdIsValid(next_xid)) + rsnapshot->snapshot->xmin = next_xid; + else + rsnapshot->snapshot->xmin = rsnapshot->snapshot->xmax; + } +} + +static void +insert_xid_into_running_xids(flshbak_rel_snpsht_t rsnapshot, TransactionId xid) +{ + TransactionId *xip; + TransactionId tmp = rsnapshot->snapshot->xmax; + uint32 i; + int32 insert_cnt; + uint32 xcnt; + + Assert(TransactionIdFollowsOrEquals(xid, tmp)); + insert_cnt = (int32)(xid - tmp); + + /* The [tmp, xid] must contain InvalidTransactionId BootstrapTransactionId FrozenTransactionId */ + if (xid < tmp) + insert_cnt -= FirstNormalTransactionId; + + /* Do nothing */ + if (insert_cnt == 0) + return; + + xip = rsnapshot->snapshot->xip; + xcnt = rsnapshot->snapshot->xcnt; + rsnapshot->snapshot->xcnt += insert_cnt; + + if (unlikely(rsnapshot->xip_size == 0)) + rsnapshot->xip_size = xcnt; + + /* Enlarge the xip and pfree old one */ + if (rsnapshot->xip_size < xcnt + rsnapshot->removed_size + insert_cnt) + { + uint32 new_xip_size = GET_ENLARGE_XIP_SIZE(xcnt + rsnapshot->removed_size + insert_cnt); + + if (xip) + xip = repalloc(xip, new_xip_size * sizeof(TransactionId)); + else + xip = palloc0(new_xip_size * sizeof(TransactionId)); + + rsnapshot->xip_size = new_xip_size; + rsnapshot->snapshot->xip = xip; + } + + xip += (xcnt + rsnapshot->removed_size); + + for (i = 0; i < insert_cnt; i++) + { + xip[i] = tmp; + TransactionIdAdvance(tmp); + } +} + +static inline MemoryContext +get_snapshot_memory_context(void) +{ + static MemoryContext context = NULL; + + if (unlikely(context == NULL)) + { + context = AllocSetContextCreate(TopMemoryContext, + "flashback snapshot memory context", + ALLOCSET_DEFAULT_SIZES); + MemoryContextAllowInCriticalSection(context, true); + } + + return context; +} + +void +polar_update_flashback_snapshot(flshbak_rel_snpsht_t rsnapshot, TransactionId xid) +{ + if 
(TransactionIdPrecedes(xid, rsnapshot->snapshot->xmax)) + /* Remove it from running transaction xid */ + remove_xid_from_running_xids(rsnapshot, xid); + else + { + /* Insert it into running xids */ + insert_xid_into_running_xids(rsnapshot, xid); + /* Update the xmax to xid + 1 */ + TransactionIdAdvance(xid); + rsnapshot->snapshot->xmax = xid; + } +} + +/* + * Redo the transaction xlog record whose commit/abort time + * is older than or equal to the end_time. + * + * Return true when the commit/abort time is newer than or + * equal to the end_time. + * + * NB: last_xact_lsn will be the lsn of the transaction whose + * commit/abort time is older than or equal to the end_time. + */ +bool +polar_flashback_xact_redo(XLogRecord *record, flshbak_rel_snpsht_t rsnapshot, + TimestampTz end_time, XLogReaderState *xlogreader) +{ + uint8 info; + TransactionId xid = InvalidTransactionId; + int nsubxacts = 0; + TransactionId *subxacts; + int i; + bool finish = false; + bool update_snapshot = false; + + xid = record->xl_xid; + + if (record->xl_rmid == RM_XACT_ID) + { + Assert(record->xl_rmid == RM_XACT_ID); + info = record->xl_info & XLOG_XACT_OPMASK; + /* Backup blocks are not used in xact records */ + Assert(!XLogRecHasAnyBlockRefs(xlogreader)); + + if (info == XLOG_XACT_COMMIT || info == XLOG_XACT_COMMIT_PREPARED) + { + xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(xlogreader); + xl_xact_parsed_commit parsed_commit; + + /* It is finished while the xact time is newer than or equal to end time */ + if (xlrec->xact_time > end_time) + return true; + else if (unlikely(xlrec->xact_time == end_time)) + finish = true; + + /* Do something like xact_redo_commit */ + ParseCommitRecord(info, xlrec, &parsed_commit); + nsubxacts = parsed_commit.nsubxacts; + subxacts = parsed_commit.subxacts; + update_snapshot = true; + } + else if (info == XLOG_XACT_ABORT || info == XLOG_XACT_ABORT_PREPARED) + { + xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(xlogreader); + 
xl_xact_parsed_abort parsed_abort; + + /* It is finished while the xact time is newer than or equal to end time */ + if (xlrec->xact_time > end_time) + return true; + else if (unlikely(xlrec->xact_time == end_time)) + finish = true; + + ParseAbortRecord(info, xlrec, &parsed_abort); + nsubxacts = parsed_abort.nsubxacts; + subxacts = parsed_abort.subxacts; + update_snapshot = true; + } + + /* Others we don't care */ + + /* Update the snapshot */ + if (update_snapshot) + { + polar_update_flashback_snapshot(rsnapshot, xid); + + /* Process the sub transactions */ + for (i = 0; i < nsubxacts; i++) + { + TransactionId tmp; + + tmp = subxacts[i]; + polar_update_flashback_snapshot(rsnapshot, tmp); + } + } + } + + /* Update the next xid and clog sub directory */ + if (TransactionIdIsValid(xid) && + TransactionIdEquals(rsnapshot->next_xid, xid)) + { + TransactionIdAdvance(rsnapshot->next_xid); + + /* The xid is wrapped */ + if (TransactionIdEquals(xid, FirstNormalTransactionId)) + rsnapshot->next_clog_subdir_no++; + } + + return finish; +} + +/* + * POLAR: Just like HeapTupleSatisfiesMVCC, but don't care about the sub xact + * and multi xact. + */ +HTSV_Result +polar_tuple_satisfies_flashback(HeapTuple htup, Buffer buffer, Snapshot snapshot, + uint32 next_clog_subdir_no, TransactionId max_xid, const char *fra_dir) +{ + HeapTupleHeader tuple = htup->t_data; + + Assert(ItemPointerIsValid(&htup->t_self)); + Assert(htup->t_tableOid != InvalidOid); + + /* + * Has inserting transaction committed? + * + * If the inserting transaction aborted, then the tuple was never visible + * to any other transaction, so we can delete it immediately. + */ + if (!HeapTupleHeaderXminCommitted(tuple)) + { + /* Check Hint bits at first, it will speed up in many cases. 
*/ + if (HeapTupleHeaderXminInvalid(tuple)) + return HEAPTUPLE_DEAD; + else if (polar_flashback_did_xid_commit(snapshot, HeapTupleHeaderGetRawXmin(tuple), max_xid, + next_clog_subdir_no, fra_dir)) + HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_COMMITTED, + HeapTupleHeaderGetRawXmin(tuple)); + else + { + /* + * In Progress, or Not Committed, we think the xmin is invalid. + */ + HeapTupleSetHintBits(tuple, buffer, HEAP_XMIN_INVALID, + InvalidTransactionId); + return HEAPTUPLE_DEAD; + } + + /* + * At this point the xmin is known committed, but we might not have + * been able to set the hint bit yet; so we can no longer Assert that + * it's set. + */ + } + + /* + * Okay, the inserter committed, so it was good at some point. Now what + * about the deleting transaction? + */ + if (tuple->t_infomask & HEAP_XMAX_INVALID) + return HEAPTUPLE_LIVE; + + if (HEAP_XMAX_IS_LOCKED_ONLY(tuple->t_infomask)) + { + /* + * We don't really care whether xmax did commit, abort or crash. We + * know that xmax did lock the tuple, but it did not and will never + * actually update it. + */ + HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return HEAPTUPLE_LIVE; + } + + if (tuple->t_infomask & HEAP_XMAX_IS_MULTI) + { + HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return HEAPTUPLE_LIVE; + } + + if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED)) + { + if (polar_flashback_did_xid_commit(snapshot, HeapTupleHeaderGetRawXmax(tuple), max_xid, + next_clog_subdir_no, fra_dir)) + HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_COMMITTED, + HeapTupleHeaderGetRawXmax(tuple)); + else + { + /* + * Not in Progress, Not Committed, so either Aborted or crashed + */ + HeapTupleSetHintBits(tuple, buffer, HEAP_XMAX_INVALID, + InvalidTransactionId); + return HEAPTUPLE_LIVE; + } + + /* + * At this point the xmax is known committed, but we might not have + * been able to set the hint bit yet; so we can no longer Assert that + * it's set. 
+		 */
+	}
+
+	/* Otherwise, it's dead and removable */
+	return HEAPTUPLE_DEAD;
+}
+
+/*
+ * POLAR: Write a one-line text dump of the snapshot to the server log.
+ *
+ * snapshot: snapshot to describe (lsn, xmin, xmax, xip[] and the sub
+ *           transaction fields are printed).
+ * elevel:   elog level used for the message.
+ */
+void
+log_flashback_snapshot(Snapshot snapshot, int elevel)
+{
+	int			i;
+	StringInfoData buf;
+
+	/*
+	 * Fill buf with a text serialization of the snapshot. It is only meant
+	 * for the log, every field is written as fieldname:value.
+	 */
+	initStringInfo(&buf);
+
+	appendStringInfo(&buf, "Flashback snapshot info -- ");
+	appendStringInfo(&buf, "insert wal lsn:%X/%X ", (uint32)(snapshot->lsn >> 32), (uint32) snapshot->lsn);
+	appendStringInfo(&buf, "xmin:%u ", snapshot->xmin);
+	appendStringInfo(&buf, "xmax:%u ", snapshot->xmax);
+	appendStringInfo(&buf, "xcnt:%d ", snapshot->xcnt);
+
+	for (i = 0; i < snapshot->xcnt; i++)
+		appendStringInfo(&buf, "xip:%u ", snapshot->xip[i]);
+
+	/*
+	 * The sub transaction xids are only listed when the snapshot is not
+	 * marked as overflowed.
+	 */
+	if (snapshot->suboverflowed)
+		appendStringInfoString(&buf, "sof:1 ");
+	else
+	{
+		appendStringInfoString(&buf, "sof:0 ");
+		appendStringInfo(&buf, "sxcnt:%d ", snapshot->subxcnt);
+
+		for (i = 0; i < snapshot->subxcnt; i++)
+			appendStringInfo(&buf, "sxp:%u ", snapshot->subxip[i]);
+	}
+
+	if (unlikely(snapshot->takenDuringRecovery))
+		appendStringInfo(&buf, "take during recovery.");
+
+	elog(elevel, "%s", buf.data);
+
+	/* free the buffer */
+	pfree(buf.data);
+}
+
+/*
+ * POLAR: Build the serialized flashback snapshot (header plus data) from the
+ * currently running top-level transactions.
+ *
+ * ins: fast recovery area control structure; NULL is returned when
+ *      polar_enable_fra(ins) is false.
+ * lsn: value stored in the snapshot's lsn field.
+ *
+ * The returned buffer is a function-static allocation in the private
+ * flashback snapshot memory context. It is reused (and possibly reallocated)
+ * by the next call, so the caller must not free it nor keep it across calls.
+ * Only data_size, the version bits of info and the snapshot data are filled
+ * here; the crc is not computed in this function.
+ */
+flashback_snapshot_header_t
+polar_get_flashback_snapshot_data(fra_ctl_t ins, XLogRecPtr lsn)
+{
+#define SNAPSHOT_DEFUALT_SIZE 1024
+	static flashback_snapshot_header_t header = NULL; /* Keep it, it is smaller than 5kB */
+	static Size size = 0;
+	flashback_snapshot_t snapshot = NULL;
+	RunningTransactions trans;
+	TransactionId xmax;
+	TransactionId *xip;
+	uint32		data_size;
+	MemoryContext old_context;
+
+	if (!polar_enable_fra(ins))
+		return NULL;
+
+	/* Use context created by myself to avoid to delete by others */
+	old_context = MemoryContextSwitchTo(get_snapshot_memory_context());
+	trans = polar_get_running_top_trans();
+	xmax = trans->latestCompletedXid;
+	TransactionIdAdvance(xmax);
+	data_size = FLSHBAK_GET_SNAPSHOT_DATA_SIZE(trans->xcnt);
+
+	if (unlikely(size < data_size + FLSHBAK_SNAPSHOT_HEADER_SIZE))
+	{
+		Size		new_size;
+
+		/*
+		 * Forget the old buffer before allocating the new one. palloc can
+		 * raise an error; the static state must then not be left pointing at
+		 * freed memory with a size that looks big enough.
+		 */
+		if (header)
+		{
+			pfree(header);
+			header = NULL;
+			size = 0;
+		}
+
+		new_size = Max(SNAPSHOT_DEFUALT_SIZE, data_size + FLSHBAK_SNAPSHOT_HEADER_SIZE);
+		/* Allocated in our private context, so nobody else resets or deletes it */
+		header = (flashback_snapshot_header_t) palloc(new_size);
+		size = new_size;
+	}
+
+	header->data_size = data_size;
+	SET_FLSHBAK_SNAPSHOT_VERSION(header->info);
+
+	snapshot = FLSHBAK_GET_SNAPSHOT_DATA(header);
+	snapshot->lsn = lsn;
+	snapshot->xcnt = trans->xcnt;
+	snapshot->xmax = xmax;
+	snapshot->xmin = trans->oldestRunningXid;
+	snapshot->next_xid = trans->nextXid;
+	snapshot->next_clog_subdir_no = pg_atomic_read_u32(&(ins->clog_ctl->next_clog_subdir_no));
+
+	/* Don't care about the sub transaction in flashback snapshot */
+	xip = (TransactionId *) POLAR_GET_FLSHBAK_SNAPSHOT_XIP(snapshot);
+	memcpy(xip, trans->xids, snapshot->xcnt * sizeof(TransactionId));
+
+	MemoryContextSwitchTo(old_context);
+	return header;
+}
+
+/*
+ * POLAR: Backup the transaction snapshot to fast recovery area.
+ */ +fbpoint_pos_t +polar_backup_snapshot_to_fra(flashback_snapshot_header_t header, fbpoint_pos_t *snapshot_end_pos, const char *fra_dir) +{ + uint32 data_size = 0; + Size total_size = 0; + pg_crc32 crc; + uint32 seg_no; + uint32 write_size = 0; + char *data; + uint32 offset; + fbpoint_pos_t snapshot_pos; + fbpoint_io_error_t io_error; + + Assert(header); + if (unlikely(header->data_size > UINT_MAX)) + elog(ERROR, "The flashback snapshot size is over %u, cancel the flashback point and wait the next", UINT_MAX); + + data_size = header->data_size; + total_size = FLSHBAK_SNAPSHOT_HEADER_SIZE + data_size; + + offset = snapshot_end_pos->offset; + seg_no = snapshot_end_pos->seg_no; + + /* The flashback snapshot data header can't be splitted */ + if (offset - FBPOINT_REC_END_POS < FLSHBAK_SNAPSHOT_HEADER_SIZE) + { + seg_no++; + offset = FBPOINT_SEG_SIZE; + } + + SET_FLSHBAK_SNAPSHOT_END_POS(header->info, offset); + /* Compute the CRC */ + COMPUTE_SNAPSHOT_DATA_CRC(crc, header, (char *) header + FLSHBAK_SNAPSHOT_HEADER_SIZE, data_size); + header->crc = crc; + + if (offset - FBPOINT_REC_END_POS < total_size) + SET_FBPOINT_POS(snapshot_pos, seg_no, FBPOINT_REC_END_POS); + + else + SET_FBPOINT_POS(snapshot_pos, seg_no, (offset - total_size)); + + data = (char *) header; + + /* Write the snapshot data and fsync file, the snapshot data may be split into many files */ + do + { + write_size = Min(total_size, offset - FBPOINT_REC_END_POS); + offset -= write_size; + + if (!polar_write_fbpoint_file(fra_dir, data, seg_no, offset, write_size, &io_error)) + /*no cover line*/ + polar_fbpoint_report_io_error(fra_dir, &io_error, ERROR); + + SET_FBPOINT_POS(*snapshot_end_pos, seg_no, offset); + total_size -= write_size; + offset = FBPOINT_SEG_SIZE; + seg_no++; + data += write_size; + } + while (total_size > 0); + + return snapshot_pos; +} + +/* + * POLAR: Get snapshot from flashback snapshot file. + * + * However, some values in the snapshot we don't care. 
+ */ +Snapshot +polar_get_flashback_snapshot(const char *fra_dir, fbpoint_pos_t start_pos, + uint32 *next_clog_subdir_no, TransactionId *next_xid) +{ + flashback_snapshot_data_header_t header; + uint32 data_size; + uint32 offset; + uint32 seg_no; + char *data = NULL; + Snapshot snapshot; + fbpoint_io_error_t io_error; + + seg_no = start_pos.seg_no; + offset = start_pos.offset; + + if (polar_read_fbpoint_file(fra_dir, (char *) &header, seg_no, offset, FLSHBAK_SNAPSHOT_HEADER_SIZE, &io_error)) + { + char path[MAXPGPATH]; + + polar_get_fbpoint_file_path(seg_no, fra_dir, path); + + if (GET_FLSHBAK_SNAPSHOT_VERSION(header.info) != FLSHBAK_SNAPSHOT_DATA_VERSION) + /*no cover line*/ + elog(ERROR, "The version of snapshot data in %s is %d not match with binary %d", path, + GET_FLSHBAK_SNAPSHOT_VERSION(header.info), FLSHBAK_SNAPSHOT_DATA_VERSION); + } + else + /*no cover line*/ + polar_fbpoint_report_io_error(fra_dir, &io_error, FATAL); + + /* Read the snapshot data */ + data_size = header.data_size; + data = palloc(data_size); + offset = offset + FLSHBAK_SNAPSHOT_HEADER_SIZE; + read_flashback_snapshot_data(fra_dir, seg_no, offset, &header, data); + + /* Convert the data to snapshot */ + snapshot = restore_flashback_snapshot((flashback_snapshot_t) data); + *next_clog_subdir_no = ((flashback_snapshot_t) data)->next_clog_subdir_no; + *next_xid = ((flashback_snapshot_t) data)->next_xid; + + pfree(data); + log_flashback_snapshot(snapshot, DEBUG2); + return snapshot; +} + +inline void +polar_compact_xip(flshbak_rel_snpsht_t rsnapshot) +{ + if (rsnapshot->snapshot->xcnt) + compact_removed_xids(rsnapshot->snapshot->xip, rsnapshot->snapshot->xcnt, rsnapshot); +} diff --git a/src/backend/polar_flashback/polar_flashback_table.c b/src/backend/polar_flashback/polar_flashback_table.c new file mode 100644 index 00000000000..e1554b4da84 --- /dev/null +++ b/src/backend/polar_flashback/polar_flashback_table.c @@ -0,0 +1,1084 @@ 
+/*------------------------------------------------------------------------- + * + * polar_flashback_table.c + * + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2021, Alibaba Group Holding limited + * + * IDENTIFICATION + * src/backend/polar_flashback/polar_flashback_table.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "access/parallel.h" +#include "access/polar_log.h" +#include "access/xact.h" +#include "access/xlog.h" +#include "catalog/catalog.h" +#include "catalog/namespace.h" +#include "commands/cluster.h" +#include "commands/tablecmds.h" +#include "executor/executor.h" +#include "miscadmin.h" +#include "nodes/execnodes.h" +#include "polar_flashback/polar_fast_recovery_area.h" +#include "polar_flashback/polar_flashback_log.h" +#include "polar_flashback/polar_flashback_log_worker.h" +#include "polar_flashback/polar_flashback_rel_filenode.h" +#include "polar_flashback/polar_flashback_snapshot.h" +#include "polar_flashback/polar_flashback_table.h" +#include "utils/rel.h" +#include "utils/syscache.h" +#include "utils/timestamp.h" +#include "storage/lmgr.h" +#include "storage/predicate.h" +#include "tcop/tcopprot.h" + +/* + * DSM keys for parallel flashback. Unlike other parallel execution code, since + * we don't need to worry about DSM keys conflicting with plan_node_id we can + * use small integers. 
+ */ +#define FLASHBACK_KEY_STATE 1 +#define FLASHBACK_KEY_SNAPSHOT 2 +#define FLASHBACK_KEY_XIP 3 +#define FLASHBACK_KEY_SUBXIP 4 +#define FLASHBACK_KEY_BUFFER_USAGE 5 +#define FLASHBACK_KEY_QUERY_TEXT 6 + +#define LOG_FLSHBAK_TBL_SHR_STATE(shared_state, target_block, elevel) \ + elog(elevel, "The flashback table %u to new table %u will replay xlog record from %X/%X to %X/%X, " \ + "flashback log record from %X/%X to end, the current transaction id is %u, " \ + "the max transaction in snapshot is %u, relation file node is (%u, %u, %u), " \ + "total block number now is %u, the next block to flashback is %u, " \ + "the next clog subdir number is %u", \ + (shared_state)->old_relid, (shared_state)->new_relid, \ + (uint32) ((shared_state)->wal_start_lsn >> 32), (uint32) ((shared_state)->wal_start_lsn), \ + (uint32) ((shared_state)->wal_end_lsn >> 32), (uint32) ((shared_state)->wal_end_lsn), \ + (uint32) ((shared_state)->flog_start_ptr >> 32), (uint32) ((shared_state)->flog_start_ptr), \ + (shared_state)->curr_xid, (shared_state)->next_xid, (shared_state)->rel_filenode.spcNode, \ + (shared_state)->rel_filenode.dbNode, (shared_state)->rel_filenode.relNode, \ + (shared_state)->nblocks, target_block, (shared_state)->next_clog_subdir_no) \ + +int polar_workers_per_flashback_table; + +static TimestampTz +get_flashback_target_time(Expr *flashback_time_expr) +{ + TimestampTz target_timestamptz; + ExprState *flashback_time_expr_state; + EState *estate = NULL; + ExprContext *econtext; + bool is_null; + Datum target_time_dt = (Datum) 0; + + /* Get the flashback target_time. 
*/ + estate = CreateExecutorState(); + flashback_time_expr_state = ExecPrepareExpr(flashback_time_expr, estate); + econtext = GetPerTupleExprContext(estate); + target_time_dt = ExecEvalExpr(flashback_time_expr_state, econtext, &is_null); + target_timestamptz = DatumGetTimestampTz(target_time_dt); + FreeExecutorState(estate); + + elog(DEBUG1, "The target timestamp of flashback table is %s", + timestamptz_to_str(target_timestamptz)); + return target_timestamptz; +} + +static void +check_flashback_time(TimestampTz target_time) +{ + TimestampTz now = GetCurrentTimestamp(); + TimestampTz horizon; + + horizon = now - + (TimestampTz) polar_fast_recovery_area_rotation * SECS_PER_MINUTE * USECS_PER_SEC; + + if (timestamptz_cmp_internal(target_time, now) >= 0) + ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("The flashback table target time exceeds now!"))); + + if (timestamptz_cmp_internal(target_time, horizon) < 0) + ereport(ERROR, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), + errmsg("The lag between now and flashback table target time exceeds the " + "polar_fast_recovery_area_rotation %d minutes", polar_fast_recovery_area_rotation))); +} + +static inline void +log_flashback_table_state(flshbak_tbl_st_t state, int elevel) +{ + LOG_FLSHBAK_TBL_SHR_STATE(state->shared_state, + pg_atomic_read_u32(&(state->shared_state->next_blkno)), elevel); + log_flashback_snapshot(state->snapshot, elevel); +} + +void +polar_log_cannot_flashback_cause(Form_pg_class reltup, Oid relid, bool no_persistence_check) +{ + if (reltup->relkind != RELKIND_RELATION) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not support to flashback an irregular table \"%s\" now.", reltup->relname.data))); + + if (IsSystemClass(relid, reltup)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not support to flashback system catalog \"%s\".", reltup->relname.data))); + + if (reltup->relispartition) + ereport(ERROR, + 
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not support to flashback an partition table \"%s\" now.", reltup->relname.data))); + + if (reltup->relhassubclass) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not support to flashback an parent table \"%s\" now.", reltup->relname.data))); + + /* Can not flashback temp or unlogged table */ + if (!no_persistence_check && reltup->relpersistence != RELPERSISTENCE_PERMANENT) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not support to flashback non-persistence table \"%s\".", reltup->relname.data))); + + /* Can not flashback toast table */ + if (OidIsValid(reltup->reltoastrelid)) + ereport(ERROR, + (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("can not support to flashback table which has a toast table \"%s\".", reltup->relname.data))); +} + +bool +polar_can_rel_flashback(Form_pg_class reltup, Oid relid, bool no_persistence_check) +{ + /* + * Now can't flashback: + * 1. index + * 2. toast table + * 3. materialized view + * 4. partitioned table + * 5. partition table + * 6. system table + * 7. foreign table + * 8. table has a toast table + * ... + */ + + if (reltup->relkind == RELKIND_RELATION && + (no_persistence_check || reltup->relpersistence == RELPERSISTENCE_PERMANENT) && + !reltup->relispartition && !reltup->relhassubclass && !OidIsValid(reltup->reltoastrelid) && + !IsSystemClass(relid, reltup)) + return true; + + return false; +} + +/* + * Before acquiring a table lock, check whether we have sufficient rights + * and all refrerence tables in the flashback objects. + * NB: It doesn't support to flashback the partition table. + */ +static void +rangevar_callback_for_flashback(const RangeVar *relation, Oid relid, Oid old_relid, void *arg) +{ + char relkind; + HeapTuple tp; + Form_pg_class reltup; + + /* Nothing to do if the relation was not found. 
*/ + if (!OidIsValid(relid)) + return; + + tp = SearchSysCache1(RELOID, ObjectIdGetDatum(relid)); + + if (!HeapTupleIsValid(tp)) + /*no cover line*/ + elog(ERROR, "cache lookup failed for relation %u", relid); + + reltup = (Form_pg_class) GETSTRUCT(tp); + relkind = reltup->relkind; + + if (!polar_can_rel_flashback(reltup, relid, false)) + polar_log_cannot_flashback_cause(reltup, relid, false); + + ReleaseSysCache(tp); + + /* Check permissions */ + if (!pg_class_ownercheck(relid, GetUserId())) + aclcheck_error(ACLCHECK_NOT_OWNER, get_relkind_objtype(relkind), relation->relname); +} + +/* + * Parse the xlog record from start lsn to end_time + * Return the end+1 xlog record need to apply. + */ +static XLogRecPtr +flashback_table_apply_wal(Snapshot snapshot, flshbak_tbl_shr_st_t shared_state, TimestampTz end_time, + const char *relname, XLogRecPtr *logindex_max_lsn) +{ + XLogRecord *record; + XLogReaderState *xlogreader; + char *errormsg; + XLogRecPtr start_lsn = shared_state->wal_start_lsn; + XLogRecPtr end_lsn = start_lsn; + XLogRecPtr read_upto = GetFlushRecPtr(); + bool finish = false; + flashback_rel_snapshot_t flshbak_rel_sn; + + /* To get a exactly snapshot, we read xlog from snapshot->lsn */ + start_lsn = Min(start_lsn, snapshot->lsn); + + if (start_lsn == read_upto) + elog(ERROR, "There is no more wal record in disk while flashback table %s " + "from start lsn %X/%X, we think it is unchanged", relname, + (uint32)(start_lsn >> 32), (uint32) start_lsn); + + xlogreader = XLogReaderAllocate(wal_segment_size, &read_local_xlog_page, NULL); + + if (xlogreader == NULL) + /*no cover line*/ + ereport(ERROR, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("Can not allocate the xlog reader memory for flashback table %s", relname))); + + /* Init the flashback relation snapshot */ + flshbak_rel_sn.snapshot = snapshot; + flshbak_rel_sn.xip_size = 0; + flshbak_rel_sn.removed_xid_pos = palloc0(REMOVED_XID_POS_SIZE); + flshbak_rel_sn.removed_size = 0; + 
flshbak_rel_sn.next_clog_subdir_no = shared_state->next_clog_subdir_no; + flshbak_rel_sn.next_xid = shared_state->next_xid; + + /* Read the xlog record until the end time or xlog ptr read upto */ + do + { + record = XLogReadRecord(xlogreader, start_lsn, &errormsg); + + if (record != NULL) + { + /* + * We finish the apply while the xlog record xact_time newer than end_time. + * + * NB: The transaction time newer than end_time we don't apply. The end_lsn is + * always a xact commit or abort wal record end lsn or start_lsn while there is + * no xact commit or abort wal records. + */ + finish = polar_flashback_xact_redo(record, &flshbak_rel_sn, end_time, xlogreader); + + if (finish) + break; + + /* We just care about wal relatived with data page */ + if (polar_xlog_remove_payload(record)) + { + *logindex_max_lsn = xlogreader->currRecPtr; + end_lsn = xlogreader->EndRecPtr; + } + } + else + { + XLogRecPtr errptr; + + /*no cover begin*/ + errptr = start_lsn ? start_lsn : xlogreader->EndRecPtr; + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not read xlog record at %X/%X in flashback table %s", + (uint32)(errptr >> 32), + (uint32) errptr, relname))); + /*no cover end*/ + } + + start_lsn = InvalidXLogRecPtr; /* continue reading at next record */ + } + while (xlogreader->EndRecPtr < read_upto); + + /* Compact the snapshot xip */ + polar_compact_xip(&flshbak_rel_sn); + /* Update the shared_state next xid and next clog subdir no. 
*/ + shared_state->next_xid = flshbak_rel_sn.next_xid; + shared_state->next_clog_subdir_no = flshbak_rel_sn.next_clog_subdir_no; + + pfree(flshbak_rel_sn.removed_xid_pos); + XLogReaderFree(xlogreader); + elog(DEBUG1, "Flashback table %s to time %s parse the xlog record from lsn %X/%X " + "to lsn %X/%X", relname, timestamptz_to_str(end_time), + (uint32)(shared_state->wal_start_lsn >> 32), (uint32)(shared_state->wal_start_lsn), + (uint32)(end_lsn >> 32), (uint32)(end_lsn)); + return end_lsn; +} + +static flshbak_tbl_st_t +construct_flashback_table_state(MemoryContext flashback_table_context, Relation rel, TimestampTz target_time) +{ + MemoryContext old_context; + fbpoint_rec_data_t fbpoint_rec; + flshbak_tbl_shr_st_t shared_state = NULL; + Snapshot snapshot; + flshbak_tbl_st_t state = NULL; + XLogRecPtr logindex_max_lsn_expected = InvalidXLogRecPtr; + polar_flog_rec_ptr write_result; + + old_context = MemoryContextSwitchTo(flashback_table_context); + shared_state = palloc(sizeof(flashback_table_shared_state_t)); + shared_state->old_relid = rel->rd_id; + shared_state->nblocks = RelationGetNumberOfBlocks(rel); + + /* Get right flashback point */ + if (polar_get_right_fbpoint(fra_instance->point_ctl, fra_instance->dir, + timestamptz_to_time_t(target_time), &fbpoint_rec, NULL)) + { + elog(DEBUG2, "find the rigth flashback point before target time %s: " + "flashback log pointer is %08X/%08X, " + "WAL redo lsn is %08X/%08X, the time is %s, " + "the next clog sub directory number is %08X, " + "the snapshot position is %08X/%08X", + timestamptz_to_str(target_time), + (uint32)(fbpoint_rec.flog_ptr >> 32), (uint32) fbpoint_rec.flog_ptr, + (uint32)(fbpoint_rec.redo_lsn >> 32), (uint32) fbpoint_rec.redo_lsn, + timestamptz_to_str(time_t_to_timestamptz(fbpoint_rec.time)), + fbpoint_rec.next_clog_subdir_no, + fbpoint_rec.snapshot_pos.seg_no, fbpoint_rec.snapshot_pos.offset); + } + else + /*no cover line*/ + elog(ERROR, "Can not find a right flashback point to flashback table %s.", 
RelationGetRelationName(rel)); + + shared_state->flog_start_ptr = fbpoint_rec.flog_ptr; + shared_state->rel_filenode = rel->rd_node; + write_result = polar_get_flog_write_result(flog_instance->buf_ctl); + + /* + * Get origin relation file node. + * + * Sometime the flog_start_ptr is equal to the write_result, + * there is nothing to do. + */ + if (likely(shared_state->flog_start_ptr < write_result)) + { + flog_reader_state *reader = NULL; + + /* Wait for flashback log to catch write result*/ + if (!polar_is_flog_index_inserted(flog_instance, &write_result)) + { + elog(LOG, "Wait the flashback logindex to catch flashback log write result " + "to find origin relation file node of ([%u, %u, %u])", + shared_state->rel_filenode.spcNode, shared_state->rel_filenode.dbNode, + shared_state->rel_filenode.relNode); + + polar_wait_flog_bgworker(flog_instance, polar_is_flog_index_inserted, + &write_result); + } + + FLOG_ALLOC_PAGE_READER(reader, flog_instance->buf_ctl, ERROR); + + while (polar_find_origin_filenode(flog_instance, &(shared_state->rel_filenode), target_time, + shared_state->flog_start_ptr, write_result, reader)); + + /* Free the flashback log reader */ + polar_flog_reader_free(reader); + } + + shared_state->wal_start_lsn = fbpoint_rec.redo_lsn; + + /* Get snapshot which xip is sorted */ + snapshot = polar_get_flashback_snapshot(fra_instance->dir, fbpoint_rec.snapshot_pos, + &shared_state->next_clog_subdir_no, &shared_state->next_xid); + + /* Parse WAL and get end wal lsn and update the snapshot */ + shared_state->wal_end_lsn = flashback_table_apply_wal(snapshot, shared_state, + target_time, RelationGetRelationName(rel), + &logindex_max_lsn_expected); + Assert(shared_state->wal_end_lsn >= shared_state->wal_start_lsn); + + /* Check the wal_logindex enough ? 
*/ + if (ProcGlobal->walwriterLatch) + SetLatch(ProcGlobal->walwriterLatch); + + while (logindex_max_lsn_expected && + logindex_max_lsn_expected > polar_get_logindex_snapshot_max_lsn(polar_logindex_redo_instance->wal_logindex_snapshot)) + { + CHECK_FOR_INTERRUPTS(); + pg_usleep(1000); + } + + shared_state->curr_xid = GetCurrentTransactionId(); + pg_atomic_init_u32(&(shared_state->next_blkno), 0); + + /* Set the pointer */ + state = palloc(sizeof(flashback_table_state_t)); + state->shared_state = shared_state; + state->snapshot = snapshot; + + log_flashback_table_state(state, DEBUG2); + MemoryContextSwitchTo(old_context); + return state; +} + +static inline bool +polar_is_buffer_unchanged(Buffer buffer, XLogRecPtr target_lsn) +{ + return BufferGetLSNAtomic(buffer) <= target_lsn; +} + +static uint32 +polar_flashback_rebuild_tuples(Buffer buffer, Snapshot snapshot, uint32 next_clog_subdir_no, + TransactionId max_xid, TransactionId xid, Oid relid, const char *fra_dir, bool use_tuple_infomask) +{ + OffsetNumber offnum, + maxoff; + HeapTupleData tuple; + Page page; + BufferDesc *buf_desc; + BlockNumber block; + uint32 tuples = 0; + + buf_desc = GetBufferDescriptor(buffer - 1); + block = buf_desc->tag.blockNum; + page = BufferGetPage(buffer); + + /* Scan each block and reset the tuplehead. 
*/ + maxoff = PageGetMaxOffsetNumber(page); + + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemid; + + itemid = PageGetItemId(page, offnum); + + /* Unused items require no processing */ + if (!ItemIdIsUsed(itemid)) + continue; + + /* Redirect items mustn't be touched */ + if (ItemIdIsRedirected(itemid)) + continue; + + if (ItemIdIsDead(itemid)) + continue; + + ItemPointerSet(&(tuple.t_self), block, offnum); + Assert(ItemIdIsNormal(itemid)); + tuple.t_data = (HeapTupleHeader) PageGetItem(page, itemid); + tuple.t_len = ItemIdGetLength(itemid); + tuple.t_tableOid = relid; + + /* We can't use the info mask in the tuple, just clean it */ + if (!use_tuple_infomask) + { + tuple.t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_XMIN_FROZEN | HEAP_XMAX_COMMITTED | HEAP_XMAX_INVALID); + tuple.t_data->t_infomask2 &= ~(HEAP2_XACT_MASK); + } + + /* + * The criteria for counting a tuple as live in this block need to + * match what analyze.c's acquire_sample_rows() does, otherwise + * VACUUM and ANALYZE may produce wildly different reltuples + * values, e.g. when there are many recently-dead tuples. + * + * The logic here is a bit simpler than acquire_sample_rows(), as + * VACUUM can't run inside a transaction block, which makes some + * cases impossible (e.g. in-progress insert from the same + * transaction). + */ + switch (polar_tuple_satisfies_flashback(&tuple, buffer, snapshot, next_clog_subdir_no, max_xid, fra_dir)) + { + case HEAPTUPLE_DEAD: + ItemIdSetDead(itemid); + break; + + case HEAPTUPLE_LIVE: + HeapTupleHeaderSetXmin(tuple.t_data, xid); + HeapTupleHeaderSetXmax(tuple.t_data, InvalidTransactionId); + tuples++; + break; + + default: + /*no cover line*/ + elog(ERROR, "unexpected polar_tuple_satisfies_flashback result"); + break; + } + } + + return tuples; +} + +/* + * POLAR: Rebuild pages parallel.Æ’ + * Return the parallel worker number. 
+ */ +static int +polar_launch_flashback_pages_workers(flshbak_tbl_st_t state) +{ + int parallel_workers = 0; + ParallelContext *pcxt; + flshbak_tbl_shr_st_t flashback_shared_state; + BufferUsage *buffer_usage; + int i; + int querylen; + char *sharedquery; + Snapshot snapshot; + TransactionId *xip; + TransactionId *subxip; + Size xip_size = 0; + Size subxip_size = 0; + + parallel_workers = polar_workers_per_flashback_table; + + if (parallel_workers == 0) + return 0; + + EnterParallelMode(); + pcxt = CreateParallelContext("postgres", "polar_flashback_pages_woker_main", + parallel_workers, true); + Assert(pcxt->nworkers > 0); + + shm_toc_estimate_chunk(&pcxt->estimator, + sizeof(flashback_table_shared_state_t)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + shm_toc_estimate_chunk(&pcxt->estimator, sizeof(SnapshotData)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + if (state->snapshot->xcnt) + { + xip_size = mul_size(state->snapshot->xcnt, sizeof(TransactionId)); + shm_toc_estimate_chunk(&pcxt->estimator, xip_size); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } + + if (state->snapshot->subxcnt) + { + subxip_size = mul_size(state->snapshot->subxcnt, sizeof(TransactionId)); + shm_toc_estimate_chunk(&pcxt->estimator, subxip_size); + shm_toc_estimate_keys(&pcxt->estimator, 1); + } + + shm_toc_estimate_chunk(&pcxt->estimator, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + querylen = strlen(debug_query_string); + shm_toc_estimate_chunk(&pcxt->estimator, querylen + 1); + shm_toc_estimate_keys(&pcxt->estimator, 1); + + InitializeParallelDSM(pcxt); + + flashback_shared_state = + shm_toc_allocate(pcxt->toc, sizeof(flashback_table_shared_state_t)); + memcpy(flashback_shared_state, state->shared_state, sizeof(flashback_table_shared_state_t)); + pg_atomic_init_u32(&(flashback_shared_state->next_blkno), 0); + shm_toc_insert(pcxt->toc, FLASHBACK_KEY_STATE, flashback_shared_state); + + snapshot = 
shm_toc_allocate(pcxt->toc, sizeof(SnapshotData)); + memcpy(snapshot, state->snapshot, sizeof(SnapshotData)); + shm_toc_insert(pcxt->toc, FLASHBACK_KEY_SNAPSHOT, snapshot); + + + if (xip_size) + { + xip = shm_toc_allocate(pcxt->toc, xip_size); + memcpy(xip, state->snapshot->xip, xip_size); + shm_toc_insert(pcxt->toc, FLASHBACK_KEY_XIP, xip); + } + + if (subxip_size) + { + subxip = shm_toc_allocate(pcxt->toc, subxip_size); + memcpy(subxip, state->snapshot->subxip, subxip_size); + shm_toc_insert(pcxt->toc, FLASHBACK_KEY_SUBXIP, subxip); + } + + buffer_usage = shm_toc_allocate(pcxt->toc, + mul_size(sizeof(BufferUsage), pcxt->nworkers)); + shm_toc_insert(pcxt->toc, FLASHBACK_KEY_BUFFER_USAGE, buffer_usage); + + sharedquery = (char *) shm_toc_allocate(pcxt->toc, querylen + 1); + memcpy(sharedquery, debug_query_string, querylen); + sharedquery[querylen] = '\0'; + shm_toc_insert(pcxt->toc, FLASHBACK_KEY_QUERY_TEXT, sharedquery); + + LaunchParallelWorkers(pcxt); + parallel_workers = pcxt->nworkers_launched; + WaitForParallelWorkersToFinish(pcxt); + + for (i = 0; i < pcxt->nworkers_launched; i++) + InstrAccumParallelQuery(&buffer_usage[i]); + + DestroyParallelContext(pcxt); + ExitParallelMode(); + return parallel_workers; +} + +/* + * POLAR: Flashback the relation page. + * + * There are some cases we can't find a origin page of the target tag: + * 1. The origin page is in the disk. We will read the origin page from disk. + * 2. The flashback log index has some flashback log not inserted. We will wait + * them inserted. + * + * NB: Please make true the old buffer will not be flushed when process it in here. + * We can lock it with EXCLUSIVE mode in the caller. 
+ */ +static bool +polar_flashback_rel_page(flshbak_tbl_shr_st_t shared_state, BufferTag *tag, + BufferDesc *old_buf_desc, BufferDesc *new_buf_desc, flog_reader_state *reader) +{ + Buffer new_buf; + polar_flog_rec_ptr flog_end_ptr; + polar_flog_rec_ptr logindex_max_ptr; + bool success = false; + flshbak_buf_context_t context; + + Assert(old_buf_desc == NULL || + LWLockHeldByMeInMode(BufferDescriptorGetContentLock(old_buf_desc), LW_EXCLUSIVE)); + + new_buf = new_buf_desc->buf_id + 1; + logindex_max_ptr = polar_get_logindex_max_parsed_lsn(flog_instance->logindex_snapshot); + flog_end_ptr = polar_get_flog_write_result(flog_instance->buf_ctl); + + INIT_FLSHBAK_BUF_CONTEXT(context, shared_state->flog_start_ptr, flog_end_ptr, + shared_state->wal_start_lsn, shared_state->wal_end_lsn, + flog_instance->logindex_snapshot, reader, tag, new_buf, LOG, false); + + /* If no flashback log or can't flashback page with the flashback log and flashback logindex */ + if (shared_state->flog_start_ptr != flog_end_ptr) + success = polar_flashback_buffer(&context); + + /* + * We can't find a valid origin page with the flashback log and + * flashback logindex. We will wait the flashback logindex and retry. + */ + if (!success && !polar_is_flog_index_inserted(flog_instance, &flog_end_ptr)) + { + elog(LOG, "Wait the flashback logindex to find " + "origin page of " POLAR_LOG_BUFFER_TAG_FORMAT, + POLAR_LOG_BUFFER_TAG(tag)); + + polar_wait_flog_bgworker(flog_instance, polar_is_flog_index_inserted, + &flog_end_ptr); + + /* Find the origin page with old logindex max pointer and new end pointer */ + context.start_ptr = logindex_max_ptr; + success = polar_flashback_buffer(&context); + } + + /* + * We can't find a valid origin page with the flashback log and + * flashback logindex, try to read the page in the disk as origin page. + * + * When the origin page can't be found in the flashback log and disk, + * it must be the last block or a flashback log lost. 
+ */ + if (!success) + { + /* In some cases, the origin page is in the disk */ + if (new_buf_desc->tag.blockNum < shared_state->nblocks) + { + Assert(old_buf_desc); + + /* The origin page can't be in the disk with relfilenode changed */ + if (RelFileNodeEquals(tag->rnode, old_buf_desc->tag.rnode)) + { + read_origin_page_from_file(tag, BufferGetPage(new_buf)); + + if (likely(BufferGetLSN(new_buf_desc) <= shared_state->wal_end_lsn)) + { + elog(LOG, "The origin page of " POLAR_LOG_BUFFER_TAG_FORMAT " is in the disk, " + "read and apply it", POLAR_LOG_BUFFER_TAG(tag)); + + polar_logindex_apply_page(polar_logindex_redo_instance, + shared_state->wal_start_lsn, + shared_state->wal_end_lsn, tag, &new_buf); + + success = true; + } + } + } + /* We think it is last page */ + else + return false; + } + + if (!success) + /*no cover line*/ + POLAR_LOG_FLOG_LOST(tag, ERROR); + + /* Check the page lsn */ + Assert(BufferGetLSN(new_buf_desc) <= shared_state->wal_end_lsn); + return true; +} + +static bool +polar_flashback_rebuild_page(flshbak_tbl_shr_st_t shared_state, Snapshot snapshot, + Buffer old_buffer, Buffer new_buffer, flog_reader_state *reader) +{ + Page new_page; + Page old_page; + XLogRecPtr new_lsn; + BufferDesc *old_buf_desc = NULL; + BufferDesc *new_buf_desc = NULL; + RelFileNode new_rnode; + BufferTag old_tag; + BlockNumber target_block; + bool buf_changed = true; + bool is_finished = false; + + new_buf_desc = GetBufferDescriptor(new_buffer - 1); + /* Must hold the new buffer exclusive lock */ + Assert(LWLockHeldByMeInMode(BufferDescriptorGetContentLock(new_buf_desc), LW_EXCLUSIVE)); + new_page = BufferGetPage(new_buffer); + new_rnode = new_buf_desc->tag.rnode; + target_block = new_buf_desc->tag.blockNum; + INIT_BUFFERTAG(old_tag, shared_state->rel_filenode, MAIN_FORKNUM, target_block); + + if (BufferIsValid(old_buffer)) + { + /* Check it is changed? 
*/ + old_buf_desc = GetBufferDescriptor(old_buffer - 1); + + /* Hold the exclusive lock, so can't flush the buffer to disk */ + LockBuffer(old_buffer, BUFFER_LOCK_EXCLUSIVE); + + /* The block is unchanged from end_ptr to now, so just copy the buffer */ + if (polar_is_buffer_unchanged(old_buffer, shared_state->wal_end_lsn)) + { + old_page = BufferGetPage(old_buffer); + memcpy((char *)new_page, (char *) old_page, BLCKSZ); + /* The update of info mask is without WAL, so we can't trust it */ + buf_changed = false; + } + } + + /* + * Get the origin page when buffer is changed. + * When the origin page can't find, it must be the last block. + */ + if (buf_changed && !polar_flashback_rel_page(shared_state, &old_tag, + old_buf_desc, new_buf_desc, reader)) + is_finished = true; + + if (!is_finished) + { + /* rebuild the tuples */ + if (!PageIsEmpty(new_page)) + { + polar_flashback_rebuild_tuples(new_buffer, snapshot, shared_state->next_clog_subdir_no, + shared_state->next_xid, shared_state->curr_xid, shared_state->new_relid, + fra_instance->dir, buf_changed); + } + + /* Log the new block */ + new_lsn = log_newpage(&new_rnode, + MAIN_FORKNUM, + target_block, + new_page, + true); + PageSetLSN(new_page, new_lsn); + } + + /* + * Release old and new buffer. + */ + if (BufferIsValid(old_buffer)) + UnlockReleaseBuffer(old_buffer); + + if (BufferIsValid(new_buffer)) + { + /* Mark the new buffer dirty. 
*/ + MarkBufferDirty(new_buffer); + UnlockReleaseBuffer(new_buffer); + } + + return is_finished; +} + +static void +polar_flashback_rebuild_pages(flshbak_tbl_shr_st_t shared_state, Snapshot snapshot, + bool is_parallel) +{ + Relation old_heap; + Buffer old_buffer = InvalidBuffer; + /* New relation */ + Relation new_heap; + Buffer new_buffer = InvalidBuffer; + flog_reader_state *reader = NULL; + + old_heap = relation_open(shared_state->old_relid, AccessShareLock); + new_heap = relation_open(shared_state->new_relid, AccessShareLock); + + FLOG_ALLOC_PAGE_READER(reader, flog_instance->buf_ctl, ERROR); + + for (;;) + { + BlockNumber target_block = InvalidBlockNumber; + + CHECK_FOR_INTERRUPTS(); + + /* Init the buffer */ + old_buffer = InvalidBuffer; + new_buffer = InvalidBuffer; + + target_block = pg_atomic_fetch_add_u32(&(shared_state->next_blkno), 1); + LOG_FLSHBAK_TBL_SHR_STATE(shared_state, target_block, DEBUG2); + + /* + * Can't process the target block exceed than shared_state->nblocks + * in the parallel mode because we can't extend relation in the parallel + * mode. 
+ */ + if (target_block < shared_state->nblocks) + old_buffer = ReadBuffer(old_heap, target_block); + else if (!is_parallel) + target_block = P_NEW; + else + break; + + /* No need to read from disk, just zero and cleanup the buffer */ + new_buffer = ReadBufferExtended(new_heap, MAIN_FORKNUM, target_block, RBM_ZERO_AND_LOCK, NULL); + + if (polar_flashback_rebuild_page(shared_state, snapshot, old_buffer, new_buffer, reader)) + break; + } + + /* Close the heap */ + heap_close(old_heap, AccessShareLock); + heap_close(new_heap, AccessShareLock); + + /* Free the flashback log reader */ + polar_flog_reader_free(reader); +} + +static void +bulk_extend_new_rel(Relation rel, BlockNumber nblocks) +{ +#define MAX_EXTEND_NBLOCKS_ONCE (512) + uint32 extend_nblks = 1; + BlockNumber start_blk = 0; + char *zero_blocks; + + /* 4 MB (512 blocks) once in shared storage mode */ + if (polar_enable_shared_storage_mode) + zero_blocks = palloc0(BLCKSZ * MAX_EXTEND_NBLOCKS_ONCE); + else + zero_blocks = palloc0(BLCKSZ); + + RelationOpenSmgr(rel); + + /* 4 MB (512 blocks) once */ + while (nblocks > 0) + { + if (polar_enable_shared_storage_mode) + { + extend_nblks = Min(MAX_EXTEND_NBLOCKS_ONCE, nblocks); + smgrextendbatch(rel->rd_smgr, MAIN_FORKNUM, start_blk, extend_nblks, zero_blocks, false); + } + else + smgrextend(rel->rd_smgr, MAIN_FORKNUM, start_blk, zero_blocks, false); + + start_blk = start_blk + extend_nblks; + nblocks = nblocks - extend_nblks; + } + + pfree(zero_blocks); +} + +static void +polar_flashback_rebuild_relation(flshbak_tbl_st_t state) +{ +#define NEW_HEAP_NAME_PREFIX "polar_flashback" + Oid table_space; + char relpersistence; + Oid old_relid; + Relation old_rel; + Oid new_relid; + Relation new_rel; + char old_relname[NAMEDATALEN]; + char new_relname[NAMEDATALEN]; + bool can_parallel = true; + BlockNumber nblocks = state->shared_state->nblocks; + + /* Get info for old relation */ + old_relid = state->shared_state->old_relid; + old_rel = relation_open(old_relid, 
AccessShareLock); + table_space = old_rel->rd_rel->reltablespace; + relpersistence = old_rel->rd_rel->relpersistence; + strlcpy(old_relname, RelationGetRelationName(old_rel), NAMEDATALEN); /* copy the name before dropping the relcache reference */ + relation_close(old_rel, AccessShareLock); + + /* + * Create the transient table that will receive the flashback data. + * perform_flashback_table already holds a lock on the old relation, + * so we just pass NoLock. + */ + snprintf(new_relname, sizeof(new_relname), "%s_%u", NEW_HEAP_NAME_PREFIX, old_relid); + new_relid = polar_make_new_heap(old_relid, table_space, relpersistence, NoLock, new_relname); + state->shared_state->new_relid = new_relid; + + new_rel = relation_open(new_relid, AccessExclusiveLock); + + /* + * We can't process it in parallel mode when the relation size is zero. + */ + if (nblocks == 0) + can_parallel = false; + else + bulk_extend_new_rel(new_rel, nblocks); + + /* Do the work in the parallel mode */ + if (can_parallel) + { + int nworkers_launched = 0; + + nworkers_launched = polar_launch_flashback_pages_workers(state); + + /* The parallel work is done, update the next_blkno to nblocks */ + if (nworkers_launched) + pg_atomic_write_u32(&(state->shared_state->next_blkno), state->shared_state->nblocks); + } + + /* Rebuild the remain pages */ + polar_flashback_rebuild_pages(state->shared_state, state->snapshot, false); + + /* Make the update visible */ + CommandCounterIncrement(); + relation_close(new_rel, AccessExclusiveLock); + + elog(NOTICE, "Flashback the relation %s to new relation %s, " + "please check the data", old_relname, new_relname); +} + +/* POLAR: Flashback a table */ +static void +perform_flashback_table(const RangeVar *relation, TimestampTz target_time) +{ + Oid table_oid; + Relation old_rel; + flshbak_tbl_st_t state; + /* lock mode is just AccessShareLock now */ + LOCKMODE lock_mode = AccessShareLock; + MemoryContext flashback_table_context; + + /* Check for user-requested abort. 
*/ + CHECK_FOR_INTERRUPTS(); + + /* + * Create special memory context for cross-transaction storage. + * + * Since it is a child of PortalContext, it will go away even in case + * of error. + */ + flashback_table_context = AllocSetContextCreate(PortalContext, + "Flashback table", ALLOCSET_DEFAULT_SIZES); + + /* Check for user-requested abort. */ + CHECK_FOR_INTERRUPTS(); + + /* Find, lock, and check permissions and relation kind on the table. */ + table_oid = RangeVarGetRelidExtended(relation, lock_mode, 0, rangevar_callback_for_flashback, NULL); + Assert(OidIsValid(table_oid)); + + /* + * Now we just open the relation with AccessShareLock, but when we want to + * swap the relation file, must hold the AccessExclusiveLock and call CheckTableNotInUse + * and TransferPredicateLocksToHeapRelation. + */ + old_rel = relation_open(table_oid, lock_mode); + + /* Construct the flashback table state */ + state = construct_flashback_table_state(flashback_table_context, old_rel, target_time); + + /* polar_flashback_rebuild_relation does all the dirty work */ + polar_flashback_rebuild_relation(state); + + /* Release the reference taken by relation_open() above in this function */ + relation_close(old_rel, lock_mode); + + /* Clean up working storage */ + MemoryContextDelete(flashback_table_context); +} + +/* POLAR: flashback table statement main function. 
*/ +void +polar_exec_flashback_table_stmt(PolarFlashbackTableStmt *stmt) +{ + TimestampTz target_time; + + /* Check the GUCs */ + if (!fra_instance) + { + /*no cover line*/ + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Don't support to flashback table when " + "polar_enable_fast_recovery_area is off"))); + } + + /* Check the flashback log state */ + if (!polar_is_flog_ready(flog_instance)) + { + /*no cover line*/ + ereport(ERROR, (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), + errmsg("Don't support to flashback table when flashback log is not ready, " + "just wait a moment"))); + } + + /* Get the flashback target time and check */ + target_time = get_flashback_target_time((Expr *) stmt->time_expr); + check_flashback_time(target_time); + + /* Rock and roll */ + perform_flashback_table(stmt->relation, target_time); +} + +/* + * POLAR: Perform work within a launched parallel process. + * + * Parallel flashback workers perform to flashback a block. + */ +void +polar_flashback_pages_woker_main(dsm_segment *seg, shm_toc *toc) +{ + char *sharedquery; + flshbak_tbl_shr_st_t shared_state; + BufferUsage *buffer_usage; + Snapshot snapshot; + SnapshotData local_snapshot; + + shared_state = (flshbak_tbl_shr_st_t) shm_toc_lookup(toc, FLASHBACK_KEY_STATE, false); + snapshot = (Snapshot) shm_toc_lookup(toc, FLASHBACK_KEY_SNAPSHOT, false); + + /* Copy snapshot to local */ + memcpy(&local_snapshot, snapshot, sizeof(SnapshotData)); + local_snapshot.xip = local_snapshot.subxip = NULL; + + if (snapshot->xcnt) + { + local_snapshot.xip = (TransactionId *) shm_toc_lookup(toc, FLASHBACK_KEY_XIP, false); + Assert(local_snapshot.xip); + } + + if (snapshot->subxcnt) + { + local_snapshot.subxip = (TransactionId *) shm_toc_lookup(toc, FLASHBACK_KEY_SUBXIP, false); + Assert(local_snapshot.subxip); + } + + log_flashback_snapshot(&local_snapshot, DEBUG2); + + /* Report the query string */ + sharedquery = (char *) shm_toc_lookup(toc, FLASHBACK_KEY_QUERY_TEXT, false); + 
debug_query_string = sharedquery; + pgstat_report_activity(STATE_RUNNING, debug_query_string); + + InstrStartParallelQuery(); + polar_flashback_rebuild_pages(shared_state, &local_snapshot, true); + buffer_usage = (BufferUsage *) shm_toc_lookup(toc, FLASHBACK_KEY_BUFFER_USAGE, false); + InstrEndParallelQuery(&buffer_usage[ParallelWorkerNumber]); +} diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 8ae7041d62a..f18492fb233 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -4323,6 +4323,24 @@ pgstat_get_wait_io(WaitEventIO w) case WAIT_EVENT_FLASHBACK_LOG_INSERT: event_name = "PolarFlashbackLogInsert"; break; + case WAIT_EVENT_FLASHBACK_POINT_FILE_WRITE: + event_name = "PolarFlashbackPointFileWrite"; + break; + case WAIT_EVENT_FLASHBACK_POINT_FILE_READ: + event_name = "PolarFlashbackPointFileRead"; + break; + case WAIT_EVENT_FLASHBACK_POINT_FILE_SYNC: + event_name = "PolarFlashbackPointFileSync"; + break; + case WAIT_EVENT_FRA_CTL_FILE_READ: + event_name = "PolarFraCtlFileRead"; + break; + case WAIT_EVENT_FRA_CTL_FILE_WRITE: + event_name = "PolarFraCtlFileWrite"; + break; + case WAIT_EVENT_FRA_CTL_FILE_SYNC: + event_name = "PolarFraCtlFileSync"; + break; /*no cover end*/ /* POLAR end */ /* no default case, so that compiler will warn */ diff --git a/src/backend/replication/walreceiver.c b/src/backend/replication/walreceiver.c index 781529eb94c..fd1f9f66f49 100644 --- a/src/backend/replication/walreceiver.c +++ b/src/backend/replication/walreceiver.c @@ -161,7 +161,8 @@ static void WalRcvShutdownHandler(SIGNAL_ARGS); static void WalRcvQuickDieHandler(SIGNAL_ARGS); /* POLAR: callback function when waiting free space from polar_xlog_queue */ -static void polar_receiver_xlog_queue_callback(void); +static void polar_receiver_xlog_queue_callback(polar_ringbuf_t rbuf); +static void polar_recv_push_storage_begin_callback(polar_ringbuf_t rbuf); static void polar_notify_read_wal_file(int code, Datum arg); /* 
@@ -1151,7 +1152,7 @@ XLogWalRcvProcessMsg(unsigned char type, char *buf, Size len) { Assert(POLAR_LOGINDEX_ENABLE_XLOG_QUEUE()); - polar_xlog_recv_queue_push_storage_begin(polar_logindex_redo_instance->xlog_queue, ProcessWalRcvInterrupts); + polar_xlog_recv_queue_push_storage_begin(polar_logindex_redo_instance->xlog_queue, polar_recv_push_storage_begin_callback); SpinLockAcquire(&WalRcv->mutex); WalRcv->polar_use_xlog_queue = false; @@ -2064,13 +2065,19 @@ pg_stat_get_wal_receiver(PG_FUNCTION_ARGS) * polar_xlog_queue.It will send feedback and handle interrupts */ static void -polar_receiver_xlog_queue_callback(void) +polar_receiver_xlog_queue_callback(polar_ringbuf_t rbuf) { ProcessWalRcvInterrupts(); XLogWalRcvSendReply(false, false); XLogWalRcvSendHSFeedback(false); } +static inline void +polar_recv_push_storage_begin_callback(polar_ringbuf_t rbuf) +{ + ProcessWalRcvInterrupts(); +} + static void polar_notify_read_wal_file(int code, Datum arg) { @@ -2080,7 +2087,7 @@ polar_notify_read_wal_file(int code, Datum arg) if (!got_SIGTERM && polar_in_replica_mode() && POLAR_LOGINDEX_ENABLE_XLOG_QUEUE()) { elog(LOG, "polar replica exit wal receiver and request to read from WAL file"); - polar_xlog_recv_queue_push_storage_begin(polar_logindex_redo_instance->xlog_queue, ProcessWalRcvInterrupts); + polar_xlog_recv_queue_push_storage_begin(polar_logindex_redo_instance->xlog_queue, polar_recv_push_storage_begin_callback); } } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index b311f2a79f1..387bce17cfc 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -55,7 +55,6 @@ /* POLAR */ #include "access/polar_logindex_redo.h" #include "polar_flashback/polar_flashback_log.h" -#include "polar_flashback/polar_flashback_log_repair_page.h" #include "replication/walreceiver.h" #include "storage/checksum.h" #include "storage/polar_fd.h" diff --git a/src/backend/storage/buffer/polar_bufmgr.c 
b/src/backend/storage/buffer/polar_bufmgr.c index 2f73e98c40d..0b6082c0a87 100644 --- a/src/backend/storage/buffer/polar_bufmgr.c +++ b/src/backend/storage/buffer/polar_bufmgr.c @@ -29,7 +29,6 @@ #include "access/polar_logindex_redo.h" #include "access/xlog.h" #include "polar_flashback/polar_flashback_log.h" -#include "polar_flashback/polar_flashback_log_repair_page.h" #include "postmaster/polar_parallel_bgwriter.h" #include "storage/polar_bufmgr.h" #include "storage/polar_copybuf.h" diff --git a/src/backend/storage/buffer/polar_copybuf.c b/src/backend/storage/buffer/polar_copybuf.c index ddae80b10e8..06460110a6b 100644 --- a/src/backend/storage/buffer/polar_copybuf.c +++ b/src/backend/storage/buffer/polar_copybuf.c @@ -23,6 +23,8 @@ * *------------------------------------------------------------------------- */ +#include "postgres.h" + #include "polar_flashback/polar_flashback_log.h" #include "storage/polar_copybuf.h" #include "storage/polar_bufmgr.h" diff --git a/src/backend/storage/file/polar_fd.c b/src/backend/storage/file/polar_fd.c index 9e9e2ed5d0d..e2b533bffea 100644 --- a/src/backend/storage/file/polar_fd.c +++ b/src/backend/storage/file/polar_fd.c @@ -954,3 +954,11 @@ assign_polar_datadir(const char *newval, void *extra) if (strncmp(POLAR_VFS_PROTOCAL_LOCAL_DIO, newval, strlen(POLAR_VFS_PROTOCAL_LOCAL_DIO)) == 0) polar_enable_buffer_alignment = true; } + +inline bool +polar_file_exists(const char *path) +{ + struct stat st; + + return (polar_stat(path, &st) == 0) && S_ISREG(st.st_mode); +} diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c index d99542e7bcf..60b6a382a4e 100755 --- a/src/backend/storage/ipc/ipci.c +++ b/src/backend/storage/ipc/ipci.c @@ -57,7 +57,7 @@ #include "common/file_perm.h" #include "executor/nodeShareInputScan.h" #include "polar_datamax/polar_datamax.h" -#include "polar_flashback/polar_flashback_log.h" +#include "polar_flashback/polar_flashback.h" #include "postmaster/polar_parallel_bgwriter.h" #include 
"replication/polar_cluster_info.h" #include "storage/polar_shmem.h" @@ -218,7 +218,7 @@ CreateSharedMemoryAndSemaphores(int port) /* POLAR end */ /* POLAR: add shared memory size for flashback log */ - size = add_size(size, polar_flog_shmem_size()); + size = add_size(size, polar_flashback_shmem_size()); /* POLAR end */ /* POLAR: add shared memory size for flashback log */ @@ -356,7 +356,7 @@ CreateSharedMemoryAndSemaphores(int port) /* POLAR end */ /* POLAR: init shared memory for flashback log */ - polar_flog_shmem_init(); + polar_flashback_shmem_init(); /* POLAR end */ /* POLAR: init cluster info share memory struct */ diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index afd835c7dc3..e7ac9b89999 100755 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -2685,7 +2685,7 @@ ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc) * If both field valid, we should not add it to running xids list */ RunningTransactions -GetRunningTransactionData(void) +PolarGetRunningTransactionData(bool ignore_subxid) { /* result workspace */ static RunningTransactionsData CurrentRunningXactsData; @@ -2854,7 +2854,7 @@ GetRunningTransactionData(void) * Spin over procArray collecting all subxids, but only if there hasn't * been a suboverflow. */ - if (!suboverflowed) + if (!suboverflowed && !ignore_subxid) { for (index = 0; index < arrayP->numProcs; index++) { @@ -5423,4 +5423,31 @@ polar_get_latestObservedXid(void) return latestObservedXid; } +/* + * POLAR: Just get the running top transactions (without sub transactions). + * + * Now it is used by flashback. 
+ */ +RunningTransactions +polar_get_running_top_trans(void) +{ + RunningTransactions trans; + + /* Ignore the sub transactions */ + trans = PolarGetRunningTransactionData(true); + + if (polar_csn_enable) + { + LWLockRelease(ProcArrayLock); + LWLockRelease(XidGenLock); + LWLockRelease(CommitSeqNoLock); + } + else + { + LWLockRelease(ProcArrayLock); + LWLockRelease(XidGenLock); + } + return trans; +} + /* POLAR test end */ diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c index fac9512dabc..9c79cf7ee84 100644 --- a/src/backend/storage/page/bufpage.c +++ b/src/backend/storage/page/bufpage.c @@ -25,7 +25,7 @@ /* POLAR */ #include "utils/guc.h" -#include "polar_flashback/polar_flashback_log_repair_page.h" +#include "polar_flashback/polar_flashback_log.h" /* POLAR end */ /* GUC variable */ diff --git a/src/backend/tcop/utility.c b/src/backend/tcop/utility.c index ac5096b92f7..87c2f5dc209 100644 --- a/src/backend/tcop/utility.c +++ b/src/backend/tcop/utility.c @@ -74,6 +74,7 @@ /* POLAR */ #include "storage/proc.h" #include "polar_dma/polar_dma.h" +#include "polar_flashback/polar_flashback_table.h" /* POLAR px */ #include "access/px_btbuild.h" @@ -1043,6 +1044,16 @@ standard_ProcessUtility(PlannedStmt *pstmt, break; } + /* POLAR: Flashback table */ + case T_PolarFlashbackTableStmt: + { + /* not allow this during "read only" transactions */ + PreventCommandDuringRecovery("FLASHBACK TABLE"); + PreventInTransactionBlock(isTopLevel, "FLASHBACK TABLE"); + polar_exec_flashback_table_stmt((PolarFlashbackTableStmt *) parsetree); + break; + } + default: /* All other statement types have event trigger support */ ProcessUtilitySlow(pstate, pstmt, queryString, @@ -3062,6 +3073,11 @@ CreateCommandTag(Node *parsetree) } break; + /* POLAR: Flashback table */ + case T_PolarFlashbackTableStmt: + tag = "FLASHBACK TABLE"; + break; + default: elog(WARNING, "unrecognized node type: %d", (int) nodeTag(parsetree)); @@ -3584,6 +3600,11 @@ 
GetCommandLogLevel(Node *parsetree) } break; + /* POLAR: Flashback table */ + case T_PolarFlashbackTableStmt: + lev = LOGSTMT_DDL; + break; + default: elog(WARNING, "unrecognized node type: %d", (int) nodeTag(parsetree)); diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c index ba58e802170..2d75b75df81 100644 --- a/src/backend/utils/cache/relcache.c +++ b/src/backend/utils/cache/relcache.c @@ -95,6 +95,7 @@ /* POLAR */ #include "utils/guc.h" +#include "polar_flashback/polar_flashback_rel_filenode.h" #define RELCACHE_INIT_FILEMAGIC 0x573266 /* version ID value */ @@ -3443,6 +3444,10 @@ RelationSetNewRelfilenode(Relation relation, char persistence, CatalogTupleUpdate(pg_class, &tuple->t_self, tuple); } + /* POLAR: Log the relation file node update after set a new relfilenode */ + polar_flog_filenode_update(flog_instance, fra_instance, relation->rd_id, + newrelfilenode, InvalidOid, false, false); + heap_freetuple(tuple); heap_close(pg_class, RowExclusiveLock); @@ -6262,4 +6267,4 @@ polar_check_nblocks_consistent(Relation rel) if (cache_blkno != real_blkno) elog(LOG, "relation \"%s\" nblocks is not consistent, spcNode:%d, dbNode:%d, relNode:%d, cache is %d, real is %d", RelationGetRelationName(rel), rnode.spcNode, rnode.dbNode, rnode.relNode, cache_blkno, real_blkno); -} \ No newline at end of file +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 846cd6fc72e..0cb1ca981ce 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -105,6 +105,7 @@ #include "commands/tablecmds.h" #include "common/username.h" #include "polar_flashback/polar_flashback_log.h" +#include "polar_flashback/polar_flashback_table.h" #include "replication/polar_priority_replication.h" #include "storage/predicate.h" #include "storage/polar_fd.h" @@ -335,6 +336,7 @@ static void polar_assign_crash_recovery_rto_delay_time(const int newval, void *e static bool polar_check_enable_lazy_checkpoint(bool *newval, void 
**extra, GucSource source); static bool polar_check_enable_full_page_writes(bool *newval, void **extra, GucSource source); +static bool polar_check_enable_fra(bool *newval, void **extra, GucSource source); /* * Options for enum values defined in this module. @@ -3676,6 +3678,17 @@ static struct config_bool ConfigureNamesBool[] = NULL, NULL, NULL }, + { + {"polar_enable_fast_recovery_area", PGC_POSTMASTER, UNGROUPED, + gettext_noop("Enable fast recovery area for flashback"), + gettext_noop("Please make sure polar_enable_flashback_log=on to enable fast recovery area"), + GUC_NO_SHOW_ALL | GUC_NO_RESET_ALL + }, + &polar_enable_fast_recovery_area, + false, + polar_check_enable_fra, NULL, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, false, NULL, NULL, NULL @@ -6330,7 +6343,7 @@ static struct config_int ConfigureNamesInt[] = NULL }, &polar_flashback_point_segments, - 20, 1, INT_MAX, + 16, 1, INT_MAX, NULL, NULL, NULL }, @@ -6341,7 +6354,29 @@ static struct config_int ConfigureNamesInt[] = GUC_UNIT_S }, &polar_flashback_point_timeout, - 300, 1, 86400, + 300, 300, 86400, + NULL, NULL, NULL + }, + + { + {"polar_fast_recovery_area_rotation", PGC_SIGHUP, UNGROUPED, + gettext_noop("Sets the maximum time to keep fast recovery area " + "include clog, xact snapshot and flashback point files."), + NULL, + GUC_UNIT_MIN + }, + &polar_fast_recovery_area_rotation, + 180, 0, 14400, + NULL, NULL, NULL + }, + + { + {"polar_workers_per_flashback_table", PGC_USERSET, UNGROUPED, + gettext_noop("Set the max number of the background workers for each flashback table operation."), + NULL + }, + &polar_workers_per_flashback_table, + 5, 0, 64, NULL, NULL, NULL }, @@ -15541,6 +15576,21 @@ polar_assign_max_normal_backends(double newval, void *extra) MaxNormalBackends = (int)floor(newval * MaxConnections); } +static bool +polar_check_enable_fra(bool *newval, void **extra, GucSource source) +{ + + if (IsUnderPostmaster && !(IsInParallelMode() || IsParallelWorker()) && + 
source != PGC_S_DEFAULT && *newval && !polar_enable_flashback_log) + { + /*no cover begin*/ + GUC_check_errdetail("Cannot enable parameter polar_enable_fast_recovery_area when \"polar_enable_flashback_log\" is false."); + return false; + /*no cover end*/ + } + + return true; +} /* POLAR end */ #include "guc-file.c" diff --git a/src/bin/polar_tools/Makefile b/src/bin/polar_tools/Makefile index 61a856ba33f..0ca488f5b98 100644 --- a/src/bin/polar_tools/Makefile +++ b/src/bin/polar_tools/Makefile @@ -27,7 +27,10 @@ OBJS = polar_tools.o \ datamax_get_wal_from_backup.o \ xlogreader.o \ flashback_log_control_dump.o \ - flashback_log_file_dump.o + flashback_log_file_dump.o \ + flashback_point_file_dump.o \ + flashback_snapshot_dump.o \ + fra_control_dump.o CPPFLAGS_XLOGREADER := $(CPPFLAGS) -DFRONTEND diff --git a/src/bin/polar_tools/flashback_log_control_dump.c b/src/bin/polar_tools/flashback_log_control_dump.c index a8295930d9d..0ce0a43f019 100644 --- a/src/bin/polar_tools/flashback_log_control_dump.c +++ b/src/bin/polar_tools/flashback_log_control_dump.c @@ -22,6 +22,8 @@ */ #include +#include "postgres.h" + #include "polar_tools.h" #include "polar_flashback/polar_flashback_log_file.h" @@ -37,7 +39,7 @@ static void usage(void) { printf("Dump flashback log control file usage:\n"); - printf("-f, --file_path Specify dma log file path\n"); + printf("-f, --file_path Specify flashback log control file path\n"); printf("-?, --help show this help, then exit\n"); } @@ -107,7 +109,7 @@ flashback_log_control_dump_main(int argc, char **argv) printf(_("The flashback log record write result point in the last flashback point end: %X/%X\n"), (uint32)(flashback_log_ctl_file.fbpoint_info.flog_end_ptr >> 32), (uint32) flashback_log_ctl_file.fbpoint_info.flog_end_ptr); - printf(_("The previous flashback log record start point in the last flashback point end: %X/%X\n"), + printf(_("The last flashback log record start point before shutdown: %X/%X\n"), 
(uint32)(flashback_log_ctl_file.fbpoint_info.flog_end_ptr_prev >> 32), (uint32) flashback_log_ctl_file.fbpoint_info.flog_end_ptr_prev); printf(_("The current flashback point WAL lsn: %X/%X\n"), diff --git a/src/bin/polar_tools/flashback_log_file_dump.c b/src/bin/polar_tools/flashback_log_file_dump.c index f8ea4b542a1..216d57a8477 100644 --- a/src/bin/polar_tools/flashback_log_file_dump.c +++ b/src/bin/polar_tools/flashback_log_file_dump.c @@ -20,6 +20,7 @@ *------------------------------------------------------------------------- */ #include +#include #include #include #include @@ -29,11 +30,13 @@ #include "access/xlogdefs.h" #include "access/transam.h" #include "common/pg_lzcompress.h" +#include "fe_utils/timestamp.h" #include "polar_flashback/polar_flashback_log_file.h" #include "polar_flashback/polar_flashback_log_reader.h" #include "polar_flashback/polar_flashback_log_record.h" +#include "polar_flashback/polar_flashback_rel_filenode.h" #include "polar_tools.h" - +#include "utils/datetime.h" /*no cover begin*/ /* checksum_impl.h uses Assert, which doesn't work outside the server */ #undef Assert @@ -49,8 +52,6 @@ static int segment_size; static bool just_version = false; static bool just_one_record_without_check = false; -#define FLASHBACK_LOG_REC_TYPES (1) - /* load the flashback log switch ptrs */ #define load_switch_ptrs(dir, ptrs) \ (ptrs != NIL? ptrs : flog_read_history_file(dir)) @@ -85,14 +86,10 @@ typedef union pg_aligned_flashback_log_blk int64 force_align_i64; } pg_aligned_flashback_log_blk; -const char *flashback_log_record_types[FLASHBACK_LOG_REC_TYPES + 1] = -{ - "original_page", - NULL -}; - static List *switch_ptr_list = NIL; +const char *flog_record_types[FLOG_REC_TYPES + 1] = FLOG_RECORD_TYPES; + static void fatal_error(const char *fmt, ...) pg_attribute_printf(1, 2); static void report_invalid_flog_record(flog_reader_state *state, const char *fmt, ...) 
pg_attribute_printf(2, 3); @@ -121,8 +118,8 @@ print_type_list(void) { int i; - for (i = 0; i < FLASHBACK_LOG_REC_TYPES; i++) - printf("%s\n", flashback_log_record_types[i]); + for (i = 0; i < FLOG_REC_TYPES; i++) + printf("%s\n", flog_record_types[i]); } /* @@ -536,7 +533,7 @@ flog_pos2ptr(uint64 bytepos) seg_offset += fullpages * POLAR_FLOG_BLCKSZ + bytesleft + FLOG_SHORT_PHD_SIZE; } - flog_seg_offset_to_ptr(fullsegs, seg_offset, POLAR_FLOG_SEG_SIZE, result); + FLOG_SEG_OFFSET_TO_PTR(fullsegs, seg_offset, POLAR_FLOG_SEG_SIZE, result); return result; } @@ -551,7 +548,7 @@ flog_ptr2pos(polar_flog_rec_ptr ptr) uint32 offset; uint64 result; - fullsegs = flog_ptr_to_seg(ptr, POLAR_FLOG_SEG_SIZE); + fullsegs = FLOG_PTR_TO_SEG(ptr, POLAR_FLOG_SEG_SIZE); fullpages = (FLOG_SEGMENT_OFFSET(ptr, POLAR_FLOG_SEG_SIZE)) / POLAR_FLOG_BLCKSZ; offset = ptr % POLAR_FLOG_BLCKSZ; @@ -725,16 +722,16 @@ flog_page_header_validate(flog_reader_state *state, Assert((recptr % POLAR_FLOG_BLCKSZ) == 0); - segno = flog_ptr_to_seg(recptr, state->segment_size); + segno = FLOG_PTR_TO_SEG(recptr, state->segment_size); offset = FLOG_SEGMENT_OFFSET(recptr, state->segment_size); - flog_seg_offset_to_ptr(segno, offset, state->segment_size, recaddr); + FLOG_SEG_OFFSET_TO_PTR(segno, offset, state->segment_size, recaddr); if (hdr->xlp_magic != FLOG_PAGE_MAGIC) { char fname[FLOG_MAX_FNAME_LEN]; - get_flog_fname(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); report_invalid_flog_record(state, "invalid magic number %04X in flashback log segment %s, offset %u", @@ -747,7 +744,7 @@ flog_page_header_validate(flog_reader_state *state, if (hdr->xlp_version < FLOG_PAGE_VERSION) { char fname[FLOG_MAX_FNAME_LEN]; - get_flog_fname(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); report_invalid_flog_record(state, "invalid version %04X in flashback 
log segment %s, offset %u", hdr->xlp_version, @@ -760,7 +757,7 @@ flog_page_header_validate(flog_reader_state *state, { char fname[FLOG_MAX_FNAME_LEN]; - get_flog_fname(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); report_invalid_flog_record(state, "invalid info bits %04X in flashback log segment %s, offset %u", @@ -814,7 +811,7 @@ flog_page_header_validate(flog_reader_state *state, { char fname[FLOG_MAX_FNAME_LEN]; - get_flog_fname(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); /* hmm, first page of file doesn't have a long header? */ report_invalid_flog_record(state, @@ -834,7 +831,7 @@ flog_page_header_validate(flog_reader_state *state, { char fname[FLOG_MAX_FNAME_LEN]; - get_flog_fname(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, segno, state->segment_size, FLOG_DEFAULT_TIMELINE); report_invalid_flog_record(state, "unexpected pageaddr %X/%X in flashback log segment %s, offset %u", @@ -934,7 +931,7 @@ read_flog_page_internal(flog_reader_state *state, Assert((pageptr % POLAR_FLOG_BLCKSZ) == 0); - targetSegNo = flog_ptr_to_seg(pageptr, state->segment_size); + targetSegNo = FLOG_PTR_TO_SEG(pageptr, state->segment_size); targetPageOff = FLOG_SEGMENT_OFFSET(pageptr, state->segment_size); @@ -1481,35 +1478,6 @@ find_first_flog_rec(flog_reader_state *state, return found; } -/* - * Split a pathname as dirname(1) and basename(1) would. - * - * XXX this probably doesn't do very well on Windows. We probably need to - * apply canonicalize_path(), at the very least. 
- */ -static void -split_path(const char *path, char **dir, char **fname) -{ - char *sep; - - /* split filepath into directory & filename */ - sep = strrchr(path, '/'); - - /* directory path */ - if (sep != NULL) - { - *dir = pg_strdup(path); - (*dir)[(sep - path) + 1] = '\0'; /* no strndup */ - *fname = pg_strdup(sep + 1); - } - /* local directory */ - else - { - *dir = NULL; - *fname = pg_strdup(path); - } -} - /* * Open the file in the valid target directory. * @@ -1560,7 +1528,7 @@ search_directory(const char *directory, const char *fname) while ((xlde = readdir(xldir)) != NULL) { - if (is_flashback_log_file(xlde->d_name)) + if (FLOG_IS_LOG_FILE(xlde->d_name)) { fd = open_file_in_directory(directory, xlde->d_name); fname = xlde->d_name; @@ -1716,7 +1684,7 @@ flog_dump_read(const char *directory, polar_flog_rec_ptr startptr, startoff = FLOG_SEGMENT_OFFSET(recptr, segment_size); - if (sendFile < 0 || !ptr_in_flog_seg(recptr, sendSegNo, segment_size)) + if (sendFile < 0 || !FLOG_PTR_IN_SEG(recptr, sendSegNo, segment_size)) { char fname[FLOG_MAX_FNAME_LEN]; int tries; @@ -1725,9 +1693,9 @@ flog_dump_read(const char *directory, polar_flog_rec_ptr startptr, if (sendFile >= 0) close(sendFile); - sendSegNo = flog_ptr_to_seg(recptr, segment_size); + sendSegNo = FLOG_PTR_TO_SEG(recptr, segment_size); - get_flog_fname(fname, sendSegNo, segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, sendSegNo, segment_size, FLOG_DEFAULT_TIMELINE); /* * In follow mode there is a short period of time after the server @@ -1773,7 +1741,7 @@ flog_dump_read(const char *directory, polar_flog_rec_ptr startptr, int err = errno; char fname[MAXPGPATH]; - get_flog_fname(fname, sendSegNo, segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, sendSegNo, segment_size, FLOG_DEFAULT_TIMELINE); fatal_error("could not seek in log file %s to offset %u: %s", fname, startoff, strerror(err)); @@ -1795,7 +1763,7 @@ flog_dump_read(const char *directory, polar_flog_rec_ptr startptr, int err = 
errno; char fname[MAXPGPATH]; - get_flog_fname(fname, sendSegNo, segment_size, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(fname, sendSegNo, segment_size, FLOG_DEFAULT_TIMELINE); fatal_error("could not read from log file %s, offset %u, length %d: %s", fname, sendOff, segbytes, strerror(err)); @@ -1845,8 +1813,8 @@ get_flog_rec_type(flog_record *record, polar_flog_rec_ptr lsn) { RmgrId xl_rmgr = record->xl_rmid; - if (xl_rmgr < FLASHBACK_LOG_REC_TYPES) - return flashback_log_record_types[xl_rmgr]; + if (xl_rmgr < FLOG_REC_TYPES) + return flog_record_types[xl_rmgr]; else fatal_error("The type of the record %X/%08X is wrong, \n", (uint32)(lsn >> 32), (uint32)lsn); @@ -1854,6 +1822,23 @@ get_flog_rec_type(flog_record *record, polar_flog_rec_ptr lsn) return NULL; } +static void +print_rel_filenode_rec(flog_record *record) +{ + fl_filenode_rec_data_t *filenode_rec; + bool can_be_flashback; + + can_be_flashback = (record->xl_info & REL_FILENODE_TYPE_MASK) & REL_CAN_FLASHBACK; + filenode_rec = FL_GET_FILENODE_REC_DATA(record); + printf("The previous relation filenode is [%u, %u, %u], " + "the current relation filenode is [%u, %u, %u], the time is %s, " + "can%s flashback the relation to past", + filenode_rec->old_filenode.spcNode, filenode_rec->old_filenode.dbNode, + filenode_rec->old_filenode.relNode, filenode_rec->new_filenode.spcNode, + filenode_rec->new_filenode.dbNode, filenode_rec->new_filenode.relNode, + timestamptz_to_str(filenode_rec->time), (can_be_flashback)? 
"":"'t"); +} + /* * Print a record to stdout */ @@ -1883,7 +1868,7 @@ flog_display_rec(dump_config *config, flog_record *record, rec_len = record->xl_tot_len; rmid = record->xl_rmid; record_data = (char *)record; - printf("type: %16s total_len: %6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", + printf("type: %24s total_len: %6u, tx: %10u, lsn: %X/%08X, prev %X/%08X, ", get_flog_rec_type(record, lsn), rec_len, record->xl_xid, (uint32)(lsn >> 32), (uint32) lsn, (uint32)(xl_prev >> 32), (uint32) xl_prev); @@ -1898,6 +1883,13 @@ flog_display_rec(dump_config *config, flog_record *record, from_origin_buf = info & FROM_ORIGIN_BUF; printf("desc: %s, ", type == ORIGIN_PAGE_EMPTY ? "empty page" : "full page"); } + else if (rmid == REL_FILENODE_ID) + { + printf("desc: change relation file node, "); + } + else + fatal_error("invalid rmid %d for the record at %X/%X", rmid, + (uint32)(lsn >> 32), (uint32) lsn); if (!config->bkp_details) { @@ -1906,6 +1898,8 @@ flog_display_rec(dump_config *config, flog_record *record, tag.rnode.spcNode, tag.rnode.dbNode, tag.rnode.relNode, forkNames[tag.forkNum], tag.blockNum, (uint32)(redo_lsn >> 32), (uint32) redo_lsn); + else if (rmid == REL_FILENODE_ID) + print_rel_filenode_rec(record); putchar('\n'); } @@ -2004,12 +1998,10 @@ flog_display_rec(dump_config *config, flog_record *record, else fatal_error("invalid xl_info %d for the record at %X/%X", info, (uint32)(lsn >> 32), (uint32) lsn); - - putchar('\n'); } - else - fatal_error("invalid rmid %d for the record at %X/%X", rmid, - (uint32)(lsn >> 32), (uint32) lsn); + else if (rmid == REL_FILENODE_ID) + print_rel_filenode_rec(record); + putchar('\n'); } } @@ -2152,16 +2144,16 @@ flashback_log_file_dump_main(int argc, char **argv) exit(EXIT_SUCCESS); } - for (i = 0; i < FLASHBACK_LOG_REC_TYPES; i++) + for (i = 0; i < FLOG_REC_TYPES; i++) { - if (pg_strcasecmp(optarg, flashback_log_record_types[i]) == 0) + if (pg_strcasecmp(optarg, flog_record_types[i]) == 0) { config.filter_by_type = i; break; } } 
- if (config.filter_by_type > FLASHBACK_LOG_REC_TYPES) + if (config.filter_by_type > FLOG_REC_TYPES) { fprintf(stderr, _("%s: type \"%s\" does not exist\n"), progname, optarg); @@ -2257,7 +2249,8 @@ flashback_log_file_dump_main(int argc, char **argv) int fd; uint64 segno; - split_path(argv[optind], &directory, &fname); + fname = basename(argv[optind]); + directory = dirname(argv[optind]); if (private.inpath == NULL && directory != NULL) { @@ -2277,16 +2270,16 @@ flashback_log_file_dump_main(int argc, char **argv) close(fd); /* parse position from file */ - get_flog_seg_from_fname(fname, &segno, segment_size); + FLOG_GET_SEG_FROM_FNAME(fname, &segno, segment_size); if (private.startptr == POLAR_INVALID_FLOG_REC_PTR) { - flog_seg_offset_to_ptr(segno, 0, + FLOG_SEG_OFFSET_TO_PTR(segno, 0, segment_size, private.startptr); private.startptr = private.startptr == POLAR_INVALID_FLOG_REC_PTR ? FLOG_LONG_PHD_SIZE : private.startptr; } - else if (!ptr_in_flog_seg(private.startptr, segno, segment_size)) + else if (!FLOG_PTR_IN_SEG(private.startptr, segno, segment_size)) { fprintf(stderr, _("%s: start flashback log location " @@ -2300,7 +2293,7 @@ flashback_log_file_dump_main(int argc, char **argv) /* no second file specified, set end position */ if (!(optind + 1 < argc) && private.endptr == POLAR_INVALID_FLOG_REC_PTR) - flog_seg_offset_to_ptr(segno + 1, 0, + FLOG_SEG_OFFSET_TO_PTR(segno + 1, 0, segment_size, private.endptr); /* parse ENDSEG if passed */ @@ -2309,8 +2302,7 @@ flashback_log_file_dump_main(int argc, char **argv) uint64 endsegno; /* ignore directory, already have that */ - split_path(argv[optind + 1], &directory, &fname); - + fname = basename(argv[optind + 1]); fd = open_file_in_directory(private.inpath, fname); if (fd < 0) @@ -2319,21 +2311,21 @@ flashback_log_file_dump_main(int argc, char **argv) close(fd); /* parse position from file */ - get_flog_seg_from_fname(fname, &endsegno, segment_size); + FLOG_GET_SEG_FROM_FNAME(fname, &endsegno, segment_size); if (endsegno 
< segno) fatal_error("ENDSEG %s is before STARTSEG %s", argv[optind + 1], argv[optind]); if (private.endptr == POLAR_INVALID_FLOG_REC_PTR) - flog_seg_offset_to_ptr(endsegno + 1, 0, + FLOG_SEG_OFFSET_TO_PTR(endsegno + 1, 0, segment_size, private.endptr); /* set segno to endsegno for check of --end */ segno = endsegno; } - if (!ptr_in_flog_seg(private.endptr, segno, segment_size) && + if (!FLOG_PTR_IN_SEG(private.endptr, segno, segment_size) && private.endptr != (segno + 1) * segment_size) { fprintf(stderr, @@ -2427,15 +2419,31 @@ flashback_log_file_dump_main(int argc, char **argv) config.filter_by_type != record->xl_rmid) continue; - if (record->xl_rmid == ORIGIN_PAGE_ID && config.rel_file_node.spcNode != 0) + if (config.rel_file_node.spcNode != 0) { - fl_origin_page_rec_data *rec_data; + if (record->xl_rmid == ORIGIN_PAGE_ID) + { + fl_origin_page_rec_data *rec_data; + + rec_data = FL_GET_ORIGIN_PAGE_REC_DATA(record); - rec_data = FL_GET_ORIGIN_PAGE_REC_DATA(record); + if (config.rel_file_node.spcNode != rec_data->tag.rnode.spcNode || + config.rel_file_node.dbNode != rec_data->tag.rnode.dbNode || + config.rel_file_node.relNode != rec_data->tag.rnode.relNode) + continue; + } + else if (record->xl_rmid == REL_FILENODE_ID) + { + fl_filenode_rec_data_t *rec_data; - if (config.rel_file_node.spcNode != rec_data->tag.rnode.spcNode || - config.rel_file_node.dbNode != rec_data->tag.rnode.dbNode || - config.rel_file_node.relNode != rec_data->tag.rnode.relNode) + rec_data = FL_GET_FILENODE_REC_DATA(record); + + if (config.rel_file_node.spcNode != rec_data->new_filenode.spcNode|| + config.rel_file_node.dbNode != rec_data->new_filenode.dbNode || + config.rel_file_node.relNode != rec_data->new_filenode.relNode) + continue; + } + else continue; } diff --git a/src/bin/polar_tools/flashback_point_file_dump.c b/src/bin/polar_tools/flashback_point_file_dump.c new file mode 100644 index 00000000000..6f5a8ecf214 --- /dev/null +++ b/src/bin/polar_tools/flashback_point_file_dump.c @@ -0,0 
+1,268 @@ +/*------------------------------------------------------------------------- + * + * flashback_point_file_dump.c + * + * + * Copyright (c) 2021, Alibaba Group Holding limited + * + * IDENTIFICATION + * src/bin/polar_tools/flashback_point_file_dump.c + * + *------------------------------------------------------------------------- + */ +#include +#include + +#include "postgres.h" + +#include "polar_flashback/polar_fast_recovery_area.h" +#include "polar_flashback/polar_flashback_log_file.h" +#include "polar_tools.h" + +/*no cover begin*/ +static struct option long_options[] = +{ + {"ctl-file-path", required_argument, NULL, 'c'}, + {"file-path", required_argument, NULL, 'f'}, + {"max-rec-no", required_argument, NULL, 'm'}, + {"help", no_argument, NULL, '?'}, + {NULL, 0, NULL, 0} +}; + +static void +usage(void) +{ + printf("Dump flashback point record file usage:\n"); + printf("-c, --ctl-file-path Specify fast recovery area control path\n"); + printf("-f, --file-path Specify flashback point record file path\n"); + printf("-m, --max-rec-no Specify max record number, start from 0 \n"); + printf("-w, --with-snapshot output with snapshot \n"); + printf("-?, --help show this help, then exit\n"); +} + +static void +ckp_file_page_dump(const char *dir, char *start_ptr, uint32 seg_no, uint32 page_no, uint64 max_slot_no, bool with_snapshot) +{ + uint64 slot_no; + fbpoint_rec_data_t *tkp_info; + uint64 max_slot_in_page; + + slot_no = page_no * FBPOINT_REC_PER_PAGE; + tkp_info = (fbpoint_rec_data_t *) ((char *) start_ptr + page_no * FBPOINT_PAGE_SIZE + FBPOINT_PAGE_HEADER_SIZE); + + max_slot_in_page = Min(max_slot_no, slot_no + FBPOINT_REC_PER_PAGE - 1); + while (slot_no <= max_slot_in_page) + { + char ckpttime_str[128]; + time_t time_tmp; + const char *strftime_fmt = "%c"; + + time_tmp = (time_t) (tkp_info->time); + strftime(ckpttime_str, sizeof(ckpttime_str), strftime_fmt, + localtime(&time_tmp)); + + 
printf(_("------------------------------------------------------------------------------\n")); + printf(_("------------------------------------------------------------------------------\n")); + printf(_("The %2lu flashback point record in the page %2u: " + "flashback log pointer is %08X/%08X, " + "WAL lsn is %08X/%08X, checkpoint time is %s, " + "next clog sub directory number is %08X, " + "snapshot segment no. is %u, snapshot offset is %u\n"), + slot_no, page_no, + (uint32) (tkp_info->flog_ptr >> 32), (uint32) (tkp_info->flog_ptr), + (uint32) (tkp_info->redo_lsn >> 32), (uint32) (tkp_info->redo_lsn), + ckpttime_str, tkp_info->next_clog_subdir_no, + tkp_info->snapshot_pos.seg_no, tkp_info->snapshot_pos.offset); + + if (with_snapshot) + flashback_snapshot_dump(dir, tkp_info->snapshot_pos); + + slot_no++; + tkp_info = (fbpoint_rec_data_t *) ((char *) tkp_info + FBPOINT_REC_SIZE); + } +} + +static pg_crc32c +page_comp_crc(fbpoint_page_header_t *header) +{ + pg_crc32c crc; + + INIT_CRC32C(crc); + COMP_CRC32C(crc, (char *) header + FBPOINT_PAGE_HEADER_SIZE, FBPOINT_PAGE_SIZE - FBPOINT_PAGE_HEADER_SIZE); + COMP_CRC32C(crc, (char *) header, offsetof(fbpoint_page_header_t, crc)); + FIN_CRC32C(crc); + + return crc; +} + +int +flashback_point_file_dump_main(int argc, char **argv) +{ + int c; + char *ctl_file_path = NULL; + FILE *ctl_fp = NULL; + fra_ctl_file_data_t ctl_file; + pg_crc32c crc; + char *file_path = NULL; + uint64 max_slot_no = 0; + char *fname = NULL; + uint32 seg_no; + uint32 max_page_no; + uint32 page_no = 0; + char ckp_info_seg[FBPOINT_REC_END_POS]; + bool succeed = false; + bool use_ctl_file = false; + bool use_input = false; + bool with_snapshot = false; + FILE *fp = NULL; + char *dir; + + if (argc <= 1) + { + usage(); + return -1; + } + + while ((c = getopt_long(argc, argv, "c:f:m:w", long_options, NULL)) != -1) + { + switch (c) + { + case 'c': + ctl_file_path = strdup(optarg); + use_ctl_file = true; + break; + case 'f': + file_path = strdup(optarg); + 
break; + case 'm': + max_slot_no = (uint64) strtoull(optarg, NULL, 10); + use_input = true; + break; + case 'w': + with_snapshot = true; + break; + default: + usage(); + return -1; + } + } + + if (use_ctl_file) + { + /* Read the max slot no from control file */ + ctl_fp = fopen(ctl_file_path, "r"); + + if (ctl_fp == NULL) + { + fprintf(stderr, "Failed to open control file %s\n", ctl_file_path); + goto end; + } + + if (fread(&ctl_file, 1, sizeof(fra_ctl_file_data_t), ctl_fp) != + sizeof(fra_ctl_file_data_t)) + { + fprintf(stderr, "We cannot read the control file %s\n", ctl_file_path); + goto end; + } + + /* Verify CRC */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, (char *) &ctl_file, offsetof(fra_ctl_file_data_t, crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(crc, ctl_file.crc)) + { + fprintf(stderr, "The crc in control file is incorrect, got %u, expect %u", crc, + ctl_file.crc); + goto end; + } + + fclose(ctl_fp); + ctl_fp = NULL; + + if (ctl_file.next_fbpoint_rec_no == 0) + { + printf(_("There are no flashback point record file.\n")); + succeed = true; + goto end; + } + + max_slot_no = ctl_file.next_fbpoint_rec_no - 1; + } + else if (!use_input) + { + printf(_("Please input the -m max flashback point record or -c fast recovery control file path.\n")); + succeed = false; + goto end; + } + + /* Dump the checkpoint file */ + fname = basename(file_path); + dir = dirname(pg_strdup(file_path)); /* dirname() may modify its argument; work on a copy so file_path stays openable */ + FBPOINT_GET_SEG_FROM_FNAME(fname, seg_no); + + if (FBPOINT_GET_SEG_NO_BY_REC_NO(max_slot_no) == seg_no) + { + max_page_no = FBPOINT_GET_PAGE_NO_BY_REC_NO(max_slot_no); + max_slot_no = max_slot_no % FBPOINT_REC_PER_SEG; + } + else + { + max_page_no = FBPOINT_PAGE_PER_SEG - 1; + max_slot_no = seg_no * FBPOINT_REC_PER_SEG - 1; + } + + printf(_("The flashback point record file %s has %u pages\n"), file_path, max_page_no + 1); + + fp = fopen(file_path, "r"); + + if (fp == NULL) + { + fprintf(stderr, "Failed to open flashback point record file %s\n", file_path); + goto end; + } + + if 
(fread(ckp_info_seg, 1, FBPOINT_REC_END_POS, fp) != FBPOINT_REC_END_POS) + { + fprintf(stderr, "We cannot read the flashback point record file %s\n", file_path); + goto end; + } + + fclose(fp); + fp = NULL; + + while (page_no <= max_page_no) + { + char *page_start; + + /* Check the crc of the page */ + page_start = ckp_info_seg + page_no * FBPOINT_PAGE_SIZE; + crc = page_comp_crc((fbpoint_page_header_t *) page_start); + + if (!EQ_CRC32C(crc, ((fbpoint_page_header_t *) page_start)->crc)) + { + fprintf(stderr, "The crc of page %u in flashback point record file %s is incorrect, " + "got %u, expect %u", page_no, file_path, crc, ((fbpoint_page_header_t *) page_start)->crc); + goto end; + } + + ckp_file_page_dump(dir, ckp_info_seg, seg_no, page_no, max_slot_no, with_snapshot); + + printf(_("\n")); + page_no++; + } + + succeed = true; +end: + if (ctl_fp) + fclose(ctl_fp); + if (ctl_file_path) + free(ctl_file_path); + if (fp) + fclose(fp); + if (file_path) + free(file_path); + + return succeed ? 
0 : -1; +} +/*no cover end*/ diff --git a/src/bin/polar_tools/flashback_snapshot_dump.c b/src/bin/polar_tools/flashback_snapshot_dump.c new file mode 100644 index 00000000000..e51d8d28340 --- /dev/null +++ b/src/bin/polar_tools/flashback_snapshot_dump.c @@ -0,0 +1,248 @@ +/*------------------------------------------------------------------------- + * + * flashback_snapshot_dump.c + * + * + * Copyright (c) 2021, Alibaba Group Holding limited + * + * IDENTIFICATION + * src/bin/polar_tools/flashback_snapshot_dump.c + * + *------------------------------------------------------------------------- + */ +#include +#include + +#include "postgres.h" + +#include "polar_flashback/polar_fast_recovery_area.h" +#include "polar_flashback/polar_flashback_snapshot.h" +#include "polar_tools.h" + +/*no cover begin*/ +static struct option long_options[] = +{ + {"file-path", required_argument, NULL, 'f'}, + {"start-pos", required_argument, NULL, 's'}, + {"help", no_argument, NULL, '?'}, + {NULL, 0, NULL, 0} +}; + +static void +usage(void) +{ + printf("Dump flashback snapshot file usage:\n"); + printf("-f, --file-path Specify flashback point file path\n"); + printf("-s, --start-pos Specify snapshot data start position\n"); + printf("-?, --help show this help, then exit\n"); +} + +static void +flashback_snapshot_header_dump(flashback_snapshot_data_header_t *head) +{ + printf(_("The flashback snapshot data version no: %d\n"), + GET_FLSHBAK_SNAPSHOT_VERSION(head->info)); + printf(_("The flashback snapshot data size: %u\n"), + head->data_size); +} + +static void +flashback_snapshot_data_dump(flashback_snapshot_t snapshot) +{ + int i; + int count = 0; + TransactionId *xip; + + printf(_("The snapshot is exported while the current insert lsn of WAL is %X/%X \n"), + (uint32) (snapshot->lsn >> 32), (uint32) (snapshot->lsn)); + printf(_("The xmin is %u\n"), snapshot->xmin); + printf(_("The xmax is %u\n"), snapshot->xmax); + + xip = (TransactionId *) POLAR_GET_FLSHBAK_SNAPSHOT_XIP(snapshot); + + 
printf(_("The snapshot is exported in RW node\n")); + count = snapshot->xcnt; + + if (count > 0) + printf(_("The %u running transactions between xmin and xmax are"), snapshot->xcnt); + else + printf(_("There no running transactions between xmin and xmax")); + + for (i = 0; i < count; i++) + printf(_(" xip:%u"), xip[i]); + + printf(_("\n")); +} + +void +flashback_snapshot_dump(const char *dir, fbpoint_pos_t snapshot_pos) +{ + char file_path[MAXPGPATH]; + FILE *fp = NULL; + bool succeed = false; + size_t read_len; + pg_crc32c crc; + flashback_snapshot_data_header_t header; + uint32 data_size; + flashback_snapshot_t snapshot; + uint32 segno = snapshot_pos.seg_no; + uint32 offset = snapshot_pos.offset; + uint32 end_pos; + char *data; + + snprintf(file_path, MAXPGPATH, "%s/%08X", dir, segno); + + /* Read the header */ + fp = fopen(file_path, "r"); + + if (fp == NULL) + { + fprintf(stderr, "Failed to open flashback point file %s\n", file_path); + goto end; + } + + if (fseek(fp, offset, SEEK_SET) != 0) + { + fprintf(stderr, "Failed to seek %s offset %d\n", file_path, offset); + goto end; + } + + read_len = fread(&header, 1, FLSHBAK_SNAPSHOT_HEADER_SIZE, fp); + + if (read_len != FLSHBAK_SNAPSHOT_HEADER_SIZE) + { + fprintf(stderr, "could not read file header from flashback point file %s\n", file_path); + goto end; + } + + flashback_snapshot_header_dump(&header); + data_size = header.data_size; + data = palloc(data_size); + snapshot = (flashback_snapshot_t) data; + end_pos = GET_FLSHBAK_SNAPSHOT_END_POS(header.info); + offset += FLSHBAK_SNAPSHOT_HEADER_SIZE; + + do{ + read_len = end_pos - offset; + + if (read_len) + { + if (fseek(fp, offset, SEEK_SET) != 0) + { + fprintf(stderr, "Failed to seek %s offset %d\n", file_path, offset); + goto end; + } + + if (fread(data, 1, read_len, fp) != read_len) + { + fprintf(stderr, "Failed to read snapshot data from %s\n", file_path); + goto end; + } + + fclose(fp); + fp = NULL; + } + + data_size -= read_len; + + /* Can break the loop only 
in here */ + if (data_size == 0) + break; + + end_pos = FBPOINT_SEG_SIZE; + + if (data_size > (FBPOINT_SEG_SIZE - FBPOINT_REC_END_POS)) + offset = FBPOINT_REC_END_POS; + else + offset = FBPOINT_SEG_SIZE - data_size; + + segno++; + data += read_len; + snprintf(file_path, MAXPGPATH, "%s/%08X", dir, segno); + fp = fopen(file_path, "r"); + + if (fp == NULL) + { + fprintf(stderr, "Failed to open flashback point file %s\n", file_path); + goto end; + } + } while (data_size > 0); + + INIT_CRC32C(crc); + COMP_CRC32C((crc), (char *) (snapshot), (header.data_size)); + COMP_CRC32C((crc), (char *) (&header), offsetof(flashback_snapshot_data_header_t, crc)); + FIN_CRC32C(crc); + + if (!EQ_CRC32C(crc, header.crc)) + { + /*no cover line*/ + fprintf(stderr, "calculated snapshot data CRC checksum does not match " + "value stored in file \"%s\"", file_path); + + goto end; + } + + flashback_snapshot_data_dump(snapshot); + succeed = true; +end: + if (fp) + fclose(fp); + + if (!succeed) + exit(EXIT_FAILURE); +} + +int +flashback_snapshot_dump_main(int argc, char **argv) +{ + int c; + char *file_path = NULL; + bool succeed = false; + uint32 start_pos = 0; + char *fname; + char *dir; + fbpoint_pos_t snapshot_pos; + uint32 seg_no; + + if (argc <= 1) + { + usage(); + return -1; + } + + while ((c = getopt_long(argc, argv, "f:s:", long_options, NULL)) != -1) + { + switch (c) + { + case 'f': + file_path = strdup(optarg); + break; + case 's': + start_pos = (uint32) atoi(optarg); + break; + default: + usage(); + return -1; + } + } + + if (start_pos >= FBPOINT_SEG_SIZE || start_pos < FBPOINT_REC_END_POS) + { + fprintf(stderr, "Must set a valid start position (larger than %d, less than %d) with -s or --start-pos \n", + FBPOINT_REC_END_POS, FBPOINT_SEG_SIZE); + goto end; + } + + fname = basename(file_path); + FBPOINT_GET_SEG_FROM_FNAME(fname, seg_no); + dir = dirname(file_path); + SET_FBPOINT_POS(snapshot_pos, seg_no, start_pos); + flashback_snapshot_dump(dir, snapshot_pos); + succeed = true; +end: 
+ if (file_path) + free(file_path); + + return succeed ? 0 : -1; +} +/*no cover end*/ diff --git a/src/bin/polar_tools/fra_control_dump.c b/src/bin/polar_tools/fra_control_dump.c new file mode 100644 index 00000000000..5ae6d1175e6 --- /dev/null +++ b/src/bin/polar_tools/fra_control_dump.c @@ -0,0 +1,109 @@ +/*------------------------------------------------------------------------- + * + * fra_control_dump.c + * + * + * Copyright (c) 2020-2120, Alibaba-inc PolarDB Group + * + * IDENTIFICATION + * src/bin/polar_tools/fra_control_dump.c + * + *------------------------------------------------------------------------- + */ +#include + +#include "polar_tools.h" +#include "polar_flashback/polar_fast_recovery_area.h" + +/*no cover begin*/ +static struct option long_options[] = +{ + {"file-path", required_argument, NULL, 'f'}, + {"help", no_argument, NULL, '?'}, + {NULL, 0, NULL, 0} +}; + +static void +usage(void) +{ + printf("Dump fast recovery area control file usage:\n"); + printf("-f, --file_path fast recovery area control file path\n"); + printf("-?, --help show this help, then exit\n"); +} + +int +fra_control_dump_main(int argc, char **argv) +{ + int c; + char *file_path = NULL; + FILE *fp = NULL; + bool succeed = false; + fra_ctl_file_data_t ctl_file; + pg_crc32c crc; + + if (argc <= 1) + { + usage(); + return -1; + } + + while ((c = getopt_long(argc, argv, "f:", long_options, NULL)) != -1) + { + switch (c) + { + case 'f': + file_path = strdup(optarg); + break; + + default: + usage(); + return -1; + } + } + + fp = fopen(file_path, "r"); + if (fp == NULL) + { + fprintf(stderr, "Failed to open %s\n", file_path); + goto end; + } + + if (fread(&ctl_file, 1, sizeof(fra_ctl_file_data_t), fp) != + sizeof(fra_ctl_file_data_t)) + { + fprintf(stderr, "We cannot read the file %s\n", file_path); + goto end; + } + + printf(_("The control file version no: %hu\n"), + ctl_file.version_no); + printf(_("The next flashback point record number: %lu\n"), ctl_file.next_fbpoint_rec_no); + 
printf(_("The minimal WAL keep lsn is: %X/%X\n"), + (uint32)(ctl_file.min_keep_lsn >> 32), + (uint32) ctl_file.min_keep_lsn); + printf(_("The next flashback clog sub directory number: %X\n"), + ctl_file.next_clog_subdir_no); + printf(_("The minimal clog segment number when enable fast recovery area: %04X\n"), + ctl_file.min_clog_seg_no); + + /* Verify CRC */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, (char *) &ctl_file, offsetof(fra_ctl_file_data_t, crc)); + FIN_CRC32C(crc); + if (!EQ_CRC32C(crc, ctl_file.crc)) + { + fprintf(stderr, "The crc is incorrect, got %u, expect %u\n", crc, + ctl_file.crc); + goto end; + } + printf(_("The crc is correct: %X\n"), crc); + succeed = true; +end: + if (fp) + fclose(fp); + if (file_path) + free(file_path); + + return succeed ? 0 : -1; +} +/*no cover end*/ diff --git a/src/bin/polar_tools/polar_tools.c b/src/bin/polar_tools/polar_tools.c index 16895f04a09..8cadf061e17 100644 --- a/src/bin/polar_tools/polar_tools.c +++ b/src/bin/polar_tools/polar_tools.c @@ -38,6 +38,9 @@ usage(void) printf("datamax-get-wal\n"); printf("flashback-log-control\n"); printf("flashback-log-file\n"); + printf("flashback-point-file\n"); + printf("flashback-snapshot-file\n"); + printf("fra-control-file\n"); } int @@ -73,6 +76,12 @@ main(int argc, char **argv) return flashback_log_control_dump_main(--argc, ++argv); else if (strcmp(argv[1], "flashback-log-file") == 0) return flashback_log_file_dump_main(--argc, ++argv); + else if (strcmp(argv[1], "flashback-point-file") == 0) + return flashback_point_file_dump_main(--argc, ++argv); + else if (strcmp(argv[1], "flashback-snapshot-file") == 0) + return flashback_snapshot_dump_main(--argc, ++argv); + else if (strcmp(argv[1], "fra-control-file") == 0) + return fra_control_dump_main(--argc, ++argv); else usage(); diff --git a/src/bin/polar_tools/polar_tools.h b/src/bin/polar_tools/polar_tools.h index c0ad820c85b..b32326fca02 100644 --- a/src/bin/polar_tools/polar_tools.h +++ b/src/bin/polar_tools/polar_tools.h @@ -18,6 
+18,7 @@ #include "common/fe_memutils.h" #include "getopt_long.h" +#include "polar_flashback/polar_flashback_log_internal.h" /* Get a bit mask of the bits set in non-uint32 aligned addresses */ #define UINT32_ALIGN_MASK (sizeof(uint32) - 1) @@ -60,5 +61,9 @@ extern int datamax_meta_main(int argc, char **argv); extern int datamax_get_wal_main(int argc, char **argv); extern int flashback_log_control_dump_main(int argc, char **argv); extern int flashback_log_file_dump_main(int argc, char **argv); +extern int flashback_point_file_dump_main(int argc, char **argv); +extern int fra_control_dump_main(int argc, char **argv); +extern int flashback_snapshot_dump_main(int argc, char **argv); +extern void flashback_snapshot_dump(const char *dir, fbpoint_pos_t snapshot_pos); #endif diff --git a/src/fe_utils/Makefile b/src/fe_utils/Makefile index 5362cffd573..166e19866a6 100644 --- a/src/fe_utils/Makefile +++ b/src/fe_utils/Makefile @@ -19,7 +19,7 @@ include $(top_builddir)/src/Makefile.global override CPPFLAGS := -DFRONTEND -I$(libpq_srcdir) $(CPPFLAGS) -OBJS = mbprint.o print.o psqlscan.o simple_list.o string_utils.o conditional.o +OBJS = mbprint.o print.o psqlscan.o simple_list.o string_utils.o conditional.o timestamp.o all: libpgfeutils.a diff --git a/src/fe_utils/timestamp.c b/src/fe_utils/timestamp.c new file mode 100644 index 00000000000..2c1e5544a0a --- /dev/null +++ b/src/fe_utils/timestamp.c @@ -0,0 +1,62 @@ +/*------------------------------------------------------------------------- + * + * timestamp.c + * Copied from src/backend/utils/adt/timestamp.c, but for frontend code. 
+ * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2021, Alibaba Group Holding Limited + * + * + * IDENTIFICATION + * src/fe_utils/timestamp.c + * + *------------------------------------------------------------------------- + */ +#include + +#include "postgres.h" + +#include "fe_utils/timestamp.h" +#include "utils/datetime.h" + +/* copied from timestamp.c */ +pg_time_t +timestamptz_to_time_t(TimestampTz t) +{ + pg_time_t result; + + result = (pg_time_t) (t / USECS_PER_SEC + + ((POSTGRES_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECS_PER_DAY)); + return result; +} + +/* + * Stopgap implementation of timestamptz_to_str that doesn't depend on backend + * infrastructure. This will work for timestamps that are within the range + * of the platform time_t type. (pg_time_t is compatible except for possibly + * being wider.) + * + * XXX the return value points to a static buffer, so beware of using more + * than one result value concurrently. + * + * XXX: The backend timestamp infrastructure should instead be split out and + * moved into src/common. That's a large project though. 
+ */ +const char * +timestamptz_to_str(TimestampTz dt) +{ + static char buf[MAXDATELEN + 1]; + char ts[MAXDATELEN + 1]; + char zone[MAXDATELEN + 1]; + time_t result = (time_t) timestamptz_to_time_t(dt); + struct tm *ltime = localtime(&result); + + strftime(ts, sizeof(ts), "%Y-%m-%d %H:%M:%S", ltime); + strftime(zone, sizeof(zone), "%Z", ltime); + + snprintf(buf, sizeof(buf), "%s.%06d %s", + ts, (int) (dt % USECS_PER_SEC), zone); + + return buf; +} diff --git a/src/include/access/clog.h b/src/include/access/clog.h index 1ef133cb525..5f56217327a 100644 --- a/src/include/access/clog.h +++ b/src/include/access/clog.h @@ -72,6 +72,9 @@ extern const char *clog_identify(uint8 info); extern void polar_promote_clog(void); /* POLAR: remove clog local cache file */ extern void polar_remove_clog_local_cache_file(void); +extern int polar_get_clog_min_seg_no(void); +extern XidStatus polar_get_xid_status(TransactionId xid, const char *clog_dir); +extern bool polar_xid_in_clog_dir(TransactionId xid, const char *clog_dir); #endif #endif /* CLOG_H */ diff --git a/src/include/access/polar_logindex.h b/src/include/access/polar_logindex.h index 2463159e72f..a712d53cc2a 100644 --- a/src/include/access/polar_logindex.h +++ b/src/include/access/polar_logindex.h @@ -167,4 +167,6 @@ extern void polar_logindex_online_promote(logindex_snapshot_t logindex_snapshot) extern XLogRecPtr polar_logindex_check_valid_start_lsn(logindex_snapshot_t logindex_snapshot); extern int polar_trace_logindex(int trace_level); extern void polar_logindex_update_promoted_info(logindex_snapshot_t logindex_snapshot, XLogRecPtr last_replayed_lsn); +extern XLogRecPtr polar_get_logindex_max_parsed_lsn(logindex_snapshot_t logindex_snapshot); +extern void polar_set_logindex_max_parsed_lsn(logindex_snapshot_t logindex_snapshot, XLogRecPtr lsn); #endif diff --git a/src/include/access/polar_logindex_internal.h b/src/include/access/polar_logindex_internal.h index c383b680fbd..f83ebea3794 100644 --- 
a/src/include/access/polar_logindex_internal.h +++ b/src/include/access/polar_logindex_internal.h @@ -364,6 +364,7 @@ typedef struct log_index_snapshot_t slock_t lock; char dir[NAMEDATALEN]; XLogRecPtr max_lsn; + XLogRecPtr max_parsed_lsn; /* Max end+1 parsed lsn, now just is used by flashback logindex */ pg_atomic_uint32 state; uint32 active_table; log_idx_table_id_t max_idx_table_id; diff --git a/src/include/access/polar_ringbuf.h b/src/include/access/polar_ringbuf.h index 3d914bb550d..1f38e5b22b9 100644 --- a/src/include/access/polar_ringbuf.h +++ b/src/include/access/polar_ringbuf.h @@ -27,8 +27,6 @@ #include "port/atomics.h" #include "storage/lwlock.h" -typedef void (*polar_interrupt_callback)(void); - /* * Each ring buffer reference occupy one slot. * Define the upper limit for ring buffer reference @@ -37,6 +35,8 @@ typedef void (*polar_interrupt_callback)(void); #define POLAR_RINGBUF_MAX_REF_NAME 63 typedef struct polar_ringbuf_data_t *polar_ringbuf_t; +typedef void (*polar_interrupt_callback)(polar_ringbuf_t); + typedef struct polar_ringbuf_slot_t { /* Is this a strong reference? 
*/ diff --git a/src/include/access/slru.h b/src/include/access/slru.h index ddd042f074a..932f52b7020 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -228,4 +228,6 @@ extern void polar_slru_init(void); extern void polar_slru_reg_local_cache(SlruCtl ctl, polar_local_cache cache); extern void polar_slru_promote(SlruCtl ctl); extern void polar_slru_remove_local_cache_file(SlruCtl ctl); +extern bool polar_slru_find_min_seg(SlruCtl ctl, char *filename, int segpage, void *data); +extern void polar_physical_read_fra_slru(const char *slru_dir, int page_no, char *page); #endif /* SLRU_H */ diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h index f2434395eeb..a92adfca7c7 100755 --- a/src/include/catalog/pg_control.h +++ b/src/include/catalog/pg_control.h @@ -90,7 +90,6 @@ typedef struct CheckPoint #define XLOG_CSNLOG_TRUNCATE 0xF0 /* POLAR end */ - /* * System status indicator. Note this is stored in pg_control; if you change * it, you must bump PG_CONTROL_VERSION diff --git a/src/include/commands/cluster.h b/src/include/commands/cluster.h index b338cb10988..36ccf645264 100644 --- a/src/include/commands/cluster.h +++ b/src/include/commands/cluster.h @@ -17,6 +17,8 @@ #include "storage/lock.h" #include "utils/relcache.h" +#define make_new_heap(OIDOldHeap, NewTableSpace, relpersistence, lockmode) \ + polar_make_new_heap(OIDOldHeap, NewTableSpace, relpersistence, lockmode, NULL) extern void cluster(ClusterStmt *stmt, bool isTopLevel); extern void cluster_rel(Oid tableOid, Oid indexOid, bool recheck, @@ -25,8 +27,8 @@ extern void check_index_is_clusterable(Relation OldHeap, Oid indexOid, bool recheck, LOCKMODE lockmode); extern void mark_index_clustered(Relation rel, Oid indexOid, bool is_internal); -extern Oid make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence, - LOCKMODE lockmode); +extern Oid polar_make_new_heap(Oid OIDOldHeap, Oid NewTableSpace, char relpersistence, + LOCKMODE lockmode, const char * 
NewHeapNameGiven); extern void finish_heap_swap(Oid OIDOldHeap, Oid OIDNewHeap, bool is_system_catalog, bool swap_toast_by_content, diff --git a/src/include/fe_utils/timestamp.h b/src/include/fe_utils/timestamp.h new file mode 100644 index 00000000000..03a66e8fe82 --- /dev/null +++ b/src/include/fe_utils/timestamp.h @@ -0,0 +1,20 @@ +/*------------------------------------------------------------------------- + * + * Simple timestamp functions for frontend code + * + * Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group + * Portions Copyright (c) 1994, Regents of the University of California + * Portions Copyright (c) 2021, Alibaba Group Holding limited + * + * src/include/fe_utils/timestamp.h + * + *------------------------------------------------------------------------- + */ +#ifndef TIMESTAMP_FE_H +#define TIMESTAMP_FE_H +#include "datatype/timestamp.h" +#include "pgtime.h" + +extern const char *timestamptz_to_str(TimestampTz dt); +extern pg_time_t timestamptz_to_time_t(TimestampTz t); +#endif /* TIMESTAMP_FE_H */ diff --git a/src/include/nodes/nodes.h b/src/include/nodes/nodes.h index 069628ad994..a7f183329de 100644 --- a/src/include/nodes/nodes.h +++ b/src/include/nodes/nodes.h @@ -571,7 +571,11 @@ typedef enum NodeTag T_PartitionSelectorState, T_AssertOpState, /* POLAR px */ - T_SplitUpdateState + T_SplitUpdateState, + /* POLAR end */ + + /* POLAR: Flashback table stmt */ + T_PolarFlashbackTableStmt /* POLAR end */ } NodeTag; diff --git a/src/include/nodes/parsenodes.h b/src/include/nodes/parsenodes.h index 82115b5f117..8f4f842ba4f 100644 --- a/src/include/nodes/parsenodes.h +++ b/src/include/nodes/parsenodes.h @@ -27,6 +27,7 @@ #include "nodes/primnodes.h" #include "nodes/value.h" #include "partitioning/partdefs.h" +#include "datatype/timestamp.h" /* POLAR px */ #include "catalog/px_policy.h" @@ -3571,4 +3572,17 @@ typedef struct DropSubscriptionStmt DropBehavior behavior; /* RESTRICT or CASCADE behavior */ } DropSubscriptionStmt; +/* 
--------------------------------------------- + * POLAR: FLASHBACK TABLE xxx TO TIMESTAMP xxx + * --------------------------------------------- + */ +typedef struct PolarFlashbackTableStmt +{ + NodeTag type; + RangeVar *relation; /* table to work on */ + Node *target_timestamp; /* The flashback target timestamp expr */ + Node *time_expr; /* transformed target timestamp expr */ + char *newname; /* the new name */ +} PolarFlashbackTableStmt; + #endif /* PARSENODES_H */ diff --git a/src/include/parser/kwlist.h b/src/include/parser/kwlist.h index 4869b484c9d..f832e8d9fb7 100644 --- a/src/include/parser/kwlist.h +++ b/src/include/parser/kwlist.h @@ -166,7 +166,9 @@ PG_KEYWORD("family", FAMILY, UNRESERVED_KEYWORD) PG_KEYWORD("fetch", FETCH, RESERVED_KEYWORD) PG_KEYWORD("filter", FILTER, UNRESERVED_KEYWORD) PG_KEYWORD("first", FIRST_P, UNRESERVED_KEYWORD) +/* POLAR: FLASHBACK unreserved keyword */ PG_KEYWORD("flashback", FLASHBACK, UNRESERVED_KEYWORD) +/* POLAR: END */ PG_KEYWORD("float", FLOAT_P, COL_NAME_KEYWORD) PG_KEYWORD("follower", FOLLOWER, UNRESERVED_KEYWORD) PG_KEYWORD("following", FOLLOWING, UNRESERVED_KEYWORD) diff --git a/src/include/parser/parse_node.h b/src/include/parser/parse_node.h index 0230543810f..ac354c25936 100644 --- a/src/include/parser/parse_node.h +++ b/src/include/parser/parse_node.h @@ -69,7 +69,8 @@ typedef enum ParseExprKind EXPR_KIND_TRIGGER_WHEN, /* WHEN condition in CREATE TRIGGER */ EXPR_KIND_POLICY, /* USING or WITH CHECK expr in policy */ EXPR_KIND_PARTITION_EXPRESSION, /* PARTITION BY expression */ - EXPR_KIND_CALL_ARGUMENT /* procedure argument in CALL */ + EXPR_KIND_CALL_ARGUMENT, /* procedure argument in CALL */ + EXPR_KIND_FLASHBACK_TABLE /* POLAR: flashback table target time expression */ } ParseExprKind; diff --git a/src/include/pgstat.h b/src/include/pgstat.h index 4108369c86e..1e64be9f9cd 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -1052,6 +1052,12 @@ typedef enum WAIT_EVENT_FLASHBACK_LOG_HISTORY_FILE_SYNC, 
WAIT_EVENT_FLASHBACK_LOG_BUF_READY, WAIT_EVENT_FLASHBACK_LOG_INSERT, + WAIT_EVENT_FLASHBACK_POINT_FILE_WRITE, + WAIT_EVENT_FLASHBACK_POINT_FILE_READ, + WAIT_EVENT_FLASHBACK_POINT_FILE_SYNC, + WAIT_EVENT_FRA_CTL_FILE_READ, + WAIT_EVENT_FRA_CTL_FILE_WRITE, + WAIT_EVENT_FRA_CTL_FILE_SYNC, /* POLAR end */ /* POLAR wal pipeline */ WAIT_EVENT_WAL_PIPELINE_COMMIT_WAIT diff --git a/src/include/polar_flashback/polar_fast_recovery_area.h b/src/include/polar_flashback/polar_fast_recovery_area.h new file mode 100644 index 00000000000..09d1d335821 --- /dev/null +++ b/src/include/polar_flashback/polar_fast_recovery_area.h @@ -0,0 +1,74 @@ +/*------------------------------------------------------------------------- + * + * polar_fast_recovery_area.h + * + * + * Copyright (c) 2021, Alibaba Group Holding limited + * + * src/include/polar_flashback/polar_fast_recovery_area.h + * + *------------------------------------------------------------------------- + */ +#ifndef POLAR_FAST_RECOVERY_AREA_H +#define POLAR_FAST_RECOVERY_AREA_H + +#include "access/slru.h" +#include "polar_flashback/polar_flashback_clog.h" +#include "polar_flashback/polar_flashback_log_internal.h" +#include "polar_flashback/polar_flashback_log_mem.h" +#include "polar_flashback/polar_flashback_point.h" +#include "polar_flashback/polar_flashback_table.h" + +#define POLAR_FRA_DEFAULT_DIR "polar_fra" +#define FRA_CTL_NAME_SUFFIX "fra ctl" +#define FRA_CTL_FILE_LOCK_NAME_SUFFIX "_ctl_file" +#define FRA_CTL_FILE_NAME "fra_control" +#define TMP_FRA_CTL_FILE_NAME "tmp_fra_ctl" +#define FRA_CTL_FILE_VERSION (1) + +typedef struct fra_ctl_file_data_t +{ + uint32 version_no; + uint32 next_clog_subdir_no; /* Next clog sub directory number */ + uint64 next_fbpoint_rec_no; /* The max flashback point record number */ + fbpoint_pos_t snapshot_end_pos; /* The snapshot data end pos, high 32 bits are the segment no, low 32 bits are the offset in a segment */ + XLogRecPtr min_keep_lsn; /* Minimal keep lsn while the fast recovery area is enabled */ + int
min_clog_seg_no; /* Minimal clog segment while the fast recovery area is enabled */ + pg_crc32c crc; +} fra_ctl_file_data_t; + +typedef struct fra_ctl_data_t +{ + fpoint_ctl_t point_ctl; + flashback_clog_ctl_t clog_ctl; + char dir[FL_INS_MAX_NAME_LEN]; /* The directory of the fast recovery area */ + uint64 next_fbpoint_rec_no; /* A copy of point_ctl->next_fbpoint_rec_no */ + fbpoint_pos_t snapshot_end_pos; /* The snapshot data end position, high 32 bits are the segment no, low 32 bits are the offset in file */ + XLogRecPtr min_keep_lsn; /* Minimal wal keep lsn */ + LWLock ctl_file_lock; /* Protects the control file */ +} fra_ctl_data_t; + +typedef fra_ctl_data_t *fra_ctl_t; + +#define FRA_GET_SUBDIR_PATH(fra_dir, sub_dir, path) \ + polar_make_file_path_level3(path, fra_dir, sub_dir) + +#define FRA_REMOVE_ALL_DATA() rmtree(POLAR_FRA_DEFAULT_DIR, true) + +#define FRA_SHMEM_INIT() \ + do { \ + fra_instance = fra_shmem_init_internal(POLAR_FRA_DEFAULT_DIR); \ + }while (0) + +extern fra_ctl_t fra_instance; + +extern bool polar_enable_fra(fra_ctl_t ctl); +extern Size polar_fra_shmem_size(void); +extern void fra_shmem_init_data(fra_ctl_t ctl, const char *name); +extern fra_ctl_t fra_shmem_init_internal(const char *name); + +extern void polar_startup_fra(fra_ctl_t ctl); + +extern bool polar_slru_seg_need_mv(fra_ctl_t fra_ctl, SlruCtl slru_ctl); +extern void polar_mv_slru_seg_to_fra(fra_ctl_t ctl, const char *fname, const char *old_path); +#endif diff --git a/src/include/polar_flashback/polar_flashback.h b/src/include/polar_flashback/polar_flashback.h new file mode 100644 index 00000000000..ef9f7c26dbc --- /dev/null +++ b/src/include/polar_flashback/polar_flashback.h @@ -0,0 +1,26 @@ +/*------------------------------------------------------------------------- + * + * polar_flashback.h + * + * + * Copyright (c) 2021, Alibaba Group Holding limited + * + * src/include/polar_flashback/polar_flashback.h + * + *------------------------------------------------------------------------- + */
+#ifndef POLAR_FLASHBACK_H +#define POLAR_FLASHBACK_H + +#include "catalog/pg_control.h" +#include "polar_flashback/polar_flashback_snapshot.h" +#include "polar_flashback/polar_flashback_log_internal.h" + +extern Size polar_flashback_shmem_size(void); +extern void polar_flashback_shmem_init(void); +extern void polar_startup_flashback(CheckPoint *checkpoint); +extern void polar_do_flashback_point(polar_flog_rec_ptr ckp_start, flashback_snapshot_header_t snapshot, bool shutdown); +extern XLogRecPtr polar_get_flashback_keep_wal(XLogRecPtr keep); +extern void polar_write_ctl_file_atomic(const char *path, void *data, size_t size, uint32 write_event_info, + uint32 fsync_event_info); +#endif diff --git a/src/include/polar_flashback/polar_flashback_clog.h b/src/include/polar_flashback/polar_flashback_clog.h new file mode 100644 index 00000000000..dac59938c19 --- /dev/null +++ b/src/include/polar_flashback/polar_flashback_clog.h @@ -0,0 +1,46 @@ +/*------------------------------------------------------------------------- + * + * polar_flashback_clog.h + * + * + * Copyright (c) 2021, Alibaba Group Holding limited + * + * src/include/polar_flashback/polar_flashback_clog.h + * + *------------------------------------------------------------------------- + */ +#ifndef POLAR_FLASHBACK_CLOG_H +#define POLAR_FLASHBACK_CLOG_H + +#include "access/clog.h" +#include "port/atomics.h" + +#define FLASHBACK_CLOG_DIR "pg_xact" +#define FLASHBACK_CLOG_CTL_NAME_SUFFIX " clog ctl" + +#define FLASHBACK_CLOG_SUBDIR_NAME_LEN (8) + +typedef struct flashback_clog_ctl_data_t +{ + pg_atomic_uint32 next_clog_subdir_no; + int min_clog_seg_no; /* Minimal of the clog segment after polar_enable_flashback is on */ +} flashback_clog_ctl_data_t; + +typedef flashback_clog_ctl_data_t *flashback_clog_ctl_t; + +#define FLASHBACK_CLOG_SHMEM_SIZE (sizeof(flashback_clog_ctl_data_t)) +#define FLASHBACK_CLOG_INVALID_SEG (-1) +#define FLASHBACK_CLOG_MIN_SUBDIR (0) +#define FLASHBACK_CLOG_MIN_NEXT_SUBDIR 
(FLASHBACK_CLOG_MIN_SUBDIR + 1) + +#define FLASHBACK_CLOG_IS_EMPTY(next_clog_subdir) ((next_clog_subdir) == FLASHBACK_CLOG_MIN_SUBDIR) + +extern void polar_flashback_clog_shmem_init_data(flashback_clog_ctl_t ctl); +extern flashback_clog_ctl_t polar_flashback_clog_shmem_init(const char *name); +extern void polar_startup_flashback_clog(flashback_clog_ctl_t clog_ctl, int min_clog_seg_no, uint32 next_clog_subdir_no); + +extern void polar_mv_clog_seg_to_fra(flashback_clog_ctl_t clog_ctl, const char *fra_dir, const char *fname, const char *old_path, bool *need_update_ctl); + +extern void polar_truncate_flashback_clog_subdir(const char *fra_dir, uint32 next_clog_subdir_no); +extern XidStatus polar_flashback_get_xid_status(TransactionId xid, TransactionId max_xid, uint32 next_clog_subdir_no, const char *fra_dir); +#endif diff --git a/src/include/polar_flashback/polar_flashback_log.h b/src/include/polar_flashback/polar_flashback_log.h index ee1ece12f18..2a07a10c649 100644 --- a/src/include/polar_flashback/polar_flashback_log.h +++ b/src/include/polar_flashback/polar_flashback_log.h @@ -23,8 +23,8 @@ */ #ifndef POLAR_FLASHBACK_LOG_H #define POLAR_FLASHBACK_LOG_H -#include "postgres.h" +#include "access/polar_log.h" #include "access/polar_logindex.h" #include "access/polar_logindex_redo.h" #include "access/xlogdefs.h" @@ -33,10 +33,41 @@ #include "polar_flashback/polar_flashback_log_internal.h" #include "polar_flashback/polar_flashback_log_list.h" #include "polar_flashback/polar_flashback_log_mem.h" +#include "polar_flashback/polar_flashback_log_reader.h" #include "storage/buf_internals.h" -#define is_need_flog(forkno) (forkno == MAIN_FORKNUM) -#define polar_check_buf_flog_state(buf_hdr, state) polar_redo_check_state(buf_hdr, state) +#define POLAR_IS_NEED_FLOG(forkno) ((forkno) == MAIN_FORKNUM) +#define POLAR_CHECK_BUF_FLOG_STATE(buf_hdr, state) polar_redo_check_state(buf_hdr, state) + +#define POLAR_LOG_FLOG_LOST(buf_tag, elevel) \ + do { \ + elog(elevel, "The origin page of "
POLAR_LOG_BUFFER_TAG_FORMAT " is lost", \ + POLAR_LOG_BUFFER_TAG(buf_tag)); \ + }while (0) + +/* + * POLAR: Record the checkpoint redo LSN in the flashback log buffer control. + * Called when creating a checkpoint or restartpoint. + * NB: This is different from polar_flog_do_fbpoint: + * it runs at every checkpoint, not only at flashback points. + */ +#define POLAR_CHECK_POINT_FLOG(ins, lsn) \ + do { \ + if (ins) {(ins)->buf_ctl->redo_lsn = (lsn);} \ + }while (0) + +#define POLAR_FLOG_SHMEM_SIZE() \ + polar_flog_shmem_size_internal(polar_flashback_log_insert_locks, \ + polar_flashback_log_buffers, polar_flashback_logindex_mem_size, \ + polar_flashback_logindex_bloom_blocks, polar_flashback_logindex_queue_buffers) + +#define POLAR_FLOG_SHMEM_INIT() \ + do { \ + flog_instance = polar_flog_shmem_init_internal(POLAR_FL_DEFAULT_DIR, \ + polar_flashback_log_insert_locks, polar_flashback_log_buffers, \ + polar_flashback_logindex_mem_size, polar_flashback_logindex_bloom_blocks, \ + polar_flashback_logindex_queue_buffers); \ + }while (0) typedef struct flog_ctl_data_t { @@ -44,13 +75,48 @@ typedef struct flog_ctl_data_t flog_list_ctl_t list_ctl; logindex_snapshot_t logindex_snapshot; flog_index_queue_ctl_t queue_ctl; - flashback_state state; + Latch *bgwriter_latch; + pg_atomic_uint32 state; } flog_ctl_data_t; +#define FLOG_INIT 0 +#define FLOG_STARTUP 1 +#define FLOG_READY 2 + typedef flog_ctl_data_t *flog_ctl_t; extern flog_ctl_t flog_instance; +typedef struct flshbak_buf_context_t +{ + polar_flog_rec_ptr start_ptr; /* The flashback log start point to flashback buffer. */ + polar_flog_rec_ptr end_ptr; /* The flashback log end point to flashback buffer. */ + XLogRecPtr start_lsn; /* The WAL start lsn to replay. */ + XLogRecPtr end_lsn; /* The WAL end lsn to replay. */ + logindex_snapshot_t logindex_snapshot; /* The flashback logindex snapshot to find origin page */ + BufferTag *tag; /* The origin page buffer tag.
NB: This may be not as same as the target buffer */ + flog_reader_state *reader; /* The flashback log reader */ + Buffer buf; /* The target buffer */ + int elevel; /* The elog level when we can't find a valid origin page */ + bool apply_fpi; /* Apply the full page image wal record or not. */ +} flshbak_buf_context_t; + +#define INIT_FLSHBAK_BUF_CONTEXT(a, xx_start_ptr, xx_end_ptr, xx_start_lsn, \ + xx_end_lsn, xx_flog_index, xx_reader, xx_tag, xx_buf, xx_elevel, xx_apply_fpi) \ + do { \ + (a).start_ptr = (xx_start_ptr); \ + (a).end_ptr = (xx_end_ptr); \ + (a).start_lsn = (xx_start_lsn); \ + (a).end_lsn = (xx_end_lsn); \ + (a).logindex_snapshot = (xx_flog_index); \ + (a).reader = (xx_reader); \ + (a).tag = (xx_tag); \ + (a).buf = (xx_buf); \ + (a).elevel = (xx_elevel); \ + (a).apply_fpi = (xx_apply_fpi); \ + Assert((a).end_ptr >= (a).start_ptr); \ + }while (0) + /* GUCs */ extern int polar_flashback_logindex_mem_size; extern int polar_flashback_logindex_bloom_blocks; @@ -72,24 +138,50 @@ extern bool polar_has_flog_startup(flog_ctl_t instance); extern Size polar_flog_shmem_size_internal(int insert_locks_num, int log_buffers, int logindex_mem_size, int logindex_bloom_blocks, int queue_buffers_MB); -extern Size polar_flog_shmem_size(void); extern void polar_flog_ctl_init_data(flog_ctl_t ctl); extern flog_ctl_t polar_flog_shmem_init_internal(const char *name, int insert_locks_num, int log_buffers, int logindex_mem_size, int logindex_bloom_blocks, int queue_buffers_MB); -extern void polar_flog_shmem_init(void); -extern void polar_flog_do_fbpoint(flog_ctl_t instance, polar_flog_rec_ptr ckp_start, bool shutdown); +extern void polar_flog_do_fbpoint(flog_ctl_t instance, polar_flog_rec_ptr ckp_start, polar_flog_rec_ptr keep_ptr, bool shutdown); extern void polar_flog_insert(flog_ctl_t instance, Buffer buf, bool is_candidate, bool is_recovery); extern void polar_check_fpi_origin_page(RelFileNode rnode, ForkNumber forkno, BlockNumber block, uint8 xl_info); -extern void 
polar_flush_buf_flog_rec(BufferDesc *buf_hdr, flog_ctl_t instance, bool is_invalidate); +extern void polar_flush_buf_flog_rec(BufferDesc *buf_hdr, flog_ctl_t instance, bool invalidate); + +extern void polar_remove_all_flog_data(flog_ctl_t instance); extern void polar_startup_flog(CheckPoint *checkpoint, flog_ctl_t instance); extern void polar_recover_flog_buf(flog_ctl_t instance); extern void polar_recover_flog(flog_ctl_t instance); extern void polar_set_buf_flog_lost_checked(flog_ctl_t flog_ins, polar_logindex_redo_ctl_t redo_ins, Buffer buffer); -extern bool polar_may_buf_lost_flog(flog_ctl_t flog_instance, polar_logindex_redo_ctl_t logindex_redo_instance, BufferDesc *buf_desc); - +extern bool polar_may_buf_lost_flog(flog_ctl_t flog_ins, polar_logindex_redo_ctl_t redo_ins, BufferDesc *buf_desc); extern void polar_make_true_no_flog(flog_ctl_t instance, BufferDesc *buf); + +extern void polar_get_buffer_tag_in_flog_rec(flog_record *rec, BufferTag *tag); + +extern void polar_flog_rel_bulk_extend(flog_ctl_t instance, Buffer buffer); + +extern bool polar_can_flog_repair(flog_ctl_t instance, BufferDesc *buf_hdr, bool has_redo_action); +extern void polar_repair_partial_write(flog_ctl_t instance, BufferDesc *bufHdr); + +typedef struct flog_insert_context +{ + BufferTag *buf_tag; + void *data; + XLogRecPtr redo_lsn; + RmgrId rmgr; + uint8 info; +} flog_insert_context; + +extern flog_record *polar_assemble_filenode_rec(flog_insert_context *insert_context, uint32 xl_tot_len); +extern polar_flog_rec_ptr polar_flog_insert_into_buffer(flog_ctl_t instance, flog_insert_context *insert_context); +extern polar_flog_rec_ptr polar_insert_buf_flog_rec(flog_ctl_t instance, BufferTag *tag, + XLogRecPtr redo_lsn, XLogRecPtr fbpoint_lsn, uint8 info, Page origin_page, bool from_origin_buf); +extern bool polar_process_buf_flog_list(flog_ctl_t instance, BufferDesc *buf_hdr, + bool is_background, bool invalidate); +extern void polar_process_flog_list_bg(flog_ctl_t instance); + +extern bool 
polar_get_origin_page(flshbak_buf_context_t *context, Page page, XLogRecPtr *replay_start_lsn); +extern bool polar_flashback_buffer(flshbak_buf_context_t *context); #endif diff --git a/src/include/polar_flashback/polar_flashback_log_decoder.h b/src/include/polar_flashback/polar_flashback_log_decoder.h deleted file mode 100644 index 081edf35420..00000000000 --- a/src/include/polar_flashback/polar_flashback_log_decoder.h +++ /dev/null @@ -1,35 +0,0 @@ -/*------------------------------------------------------------------------- - * - * polar_flashback_log_decoder.h - * - * - * Copyright (c) 2020, Alibaba Group Holding Limited - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * IDENTIFICATION - * src/include/polar_flashback/polar_flashback_log_decoder.h - * - *------------------------------------------------------------------------- - */ -#ifndef POLAR_FLASHBACK_LOG_DECODER_H -#define POLAR_FLASHBACK_LOG_DECODER_H -#include "polar_flashback/polar_flashback_log.h" -#include "storage/buf_internals.h" -#include "storage/bufpage.h" - -extern bool polar_get_origin_page(flog_ctl_t instance, BufferTag *tag, Page page, - polar_flog_rec_ptr start_ptr, polar_flog_rec_ptr end_ptr, XLogRecPtr *replay_start_lsn); -extern bool polar_flashback_buffer(flog_ctl_t instance, Buffer *buf, BufferTag *tag, - polar_flog_rec_ptr start_ptr, polar_flog_rec_ptr end_ptr, XLogRecPtr start_lsn, - XLogRecPtr end_lsn, int elevel, bool apply_fpi); -#endif diff --git a/src/include/polar_flashback/polar_flashback_log_file.h b/src/include/polar_flashback/polar_flashback_log_file.h index fe57c437edc..8eeaeaea244 100644 --- a/src/include/polar_flashback/polar_flashback_log_file.h +++ b/src/include/polar_flashback/polar_flashback_log_file.h @@ -23,12 +23,13 @@ */ #ifndef POLAR_FLASHBACK_LOG_FILE_H #define POLAR_FLASHBACK_LOG_FILE_H + #include "access/xlogdefs.h" #include "polar_flashback/polar_flashback_log_mem.h" /* Something about flashback log ctl file */ #define FLOG_CTL_FILE_VERSION 0x0001 -#define FLOG_CTL_FILE "flashback_log_control" +#define FLOG_CTL_FILE "flog_control" /* Something about flashback log history file */ #define FLOG_HISTORY_FILE "flashback_log.history" @@ -45,33 +46,33 @@ /* * Is an flashback log ptr within a particular segment? * - * For ptr_in_flog_seg, do the computation at face value. - * For ptr_prev_in_flog_seg, a boundary byte is taken to be in the previous segment. + * For FLOG_PTR_IN_SEG, do the computation at face value. + * For FLOG_PTR_PREV_IN_SEG, a boundary byte is taken to be in the previous segment. 
*/ -#define ptr_in_flog_seg(ptr, seg_no, segsz_bytes) \ +#define FLOG_PTR_IN_SEG(ptr, seg_no, segsz_bytes) \ (((ptr) / (segsz_bytes)) == (seg_no)) -#define ptr_prev_in_flog_seg(ptr, seg_no, segsz_bytes) \ +#define FLOG_PTR_PREV_IN_SEG(ptr, seg_no, segsz_bytes) \ ((((ptr) - 1) / (segsz_bytes)) == (seg_no)) -#define segs_per_flog_id(segsz_bytes) \ +#define FLOG_SEGS_PER_ID(segsz_bytes) \ (UINT64CONST(0x100000000) / (segsz_bytes)) -#define get_flog_fname(fname, seg_no, segsz_bytes, tli) \ +#define FLOG_GET_FNAME(fname, seg_no, segsz_bytes, tli) \ snprintf(fname, FLOG_MAX_FNAME_LEN, "%08X%08X%08X", \ - tli, (uint32) ((seg_no) / segs_per_flog_id(segsz_bytes)), \ - (uint32) ((seg_no) % segs_per_flog_id(segsz_bytes))) + tli, (uint32) ((seg_no) / FLOG_SEGS_PER_ID(segsz_bytes)), \ + (uint32) ((seg_no) % FLOG_SEGS_PER_ID(segsz_bytes))) -#define get_flog_seg_from_fname(fname, seg_no, seg_size) \ +#define FLOG_GET_SEG_FROM_FNAME(fname, seg_no, seg_size) \ do { \ uint32 log; \ uint32 seg; \ TimeLineID tli; \ sscanf(fname, "%08X%08X%08X", &tli, &log, &seg); \ - *seg_no = (uint64) log * segs_per_flog_id(seg_size) + seg; \ + *seg_no = (uint64) log * FLOG_SEGS_PER_ID(seg_size) + seg; \ } while (0) -#define is_flashback_log_file(fname) \ +#define FLOG_IS_LOG_FILE(fname) \ (strlen(fname) == FLOG_FNAME_LEN && \ strspn(fname, "0123456789ABCDEF") == FLOG_FNAME_LEN) @@ -90,10 +91,10 @@ * be in the previous segment. This is suitable for deciding which segment * to write given a pointer to a record end, for example. 
*/ -#define flog_ptr_to_seg(ptr, segsz_bytes) \ +#define FLOG_PTR_TO_SEG(ptr, segsz_bytes) \ ((ptr) / (segsz_bytes)) -#define flog_ptr_prev_to_seg(ptr, segsz_bytes) \ +#define FLOG_PTR_PREV_TO_SEG(ptr, segsz_bytes) \ (((ptr) - 1) / (segsz_bytes)) typedef struct flog_ctl_file_data_t @@ -131,7 +132,7 @@ extern void polar_validate_flog_dir(flog_buf_ctl_t ctl); extern void polar_flog_clean_dir_internal(const char *dir_path); extern void polar_flog_remove_all(flog_buf_ctl_t ctl); -extern bool polar_is_flog_file_exist(const char *dir, polar_flog_rec_ptr ptr, int elevel); +extern bool polar_flog_file_exists(const char *dir, polar_flog_rec_ptr ptr, int elevel); extern int polar_flog_file_open(uint64 segno, const char *dir); extern int polar_flog_file_init(flog_buf_ctl_t ctl, uint64 logsegno, bool *use_existent); diff --git a/src/include/polar_flashback/polar_flashback_log_index.h b/src/include/polar_flashback/polar_flashback_log_index.h index a279acefae5..248255635a3 100644 --- a/src/include/polar_flashback/polar_flashback_log_index.h +++ b/src/include/polar_flashback/polar_flashback_log_index.h @@ -23,6 +23,7 @@ */ #ifndef POLAR_FLASHBACK_LOG_INDEX_H #define POLAR_FLASHBACK_LOG_INDEX_H + #include "access/polar_logindex.h" #include "access/polar_logindex_internal.h" #include "polar_flashback/polar_flashback_log_index_queue.h" diff --git a/src/include/polar_flashback/polar_flashback_log_index_queue.h b/src/include/polar_flashback/polar_flashback_log_index_queue.h index ad54121505b..a95e0c7756a 100644 --- a/src/include/polar_flashback/polar_flashback_log_index_queue.h +++ b/src/include/polar_flashback/polar_flashback_log_index_queue.h @@ -23,6 +23,7 @@ */ #ifndef POLAR_FLASHBACK_LOG_INDEX_QUEUE_H #define POLAR_FLASHBACK_LOG_INDEX_QUEUE_H + #include "access/polar_ringbuf.h" #include "polar_flashback/polar_flashback_log_internal.h" #include "polar_flashback/polar_flashback_log_record.h" diff --git a/src/include/polar_flashback/polar_flashback_log_insert.h 
b/src/include/polar_flashback/polar_flashback_log_insert.h deleted file mode 100644 index df529ac4036..00000000000 --- a/src/include/polar_flashback/polar_flashback_log_insert.h +++ /dev/null @@ -1,44 +0,0 @@ -/*------------------------------------------------------------------------- - * - * polar_flashback_log_insert.h - * - * - * Copyright (c) 2020, Alibaba Group Holding Limited - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * IDENTIFICATION - * src/include/polar_flashback/polar_flashback_log_insert.h - * - *------------------------------------------------------------------------- - */ -#ifndef POLAR_FLASHBACK_LOG_INSERT_H -#define POLAR_FLASHBACK_LOG_INSERT_H -#include "polar_flashback/polar_flashback_log_index_queue.h" -#include "polar_flashback/polar_flashback_log_mem.h" -#include "polar_flashback/polar_flashback_log_record.h" - -typedef struct flog_insert_context -{ - BufferTag *buf_tag; - Page origin_page; - XLogRecPtr redo_lsn; - RmgrId rmgr; - uint8 info; -} flog_insert_context; - -extern polar_flog_rec_ptr polar_flog_insert_into_buffer(flog_buf_ctl_t buf_ctl, - flog_index_queue_ctl_t queue_ctl, flog_insert_context insert_context); -extern polar_flog_rec_ptr polar_insert_buf_flog_rec(flog_buf_ctl_t buf_ctl, - flog_index_queue_ctl_t queue_ctl, BufferTag *tag, XLogRecPtr redo_lsn, XLogRecPtr fbpoint_lsn, uint8 info, Page origin_page, bool from_origin_buf); - -#endif diff --git a/src/include/polar_flashback/polar_flashback_log_internal.h 
b/src/include/polar_flashback/polar_flashback_log_internal.h index a6c14637807..a64126bbd7f 100644 --- a/src/include/polar_flashback/polar_flashback_log_internal.h +++ b/src/include/polar_flashback/polar_flashback_log_internal.h @@ -37,6 +37,20 @@ typedef uint64 polar_flog_rec_ptr; +typedef struct fbpoint_pos_t +{ + uint32 seg_no; + uint32 offset; +} fbpoint_pos_t; + +#define FBPOINT_POS_EQUAL(p1, p2) ((p1).seg_no == (p2).seg_no && (p1).offset == (p2).offset) + +#define SET_FBPOINT_POS(p, seg, off) \ +( \ + (p).seg_no = (seg), \ + (p).offset = (off) \ +) + /* GUCs */ extern bool polar_enable_flashback_log; extern bool polar_has_partial_write; @@ -51,10 +65,13 @@ extern int polar_flashback_log_insert_list_delay; extern int polar_flashback_point_segments; extern int polar_flashback_point_timeout; +extern bool polar_enable_fast_recovery_area; +extern int polar_fast_recovery_area_rotation; + #define FLOG_SEGMENT_OFFSET(ptr, segsz_bytes) \ ((ptr) & ((segsz_bytes) - 1)) -#define flog_seg_offset_to_ptr(segno, offset, segsz_bytes, dest) \ +#define FLOG_SEG_OFFSET_TO_PTR(segno, offset, segsz_bytes, dest) \ (dest) = (segno) * (segsz_bytes) + (offset) /* @@ -89,12 +106,4 @@ typedef enum FLOG_BUF_SHUTDOWNED } flog_buf_state; -typedef enum -{ - /* FLOG_INIT must be first */ - FLOG_INIT, - FLOG_STARTUP, - FLOG_READY -} flashback_state; - #endif diff --git a/src/include/polar_flashback/polar_flashback_log_list.h b/src/include/polar_flashback/polar_flashback_log_list.h index e81d6e778cd..4be4d756553 100644 --- a/src/include/polar_flashback/polar_flashback_log_list.h +++ b/src/include/polar_flashback/polar_flashback_log_list.h @@ -24,19 +24,21 @@ #ifndef POLAR_FLASHBACK_LOG_LIST_H #define POLAR_FLASHBACK_LOG_LIST_H -#include "postgres.h" - #include "access/xlogdefs.h" #include "polar_flashback/polar_flashback_log_internal.h" #include "polar_flashback/polar_flashback_log_mem.h" #include "storage/buf.h" #define POLAR_ORIGIN_PAGE_BUF_NUM_PER_ARRAY (32) +/* The 
POLAR_ORIGIN_PAGE_BUF_NUM_PER_ARRAY is 32, so we can use 31 to do mod operation */ +#define POLAR_ORIGIN_PAGE_BUF_BIT_MASK (31) #define POLAR_ORIGIN_PAGE_BUF_ARRAY_NUM (4) #define POLAR_ORIGIN_PAGE_BUF_NUM (POLAR_ORIGIN_PAGE_BUF_NUM_PER_ARRAY * POLAR_ORIGIN_PAGE_BUF_ARRAY_NUM) -#define get_origin_buf_array_id(index) (index / POLAR_ORIGIN_PAGE_BUF_NUM_PER_ARRAY) -#define get_origin_buf_bit(ctl, index) (pg_atomic_read_u32(&ctl->origin_buf_bitmap[get_origin_buf_array_id(index)]) >> index) -#define is_buf_in_flog_list(buf_hdr) polar_check_buf_flog_state(buf_hdr, POLAR_BUF_IN_FLOG_LIST) + +#define GET_ORIGIN_BUF_INDEX(index) ((index) & POLAR_ORIGIN_PAGE_BUF_BIT_MASK) +#define GET_ORIGIN_BUF_ARRAY_ID(index) ((index) / POLAR_ORIGIN_PAGE_BUF_NUM_PER_ARRAY) +#define GET_ORIGIN_BUF_BIT(ctl, index) (pg_atomic_read_u32(&(ctl)->origin_buf_bitmap[GET_ORIGIN_BUF_ARRAY_ID(index)]) >> (GET_ORIGIN_BUF_INDEX(index))) +#define IS_BUF_IN_FLOG_LIST(buf_hdr) POLAR_CHECK_BUF_FLOG_STATE(buf_hdr, POLAR_BUF_IN_FLOG_LIST) typedef struct flog_list_slot { @@ -94,9 +96,7 @@ extern flog_list_ctl_t polar_flog_async_list_init(const char *name); extern void polar_flog_get_async_list_info(flog_list_ctl_t ctl, int *head, int *tail); extern void polar_push_buf_to_flog_list(flog_list_ctl_t ctl, flog_buf_ctl_t buf_ctl, Buffer buf, bool is_candidate); -extern void polar_insert_flog_rec_from_list_bg(flog_list_ctl_t list_ctl, flog_buf_ctl_t buf_ctl, flog_index_queue_ctl_t queue_ctl); -extern void polar_insert_buf_flog_rec_sync(flog_list_ctl_t list_ctl, flog_buf_ctl_t buf_ctl, flog_index_queue_ctl_t queue_ctl, BufferDesc *buf_hdr, bool is_validate); extern void polar_flush_buf_flog(flog_list_ctl_t list_ctl, flog_buf_ctl_t buf_ctl, BufferDesc *buf, bool is_invalidate); extern void polar_get_flog_list_stat(flog_list_ctl_t ctl, uint64 *insert_total_num, uint64 *remove_total_num, uint64 *bg_remove_num); diff --git a/src/include/polar_flashback/polar_flashback_log_mem.h 
b/src/include/polar_flashback/polar_flashback_log_mem.h index d231ea22f6f..ac3fc8dba53 100644 --- a/src/include/polar_flashback/polar_flashback_log_mem.h +++ b/src/include/polar_flashback/polar_flashback_log_mem.h @@ -30,7 +30,7 @@ #include "polar_flashback/polar_flashback_log_index_queue.h" #include "polar_flashback/polar_flashback_log_record.h" -#define polar_is_flog_buf_ready(ctl) (polar_get_flog_buf_state(ctl) == FLOG_BUF_READY) +#define POLAR_IS_FLOG_BUF_READY(ctl) ((ctl)->buf_state == FLOG_BUF_READY) typedef struct { @@ -106,7 +106,7 @@ typedef struct flog_buf_ctl_data_t fbpoint_wal_info_data_t wal_info; /* The flashback point wal info */ slock_t info_lck; /* locks shared variables shown above */ - XLogRecPtr keep_wal_lsn; /* keep lsn of WAL */ + XLogRecPtr redo_lsn; /* The redo lsn of flashback log record, a copy of XLogCtl->RedoRecPtr after CheckPointBuffers */ polar_flog_rec_ptr initalized_upto; /* initalized upto location, protect by buf_mapping_lock */ fbpoint_info_data_t fbpoint_info; /* The information about flashback log point */ @@ -145,9 +145,6 @@ extern void polar_flog_buf_init_data(flog_buf_ctl_t ctl, const char *name, extern flog_buf_ctl_t polar_flog_buf_init(const char *name, int insert_locks_num, int log_buffers); extern void polar_startup_flog_buf(flog_buf_ctl_t ctl, CheckPoint *checkpoint); - -extern flog_buf_state polar_get_flog_buf_state(flog_buf_ctl_t ctl); -extern void polar_set_flog_buf_state(flog_buf_ctl_t ctl, flog_buf_state buf_state); extern void polar_log_flog_buf_state(flog_buf_state state); extern polar_flog_rec_ptr polar_flog_rec_insert(flog_buf_ctl_t buf_ctl, flog_index_queue_ctl_t queue_ctl, flog_record *rec, polar_flog_rec_ptr *start_ptr); @@ -159,15 +156,10 @@ extern polar_flog_rec_ptr polar_get_flog_write_request(flog_buf_ctl_t ctl); extern polar_flog_rec_ptr polar_get_curr_flog_ptr(flog_buf_ctl_t ctl, polar_flog_rec_ptr *prev_ptr); extern polar_flog_rec_ptr polar_get_flog_buf_initalized_upto(flog_buf_ctl_t ctl); -extern 
void polar_flog_get_keep_wal_lsn(flog_buf_ctl_t ctl, XLogRecPtr *keep); - extern polar_flog_rec_ptr polar_flog_flush_bg(flog_buf_ctl_t ctl); extern void polar_flog_flush(flog_buf_ctl_t ctl, polar_flog_rec_ptr end_ptr); extern void polar_get_flog_write_stat(flog_buf_ctl_t ctl, uint64 *write_total_num, uint64 *bg_write_num, uint64 *segs_added_total_num); extern char *polar_get_flog_dir(flog_buf_ctl_t ctl); -extern polar_flog_rec_ptr polar_get_flog_min_recover_lsn(flog_buf_ctl_t ctl); -extern void polar_set_flog_min_recover_lsn(flog_buf_ctl_t ctl, polar_flog_rec_ptr ptr); - #endif diff --git a/src/include/polar_flashback/polar_flashback_log_reader.h b/src/include/polar_flashback/polar_flashback_log_reader.h index 041536c2cb7..ee7e5dd41e9 100644 --- a/src/include/polar_flashback/polar_flashback_log_reader.h +++ b/src/include/polar_flashback/polar_flashback_log_reader.h @@ -23,11 +23,22 @@ */ #ifndef POLAR_FLASHBACK_LOG_READER_H #define POLAR_FLASHBACK_LOG_READER_H + #include "polar_flashback/polar_flashback_log_mem.h" #include "polar_flashback/polar_flashback_log_record.h" #define REC_UNFLUSHED_ERROR_MSG "The flashback log record is not flushed, please read in next time" +/* Allocate a flashback log reader which read function is polar_flog_page_read */ +#define FLOG_ALLOC_PAGE_READER(reader, buf_ctl, elevel) \ + do{\ + (reader) = polar_flog_reader_allocate(POLAR_FLOG_SEG_SIZE, \ + &polar_flog_page_read, NULL, (buf_ctl)); \ + if ((reader) == NULL) \ + ereport((elevel), (errcode(ERRCODE_OUT_OF_MEMORY), \ + errmsg("Can not allocate the flashback log reader memory"))); \ + } while(0) + typedef struct flog_reader_state flog_reader_state; /* Function type definition for the read_page callback */ @@ -126,4 +137,7 @@ extern int polar_flog_page_read(flog_reader_state *state, extern flog_reader_state *polar_flog_reader_allocate(int segment_size, page_read_callback pagereadfunc, void *private_data, flog_buf_ctl_t flog_buf_ctl); extern void polar_flog_reader_free(flog_reader_state 
*state); extern bool polar_is_flog_rec_ignore(polar_flog_rec_ptr *ptr, uint32 log_len, flog_reader_state *reader); +extern flog_record * polar_decode_flog_rec_common(flog_reader_state *reader, polar_flog_rec_ptr ptr, RmgrId rm_id); +extern bool polar_decode_origin_page_rec(flog_reader_state *reader, polar_flog_rec_ptr ptr, Page page, + XLogRecPtr *redo_lsn, BufferTag *tag); #endif diff --git a/src/include/polar_flashback/polar_flashback_log_record.h b/src/include/polar_flashback/polar_flashback_log_record.h index 334d8fee956..07370c292b7 100644 --- a/src/include/polar_flashback/polar_flashback_log_record.h +++ b/src/include/polar_flashback/polar_flashback_log_record.h @@ -23,6 +23,7 @@ */ #ifndef POLAR_FLASHBACK_LOG_RECORD_H #define POLAR_FLASHBACK_LOG_RECORD_H + #include "access/xlogdefs.h" #include "access/xlogrecord.h" #include "polar_flashback/polar_flashback_log_internal.h" @@ -88,7 +89,7 @@ typedef flog_long_page_header_data *polar_long_page_header; /* Make the invalid ptr to first record ptr in flashback log */ #define VALID_FLOG_PTR(ptr) \ - ((ptr == POLAR_INVALID_FLOG_REC_PTR) ? FLOG_LONG_PHD_SIZE : ptr) + (((ptr) == POLAR_INVALID_FLOG_REC_PTR) ? 
FLOG_LONG_PHD_SIZE : (ptr)) /* Check if an flog ptr value is in a plausible range */ #define FLOG_REC_PTR_IS_VAILD(ptr) \ @@ -110,7 +111,12 @@ typedef struct XLogRecord flog_record; #define FLOG_REC_HEADER_SIZE SizeOfXLogRecord /* RMGR ID */ -#define ORIGIN_PAGE_ID 0x00 +#define ORIGIN_PAGE_ID 0x00 +#define REL_FILENODE_ID 0x01 + +#define FLOG_REC_TYPES (2) + +#define FLOG_RECORD_TYPES {"original_page", "relation_file_node", NULL} /* Every thing for origin page record (xl_rmid = ORIGIN_PAGE_ID in the flashback log) */ /* xl_info */ @@ -146,8 +152,10 @@ typedef struct XLogRecordBlockCompressHeader fl_rec_img_comp_header; #define FL_REC_IMG_COMP_HEADER_SIZE SizeOfXLogRecordBlockCompressHeader +#define FL_GET_REC_DATA(rec) ((char *)(rec) + FLOG_REC_HEADER_SIZE) + #define FL_GET_ORIGIN_PAGE_REC_DATA(rec) \ - ((fl_origin_page_rec_data *)((char *)rec + FLOG_REC_HEADER_SIZE)) + ((fl_origin_page_rec_data *)(FL_GET_REC_DATA(rec))) #define FL_GET_ORIGIN_PAGE_IMG_HEADER(rec) \ - ((fl_rec_img_header *)((char *)rec + FLOG_REC_HEADER_SIZE + FL_ORIGIN_PAGE_REC_INFO_SIZE)) + ((fl_rec_img_header *)((char *)(rec) + FLOG_REC_HEADER_SIZE + FL_ORIGIN_PAGE_REC_INFO_SIZE)) #endif diff --git a/src/include/polar_flashback/polar_flashback_log_repair_page.h b/src/include/polar_flashback/polar_flashback_log_repair_page.h deleted file mode 100644 index 101dbe77090..00000000000 --- a/src/include/polar_flashback/polar_flashback_log_repair_page.h +++ /dev/null @@ -1,32 +0,0 @@ -/*------------------------------------------------------------------------- - * - * polar_flashback_log_repair_page.h - * - * - * Copyright (c) 2020, Alibaba Group Holding Limited - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * IDENTIFICATION - * src/include/polar_flashback/polar_flashback_log_repair_page.h - * - *------------------------------------------------------------------------- - */ -#ifndef POLAR_FLASHBACK_LOG_REPAIR_PAGE_H -#define POLAR_FLASHBACK_LOG_REPAIR_PAGE_H -#include "access/polar_logindex.h" -#include "polar_flashback/polar_flashback_log.h" -#include "storage/buf_internals.h" - -extern bool polar_can_flog_repair(flog_ctl_t instance, BufferDesc *buf_hdr, bool has_redo_action); -extern void polar_repair_partial_write(flog_ctl_t instance, BufferDesc *bufHdr); -#endif diff --git a/src/include/polar_flashback/polar_flashback_log_worker.h b/src/include/polar_flashback/polar_flashback_log_worker.h index c5484a9fc17..8fbe9523a30 100644 --- a/src/include/polar_flashback/polar_flashback_log_worker.h +++ b/src/include/polar_flashback/polar_flashback_log_worker.h @@ -23,8 +23,8 @@ */ #ifndef POLAR_FLASHBACK_LOG_WORKER_H #define POLAR_FLASHBACK_LOG_WORKER_H -#include "postgres.h" +#include "polar_flashback/polar_flashback_log.h" #define FLOG_BG_WORKER_NAME "polar flashback log bg worker" #define FLOG_BG_WORKER_TYPE "polar flashback log" @@ -39,6 +39,11 @@ FlogBgWriterPID = StartFlogBgWriter(); \ } while (0) +typedef bool (*flog_bg_worker_done)(flog_ctl_t ins, void *data); + +extern bool polar_is_flog_index_inserted(flog_ctl_t instance, void *data); +extern void polar_wait_flog_bgworker(flog_ctl_t instance, flog_bg_worker_done is_done, void *extra_data); + extern void polar_flog_bgwriter_main(void); extern void polar_flog_bginserter_main(void); diff --git 
a/src/include/polar_flashback/polar_flashback_point.h b/src/include/polar_flashback/polar_flashback_point.h index f7b51c1db68..35eb28f2eae 100644 --- a/src/include/polar_flashback/polar_flashback_point.h +++ b/src/include/polar_flashback/polar_flashback_point.h @@ -3,7 +3,7 @@ * polar_flashback_point.h * * - * Copyright (c) 2020-2120, Alibaba-inc PolarDB Group + * Copyright (c) 2021, Alibaba Group Holding limited * * src/include/polar_flashback/polar_flashback_point.h * @@ -11,10 +11,115 @@ */ #ifndef POLAR_FLASHBACK_POINT_H #define POLAR_FLASHBACK_POINT_H -#include "postgres.h" #include "polar_flashback/polar_flashback_log.h" +#define FBPOINT_DIR "fbpoint" + +#define FBPOINT_PAGE_SIZE (512) +/* Segment size is 4MB */ +#define FBPOINT_SEG_SIZE (4 * 1024 * 1024) + +#define FBPOINT_REC_VERSION (1) + +#define FBPOINT_PAGE_HEADER_SIZE (offsetof(fbpoint_page_header_t, crc) + sizeof(pg_crc32c)) +#define FBPOINT_PAGE_SIZE_VALID (FBPOINT_PAGE_SIZE - FBPOINT_PAGE_HEADER_SIZE) + +#define FBPOINT_PAGE_PER_SEG (FBPOINT_REC_END_POS / FBPOINT_PAGE_SIZE) + +#define FBPOINT_REC_SIZE (64) +#define FBPOINT_REC_PER_PAGE (FBPOINT_PAGE_SIZE_VALID / FBPOINT_REC_SIZE) +#define FBPOINT_REC_PER_SEG (FBPOINT_PAGE_PER_SEG * FBPOINT_REC_PER_PAGE) + +#define FBPOINT_SEG_FNAME_MAX_LEN (16) + +#define FBPOINT_CTL_NAME_SUFFIX "point ctl" +#define FBPOINT_REC_BUF_LOCK_NAME_SUFFIX "_fbp_file" + +#define FBPOINT_GET_PAGE_NO_BY_REC_NO(rec_no) (((rec_no) % FBPOINT_REC_PER_SEG) / FBPOINT_REC_PER_PAGE) +#define FBPOINT_GET_SEG_NO_BY_REC_NO(rec_no) ((rec_no) / FBPOINT_REC_PER_SEG) +#define FBPOINT_GET_OFFSET_BY_REC_NO(start_ptr, rec_no) \ + ((char *) (start_ptr) + FBPOINT_GET_PAGE_NO_BY_REC_NO(rec_no) * FBPOINT_PAGE_SIZE + FBPOINT_PAGE_HEADER_SIZE + ((rec_no) % FBPOINT_REC_PER_PAGE) * FBPOINT_REC_SIZE) +#define FBPOINT_GET_FIRST_REC_IN_SEG(seg_start_ptr) \ + ((fbpoint_rec_data_t *) ((seg_start_ptr) + FBPOINT_PAGE_HEADER_SIZE)) + +#define FBPOINT_REC_IN_SEG(rec_no, seg_no) \ + 
((FBPOINT_GET_SEG_NO_BY_REC_NO(rec_no)) == (seg_no)) + +#define FBPOINT_GET_SEG_FROM_FNAME(fname, seg_no) \ + do { \ + sscanf((fname), "%08X", &(seg_no)); \ + } while (0) + +#define FBPOINT_POS_IS_INVALID(pos) ((pos).seg_no == 0 && (pos).offset == 0) /* a position with seg_no and offset both zero is treated as invalid */ + +/* + * A flashback point record page is laid out as: + * + * page_header + * fbpoint_rec_data_t[FBPOINT_REC_PER_PAGE] + * padding + */ +typedef struct fbpoint_page_header_t +{ + uint32 version; + pg_crc32c crc; /* A crc */ +} fbpoint_page_header_t; + +/* + * The size of fbpoint_rec_data_t (currently 36 bytes without alignment padding) must be less than FBPOINT_REC_SIZE (64). + * The remaining space (currently 28 bytes) is left for extra info. + * If a record ever needs more than FBPOINT_REC_SIZE, please introduce another file for it. + */ +typedef struct fbpoint_rec_data_t +{ + XLogRecPtr redo_lsn; /* The redo lsn of the flashback point */ + polar_flog_rec_ptr flog_ptr; /* The flashback point start pointer of the flashback point */ + pg_time_t time; /* The time of the flashback point */ + fbpoint_pos_t snapshot_pos; /* The position of the snapshot data */ + uint32 next_clog_subdir_no; /* The next clog subdir no, copy from flashback snapshot data, just for truncating old clog subdir */ +} fbpoint_rec_data_t; + +#define FBPOINT_RECORD_DATA_SIZE 36 + +/* Flashback point record end position in segment is 64KB */ +#define FBPOINT_REC_END_POS (64 * 1024) + +typedef struct fbpoint_ctl_data_t +{ + LWLock fbpoint_rec_buf_lock; /* protect the flashback point record buffer */ + uint64 next_fbpoint_rec_no; /* The max flashback point record number */ + char *fbpoint_rec_buf; /* buffers for flashback point record file */ +} fbpoint_ctl_data_t; + +typedef fbpoint_ctl_data_t *fpoint_ctl_t; + +typedef enum +{ + FBPOINT_FILE_CREATE_FAILED, + FBPOINT_FILE_OPEN_FAILED, + FBPOINT_FILE_READ_FAILED, + FBPOINT_FILE_WRITE_FAILED, + FBPOINT_FILE_FSYNC_FAILED, + FBPOINT_FILE_CLOSE_FAILED +} fbpoint_io_errcause_t; + +typedef struct fbpoint_io_error_t +{ + uint64 segno; + off_t offset; + ssize_t 
size; + ssize_t io_return; + int save_errno; + fbpoint_io_errcause_t errcause; +} fbpoint_io_error_t; + +extern void polar_get_fbpoint_file_path(uint32 seg_no, const char *fra_dir, char *path); + +extern bool polar_read_fbpoint_file(const char *fra_dir, char *data, uint32 seg_no, uint32 offset, uint32 size, fbpoint_io_error_t *io_error); +extern bool polar_write_fbpoint_file(const char *fra_dir, char *data, uint32 seg_no, uint32 offset, uint32 size, fbpoint_io_error_t *io_error); +extern void polar_fbpoint_report_io_error(const char *fra_dir, fbpoint_io_error_t *io_error, int log_level); + extern void polar_set_fbpoint_wal_info(flog_buf_ctl_t buf_ctl, XLogRecPtr fbpoint_lsn, pg_time_t fbpoint_time, XLogRecPtr bg_replayed_lsn, bool is_restart_point); extern XLogRecPtr polar_get_curr_fbpoint_lsn(flog_buf_ctl_t buf_ctl); extern XLogRecPtr polar_get_prior_fbpoint_lsn(flog_buf_ctl_t buf_ctl); @@ -23,4 +128,16 @@ extern polar_flog_rec_ptr polar_get_fbpoint_start_ptr(flog_buf_ctl_t ctl); extern bool polar_is_page_first_modified(flog_buf_ctl_t buf_ctl, XLogRecPtr page_lsn, XLogRecPtr redo_lsn); extern bool polar_is_flashback_point(flog_ctl_t instance, XLogRecPtr checkpoint_lsn, XLogRecPtr bg_replayed_lsn, int *flags, bool is_restart_point); + +extern Size polar_flashback_point_shmem_size(void); +extern void polar_flashback_point_shmem_init_data(fpoint_ctl_t ctl, const char *name); +extern fpoint_ctl_t polar_flashback_point_shmem_init(const char *name); +extern void polar_startup_flashback_point(fpoint_ctl_t ctl, const char *fra_dir, uint64 next_fbpoint_rec_no); + +extern bool polar_get_right_fbpoint(fpoint_ctl_t ctl, const char *fra_dir, pg_time_t target_time, fbpoint_rec_data_t *result, uint32 *keep_seg_no); + +extern void polar_flush_fbpoint_rec(fpoint_ctl_t ctl, const char *fra_dir, fbpoint_rec_data_t *rec_data); +extern void polar_truncate_fbpoint_files(const char *fra_dir, uint32 keep_seg_no); +extern uint32 polar_get_keep_fbpoint(fpoint_ctl_t ctl, const char 
*fra_dir, fbpoint_rec_data_t *fbpoint_rec, + polar_flog_rec_ptr *keep_ptr, XLogRecPtr *keep_lsn); #endif diff --git a/src/include/polar_flashback/polar_flashback_rel_filenode.h b/src/include/polar_flashback/polar_flashback_rel_filenode.h new file mode 100644 index 00000000000..bcc3d56e789 --- /dev/null +++ b/src/include/polar_flashback/polar_flashback_rel_filenode.h @@ -0,0 +1,42 @@ +/*------------------------------------------------------------------------- + * + * polar_flashback_rel_filenode.h + * + * + * Copyright (c) 2021, Alibaba Group Holding limited + * + * src/include/polar_flashback/polar_flashback_rel_filenode.h + * + *------------------------------------------------------------------------- + */ +#ifndef POLAR_FLASHBACK_REL_FILENODE_H +#define POLAR_FLASHBACK_REL_FILENODE_H + +#include "datatype/timestamp.h" +#include "polar_flashback/polar_fast_recovery_area.h" +#include "polar_flashback/polar_flashback_log.h" +#include "storage/buf_internals.h" + +/* It is a fake fork, just be used by search the origin filenode. 
*/ +#define FILENODE_FORK (MAX_FORKNUM + 1) + +/* Everything for relation file node record (xl_rmid = REL_FILENODE_ID in the flashback log) */ +#define REL_CAN_FLASHBACK 0x01 +#define REL_FILENODE_TYPE_MASK 0x01 + +#define FL_FILENODE_REC_SIZE (offsetof(fl_filenode_rec_data_t, time) + sizeof(TimestampTz)) +#define FL_GET_FILENODE_REC_DATA(rec) \ + ((fl_filenode_rec_data_t *)((char *)(rec) + FLOG_REC_HEADER_SIZE)) /* payload starts right after the flog record header; written out so the parentheses balance on their own */ + +typedef struct fl_filenode_rec_data_t +{ + RelFileNode old_filenode; + RelFileNode new_filenode; + TimestampTz time; +} fl_filenode_rec_data_t; + +#define POLAR_GET_FILENODE_REC_LEN(rec_len) ((rec_len) + FL_FILENODE_REC_SIZE) + +extern void polar_flog_filenode_update(flog_ctl_t flog_ins, fra_ctl_t fra_ins, Oid relid, Oid new_rnode, Oid new_tablespace, bool change_persistence, bool can_flashback); +extern bool polar_find_origin_filenode(flog_ctl_t ins, RelFileNode *filenode, TimestampTz target_time, polar_flog_rec_ptr start_ptr, polar_flog_rec_ptr end_ptr, flog_reader_state *reader); +#endif diff --git a/src/include/polar_flashback/polar_flashback_snapshot.h b/src/include/polar_flashback/polar_flashback_snapshot.h new file mode 100644 index 00000000000..2aa7616038d --- /dev/null +++ b/src/include/polar_flashback/polar_flashback_snapshot.h @@ -0,0 +1,103 @@ +/*------------------------------------------------------------------------- + * + * polar_flashback_snapshot.h + * + * + * Copyright (c) 2021, Alibaba Group Holding limited + * + * src/include/polar_flashback/polar_flashback_snapshot.h + * + *------------------------------------------------------------------------- + */ +#ifndef POLAR_FLASHBACK_SNAPSHOT_H +#define POLAR_FLASHBACK_SNAPSHOT_H + +#include "access/htup.h" +#include "polar_flashback/polar_fast_recovery_area.h" +#include "utils/pg_crc.h" +#include "utils/tqual.h" + +#define FLSHBAK_SNAPSHOT_DATA_VERSION (1) + +#define POLAR_GET_FLSHBAK_SNAPSHOT_XIP(snapshot) ((char *) (snapshot) + FLSHBAK_SNAPSHOT_T_SIZE) + +/* We will enlarge the xip size 256 one time */ 
+#define ENLARGE_XIP_SIZE_ONCE (256) +#define GET_ENLARGE_XIP_SIZE(total_xcnt) ((total_xcnt) + ENLARGE_XIP_SIZE_ONCE) + +typedef struct flashback_snapshot_data_t +{ + XLogRecPtr lsn; /* The xlog insert lsn */ + TransactionId xmin; + TransactionId xmax; + uint32 xcnt; + uint32 next_clog_subdir_no; /* The next clog subdir no. in fast recovery area */ + TransactionId next_xid; /* The next xid, must be the last */ +} flashback_snapshot_data_t; + +typedef flashback_snapshot_data_t *flashback_snapshot_t; + +#define FLSHBAK_SNAPSHOT_T_SIZE (offsetof(flashback_snapshot_data_t, next_xid) + sizeof(TransactionId)) + +typedef struct flashback_snapshot_data_header_t +{ + uint32 info; /* high 24 bits are the end offset, low 8 bits are the version */ + uint32 data_size; /* Maybe the data size is over UINT_MAX, we think the database will be broken in this case */ + pg_crc32 crc; /* Must be the last */ +} flashback_snapshot_data_header_t; + +#define FLSHBAK_SNAPSHOT_HEADER_SIZE (offsetof(flashback_snapshot_data_header_t, crc) + sizeof(pg_crc32)) + +typedef flashback_snapshot_data_header_t *flashback_snapshot_header_t; + +#define FLSHBAK_SNAPSHOT_VERSION_SHIFT 24 /* NOTE(review): not referenced by the macros in this header -- confirm it is still needed */ +#define FLSHBAK_SNAPSHOT_VERSION_MASK 0xFFFFFF00 /* bits kept while the version is rewritten, i.e. the end offset bits */ +#define FLSHBAK_SNAPSHOT_END_POS_SHIFT 8 +#define FLSHBAK_SNAPSHOT_END_POS_MASK 0x0FF /* bits kept while the end offset is rewritten, i.e. the version bits */ + +#define SET_FLSHBAK_SNAPSHOT_END_POS(info, end_pos) ((info) = (((uint64) (end_pos) << FLSHBAK_SNAPSHOT_END_POS_SHIFT) | ((info) & FLSHBAK_SNAPSHOT_END_POS_MASK))) +#define SET_FLSHBAK_SNAPSHOT_VERSION(info) ((info) = ((info) & FLSHBAK_SNAPSHOT_VERSION_MASK) | (FLSHBAK_SNAPSHOT_DATA_VERSION)) + +#define GET_FLSHBAK_SNAPSHOT_END_POS(info) ((info) >> FLSHBAK_SNAPSHOT_END_POS_SHIFT) +#define GET_FLSHBAK_SNAPSHOT_VERSION(info) ((uint8) (info)) + +typedef struct flashback_rel_snapshot_t +{ + Snapshot snapshot; + uint32 *removed_xid_pos; /* The removed xid position in the xip */ + uint32 xip_size; /* The xid count that has been allocated, it is larger than xcnt */ + uint32 removed_size; /* The removed 
xid count */ + uint32 next_clog_subdir_no; /* The next clog subdir no. in fast recovery area */ + TransactionId next_xid; /* The next xid */ +} flashback_rel_snapshot_t; + +typedef flashback_rel_snapshot_t *flshbak_rel_snpsht_t; + +#define FLSHBAK_SNAPSHOT_DATA_BASE_SIZE (FLSHBAK_SNAPSHOT_HEADER_SIZE + FLSHBAK_SNAPSHOT_T_SIZE) +#define FLSHBAK_GET_SNAPSHOT_DATA_SIZE(xcnt) (polar_get_snapshot_size(FLSHBAK_SNAPSHOT_T_SIZE, xcnt)) +#define FLSHBAK_GET_SNAPSHOT_TOTAL_SIZE(xcnt) (polar_get_snapshot_size(FLSHBAK_SNAPSHOT_DATA_BASE_SIZE, xcnt)) + +#define FLSHBAK_GET_SNAPSHOT_DATA(header) ((flashback_snapshot_t) ((char *) (header) + FLSHBAK_SNAPSHOT_HEADER_SIZE)) /* snapshot data follows the header; argument parenthesized so the cast binds to the whole expression */ + +#define MAX_KEEP_REMOVED_XIDS (128) + +#define REMOVED_XID_POS_SIZE (MAX_KEEP_REMOVED_XIDS * sizeof(uint32)) + +extern void log_flashback_snapshot(Snapshot snapshot, int elevel); +extern Size polar_get_snapshot_size(Size size, uint32 xcnt); + +extern flashback_snapshot_header_t polar_get_flashback_snapshot_data(fra_ctl_t ins, XLogRecPtr lsn); +extern fbpoint_pos_t polar_backup_snapshot_to_fra(flashback_snapshot_header_t header, fbpoint_pos_t *snapshot_end_pos, const char *fra_dir); +extern Snapshot polar_get_flashback_snapshot(const char *fra_dir, fbpoint_pos_t start_pos, uint32 *next_clog_subdir_no, TransactionId *next_xid); + +extern void polar_update_flashback_snapshot(flshbak_rel_snpsht_t rsnapshot, TransactionId xid); + +extern HTSV_Result polar_tuple_satisfies_flashback(HeapTuple htup, Buffer buffer, Snapshot snapshot, uint32 next_clog_subdir_no, TransactionId max_xid, const char *fra_dir); +extern bool polar_flashback_xact_redo(XLogRecord *record, flshbak_rel_snpsht_t rsnapshot, TimestampTz end_time, XLogReaderState *xlogreader); + +extern void polar_free_snapshot_data(Snapshot snapshot); +extern void polar_compact_xip(flshbak_rel_snpsht_t rsnapshot); + +extern void polar_fra_do_fbpoint(fra_ctl_t ctl, fbpoint_wal_info_data_t *wal_info, + polar_flog_rec_ptr *keep_ptr, flashback_snapshot_header_t snapshot); 
+#endif diff --git a/src/include/polar_flashback/polar_flashback_table.h b/src/include/polar_flashback/polar_flashback_table.h new file mode 100644 index 00000000000..389ffc03881 --- /dev/null +++ b/src/include/polar_flashback/polar_flashback_table.h @@ -0,0 +1,58 @@ +/*------------------------------------------------------------------------- + * + * polar_flashback_table.h + * + * + * Copyright (c) 2021, Alibaba Group Holding limited + * + * src/include/polar_flashback/polar_flashback_table.h + * + *------------------------------------------------------------------------- + */ +#ifndef POLAR_FLASHBACK_TABLE_H +#define POLAR_FLASHBACK_TABLE_H + +#include "catalog/pg_class.h" +#include "nodes/parsenodes.h" +#include "pgtime.h" +#include "polar_flashback/polar_flashback_log_internal.h" +#include "storage/relfilenode.h" +#include "storage/shm_toc.h" + +extern int polar_workers_per_flashback_table; + +typedef struct flashback_table_shared_state_t +{ + Oid old_relid; /* old relation id */ + Oid new_relid; /* new relation id */ + + polar_flog_rec_ptr flog_start_ptr; /* flashback logindex iterator start with */ + + XLogRecPtr wal_start_lsn; /* wal lsn start with */ + XLogRecPtr wal_end_lsn; /* wal lsn end with */ + + TransactionId curr_xid; /* The current transaction id */ + TransactionId next_xid; /* The next transaction id */ + + RelFileNode rel_filenode; /* The relation file node in the target time */ + BlockNumber nblocks; /* The old blocks number */ + pg_atomic_uint32 next_blkno; /* The next block number to process */ + uint32 next_clog_subdir_no; /* Next sub clog directory */ +} flashback_table_shared_state_t; + +typedef flashback_table_shared_state_t *flshbak_tbl_shr_st_t; + +typedef struct flashback_table_state_t +{ + flshbak_tbl_shr_st_t shared_state; + Snapshot snapshot; /* The snapshot at the target time */ +} flashback_table_state_t; + +typedef flashback_table_state_t *flshbak_tbl_st_t; + +extern bool polar_can_rel_flashback(Form_pg_class reltup, Oid relid, 
bool no_persistence_check); +extern void polar_log_cannot_flashback_cause(Form_pg_class reltup, Oid relid, bool no_persistence_check); +extern void polar_exec_flashback_table_stmt(PolarFlashbackTableStmt *stmt); +extern void polar_flashback_pages_woker_main(dsm_segment *seg, shm_toc *toc); + +#endif diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 8c163d5800d..c75c60333c4 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -280,6 +280,8 @@ typedef enum BuiltinTrancheIds LWTRANCHE_POLAR_FLASHBACK_LOG_INIT, LWTRANCHE_POLAR_FLASHBACK_LOG_CTL_FILE, LWTRANCHE_POLAR_FLASHBACK_LOG_QUEUE, + LWTRANCHE_POLAR_FLASHBACK_POINT_REC_BUF, + LWTRANCHE_POLAR_FRA_CTL_FILE, /* POLAR: Define tranche id for flashback logindex. They must be defiend between LWTRANCHE_FLOG_LOGINDEX_BEGIN and LWTRANCE_FLOG_LOGINDEX_END */ LWTRANCHE_FLOG_LOGINDEX_BEGIN, LWTRANCHE_FLOG_LOGINDEX_MEM_TBL = LWTRANCHE_FLOG_LOGINDEX_BEGIN, diff --git a/src/include/storage/polar_fd.h b/src/include/storage/polar_fd.h index 50ec608da9a..e7e5818ed4c 100644 --- a/src/include/storage/polar_fd.h +++ b/src/include/storage/polar_fd.h @@ -176,4 +176,6 @@ extern void polar_init_node_type(void); extern PolarNodeType polar_node_type_by_file(void); extern void assign_polar_datadir(const char *newval, void *extra); +extern bool polar_file_exists(const char *path); + #endif diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h index b275cbd6922..60ae24ec41f 100755 --- a/src/include/storage/procarray.h +++ b/src/include/storage/procarray.h @@ -94,7 +94,7 @@ extern bool ProcArrayInstallImportedXmin(TransactionId xmin, VirtualTransactionId *sourcevxid); extern bool ProcArrayInstallRestoredXmin(TransactionId xmin, PGPROC *proc); -extern RunningTransactions GetRunningTransactionData(void); +extern RunningTransactions PolarGetRunningTransactionData(bool ignore_subxid); extern bool TransactionIdIsInProgress(TransactionId xid); extern bool 
TransactionIdIsActive(TransactionId xid); @@ -173,4 +173,7 @@ extern void polar_set_latestObservedXid(TransactionId latest_observed_xid); extern TransactionId polar_get_latestObservedXid(void); /* POLAR end */ +#define GetRunningTransactionData() (PolarGetRunningTransactionData(false)) +extern RunningTransactions polar_get_running_top_trans(void); + #endif /* PROCARRAY_H */ diff --git a/src/include/storage/relfilenode.h b/src/include/storage/relfilenode.h index abffd84a1cf..7fcbcb22bb4 100644 --- a/src/include/storage/relfilenode.h +++ b/src/include/storage/relfilenode.h @@ -96,4 +96,12 @@ typedef struct RelFileNodeBackend (node1).backend == (node2).backend && \ (node1).node.spcNode == (node2).node.spcNode) +/* Copy relation filenode from a to b */ +#define COPY_REL_FILENODE(a, b) \ +( \ + (b).spcNode = (a).spcNode, \ + (b).dbNode = (a).dbNode, \ + (b).relNode = (a).relNode \ +) + #endif /* RELFILENODE_H */ diff --git a/src/include/utils/px_unsync_guc_name.h b/src/include/utils/px_unsync_guc_name.h index e77e32bde7d..5100d769c75 100644 --- a/src/include/utils/px_unsync_guc_name.h +++ b/src/include/utils/px_unsync_guc_name.h @@ -704,4 +704,6 @@ "polar_flashback_point_timeout", "polar_has_partial_write", "polar_enable_flashback_drop", - + "polar_enable_fast_recovery_area", + "polar_fast_recovery_area_rotation", + "polar_workers_per_flashback_table", diff --git a/src/test/Makefile b/src/test/Makefile index 5b8dab11e82..d27b734b8fd 100644 --- a/src/test/Makefile +++ b/src/test/Makefile @@ -40,12 +40,6 @@ endif # make confusion.) ALWAYS_SUBDIRS = $(filter-out $(SUBDIRS),examples kerberos ldap locale thread ssl) -ifeq ($(enable_inject_faults),yes) -SUBDIRS += polar_flog_repair_partial -else -ALWAYS_SUBDIRS += polar_flog_repair_partial -endif - # We want to recurse to all subdirs for all standard targets, except that # installcheck and install should not recurse into the subdirectory "modules". 
diff --git a/src/test/modules/Makefile b/src/test/modules/Makefile index bb57ca15e0b..d31bc27a05b 100644 --- a/src/test/modules/Makefile +++ b/src/test/modules/Makefile @@ -35,7 +35,8 @@ SUBDIRS = \ test_polar_bulk_read \ test_px \ test_wal_pipeline \ - test_flashback_log + test_flashback_log \ + test_flashback_table ifeq ($(enable_inject_faults),yes) SUBDIRS += test_xact_split diff --git a/src/test/modules/test_flashback_log/test_flashback_log.c b/src/test/modules/test_flashback_log/test_flashback_log.c index 0ec4330093d..ac3be5fa902 100644 --- a/src/test/modules/test_flashback_log/test_flashback_log.c +++ b/src/test/modules/test_flashback_log/test_flashback_log.c @@ -27,17 +27,14 @@ #include "catalog/pg_am_d.h" #include "miscadmin.h" #include "polar_flashback/polar_flashback_log.h" -#include "polar_flashback/polar_flashback_log_decoder.h" #include "polar_flashback/polar_flashback_log_file.h" #include "polar_flashback/polar_flashback_log_index.h" #include "polar_flashback/polar_flashback_log_index_queue.h" -#include "polar_flashback/polar_flashback_log_insert.h" #include "polar_flashback/polar_flashback_log_internal.h" #include "polar_flashback/polar_flashback_log_list.h" #include "polar_flashback/polar_flashback_log_mem.h" #include "polar_flashback/polar_flashback_log_reader.h" #include "polar_flashback/polar_flashback_log_record.h" -#include "polar_flashback/polar_flashback_log_repair_page.h" #include "polar_flashback/polar_flashback_log_worker.h" #include "polar_flashback/polar_flashback_point.h" #include "postmaster/bgworker.h" @@ -301,8 +298,8 @@ check_flog_history_file(polar_flog_rec_ptr switch_ptr, polar_flog_rec_ptr next_p static polar_flog_rec_ptr test_flog_insert_to_buffer(BufferTag test_tag, Page page, XLogRecPtr redo_lsn) { - return polar_insert_buf_flog_rec(buf_ctl_test, flog_index_queue_test, - &test_tag, redo_lsn, polar_get_curr_fbpoint_lsn(buf_ctl_test), 0, page, false); + return polar_insert_buf_flog_rec(test_instance, &test_tag, redo_lsn, + 
polar_get_curr_fbpoint_lsn(buf_ctl_test), 0, page, false); } static void @@ -473,16 +470,16 @@ check_flog_truncate(polar_flog_rec_ptr ptr) uint64 seg_no; polar_make_file_path_level2(polar_path, polar_get_flog_dir(buf_ctl_test)); - seg_no = flog_ptr_to_seg(ptr, POLAR_FLOG_SEG_SIZE); + seg_no = FLOG_PTR_TO_SEG(ptr, POLAR_FLOG_SEG_SIZE); if (seg_no == 0) return; seg_no--; - get_flog_fname(lastoff, seg_no, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(lastoff, seg_no, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); xldir = polar_allocate_dir(polar_path); while ((xlde = ReadDir(xldir, polar_path)) != NULL) { /* Ignore files that are not flashback log segments */ - if (!is_flashback_log_file(xlde->d_name)) + if (!FLOG_IS_LOG_FILE(xlde->d_name)) continue; Assert(strcmp(xlde->d_name, lastoff) > 0); } @@ -498,12 +495,12 @@ check_flog_prealloc_files(uint64 seg_no) char polar_path[MAXPGPATH]; polar_make_file_path_level2(polar_path, polar_get_flog_dir(buf_ctl_test)); - get_flog_fname(lastoff, seg_no, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); + FLOG_GET_FNAME(lastoff, seg_no, POLAR_FLOG_SEG_SIZE, FLOG_DEFAULT_TIMELINE); xldir = polar_allocate_dir(polar_path); while ((xlde = ReadDir(xldir, polar_path)) != NULL) { /* Ignore files that are not flashback log segments */ - if (!is_flashback_log_file(xlde->d_name)) + if (!FLOG_IS_LOG_FILE(xlde->d_name)) continue; Assert(strcmp(xlde->d_name, lastoff) <= 0); } @@ -578,7 +575,7 @@ init_flog_index(char *name, int logindex_mem_size, int logindex_bloom_blocks) static bool check_add_origin_page(BufferDesc *buf_hdr, int8 buf_index) { - Assert(((get_origin_buf_bit(flog_list_test, buf_index)) & 1) == 1); + Assert(((GET_ORIGIN_BUF_BIT(flog_list_test, buf_index)) & 1) == 1); Assert(BUFFERTAGS_EQUAL(flog_list_test->buf_tag[buf_index], buf_hdr->tag)); Assert(memcmp(flog_list_test->origin_buf + buf_index * BLCKSZ, (char *) BufHdrGetBlock(buf_hdr), BLCKSZ) == 0); @@ -592,7 +589,7 @@ check_clean_origin_page(int buf_id, int8 buf_index) 
char origin_buf_clean[BLCKSZ]; MemSet(origin_buf_clean, 0, BLCKSZ); - Assert(((get_origin_buf_bit(flog_list_test, buf_index)) & 1) == 0); + Assert(((GET_ORIGIN_BUF_BIT(flog_list_test, buf_index)) & 1) == 0); Assert(memcmp(flog_list_test->origin_buf + buf_index * BLCKSZ, origin_buf_clean, BLCKSZ) == 0); Assert(flog_list_test->flashback_list[buf_id].origin_buf_index == -1); return true; @@ -736,9 +733,22 @@ test_flog_index_search(polar_flog_rec_ptr start_ptr) PGAlignedBlock page_empty; polar_flog_rec_ptr end_ptr; int i; + flshbak_buf_context_t context; + flog_reader_state *reader; PageInit((Page)page_empty.data, BLCKSZ, 0); end_ptr = polar_get_flog_write_result(buf_ctl_test); + + reader = polar_flog_reader_allocate(POLAR_FLOG_SEG_SIZE, + &polar_flog_page_read, NULL, buf_ctl_test); + + Assert(reader); + + context.logindex_snapshot = flog_index_test; + context.start_ptr = start_ptr; + context.end_ptr = end_ptr; + context.reader = reader; + for (i = 0; i < insert_empty_page_rec_num; i++) { BufferTag test_tag; @@ -752,8 +762,8 @@ test_flog_index_search(polar_flog_rec_ptr start_ptr) test_tag.rnode.spcNode = i; test_tag.forkNum = MAIN_FORKNUM; test_tag.blockNum = i; - - polar_get_origin_page(test_instance, &test_tag, (Page) page.data, start_ptr, end_ptr, &redo_lsn); + context.tag = &test_tag; + polar_get_origin_page(&context, (Page) page.data, &redo_lsn); Assert(redo_lsn == i); Assert(memcmp(page_empty.data, page.data, BLCKSZ) == 0); } @@ -791,7 +801,7 @@ test_flog_init(bool first_init) Assert(test_instance->list_ctl); Assert(test_instance->logindex_snapshot); Assert(test_instance->queue_ctl); - Assert(test_instance->state == FLOG_INIT); + Assert(pg_atomic_read_u32(&test_instance->state) == FLOG_INIT); /* Check flashback log control */ buf_ctl_test = test_instance->buf_ctl; @@ -833,14 +843,14 @@ test_flog_startup(bool is_crash, flog_ctl_file_data_t ctl_file_data) ctl_file_data.fbpoint_info.flog_end_ptr_prev = POLAR_INVALID_FLOG_REC_PTR; polar_startup_flog(&checkpoint, 
test_instance); Assert(buf_ctl_test->buf_state == FLOG_BUF_READY); - Assert(test_instance->state == FLOG_STARTUP || test_instance->state == FLOG_READY); + Assert(pg_atomic_read_u32(&test_instance->state) == FLOG_STARTUP || pg_atomic_read_u32(&test_instance->state) == FLOG_READY); } else { Assert(ctl_file_data.version_no & FLOG_SHUTDOWNED); polar_startup_flog(&checkpoint, test_instance); Assert(buf_ctl_test->buf_state == FLOG_BUF_READY); - Assert(test_instance->state == FLOG_STARTUP || test_instance->state == FLOG_READY); + Assert(pg_atomic_read_u32(&test_instance->state) == FLOG_STARTUP || pg_atomic_read_u32(&test_instance->state) == FLOG_READY); } check_flog_dir_validate(TEST_FLOG_NAME, false); /* Check the flashback log checkpoint info */ @@ -848,6 +858,8 @@ test_flog_startup(bool is_crash, flog_ctl_file_data_t ctl_file_data) /* Check the flashback point wal lsn */ Assert(flog_wal_info_equal(buf_ctl_test->wal_info, ctl_file_data.fbpoint_info.wal_info)); + /* Check the flashback log redo lsn */ + Assert(buf_ctl_test->redo_lsn == checkpoint.redo); /* Check the flashback log max seg no */ Assert(buf_ctl_test->max_seg_no == ctl_file_data.max_seg_no); /* Check the flashback log index */ @@ -864,7 +876,7 @@ test_flog_recover(void) polar_recover_flog(test_instance); Assert(buf_ctl_test->buf_state == FLOG_BUF_READY); - Assert(test_instance->state == FLOG_READY); + Assert(pg_atomic_read_u32(&test_instance->state) == FLOG_READY); } static void @@ -873,12 +885,12 @@ check_flog_list_insert(int buf_id, XLogRecPtr fbpoint_lsn) XLogRecPtr redo_lsn; BufferDesc *buf_hdr = GetBufferDescriptor(buf_id); - redo_lsn = polar_get_prior_fbpoint_lsn(buf_ctl_test); + redo_lsn = buf_ctl_test->redo_lsn; Assert(flog_list_test->flashback_list[buf_id].flashback_ptr == POLAR_INVALID_FLOG_REC_PTR); Assert(flog_list_test->flashback_list[buf_id].redo_lsn == redo_lsn); Assert(flog_list_test->flashback_list[buf_id].fbpoint_lsn == fbpoint_lsn); Assert(flog_list_test->flashback_list[buf_id].info & 
FLOG_LIST_SLOT_READY); - Assert(polar_check_buf_flog_state(buf_hdr, POLAR_BUF_IN_FLOG_LIST)); + Assert(POLAR_CHECK_BUF_FLOG_STATE(buf_hdr, POLAR_BUF_IN_FLOG_LIST)); } static void @@ -898,7 +910,7 @@ check_flog_list_clean(int buf_id, bool is_flog_flushed, bool check_buf_redo_stat Assert(flog_list_test->flashback_list[buf_id].origin_buf_index == -1); Assert(flog_list_test->head != buf_id); if (check_buf_redo_state) - Assert(!polar_check_buf_flog_state(buf_hdr, POLAR_BUF_IN_FLOG_LIST)); + Assert(!POLAR_CHECK_BUF_FLOG_STATE(buf_hdr, POLAR_BUF_IN_FLOG_LIST)); } static void @@ -921,8 +933,9 @@ test_flog_insert_list(bool has_origin_buffer) int8 origin_buf_index = -1; - rel = try_relation_open(test_relnodes[i], NoLock); + rel = try_relation_open(test_relnodes[i], AccessShareLock); buf = ReadBuffer(rel, 0); + relation_close(rel, AccessShareLock); Assert(buf > 0 && buf <= NBuffers); buf_id = buf - 1; buf_hdr = GetBufferDescriptor(buf_id); @@ -967,7 +980,7 @@ test_flog_insert_list(bool has_origin_buffer) buf_id = test_bufs[i]; Assert(flog_list_test->flashback_list[buf_id].prev_buf == NOT_IN_FLOG_LIST); - polar_insert_flog_rec_from_list_bg(flog_list_test, buf_ctl_test, flog_index_queue_test); + polar_process_flog_list_bg(test_instance); if (has_origin_buffer) { @@ -1163,6 +1176,8 @@ test_flog_repair_buffer(Buffer bufs[TEST_BUF_LIST_NUM]) { buf_hdr = GetBufferDescriptor(buf_id); memcpy(page.data, (char *) BufHdrGetBlock(buf_hdr), BLCKSZ); + /* Memset the hole to zero */ + MemSet(page.data + ((PageHeader) page.data)->pd_lower, 0, ((PageHeader) page.data)->pd_upper - ((PageHeader) page.data)->pd_lower); /* Set the buffer invalid first to test */ buf_state = LockBufHdr(buf_hdr); @@ -1172,9 +1187,8 @@ test_flog_repair_buffer(Buffer bufs[TEST_BUF_LIST_NUM]) /* Break the buffer */ memset((char *) BufHdrGetBlock(buf_hdr), 0, BLCKSZ); polar_repair_partial_write(test_instance, buf_hdr); - Assert(polar_check_buf_flog_state(buf_hdr, POLAR_BUF_FLOG_DISABLE)); + 
Assert(POLAR_CHECK_BUF_FLOG_STATE(buf_hdr, POLAR_BUF_FLOG_DISABLE)); Assert(memcmp(page.data, (char *) BufHdrGetBlock(buf_hdr), BLCKSZ) == 0); - /* Set the buffer valid again */ buf_state = LockBufHdr(buf_hdr); buf_state |= BM_VALID; @@ -1216,7 +1230,9 @@ test_flog_checkpoint(bool is_shutdown) ckp_start = polar_get_flog_write_result(buf_ctl_test); polar_set_fbpoint_wal_info(buf_ctl_test, lsn, ckp_time, InvalidXLogRecPtr, false); - polar_flog_do_fbpoint(test_instance, ckp_start, is_shutdown); + polar_flog_do_fbpoint(test_instance, ckp_start, ckp_start, is_shutdown); + /* Just set the redo lsn to lsn prior */ + POLAR_CHECK_POINT_FLOG(test_instance, lsn_prior); if (is_shutdown) { @@ -1244,7 +1260,7 @@ test_flog_checkpoint(bool is_shutdown) check_flog_control_file(ckp_start, ckp_end, ckp_end_prev, max_seg_no, buf_ctl_test->wal_info, is_shutdown); - Assert(buf_ctl_test->keep_wal_lsn == lsn_prior); + Assert(buf_ctl_test->redo_lsn == lsn_prior); check_flog_truncate(Min(ckp_start, polar_get_flog_index_meta_max_ptr(flog_index_test))); } @@ -1453,7 +1469,7 @@ test_insert_flog_from_bp(int test_bp_buf_nums, bool is_recovery) replay_lsn = polar_get_curr_fbpoint_lsn(buf_ctl_test) + 1; } - if (polar_check_buf_flog_state(buf_hdr, POLAR_BUF_IN_FLOG_LIST)) + if (POLAR_CHECK_BUF_FLOG_STATE(buf_hdr, POLAR_BUF_IN_FLOG_LIST)) { check_buf_redo_state = false; inserted = false; @@ -1536,13 +1552,13 @@ test_flashback_log_online_promote(void) polar_get_local_fbpoint_lsn(buf_ctl_test, InvalidXLogRecPtr, 1); if ((j & 1) || - polar_check_buf_flog_state(buf_hdr, POLAR_BUF_FLOG_LOST_CHECKED)) + POLAR_CHECK_BUF_FLOG_STATE(buf_hdr, POLAR_BUF_FLOG_LOST_CHECKED)) { polar_set_buf_flog_lost_checked(test_instance, &redo_instance, i + 1); inserted = false; } - if (polar_check_buf_flog_state(buf_hdr, POLAR_BUF_IN_FLOG_LIST)) + if (POLAR_CHECK_BUF_FLOG_STATE(buf_hdr, POLAR_BUF_IN_FLOG_LIST)) { inserted = false; check_buf_redo_state = false; @@ -1596,7 +1612,6 @@ test_flashback_log(PG_FUNCTION_ARGS) { 
polar_flog_rec_ptr ptr; flog_ctl_file_data_t ctl_file_data; - CheckPoint checkpoint; fbpoint_wal_info_data_t wal_info; if (!polar_is_flog_enabled(flog_instance)) @@ -1665,10 +1680,7 @@ test_flashback_log(PG_FUNCTION_ARGS) test_flashback_log_online_promote(); /* test remove the flashback data */ - polar_enable_flashback_log = false; - checkpoint.redo = GetRedoRecPtr(); - checkpoint.time = (pg_time_t) time(NULL); - polar_startup_flog(&checkpoint, test_instance); + polar_remove_all_flog_data(test_instance); check_flog_dir_validate(TEST_FLOG_NAME, true); PG_RETURN_VOID(); diff --git a/src/test/modules/test_flashback_log/test_flashback_log.conf b/src/test/modules/test_flashback_log/test_flashback_log.conf index 03bf4d01bd4..a632d3f4f67 100644 --- a/src/test/modules/test_flashback_log/test_flashback_log.conf +++ b/src/test/modules/test_flashback_log/test_flashback_log.conf @@ -2,14 +2,13 @@ # FLASHBACK LOG OPTIONS #------------------------------------------------------------------------------ polar_enable_flashback_log = on -polar_enable_lazy_checkpoint = off +polar_enable_lazy_checkpoint = on polar_flashback_log_keep_segments = 8 polar_flashback_log_buffers = 10 polar_flashback_log_insert_locks = 8 polar_flashback_logindex_queue_buffers = 1 polar_flashback_log_bgwrite_delay = 100 polar_flashback_log_flush_max_size = 50 -polar_flashback_log_enable_worker = off polar_flashback_log_debug = off full_page_writes = off polar_unit_test_mem_size=256 diff --git a/src/test/modules/test_flashback_table/.gitignore b/src/test/modules/test_flashback_table/.gitignore new file mode 100644 index 00000000000..5dcb3ff9723 --- /dev/null +++ b/src/test/modules/test_flashback_table/.gitignore @@ -0,0 +1,4 @@ +# Generated subdirectories +/log/ +/results/ +/tmp_check/ diff --git a/src/test/modules/test_flashback_table/Makefile b/src/test/modules/test_flashback_table/Makefile new file mode 100644 index 00000000000..e62dad4b45d --- /dev/null +++ b/src/test/modules/test_flashback_table/Makefile 
@@ -0,0 +1,52 @@ +# src/test/modules/test_flashback_table/Makefile + +MODULE_big = test_flashback_table +OBJS = test_flashback_table.o $(WIN32RES) +PGFILEDESC = "test_flashback_table - test code for flashback table feature" + +EXTENSION = test_flashback_table +DATA = test_flashback_table--1.0.sql +TEMP_CONFIG = "test_flashback_table.conf" + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = src/test/modules/test_flashback_table +top_builddir = ../../../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif + +# Disabled because these tests require "polar_enable_flashback_log=on" and "polar_enable_fast_recovery_area=on", +# which typical installcheck users do not have (e.g. buildfarm clients). +installcheck:; + +check: regresscheck isolationcheck + +submake-regress: + $(MAKE) -C $(top_builddir)/src/test/regress all + +submake-isolation: + $(MAKE) -C $(top_builddir)/src/test/isolation all + +submake-test-flashback-table: + $(MAKE) -C $(top_builddir)/src/test/modules/test_flashback_table + +REGRESSCHECKS=test_flashback_table + +regresscheck: | submake-regress submake-test-flashback-table temp-install + $(pg_regress_check) \ + --temp-config $(top_srcdir)/src/test/modules/test_flashback_table/test_flashback_table.conf \ + $(REGRESSCHECKS) + +ISOLATIONCHECKS=flashback_table_isolation + +isolationcheck: | submake-isolation submake-test-flashback-table temp-install + $(pg_isolation_regress_check) \ + --temp-config $(top_srcdir)/src/test/modules/test_flashback_table/test_flashback_table.conf \ + $(ISOLATIONCHECKS) + +.PHONY: submake-test-flashback-table submake-regress check \ + regresscheck isolationcheck diff --git a/src/test/modules/test_flashback_table/expected/flashback_table_isolation.out b/src/test/modules/test_flashback_table/expected/flashback_table_isolation.out new file mode 100644 index 00000000000..d21e1a2d08c --- /dev/null +++ 
b/src/test/modules/test_flashback_table/expected/flashback_table_isolation.out @@ -0,0 +1,361 @@ +Parsed test spec with 10 sessions + +starting permutation: s1_begin s1_insert_tbl s2_begin s2_insert_target s2_commit s3_copy_rel s3_sleep s3_delete_rel s3_flashback s3_check s1_commit +step s1_begin: BEGIN; +step s1_insert_tbl: INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t +step s1_commit: COMMIT; + +starting permutation: s1_begin s1_insert_tbl s6_begin s6_insert_tbl s2_begin s2_insert_target s2_commit s3_copy_rel s3_sleep s3_delete_rel s3_flashback s3_check s1_commit s6_commit +step s1_begin: BEGIN; +step s1_insert_tbl: INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); +step s6_begin: BEGIN; +step s6_insert_tbl: INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t +step s1_commit: COMMIT; +step s6_commit: COMMIT; + +starting permutation: s1_begin s1_insert_tbl s6_begin s6_insert_tbl s7_begin s7_insert_tbl s8_begin s8_insert_tbl s9_begin 
s9_insert_tbl s2_begin s2_insert_target s2_commit s3_copy_rel s3_sleep s3_delete_rel s3_flashback s3_check s1_commit s6_commit s7_commit s8_commit s9_commit +step s1_begin: BEGIN; +step s1_insert_tbl: INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); +step s6_begin: BEGIN; +step s6_insert_tbl: INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); +step s7_begin: BEGIN; +step s7_insert_tbl: INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); +step s8_begin: BEGIN; +step s8_insert_tbl: INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); +step s9_begin: BEGIN; +step s9_insert_tbl: INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t +step s1_commit: COMMIT; +step s6_commit: COMMIT; +step s7_commit: COMMIT; +step s8_commit: COMMIT; +step s9_commit: COMMIT; + +starting permutation: s2_begin s2_insert_target s3_copy_rel s3_sleep s3_delete_rel s3_flashback s2_commit s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s2_commit: COMMIT; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t + +starting permutation: s2_begin s2_insert_target s2_commit s4_begin 
s4_multi_xact s4_delete s4_commit s3_copy_rel s3_sleep s3_delete_rel s3_flashback s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s4_begin: BEGIN; +step s4_multi_xact: SELECT id FROM target_rel where id < 10 for share; +id + +1 +2 +3 +4 +5 +6 +7 +8 +9 +step s4_delete: DELETE FROM target_rel where id > 10; +step s4_commit: COMMIT; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t + +starting permutation: s2_begin s2_insert_target s2_commit s4_begin s4_multi_xact s4_delete s3_copy_rel s3_sleep s4_sleep s4_commit s3_delete_rel s3_flashback s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s4_begin: BEGIN; +step s4_multi_xact: SELECT id FROM target_rel where id < 10 for share; +id + +1 +2 +3 +4 +5 +6 +7 +8 +9 +step s4_delete: DELETE FROM target_rel where id > 10; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s4_sleep: SELECT pg_sleep(11); +pg_sleep + + +step s4_commit: COMMIT; +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t + +starting permutation: s2_begin s2_insert_target s2_commit s5_begin s5_update s5_subxact s5_delete s5_commit s3_copy_rel s3_sleep s3_delete_rel s3_flashback s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, 
fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s5_begin: BEGIN; +step s5_update: UPDATE target_rel SET first_name = 'test'; +step s5_subxact: SAVEPOINT a; +step s5_delete: DELETE FROM target_rel where id <= 10; +step s5_commit: COMMIT; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t + +starting permutation: s2_begin s2_insert_target s2_commit s5_begin s5_update s5_subxact s5_delete s3_copy_rel s3_sleep s5_sleep s5_commit s3_delete_rel s3_flashback s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s5_begin: BEGIN; +step s5_update: UPDATE target_rel SET first_name = 'test'; +step s5_subxact: SAVEPOINT a; +step s5_delete: DELETE FROM target_rel where id <= 10; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s5_sleep: SELECT pg_sleep(11); +pg_sleep + + +step s5_commit: COMMIT; +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t + +starting permutation: s2_begin s2_insert_target s2_commit s5_begin s5_update s5_subxact s5_delete s5_rollback s5_commit s3_copy_rel s3_sleep s3_delete_rel s3_flashback s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s5_begin: BEGIN; +step s5_update: UPDATE target_rel SET first_name = 'test'; 
+step s5_subxact: SAVEPOINT a; +step s5_delete: DELETE FROM target_rel where id <= 10; +step s5_rollback: ROLLBACK TO SAVEPOINT a; +step s5_commit: COMMIT; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t + +starting permutation: s2_begin s2_insert_target s2_commit s5_begin s5_update s5_subxact s5_delete s5_rollback s3_copy_rel s3_sleep s5_sleep s5_commit s3_delete_rel s3_flashback s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s5_begin: BEGIN; +step s5_update: UPDATE target_rel SET first_name = 'test'; +step s5_subxact: SAVEPOINT a; +step s5_delete: DELETE FROM target_rel where id <= 10; +step s5_rollback: ROLLBACK TO SAVEPOINT a; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s5_sleep: SELECT pg_sleep(11); +pg_sleep + + +step s5_commit: COMMIT; +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t + +starting permutation: s2_begin s2_insert_target s2_commit s5_begin s5_subxact s5_update s5_delete s5_rollback s3_copy_rel s3_sleep s5_sleep s5_commit s3_delete_rel s3_flashback s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s5_begin: BEGIN; +step s5_subxact: SAVEPOINT a; +step s5_update: UPDATE target_rel SET first_name = 'test'; +step s5_delete: DELETE FROM target_rel 
where id <= 10; +step s5_rollback: ROLLBACK TO SAVEPOINT a; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s5_sleep: SELECT pg_sleep(11); +pg_sleep + + +step s5_commit: COMMIT; +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t + +starting permutation: s2_begin s2_insert_target s2_commit s5_begin s5_update s5_subxact s5_delete s5_rollback s5_commit s3_copy_rel s3_sleep s3_delete_rel s3_flashback s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s5_begin: BEGIN; +step s5_update: UPDATE target_rel SET first_name = 'test'; +step s5_subxact: SAVEPOINT a; +step s5_delete: DELETE FROM target_rel where id <= 10; +step s5_rollback: ROLLBACK TO SAVEPOINT a; +step s5_commit: COMMIT; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t + +starting permutation: s2_begin s2_insert_target s2_commit s10_begin s10_delete s10_prepare_xact s10_commit s3_copy_rel s3_sleep s3_delete_rel s3_flashback s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s10_begin: BEGIN; +step s10_delete: DELETE FROM target_rel where id <= 10; +step s10_prepare_xact: prepare transaction 'prepare_xact'; +step s10_commit: COMMIT prepared 'prepare_xact'; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM 
target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t + +starting permutation: s2_begin s2_insert_target s2_commit s10_begin s10_delete s10_prepare_xact s3_copy_rel s3_sleep s10_sleep s10_commit s3_delete_rel s3_flashback s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s10_begin: BEGIN; +step s10_delete: DELETE FROM target_rel where id <= 10; +step s10_prepare_xact: prepare transaction 'prepare_xact'; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s10_sleep: SELECT pg_sleep(11); +pg_sleep + + +step s10_commit: COMMIT prepared 'prepare_xact'; +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; +step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t + +starting permutation: s2_begin s2_insert_target s2_commit s10_begin s10_delete s10_prepare_xact s10_rollback s3_copy_rel s3_sleep s3_delete_rel s3_flashback s3_check +step s2_begin: BEGIN; +step s2_insert_target: INSERT INTO target_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +step s2_commit: COMMIT; +step s10_begin: BEGIN; +step s10_delete: DELETE FROM target_rel where id <= 10; +step s10_prepare_xact: prepare transaction 'prepare_xact'; +step s10_rollback: ROLLBACK prepared 'prepare_xact'; +step s3_copy_rel: CREATE TABLE copy_rel as SELECT * FROM target_rel; +step s3_sleep: SELECT pg_sleep(10); +pg_sleep + + +step s3_delete_rel: DELETE FROM target_rel; +step s3_flashback: FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; 
+step s3_check: SELECT fb_check_data('target_rel', 'copy_rel'); +fb_check_data + +t diff --git a/src/test/modules/test_flashback_table/expected/test_flashback_table.out b/src/test/modules/test_flashback_table/expected/test_flashback_table.out new file mode 100644 index 00000000000..ce315085ee8 --- /dev/null +++ b/src/test/modules/test_flashback_table/expected/test_flashback_table.out @@ -0,0 +1,381 @@ +CREATE EXTENSION IF NOT EXISTS test_flashback_table; +-- test fast recovery area interface +SELECT test_fast_recovery_area(); + test_fast_recovery_area +------------------------- + +(1 row) + +-- setup +CREATE ROLE fb_test with login; +DROP TABLE IF EXISTS old_rel; +DROP TABLE IF EXISTS expected_rel; +CREATE TABLE old_rel(id int PRIMARY KEY, first_name varchar(6), last_name varchar(6)); +SELECT clean_fb_rel('old_rel'); + clean_fb_rel +-------------- + t +(1 row) + +-- delete and flashback table +-- let flashback logindex has something not inserted +ALTER SYSTEM SET polar_flashback_log_insert_list_delay = 10000; +SELECT * FROM pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +INSERT INTO old_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +CHECKPOINT; +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +DELETE FROM old_rel; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); + fb_check_data +--------------- + t +(1 row) + +SELECT clean_fb_rel('old_rel'); + clean_fb_rel +-------------- + t +(1 row) + +DROP TABLE expected_rel; +ALTER SYSTEM RESET polar_flashback_log_insert_list_delay; +SELECT * FROM pg_reload_conf(); + pg_reload_conf +---------------- + t +(1 row) + +-- insert and flashback table +SET polar_workers_per_flashback_table = 0; +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +INSERT INTO old_rel 
SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); + fb_check_data +--------------- + t +(1 row) + +SELECT clean_fb_rel('old_rel'); + clean_fb_rel +-------------- + t +(1 row) + +DROP TABLE expected_rel; +RESET polar_workers_per_flashback_table; +-- update and flashback table +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +UPDATE old_rel set first_name='fbtest'; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); + fb_check_data +--------------- + t +(1 row) + +SELECT clean_fb_rel('old_rel'); + clean_fb_rel +-------------- + t +(1 row) + +DROP TABLE expected_rel; +-- alter add check and flashback table +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +DELETE from old_rel where id < 3; +ALTER TABLE old_rel ADD CHECK (id >= 3); +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); + fb_check_data +--------------- + t +(1 row) + +SELECT clean_fb_rel('old_rel'); + clean_fb_rel +-------------- + t +(1 row) + +DROP TABLE expected_rel; +-- alter drop column and flashback table without rewrite table +CREATE TABLE expected_rel(id int, last_name varchar(6)); +INSERT INTO expected_rel SELECT id, last_name FROM old_rel; +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +ALTER TABLE old_rel DROP COLUMN first_name; +DELETE FROM old_rel; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); + fb_check_data +--------------- + t +(1 row) + +SELECT clean_fb_rel('old_rel'); + clean_fb_rel +-------------- + t +(1 row) + +DROP 
TABLE expected_rel; +-- alter add column and flashback table without rewrite table +INSERT INTO old_rel SELECT *, fb_random_string(6) FROM generate_series(3,10000); +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +ALTER TABLE old_rel ADD COLUMN first_name varchar(6); +UPDATE old_rel SET first_name='test'; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +CREATE TABLE expected_fb_rel (LIKE old_rel); +INSERT INTO expected_fb_rel select id, last_name from expected_rel; +SELECT fb_check_data('old_rel', 'expected_fb_rel'); + fb_check_data +--------------- + t +(1 row) + +SELECT clean_fb_rel('old_rel'); + clean_fb_rel +-------------- + t +(1 row) + +DROP TABLE expected_rel; +DROP TABLE expected_fb_rel; +-- alter column type and flashback table without rewrite table +CREATE DOMAIN int32 AS int; +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +ALTER TABLE old_rel ALTER id TYPE int32; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); + fb_check_data +--------------- + t +(1 row) + +SELECT clean_fb_rel('old_rel'); + clean_fb_rel +-------------- + t +(1 row) + +DROP TABLE expected_rel; +-- flashback table out of rotation +FLASHBACK TABLE old_rel to timestamp '1990-01-01 00:00:00'; +ERROR: The lag between now and flashback table target time exceeds the polar_fast_recovery_area_rotation 180 minutes +FLASHBACK TABLE old_rel to timestamp '2999-01-01 00:00:00'; +ERROR: The flashback table target time exceeds now! 
+--flashback table with a role which is not its owner +SET SESSION AUTHORIZATION fb_test; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +ERROR: must be owner of table old_rel +RESET SESSION AUTHORIZATION; +-- flashback index +CREATE INDEX id_int_index on old_rel(id); +FLASHBACK TABLE id_int_index to timestamp now() - interval '10s'; +ERROR: can not support to flashback an irregular table "id_int_index" now. +-- flashback table has a toast table +CREATE TABLE fb_toast(id int, name text); +FLASHBACK TABLE fb_toast to timestamp now() - interval '10s'; +ERROR: can not support to flashback table which has a toast table "fb_toast". +DROP TABLE fb_toast; +-- flashback partitioned and partition table +CREATE TABLE fb_partitioned ( + id int, + create_time timestamp(0) +) PARTITION BY RANGE(create_time); +CREATE TABLE fb_partition_202003 PARTITION OF fb_partitioned FOR VALUES FROM ('2020-03-01') TO ('2020-04-01'); +FLASHBACK TABLE fb_partitioned to timestamp now() - interval '10s'; +ERROR: can not support to flashback an irregular table "fb_partitioned" now. +FLASHBACK TABLE fb_partition_202003 to timestamp now() - interval '10s'; +ERROR: can not support to flashback an partition table "fb_partition_202003" now. +DROP TABLE fb_partitioned; +-- flashback partitioned and partition table +FLASHBACK TABLE pg_class to timestamp now() - interval '10s'; +ERROR: can not support to flashback system catalog "pg_class". +-- flashback view and materialized view +CREATE VIEW fb_view AS SELECT * FROM old_rel; +FLASHBACK TABLE fb_view to timestamp now() - interval '10s'; +ERROR: can not support to flashback an irregular table "fb_view" now. +DROP VIEW fb_view; +CREATE MATERIALIZED VIEW fb_materialized_view AS SELECT * FROM old_rel; +FLASHBACK TABLE fb_materialized_view to timestamp now() - interval '10s'; +ERROR: can not support to flashback an irregular table "fb_materialized_view" now. 
+DROP MATERIALIZED VIEW fb_materialized_view; +-- alter table with oids and flashback table with rewrite table +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +ALTER TABLE old_rel SET WITH OIDS; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +ERROR: The relation file node has been changed by vacuum full or alter table or truncate table in the past, we can't flashback the relation. +-- alter table without oids and flashback table with rewrite table +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +ALTER TABLE old_rel SET WITHOUT OIDS; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +ERROR: The relation file node has been changed by vacuum full or alter table or truncate table in the past, we can't flashback the relation. +-- alter add column with identity and flashback table with rewrite table +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +ALTER TABLE old_rel ADD COLUMN test_identity int GENERATED ALWAYS AS IDENTITY; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +ERROR: The relation file node has been changed by vacuum full or alter table or truncate table in the past, we can't flashback the relation. +-- alter add column type with check and flashback table with rewrite table +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +CREATE DOMAIN age AS int CHECK(VALUE >=0 AND VALUE <= 200); +ALTER TABLE old_rel ADD COLUMN man_age age; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +ERROR: The relation file node has been changed by vacuum full or alter table or truncate table in the past, we can't flashback the relation. 
+-- alter add column with default contain volatile function and flashback table with rewrite table +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +ALTER TABLE old_rel ADD COLUMN user_id int DEFAULT random()*10000; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +ERROR: The relation file node has been changed by vacuum full or alter table or truncate table in the past, we can't flashback the relation. +-- alter table unlooged and flashback table with rewrite table +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +ALTER TABLE old_rel SET UNLOGGED; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +ERROR: can not support to flashback non-persistence table "old_rel". +-- alter table logged and flashback table with rewrite table +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +ALTER TABLE old_rel SET LOGGED; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +ERROR: The relation file node has been changed by vacuum full or alter table or truncate table in the past, we can't flashback the relation. +-- vacuum full table and flashback table with rewrite table +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +VACUUM FULL old_rel; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +ERROR: The relation file node has been changed by vacuum full or alter table or truncate table in the past, we can't flashback the relation. +-- truncate table and flashback table with rewrite table +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +TRUNCATE TABLE old_rel; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +ERROR: The relation file node has been changed by vacuum full or alter table or truncate table in the past, we can't flashback the relation. 
+-- alter table add a foregin key and flashback table +INSERT INTO old_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(3,10000); +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); + pg_sleep +---------- + +(1 row) + +CREATE TABLE fb_foreign_key(nick_name varchar(6) PRIMARY KEY); +DELETE FROM old_rel; +ALTER TABLE old_rel ADD CONSTRAINT fb_foreign FOREIGN KEY (first_name) REFERENCES fb_foreign_key (nick_name); +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); + fb_check_data +--------------- + t +(1 row) + +SELECT clean_fb_rel('old_rel'); + clean_fb_rel +-------------- + t +(1 row) + +DROP TABLE expected_rel; +-- clean up +DROP TABLE IF EXISTS old_rel; +DROP TABLE IF EXISTS fb_foreign_key; +DROP DOMAIN int32; +DROP DOMAIN age; +DROP EXTENSION test_flashback_table; diff --git a/src/test/modules/test_flashback_table/specs/flashback_table_isolation.spec b/src/test/modules/test_flashback_table/specs/flashback_table_isolation.spec new file mode 100644 index 00000000000..8211efb1c6e --- /dev/null +++ b/src/test/modules/test_flashback_table/specs/flashback_table_isolation.spec @@ -0,0 +1,99 @@ +setup +{ + CREATE EXTENSION IF NOT EXISTS test_flashback_table; + DROP TABLE IF EXISTS target_rel; + DROP TABLE IF EXISTS copy_rel; + DROP TABLE IF EXISTS tbl1; + CREATE TABLE target_rel(id int PRIMARY KEY, first_name varchar(6), last_name varchar(6)); + SELECT clean_fb_rel('target_rel'); + CREATE TABLE tbl1(id int); +} + +teardown +{ + SELECT clean_fb_rel('target_rel'); + DROP TABLE IF EXISTS target_rel; + DROP TABLE IF EXISTS copy_rel; + DROP TABLE IF EXISTS tbl1; +} + +session "s1" +step "s1_begin" { BEGIN; } +step "s1_insert_tbl" { INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); } +step "s1_commit" { COMMIT; } + +session "s2" +step "s2_begin" { BEGIN; } +step "s2_insert_target" { INSERT INTO target_rel SELECT *, 
fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); } +step "s2_commit" { COMMIT; } + +session "s3" +step "s3_copy_rel" { CREATE TABLE copy_rel as SELECT * FROM target_rel; } +step "s3_sleep" { SELECT pg_sleep(10); } +step "s3_delete_rel" { DELETE FROM target_rel; } +step "s3_flashback" { FLASHBACK TABLE target_rel to timestamp now() - interval '10s'; } +step "s3_check" { SELECT fb_check_data('target_rel', 'copy_rel'); } + +session "s4" +step "s4_begin" { BEGIN; } +step "s4_multi_xact" { SELECT id FROM target_rel where id < 10 for share; } +step "s4_delete" { DELETE FROM target_rel where id > 10; } +step "s4_sleep" { SELECT pg_sleep(11); } +step "s4_commit" { COMMIT; } + +session "s5" +step "s5_begin" { BEGIN; } +step "s5_update" { UPDATE target_rel SET first_name = 'test'; } +step "s5_subxact" { SAVEPOINT a; } +step "s5_delete" { DELETE FROM target_rel where id <= 10; } +step "s5_rollback" { ROLLBACK TO SAVEPOINT a; } +step "s5_sleep" { SELECT pg_sleep(11); } +step "s5_commit" { COMMIT; } + +session "s6" +step "s6_begin" { BEGIN; } +step "s6_insert_tbl" { INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); } +step "s6_commit" { COMMIT; } + +session "s7" +step "s7_begin" { BEGIN; } +step "s7_insert_tbl" { INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); } +step "s7_commit" { COMMIT; } + +session "s8" +step "s8_begin" { BEGIN; } +step "s8_insert_tbl" { INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); } +step "s8_commit" { COMMIT; } + +session "s9" +step "s9_begin" { BEGIN; } +step "s9_insert_tbl" { INSERT INTO tbl1 SELECT * FROM generate_series(1,10000); } +step "s9_commit" { COMMIT; } + +session "s10" +step "s10_begin" { BEGIN; } +step "s10_delete" { DELETE FROM target_rel where id <= 10; } +step "s10_prepare_xact" { prepare transaction 'prepare_xact'; } +step "s10_sleep" { SELECT pg_sleep(11); } +step "s10_commit" { COMMIT prepared 'prepare_xact'; } +step "s10_rollback" { ROLLBACK prepared 'prepare_xact'; } + +# long 
transaction +permutation "s1_begin" "s1_insert_tbl" "s2_begin" "s2_insert_target" "s2_commit" "s3_copy_rel" "s3_sleep" "s3_delete_rel" "s3_flashback" "s3_check" "s1_commit" +permutation "s1_begin" "s1_insert_tbl" "s6_begin" "s6_insert_tbl" "s2_begin" "s2_insert_target" "s2_commit" "s3_copy_rel" "s3_sleep" "s3_delete_rel" "s3_flashback" "s3_check" "s1_commit" "s6_commit" +permutation "s1_begin" "s1_insert_tbl" "s6_begin" "s6_insert_tbl" "s7_begin" "s7_insert_tbl" "s8_begin" "s8_insert_tbl" "s9_begin" "s9_insert_tbl" "s2_begin" "s2_insert_target" "s2_commit" "s3_copy_rel" "s3_sleep" "s3_delete_rel" "s3_flashback" "s3_check" "s1_commit" "s6_commit" "s7_commit" "s8_commit" "s9_commit" +permutation "s2_begin" "s2_insert_target" "s3_copy_rel" "s3_sleep" "s3_delete_rel" "s3_flashback" "s2_commit" "s3_check" +# multi transaction +permutation "s2_begin" "s2_insert_target" "s2_commit" "s4_begin" "s4_multi_xact" "s4_delete" "s4_commit" "s3_copy_rel" "s3_sleep" "s3_delete_rel" "s3_flashback" "s3_check" +permutation "s2_begin" "s2_insert_target" "s2_commit" "s4_begin" "s4_multi_xact" "s4_delete" "s3_copy_rel" "s3_sleep" "s4_sleep" "s4_commit" "s3_delete_rel" "s3_flashback" "s3_check" +# sub transaction +permutation "s2_begin" "s2_insert_target" "s2_commit" "s5_begin" "s5_update" "s5_subxact" "s5_delete" "s5_commit" "s3_copy_rel" "s3_sleep" "s3_delete_rel" "s3_flashback" "s3_check" +permutation "s2_begin" "s2_insert_target" "s2_commit" "s5_begin" "s5_update" "s5_subxact" "s5_delete" "s3_copy_rel" "s3_sleep" "s5_sleep" "s5_commit" "s3_delete_rel" "s3_flashback" "s3_check" +permutation "s2_begin" "s2_insert_target" "s2_commit" "s5_begin" "s5_update" "s5_subxact" "s5_delete" "s5_rollback" "s5_commit" "s3_copy_rel" "s3_sleep" "s3_delete_rel" "s3_flashback" "s3_check" +permutation "s2_begin" "s2_insert_target" "s2_commit" "s5_begin" "s5_update" "s5_subxact" "s5_delete" "s5_rollback" "s3_copy_rel" "s3_sleep" "s5_sleep" "s5_commit" "s3_delete_rel" "s3_flashback" "s3_check" +permutation 
"s2_begin" "s2_insert_target" "s2_commit" "s5_begin" "s5_subxact" "s5_update" "s5_delete" "s5_rollback" "s3_copy_rel" "s3_sleep" "s5_sleep" "s5_commit" "s3_delete_rel" "s3_flashback" "s3_check" +permutation "s2_begin" "s2_insert_target" "s2_commit" "s5_begin" "s5_update" "s5_subxact" "s5_delete" "s5_rollback" "s5_commit" "s3_copy_rel" "s3_sleep" "s3_delete_rel" "s3_flashback" "s3_check" +# prepare transaction +permutation "s2_begin" "s2_insert_target" "s2_commit" "s10_begin" "s10_delete" "s10_prepare_xact" "s10_commit" "s3_copy_rel" "s3_sleep" "s3_delete_rel" "s3_flashback" "s3_check" +permutation "s2_begin" "s2_insert_target" "s2_commit" "s10_begin" "s10_delete" "s10_prepare_xact" "s3_copy_rel" "s3_sleep" "s10_sleep" "s10_commit" "s3_delete_rel" "s3_flashback" "s3_check" +permutation "s2_begin" "s2_insert_target" "s2_commit" "s10_begin" "s10_delete" "s10_prepare_xact" "s10_rollback" "s3_copy_rel" "s3_sleep" "s3_delete_rel" "s3_flashback" "s3_check" \ No newline at end of file diff --git a/src/test/modules/test_flashback_table/sql/test_flashback_table.sql b/src/test/modules/test_flashback_table/sql/test_flashback_table.sql new file mode 100644 index 00000000000..11328bc4ae2 --- /dev/null +++ b/src/test/modules/test_flashback_table/sql/test_flashback_table.sql @@ -0,0 +1,204 @@ +CREATE EXTENSION IF NOT EXISTS test_flashback_table; + +-- test fast recovery area interface +SELECT test_fast_recovery_area(); + +-- setup +CREATE ROLE fb_test with login; +DROP TABLE IF EXISTS old_rel; +DROP TABLE IF EXISTS expected_rel; +CREATE TABLE old_rel(id int PRIMARY KEY, first_name varchar(6), last_name varchar(6)); +SELECT clean_fb_rel('old_rel'); + +-- delete and flashback table +-- let flashback logindex has something not inserted +ALTER SYSTEM SET polar_flashback_log_insert_list_delay = 10000; +SELECT * FROM pg_reload_conf(); +INSERT INTO old_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +CHECKPOINT; +CREATE TABLE expected_rel as SELECT * 
FROM old_rel; +SELECT pg_sleep(10); +DELETE FROM old_rel; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); +SELECT clean_fb_rel('old_rel'); +DROP TABLE expected_rel; +ALTER SYSTEM RESET polar_flashback_log_insert_list_delay; +SELECT * FROM pg_reload_conf(); + +-- insert and flashback table +SET polar_workers_per_flashback_table = 0; +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); +INSERT INTO old_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(1,10000); +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); +SELECT clean_fb_rel('old_rel'); +DROP TABLE expected_rel; +RESET polar_workers_per_flashback_table; + +-- update and flashback table +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); +UPDATE old_rel set first_name='fbtest'; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); +SELECT clean_fb_rel('old_rel'); +DROP TABLE expected_rel; + +-- alter add check and flashback table +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); +DELETE from old_rel where id < 3; +ALTER TABLE old_rel ADD CHECK (id >= 3); +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); +SELECT clean_fb_rel('old_rel'); +DROP TABLE expected_rel; + +-- alter drop column and flashback table without rewrite table +CREATE TABLE expected_rel(id int, last_name varchar(6)); +INSERT INTO expected_rel SELECT id, last_name FROM old_rel; +SELECT pg_sleep(10); +ALTER TABLE old_rel DROP COLUMN first_name; +DELETE FROM old_rel; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 
'expected_rel'); +SELECT clean_fb_rel('old_rel'); +DROP TABLE expected_rel; + +-- alter add column and flashback table without rewrite table +INSERT INTO old_rel SELECT *, fb_random_string(6) FROM generate_series(3,10000); +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); +ALTER TABLE old_rel ADD COLUMN first_name varchar(6); +UPDATE old_rel SET first_name='test'; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +CREATE TABLE expected_fb_rel (LIKE old_rel); +INSERT INTO expected_fb_rel select id, last_name from expected_rel; +SELECT fb_check_data('old_rel', 'expected_fb_rel'); +SELECT clean_fb_rel('old_rel'); +DROP TABLE expected_rel; +DROP TABLE expected_fb_rel; + +-- alter column type and flashback table without rewrite table +CREATE DOMAIN int32 AS int; +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); +ALTER TABLE old_rel ALTER id TYPE int32; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); +SELECT clean_fb_rel('old_rel'); +DROP TABLE expected_rel; + +-- flashback table out of rotation +FLASHBACK TABLE old_rel to timestamp '1990-01-01 00:00:00'; +FLASHBACK TABLE old_rel to timestamp '2999-01-01 00:00:00'; + +--flashback table with a role which is not its owner +SET SESSION AUTHORIZATION fb_test; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +RESET SESSION AUTHORIZATION; + +-- flashback index +CREATE INDEX id_int_index on old_rel(id); +FLASHBACK TABLE id_int_index to timestamp now() - interval '10s'; + +-- flashback table has a toast table +CREATE TABLE fb_toast(id int, name text); +FLASHBACK TABLE fb_toast to timestamp now() - interval '10s'; +DROP TABLE fb_toast; + +-- flashback partitioned and partition table +CREATE TABLE fb_partitioned ( + id int, + create_time timestamp(0) +) PARTITION BY RANGE(create_time); +CREATE TABLE fb_partition_202003 PARTITION OF 
fb_partitioned FOR VALUES FROM ('2020-03-01') TO ('2020-04-01'); +FLASHBACK TABLE fb_partitioned to timestamp now() - interval '10s'; +FLASHBACK TABLE fb_partition_202003 to timestamp now() - interval '10s'; +DROP TABLE fb_partitioned; + +-- flashback partitioned and partition table +FLASHBACK TABLE pg_class to timestamp now() - interval '10s'; + +-- flashback view and materialized view +CREATE VIEW fb_view AS SELECT * FROM old_rel; +FLASHBACK TABLE fb_view to timestamp now() - interval '10s'; +DROP VIEW fb_view; +CREATE MATERIALIZED VIEW fb_materialized_view AS SELECT * FROM old_rel; +FLASHBACK TABLE fb_materialized_view to timestamp now() - interval '10s'; +DROP MATERIALIZED VIEW fb_materialized_view; + +-- alter table with oids and flashback table with rewrite table +SELECT pg_sleep(10); +ALTER TABLE old_rel SET WITH OIDS; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; + +-- alter table without oids and flashback table with rewrite table +SELECT pg_sleep(10); +ALTER TABLE old_rel SET WITHOUT OIDS; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; + +-- alter add column with identity and flashback table with rewrite table +SELECT pg_sleep(10); +ALTER TABLE old_rel ADD COLUMN test_identity int GENERATED ALWAYS AS IDENTITY; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; + +-- alter add column type with check and flashback table with rewrite table +SELECT pg_sleep(10); +CREATE DOMAIN age AS int CHECK(VALUE >=0 AND VALUE <= 200); +ALTER TABLE old_rel ADD COLUMN man_age age; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; + +-- alter add column with default contain volatile function and flashback table with rewrite table +SELECT pg_sleep(10); +ALTER TABLE old_rel ADD COLUMN user_id int DEFAULT random()*10000; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; + +-- alter table unlooged and flashback table with rewrite table +SELECT pg_sleep(10); +ALTER TABLE old_rel SET UNLOGGED; +FLASHBACK TABLE 
old_rel to timestamp now() - interval '10s'; + +-- alter table logged and flashback table with rewrite table +SELECT pg_sleep(10); +ALTER TABLE old_rel SET LOGGED; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; + +-- vacuum full table and flashback table with rewrite table +SELECT pg_sleep(10); +VACUUM FULL old_rel; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; + +-- truncate table and flashback table with rewrite table +SELECT pg_sleep(10); +TRUNCATE TABLE old_rel; +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; + +-- alter table add a foregin key and flashback table +INSERT INTO old_rel SELECT *, fb_random_string(6), fb_random_string(6) FROM generate_series(3,10000); +CREATE TABLE expected_rel as SELECT * FROM old_rel; +SELECT pg_sleep(10); +CREATE TABLE fb_foreign_key(nick_name varchar(6) PRIMARY KEY); +DELETE FROM old_rel; +ALTER TABLE old_rel ADD CONSTRAINT fb_foreign FOREIGN KEY (first_name) REFERENCES fb_foreign_key (nick_name); +FLASHBACK TABLE old_rel to timestamp now() - interval '10s'; +-- check the data and clean +SELECT fb_check_data('old_rel', 'expected_rel'); +SELECT clean_fb_rel('old_rel'); +DROP TABLE expected_rel; + +-- clean up +DROP TABLE IF EXISTS old_rel; +DROP TABLE IF EXISTS fb_foreign_key; +DROP DOMAIN int32; +DROP DOMAIN age; +DROP EXTENSION test_flashback_table; diff --git a/src/test/modules/test_flashback_table/test_flashback_table--1.0.sql b/src/test/modules/test_flashback_table/test_flashback_table--1.0.sql new file mode 100644 index 00000000000..e37fe5fea6c --- /dev/null +++ b/src/test/modules/test_flashback_table/test_flashback_table--1.0.sql @@ -0,0 +1,65 @@ +/* src/test/modules/test_flashback_table/test_flashback_table--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION test_flashback_table" to load this file. 
\quit +/* Implemented in C by the test_flashback_table module. */ +CREATE FUNCTION test_fast_recovery_area() +RETURNS pg_catalog.void STRICT +AS 'MODULE_PATHNAME' LANGUAGE C; +/* Return a string of num characters drawn at random from chars; raises 'Invalid length' when num < 1. */ +CREATE OR REPLACE FUNCTION fb_random_string( + num INTEGER, + chars TEXT default 'abcdefghijklmnopqrstuvwxyz' +) RETURNS TEXT +LANGUAGE plpgsql +AS $$ +DECLARE + res_str TEXT := ''; +BEGIN + IF num < 1 THEN + RAISE EXCEPTION 'Invalid length'; + END IF; + FOR __ IN 1..num LOOP + res_str := res_str || substr(chars, floor(random() * length(chars))::int + 1, 1); + END LOOP; + RETURN res_str; +END $$; +/* Compare polar_flashback_<oid of old_relname> with expected_relname: true when the number of row pairs with equal md5(row::text) equals count(*) of expected_relname. The relation name is passed to format() with %L so it is matched as a string literal, not as a quoted identifier. */ +CREATE OR REPLACE FUNCTION fb_check_data( + old_relname TEXT, + expected_relname TEXT +) RETURNS BOOLEAN +LANGUAGE plpgsql +AS $$ +DECLARE + flashback_relname TEXT; + old_relid OID; + count_rel_expected INTEGER; + count_rel_flashback INTEGER; +BEGIN + execute format('select oid from pg_class where relname=%L', old_relname) into old_relid; + flashback_relname := 'polar_flashback_' || old_relid; + execute 'select count(*) from ' || expected_relname into count_rel_expected; + execute 'select count(*) from ' || flashback_relname || ',' || expected_relname || ' where md5(CAST((' || flashback_relname || '.*)AS text)) = md5(CAST((' || expected_relname || '.*)AS text))' into count_rel_flashback; + + IF count_rel_expected = count_rel_flashback THEN + RETURN TRUE; + ELSE + RETURN FALSE; + END IF; +END $$; +/* Drop polar_flashback_<oid of old_relname> if it exists; always returns true. */ +CREATE OR REPLACE FUNCTION clean_fb_rel( + old_relname TEXT +) RETURNS BOOLEAN +LANGUAGE plpgsql +AS $$ +DECLARE + flashback_relname TEXT; + old_relid OID; +BEGIN + execute format('select oid from pg_class where relname=%L', old_relname) into old_relid; + flashback_relname := 'polar_flashback_' || old_relid; + execute 'DROP TABLE IF EXISTS ' || flashback_relname; + RETURN TRUE; +END $$; \ No newline at end of file diff --git a/src/test/modules/test_flashback_table/test_flashback_table.c b/src/test/modules/test_flashback_table/test_flashback_table.c new file mode 100644 index 00000000000..f1248a14daa --- /dev/null +++
b/src/test/modules/test_flashback_table/test_flashback_table.c @@ -0,0 +1,1036 @@ +/*------------------------------------------------------------------------- + * + * test_flashback_table.c + * Assert-based test helpers for the fast recovery area (FRA): its directories, flashback clog, flashback point records and flashback snapshots. + * + * Copyright (c) 2020-2120, Alibaba-inc PolarDB Group + * + * IDENTIFICATION + * src/test/modules/test_flashback_table/test_flashback_table.c + * + *------------------------------------------------------------------------- + */ +#include "postgres.h" + +#include "miscadmin.h" +#include "polar_flashback/polar_fast_recovery_area.h" +#include "polar_flashback/polar_flashback_snapshot.h" +#include "polar_flashback/polar_flashback_snapshot.h" +#include "utils/guc.h" + +PG_MODULE_MAGIC; + +#define TEST_FRA_NAME "testfra" +#define TEST_CLOG_SEGNO (4095) +#define XID_PER_CLOG_SEG (8*1024*32*4) +#define get_clog_segno_fname(segno, fname) (snprintf(fname, 5, "%04X", segno)) /* fname must hold at least 5 bytes */ +#define MIN_RECORD_TIME (2) +#define LUCKY_RECORD_NO (17) /* Must be (2, FBPOINT_REC_PER_SEG * 3 + 2] */ + +/* Saved hook values in case of unload */ +static shmem_startup_hook_type prev_shmem_startup_hook = NULL; +/* Control structures of the test FRA; assigned in test_fra_init(). */ +static fra_ctl_t test_fra = NULL; +static fpoint_ctl_t test_point_ctl = NULL; +static flashback_clog_ctl_t test_clog_ctl = NULL; + +/*---- Function declarations ----*/ +void _PG_init(void); +void _PG_fini(void); +/* Return true when polar_stat() succeeds on path and it is a directory. */ +static bool +check_dir_validate(char *path) +{ + struct stat st; + + if ((polar_stat(path, &st) == 0) && S_ISDIR(st.st_mode)) + return true; + else + return false; +} +/* Return true when sub_path, resolved against test_fra->dir by FRA_GET_SUBDIR_PATH, is a directory. */ +static bool +check_fra_subdir_validate(const char *sub_path) +{ + char path[MAXPGPATH]; + + FRA_GET_SUBDIR_PATH(test_fra->dir, sub_path, path); + return check_dir_validate(path); +} +/* Assert that the FRA directory is absent (is_rm) or present together with its clog and fbpoint subdirectories. */ +static void +check_fra_dir_validate(bool is_rm) +{ + bool clog_exist = false; + bool fbpoint_exist = false; + + if (is_rm) + { + Assert(!check_dir_validate(test_fra->dir)); + return; + } + + Assert(check_dir_validate(test_fra->dir)); + clog_exist = check_fra_subdir_validate(FLASHBACK_CLOG_DIR); + fbpoint_exist =
check_fra_subdir_validate(FBPOINT_DIR); + Assert(clog_exist && fbpoint_exist); +} +/* Create a new file at path (O_EXCL) holding one zero-filled SLRU segment whose first byte is xid_status. */ +static void +init_fake_clog(char *path, XidStatus xid_status) +{ + int fd; + static char data[SLRU_PAGES_PER_SEGMENT * BLCKSZ]; + int rc = 0; + + fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, true); + Assert(fd >= 0); + + MemSet(data, 0, sizeof(data)); + data[0] = xid_status; + rc = (int)polar_pwrite(fd, data, SLRU_PAGES_PER_SEGMENT * BLCKSZ, 0); + + Assert(rc == SLRU_PAGES_PER_SEGMENT * BLCKSZ); + polar_fsync(fd); + polar_close(fd); +} +/* Build the pg_xact path of segment segno; prefixed with polar_datadir in shared storage mode. */ +static void +get_clog_path(int segno, char *path) +{ + if (polar_enable_shared_storage_mode) + snprintf(path, MAXPGPATH, "%s/%s/%04X", polar_datadir, "pg_xact", segno); + else + snprintf(path, MAXPGPATH, "%s/%04X", "pg_xact", segno); +} +/* Return true when polar_stat() succeeds on path and it is a regular file. */ +static bool +is_file_exists(char *path) +{ + struct stat st; + + if ((polar_stat(path, &st) == 0) && S_ISREG(st.st_mode)) + return true; + else + return false; +} +/* Format "<clog_dir_path>/<clog_subdir_no as %08X>/<fname>" into path (MAXPGPATH bytes). */ +static void +get_fra_clog_path(char *clog_dir_path, uint32 clog_subdir_no, char *fname, char *path) +{ + snprintf(path, MAXPGPATH, "%s/%08X/%s", clog_dir_path, clog_subdir_no, fname); +} +/* Return true when the path, resolved against test_fra->dir, is a regular file. */ +static bool +is_file_in_fra(char *fname_relative_path) +{ + char file_path[MAXPGPATH]; + + FRA_GET_SUBDIR_PATH(test_fra->dir, fname_relative_path, file_path); + return is_file_exists(file_path); +} +/* Assert that fname exists in clog subdirectory (next_clog_subdir_no - 1) of the test FRA. */ +static void +check_clog_in_fra(char *fname) +{ + char relative_path[MAXPGPATH]; + uint32 next_clog_subdir_no; + bool is_exist = false; + + next_clog_subdir_no = pg_atomic_read_u32(&test_clog_ctl->next_clog_subdir_no); + Assert(next_clog_subdir_no); + next_clog_subdir_no--; + get_fra_clog_path(FLASHBACK_CLOG_DIR, next_clog_subdir_no, fname, relative_path); + is_exist = is_file_in_fra(relative_path); + Assert(is_exist); +} +/* Obtain the test FRA from fra_shmem_init_internal() and cache its point and clog control pointers. */ +static void +test_fra_init(void) +{ + test_fra = fra_shmem_init_internal(TEST_FRA_NAME); + test_point_ctl = test_fra->point_ctl; + test_clog_ctl = test_fra->clog_ctl; + Assert(strncmp(test_fra->dir, TEST_FRA_NAME,
FL_INS_MAX_NAME_LEN) == 0); +} +/* Run polar_startup_fra() on the test FRA, then assert its control values equal the arguments and its directories exist. */ +static void +test_fra_startup(int min_clog_seg_no, uint32 next_clog_subdir_no, + uint64 next_fbpoint_rec_no, XLogRecPtr min_keep_lsn, fbpoint_pos_t snapshot_end_pos) +{ + polar_startup_fra(test_fra); + + Assert(test_fra->next_fbpoint_rec_no == next_fbpoint_rec_no); + Assert(test_fra->min_keep_lsn == min_keep_lsn); + Assert(FBPOINT_POS_EQUAL(test_fra->snapshot_end_pos, snapshot_end_pos)); + Assert(test_point_ctl->next_fbpoint_rec_no == next_fbpoint_rec_no); + Assert(test_clog_ctl->min_clog_seg_no == min_clog_seg_no); + Assert(pg_atomic_read_u32(&test_clog_ctl->next_clog_subdir_no) == next_clog_subdir_no); + check_fra_dir_validate(false); +} +/* Call the previously saved shmem startup hook, if any, then run test_fra_init() under AddinShmemInitLock. */ +static void +test_fra_shmem_startup(void) +{ + if (prev_shmem_startup_hook) + prev_shmem_startup_hook(); + + /* + * Create or attach to the shared memory state, including hash table + */ + LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE); + test_fra_init(); + LWLockRelease(AddinShmemInitLock); +} +/* Create a fake pg_xact segment, move it into the test FRA, and assert it is there and next_clog_subdir_no equals clog_subdir_expected + 1. */ +static void +test_clog_to_fra(int segno, uint32 clog_subdir_expected, XidStatus xid_status) +{ + SlruCtlData clog_ctl; + char fname[5]; + char path[MAXPGPATH]; + + StrNCpy(clog_ctl.Dir, "pg_xact", 64); + Assert(polar_slru_seg_need_mv(test_fra, &clog_ctl)); + + /* Create a fake clog */ + get_clog_path(segno, path); + init_fake_clog(path, xid_status); + + /* Move clog to FRA */ + get_clog_segno_fname(segno, fname); + polar_mv_slru_seg_to_fra(test_fra, fname, path); + + /* Check */ + check_clog_in_fra(fname); + Assert(pg_atomic_read_u32(&test_clog_ctl->next_clog_subdir_no) == clog_subdir_expected + 1); +} +/* Fill rec_data with flog_ptr, the lsn and time from wal_info, and the current next_clog_subdir_no. */ +static void +get_fbpoint_rec_expected(fbpoint_rec_data_t *rec_data, fbpoint_wal_info_data_t *wal_info, + polar_flog_rec_ptr flog_ptr) +{ + uint32 next_clog_subdir_no; + + next_clog_subdir_no = pg_atomic_read_u32(&test_clog_ctl->next_clog_subdir_no); + rec_data->flog_ptr = flog_ptr; + rec_data->redo_lsn = wal_info->fbpoint_lsn; + rec_data->time = wal_info->fbpoint_time; +
rec_data->next_clog_subdir_no = next_clog_subdir_no; +} +/* Read the FRA control file of the test FRA, verify its CRC and version, and assert its fields equal the arguments. */ +static void +check_fra_ctl_file(uint64 next_fbpoint_rec_no, XLogRecPtr min_keep_lsn, + int min_clog_seg_no, uint32 next_clog_subdir_no) +{ + char ctl_file_path[MAXPGPATH]; + int fd; + int read_len; + pg_crc32c crc; + int rc; + fra_ctl_file_data_t *ctl_file_data; + + ctl_file_data = palloc0(sizeof(fra_ctl_file_data_t)); + polar_make_file_path_level3(ctl_file_path, test_fra->dir, FRA_CTL_FILE_NAME); + fd = BasicOpenFile(ctl_file_path, O_RDWR | PG_BINARY, true); + Assert(fd >= 0); + read_len = polar_read(fd, ctl_file_data, sizeof(fra_ctl_file_data_t)); + Assert(read_len == sizeof(fra_ctl_file_data_t)); + rc = polar_close(fd); + Assert(!rc); + + /* Verify CRC */ + INIT_CRC32C(crc); + COMP_CRC32C(crc, (char *) ctl_file_data, offsetof(fra_ctl_file_data_t, crc)); + FIN_CRC32C(crc); + Assert(EQ_CRC32C(crc, ctl_file_data->crc)); + + Assert(ctl_file_data->version_no == FRA_CTL_FILE_VERSION); + Assert(ctl_file_data->next_fbpoint_rec_no == next_fbpoint_rec_no); + Assert(ctl_file_data->min_keep_lsn == min_keep_lsn); + Assert(ctl_file_data->min_clog_seg_no == min_clog_seg_no); + Assert(ctl_file_data->next_clog_subdir_no == next_clog_subdir_no); + + pfree(ctl_file_data); +} +/* Build the path of fbpoint segment seg_no ("<fra dir>/<FBPOINT_DIR>/<%08X>") via polar_make_file_path_level2(). */ +static void +get_test_fbpoint_file_path(uint32 seg_no, char *path) +{ + char relative_path[MAXPGPATH]; + + snprintf(relative_path, MAXPGPATH, "%s/%s/%08X", test_fra->dir, FBPOINT_DIR, seg_no); + + polar_make_file_path_level2(path, relative_path); +} +/* Assert the CRC stored in an fbpoint page header; it is computed over the page body first, then over the header up to its crc field. */ +static void +check_fbpoint_page_crc(fbpoint_page_header_t *header) +{ + pg_crc32c crc; + + INIT_CRC32C(crc); + COMP_CRC32C(crc, (char *) header + FBPOINT_PAGE_HEADER_SIZE, FBPOINT_PAGE_SIZE - FBPOINT_PAGE_HEADER_SIZE); + COMP_CRC32C(crc, (char *) header, offsetof(fbpoint_page_header_t, crc)); + FIN_CRC32C(crc); + + Assert(EQ_CRC32C(crc, header->crc)); +} +/* Return true when the four listed fields of r1 equal the given values (snapshot_pos is not compared). */ +static inline bool +check_fbpoint_record_data(fbpoint_rec_data_t *r1, polar_flog_rec_ptr flog_ptr, + XLogRecPtr redo_lsn, pg_time_t time, uint32
next_clog_subdir_no) +{ + return r1->flog_ptr == flog_ptr && r1->redo_lsn == redo_lsn && r1->time == time && + r1->next_clog_subdir_no == next_clog_subdir_no; +} +/* Byte-wise comparison of the first FBPOINT_RECORD_DATA_SIZE bytes of two flashback point records. */ +static inline bool +fbpoint_record_expected(fbpoint_rec_data_t *r1, fbpoint_rec_data_t *r2) +{ + return memcmp(r1, r2, FBPOINT_RECORD_DATA_SIZE) == 0; +} +/* Read the record area of the segment holding the newest flashback point record from disk, verify the CRC of every page before the one holding that record, and assert the record equals rec_data. */ +static void +check_fbpoint_rec_in_disk(fbpoint_rec_data_t *rec_data) +{ + char path[MAXPGPATH]; + uint32 seg_no; + uint64 next_fbpoint_rec_no = test_point_ctl->next_fbpoint_rec_no; + char buf[FBPOINT_REC_END_POS]; + int fd; + int read_len; + int rc; + uint32 page_no = 0; + fbpoint_page_header_t *header; + uint32 max_page_no; + fbpoint_rec_data_t *rec; + + seg_no = FBPOINT_GET_SEG_NO_BY_REC_NO(next_fbpoint_rec_no - 1); + max_page_no = FBPOINT_GET_PAGE_NO_BY_REC_NO(next_fbpoint_rec_no - 1); + + get_test_fbpoint_file_path(seg_no, path); + fd = BasicOpenFile(path, O_RDWR | PG_BINARY, true); + Assert(fd >= 0); + read_len = polar_pread(fd, buf, FBPOINT_REC_END_POS, 0); + Assert(read_len == FBPOINT_REC_END_POS); + rc = polar_close(fd); + Assert(!rc); + + header = (fbpoint_page_header_t *) buf; + + /* Check the CRC */ + for (; page_no < max_page_no; page_no++) + { + check_fbpoint_page_crc(header); + header = (fbpoint_page_header_t *)(buf + (page_no + 1) * FBPOINT_PAGE_SIZE); /* advance one page per iteration so every page before max_page_no is verified */ + } + + rec = (fbpoint_rec_data_t *) FBPOINT_GET_OFFSET_BY_REC_NO(buf, next_fbpoint_rec_no - 1); + Assert(fbpoint_record_expected(rec, rec_data)); +} +/* Assert that record rec_no is the newest one in the in-memory record buffer, that its page has the expected version and CRC, and that it equals rec_data. */ +static void +check_fbpoint_rec_in_mem(uint64 rec_no, fbpoint_rec_data_t *rec_data) +{ + uint64 page_no; + char *start_ptr; + bool is_expected; + + Assert(test_point_ctl->next_fbpoint_rec_no == rec_no + 1); + page_no = FBPOINT_GET_PAGE_NO_BY_REC_NO(rec_no); + + start_ptr = test_point_ctl->fbpoint_rec_buf + page_no * FBPOINT_PAGE_SIZE; + Assert(((fbpoint_page_header_t *) start_ptr)->version == FBPOINT_REC_VERSION); + check_fbpoint_page_crc((fbpoint_page_header_t *) start_ptr); + + start_ptr = FBPOINT_GET_OFFSET_BY_REC_NO(test_point_ctl->fbpoint_rec_buf, rec_no); +
is_expected = fbpoint_record_expected((fbpoint_rec_data_t *)start_ptr, rec_data); + Assert(is_expected); +} +/* qsort() comparator that orders xids with TransactionIdPrecedes(). */ +static int +test_xid_comparator(const void *arg1, const void *arg2) +{ + TransactionId xid1 = *(const TransactionId *) arg1; + TransactionId xid2 = *(const TransactionId *) arg2; + + if (xid1 == xid2) + return 0; + + if (TransactionIdPrecedes(xid1, xid2)) + return -1; + else + return 1; +} +/* Compare a flashback snapshot with a Snapshot on xmin, xmax, xcnt, lsn and the xip array. Note that the xip array of s1 is sorted in place before the comparison; s2_sorted->xip is compared as is. */ +static bool +is_snapshot_eqaul(flashback_snapshot_t s1, Snapshot s2_sorted) +{ + TransactionId *xip; + TransactionId *xip_sorted; + uint32 xcnt; + + if (s1->xmin != s2_sorted->xmin || s1->xmax != s2_sorted->xmax || s1->xcnt != s2_sorted->xcnt || + s1->lsn != s2_sorted->lsn) + return false; + + xip = (TransactionId *) POLAR_GET_FLSHBAK_SNAPSHOT_XIP(s1); + xcnt = s1->xcnt; + xip_sorted = s2_sorted->xip; + + qsort(xip, xcnt, sizeof(TransactionId), test_xid_comparator); + + return memcmp(xip, xip_sorted, xcnt * sizeof(TransactionId)) == 0; +} +/* pfree() a Snapshot together with its xip and subxip arrays when they are set. */ +static inline void +free_test_snapshot(Snapshot snapshot) +{ + /* Free it */ + if (snapshot->xip) + pfree(snapshot->xip); + + if (snapshot->subxip) + pfree(snapshot->subxip); + + pfree(snapshot); +} +/* Read the snapshot stored at snapshot_pos back from the test FRA and assert it, next_clog_subdir_no and next_xid match snapshot_expected. */ +static void +check_flashback_snapshot(fbpoint_pos_t snapshot_pos, flashback_snapshot_t snapshot_expected) +{ + Snapshot snapshot = NULL; + bool is_snapshot_right; + uint32 next_clog_subdir_no; + uint32 next_xid; + + snapshot = polar_get_flashback_snapshot(test_fra->dir, snapshot_pos, &next_clog_subdir_no, &next_xid); + is_snapshot_right = is_snapshot_eqaul(snapshot_expected, snapshot); + Assert(is_snapshot_right); + Assert(snapshot_expected->next_clog_subdir_no == next_clog_subdir_no); + Assert(snapshot_expected->next_xid == next_xid); + + free_test_snapshot(snapshot); +} + +/* Create a fake flashback snapshot with size / sizeof(TransactionId) + 1 xip slots; the caller pfree()s the returned header. */ +static flashback_snapshot_header_t +create_a_fake_snapshot(uint32 size, XLogRecPtr lsn, uint32 next_clog_subdir_no, TransactionId next_xid) +{ + uint32 xcnt; + flashback_snapshot_header_t header; + flashback_snapshot_t
snapshot; + + xcnt = size / sizeof(TransactionId) + 1; + header = (flashback_snapshot_header_t) palloc(FLSHBAK_GET_SNAPSHOT_TOTAL_SIZE(xcnt)); + header->data_size = FLSHBAK_GET_SNAPSHOT_DATA_SIZE(xcnt); + SET_FLSHBAK_SNAPSHOT_VERSION(header->info); + + snapshot = FLSHBAK_GET_SNAPSHOT_DATA(header); + snapshot->lsn = lsn; + snapshot->xcnt = xcnt; + snapshot->next_clog_subdir_no = next_clog_subdir_no; + snapshot->next_xid = next_xid; + + /* Xip is fake: palloc() does not zero it and nothing is stored into the array here */ + return header; +} +/* Back up a fake snapshot of roughly size bytes to the test FRA, read it back for verification, and return the position reported by polar_backup_snapshot_to_fra(). */ +static fbpoint_pos_t +test_back_snapshot_to_fra(fbpoint_pos_t *snapshot_end_pos, uint32 size, uint32 next_clog_subdir_no, TransactionId next_xid) +{ + XLogRecPtr lsn; + fbpoint_pos_t snapshot_pos; + flashback_snapshot_header_t header; + flashback_snapshot_t snapshot; + + lsn = GetXLogInsertRecPtr(); + header = create_a_fake_snapshot(size, lsn, next_clog_subdir_no, next_xid); + snapshot_pos = polar_backup_snapshot_to_fra(header, snapshot_end_pos, test_fra->dir); + + /* Check the snapshot info */ + snapshot = FLSHBAK_GET_SNAPSHOT_DATA(header); + check_flashback_snapshot(snapshot_pos, snapshot); + + pfree(header); + return snapshot_pos; +} +/* Flush FBPOINT_REC_PER_SEG flashback point records, each preceded by a fake snapshot backup, checking every record in memory and on disk. */ +static void +test_flush_fbpoint_rec(void) +{ + int i; + uint32 next_clog_subdir_no; + uint64 rec_no_expected; + fbpoint_rec_data_t rec_data; + fbpoint_rec_data_t rec_data_expected; + uint64 fake_info; + fbpoint_pos_t snapshot_pos_expected; + uint32 snapshot_size; + bool snapshot_over_seg = false; + fbpoint_pos_t snapshot_end_pos = test_fra->snapshot_end_pos; + + /* Size each snapshot so that at least one of them is split across two segments.
*/ + snapshot_size = (FBPOINT_SEG_SIZE - FBPOINT_REC_END_POS) / FBPOINT_REC_PER_SEG; + next_clog_subdir_no = pg_atomic_read_u32(&test_clog_ctl->next_clog_subdir_no); + + /* Flush three fbpoint record files (NOTE(review): the loop below runs FBPOINT_REC_PER_SEG times - confirm against the caller) */ + for (i = 0; i < FBPOINT_REC_PER_SEG; i++) + { + rec_no_expected = test_point_ctl->next_fbpoint_rec_no; + fake_info = rec_no_expected + MIN_RECORD_TIME; + + /* Test back snapshot to fra */ + snapshot_pos_expected = test_back_snapshot_to_fra(&snapshot_end_pos, snapshot_size, next_clog_subdir_no, (TransactionId) fake_info); + /* Remember when a snapshot ended in a segment other than the record's own segment. */ + if (snapshot_end_pos.seg_no != rec_no_expected / FBPOINT_REC_PER_SEG) + snapshot_over_seg = true; + + /* It is a lucky no, skip it: no record gets LUCKY_RECORD_NO as its flog_ptr/redo_lsn/time */ + if (fake_info == LUCKY_RECORD_NO) + fake_info++; + + rec_data.flog_ptr = fake_info; + rec_data.redo_lsn = fake_info; + rec_data.time = fake_info; + rec_data.next_clog_subdir_no = next_clog_subdir_no; + rec_data.snapshot_pos = snapshot_pos_expected; + rec_data_expected = rec_data; + polar_flush_fbpoint_rec(test_point_ctl, test_fra->dir, &rec_data); + Assert(test_point_ctl->next_fbpoint_rec_no == rec_no_expected + 1); + check_fbpoint_rec_in_mem(rec_no_expected, &rec_data_expected); + /* Check the flashback point record file in disk */ + check_fbpoint_rec_in_disk(&rec_data_expected); + } + + /* There must be a snapshot not in the same segment file */ + Assert(snapshot_over_seg); + + /* Keep test_fra in step with what was flushed through test_point_ctl directly */ + test_fra->next_fbpoint_rec_no = test_point_ctl->next_fbpoint_rec_no; + test_fra->snapshot_end_pos = snapshot_end_pos; +} +/* Look up the flashback point for info and assert the returned record carries expected_info in flog_ptr, redo_lsn and time, the given next_clog_subdir_no, and the expected segment number. */ +static void +test_get_right_fbpoint(uint64 info, uint64 expected_info, uint32 next_clog_subdir_no) +{ + fbpoint_rec_data_t record; + uint32 seg_no_expected = 0; + uint32 seg_no = 0; + bool is_expected; + + Assert(expected_info >= MIN_RECORD_TIME && expected_info != LUCKY_RECORD_NO); + polar_get_right_fbpoint(test_point_ctl, test_fra->dir, info, &record, &seg_no); + is_expected = check_fbpoint_record_data(&record, expected_info, expected_info, expected_info, +
next_clog_subdir_no); + Assert(is_expected); + /* Values at or above LUCKY_RECORD_NO are looked up one record number lower than values below it. */ + if (expected_info < LUCKY_RECORD_NO) + seg_no_expected = FBPOINT_GET_SEG_NO_BY_REC_NO(expected_info - MIN_RECORD_TIME); + else + seg_no_expected = FBPOINT_GET_SEG_NO_BY_REC_NO(expected_info - MIN_RECORD_TIME - 1); + + Assert(seg_no == seg_no_expected); +} +/* Run test_get_right_fbpoint() for four lookups: before the oldest record, after the newest record, LUCKY_RECORD_NO, and next_rec_no. */ +static void +test_get_right_fbpoints(void) +{ + uint64 next_rec_no; + + next_rec_no = test_point_ctl->next_fbpoint_rec_no; + + /* Now the times of the records are [MIN_RECORD_TIME, test_point_ctl->next_fbpoint_rec_no + MIN_RECORD_TIME] */ + + /* the time of every record is newer than the keep time, so the oldest record is returned */ + test_get_right_fbpoint(1, MIN_RECORD_TIME, 1); + + /* the time of every record is older than the keep time, so the newest record is returned */ + test_get_right_fbpoint(next_rec_no + MIN_RECORD_TIME, next_rec_no + MIN_RECORD_TIME - 1, 3); + + /* the right record is one whose time is older than the keep time */ + test_get_right_fbpoint(LUCKY_RECORD_NO, LUCKY_RECORD_NO - 1, FBPOINT_GET_SEG_NO_BY_REC_NO(LUCKY_RECORD_NO - MIN_RECORD_TIME - 2) + 1); + + /* the right record is one whose time equals the keep time */ + test_get_right_fbpoint(next_rec_no, next_rec_no, FBPOINT_GET_SEG_NO_BY_REC_NO(next_rec_no - MIN_RECORD_TIME - 1) + 1); +} +/* Assert that polar_flashback_get_xid_status() returns status_expected for xid in the test FRA. */ +static void +test_get_xid_status_from_fra(TransactionId xid, TransactionId max_xid, uint32 next_clog_subdir_no, XidStatus status_expected) +{ + XidStatus status; + + status = polar_flashback_get_xid_status(xid, max_xid, next_clog_subdir_no, test_fra->dir); + Assert(status == status_expected); +} +/* Assert that dir_path contains no entry other than "." and "..". */ +static void +check_dir_empty(char *dir_path) +{ + DIR *dir; + struct dirent *de; + bool result = true; + + dir = polar_allocate_dir(dir_path); + + while ((de = ReadDir(dir, dir_path)) != NULL) + { + if (!strcmp(de->d_name, ".") || !(strcmp(de->d_name, ".."))) + continue; + else + { + result = false; + break; + } + } + + FreeDir(dir); + + Assert(result); +} +/* Assert that, among the entries of dir_path named only with characters 0-9 and A-F, none parses to a number below value and one parses to exactly value. */ +static void +check_files_removed_in_dir(char *dir_path, uint64 value)
+{ + DIR *dir; + struct dirent *de; + bool result = true; + bool found_keep = false; + + dir = polar_allocate_dir(dir_path); + + while ((de = ReadDir(dir, dir_path)) != NULL) + { + uint64 fname; + int nfields; + size_t len; + + len = strlen(de->d_name); + /* Only entries named entirely with upper-case hex digits are considered. */ + if (strspn(de->d_name, "0123456789ABCDEF") == len) + { + nfields = sscanf(de->d_name, "%" INT64_MODIFIER "X", &fname); /* the names are accepted as hex above, so parse them as hex with a specifier that matches uint64 */ + + Assert(nfields == 1); + + if (fname < value) + { + result = false; + break; + } + else if (fname == value) + found_keep = true; + } + } + + FreeDir(dir); + + Assert(result && found_keep); +} +/* Assert that fbpoint segments below fbpoint_keep_seg_no were removed, and that the clog directory is empty (keep_clog_subdir_no == 0) or holds nothing below keep_clog_subdir_no. keep_time is not used. */ +static void +check_fra_file_removed(uint32 fbpoint_keep_seg_no, pg_time_t keep_time, uint32 keep_clog_subdir_no) +{ + char fbpoint_dir_path[MAXPGPATH]; + char clog_dir_path[MAXPGPATH]; + + FRA_GET_SUBDIR_PATH(test_fra->dir, FBPOINT_DIR, fbpoint_dir_path); + check_files_removed_in_dir(fbpoint_dir_path, fbpoint_keep_seg_no); + + FRA_GET_SUBDIR_PATH(test_fra->dir, FLASHBACK_CLOG_DIR, clog_dir_path); + + if (keep_clog_subdir_no == 0) + check_dir_empty(clog_dir_path); + else + check_files_removed_in_dir(clog_dir_path, keep_clog_subdir_no); +} +/* Compute where a snapshot of snapshot->xcnt xids is placed when written downwards from *pos: *pos is overwritten with the computed position in the first segment used and the position reached after the last chunk is returned. */ +static fbpoint_pos_t +compute_snapshot_end_pos(flashback_snapshot_t snapshot, fbpoint_pos_t *pos) +{ + uint32 seg_no; + uint32 offset; + Size data_size; + Size write_size; + fbpoint_pos_t end_pos; + + data_size = FLSHBAK_GET_SNAPSHOT_TOTAL_SIZE(snapshot->xcnt); + seg_no = pos->seg_no; + offset = pos->offset; + /* Not even room for a snapshot header in this segment: start at the end of the next one. */ + if (offset - FBPOINT_REC_END_POS < FLSHBAK_SNAPSHOT_HEADER_SIZE) + { + seg_no++; + offset = FBPOINT_SEG_SIZE; + } + + if (data_size < offset - FBPOINT_REC_END_POS) + SET_FBPOINT_POS(*pos, seg_no, offset - data_size); + else + SET_FBPOINT_POS(*pos, seg_no, FBPOINT_REC_END_POS); + /* Consume the space above FBPOINT_REC_END_POS segment by segment until all data is placed. */ + do + { + write_size = Min(data_size, offset - FBPOINT_REC_END_POS); + end_pos.seg_no = seg_no; + end_pos.offset = offset - write_size; + data_size -= write_size; + offset = FBPOINT_SEG_SIZE; + seg_no++; + } + while (data_size > 0); + + Assert(data_size == 0); + + return end_pos; +} + +static void
+test_fra_fbpoint(XLogRecPtr fbpoint_lsn, pg_time_t fbpoint_time, XLogRecPtr prior_fbpoint_lsn) +{ + fbpoint_wal_info_data_t fake_wal_info; + flashback_snapshot_header_t header; + flashback_snapshot_t snapshot; + XLogRecPtr lsn; + polar_flog_rec_ptr flog_ptr; + polar_flog_rec_ptr start_ptr; + fbpoint_rec_data_t rec_data_expected; + uint64 rec_no_expected; + int min_clog_seg_no; + uint32 next_clog_subdir_no; + fbpoint_pos_t snapshot_pos; + fbpoint_pos_t snapshot_end_pos; + /* Remember the control state as it is before the flashback point is taken. */ + rec_no_expected = test_point_ctl->next_fbpoint_rec_no; + min_clog_seg_no = test_clog_ctl->min_clog_seg_no; + next_clog_subdir_no = pg_atomic_read_u32(&test_clog_ctl->next_clog_subdir_no); + snapshot_pos = snapshot_end_pos = test_fra->snapshot_end_pos; + /* Fake WAL info built from the arguments. */ + fake_wal_info.fbpoint_lsn = fbpoint_lsn; + fake_wal_info.fbpoint_time = fbpoint_time; + fake_wal_info.prior_fbpoint_lsn = prior_fbpoint_lsn; + + start_ptr = flog_ptr = polar_get_flog_write_result(flog_instance->buf_ctl); + + get_fbpoint_rec_expected(&rec_data_expected, &fake_wal_info, flog_ptr); + /* Build the snapshot data at the current WAL insert position. */ + lsn = GetXLogInsertRecPtr(); + header = polar_get_flashback_snapshot_data(test_fra, lsn); + snapshot = FLSHBAK_GET_SNAPSHOT_DATA(header); + Assert(snapshot->lsn == lsn); + /* Take the flashback point, then check the values it left in flog_ptr and in the control structures. */ + polar_fra_do_fbpoint(test_fra, &fake_wal_info, &flog_ptr, header); + Assert(flog_ptr == Min(start_ptr, rec_no_expected + MIN_RECORD_TIME - 1)); + Assert(test_fra->next_fbpoint_rec_no == rec_no_expected + 1); + Assert(test_point_ctl->next_fbpoint_rec_no == rec_no_expected + 1); + Assert(test_fra->min_keep_lsn == rec_no_expected + MIN_RECORD_TIME - 1); + /* Recompute the snapshot position independently and compare it with what was recorded. */ + snapshot_end_pos = compute_snapshot_end_pos(snapshot, &snapshot_pos); + SET_FBPOINT_POS(rec_data_expected.snapshot_pos, snapshot_pos.seg_no, snapshot_pos.offset); + check_fbpoint_rec_in_mem(rec_no_expected, &rec_data_expected); + Assert(FBPOINT_POS_EQUAL(test_fra->snapshot_end_pos, snapshot_end_pos)); + + /* Check the new fbpoint record */ + check_fbpoint_rec_in_disk(&rec_data_expected); + + /* Check the flashback snapshot */ +
check_flashback_snapshot(snapshot_pos, snapshot); + + check_fra_ctl_file(rec_no_expected + 1, rec_no_expected + MIN_RECORD_TIME - 1, min_clog_seg_no, next_clog_subdir_no); + /* + * The time is larger than the one we flushed in test_flush_fbpoint_rec, so + * the kept flashback point record is: + * rec_data.flog_ptr = rec_no_expected + MIN_RECORD_TIME - 1; + * rec_data.redo_lsn = rec_no_expected + MIN_RECORD_TIME - 1; + * rec_data.time = rec_no_expected + MIN_RECORD_TIME - 1; + * rec_data.next_clog_subdir_no = 3; + * rec_data.next_xid = rec_no_expected + MIN_RECORD_TIME - 1; + * It is in the fbpoint segment 2, so we remove segment 0 and 1. + */ + check_fra_file_removed(2, rec_no_expected + MIN_RECORD_TIME - 1, 2); +} + +static void +test_flashback_snapshot_insert_xid(flshbak_rel_snpsht_t rsnapshot, TransactionId xid) +{ + uint32 xcnt; + uint32 xip_size = rsnapshot->xip_size; + TransactionId xmax = rsnapshot->snapshot->xmax; + TransactionId xmin = rsnapshot->snapshot->xmin; + uint32 removed_size = rsnapshot->removed_size; + uint32 insert_xcnt; + TransactionId *xip; + int i; + + if (rsnapshot->snapshot->takenDuringRecovery) + xcnt = rsnapshot->snapshot->subxcnt; + else + xcnt = rsnapshot->snapshot->xcnt; + + Assert(!TransactionIdPrecedes(xid, rsnapshot->snapshot->xmax)); + polar_update_flashback_snapshot(rsnapshot, xid); + + insert_xcnt = (int32)(xid - xmax); + + if (xid < xmax) + insert_xcnt -= FirstNormalTransactionId; + + if (insert_xcnt == 0) + return; + + if (rsnapshot->snapshot->takenDuringRecovery) + { + Assert(rsnapshot->snapshot->subxcnt == (xcnt + insert_xcnt)); + xip = rsnapshot->snapshot->subxip; + } + else + { + Assert(rsnapshot->snapshot->xcnt == (xcnt + insert_xcnt)); + xip = rsnapshot->snapshot->xip; + } + + for (i = 0; i < insert_xcnt; i++) + { + Assert(xip[xcnt + i] == xmax); + TransactionIdAdvance(xmax); + } + + TransactionIdAdvance(xid); + Assert(rsnapshot->snapshot->xmax == xid); + Assert(rsnapshot->snapshot->xmin == xmin); + Assert(rsnapshot->removed_size 
== removed_size); + /* Either not enlarged, or enlarged to (xcnt + insert_xcnt + removed_size) plus ENLARGE_XIP_SIZE_ONCE */ + Assert(rsnapshot->xip_size == xip_size || rsnapshot->xip_size == (xcnt + insert_xcnt + rsnapshot->removed_size + ENLARGE_XIP_SIZE_ONCE)); +} + +static void +test_flashback_snapshot_remove_xid(flshbak_rel_snpsht_t rsnapshot, TransactionId xid, bool is_desc) +{ + uint32 xcnt; + uint32 xip_size = rsnapshot->xip_size; + TransactionId xmax = rsnapshot->snapshot->xmax; + TransactionId xmin = rsnapshot->snapshot->xmin; + uint32 removed_size = rsnapshot->removed_size; + + if (rsnapshot->snapshot->takenDuringRecovery) + xcnt = rsnapshot->snapshot->subxcnt; + else + xcnt = rsnapshot->snapshot->xcnt; + + Assert(TransactionIdPrecedes(xid, rsnapshot->snapshot->xmax)); + polar_update_flashback_snapshot(rsnapshot, xid); + + if (rsnapshot->snapshot->takenDuringRecovery) + Assert(rsnapshot->snapshot->subxcnt == (xcnt - 1)); + else + Assert(rsnapshot->snapshot->xcnt == (xcnt - 1)); + + Assert(rsnapshot->snapshot->xmax == xmax); + + if (is_desc) + { + if (rsnapshot->removed_size == 1) + Assert(rsnapshot->removed_xid_pos[0] == xcnt - 1); + else + { + Assert(rsnapshot->removed_size == removed_size + 1); + Assert(rsnapshot->removed_xid_pos[0] == xcnt - 1); + } + + Assert(rsnapshot->snapshot->xmin == xmin); + } + + Assert(rsnapshot->xip_size == xip_size); +} + +static void +test_flashback_snapshot(bool is_standby) +{ +#define TEST_XID 300 + XLogRecord record; + SnapshotData snapshot; + uint32 removed_xid_pos[256]; + flashback_rel_snapshot_t rsnapshot; + XLogReaderState xlogreader; + TimestampTz time = 0; + int i; + uint32 xcnt; + TransactionId *xip = NULL; + TransactionId next_xid; + + /* Construct a fake record which will update the next_xid. NOTE(review): only xl_rmid and xl_xid are set and xlogreader is left uninitialized; the redo call runs only where Assert() is compiled in. */ + record.xl_rmid = RM_XLOG_ID; + record.xl_xid = TEST_XID; + rsnapshot.next_clog_subdir_no = 0; + rsnapshot.next_xid = TEST_XID; + Assert(!polar_flashback_xact_redo(&record, &rsnapshot, time, &xlogreader)); + Assert(rsnapshot.next_clog_subdir_no == 0 && rsnapshot.next_xid == TEST_XID + 1); + + /* 
Construct a fake record which will update the clog_subdir_no and next_xid */ + rsnapshot.next_xid = FirstNormalTransactionId; + record.xl_rmid = RM_XLOG_ID; + record.xl_xid = FirstNormalTransactionId; + Assert(!polar_flashback_xact_redo(&record, &rsnapshot, time, &xlogreader)); + Assert(rsnapshot.next_clog_subdir_no == 1 && rsnapshot.next_xid == FirstNormalTransactionId + 1); + + /* Test inserting xids into xip, starting from an empty snapshot */ + snapshot.xcnt = 0; + snapshot.subxcnt = 0; + snapshot.xmax = snapshot.xmin = MaxTransactionId; + snapshot.xip = NULL; + snapshot.subxip = NULL; + rsnapshot.removed_size = 0; + rsnapshot.removed_xid_pos = removed_xid_pos; + rsnapshot.snapshot = &snapshot; + rsnapshot.xip_size = 0; + + if (is_standby) + snapshot.takenDuringRecovery = true; + else + snapshot.takenDuringRecovery = false; + + /* Insert the xid and make xip full */ + next_xid = TEST_XID; + test_flashback_snapshot_insert_xid(&rsnapshot, next_xid); + /* Now xip is {MaxTransactionId, 3, 4, 5, 6, 7...299} */ + + /* Test removing xids from xip, in descending order from 299 down to 100 */ + for (i = 100; i < TEST_XID; i++) + { + next_xid--; + test_flashback_snapshot_remove_xid(&rsnapshot, next_xid, true); + } + + /* Now xip is {MaxTransactionId, 3, 4, 5, 6, 7...99} */ + + /* Remove the current xmin (MaxTransactionId) so that xmin gets updated */ + next_xid = MaxTransactionId; + test_flashback_snapshot_remove_xid(&rsnapshot, next_xid, false); + Assert(snapshot.xmin == FirstNormalTransactionId); + + /* Test compacting the xip */ + if (is_standby) + { + xcnt = snapshot.subxcnt; + xip = snapshot.subxip; + } + else + { + xcnt = snapshot.xcnt; + xip = snapshot.xip; + } + + polar_compact_xip(&rsnapshot); + Assert(rsnapshot.removed_size == 0); + next_xid = FirstNormalTransactionId; + + for (i = 0; i < xcnt; i++) + { + Assert(xip[i] == next_xid); + TransactionIdAdvance(next_xid); + } + + pfree(xip); +} + +static void +test_startup_from_ctl_file(void) +{ + int min_clog_seg_no; + uint32 next_clog_subdir_no; + uint64 next_fbpoint_rec_no; + XLogRecPtr min_keep_lsn; + fbpoint_pos_t snapshot_end_pos; + + 
min_clog_seg_no = test_clog_ctl->min_clog_seg_no; + next_clog_subdir_no = pg_atomic_read_u32(&test_clog_ctl->next_clog_subdir_no); + next_fbpoint_rec_no = test_point_ctl->next_fbpoint_rec_no; + min_keep_lsn = test_fra->min_keep_lsn; + snapshot_end_pos = test_fra->snapshot_end_pos; + + MemSet(test_clog_ctl, 0, sizeof(flashback_clog_ctl_data_t)); + test_fra->min_keep_lsn = 0; + test_fra->next_fbpoint_rec_no = 0; + SET_FBPOINT_POS(test_fra->snapshot_end_pos, 0, 0); + test_point_ctl->next_fbpoint_rec_no = 0; + + test_fra_init(); + test_fra_startup(min_clog_seg_no, next_clog_subdir_no, next_fbpoint_rec_no, min_keep_lsn, snapshot_end_pos); +} + +void +_PG_init(void) +{ + if (!process_shared_preload_libraries_in_progress) + return; + + RequestAddinShmemSpace(polar_fra_shmem_size()); + prev_shmem_startup_hook = shmem_startup_hook; + shmem_startup_hook = test_fra_shmem_startup; +} + +void +_PG_fini(void) +{ + shmem_startup_hook = prev_shmem_startup_hook; +} + +PG_FUNCTION_INFO_V1(test_fast_recovery_area); +/* + * SQL-callable entry point to perform all tests. 
+ */ +Datum +test_fast_recovery_area(PG_FUNCTION_ARGS) +{ + fbpoint_pos_t pos; + + if (!polar_enable_fra(fra_instance)) + PG_RETURN_VOID(); + + /* Test fra init */ + test_fra_init(); + + /* Test fra startup with zeroed counters and snapshot end position (0, FBPOINT_SEG_SIZE) */ + SET_FBPOINT_POS(pos, 0, FBPOINT_SEG_SIZE); + test_fra_startup(0, 0, 0, GetRedoRecPtr(), pos); + + /* Test back clog to fra */ + test_clog_to_fra(TEST_CLOG_SEGNO, 0, TRANSACTION_STATUS_COMMITTED); + + /* Test flush fbpoint record in segment 0 with next_clog_subdir_no 1 */ + test_flush_fbpoint_rec(); + + /* Update the min clog segment no and test moving clog to fra */ + test_fra->clog_ctl->min_clog_seg_no = TEST_CLOG_SEGNO; + test_clog_to_fra(TEST_CLOG_SEGNO - 1, 1, TRANSACTION_STATUS_ABORTED); + + /* Test flush fbpoint record in segment 1 with next_clog_subdir_no 2 */ + test_flush_fbpoint_rec(); + + /* Test back clog to fra with a new subdir */ + test_clog_to_fra(TEST_CLOG_SEGNO - 1, 2, TRANSACTION_STATUS_ABORTED); + + /* Test getting xid status from fra clog */ + test_get_xid_status_from_fra((uint32) XID_PER_CLOG_SEG * TEST_CLOG_SEGNO, + ((uint32) XID_PER_CLOG_SEG * TEST_CLOG_SEGNO + 1), 1, TRANSACTION_STATUS_COMMITTED); + test_get_xid_status_from_fra((uint32) XID_PER_CLOG_SEG * (TEST_CLOG_SEGNO - 1), + (uint32) XID_PER_CLOG_SEG * (TEST_CLOG_SEGNO - 1) - 1, 3, TRANSACTION_STATUS_ABORTED); + + /* Test flush fbpoint record in segment 2 with next_clog_subdir_no 3 */ + test_flush_fbpoint_rec(); + + /* Test the cases of getting the right fbpoint */ + test_get_right_fbpoints(); + + /* Test fra flashback point */ + test_fra_fbpoint(GetRedoRecPtr(), (pg_time_t) time(NULL), GetRedoRecPtr() - 1); + + /* Test flashback snapshot (non-standby path only: is_standby = false) */ + test_flashback_snapshot(false); + + /* Test fra startup again after resetting the in-memory control state */ + test_startup_from_ctl_file(); + + PG_RETURN_VOID(); +} diff --git a/src/test/modules/test_flashback_table/test_flashback_table.conf b/src/test/modules/test_flashback_table/test_flashback_table.conf new file mode 100644 index 00000000000..9bc20d0ad7d --- /dev/null +++ 
b/src/test/modules/test_flashback_table/test_flashback_table.conf @@ -0,0 +1,28 @@ +#------------------------------------------------------------------------------ +# FLASHBACK LOG OPTIONS +#------------------------------------------------------------------------------ +polar_enable_flashback_log = on +polar_enable_lazy_checkpoint = on +polar_flashback_log_keep_segments = 8 +polar_flashback_log_buffers = 10 +polar_flashback_log_insert_locks = 8 +polar_flashback_logindex_queue_buffers = 1 +polar_flashback_log_bgwrite_delay = 100 +polar_flashback_log_flush_max_size = 50 +polar_flashback_log_debug = off +full_page_writes = off +polar_internal_shared_preload_libraries='polar_vfs,polar_worker,polar_resource_group,polar_monitor_preload' +#------------------------------------------------------------------------------ +# FAST RECOVERY AREA OPTIONS +#------------------------------------------------------------------------------ +polar_flashback_point_timeout = 300 +polar_enable_fast_recovery_area = on +polar_fast_recovery_area_rotation = 180 +shared_preload_libraries = 'test_flashback_table' +#------------------------------------------------------------------------------ +# OTHER OPTIONS +#------------------------------------------------------------------------------ +checkpoint_timeout = 5 +client_min_messages = error +polar_bulk_extend_size = 5 +max_prepared_transactions = 5 diff --git a/src/test/modules/test_flashback_table/test_flashback_table.control b/src/test/modules/test_flashback_table/test_flashback_table.control new file mode 100644 index 00000000000..c529234bbe8 --- /dev/null +++ b/src/test/modules/test_flashback_table/test_flashback_table.control @@ -0,0 +1,4 @@ +comment = 'Test code for flashback table feature' +default_version = '1.0' +module_pathname = '$libdir/test_flashback_table' +relocatable = true diff --git a/src/test/perl/PolarRegression.pm b/src/test/perl/PolarRegression.pm index b1035396782..903027f7e73 100644 --- a/src/test/perl/PolarRegression.pm 
+++ b/src/test/perl/PolarRegression.pm @@ -8,7 +8,8 @@ use PostgresNode; use TestLib (); use IO::Pipe; -use Time::HiRes qw( gettimeofday tv_interval ); +use Time::HiRes qw( gettimeofday tv_interval time ); +use POSIX qw(strftime); our @EXPORT = qw( create_new_test @@ -33,11 +34,15 @@ sub new # randome pg_control command could be registered by register_random_pg_control. _random_pg_control => '', _mode => 'default', + _enable_random_flashback => 0, }; $node_master->safe_psql('postgres', 'CREATE DATABASE '.$regress_db, timeout=>600); $node_master->safe_psql($regress_db, 'CREATE extension polar_monitor', timeout=>10); $node_master->safe_psql($regress_db, 'CREATE TABLE replayed(val integer unique);'); + # Create a function to copy random rel for flashback test + $node_master->safe_psql($regress_db, "CREATE OR REPLACE FUNCTION fb_copy_a_random_rel() RETURNS TEXT LANGUAGE plpgsql AS \$\$ DECLARE random_rel TEXT; BEGIN SELECT (n.nspname)::text || '.' || (c.relname)::text into random_rel FROM pg_catalog.pg_class c LEFT JOIN pg_catalog.pg_namespace n ON n.oid = c.relnamespace WHERE c.relkind IN ('r') AND n.nspname <> 'pg_catalog' AND n.nspname <> 'information_schema' AND n.nspname !~ '^pg_toast' AND relispartition = 'f' AND relpersistence = 'p' AND reltoastrelid = 0 AND relhassubclass = 'f' AND pg_catalog.pg_table_is_visible(c.oid) order by random() limit 1; execute 'drop table if exists ' || random_rel || '_fbnew'; execute 'create table ' || random_rel || '_fbnew as select * from ' || random_rel; RETURN random_rel; END \$\$;"); + bless $self, $class; return $self; @@ -86,6 +91,86 @@ sub register_random_pg_control } } +sub enable_random_flashback +{ + my ($self) = @_; + $self->{_enable_random_flashback} = 1; +} + +sub fb_copy_a_random_rel +{ + my ($self) = @_; + + my $node_master = $self->node_master; + my $random_rel = $node_master->safe_psql($self->regress_db, "select * from fb_copy_a_random_rel()"); + + return $random_rel; +} + +sub get_rel_oid +{ + my ($self, $rel) = 
@_; + + my $relid = 0; + my $node_master = $self->node_master; + $relid = $node_master->safe_psql($self->regress_db, "select '$rel'::regclass::oid"); + return $relid; +} + +sub fb_rel_and_check +{ + my ($self, $old_rel, $old_rel_oid, $flashback_time) = @_; + + my $new_rel = $old_rel."_fbnew"; + my $flashback_rel = ''; + my $rel_exist = 0; + my $failed = 0; + + $rel_exist = $self->node_master->safe_psql($self->regress_db, "select count(1) from pg_class where oid = $old_rel_oid"); + + if ($rel_exist) + { + $self->node_master->safe_psql($self->regress_db, + "flashback table $old_rel to timestamp '$flashback_time'" + ); + + $flashback_rel = 'polar_flashback_'.$old_rel_oid; + + # Just count(1) to be quick. + + my $count = $self->node_master->safe_psql($self->regress_db, + "select count(1) from $new_rel" + ); + + my $right_count = $self->node_master->safe_psql($self->regress_db, + "select count(1) from $flashback_rel" + ); + + if ($right_count != $count) + { + print "flashback: The count in the ".$new_rel." is:".$count.", but is:".$right_count." 
in the ".$flashback_rel."\n"; + $failed = 1; + } + } + + return $failed; +} + +sub fb_drop_copy_rel +{ + my ($self, $old_rel, $old_rel_oid) = @_; + + my $new_rel = $old_rel."_fbnew"; + my $flashback_rel = 'polar_flashback_'.$old_rel_oid; + + $self->node_master->safe_psql($self->regress_db, + "drop table IF EXISTS $new_rel"); + + $self->node_master->safe_psql($self->regress_db, + "drop table IF EXISTS $flashback_rel"); +} + + sub get_random_pg_control { my ($self) = @_; @@ -1003,6 +1088,13 @@ sub test_one_case my $failed = 0; $ENV{'PG_TEST_NOCLEAN'} = 'Do not clean data directory after error during test.'; my $table = "replayed_$test_case\_$$"; + + # State for the random flashback check + my $random_rel = ''; + my $flashback_time = ''; + my $flashback_epoc = 0; + my $random_rel_oid = 0; + my $flashback_t; $node_master->safe_psql($regress_db, "CREATE TABLE $table(val integer unique);"); $self->replay_check_all($node_replica_ref, $table); @@ -1060,6 +1152,40 @@ sub test_one_case { last; } + + # Get a random relation and copy + if ($self->{_enable_random_flashback} and $random_rel eq '' and $line_num > 10 and $line_num > int(rand(20))) + { + $random_rel = $self->fb_copy_a_random_rel; + print "flashback: The random relation is ".$random_rel."\n"; + $random_rel_oid = $self->get_rel_oid($random_rel); + print "flashback: The random relation oid is ".$random_rel_oid."\n"; + + $flashback_t = time; + $flashback_time = strftime "%Y-%m-%d %H:%M:%S", gmtime $flashback_t; + $flashback_time .= sprintf ".%05d", ($flashback_t-int($flashback_t))*100000; + print "flashback: The flashback time is ".$flashback_time."\n"; + $flashback_epoc = time(); + } + + # Flashback the table and check it, if it still exists + if ($random_rel ne '' and time() - $flashback_epoc >= 3) + { + $failed = $self->fb_rel_and_check($random_rel, $random_rel_oid, $flashback_time); + print "flashback: The flashback table ".$random_rel." are ".$failed." 
failed\n"; + + if ($failed == 1) + { + $random_rel = ''; # failed: skip the cleanup after the loop, the tables are kept + last; + } + else + { + $self->fb_drop_copy_rel($random_rel, $random_rel_oid); + $random_rel = ''; # clear only after the drop, which needs the relation name + } + } + if ($polar_prev_tag ne 'EQUAL' and $polar_tag eq 'EQUAL' or $polar_prev_tag ne 'REPLICA_IGNORE' and $polar_tag eq 'REPLICA_IGNORE') { @@ -1090,6 +1216,12 @@ sub test_one_case { $node_replica->psql_close($node_replica->get_psql()); } + + # Clean up the copied rel if it was never flashed back because the test ended within 3 seconds + if ($random_rel ne '') + { + $self->fb_drop_copy_rel($random_rel, $random_rel_oid); + } close $fh; close $regress_out; diff --git a/src/test/polar_consistency/t/009_flashback_random_table.pl b/src/test/polar_consistency/t/009_flashback_random_table.pl new file mode 100644 index 00000000000..2211c996cad --- /dev/null +++ b/src/test/polar_consistency/t/009_flashback_random_table.pl @@ -0,0 +1,62 @@ +# Test flashback table +# 1. Enable logindex in master and replica +# 2. Read transaction from SQL file +# 3. Execute serial_schedule on master and replica. +# 4. If it's readonly transaction, then compare the stdout and stderr from master and replica +# 5. If it's write transaction then check stderr from replica is not empty +# 6. 
select a random table, copy it and wait 3 seconds to flashback the random table and check +# +use strict; +use warnings; +use PostgresNode; +use PolarRegression; +use Test::More tests=>1; + +my $sql_dir = $ENV{PWD}.'/sql'; +my $regress_db = 'polar_regression'; + +my $node_master = get_new_node('master'); +$node_master->polar_init(1, 'polar_master_logindex'); + +my $node_replica1 = get_new_node('replica1'); +$node_replica1->polar_init(0, 'polar_repli_logindex'); +$node_replica1->polar_set_recovery($node_master); + +$node_master->append_conf('postgresql.conf', + "synchronous_standby_names='".$node_replica1->name."'"); + +$node_master->append_conf('postgresql.conf', 'wal_sender_timeout=3600s'); +$node_master->append_conf('postgresql.conf', 'polar_enable_flashback_log=on'); +$node_master->append_conf('postgresql.conf', 'polar_enable_fast_recovery_area=on'); + +$node_replica1->append_conf('postgresql.conf', 'wal_receiver_timeout=3600s'); +$node_replica1->append_conf('postgresql.conf', 'polar_enable_flashback_log=on'); +$node_replica1->append_conf('postgresql.conf', 'polar_enable_fast_recovery_area=on'); + +$node_master->start; + +$node_master->polar_create_slot($node_replica1->name); +$node_replica1->start; + +my @replicas = ($node_replica1); +my $regress = PolarRegression->create_new_test($node_master); +$regress->enable_random_flashback(); + +my $start_time = time(); +my $failed = $regress->test('serial_schedule', $sql_dir, \@replicas); + +ok($failed == 0, "Polar regression test"); + +if ($failed) +{ + if (-e `printf polar_dump_core`) + { + print `ps -ef | grep postgres: | xargs -n 1 -P 0 gcore`; + } + sleep(864000); +} +my $cost_time = time() - $start_time; + +note "Test cost time $cost_time"; +my @nodes = ($node_master, $node_replica1); +$regress->shutdown_all_nodes(\@nodes); \ No newline at end of file diff --git a/src/test/polar_flog_repair_partial/.gitignore b/src/test/polar_flog_repair_partial/.gitignore deleted file mode 100644 index 871e943d50e..00000000000 
--- a/src/test/polar_flog_repair_partial/.gitignore +++ /dev/null @@ -1,2 +0,0 @@ -# Generated by test suite -/tmp_check/ diff --git a/src/test/polar_flog_repair_partial/Makefile b/src/test/polar_flog_repair_partial/Makefile deleted file mode 100644 index 8acc58039e4..00000000000 --- a/src/test/polar_flog_repair_partial/Makefile +++ /dev/null @@ -1,25 +0,0 @@ -#------------------------------------------------------------------------- -# -# Makefile for src/test/polar_flog_repair_partial -# -# Portions Copyright (c) 1996-2018, PostgreSQL Global Development Group -# Portions Copyright (c) 1994, Regents of the University of California -# -# src/test/polar_flog_repair_partial/Makefile -# -#------------------------------------------------------------------------- - -subdir = src/test/polar_flog_repair_partial -top_builddir = ../../.. -include $(top_builddir)/src/Makefile.global - -EXTRA_INSTALL = contrib/faultinjector - -check: - $(prove_check) - -installcheck: - $(prove_installcheck) - -clean distclean maintainer-clean: - rm -rf tmp_check \ No newline at end of file diff --git a/src/test/polar_flog_repair_partial/README b/src/test/polar_flog_repair_partial/README deleted file mode 100644 index 9bbe8278277..00000000000 --- a/src/test/polar_flog_repair_partial/README +++ /dev/null @@ -1,25 +0,0 @@ -src/test/polar_flashback/README - -Regression tests for PolarDB flashback log -======================================================== - -This directory contains a test suite for flashback log to repair partial write. - -Running the tests -================= - -NOTE: You must have given the --enable-tap-tests and argument to configure. -Also, to use "make installcheck", you must have built and installed -contrib/test_decoding in addition to the core code. - -Run - make check -or - make installcheck -You can use "make installcheck" if you previously did "make install". -In that case, the code in the installation tree is tested. 
With -"make check", a temporary installation tree is built from the current -sources and then tested. - -Either way, this test initializes, starts, and stops several test Postgres -clusters. diff --git a/src/test/polar_pl/Makefile b/src/test/polar_pl/Makefile index c62a09f38e7..0991a22d25d 100644 --- a/src/test/polar_pl/Makefile +++ b/src/test/polar_pl/Makefile @@ -10,6 +10,7 @@ #------------------------------------------------------------------------- EXTRA_INSTALL=contrib/test_decoding +EXTRA_INSTALL+=contrib/faultinjector subdir = src/test/polar_pl top_builddir = ../../.. diff --git a/src/test/polar_pl/README b/src/test/polar_pl/README index da154e42590..86124af7968 100644 --- a/src/test/polar_pl/README +++ b/src/test/polar_pl/README @@ -8,9 +8,10 @@ This directory contains a test suite for recovery and replication. Running the tests ================= -NOTE: You must have given the --enable-tap-tests argument to configure. -Also, to use "make installcheck", you must have built and installed -contrib/test_decoding in addition to the core code. +NOTE: You must have given the --enable-tap-tests and --enable-inject-faults +arguments to configure. Also, to use "make installcheck", you must have built +and installed contrib/test_decoding and contrib/faultinjector in addition +to the core code. 
Run make check diff --git a/src/test/polar_flog_repair_partial/t/001_flog_solve_torn_page.pl b/src/test/polar_pl/t/017_flog_solve_torn_page.pl similarity index 100% rename from src/test/polar_flog_repair_partial/t/001_flog_solve_torn_page.pl rename to src/test/polar_pl/t/017_flog_solve_torn_page.pl diff --git a/src/test/polar_flog_repair_partial/t/002_flog_in_online_promote.pl b/src/test/polar_pl/t/018_flog_in_online_promote.pl similarity index 100% rename from src/test/polar_flog_repair_partial/t/002_flog_in_online_promote.pl rename to src/test/polar_pl/t/018_flog_in_online_promote.pl diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index 02d88972e27..224908e11e7 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1756,6 +1756,7 @@ PlannerInfo PlannerParamItem Point Pointer +PolarFlashbackTableStmt PolicyInfo PolyNumAggState Pool