From de48012db4017151109b2df1df4da23eb02317f6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A3=A0=E7=BE=BD?= Date: Mon, 4 Jul 2022 15:21:20 +0800 Subject: [PATCH 01/12] docs: refactor documentation structure --- README-CN.md | 8 +- README.md | 6 +- docs/.vuepress/config.ts | 5 +- docs/.vuepress/configs/navbar/en.ts | 99 +++++++---- docs/.vuepress/configs/navbar/zh.ts | 94 ++++++---- docs/.vuepress/configs/sidebar/en.ts | 80 +++++---- docs/.vuepress/configs/sidebar/zh.ts | 77 ++++++--- docs/README.md | 92 ++++++---- docs/{zh/guide => deploying}/db-localfs.md | 2 +- docs/{zh/guide => deploying}/db-pfs.md | 2 +- docs/deploying/deploy-official.md | 3 + .../deploy-stack.md} | 8 +- docs/{guide => deploying}/deploy.md | 0 docs/{guide => deploying}/fs-pfs.md | 2 +- docs/{guide => deploying}/introduction.md | 2 +- docs/{guide => deploying}/quick-start.md | 34 +--- .../storage-aliyun-essd.md | 0 docs/{guide => deploying}/storage-ceph.md | 0 docs/{guide => deploying}/storage-nbd.md | 0 .../customize-dev-env.md | 0 docs/development/dev-on-docker.md | 70 ++++++++ docs/{guide => features}/tpch-on-px.md | 2 +- .../backup-and-restore.md | 8 +- docs/{guide => operation}/tpcc-test.md | 6 +- .../README.md => theory/arch-overview.md} | 144 ++++++++++------ .../buffer-management.md | 0 .../ddl-synchronization.md | 0 docs/{architecture => theory}/logindex.md | 0 docs/zh/README.md | 95 ++++++---- docs/{guide => zh/deploying}/db-localfs.md | 2 +- docs/{guide => zh/deploying}/db-pfs.md | 2 +- docs/zh/deploying/deploy-official.md | 3 + .../deploy-stack.md} | 8 +- docs/zh/{guide => deploying}/deploy.md | 0 docs/zh/{guide => deploying}/fs-pfs.md | 2 +- docs/zh/{guide => deploying}/introduction.md | 2 +- docs/zh/{guide => deploying}/quick-start.md | 34 +--- .../storage-aliyun-essd.md | 0 docs/zh/{guide => deploying}/storage-ceph.md | 0 docs/zh/{guide => deploying}/storage-nbd.md | 0 .../customize-dev-env.md | 0 docs/zh/development/dev-on-docker.md | 70 ++++++++ docs/zh/{guide => features}/tpch-on-px.md | 2 +- .../backup-and-restore.md | 8 +- docs/zh/{guide => operation}/tpcc-test.md | 6 +- .../README.md => theory/arch-overview.md} | 163 +++++++++++------- .../buffer-management.md | 0 .../ddl-synchronization.md | 0 docs/zh/{architecture => theory}/logindex.md | 0 49 files changed, 720 insertions(+), 421 deletions(-) rename docs/{zh/guide => deploying}/db-localfs.md (99%) rename docs/{zh/guide => deploying}/db-pfs.md (99%) create mode 100644 docs/deploying/deploy-official.md rename docs/{guide/deploy-more.md => deploying/deploy-stack.md} (66%) rename docs/{guide => deploying}/deploy.md (100%) rename docs/{guide => deploying}/fs-pfs.md (98%) rename docs/{guide => deploying}/introduction.md (97%) rename docs/{guide => deploying}/quick-start.md (73%) rename docs/{guide => deploying}/storage-aliyun-essd.md (100%) rename docs/{guide => deploying}/storage-ceph.md (100%) rename docs/{guide => deploying}/storage-nbd.md (100%) rename docs/{guide => development}/customize-dev-env.md (100%) create mode 100644 docs/development/dev-on-docker.md rename docs/{guide => features}/tpch-on-px.md (99%) rename docs/{guide => operation}/backup-and-restore.md (97%) rename docs/{guide => operation}/tpcc-test.md (96%) rename docs/{architecture/README.md => theory/arch-overview.md} (95%) rename docs/{architecture => theory}/buffer-management.md (100%) rename docs/{architecture => theory}/ddl-synchronization.md (100%) rename docs/{architecture => theory}/logindex.md (100%) rename docs/{guide => zh/deploying}/db-localfs.md (99%) rename docs/{guide 
=> zh/deploying}/db-pfs.md (99%) create mode 100644 docs/zh/deploying/deploy-official.md rename docs/zh/{guide/deploy-more.md => deploying/deploy-stack.md} (66%) rename docs/zh/{guide => deploying}/deploy.md (100%) rename docs/zh/{guide => deploying}/fs-pfs.md (98%) rename docs/zh/{guide => deploying}/introduction.md (97%) rename docs/zh/{guide => deploying}/quick-start.md (73%) rename docs/zh/{guide => deploying}/storage-aliyun-essd.md (100%) rename docs/zh/{guide => deploying}/storage-ceph.md (100%) rename docs/zh/{guide => deploying}/storage-nbd.md (100%) rename docs/zh/{guide => development}/customize-dev-env.md (100%) create mode 100644 docs/zh/development/dev-on-docker.md rename docs/zh/{guide => features}/tpch-on-px.md (99%) rename docs/zh/{guide => operation}/backup-and-restore.md (97%) rename docs/zh/{guide => operation}/tpcc-test.md (96%) rename docs/zh/{architecture/README.md => theory/arch-overview.md} (87%) rename docs/zh/{architecture => theory}/buffer-management.md (100%) rename docs/zh/{architecture => theory}/ddl-synchronization.md (100%) rename docs/zh/{architecture => theory}/logindex.md (100%) diff --git a/README-CN.md b/README-CN.md index 900361ccdc8..60984d6daa4 100644 --- a/README-CN.md +++ b/README-CN.md @@ -43,13 +43,13 @@ PolarDB 还支持时空、GIS、图像、向量、搜索、图谱等多模创新 `POLARDB_11_STABLE` 为稳定分支,持存储计算分离的云原生形态。 `distribute` 分支支持分布式形态。 -## 产品架构和版本规划 +## 产品架构 PolarDB 采用了基于 Shared-Storage 的存储计算分离架构。数据库由传统的 Share-Nothing 架构,转变成了 Shared-Storage 架构。由原来的 N 份计算 + N 份存储,转变成了 N 份计算 + 1 份存储。虽然共享存储上数据是一份,但是数据在各节点内存中的状态是不同的,需要通过内存状态的同步来维护数据的一致性;同时主节点在刷脏时也需要做协调,避免只读节点读取到超前的 **“未来页面”**,也要避免只读节点读取到过时的没有在内存中被正确回放的 **“过去页面”**。为了解决该问题,PolarDB 创造性地设计了 _LogIndex_ 数据结构来维护页面的回放历史,该结构能够实现主节点与只读节点之间的同步。 在存储计算分离后,I/O 单路延迟变大的同时,I/O 的吞吐也变大了。在处理分析型查询时,仅使用单个只读节点无法发挥出存储侧的大 I/O 带宽优势,也无法利用其他只读节点的 CPU、内存和 I/O 资源。为了解决该问题,PolarDB 研发了基于 Shared-Storage 的并行执行引擎,能够在 SQL 级别上弹性利用任意数目的 CPU 来加速分析查询,支持 HTAP 的混合负载场景。 -详情请查阅 [产品架构](https://apsaradb.github.io/PolarDB-for-PostgreSQL/zh/architecture/) 和 [版本规划](https://apsaradb.github.io/PolarDB-for-PostgreSQL/zh/roadmap/)。 +详情请查阅 [产品架构](https://apsaradb.github.io/PolarDB-for-PostgreSQL/zh/architecture/theory/arch-overview.html)。 ## 快速入门 @@ -68,7 +68,7 @@ psql -h 127.0.0.1 -c 'select version();' (1 row) ``` -对于更多进阶部署方式,请移步在线文档中的 [进阶部署](https://apsaradb.github.io/PolarDB-for-PostgreSQL/zh/guide/deploy.html)。在部署前,我们建议您先了解一下 PolarDB for PostgreSQL 的 [架构简介](https://apsaradb.github.io/PolarDB-for-PostgreSQL/zh/guide/introduction.html)。 +对于更多进阶部署方式,请移步在线文档中的 [进阶部署](https://apsaradb.github.io/PolarDB-for-PostgreSQL/zh/deploying/deploy.html)。在部署前,我们建议您先了解一下 PolarDB for PostgreSQL 的 [架构简介](https://apsaradb.github.io/PolarDB-for-PostgreSQL/zh/deploying/introduction.html)。 ## 文档 @@ -78,7 +78,7 @@ psql -h 127.0.0.1 -c 'select version();' ## 参与贡献 -我们诚挚欢迎社区参与 PolarDB 的贡献,无论是代码还是文档。在线文档中的 [参与社区](https://apsaradb.github.io/PolarDB-for-PostgreSQL/zh/contributing/) 提供了关于贡献流程与规范的更多信息。 +我们诚挚欢迎社区参与 PolarDB 的贡献,无论是代码还是文档。在线文档中的 [参与社区](https://apsaradb.github.io/PolarDB-for-PostgreSQL/zh/contributing/code-of-conduct.html) 提供了关于贡献流程与规范的更多信息。 以下是贡献者列表(由 [contrib.rocks](https://contrib.rocks) 支持): diff --git a/README.md b/README.md index 6b18169312b..583d31ec44c 100644 --- a/README.md +++ b/README.md @@ -49,7 +49,7 @@ PolarDB uses a shared-storage-based architecture in which computing is decoupled After computing is decoupled from storage, the I/O latency and throughput increase. 
When a single read-only node is used to process analytical queries, the CPUs, memory, and I/O of other read-only nodes and the large storage I/O bandwidth cannot be fully utilized. To resolve this issue, PolarDB provides the shared-storage-based MPP engine. The engine can use CPUs to accelerate analytical queries at SQL level and support a mix of OLAP workloads and OLTP workloads for HTAP. -For more information, see [Architecture](https://apsaradb.github.io/PolarDB-for-PostgreSQL/architecture/) and [Roadmap](https://apsaradb.github.io/PolarDB-for-PostgreSQL/roadmap/). +For more information, see [Architecture](https://apsaradb.github.io/PolarDB-for-PostgreSQL/theory/arch-overview.html). ## Quick Start with PolarDB @@ -68,7 +68,7 @@ psql -h 127.0.0.1 -c 'select version();' (1 row) ``` -For more advanced deployment way, please refer to [Advanced Deployment](https://apsaradb.github.io/PolarDB-for-PostgreSQL/guide/deploy.html). Before your deployment, we recommand to figure out the [architecture](https://apsaradb.github.io/PolarDB-for-PostgreSQL/guide/introduction.html) of PolarDB for PostgreSQL. +For more advanced deployment way, please refer to [Advanced Deployment](https://apsaradb.github.io/PolarDB-for-PostgreSQL/deploying/deploy.html). Before your deployment, we recommand to figure out the [architecture](https://apsaradb.github.io/PolarDB-for-PostgreSQL/deploying/introduction.html) of PolarDB for PostgreSQL. ## Documentation @@ -78,7 +78,7 @@ If you want to explore or develop documentation locally, see [Document Contribut ## Contributing -You are welcome to make contributions to PolarDB, no matter code or documentation. We appreciate all the contributions. For more information about how to start development and pull requests, see [Community](https://apsaradb.github.io/PolarDB-for-PostgreSQL/contributing/). +You are welcome to make contributions to PolarDB, no matter code or documentation. We appreciate all the contributions. For more information about how to start development and pull requests, see [Community](https://apsaradb.github.io/PolarDB-for-PostgreSQL/contributing/code-of-conduct.html). 
Here are the contributors: diff --git a/docs/.vuepress/config.ts b/docs/.vuepress/config.ts index 56a5ad37c8b..379f098554d 100644 --- a/docs/.vuepress/config.ts +++ b/docs/.vuepress/config.ts @@ -14,13 +14,12 @@ export default defineUserConfig({ "/": { lang: "en-US", title: "PolarDB for PostgreSQL", - description: - "A cloud-native database service independently developed by Alibaba Cloud", + description: "A cloud-native database developed by Alibaba Cloud", }, "/zh/": { lang: "zh-CN", title: "PolarDB for PostgreSQL", - description: "阿里云自主研发的云原生数据库产品", + description: "阿里云自主研发的云原生数据库", }, }, diff --git a/docs/.vuepress/configs/navbar/en.ts b/docs/.vuepress/configs/navbar/en.ts index f1a08ccdcb3..4a2ce976c34 100644 --- a/docs/.vuepress/configs/navbar/en.ts +++ b/docs/.vuepress/configs/navbar/en.ts @@ -2,86 +2,115 @@ import type { NavbarConfig } from "@vuepress/theme-default"; export const en: NavbarConfig = [ { - text: "Guide", + text: "Deployment", children: [ - "/guide/quick-start.html", - "/guide/introduction.html", - "/guide/deploy.html", + "/deploying/introduction.html", + "/deploying/quick-start.html", + "/deploying/deploy.html", { - text: "准备块存储设备", + text: "Preparation of Storage Device", children: [ - "/guide/storage-ceph.html", - "/guide/storage-aliyun-essd.html", - "/guide/storage-nbd.html", + "/deploying/storage-aliyun-essd.html", + "/deploying/storage-ceph.html", + "/deploying/storage-nbd.html", ], }, { - text: "准备文件系统", - children: ["/guide/fs-pfs.html"], + text: "Preparation of File System", + children: ["/deploying/fs-pfs.html"], }, { - text: "编译部署 PolarDB 内核", - children: ["/guide/db-localfs.html", "/guide/db-pfs.html"], + text: "Building PolarDB Kernel", + children: ["/deploying/db-localfs.html", "/deploying/db-pfs.html"], }, { - text: "更多", + text: "More about Deploying", children: [ - "/guide/backup-and-restore.html", - "/guide/customize-dev-env.html", - "/guide/deploy-more.html", + "/deploying/deploy-stack.html", + "/deploying/deploy-official.html", ], }, + ], + }, + { + text: "Ops", + link: "/operation/", + children: [ + { + text: "Daily Ops", + children: ["/operation/backup-and-restore.html"], + }, + { + text: "Benchmarks", + children: ["/operation/tpcc-test.html"], + }, + ], + }, + { + text: "Features", + link: "/zh/features/", + children: [ { - text: "性能测试", - children: ["/guide/tpch-on-px.html", "/guide/tpcc-test.html"], + text: "HTAP", + children: ["/zh/features/tpch-on-px.html"], }, ], }, { - text: "Architecture", - link: "/architecture/", + text: "Theory", + link: "/theory/", children: [ { - text: "Overview", - link: "/architecture/", + text: "Architecture Overview", + link: "/theory/arch-overview.html", }, { text: "Buffer Management", - link: "/architecture/buffer-management.html", + link: "/theory/buffer-management.html", }, { text: "DDL Synchronization", - link: "/architecture/ddl-synchronization.html", + link: "/theory/ddl-synchronization.html", }, { text: "LogIndex", - link: "/architecture/logindex.html", + link: "/theory/logindex.html", }, ], }, { - text: "Roadmap", - link: "/roadmap/", + text: "Dev", + link: "/development/", + children: [ + { + text: "Development on Docker", + link: "/development/dev-on-docker.md", + }, + { + text: "Customize Development Environment", + link: "/development/customize-dev-env.md", + }, + ], }, { - text: "Community", + text: "Contributing", link: "/contributing/", children: [ { - text: "Code Contributing", - link: "/contributing/contributing-polardb-kernel.html", + text: "Code of Conduct", + link: 
"/contributing/code-of-conduct.html", }, { - text: "Docs Contributing", + text: "Contributing Docs", link: "/contributing/contributing-polardb-docs.html", }, { - text: "Coding Style", - link: "/contributing/coding-style.html", + text: "Contributing Code", + link: "/contributing/contributing-polardb-kernel.html", }, { - text: "Code of Conduct", - link: "/contributing/code-of-conduct.html", + text: "Coding Style", + link: "/contributing/coding-style.html", }, ], }, diff --git a/docs/.vuepress/configs/navbar/zh.ts b/docs/.vuepress/configs/navbar/zh.ts index 569dcaf5253..71fb1346149 100644 --- a/docs/.vuepress/configs/navbar/zh.ts +++ b/docs/.vuepress/configs/navbar/zh.ts @@ -2,86 +2,118 @@ import type { NavbarConfig } from "@vuepress/theme-default"; export const zh: NavbarConfig = [ { - text: "入门指南", + text: "部署指南", children: [ - "/zh/guide/quick-start.html", - "/zh/guide/introduction.html", - "/zh/guide/deploy.html", + "/zh/deploying/introduction.html", + "/zh/deploying/quick-start.html", + "/zh/deploying/deploy.html", { - text: "准备块存储设备", + text: "存储设备的准备", children: [ - "/zh/guide/storage-ceph.html", - "/zh/guide/storage-aliyun-essd.html", - "/zh/guide/storage-nbd.html", + "/zh/deploying/storage-aliyun-essd.html", + "/zh/deploying/storage-ceph.html", + "/zh/deploying/storage-nbd.html", ], }, { - text: "准备文件系统", - children: ["/zh/guide/fs-pfs.html"], + text: "文件系统的准备", + children: ["/zh/deploying/fs-pfs.html"], }, { text: "编译部署 PolarDB 内核", - children: ["/zh/guide/db-localfs.html", "/zh/guide/db-pfs.html"], + children: [ + "/zh/deploying/db-localfs.html", + "/zh/deploying/db-pfs.html", + ], }, { - text: "更多", + text: "更多部署方式", children: [ - "/zh/guide/backup-and-restore.html", - "/zh/guide/customize-dev-env.html", - "/zh/guide/deploy-more.html", + "/zh/deploying/deploy-stack.html", + "/zh/deploying/deploy-official.html", ], }, + ], + }, + { + text: "使用与运维", + link: "/zh/operation/", + children: [ + { + text: "日常运维", + children: ["/zh/operation/backup-and-restore.html"], + }, { text: "性能测试", - children: ["/zh/guide/tpch-on-px.html", "/zh/guide/tpcc-test.html"], + children: ["/zh/operation/tpcc-test.html"], }, ], }, { - text: "架构解读", - link: "/zh/architecture/", + text: "特性实践", + link: "/zh/features/", children: [ { - text: "架构详解", - link: "/zh/architecture/", + text: "HTAP", + children: ["/zh/features/tpch-on-px.html"], + }, + ], + }, + { + text: "原理解读", + link: "/zh/theory/", + children: [ + { + text: "架构总览", + link: "/zh/theory/arch-overview.html", }, { text: "缓冲区管理", - link: "/zh/architecture/buffer-management.html", + link: "/zh/theory/buffer-management.html", }, { text: "DDL 同步", - link: "/zh/architecture/ddl-synchronization.html", + link: "/zh/theory/ddl-synchronization.html", }, { text: "LogIndex", - link: "/zh/architecture/logindex.html", + link: "/zh/theory/logindex.html", }, ], }, { - text: "版本规划", - link: "/zh/roadmap/", + text: "上手开发", + link: "/zh/development/", + children: [ + { + text: "基于容器开发", + link: "/zh/development/dev-on-docker.md", + }, + { + text: "开发环境定制", + link: "/zh/development/customize-dev-env.md", + }, + ], }, { text: "参与社区", link: "/zh/contributing/", children: [ { - text: "贡献代码", - link: "/zh/contributing/contributing-polardb-kernel.html", + text: "行为准则", + link: "/zh/contributing/code-of-conduct.html", }, { text: "贡献文档", link: "/zh/contributing/contributing-polardb-docs.html", }, { - text: "编码风格", - link: "/zh/contributing/coding-style.html", + text: "贡献代码", + link: "/zh/contributing/contributing-polardb-kernel.html", }, { - text: "行为准则", - link: 
"/zh/contributing/code-of-conduct.html", + text: "编码风格", + link: "/zh/contributing/coding-style.html", }, ], }, diff --git a/docs/.vuepress/configs/sidebar/en.ts b/docs/.vuepress/configs/sidebar/en.ts index 1924d9f3ebe..b01c9f8418a 100644 --- a/docs/.vuepress/configs/sidebar/en.ts +++ b/docs/.vuepress/configs/sidebar/en.ts @@ -1,64 +1,86 @@ import type { SidebarConfig } from "@vuepress/theme-default"; export const en: SidebarConfig = { - "/guide/": [ + "/deploying": [ { - text: "Guide", + text: "Deployment", children: [ - "/guide/quick-start.md", - "/guide/introduction.md", + "/deploying/introduction.md", + "/deploying/quick-start.md", { - text: "进阶部署", - link: "/guide/deploy.md", + text: "Advanced Deployment", + link: "/deploying/deploy.md", children: [ { - text: "一、准备块存储设备", + text: "Preparation of Storage Device", children: [ - "/guide/storage-aliyun-essd.md", - "/guide/storage-ceph.md", - "/guide/storage-nbd.md", + "/deploying/storage-aliyun-essd.md", + "/deploying/storage-ceph.md", + "/deploying/storage-nbd.md", ], }, { - text: "二、准备文件系统", - children: ["/guide/fs-pfs.md"], + text: "Preparation of File System", + children: ["/deploying/fs-pfs.md"], }, { - text: "三、编译部署 PolarDB 内核", - children: ["/guide/db-localfs.md", "/guide/db-pfs.md"], + text: "Building PolarDB Kernel", + children: ["/deploying/db-localfs.md", "/deploying/db-pfs.md"], }, ], }, - "/guide/backup-and-restore.md", - "/guide/customize-dev-env.md", - "/guide/deploy-more.md", + ], + }, + ], + "/operation/": [ + { + text: "Ops", + children: [ + { + text: "Daily Ops", + children: ["/operation/backup-and-restore.md"], + }, + { + text: "Benchmarks", + children: ["/operation/tpcc-test.md"], + }, + ], + }, + ], + "/features": [ + { + text: "Features Practice", + children: [ { - text: "性能测试", - children: ["/guide/tpch-on-px.md", "/guide/tpcc-test.md"], + text: "HTAP", + children: ["/features/tpch-on-px.md"], }, ], }, ], - "/architecture/": [ + "/theory/": [ { - text: "Architecture Introduction", + text: "Theory", children: [ - "/architecture/README.md", - "/architecture/buffer-management.md", - "/architecture/ddl-synchronization.md", - "/architecture/logindex.md", + "/theory/arch-overview.md", + "/theory/buffer-management.md", + "/theory/ddl-synchronization.md", + "/theory/logindex.md", ], }, ], - "/roadmap/": [ + "/development/": [ { - text: "Roadmap", - children: ["/roadmap/README.md"], + text: "Development", + children: [ + "/development/dev-on-docker.md", + "/development/customize-dev-env.md", + ], }, ], "/contributing": [ { - text: "Community", + text: "Contributing", children: [ "/contributing/contributing-polardb-kernel.md", "/contributing/contributing-polardb-docs.md", diff --git a/docs/.vuepress/configs/sidebar/zh.ts b/docs/.vuepress/configs/sidebar/zh.ts index 520c9e1345a..e00678d7d51 100644 --- a/docs/.vuepress/configs/sidebar/zh.ts +++ b/docs/.vuepress/configs/sidebar/zh.ts @@ -1,59 +1,84 @@ import type { SidebarConfig } from "@vuepress/theme-default"; export const zh: SidebarConfig = { - "/zh/guide/": [ + "/zh/deploying": [ { - text: "入门指南", + text: "部署指南", children: [ - "/zh/guide/quick-start.md", - "/zh/guide/introduction.md", + "/zh/deploying/introduction.md", + "/zh/deploying/quick-start.md", { text: "进阶部署", - link: "/zh/guide/deploy.md", + link: "/zh/deploying/deploy.md", children: [ { - text: "一、准备块存储设备", + text: "存储设备的准备", children: [ - "/zh/guide/storage-aliyun-essd.md", - "/zh/guide/storage-ceph.md", - "/zh/guide/storage-nbd.md", + "/zh/deploying/storage-aliyun-essd.md", + "/zh/deploying/storage-ceph.md", + 
"/zh/deploying/storage-nbd.md", ], }, { - text: "二、准备文件系统", - children: ["/zh/guide/fs-pfs.md"], + text: "文件系统的准备", + children: ["/zh/deploying/fs-pfs.md"], }, { - text: "三、编译部署 PolarDB 内核", - children: ["/zh/guide/db-localfs.md", "/zh/guide/db-pfs.md"], + text: "编译部署 PolarDB 内核", + children: [ + "/zh/deploying/db-localfs.md", + "/zh/deploying/db-pfs.md", + ], }, ], }, - "/zh/guide/backup-and-restore.md", - "/zh/guide/customize-dev-env.md", - "/zh/guide/deploy-more.md", + ], + }, + ], + "/zh/operation/": [ + { + text: "使用与运维", + children: [ + { + text: "日常运维", + children: ["/zh/operation/backup-and-restore.md"], + }, { text: "性能测试", - children: ["/zh/guide/tpch-on-px.md", "/zh/guide/tpcc-test.md"], + children: ["/zh/operation/tpcc-test.md"], }, ], }, ], - "/zh/architecture/": [ + "/zh/features": [ { - text: "架构解读", + text: "特性实践", children: [ - "/zh/architecture/README.md", - "/zh/architecture/buffer-management.md", - "/zh/architecture/ddl-synchronization.md", - "/zh/architecture/logindex.md", + { + text: "HTAP", + children: ["/zh/features/tpch-on-px.md"], + }, ], }, ], - "/zh/roadmap/": [ + "/zh/theory/": [ { - text: "版本规划", - children: ["/zh/roadmap/README.md"], + text: "原理解读", + children: [ + "/zh/theory/arch-overview.md", + "/zh/theory/buffer-management.md", + "/zh/theory/ddl-synchronization.md", + "/zh/theory/logindex.md", + ], + }, + ], + "/zh/development/": [ + { + text: "上手开发", + children: [ + "/zh/development/dev-on-docker.md", + "/zh/development/customize-dev-env.md", + ], }, ], "/zh/contributing": [ diff --git a/docs/README.md b/docs/README.md index 2b4a953acaf..da19d7a4605 100644 --- a/docs/README.md +++ b/docs/README.md @@ -2,24 +2,12 @@ home: true title: Documentation heroImage: /images/polardb.png -actions: - - text: Getting Started - link: /guide/quick-start.html - type: primary - - text: Architecture Introduction - link: /architecture/ - type: secondary -features: - - title: Flexible Scalability - details: Scale out compute/storage clusters on demand. - - title: Millisecond-level Latency - details: Lazy/parallel replay via shared-storage-based WAL and LogIndex. - - title: HTAP - details: Shared-storage-based massively parallel processing (MPP) framework. footer: Apache 2.0 Licensed | Copyright © Alibaba Group, Inc. --- -### Use with Docker +--- + +### Quick Start with Docker Pull the [instance image](https://hub.docker.com/r/polardb/polardb_pg_local_instance/tags) of PolarDB for PostgreSQL based on local storage. Create, run and enter the container, and use PolarDB instance directly: @@ -75,29 +63,59 @@ psql -h 127.0.0.1 -c 'select version();' ::: :::: -### Develop with Docker +
-
-Pull the [development image](https://hub.docker.com/r/polardb/polardb_pg_devel/tags) of PolarDB for PostgreSQL from DockerHub. Create, run and enter the container:
-
-```bash
-# pull the development image of PolarDB
-docker pull polardb/polardb_pg_devel
-# create, run and enter the container
-docker run -it --cap-add=SYS_PTRACE --privileged=true --name polardb_pg_devel polardb/polardb_pg_devel bash
-```
-
-After entering the container, clone the latest stable code from GitHub, build and deploy the simplest PolarDB instance and check:
-
-```bash
-# code fetching
-git clone -b POLARDB_11_STABLE https://github.com/ApsaraDB/PolarDB-for-PostgreSQL.git
-cd PolarDB-for-PostgreSQL
-# build and deploy
-./polardb_build.sh
-# check
-psql -h 127.0.0.1 -c 'select version();'
- version
---------------------------------
- PostgreSQL 11.9 (POLARDB 11.9)
-(1 row)
-```
+[added home-page feature cards linking to "Operation and Maintenance" and "Feature Practice"; the original HTML markup is not recoverable]
diff --git a/docs/zh/guide/db-localfs.md b/docs/deploying/db-localfs.md similarity index 99% rename from docs/zh/guide/db-localfs.md rename to docs/deploying/db-localfs.md index cc0dc0afbc1..fbef8fb3b96 100644 --- a/docs/zh/guide/db-localfs.md +++ b/docs/deploying/db-localfs.md @@ -1,4 +1,4 @@ -# PolarDB 编译部署:单机文件系统 +# 编译部署:基于单机文件系统 本文将指导您在单机文件系统(如 ext4)上编译部署 PolarDB,适用于所有计算节点都可以访问相同本地磁盘存储的场景。 diff --git a/docs/zh/guide/db-pfs.md b/docs/deploying/db-pfs.md similarity index 99% rename from docs/zh/guide/db-pfs.md rename to docs/deploying/db-pfs.md index eaceda862b3..f4daa0453cc 100644 --- a/docs/zh/guide/db-pfs.md +++ b/docs/deploying/db-pfs.md @@ -1,4 +1,4 @@ -# PolarDB 编译部署:PFS 文件系统 +# 编译部署:基于 PFS 文件系统 本文将指导您在分布式文件系统 PolarDB File System(PFS)上编译部署 PolarDB,适用于已经在共享存储上格式化并挂载 PFS 的计算节点。 diff --git a/docs/deploying/deploy-official.md b/docs/deploying/deploy-official.md new file mode 100644 index 00000000000..48484d5e5cc --- /dev/null +++ b/docs/deploying/deploy-official.md @@ -0,0 +1,3 @@ +# 阿里云官网购买实例 + +阿里云官网直接提供了可供购买的 [云原生关系型数据库 PolarDB PostgreSQL 引擎](https://www.aliyun.com/product/polardb)。 diff --git a/docs/guide/deploy-more.md b/docs/deploying/deploy-stack.md similarity index 66% rename from docs/guide/deploy-more.md rename to docs/deploying/deploy-stack.md index d0bb1177564..0ce722f4c97 100644 --- a/docs/guide/deploy-more.md +++ b/docs/deploying/deploy-stack.md @@ -1,13 +1,7 @@ -# 更多部署方式 - -## 基于 PolarDB Stack 共享存储 +# 基于 PolarDB Stack 共享存储 PolarDB Stack 是轻量级 PolarDB PaaS 软件。基于共享存储提供一写多读的 PolarDB 数据库服务,特别定制和深度优化了数据库生命周期管理。通过 PolarDB Stack 可以一键部署 PolarDB-for-PostgreSQL 内核和 PolarDB-FileSystem。 PolarDB Stack 架构如下图所示,进入 [PolarDB Stack 的部署文档](https://github.com/ApsaraDB/PolarDB-Stack-Operator/blob/master/README.md) ![PolarDB Stack arch](../imgs/63-PolarDBStack-arch.png) - -## 阿里云官网购买实例 - -阿里云官网直接提供了可供购买的 [云原生关系型数据库 PolarDB PostgreSQL 引擎](https://www.aliyun.com/product/polardb)。 diff --git a/docs/guide/deploy.md b/docs/deploying/deploy.md similarity index 100% rename from docs/guide/deploy.md rename to docs/deploying/deploy.md diff --git a/docs/guide/fs-pfs.md b/docs/deploying/fs-pfs.md similarity index 98% rename from docs/guide/fs-pfs.md rename to docs/deploying/fs-pfs.md index c7c3563e0f9..83bdf929b7b 100644 --- a/docs/guide/fs-pfs.md +++ b/docs/deploying/fs-pfs.md @@ -1,4 +1,4 @@ -# 格式化并挂载 PFS +# 格式化并挂载 PolarDB File System PolarDB File System,简称 PFS 或 PolarFS,是由阿里云自主研发的高性能类 POSIX 的用户态分布式文件系统,服务于阿里云数据库 PolarDB 产品。使用 PFS 对共享存储进行格式化并挂载后,能够保证一个计算节点对共享存储的写入能够立刻对另一个计算节点可见。 diff --git a/docs/guide/introduction.md b/docs/deploying/introduction.md similarity index 97% rename from docs/guide/introduction.md rename to docs/deploying/introduction.md index 5c253a425d2..e576502469d 100644 --- a/docs/guide/introduction.md +++ b/docs/deploying/introduction.md @@ -1,4 +1,4 @@ -# PolarDB 架构简介 +# 架构简介 PolarDB for PostgreSQL 采用了基于 Shared-Storage 的存储计算分离架构。数据库由传统的 Share-Nothing 架构,转变成了 Shared-Storage 架构——由原来的 N 份计算 + N 份存储,转变成了 N 份计算 + 1 份存储;而 PostgreSQL 使用了传统的单体数据库架构,存储和计算耦合在一起。 diff --git a/docs/guide/quick-start.md b/docs/deploying/quick-start.md similarity index 73% rename from docs/guide/quick-start.md rename to docs/deploying/quick-start.md index 05d69c5ca02..e39d7b65d7a 100644 --- a/docs/guide/quick-start.md +++ b/docs/deploying/quick-start.md @@ -1,4 +1,4 @@ -# 快速上手 +# 快速部署 仅需单台计算机,同时满足以下要求,就可以快速开启您的 PolarDB 之旅: @@ -12,8 +12,6 @@ - Fedora:[在 Fedora 上安装 Docker Engine](https://docs.docker.com/engine/install/fedora/) - macOS(支持 M1 芯片):[在 Mac 上安装 Docker 
Desktop](https://docs.docker.com/desktop/mac/install/),并建议将内存调整为 4GB 以上 -## 快速体验 - 从 DockerHub 上拉取 PolarDB for PostgreSQL 的 [本地存储实例镜像](https://hub.docker.com/r/polardb/polardb_pg_local_instance/tags),创建、运行并进入容器,然后直接使用 PolarDB 实例: :::: code-group @@ -67,33 +65,3 @@ psql -h 127.0.0.1 -c 'select version();' ::: :::: - -## 快速开发 - -从 DockerHub 上拉取 PolarDB for PostgreSQL 的 [开发镜像](https://hub.docker.com/r/polardb/polardb_pg_devel/tags),创建、运行并进入容器: - -```bash -# 拉取 PolarDB 开发镜像 -docker pull polardb/polardb_pg_devel -# 创建、运行并进入容器 -docker run -it \ - --cap-add=SYS_PTRACE --privileged=true \ - --name polardb_pg \ - polardb/polardb_pg_devel bash -``` - -进入容器后,从 GitHub 拉取最新的稳定代码,快速编译部署最简单的 PolarDB 实例并进行验证: - -```bash -# 代码拉取 -git clone -b POLARDB_11_STABLE https://github.com/ApsaraDB/PolarDB-for-PostgreSQL.git -cd PolarDB-for-PostgreSQL -# 编译部署 -./polardb_build.sh -# 验证 -psql -h 127.0.0.1 -c 'select version();' - version --------------------------------- - PostgreSQL 11.9 (POLARDB 11.9) -(1 row) -``` diff --git a/docs/guide/storage-aliyun-essd.md b/docs/deploying/storage-aliyun-essd.md similarity index 100% rename from docs/guide/storage-aliyun-essd.md rename to docs/deploying/storage-aliyun-essd.md diff --git a/docs/guide/storage-ceph.md b/docs/deploying/storage-ceph.md similarity index 100% rename from docs/guide/storage-ceph.md rename to docs/deploying/storage-ceph.md diff --git a/docs/guide/storage-nbd.md b/docs/deploying/storage-nbd.md similarity index 100% rename from docs/guide/storage-nbd.md rename to docs/deploying/storage-nbd.md diff --git a/docs/guide/customize-dev-env.md b/docs/development/customize-dev-env.md similarity index 100% rename from docs/guide/customize-dev-env.md rename to docs/development/customize-dev-env.md diff --git a/docs/development/dev-on-docker.md b/docs/development/dev-on-docker.md new file mode 100644 index 00000000000..6329b12af35 --- /dev/null +++ b/docs/development/dev-on-docker.md @@ -0,0 +1,70 @@ +# 基于 Docker 容器开发 + +## 在开发机器上下载源代码 + +从 [GitHub](https://github.com/ApsaraDB/PolarDB-for-PostgreSQL) 上下载 PolarDB for PostgreSQL 的源代码,稳定分支为 `POLARDB_11_STABLE`。如果因网络原因不能稳定访问 GitHub,则可以访问 [Gitee 国内镜像](https://gitee.com/mirrors/PolarDB-for-PostgreSQL)。 + +:::: code-group +::: code-group-item GitHub + +```bash:no-line-numbers +git clone -b POLARDB_11_STABLE https://github.com/ApsaraDB/PolarDB-for-PostgreSQL.git +``` + +::: +::: code-group-item Gitee 国内镜像 + +```bash:no-line-numbers +git clone -b POLARDB_11_STABLE https://gitee.com/mirrors/PolarDB-for-PostgreSQL +``` + +::: +:::: + +代码克隆完毕后,进入源码目录: + +```bash:no-line-numbers +cd PolarDB-for-PostgreSQL/ +``` + +## 拉取开发镜像 + +从 DockerHub 上拉取 PolarDB for PostgreSQL 的 [开发镜像](https://hub.docker.com/r/polardb/polardb_pg_devel/tags)。 + +```bash +# 拉取 PolarDB 开发镜像 +docker pull polardb/polardb_pg_devel +``` + +## 创建并运行容器 + +此时我们已经在开发机器的源码目录中。从开发镜像上创建一个容器,将当前目录作为一个 volume 挂载到容器中,这样可以: + +- 在容器内的环境中编译源码 +- 在容器外(开发机器上)使用编辑器来查看或修改代码 + +```bash +docker run -it \ + -v $PWD:/home/postgres/polardb_pg \ + --shm-size=512m --cap-add=SYS_PTRACE --privileged=true \ + --name polardb_pg_devel \ + polardb/polardb_pg_devel \ + bash +``` + +进入容器后,为容器内用户获取源码目录的权限,然后编译部署 PolarDB 实例。 + +```bash +# 获取权限并编译部署 +cd polardb_pg +sudo chmod -R a+wr ./ +sudo chown -R postgres:postgres ./ +./polardb_build.sh + +# 验证 +psql -h 127.0.0.1 -c 'select version();' + version +-------------------------------- + PostgreSQL 11.9 (POLARDB 11.9) +(1 row) +``` diff --git a/docs/guide/tpch-on-px.md b/docs/features/tpch-on-px.md similarity index 99% rename from 
docs/guide/tpch-on-px.md rename to docs/features/tpch-on-px.md index 8a07daef3ee..bc3a64e6767 100644 --- a/docs/guide/tpch-on-px.md +++ b/docs/features/tpch-on-px.md @@ -6,7 +6,7 @@ ### 部署 PolarDB PG -在运行前默认已经通过 [前置文档](./db-localfs.md#本地多节点-htap-实例) 部署好本地多节点 HTAP 实例。 +在运行前默认已经通过 [前置文档](../deploying/db-localfs.md#本地多节点-htap-实例) 部署好本地多节点 HTAP 实例。 也可以直接从 DockerHub 上拉取 HTAP 实例镜像: diff --git a/docs/guide/backup-and-restore.md b/docs/operation/backup-and-restore.md similarity index 97% rename from docs/guide/backup-and-restore.md rename to docs/operation/backup-and-restore.md index 81e8e61b87a..c2d94fa3541 100644 --- a/docs/guide/backup-and-restore.md +++ b/docs/operation/backup-and-restore.md @@ -9,7 +9,7 @@ PolarDB 是基于共享存储的存算分离架构,因此 PolarDB 的备份恢 5. PolarDB 搭建 Standby 6. PolarDB 按时间点恢复 -前置条件是先准备一个 PolarDB 实例,可以参考文档 [搭建 PolarDB](./deploy.md)。 +前置条件是先准备一个 PolarDB 实例,可以参考文档 [搭建 PolarDB](../deploying/deploy.md)。 ## 备份恢复原理 @@ -223,7 +223,7 @@ Connection options: ### 使用 initdb 来搭建 RO -主要步骤是使用 initdb 初始化 RO 的 Local Dir 目录,然后修改配置文件,启动实例。具体请参考 [只读节点部署](./db-pfs.md#只读节点部署)。 +主要步骤是使用 `initdb` 初始化 RO 的本地存储目录,然后修改配置文件,启动实例。具体请参考 [只读节点部署](../deploying/db-pfs.md#只读节点部署)。 ### 备份 RW 的本地存储目录来搭建 RO @@ -303,7 +303,7 @@ polar_basebackup --host=[主节点所在IP] --port=5432 -D /home/postgres/standb ![undefined](https://intranetproxy.alipay.com/skylark/lark/0/2022/png/135683/1656315576998-2aaeff3e-4341-46df-bce1-eb211ea4c605.png) ::: tip -注意:这里是构建共享存储的 Standby,首先您需要找一台机器部署好 PolarDB 及其文件系统 PolarFS,且已经搭建好了共享存储`nvme0n2`, 具体操作请参考 [准备块设备与搭建文件系统](./deploy.md) +注意:这里是构建共享存储的 Standby,首先您需要找一台机器部署好 PolarDB 及其文件系统 PolarFS,且已经搭建好了共享存储`nvme0n2`, 具体操作请参考 [准备块设备与搭建文件系统](../deploying/deploy.md) ::: 备份完成后如下图所示: @@ -359,7 +359,7 @@ psql --host=[主节点所在IP] --port=5432 -d postgres -c 'SELECT * FROM pg_cr ### 修改 Standby 本地目录配置 -在 Standby 的 Local Dir 中 `recovery.conf` 文件中增加如下参数: +在 Standby 的本地存储目录中 `recovery.conf` 文件中增加如下参数: ```ini recovery_target_timeline = 'latest' diff --git a/docs/guide/tpcc-test.md b/docs/operation/tpcc-test.md similarity index 96% rename from docs/guide/tpcc-test.md rename to docs/operation/tpcc-test.md index ab9c189f36b..d396368cbf9 100644 --- a/docs/guide/tpcc-test.md +++ b/docs/operation/tpcc-test.md @@ -12,14 +12,14 @@ TPC-C 的具体说明和排名可以通过官方网站 [TPC-C 官网](https://ww ### 部署 PolarDB PG -在运行前默认已经通过文档 [PolarDB 编译部署:单机文件系统](./db-localfs.md) 部署好 PolarDB PG 的本地实例。 +在运行前默认已经通过文档 [PolarDB 编译部署:单机文件系统](../deploying/db-localfs.md) 部署好 PolarDB PG 的本地实例。 ### 安装 Java 和 Ant 由于 TPC-C 测试工具 benchmarksql 需要通过 Ant 来编译,所以需要安装 Java 和 Ant。这里安装的 Java 版本为 8.0[^java-install],Ant 版本为 1.9.7[^ant-install]。 ::: tip -安装 Java 和 Ant 的时候需要注意修改环境变量。 +安装 Java 和 Ant 的后需要修改环境变量。 ::: ```bash @@ -59,7 +59,7 @@ Apache Ant(TM) version 1.9.16 compiled on July 10 2021 我们将通过 benchmarksql 工具来进行 TPC-C 测试。 ::: tip -下面链接中的 benchmarksql 采用的是 5.1 版本。相较于 5.0 版本,5.1 版本可以用 Procedures 性能表现较好。推荐使用 5.1 版本。 +下面链接中的 benchmarksql 采用的是 5.1 版本。相较于 5.0 版本,5.1 版本可以使用 Procedures,性能表现较好。推荐使用 5.1 版本。 ::: ```bash diff --git a/docs/architecture/README.md b/docs/theory/arch-overview.md similarity index 95% rename from docs/architecture/README.md rename to docs/theory/arch-overview.md index 73c32238a04..d4cfa6a8bc6 100644 --- a/docs/architecture/README.md +++ b/docs/theory/arch-overview.md @@ -3,10 +3,12 @@ [[toc]] PolarDB PostgreSQL (hereafter simplified as PolarDB) is a stable, reliable, scalable, highly available, and secure enterprise-grade database service that is independently developed by Alibaba Cloud to help you increase security compliance and cost-effectiveness. 
PolarDB is compatible with PostgreSQL and Oracle. It runs in a proprietary compute-storage separation architecture of Alibaba Cloud to support the horizontal scaling of the storage and computing capabilities. + PolarDB can process a mix of online transaction processing (OLTP) workloads and online analytical processing (OLAP) workloads in parallel. PolarDB also provides a wide range of innovative multi-model database capabilities to help you process, analyze, and search for diversified data, such as spatio-temporal, GIS, image, vector, and graph data. + PolarDB supports various deployment architectures. For example, PolarDB supports compute-storage separation, three-node X-Paxos clusters, and local SSDs. -## Issues in conventional database systems +## Issues in Conventional Database Systems If you are using a conventional database system and the complexity of your workloads continues to increase, you may face the following challenges as the amount of your business data grows: @@ -18,6 +20,7 @@ If you are using a conventional database system and the complexity of your workl ## Benefits of PolarDB ![image.png](../imgs/1_polardb_architecture.png) + To help you resolve the issues that occur in conventional database systems, Alibaba Cloud provides PolarDB. PolarDB runs in a proprietary compute-storage separation architecture of Alibaba Cloud. This architecture has the following benefits: 1. Scalability: Computing is separated from storage. You can flexibly scale out the computing cluster or the storage cluster based on your business requirements. @@ -25,27 +28,24 @@ To help you resolve the issues that occur in conventional database systems, Alib 3. Easy to use: Each PolarDB cluster consists of one primary node and one or more read-only nodes to support read/write splitting. 4. Reliability: Data is stored in triplicate, and a backup can be finished in seconds. -# A guide to this document +## A Guide to This Document PolarDB is integrated with various technologies and innovations. This document describes the following two aspects of the PolarDB architecture in sequence: compute-storage separation and hybrid transactional/analytical processing (HTAP). You can find and read the content of your interest with ease. - Compute-storage separation is the foundation of the PolarDB architecture. Conventional database systems run in the shared-nothing architecture, in which each instance is allocated independent computing resources and storage resources. As conventional database systems evolve towards compute-storage separation, database engines developers face challenges in managing executors, transactions, and buffers. PolarDB is designed to help you address these challenges. - HTAP is designed to support OLAP queries in OLTP scenarios and fully utilize the computing capabilities of multiple read-only nodes. HTAP is achieved by using a shared storage-based massively parallel processing (MPP) architecture. In the shared storage-based MPP architecture, each table or index tree is stored as a whole and is not divided into virtual partitions that are stored on different nodes. This way, you can retain the workflows used in OLTP scenarios. In addition, you can use the shared storage-based MPP architecture without the need to modify your application data. -# Overview of the PolarDB architecture - This section explains the following two aspects of the PolarDB architecture: compute-storage separation and HTAP. 
-## Compute-storage separation +### Compute-Storage Separation ![image.png](../imgs/2_compute-storage_separation_architecture.png) + PolarDB supports compute-storage separation. Each PolarDB cluster consists of a computing cluster and a storage cluster. You can flexibly scale out the computing cluster or the storage cluster based on your business requirements. 1. If the computing power is insufficient, you can scale out only the computing cluster. 2. If the storage capacity is insufficient, you can scale out only the storage cluster. -​ - After the shared-storage architecture is used in PolarDB, the primary node and the read-only nodes share the same physical storage. If the primary node still uses the method that is used in conventional database systems to flush write-ahead logging (WAL) records, the following issues may occur. 1. The pages that the read-only nodes read from the shared storage are outdated pages. Outdated pages are pages that are of earlier versions than the versions that are recorded on the read-only nodes. @@ -54,21 +54,24 @@ After the shared-storage architecture is used in PolarDB, the primary node and t To resolve the first issue, PolarDB must support multiple versions for each page. To resolve the second issue, PolarDB must control the speed at which the primary node flushes WAL records. -## HTAP +### HTAP When read/write splitting is enabled, each individual compute node cannot fully utilize the high I/O throughput that is provided by the shared storage. In addition, you cannot accelerate large queries by adding computing resources. To resolve these issues, PolarDB uses the shared storage-based MPP architecture to accelerate OLAP queries in OLTP scenarios. + PolarDB supports a complete suite of data types that are used in OLTP scenarios. PolarDB also supports two computing engines, which can process these types of data: - Standalone execution engine: processes highly concurrent OLTP queries. - Distributed execution engine: processes large OLAP queries. ![image.png](../imgs/3_HTAP_architecture.png) + When the same hardware resources are used, PolarDB delivers performance that is 90% of the performance delivered by Greenplum. PolarDB also provides SQL statement-level scalability. If the computing power of your PolarDB cluster is insufficient, you can allocate more CPU resources to OLAP queries without the need to rearrange data. + The following sections provide more details about compute-storage separation and HTAP. -# PolarDB - Compute-storage separation +## PolarDB: Compute-Storage Separation -### Challenges of shared storage +### Challenges of Shared Storage Compute-storage separation enables the compute nodes of your PolarDB cluster to share the same physical storage. Shared storage brings the following challenges: @@ -77,9 +80,10 @@ Compute-storage separation enables the compute nodes of your PolarDB cluster to - High availability: how to perform recovery and failover. - I/O model: how to optimize the file system from buffered I/O to direct I/O. -### Basic principles of shared storage +### Basic Principles of Shared Storage ![image.png](../imgs/4_principles_of_shared_storage.png) + The following basic principles of shared storage apply to PolarDB: - The primary node can process read requests and write requests. The read-only nodes can process only read requests. 
@@ -88,24 +92,25 @@ The following basic principles of shared storage apply to PolarDB: - The primary node writes WAL records to the shared storage, and only the metadata of the WAL records is replicated to the read-only nodes. - The read-only nodes read WAL records from the shared storage and apply the WAL records. -### Data consistency +### Data Consistency -#### In-memory page synchronization in the shared-nothing architecture +#### In-memory Page Synchronization in Shared-nothing Architecture In a conventional database system, the primary instance and read-only instances each are allocated independent memory resources and storage resources. The primary instance replicates WAL records to the read-only instances, and the read-only instances read and apply the WAL records. These basic principles also apply to replication state machines. -#### In-memory page synchronization in the shared-storage architecture +#### In-memory Page Synchronization in Shared-storage Architecture In a PolarDB cluster, the primary node replicates WAL records to the shared storage. The read-only nodes read and apply the most recent WAL records from the shared storage to ensure that the pages in the memory of the read-only nodes are synchronous with the pages in the memory of the primary node. + ![image.png](../imgs/5_In-memory_page_synchronization.png) 1. The primary node flushes the WAL records of a page to write version 200 of the page to the shared storage. 2. The read-only nodes read and apply the WAL records of the page to update the page from version 100 to version 200. -#### Outdated pages in the shared-storage architecture +#### Outdated Pages in Shared-storage Architecture + +In the workflow shown in the preceding figure, the new page that the read-only nodes obtain by applying WAL records is removed from the buffer pools of the read-only nodes. When you query the page on the read-only nodes, the read-only nodes read the page from the shared storage. As a result, only the previous version of the page is returned. This previous version is called an outdated page. The following figure shows more details. -In the workflow shown in the preceding figure, the new page that the read-only nodes obtain by applying WAL records is removed from the buffer pools of the read-only nodes. When you query the page on the read-only nodes, the read-only nodes read the page from the shared storage. As a result, only the previous version of the page is returned. This previous version is called an outdated page. -The following figure shows more details. ![image.png](../imgs/6_outdated_pages.png) 1. At T1, the primary node writes a WAL record with a log sequence number (LSN) of 200 to the memory to update Page 1 from version 500 to version 600. @@ -116,18 +121,16 @@ The following figure shows more details. 6. The primary node does not write version 600 of Page 1 to the shared storage. The most recent version of Page 1 in the shared storage is still version 500. 7. At T5, you query Page 1 on the read-only nodes. The read-only nodes read Page 1 from the shared storage because Page 1 has been removed from the memory of the read-only nodes. In this case, the outdated version 500 of Page 1 is returned. -##### Solution to outdated pages +#### Solution to Outdated Pages + +When you query a page on the read-only nodes at a specific point in time, the read-only nodes need to read the base version of the page and the WAL records up to that point in time. Then, the read-only nodes need to apply the WAL records one by one in sequence. 
The following figure shows more details. -When you query a page on the read-only nodes at a specific point in time, the read-only nodes need to read the base version of the page and the WAL records up to that point in time. Then, the read-only nodes need to apply the WAL records one by one in sequence. -The following figure shows more details. ![image.png](../imgs/7_solution_to_outdated_pages.png) 1. The metadata of the WAL records of each page is retained in the memory of the read-only nodes. 2. When you query a page on the read-only nodes, the read-only nodes need to read and apply the WAL records of the page until the read-only nodes obtain the most recent version of the page. 3. The read-only nodes read and apply WAL records from the shared storage based on the metadata of the WAL records. -​ - PolarDB needs to maintain an inverted index that stores the mapping from each page to the WAL records of the page. However, the memory capacity of each read-only node is limited. Therefore, these inverted indexes must be persistently stored. To meet this requirement, PolarDB provides LogIndex. LogIndex is an index structure, which is used to persistently store hash data. 1. The WAL receiver processes of the read-only nodes receive the metadata of WAL records from the primary node. @@ -138,11 +141,13 @@ PolarDB needs to maintain an inverted index that stores the mapping from each pa 6. When the memory usage of the read-only nodes reaches a specific threshold, the hash data that is stored in LogIndex structures is asynchronously flushed from the memory to the disk. ![image.png](../imgs/8_solution_to_outdated_pages_LogIndex.png) + LogIndex helps prevent outdated pages and enable the read-only nodes to run in lazy log apply mode. In the lazy log apply mode, the read-only nodes apply only the metadata of the WAL records for dirty pages. -#### Future pages in the shared-storage architecture +#### Future Pages in Shared-storage Architecture The read-only nodes may return future pages, whose versions are later than the versions that are recorded on the read-only nodes. The following figure shows more details. + ![image.png](../imgs/9_future_pages.png) 1. At T1, the primary node updates Page 1 twice from version 500 to version 700. Two WAL records are generated during the update process. The LSN of one WAL record is 200, and the LSN of the other WAL record is 300. At this time, Page 1 is still in version 500 on the primary node and the read-only nodes. @@ -152,26 +157,25 @@ The read-only nodes may return future pages, whose versions are later than the v 5. At T5, the read-only nodes attempt to read Page 1 again. Page 1 cannot be found in the buffer pools of the read-only nodes. Therefore, the read-only nodes obtain version 700 of Page 1 from the shared storage. Version 700 of Page 1 is a future page to the read-only nodes because the read-only nodes have not read or applied WAL Record 300. 6. If some of the pages that the read-only nodes obtain from the shared storage are future pages and some are normal pages, data inconsistencies may occur. For example, after an index block is split into two indexes that each map a page, one of the pages the read-only nodes read is a normal page and the other is a future page. In this case, the B+ tree structures of the indexes are damaged. -#### Solutions to future pages +#### Solutions to Future Pages + +The read-only nodes apply WAL records at high speeds in lazy apply mode. 
However, the speeds may still be lower than the speed at which the primary node flushes WAL records. If the primary node flushes WAL records faster than the read-only nodes apply WAL records, future pages are returned. To prevent future pages, PolarDB must ensure that the speed at which the primary node flushes WAL records does not exceed the speeds at which the read-only nodes apply WAL records. The following figure shows more details. -The read-only nodes apply WAL records at high speeds in lazy apply mode. However, the speeds may still be lower than the speed at which the primary node flushes WAL records. If the primary node flushes WAL records faster than the read-only nodes apply WAL records, future pages are returned. To prevent future pages, PolarDB must ensure that the speed at which the primary node flushes WAL records does not exceed the speeds at which the read-only nodes apply WAL records. -The following figure shows more details. ![image.png](../imgs/10_solutions_to_future_pages.png) 1. The read-only nodes apply the WAL record that is generated at T4. 2. When the primary node flushes WAL records to the shared storage, it sorts all WAL records by LSN and flushes only the WAL records that are updated up to T4. 3. The file position of the LSN that is generated at T4 is defined as the file position of consistency. -### Low-latency replication +### Low-latency Replication -#### Issues of conventional streaming replication +#### Issues of Conventional Streaming Replication 1. The I/O loads on the log synchronization link are heavy, and a large amount of data is transmitted over the network. 2. When the read-only nodes process I/O-bound workloads or CPU-bound workloads, they read pages and modify the pages in their buffer pools at low speeds. 3. When file- and data-related DDL operations attempt to acquire locks on specific objects, blocking exceptions may occur. As a result, the operations are run at low speeds. -4. When the read-only nodes process highly concurrent queries, transaction snapshots are taken at low speeds. +4. When the read-only nodes process highly concurrent queries, transaction snapshots are taken at low speeds. The following figure shows more details. -The following figure shows more details. ![image.png](../imgs/11_issues_of_conventional_streaming_replication.png) 1. The primary node writes WAL records to its local file system. @@ -182,7 +186,7 @@ The following figure shows more details. The full path is long, and the latency on the read-only nodes is high. This may cause an imbalance between the read loads and write loads over the read/write splitting link. -#### Optimization Method 1 - Replicate only the metadata of WAL records +#### Optimization Method 1: Replicate Only the Metadata of WAL Records The read-only nodes can read WAL records from the shared storage. Therefore, the primary node can remove the payloads of WAL records and send only the metadata of WAL records to the read-only nodes. This alleviates the pressure on network transmission and reduces the I/O loads on critical paths. The following figure shows more details. @@ -191,12 +195,15 @@ The read-only nodes can read WAL records from the shared storage. Therefore, the 3. The read-only nodes read WAL records from the shared storage based on the metadata of the WAL records. ![image.png](../imgs/12_Replicate_only_metadata_of_WAL_records.png) + This optimization method significantly reduces the amount of data that needs to be transmitted between the primary node and the read-only nodes. 
The amount of data that needs to be transmitted decreases by 98%, as shown in the following figure. + ![image.png](../imgs/13_optimization1_result.png) -#### Optimization Method 2 - Optimize the log apply of WAL records +#### Optimization Method 2: Optimize the Log Apply of WAL Records Conventional database systems need to read a large number of pages, apply WAL records to these pages one by one, and then flush the updated pages to the disk. To reduce the read I/O loads on critical paths, PolarDB supports compute-storage separation. If the page that you query on the read-only nodes cannot be hit in the buffer pools of the read-only nodes, no I/O loads are generated and only LogIndex records are recorded. + The following I/O operations that are performed by log apply processes can be offloaded to session processes: 1. Data page-related I/O operations @@ -204,6 +211,7 @@ The following I/O operations that are performed by log apply processes can be of 3. I/O operations to apply multiple versions of pages based on LogIndex records In the example shown in the following figure, when the log apply process of a read-only node applies the metadata of a WAL record of a page: + ![image.png](../imgs/14_optimize_log_apply_of_WAL_records.png) 1. If the page cannot be hit in the memory, only the LogIndex record that maps the WAL record is recorded. @@ -212,31 +220,39 @@ In the example shown in the following figure, when the log apply process of a re 4. Major I/O operations are no longer run by a single log apply process. These operations are offloaded to multiple user processes. This optimization method significantly reduces the log apply latency and increases the log apply speed by 30 times compared with Amazon Aurora. + ![image.png](../imgs/15_optimization2_result.png) -#### Optimization Method 3 - Optimize the log apply of DDL locks +#### Optimization Method 3: Optimize the Log Apply of DDL Locks When the primary node runs a DDL operation such as DROP TABLE to modify a table, the primary node acquires an exclusive DDL lock on the table. The exclusive DDL lock is replicated to the read-only nodes along with WAL records. The read-only nodes apply the WAL records to acquire the exclusive DDL lock on the table. This ensures that the table cannot be deleted by the primary node when a read-only node is reading the table. Only one copy of the table is stored in the shared storage. + When the applying process of a read-only node applies the exclusive DDL lock, the read-only node may require a long period of time to acquire the exclusive DDL lock on the table. You can optimize the critical path of the log apply process by offloading the task of acquiring the exclusive DDL lock to other processes. + ![image.png](../imgs/16_optimize_log_apply_of_DDL_locks.png) + This optimization method ensures that the critical path of the log apply process of a read-only node is not blocked even if the log apply process needs to wait for the release of an exclusive DDL lock. + ![image.png](../imgs/17_optimization3_result.png) + The three optimization methods in combination significantly reduce replication latency and have the following benefits: - Read/write splitting: Loads are balanced, which allows PolarDB to deliver user experience that is comparable to Oracle Real Application Clusters (RAC). - High availability: The time that is required for failover is reduced. - Stability: The number of future pages is minimized, and fewer or even no page snapshots need to be taken. 
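To make the mechanisms above concrete — metadata-only WAL shipping, LogIndex-driven lazy replay on the read-only nodes, and flush throttling on the primary node — here is a minimal Python sketch. It is an illustrative model only, not PolarDB's actual C implementation; the names `SharedStorage`, `LogIndex`, `ReadOnlyNode`, `PrimaryNode`, and `consistent_lsn` are invented for this example. A read-only node replays a page on demand from the WAL records that its LogIndex maps to that page (so it never serves an outdated page), and the primary node flushes a page to shared storage only up to the lowest LSN that every read-only node has indexed (so a replica can never read a future page).

```python
from collections import defaultdict


class SharedStorage:
    """One shared copy of every page, plus the WAL stream."""

    def __init__(self):
        self.pages = {}  # page_id -> (lsn, payload)
        self.wal = {}    # lsn -> (page_id, payload)


class LogIndex:
    """Per-replica map: page_id -> LSNs of WAL records not yet applied to that page."""

    def __init__(self):
        self.pending = defaultdict(list)

    def add(self, page_id, lsn):
        self.pending[page_id].append(lsn)


class ReadOnlyNode:
    def __init__(self, storage):
        self.storage = storage
        self.buffer = {}      # local buffer pool: page_id -> (lsn, payload)
        self.logindex = LogIndex()
        self.applied_lsn = 0  # WAL metadata has been indexed up to this LSN

    def receive_metadata(self, lsn, page_id):
        # WAL receiver: record only (page_id, lsn); no page I/O on the critical path.
        self.logindex.add(page_id, lsn)
        self.applied_lsn = lsn

    def read(self, page_id):
        # Start from the buffered version, else the (possibly old) shared-storage version.
        lsn, payload = self.buffer.get(page_id, self.storage.pages.get(page_id, (0, None)))
        # Lazy replay: apply exactly the WAL records this page is missing, in LSN order.
        for wal_lsn in sorted(self.logindex.pending[page_id]):
            if wal_lsn > lsn:
                _, payload = self.storage.wal[wal_lsn]
                lsn = wal_lsn
        self.buffer[page_id] = (lsn, payload)
        return lsn, payload


class PrimaryNode:
    def __init__(self, storage, replicas):
        self.storage = storage
        self.replicas = replicas
        self.next_lsn = 100

    def write(self, page_id, payload):
        lsn, self.next_lsn = self.next_lsn, self.next_lsn + 100
        self.storage.wal[lsn] = (page_id, payload)  # the full WAL record goes to shared storage
        for r in self.replicas:
            r.receive_metadata(lsn, page_id)        # only the metadata is shipped to replicas
        return lsn

    def flush(self, page_id, lsn, payload):
        # Flush throttling: never persist a page version newer than what every
        # read-only node has already indexed, so no replica can see a "future page".
        consistent_lsn = min(r.applied_lsn for r in self.replicas)
        if lsn <= consistent_lsn:
            self.storage.pages[page_id] = (lsn, payload)


if __name__ == "__main__":
    storage = SharedStorage()
    ro = ReadOnlyNode(storage)
    rw = PrimaryNode(storage, [ro])
    rw.write("page-1", "v600")
    lsn = rw.write("page-1", "v700")
    print(ro.read("page-1"))         # (200, 'v700'): replayed on demand, never outdated
    rw.flush("page-1", lsn, "v700")  # allowed only because the replica has indexed LSN 200
```

In the real system the per-page LSN lists are hash structures that are asynchronously spilled to disk when memory fills, and the replay work is offloaded to backend (session) processes instead of running on the WAL-apply critical path.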
-### Recovery optimization +### Recovery Optimization -#### Background information +#### Background Information If the read-only nodes apply WAL records at low speeds, your PolarDB cluster may require a long period of time to recover from exceptions such as out of memory (OOM) errors and unexpected crashes. When the direct I/O model is used for the shared storage, the severity of this issue increases. + ![image.png](../imgs/18_recovery_optimization_background.png) -#### Lazy recovery +#### Lazy Recovery The preceding sections explain how LogIndex enables the read-only nodes to apply WAL records in lazy log apply mode. In general, the recovery process of the primary node after a restart is the same as the process in which the read-only nodes apply WAL records. In this sense, the lazy log apply mode can also be used to accelerate the recovery of the primary node. + ![image.png](../imgs/19_lazy_recovery.png) 1. The primary node begins to apply WAL records in lazy log apply mode one by one starting from a specific checkpoint. @@ -245,27 +261,33 @@ The preceding sections explain how LogIndex enables the read-only nodes to apply 4. The actual log apply workloads are offloaded to the session process that is started after the primary node restarts. The example in the following figure shows how the optimized recovery method significantly reduces the time that is required to apply 500 MB of WAL records. + ![image.png](../imgs/20_recovery_optimization_result.png) -#### Persistent buffer pool +#### Persistent Buffer Pool After the primary node recovers, a session process may need to apply the pages that the session process reads. When a session process is applying pages, the primary node responds at low speeds for a short period of time. To resolve this issue, PolarDB does not delete pages from the buffer pool of the primary node if the primary node restarts or unexpectedly crashes. + ![image.png](../imgs/21_Persistent_BufferPool.png) + The shared memory of the database engine consists of the following two parts: 1. One part is used to store global structures and ProcArray structures. 2. The other part is used to store buffer pool structures. The buffer pool is allocated as a specific amount of named shared memory. Therefore, the buffer pool remains valid after the primary node restarts. However, global structures need to be reinitialized after the primary node restarts. ![image.png](../imgs/22_buffer_pool_structure.png) + Not all pages in the buffer pool of the primary node can be reused. For example, if a process acquires an exclusive lock on a page before the primary node restarts and then unexpectedly crashes, no other processes can release the exclusive lock on the page. Therefore, after the primary node unexpectedly crashes or restarts, it needs to traverse all pages in its buffer pool to identify and remove the pages that cannot be reused. In addition, the recycling of buffer pools depends on Kubernetes. + This optimized buffer pool mechanism ensures the stable performance of your PolarDB cluster before and after a restart. + ![image.png](../imgs/23_persistent_buffer_pool_result.png) -# PolarDB - HTAP +## PolarDB HTAP The shared storage of PolarDB is organized as a storage pool. When read/write splitting is enabled, the theoretical I/O throughput that is supported by the shared storage is infinite. However, large queries can be run only on individual compute nodes, and the CPU, memory, and I/O specifications of a single compute node are limited. 
Therefore, a single compute node cannot fully utilize the high I/O throughput that is supported by the shared storage or accelerate large queries by acquiring more computing resources. To resolve these issues, PolarDB uses the shared storage-based MPP architecture to accelerate OLAP queries in OLTP scenarios. -## Basic principles of HTAP +### Basic Principles of HTAP In a PolarDB cluster, the physical storage is shared among all compute nodes. Therefore, you cannot use the method of scanning tables in conventional MPP databases to scan tables in PolarDB clusters. PolarDB supports MPP on standalone execution engines and provides optimized shared storage. This shared storage-based MPP architecture is the first architecture of its kind in the industry. We recommend that you familiarize yourself with the following basic principles of this architecture before you use PolarDB: @@ -274,23 +296,29 @@ This shared storage-based MPP architecture is the first architecture of its kind 2. The ParallelScan operator masks the shared storage. ![image.png](../imgs/24_principles_of_HTAP.png) + The preceding figure shows an example. 1. Table A and Table B are joined and aggregated. 2. Table A and Table B are still individual tables in the shared storage. These tables are not physically partitioned. 3. Four types of scan operators are redesigned to scan tables in the shared storage as virtual partitions. -## Distributed optimizer +### Distributed Optimizer The GPORCA optimizer is extended to provide a set of transformation rules that can recognize shared storage. The extended optimizer enables PolarDB to explore the plan search space that is unique to the shared storage. For example, PolarDB can scan a table as a whole or as different virtual partitions. This is a major difference between shared storage-based MPP and conventional MPP. + The modules in gray in the upper part of the following figure are modules of the database engine. These modules enable the database engine of PolarDB to adapt to the GPORCA optimizer. + The modules in the lower part of the following figure comprise the GPORCA optimizer. Among these modules, the modules in gray are extended modules, which enable the GPORCA optimizer to communicate with the shared storage of PolarDB. + ![image.png](../imgs/25_distributed_optimizer.png) -## Parallelism of operators +### Parallelism of Operators Four types of operators in PolarDB require parallelism. This section describes how to enable parallelism for operators that are used to run sequential scans. To fully utilize the I/O throughput that is supported by the shared storage, PolarDB splits each table into logical units during a sequential scan. Each unit contains 4 MB of data. This way, PolarDB can distribute I/O loads to different disks, and the disks can simultaneously scan data to accelerate the sequential scan. In addition, each read-only node needs to scan only specific tables rather than all tables. The size of tables that can be cached is the total size of the buffer pools of all read-only nodes. + ![image.png](../imgs/26_parallelism_of_operators.png) + Parallelism has the following benefits, as shown in the following figure: 1. You can increase scan performance by 30 times by creating read-only nodes.
@@ -298,7 +326,7 @@ Parallelism has the following benefits, as shown in the following figure: ![image.png](../imgs/27_parallelism_of_operators_result.png) -## Solve the issue of data skew +### Solve the Issue of Data Skew Data skew is a common issue in conventional MPP: @@ -317,51 +345,65 @@ Data skew is a common issue in conventional MPP: Although scan tasks are dynamically distributed, buffer affinity is preserved as much as possible. In addition, the context of each operator is stored in the private memory of the worker threads. The coordinator node does not store the information about specific tables. In the example shown in the following table, PolarDB uses static sharding to shard large objects. During the static sharding process, data skew occurs, but the performance of dynamic scanning can still linearly increase. + ![image.png](../imgs/29_Solve_data_skew_result.png) -## SQL statement-level scalability +### SQL Statement-level Scalability Data sharing helps deliver ultimate scalability in cloud-native environments. The full path of the coordinator node involves various modules, and PolarDB can store the external dependencies of these modules to the shared storage. In addition, the full path of a worker thread involves a number of operational parameters, and PolarDB can synchronize these parameters from the coordinator node over the control path. This way, the coordinator node and the worker thread are stateless. + ![image.png](../imgs/30_SQL_statement-level_scalability.png) + The following conclusions are made based on the preceding analysis: 1. All read-only nodes that run SQL joins can function as coordinator nodes. Therefore, the performance of PolarDB is no longer limited due to the availability of only a single coordinator node. 2. Each SQL statement can start any number of worker threads on any compute node. This increases the computing power and allows you to schedule your workloads in a more flexible manner. You can configure PolarDB to simultaneously run different kinds of workloads on different compute nodes. - ![image.png](../imgs/31_schedule_workloads.png) -## Transactional consistency +![image.png](../imgs/31_schedule_workloads.png) + +### Transactional Consistency The log apply wait mechanism and the global snapshot mechanism are used to ensure data consistency among multiple compute nodes. The log apply wait mechanism ensures that all worker threads can obtain the most recent version of each page. The global snapshot mechanism ensures that a unified version of each page can be selected. + ![image.png](../imgs/32_transactional_consistency.png) -## TPC-H performance - Speedup +### TPC-H Performance: Speedup ![image.png](../imgs/33_TPC-H_performance_Speedup1.png) + A total of 1 TB of data is used for TPC-H testing. First, run 22 SQL statements in a PolarDB cluster and in a conventional database system. The PolarDB cluster supports distributed parallelism, and the conventional database system supports standalone parallelism. The test result shows that the PolarDB cluster executes three SQL statements at speeds that are 60 times higher and 19 statements at speeds that are 10 times higher than the conventional database system. + ![image.png](../imgs/34_TPC-H_performance_Speedup2.png) + ![image.png](../imgs/35_TPC-H_performance_Speedup3.png) + Then, run a TPC-H test by using a distributed execution engine. The test result shows that the speed at which each of the 22 SQL statements runs linearly increases as the number of cores increases from 16 to 128.
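The near-linear scaling shown above comes from the dynamic distribution of 4 MB logical scan units described in the data-skew section: the coordinator hands the next unit to whichever worker asks for it, instead of assigning fixed shards up front. The following C sketch illustrates that dispatch loop at a conceptual level only; `scan_unit_dispenser` and `next_scan_unit` are invented names and this is not PolarDB's actual implementation.

```c
#include <stdint.h>
#include <stdbool.h>
#include <pthread.h>

#define SCAN_UNIT_BYTES (4u * 1024 * 1024)   /* one 4 MB logical scan unit */

/* Hypothetical dispenser shared by the coordinator and all scan workers. */
typedef struct scan_unit_dispenser
{
    pthread_mutex_t lock;
    uint64_t        next_offset;     /* start of the next unassigned unit */
    uint64_t        table_size;      /* total table size on shared storage */
} scan_unit_dispenser;

/*
 * Each worker calls this whenever it finishes its current unit. Because
 * units are handed out on demand, a worker that hits a slow disk or a large
 * object simply ends up taking fewer units, so skewed data does not create
 * a long-tail straggler.
 */
static bool
next_scan_unit(scan_unit_dispenser *d, uint64_t *start, uint64_t *len)
{
    bool have_work = false;

    pthread_mutex_lock(&d->lock);
    if (d->next_offset < d->table_size)
    {
        *start = d->next_offset;
        *len = SCAN_UNIT_BYTES;
        if (*start + *len > d->table_size)
            *len = d->table_size - *start;
        d->next_offset += *len;
        have_work = true;
    }
    pthread_mutex_unlock(&d->lock);

    return have_work;
}
```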
-​ -## TPC-H performance - Comparison with Greenplum +### TPC-H Performance: Comparison with Greenplum When 16 nodes are configured, PolarDB delivers performance that is 90% of the performance delivered by MPP-based Greenplum. + ![image.png](../imgs/36_TPC-H_performance_Comparison_with_Greenplum1.png) + ![image.png](../imgs/37_TPC-H_performance_Comparison_with_Greenplum2.png) + As mentioned earlier, the distributed execution engine of PolarDB supports scalability, and data in PolarDB does not need to be redistributed. When the degree of parallelism (DOP) is 8, PolarDB delivers performance that is 5.6 times the performance delivered by Greenplum. -## Index creation accelerated by distributed execution +### Index Creation Accelerated by Distributed Execution A large number of indexes are created in OLTP scenarios. The workloads that you run to create these indexes are divided into two parts: 80% of the workloads are run to sort and create index pages, and 20% of the workloads are run to write index pages. Distributed execution accelerates the process of sorting indexes and supports the batch writing of index pages. + ![image.png](../imgs/38_Index_creation_accelerated_by_PX.png) Distributed execution accelerates the creation of indexes by four to five times. + ![image.png](../imgs/39_Index_creation_accelerated_by_PX2.png) -## Multi-model spatio-temporal databases accelerated by distributed, parallel execution +### Multi-model Spatio-temporal Database Accelerated by Distributed, Parallel Execution PolarDB is a multi-model database service that supports spatio-temporal data. PolarDB runs CPU-bound workloads and I/O-bound workloads. These workloads can be accelerated by distributed execution. The shared storage of PolarDB supports scans on shared R-tree indexes. + ![image.png](../imgs/40_spatio-temporal_databases.png) - Data volume: 400 million data records, which amount to 500 GB in total @@ -372,7 +414,7 @@ PolarDB is a multi-model database service that supports spatio-temporal data. Po ![image.png](../imgs/41_spatio-temporal_databases_result.png) -# Summary +## Summary This document describes the crucial technologies that are used in the PolarDB architecture: diff --git a/docs/architecture/buffer-management.md b/docs/theory/buffer-management.md similarity index 100% rename from docs/architecture/buffer-management.md rename to docs/theory/buffer-management.md diff --git a/docs/architecture/ddl-synchronization.md b/docs/theory/ddl-synchronization.md similarity index 100% rename from docs/architecture/ddl-synchronization.md rename to docs/theory/ddl-synchronization.md diff --git a/docs/architecture/logindex.md b/docs/theory/logindex.md similarity index 100% rename from docs/architecture/logindex.md rename to docs/theory/logindex.md diff --git a/docs/zh/README.md b/docs/zh/README.md index c95fce36c07..6d26569d88c 100644 --- a/docs/zh/README.md +++ b/docs/zh/README.md @@ -2,26 +2,14 @@ home: true title: 文档 heroImage: /images/polardb.png -actions: - - text: 快速上手 - link: /zh/guide/quick-start.html - type: primary - - text: 架构解读 - link: /zh/architecture/ - type: secondary -features: - - title: 极致弹性 - details: 存储与计算节点均可独立地横向扩展。 - - title: 毫秒级节点间延迟 - details: 基于 LogIndex 的延迟回放和并行回放。 - - title: HTAP 能力 - details: 基于共享存储的分布式并行执行框架。 footer: Apache 2.0 Licensed | Copyright © Alibaba Group, Inc. 
--- +--- + ### 通过 Docker 快速使用 -从 DockerHub 上拉取 PolarDB for PostgreSQL 的 [本地存储实例镜像](https://hub.docker.com/r/polardb/polardb_pg_local_instance/tags),创建、运行并进入容器,然后直接使用 PolarDB 实例: +从 DockerHub 上拉取 PolarDB for PostgreSQL 的 [本地存储实例镜像](https://hub.docker.com/r/polardb/polardb_pg_local_instance/tags),创建、运行并进入容器,然后直接使用 PolarDB: :::: code-group ::: code-group-item 单节点实例 @@ -75,29 +63,62 @@ psql -h 127.0.0.1 -c 'select version();' ::: :::: -### 通过 Docker 快速开发 +
-从 DockerHub 上拉取 PolarDB for PostgreSQL 的 [开发镜像](https://hub.docker.com/r/polardb/polardb_pg_devel/tags),创建、运行并进入容器:
+<!-- 首页导航卡片：部署指南 -->
-```bash
-# 拉取 PolarDB 开发镜像
-docker pull polardb/polardb_pg_devel
-# 创建、运行并进入容器
-docker run -it --cap-add=SYS_PTRACE --privileged=true --name polardb_pg_devel polardb/polardb_pg_devel bash
-```
+<!-- 首页导航卡片：使用与运维 -->
-进入容器后,从 GitHub 拉取最新的稳定代码,快速编译部署最简单的 PolarDB 实例并进行验证:
+<!-- 首页导航卡片：特性实践 -->
-```bash
-# 代码拉取
-git clone -b POLARDB_11_STABLE https://github.com/ApsaraDB/PolarDB-for-PostgreSQL.git
-cd PolarDB-for-PostgreSQL
-# 编译部署
-./polardb_build.sh
-# 验证
-psql -h 127.0.0.1 -c 'select version();'
-            version
---------------------------------
- PostgreSQL 11.9 (POLARDB 11.9)
-(1 row)
-```
+<!-- 首页导航卡片：原理解读 -->
+<!-- 首页导航卡片：社区贡献 -->
diff --git a/docs/guide/db-localfs.md b/docs/zh/deploying/db-localfs.md similarity index 99% rename from docs/guide/db-localfs.md rename to docs/zh/deploying/db-localfs.md index cc0dc0afbc1..fbef8fb3b96 100644 --- a/docs/guide/db-localfs.md +++ b/docs/zh/deploying/db-localfs.md @@ -1,4 +1,4 @@ -# PolarDB 编译部署:单机文件系统 +# 编译部署:基于单机文件系统 本文将指导您在单机文件系统(如 ext4)上编译部署 PolarDB,适用于所有计算节点都可以访问相同本地磁盘存储的场景。 diff --git a/docs/guide/db-pfs.md b/docs/zh/deploying/db-pfs.md similarity index 99% rename from docs/guide/db-pfs.md rename to docs/zh/deploying/db-pfs.md index eaceda862b3..f4daa0453cc 100644 --- a/docs/guide/db-pfs.md +++ b/docs/zh/deploying/db-pfs.md @@ -1,4 +1,4 @@ -# PolarDB 编译部署:PFS 文件系统 +# 编译部署:基于 PFS 文件系统 本文将指导您在分布式文件系统 PolarDB File System(PFS)上编译部署 PolarDB,适用于已经在共享存储上格式化并挂载 PFS 的计算节点。 diff --git a/docs/zh/deploying/deploy-official.md b/docs/zh/deploying/deploy-official.md new file mode 100644 index 00000000000..48484d5e5cc --- /dev/null +++ b/docs/zh/deploying/deploy-official.md @@ -0,0 +1,3 @@ +# 阿里云官网购买实例 + +阿里云官网直接提供了可供购买的 [云原生关系型数据库 PolarDB PostgreSQL 引擎](https://www.aliyun.com/product/polardb)。 diff --git a/docs/zh/guide/deploy-more.md b/docs/zh/deploying/deploy-stack.md similarity index 66% rename from docs/zh/guide/deploy-more.md rename to docs/zh/deploying/deploy-stack.md index d0bb1177564..0ce722f4c97 100644 --- a/docs/zh/guide/deploy-more.md +++ b/docs/zh/deploying/deploy-stack.md @@ -1,13 +1,7 @@ -# 更多部署方式 - -## 基于 PolarDB Stack 共享存储 +# 基于 PolarDB Stack 共享存储 PolarDB Stack 是轻量级 PolarDB PaaS 软件。基于共享存储提供一写多读的 PolarDB 数据库服务,特别定制和深度优化了数据库生命周期管理。通过 PolarDB Stack 可以一键部署 PolarDB-for-PostgreSQL 内核和 PolarDB-FileSystem。 PolarDB Stack 架构如下图所示,进入 [PolarDB Stack 的部署文档](https://github.com/ApsaraDB/PolarDB-Stack-Operator/blob/master/README.md) ![PolarDB Stack arch](../imgs/63-PolarDBStack-arch.png) - -## 阿里云官网购买实例 - -阿里云官网直接提供了可供购买的 [云原生关系型数据库 PolarDB PostgreSQL 引擎](https://www.aliyun.com/product/polardb)。 diff --git a/docs/zh/guide/deploy.md b/docs/zh/deploying/deploy.md similarity index 100% rename from docs/zh/guide/deploy.md rename to docs/zh/deploying/deploy.md diff --git a/docs/zh/guide/fs-pfs.md b/docs/zh/deploying/fs-pfs.md similarity index 98% rename from docs/zh/guide/fs-pfs.md rename to docs/zh/deploying/fs-pfs.md index c7c3563e0f9..83bdf929b7b 100644 --- a/docs/zh/guide/fs-pfs.md +++ b/docs/zh/deploying/fs-pfs.md @@ -1,4 +1,4 @@ -# 格式化并挂载 PFS +# 格式化并挂载 PolarDB File System PolarDB File System,简称 PFS 或 PolarFS,是由阿里云自主研发的高性能类 POSIX 的用户态分布式文件系统,服务于阿里云数据库 PolarDB 产品。使用 PFS 对共享存储进行格式化并挂载后,能够保证一个计算节点对共享存储的写入能够立刻对另一个计算节点可见。 diff --git a/docs/zh/guide/introduction.md b/docs/zh/deploying/introduction.md similarity index 97% rename from docs/zh/guide/introduction.md rename to docs/zh/deploying/introduction.md index 5c253a425d2..e576502469d 100644 --- a/docs/zh/guide/introduction.md +++ b/docs/zh/deploying/introduction.md @@ -1,4 +1,4 @@ -# PolarDB 架构简介 +# 架构简介 PolarDB for PostgreSQL 采用了基于 Shared-Storage 的存储计算分离架构。数据库由传统的 Share-Nothing 架构,转变成了 Shared-Storage 架构——由原来的 N 份计算 + N 份存储,转变成了 N 份计算 + 1 份存储;而 PostgreSQL 使用了传统的单体数据库架构,存储和计算耦合在一起。 diff --git a/docs/zh/guide/quick-start.md b/docs/zh/deploying/quick-start.md similarity index 73% rename from docs/zh/guide/quick-start.md rename to docs/zh/deploying/quick-start.md index 05d69c5ca02..e39d7b65d7a 100644 --- a/docs/zh/guide/quick-start.md +++ b/docs/zh/deploying/quick-start.md @@ -1,4 +1,4 @@ -# 快速上手 +# 快速部署 仅需单台计算机,同时满足以下要求,就可以快速开启您的 PolarDB 之旅: @@ -12,8 +12,6 @@ - Fedora:[在 Fedora 上安装 Docker 
Engine](https://docs.docker.com/engine/install/fedora/) - macOS(支持 M1 芯片):[在 Mac 上安装 Docker Desktop](https://docs.docker.com/desktop/mac/install/),并建议将内存调整为 4GB 以上 -## 快速体验 - 从 DockerHub 上拉取 PolarDB for PostgreSQL 的 [本地存储实例镜像](https://hub.docker.com/r/polardb/polardb_pg_local_instance/tags),创建、运行并进入容器,然后直接使用 PolarDB 实例: :::: code-group @@ -67,33 +65,3 @@ psql -h 127.0.0.1 -c 'select version();' ::: :::: - -## 快速开发 - -从 DockerHub 上拉取 PolarDB for PostgreSQL 的 [开发镜像](https://hub.docker.com/r/polardb/polardb_pg_devel/tags),创建、运行并进入容器: - -```bash -# 拉取 PolarDB 开发镜像 -docker pull polardb/polardb_pg_devel -# 创建、运行并进入容器 -docker run -it \ - --cap-add=SYS_PTRACE --privileged=true \ - --name polardb_pg \ - polardb/polardb_pg_devel bash -``` - -进入容器后,从 GitHub 拉取最新的稳定代码,快速编译部署最简单的 PolarDB 实例并进行验证: - -```bash -# 代码拉取 -git clone -b POLARDB_11_STABLE https://github.com/ApsaraDB/PolarDB-for-PostgreSQL.git -cd PolarDB-for-PostgreSQL -# 编译部署 -./polardb_build.sh -# 验证 -psql -h 127.0.0.1 -c 'select version();' - version --------------------------------- - PostgreSQL 11.9 (POLARDB 11.9) -(1 row) -``` diff --git a/docs/zh/guide/storage-aliyun-essd.md b/docs/zh/deploying/storage-aliyun-essd.md similarity index 100% rename from docs/zh/guide/storage-aliyun-essd.md rename to docs/zh/deploying/storage-aliyun-essd.md diff --git a/docs/zh/guide/storage-ceph.md b/docs/zh/deploying/storage-ceph.md similarity index 100% rename from docs/zh/guide/storage-ceph.md rename to docs/zh/deploying/storage-ceph.md diff --git a/docs/zh/guide/storage-nbd.md b/docs/zh/deploying/storage-nbd.md similarity index 100% rename from docs/zh/guide/storage-nbd.md rename to docs/zh/deploying/storage-nbd.md diff --git a/docs/zh/guide/customize-dev-env.md b/docs/zh/development/customize-dev-env.md similarity index 100% rename from docs/zh/guide/customize-dev-env.md rename to docs/zh/development/customize-dev-env.md diff --git a/docs/zh/development/dev-on-docker.md b/docs/zh/development/dev-on-docker.md new file mode 100644 index 00000000000..6329b12af35 --- /dev/null +++ b/docs/zh/development/dev-on-docker.md @@ -0,0 +1,70 @@ +# 基于 Docker 容器开发 + +## 在开发机器上下载源代码 + +从 [GitHub](https://github.com/ApsaraDB/PolarDB-for-PostgreSQL) 上下载 PolarDB for PostgreSQL 的源代码,稳定分支为 `POLARDB_11_STABLE`。如果因网络原因不能稳定访问 GitHub,则可以访问 [Gitee 国内镜像](https://gitee.com/mirrors/PolarDB-for-PostgreSQL)。 + +:::: code-group +::: code-group-item GitHub + +```bash:no-line-numbers +git clone -b POLARDB_11_STABLE https://github.com/ApsaraDB/PolarDB-for-PostgreSQL.git +``` + +::: +::: code-group-item Gitee 国内镜像 + +```bash:no-line-numbers +git clone -b POLARDB_11_STABLE https://gitee.com/mirrors/PolarDB-for-PostgreSQL +``` + +::: +:::: + +代码克隆完毕后,进入源码目录: + +```bash:no-line-numbers +cd PolarDB-for-PostgreSQL/ +``` + +## 拉取开发镜像 + +从 DockerHub 上拉取 PolarDB for PostgreSQL 的 [开发镜像](https://hub.docker.com/r/polardb/polardb_pg_devel/tags)。 + +```bash +# 拉取 PolarDB 开发镜像 +docker pull polardb/polardb_pg_devel +``` + +## 创建并运行容器 + +此时我们已经在开发机器的源码目录中。从开发镜像上创建一个容器,将当前目录作为一个 volume 挂载到容器中,这样可以: + +- 在容器内的环境中编译源码 +- 在容器外(开发机器上)使用编辑器来查看或修改代码 + +```bash +docker run -it \ + -v $PWD:/home/postgres/polardb_pg \ + --shm-size=512m --cap-add=SYS_PTRACE --privileged=true \ + --name polardb_pg_devel \ + polardb/polardb_pg_devel \ + bash +``` + +进入容器后,为容器内用户获取源码目录的权限,然后编译部署 PolarDB 实例。 + +```bash +# 获取权限并编译部署 +cd polardb_pg +sudo chmod -R a+wr ./ +sudo chown -R postgres:postgres ./ +./polardb_build.sh + +# 验证 +psql -h 127.0.0.1 -c 'select version();' + version +-------------------------------- + PostgreSQL 11.9 
(POLARDB 11.9) +(1 row) +``` diff --git a/docs/zh/guide/tpch-on-px.md b/docs/zh/features/tpch-on-px.md similarity index 99% rename from docs/zh/guide/tpch-on-px.md rename to docs/zh/features/tpch-on-px.md index 8a07daef3ee..bc3a64e6767 100644 --- a/docs/zh/guide/tpch-on-px.md +++ b/docs/zh/features/tpch-on-px.md @@ -6,7 +6,7 @@ ### 部署 PolarDB PG -在运行前默认已经通过 [前置文档](./db-localfs.md#本地多节点-htap-实例) 部署好本地多节点 HTAP 实例。 +在运行前默认已经通过 [前置文档](../deploying/db-localfs.md#本地多节点-htap-实例) 部署好本地多节点 HTAP 实例。 也可以直接从 DockerHub 上拉取 HTAP 实例镜像: diff --git a/docs/zh/guide/backup-and-restore.md b/docs/zh/operation/backup-and-restore.md similarity index 97% rename from docs/zh/guide/backup-and-restore.md rename to docs/zh/operation/backup-and-restore.md index 81e8e61b87a..c2d94fa3541 100644 --- a/docs/zh/guide/backup-and-restore.md +++ b/docs/zh/operation/backup-and-restore.md @@ -9,7 +9,7 @@ PolarDB 是基于共享存储的存算分离架构,因此 PolarDB 的备份恢 5. PolarDB 搭建 Standby 6. PolarDB 按时间点恢复 -前置条件是先准备一个 PolarDB 实例,可以参考文档 [搭建 PolarDB](./deploy.md)。 +前置条件是先准备一个 PolarDB 实例,可以参考文档 [搭建 PolarDB](../deploying/deploy.md)。 ## 备份恢复原理 @@ -223,7 +223,7 @@ Connection options: ### 使用 initdb 来搭建 RO -主要步骤是使用 initdb 初始化 RO 的 Local Dir 目录,然后修改配置文件,启动实例。具体请参考 [只读节点部署](./db-pfs.md#只读节点部署)。 +主要步骤是使用 `initdb` 初始化 RO 的本地存储目录,然后修改配置文件,启动实例。具体请参考 [只读节点部署](../deploying/db-pfs.md#只读节点部署)。 ### 备份 RW 的本地存储目录来搭建 RO @@ -303,7 +303,7 @@ polar_basebackup --host=[主节点所在IP] --port=5432 -D /home/postgres/standb ![undefined](https://intranetproxy.alipay.com/skylark/lark/0/2022/png/135683/1656315576998-2aaeff3e-4341-46df-bce1-eb211ea4c605.png) ::: tip -注意:这里是构建共享存储的 Standby,首先您需要找一台机器部署好 PolarDB 及其文件系统 PolarFS,且已经搭建好了共享存储`nvme0n2`, 具体操作请参考 [准备块设备与搭建文件系统](./deploy.md) +注意:这里是构建共享存储的 Standby,首先您需要找一台机器部署好 PolarDB 及其文件系统 PolarFS,且已经搭建好了共享存储`nvme0n2`, 具体操作请参考 [准备块设备与搭建文件系统](../deploying/deploy.md) ::: 备份完成后如下图所示: @@ -359,7 +359,7 @@ psql --host=[主节点所在IP] --port=5432 -d postgres -c 'SELECT * FROM pg_cr ### 修改 Standby 本地目录配置 -在 Standby 的 Local Dir 中 `recovery.conf` 文件中增加如下参数: +在 Standby 的本地存储目录中 `recovery.conf` 文件中增加如下参数: ```ini recovery_target_timeline = 'latest' diff --git a/docs/zh/guide/tpcc-test.md b/docs/zh/operation/tpcc-test.md similarity index 96% rename from docs/zh/guide/tpcc-test.md rename to docs/zh/operation/tpcc-test.md index ab9c189f36b..d396368cbf9 100644 --- a/docs/zh/guide/tpcc-test.md +++ b/docs/zh/operation/tpcc-test.md @@ -12,14 +12,14 @@ TPC-C 的具体说明和排名可以通过官方网站 [TPC-C 官网](https://ww ### 部署 PolarDB PG -在运行前默认已经通过文档 [PolarDB 编译部署:单机文件系统](./db-localfs.md) 部署好 PolarDB PG 的本地实例。 +在运行前默认已经通过文档 [PolarDB 编译部署:单机文件系统](../deploying/db-localfs.md) 部署好 PolarDB PG 的本地实例。 ### 安装 Java 和 Ant 由于 TPC-C 测试工具 benchmarksql 需要通过 Ant 来编译,所以需要安装 Java 和 Ant。这里安装的 Java 版本为 8.0[^java-install],Ant 版本为 1.9.7[^ant-install]。 ::: tip -安装 Java 和 Ant 的时候需要注意修改环境变量。 +安装 Java 和 Ant 的后需要修改环境变量。 ::: ```bash @@ -59,7 +59,7 @@ Apache Ant(TM) version 1.9.16 compiled on July 10 2021 我们将通过 benchmarksql 工具来进行 TPC-C 测试。 ::: tip -下面链接中的 benchmarksql 采用的是 5.1 版本。相较于 5.0 版本,5.1 版本可以用 Procedures 性能表现较好。推荐使用 5.1 版本。 +下面链接中的 benchmarksql 采用的是 5.1 版本。相较于 5.0 版本,5.1 版本可以使用 Procedures,性能表现较好。推荐使用 5.1 版本。 ::: ```bash diff --git a/docs/zh/architecture/README.md b/docs/zh/theory/arch-overview.md similarity index 87% rename from docs/zh/architecture/README.md rename to docs/zh/theory/arch-overview.md index 9e8eed4b629..22f62e1fa7d 100644 --- a/docs/zh/architecture/README.md +++ b/docs/zh/theory/arch-overview.md @@ -1,30 +1,30 @@ -# 架构详解 - -## 特性总览 +# 特性总览 [[toc]] -PolarDB PostgreSQL(以下简称 
PolarDB)是一款阿里云自主研发的企业级数据库产品,采用计算存储分离架构,兼容 PostgreSQL 与 Oracle。PolarDB 的存储与计算能力均可横向扩展,具有高可靠、高可用、弹性扩展等企业级数据库特性。同时,PolarDB 具有大规模并行计算能力,可以应对 OLTP 与 OLAP 混合负载;还具有时空、向量、搜索、图谱等多模创新特性,可以满足企业对数据处理日新月异的新需求。 +PolarDB PostgreSQL(以下简称 PolarDB)是一款阿里云自主研发的企业级数据库产品,采用计算存储分离架构,兼容 PostgreSQL 与 Oracle。PolarDB 的存储与计算能力均可横向扩展,具有高可靠、高可用、弹性扩展等企业级数据库特性。同时,PolarDB 具有大规模并行计算能力,可以应对 OLTP 与 OLAP 混合负载;还具有时空、向量、搜索、图谱等多模创新特性,可以满足企业对数据处理日新月异的新需求。 + PolarDB 支持多种部署形态:存储计算分离部署、X-Paxos 三节点部署、本地盘部署。 -### 传统数据库的问题 +## 传统数据库的问题 随着用户业务数据量越来越大,业务越来越复杂,传统数据库系统面临巨大挑战,如: 1. 存储空间无法超过单机上限。 -1. 通过只读实例进行读扩展,每个只读实例独享一份存储,成本增加。 -1. 随着数据量增加,创建只读实例的耗时增加。 -1. 主备延迟高。 +2. 通过只读实例进行读扩展,每个只读实例独享一份存储,成本增加。 +3. 随着数据量增加,创建只读实例的耗时增加。 +4. 主备延迟高。 -### PolarDB 云原生数据库的优势 +## PolarDB 云原生数据库的优势 ![image.png](../imgs/1_polardb_architecture.png) + 针对上述传统数据库的问题,阿里云研发了 PolarDB 云原生数据库。采用了自主研发的计算集群和存储集群分离的架构。具备如下优势: 1. 扩展性:存储计算分离,极致弹性。 -1. 成本:共享一份数据,存储成本低。 -1. 易用性:一写多读,透明读写分离。 -1. 可靠性:三副本、秒级备份。 +2. 成本:共享一份数据,存储成本低。 +3. 易用性:一写多读,透明读写分离。 +4. 可靠性:三副本、秒级备份。 ## PolarDB 整体架构概述 @@ -33,16 +33,17 @@ PolarDB 支持多种部署形态:存储计算分离部署、X-Paxos 三节点 ### 存储计算分离架构概述 ![image.png](../imgs/2_compute-storage_separation_architecture.png) + PolarDB 是存储计算分离的设计,存储集群和计算集群可以分别独立扩展: 1. 当计算能力不够时,可以单独扩展计算集群。 -1. 当存储容量不够时,可以单独扩展存储集群。 +2. 当存储容量不够时,可以单独扩展存储集群。 基于 Shared-Storage 后,主节点和多个只读节点共享一份存储数据,主节点刷脏不能再像传统的刷脏方式了,否则: 1. 只读节点去存储中读取的页面,可能是比较老的版本,不符合他自己的状态。 -1. 只读节点指读取到的页面比自身内存中想要的数据要超前。 -1. 主节点切换到只读节点时,只读节点接管数据更新时,存储中的页面可能是旧的,需要读取日志重新对脏页的恢复。 +2. 只读节点指读取到的页面比自身内存中想要的数据要超前。 +3. 主节点切换到只读节点时,只读节点接管数据更新时,存储中的页面可能是旧的,需要读取日志重新对脏页的恢复。 对于第一个问题,我们需要有页面多版本能力;对于第二个问题,我们需要主库控制脏页的刷脏速度。 @@ -57,7 +58,7 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 ![image.png](../imgs/3_HTAP_architecture.png) 在使用相同的硬件资源时性能达到了传统 Greenplum 的 90%,同时具备了 SQL 级别的弹性:在计算能力不足时,可随时增加参与 OLAP 分析查询的 CPU,而数据无需重分布。 -## PolarDB - 存储计算分离架构详解 +## PolarDB:存储计算分离架构详解 ### Shared-Storage 带来的挑战 @@ -71,6 +72,7 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 ### 架构原理 ![image.png](../imgs/4_principles_of_shared_storage.png) + 首先来看下基于 Shared-Storage 的 PolarDB 的架构原理。 - 主节点为可读可写节点(RW),只读节点为只读(RO)。 @@ -88,6 +90,7 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 #### 基于 Shared-Storage 的内存状态同步 前面讲到过存储计算分离后,Shared-Storage 上读取到的页面是一致的,内存状态是通过从 Shared-Storage 上读取最新的 WAL 并回放得来,如下图: + ![image.png](../imgs/5_In-memory_page_synchronization.png) 1. 主节点通过刷脏把版本 200 写入到 Shared-Storage。 @@ -95,7 +98,8 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 #### 基于 Shared-Storage 的“过去页面” -上述流程中,只读节点中基于日志回放出来的页面会被淘汰掉,此后需要再次从存储上读取页面,会出现读取的页面是之前的老页面,称为“过去页面”。如下图: +上述流程中,只读节点中基于日志回放出来的页面会被淘汰掉,此后需要再次从存储上读取页面,会出现读取的页面是之前的老页面,称为“过去页面”。如下图: + ![image.png](../imgs/6_outdated_pages.png) 1. T1 时刻,主节点在 T1 时刻写入日志 LSN=200,把页面 P1 的内容从 500 更新到 600; @@ -108,45 +112,49 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 #### “过去页面” 的解法 -只读节点在任意时刻读取页面时,需要找到对应的 Base 页面和对应起点的日志,依次回放。如下图: +只读节点在任意时刻读取页面时,需要找到对应的 Base 页面和对应起点的日志,依次回放。如下图: + ![image.png](../imgs/7_solution_to_outdated_pages.png) 1. 在只读节点内存中维护每个 Page 对应的日志 meta。 -1. 在读取时一个 Page 时,按需逐个应用日志直到期望的 Page 版本。 -1. 应用日志时,通过日志的 meta 从 Shared-Storage 上读取。 +2. 在读取时一个 Page 时,按需逐个应用日志直到期望的 Page 版本。 +3. 应用日志时,通过日志的 meta 从 Shared-Storage 上读取。 通过上述分析,需要维护每个 Page 到日志的“倒排”索引,而只读节点的内存是有限的,因此这个 Page 到日志的索引需要持久化,PolarDB 设计了一个可持久化的索引结构 - LogIndex。LogIndex 本质是一个可持久化的 hash 数据结构。 1. 只读节点通过 WAL receiver 接收从主节点过来的 WAL meta 信息。 -1. WAL meta 记录该条日志修改了哪些 Page。 -1. 将该条 WAL meta 插入到 LogIndex 中,key 是 PageID,value 是 LSN。 -1. 一条 WAL 日志可能更新了多个 Page(索引分裂),在 LogIndex 对有多条记录。 -1. 同时在 BufferPool 中给该该 Page 打上 outdate 标记,以便使得下次读取的时候从 LogIndex 重回放对应的日志。 -1. 当内存达到一定阈值时,LogIndex 异步将内存中的 hash 刷到盘上。 +2. 
WAL meta 记录该条日志修改了哪些 Page。 +3. 将该条 WAL meta 插入到 LogIndex 中,key 是 PageID,value 是 LSN。 +4. 一条 WAL 日志可能更新了多个 Page(索引分裂),在 LogIndex 对有多条记录。 +5. 同时在 BufferPool 中给该该 Page 打上 outdate 标记,以便使得下次读取的时候从 LogIndex 重回放对应的日志。 +6. 当内存达到一定阈值时,LogIndex 异步将内存中的 hash 刷到盘上。 ![image.png](../imgs/8_solution_to_outdated_pages_LogIndex.png) + 通过 LogIndex 解决了刷脏依赖“过去页面”的问题,也是得只读节点的回放转变成了 Lazy 的回放:只需要回放日志的 meta 信息即可。 #### 基于 Shared-Storage 的“未来页面” 在存储计算分离后,刷脏依赖还存在“未来页面”的问题。如下图所示: + ![image.png](../imgs/9_future_pages.png) 1. T1 时刻,主节点对 P1 更新了 2 次,产生了 2 条日志,此时主节点和只读节点上页面 P1 的内容都是 500。 -1. T2 时刻, 发送日志 LSN=200 给只读节点。 -1. T3 时刻,只读节点回放 LSN=200 的日志,得到 P1 的内容为 600,此时只读节点日志回放到了 200,后面的 LSN=300 的日志对他来说还不存在。 -1. T4 时刻,主节点刷脏,将 P1 最新的内容 700 刷到了 Shared-Storage 上,同时只读节点上 BufferPool 淘汰掉了页面 P1。 -1. T5 时刻,只读节点再次读取页面 P1,由于 BufferPool 中不存在 P1,因此从共享内存上读取了最新的 P1,但是只读节点并没有回放 LSN=300 的日志,读取到了一个对他来说超前的“未来页面”。 -1. “未来页面”的问题是:部分页面是未来页面,部分页面是正常的页面,会到时数据不一致,比如索引分裂成 2 个 Page 后,一个读取到了正常的 Page,另一个读取到了“未来页面”,B+Tree 的索引结构会被破坏。 +2. T2 时刻, 发送日志 LSN=200 给只读节点。 +3. T3 时刻,只读节点回放 LSN=200 的日志,得到 P1 的内容为 600,此时只读节点日志回放到了 200,后面的 LSN=300 的日志对他来说还不存在。 +4. T4 时刻,主节点刷脏,将 P1 最新的内容 700 刷到了 Shared-Storage 上,同时只读节点上 BufferPool 淘汰掉了页面 P1。 +5. T5 时刻,只读节点再次读取页面 P1,由于 BufferPool 中不存在 P1,因此从共享内存上读取了最新的 P1,但是只读节点并没有回放 LSN=300 的日志,读取到了一个对他来说超前的“未来页面”。 +6. “未来页面”的问题是:部分页面是未来页面,部分页面是正常的页面,会到时数据不一致,比如索引分裂成 2 个 Page 后,一个读取到了正常的 Page,另一个读取到了“未来页面”,B+Tree 的索引结构会被破坏。 #### “未来页面”的解法 -“未来页面”的原因是主节点刷脏的速度超过了任一只读节点的回放速度(虽然只读节点的 Lazy 回放已经很快了)。因此,解法就是对主节点刷脏进度时做控制:不能超过最慢的只读节点的回放位点。如下图所示: +“未来页面”的原因是主节点刷脏的速度超过了任一只读节点的回放速度(虽然只读节点的 Lazy 回放已经很快了)。因此,解法就是对主节点刷脏进度时做控制:不能超过最慢的只读节点的回放位点。如下图所示: + ![image.png](../imgs/10_solutions_to_future_pages.png) 1. 只读节点回放到 T4 位点。 -1. 主节点在刷脏时,对所有脏页按照 LSN 排序,仅刷在 T4 之前的脏页(包括 T4),之后的脏页不刷。 -1. 其中,T4 的 LSN 位点称为“一致性位点”。 +2. 主节点在刷脏时,对所有脏页按照 LSN 排序,仅刷在 T4 之前的脏页(包括 T4),之后的脏页不刷。 +3. 其中,T4 的 LSN 位点称为“一致性位点”。 ### 低延迟复制 @@ -158,6 +166,7 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 1. 快照更新:RO 高并发引起事务快照更新慢。 如下图所示: + ![image.png](../imgs/11_issues_of_conventional_streaming_replication.png) 1. 主节点写入 WAL 日志到本地文件系统中。 @@ -168,9 +177,10 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 可以看到,整个链路是很长的,只读节点延迟高,影响用户业务读写分离负载均衡。 -#### 优化 1 - 只复制 Meta +#### 优化 1:只复制 Meta + +因为底层是 Shared-Storage,只读节点可直接从 Shared-Storage 上读取所需要的 WAL 数据。因此主节点只把 WAL 日志的元数据(去掉 Payload)复制到只读节点,这样网络传输量小,减少关键路径上的 IO。如下图所示: -因为底层是 Shared-Storage,只读节点可直接从 Shared-Storage 上读取所需要的 WAL 数据。因此主节点只把 WAL 日志的元数据(去掉 Payload)复制到只读节点,这样网络传输量小,减少关键路径上的 IO。如下图所示: ![image.png](../imgs/12_Replicate_only_metadata_of_WAL_records.png) 1. WAL Record 是由:Header,PageID,Payload 组成。 @@ -178,11 +188,13 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 1. 在只读节点上,通过 WAL 的元数据直接读取 Shared-Storage 上完整的 WAL 文件。 通过上述优化,能显著减少主节点和只读节点间的网络传输量。从下图可以看到网络传输量减少了 98%。 + ![image.png](../imgs/13_optimization1_result.png) -#### 优化 2 - 页面回放优化 +#### 优化 2:页面回放优化 + +在传统 DB 中日志回放的过程中会读取大量的 Page 并逐个日志 Apply,然后落盘。该流程在用户读 IO 的关键路径上,借助存储计算分离可以做到:如果只读节点上 Page 不在 BufferPool 中,不产生任何 IO,仅仅记录 LogIndex 即可。 -在传统 DB 中日志回放的过程中会读取大量的 Page 并逐个日志 Apply,然后落盘。该流程在用户读 IO 的关键路径上,借助存储计算分离可以做到:如果只读节点上 Page 不在 BufferPool 中,不产生任何 IO,仅仅记录 LogIndex 即可。 可以将回放进程中的如下 IO 操作 offload 到 session 进程中: 1. 数据页 IO 开销。 @@ -190,6 +202,7 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 1. 基于 LogIndex 页面的多版本回放。 如下图所示,在只读节点上的回放进程中,在 Apply 一条 WAL 的 meta 时: + ![image.png](../imgs/14_optimize_log_apply_of_WAL_records.png) 1. 如果对应 Page 不在内存中,仅仅记录 LogIndex。 @@ -198,15 +211,19 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 1. 
可以看到,主要的 IO 操作有原来的单个回放进程 offload 到了多个用户进程。 通过上述优化,能显著减少回放的延迟,比 AWS Aurora 快 30 倍。 + ![image.png](../imgs/15_optimization2_result.png) -#### 优化 3 - DDL 锁回放优化 +#### 优化 3:DDL 锁回放优化 + +在主节点执行 DDL 时,比如:drop table,需要在所有节点上都对表上排他锁,这样能保证表文件不会在只读节点上读取时被主节点删除掉了(因为文件在 Shared-Storage 上只有一份)。在所有只读节点上对表上排他锁是通过 WAL 复制到所有的只读节点,只读节点回放 DDL 锁来完成。而回放进程在回放 DDL 锁时,对表上锁可能会阻塞很久,因此可以通过把 DDL 锁也 offload 到其他进程上来优化回访进程的关键路径。 -在主节点执行 DDL 时,比如:drop table,需要在所有节点上都对表上排他锁,这样能保证表文件不会在只读节点上读取时被主节点删除掉了(因为文件在 Shared-Storage 上只有一份)。在所有只读节点上对表上排他锁是通过 WAL 复制到所有的只读节点,只读节点回放 DDL 锁来完成。 -而回放进程在回放 DDL 锁时,对表上锁可能会阻塞很久,因此可以通过把 DDL 锁也 offload 到其他进程上来优化回访进程的关键路径。 ![image.png](../imgs/16_optimize_log_apply_of_DDL_locks.png) -通过上述优化,能够回放进程一直处于平滑的状态,不会因为去等 DDL 而阻塞了回放的关键路径。 + +通过上述优化,能够回放进程一直处于平滑的状态,不会因为去等 DDL 而阻塞了回放的关键路径。 + ![image.png](../imgs/17_optimization3_result.png) + 上述 3 个优化之后,极大的降低了复制延迟,能够带来如下优势: - 读写分离:负载均衡,更接近 Oracle RAC 使用体验。 @@ -217,12 +234,14 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 #### 背景 -数据库 OOM、Crash 等场景恢复时间长,本质上是日志回放慢,在共享存储 Direct-IO 模型下问题更加突出。 +数据库 OOM、Crash 等场景恢复时间长,本质上是日志回放慢,在共享存储 Direct-IO 模型下问题更加突出。 + ![image.png](../imgs/18_recovery_optimization_background.png) #### Lazy Recovery -前面讲到过通过 LogIndex 我们在只读节点上做到了 Lazy 的回放,那么在主节点重启后的 recovery 过程中,本质也是在回放日志,那么我们可以借助 Lazy 回放来加速 recovery 的过程: +前面讲到过通过 LogIndex 我们在只读节点上做到了 Lazy 的回放,那么在主节点重启后的 recovery 过程中,本质也是在回放日志,那么我们可以借助 Lazy 回放来加速 recovery 的过程: + ![image.png](../imgs/19_lazy_recovery.png) 1. 从 checkpoint 点开始逐条去读 WAL 日志。 @@ -230,24 +249,28 @@ PolarDB 支持一套 OLTP 场景型的数据在如下两种计算引擎下使用 1. recovery 完成,开始提供服务。 1. 真正的回放被 offload 到了重启之后进来的 session 进程中。 -优化之后(回放 500MB 日志量): +优化之后(回放 500MB 日志量): + ![image.png](../imgs/20_recovery_optimization_result.png) #### Persistent BufferPool -上述方案优化了在 recovery 的重启速度,但是在重启之后,session 进程通过读取 WAL 日志来回放想要的 page。表现就是在 recovery 之后会有短暂的响应慢的问题。优化的办法为在数据库重启时 BufferPool 并不销毁,如下图所示:crash 和 restart 期间 BufferPool 不销毁。 +上述方案优化了在 recovery 的重启速度,但是在重启之后,session 进程通过读取 WAL 日志来回放想要的 page。表现就是在 recovery 之后会有短暂的响应慢的问题。优化的办法为在数据库重启时 BufferPool 并不销毁,如下图所示:crash 和 restart 期间 BufferPool 不销毁。 + ![image.png](../imgs/21_Persistent_BufferPool.png) + 内核中的共享内存分成 2 部分: 1. 全局结构,ProcArray 等。 1. BufferPool 结构;其中 BufferPool 通过具名共享内存来分配,在进程重启后仍然有效。而全局结构在进程重启后需要重新初始化。 ![image.png](../imgs/22_buffer_pool_structure.png) -而 BufferPool 中并不是所有的 Page 都是可以复用的,比如:在重启前,某进程对 Page 上 X 锁,随后 crash 了,该 X 锁就没有进程来释放了。因此,在 crash 和 restart 之后需要把所有的 BufferPool 遍历一遍,剔除掉不能被复用的 Page。另外,BufferPool 的回收依赖 k8s。 -该优化之后,使得重启前后性能平稳。 + +而 BufferPool 中并不是所有的 Page 都是可以复用的,比如:在重启前,某进程对 Page 上 X 锁,随后 crash 了,该 X 锁就没有进程来释放了。因此,在 crash 和 restart 之后需要把所有的 BufferPool 遍历一遍,剔除掉不能被复用的 Page。另外,BufferPool 的回收依赖 k8s。该优化之后,使得重启前后性能平稳。 + ![image.png](../imgs/23_persistent_buffer_pool_result.png) -## PolarDB - HTAP 架构详解 +## PolarDB:HTAP 架构详解 PolaDB 读写分离后,由于底层是存储池,理论上 IO 吞吐是无限大的。而大查询只能在单个计算节点上执行,单个计算节点的 CPU/MEM/IO 是有限的,因此单个计算节点无法发挥出存储侧的大 IO 带宽的优势,也无法通过增加计算资源来加速大的查询。我们研发了基于 Shared-Storage 的 MPP 分布式并行执行,来加速在 OLTP 场景下 OLAP 查询。 @@ -260,6 +283,7 @@ PolarDB 底层存储在不同节点上是共享的,因此不能直接像传统 1. ParallelScan 算子屏蔽共享存储。 ![image.png](../imgs/24_principles_of_HTAP.png) + 如图所示: 1. 
表 A 和表 B 做 join,并做聚合。 @@ -268,15 +292,16 @@ PolarDB 底层存储在不同节点上是共享的,因此不能直接像传统 ### 分布式优化器 -基于社区的 GPORCA 优化器扩展了能感知共享存储特性的 Transformation Rules。使得能够探索共享存储下特有的 Plan 空间,比如:对于一个表在 PolarDB 中既可以全量的扫描,也可以分区域扫描,这个是和传统 MPP 的本质区别。 -图中,上面灰色部分是 PolarDB 内核与 GPORCA 优化器的适配部分。 -下半部分是 ORCA 内核,灰色模块是我们在 ORCA 内核中对共享存储特性所做的扩展。 +基于社区的 GPORCA 优化器扩展了能感知共享存储特性的 Transformation Rules。使得能够探索共享存储下特有的 Plan 空间,比如:对于一个表在 PolarDB 中既可以全量的扫描,也可以分区域扫描,这个是和传统 MPP 的本质区别。图中,上面灰色部分是 PolarDB 内核与 GPORCA 优化器的适配部分。下半部分是 ORCA 内核,灰色模块是我们在 ORCA 内核中对共享存储特性所做的扩展。 + ![image.png](../imgs/25_distributed_optimizer.png) ### 算子并行化 PolarDB 中有 4 类算子需要并行化,下面介绍一个具有代表性的 Seqscan 的算子的并行化。为了最大限度的利用存储的大 IO 带宽,在顺序扫描时,按照 4MB 为单位做逻辑切分,将 IO 尽量打散到不同的盘上,达到所有的盘同时提供读服务的效果。这样做还有一个优势,就是每个只读节点只扫描部分表文件,那么最终能缓存的表大小是所有只读节点的 BufferPool 总和。 + ![image.png](../imgs/26_parallelism_of_operators.png) + 下面的图表中: 1. 增加只读节点,扫描性能线性提升 30 倍。 @@ -291,7 +316,8 @@ PolarDB 中有 4 类算子需要并行化,下面介绍一个具有代表性的 1. 在 PolarDB 中,大对象的是通过 heap 表关联 TOAST​ 表,无论对哪个表切分都无法达到均衡。 1. 另外,不同只读节点的事务、buffer、网络、IO 负载抖动。 -以上两点会导致分布执行时存在长尾进程。 +以上两点会导致分布执行时存在长尾进程。 + ![image.png](../imgs/28_data_skew.png) 1. 协调节点内部分成 DataThread 和 ControlThread。 @@ -302,50 +328,65 @@ PolarDB 中有 4 类算子需要并行化,下面介绍一个具有代表性的 需要注意的是:尽管是动态分配,尽量维护 buffer 的亲和性;另外,每个算子的上下文存储在 worker 的私有内存中,Coordinator 不存储具体表的信息; -下面表格中,当出现大对象时,静态切分出现数据倾斜,而动态扫描仍然能够线性提升。 +下面表格中,当出现大对象时,静态切分出现数据倾斜,而动态扫描仍然能够线性提升。 + ![image.png](../imgs/29_Solve_data_skew_result.png) ### SQL 级别弹性扩展 -那我们利用数据共享的特点,还可以支持云原生下极致弹性的要求:把 Coordinator 全链路上各个模块所需要的外部依赖存在共享存储上,同时 worker 全链路上需要的运行时参数通过控制链路从 Coordinator 同步过来,使 Coordinator 和 worker 无状态化。 +那我们利用数据共享的特点,还可以支持云原生下极致弹性的要求:把 Coordinator 全链路上各个模块所需要的外部依赖存在共享存储上,同时 worker 全链路上需要的运行时参数通过控制链路从 Coordinator 同步过来,使 Coordinator 和 worker 无状态化。 + ![image.png](../imgs/30_SQL_statement-level_scalability.png) + 因此: 1. SQL 连接的任意只读节点都可以成为 Coordinator 节点,这解决了 Coordinator 单点问题。 2. 
一个 SQL 能在任意节点上启动任意 worker 数目,达到算力能 SQL 级别弹性扩展,也允许业务有更多的调度策略:不同业务域同时跑在不同的节点集合上。 - ![image.png](../imgs/31_schedule_workloads.png) + +![image.png](../imgs/31_schedule_workloads.png) ### 事务一致性 多个计算节点数据一致性通过等待回放和 globalsnapshot 机制来完成。等待回放保证所有 worker 能看到所需要的数据版本,而 globalsnapshot 保证了选出一个统一的版本。 + ![image.png](../imgs/32_transactional_consistency.png) -### TPCH 性能 - 加速比 +### TPCH 性能:加速比 ![image.png](../imgs/33_TPC-H_performance_Speedup1.png) -我们使用 1TB 的 TPCH 进行了测试,首先对比了 PolarDB 新的分布式并行和单机并行的性能:有 3 个 SQL 提速 60 倍,19 个 SQL 提速 10 倍以上; + +我们使用 1TB 的 TPCH 进行了测试,首先对比了 PolarDB 新的分布式并行和单机并行的性能:有 3 个 SQL 提速 60 倍,19 个 SQL 提速 10 倍以上; + ![image.png](../imgs/34_TPC-H_performance_Speedup2.png) + ![image.png](../imgs/35_TPC-H_performance_Speedup3.png) -另外,使用分布式执行引擎测,试增加 CPU 时的性能,可以看到,从 16 核和 128 核时性能线性提升; -单看 22 条 SQL,通过该增加 CPU,每个条 SQL 性能线性提升。 -### TPCH 性能 - 和 Greenplum 的对比 +另外,使用分布式执行引擎测,试增加 CPU 时的性能,可以看到,从 16 核和 128 核时性能线性提升;单看 22 条 SQL,通过该增加 CPU,每个条 SQL 性能线性提升。 + +### TPCH 性能:和 Greenplum 的对比 和传统 MPP 的 Greenplum 的对比,同样使用 16 个节点,PolarDB 的性能是 Greenplum 的 90%。 + ![image.png](../imgs/36_TPC-H_performance_Comparison_with_Greenplum1.png) + ![image.png](../imgs/37_TPC-H_performance_Comparison_with_Greenplum2.png) + 前面讲到我们给 PolarDB 的分布式引擎做到了弹性扩展,数据不需要充分重分布,当 dop=8 时,性能是 Greenplum 的 5.6 倍。 ### 分布式执行加速索引创建 OLTP 业务中会建大量的索引,经分析建索引过程中:80%是在排序和构建索引页,20%在写索引页。通过使用分布式并行来加速排序过程,同时流水化批量写入。 + ![image.png](../imgs/38_Index_creation_accelerated_by_PX.png) + 上述优化能够使得创建索引有 4~5 倍的提升。 + ![image.png](../imgs/39_Index_creation_accelerated_by_PX2.png) -### 分布式并行执行加速多模 - 时空数据库 +### 分布式并行执行加速多模:时空数据库 PolarDB 是对多模数据库,支持时空数据。时空数据库是计算密集型和 IO 密集型,可以借助分布式执行来加速。我们针对共享存储开发了扫描共享 RTREE 索引的功能。 + ![image.png](../imgs/40_spatio-temporal_databases.png) - 数据量:40000 万,500 GB diff --git a/docs/zh/architecture/buffer-management.md b/docs/zh/theory/buffer-management.md similarity index 100% rename from docs/zh/architecture/buffer-management.md rename to docs/zh/theory/buffer-management.md diff --git a/docs/zh/architecture/ddl-synchronization.md b/docs/zh/theory/ddl-synchronization.md similarity index 100% rename from docs/zh/architecture/ddl-synchronization.md rename to docs/zh/theory/ddl-synchronization.md diff --git a/docs/zh/architecture/logindex.md b/docs/zh/theory/logindex.md similarity index 100% rename from docs/zh/architecture/logindex.md rename to docs/zh/theory/logindex.md From 950c12dcdcae043feb7851db57885aad7272f08e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=A3=A0=E7=BE=BD?= Date: Thu, 7 Jul 2022 11:10:37 +0800 Subject: [PATCH 02/12] docs: fix missing sidebar for more deployment chapter --- docs/.vuepress/configs/navbar/en.ts | 2 +- docs/.vuepress/configs/sidebar/en.ts | 7 +++++++ docs/.vuepress/configs/sidebar/zh.ts | 7 +++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/docs/.vuepress/configs/navbar/en.ts b/docs/.vuepress/configs/navbar/en.ts index 4a2ce976c34..34b91bf915e 100644 --- a/docs/.vuepress/configs/navbar/en.ts +++ b/docs/.vuepress/configs/navbar/en.ts @@ -24,7 +24,7 @@ export const en: NavbarConfig = [ children: ["/deploying/db-localfs.html", "/deploying/db-pfs.html"], }, { - text: "More about Deploying", + text: "More about Deployment", children: [ "/deploying/deploy-stack.html", "/deploying/deploy-official.html", diff --git a/docs/.vuepress/configs/sidebar/en.ts b/docs/.vuepress/configs/sidebar/en.ts index b01c9f8418a..4562592d881 100644 --- a/docs/.vuepress/configs/sidebar/en.ts +++ b/docs/.vuepress/configs/sidebar/en.ts @@ -29,6 +29,13 @@ export const en: SidebarConfig = { }, ], }, + { + text: "More about 
Deployment", + children: [ + "/deploying/deploy-stack.md", + "/deploying/deploy-official.md", + ], + }, ], }, ], diff --git a/docs/.vuepress/configs/sidebar/zh.ts b/docs/.vuepress/configs/sidebar/zh.ts index e00678d7d51..7db2fd07647 100644 --- a/docs/.vuepress/configs/sidebar/zh.ts +++ b/docs/.vuepress/configs/sidebar/zh.ts @@ -32,6 +32,13 @@ export const zh: SidebarConfig = { }, ], }, + { + text: "更多部署方式", + children: [ + "/zh/deploying/deploy-stack.md", + "/zh/deploying/deploy-official.md", + ], + }, ], }, ], From 3882d68ac2c86fe370c12054bbed973805c8f6dc Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Thu, 7 Jul 2022 12:28:14 +0000 Subject: [PATCH 03/12] make query in COPY TO command parallel --- src/backend/commands/copy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 1e274024270..ca05040d990 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -1529,7 +1529,7 @@ BeginCopy(ParseState *pstate, } /* plan the query */ - plan = pg_plan_query(query, CURSOR_OPT_PARALLEL_OK, NULL); + plan = pg_plan_query(query, CURSOR_OPT_PX_OK, NULL); /* * With row level security and a user using "COPY relation TO", we From 2839c7f44be6a91acf29c2b52b6fb43dc2ad59e6 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Thu, 21 Jul 2022 10:52:48 +0000 Subject: [PATCH 04/12] update Makefile for px_copy --- src/backend/px/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/src/backend/px/Makefile b/src/backend/px/Makefile index fac27b98d9c..8203102d225 100644 --- a/src/backend/px/Makefile +++ b/src/backend/px/Makefile @@ -14,6 +14,7 @@ override CPPFLAGS := -I$(libpq_srcdir) $(CPPFLAGS) SUBDIRS := motion dispatcher OBJS = px_cat.o \ + px_copy.o \ px_hash.o \ px_llize.o \ px_mutate.o \ From 7ded382375ea03c7bac3e1e406e3d3b9e50da58c Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Thu, 21 Jul 2022 10:54:07 +0000 Subject: [PATCH 05/12] add global variables for px copy feature --- polardb_build.sh | 1 + src/include/utils/guc.h | 1 + src/include/utils/px_unsync_guc_name.h | 1 + 3 files changed, 3 insertions(+) diff --git a/polardb_build.sh b/polardb_build.sh index 8e914bc6004..ce3c0fdf893 100755 --- a/polardb_build.sh +++ b/polardb_build.sh @@ -118,6 +118,7 @@ function del_cov() { function px_init() { echo "################################ px_init ################################" echo "polar_enable_px=0" >> $pg_bld_master_dir/postgresql.conf + echo "polar_px_enable_copy=0" >> $pg_bld_master_dir/postgresql.conf echo "polar_px_enable_check_workers=0" >> $pg_bld_master_dir/postgresql.conf echo "polar_px_enable_replay_wait=1" >> $pg_bld_master_dir/postgresql.conf echo "polar_px_dop_per_node=3" >> $pg_bld_master_dir/postgresql.conf diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 41d7b89047a..2ddb53bcfb0 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -804,6 +804,7 @@ extern bool px_optimizer_enable_relsize_collection; /* Optimizer related gucs */ extern bool polar_enable_px; +extern bool px_enable_copy; extern bool px_enable_executor; extern bool px_enable_join; extern bool px_enable_window_function; diff --git a/src/include/utils/px_unsync_guc_name.h b/src/include/utils/px_unsync_guc_name.h index 02712e5e80f..a7f6a1b67c8 100644 --- a/src/include/utils/px_unsync_guc_name.h +++ b/src/include/utils/px_unsync_guc_name.h @@ -360,6 +360,7 @@ "polar_px_dop_per_node", "polar_px_enable_check_workers", "polar_px_enable_cte", + "polar_px_enable_copy", "polar_px_enable_executor", 
"polar_px_enable_join", "polar_px_enable_plan_cache", From 5daaae6c9cf12f2b6171aa2566781f61e18c66d7 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Thu, 21 Jul 2022 10:56:36 +0000 Subject: [PATCH 06/12] add replicated policy mode to adapt to greenplum policy --- src/backend/px/px_cat.c | 38 ++++++++++++++++++++++++++++++++- src/include/catalog/px_policy.h | 12 +++++++++++ 2 files changed, 49 insertions(+), 1 deletion(-) diff --git a/src/backend/px/px_cat.c b/src/backend/px/px_cat.c index 35271172961..46a717c6a4f 100644 --- a/src/backend/px/px_cat.c +++ b/src/backend/px/px_cat.c @@ -87,4 +87,40 @@ PxPolicy * createRandomPartitionedPolicy(int numsegments) { return makePxPolicy(POLICYTYPE_PARTITIONED, 0, numsegments); -} \ No newline at end of file +} + + +/* + * createReplicatedPxPolicy-- Create a policy with replicated distribution + */ +PxPolicy * +createReplicatedPolicy(int numsegments) +{ + return makePxPolicy(POLICYTYPE_REPLICATED, 0, numsegments); +} + + +/* + * PxPolicyCopy -- Return a copy of a PxPolicy object. + * + * The copy is palloc'ed. + */ +PxPolicy * +PxPolicyCopy(const PxPolicy *src) +{ + PxPolicy *tgt; + int i; + + if (!src) + return NULL; + + tgt = makePxPolicy(src->ptype, src->nattrs, src->numsegments); + + for (i = 0; i < src->nattrs; i++) + { + tgt->attrs[i] = src->attrs[i]; + tgt->opclasses[i] = src->opclasses[i]; + } + + return tgt; +} /* PxPolicyCopy */ diff --git a/src/include/catalog/px_policy.h b/src/include/catalog/px_policy.h index b898c8c8dcd..e1c6ddaa7e8 100644 --- a/src/include/catalog/px_policy.h +++ b/src/include/catalog/px_policy.h @@ -73,7 +73,19 @@ typedef struct PxPolicy Oid *opclasses; /* and their opclasses */ } PxPolicy; +/* + * PxPolicyCopy -- Return a copy of a PxPolicy object. + * + * The copy is palloc'ed in the specified context. 
+ */ +extern PxPolicy *PxPolicyCopy(const PxPolicy *src); + extern PxPolicy *makePxPolicy(PxPolicyType ptype, int nattrs, int numsegments); extern PxPolicy *createRandomPartitionedPolicy(int numsegments); +extern PxPolicy *createReplicatedPolicy(int numsegments); + +extern bool PxPolicyIsReplicated(const PxPolicy *policy); + + #endif /*_PX_POLICY_H_*/ From 2f450fa2fb0c710c9a28bbb0e000f39f5e0a9ee1 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Thu, 21 Jul 2022 11:00:28 +0000 Subject: [PATCH 07/12] complete most part of copy in query, but meet Segment dispatched with invalid option error --- src/backend/commands/copy.c | 535 ++++++++----- src/backend/nodes/outfuncs.c | 19 + src/backend/nodes/readfuncs.c | 17 + src/backend/px/dispatcher/px_disp_query.c | 264 +++++++ src/backend/px/dispatcher/px_dispatchresult.c | 102 +++ src/backend/px/dispatcher/px_gang.c | 18 + src/backend/px/px_copy.c | 727 ++++++++++++++++++ src/backend/px/px_util.c | 17 + src/backend/utils/misc/guc_px.c | 11 + src/include/commands/copy.h | 180 ++++- src/include/px/px_copy.h | 55 ++ src/include/px/px_disp_query.h | 8 + src/include/px/px_dispatchresult.h | 6 + src/include/px/px_gang.h | 2 + 14 files changed, 1785 insertions(+), 176 deletions(-) create mode 100644 src/backend/px/px_copy.c create mode 100644 src/include/px/px_copy.h diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index ca05040d990..05168c0f7a2 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -14,6 +14,8 @@ */ #include "postgres.h" +#include "libpq-int.h" + #include #include #include @@ -54,178 +56,13 @@ /* POLAR */ #include "utils/guc.h" - +#include "px/px_copy.h" +#include "px/px_disp_query.h" +#include "px/px_dispatchresult.h" #define ISOCTAL(c) (((c) >= '0') && ((c) <= '7')) #define OCTVALUE(c) ((c) - '0') -/* - * Represents the different source/dest cases we need to worry about at - * the bottom level - */ -typedef enum CopyDest -{ - COPY_FILE, /* to/from file (or a piped program) */ - COPY_OLD_FE, /* to/from frontend (2.0 protocol) */ - COPY_NEW_FE, /* to/from frontend (3.0 protocol) */ - COPY_CALLBACK /* to/from callback function */ -} CopyDest; - -/* - * Represents the end-of-line terminator type of the input - */ -typedef enum EolType -{ - EOL_UNKNOWN, - EOL_NL, - EOL_CR, - EOL_CRNL -} EolType; - -/* - * This struct contains all the state variables used throughout a COPY - * operation. For simplicity, we use the same struct for all variants of COPY, - * even though some fields are used in only some cases. - * - * Multi-byte encodings: all supported client-side encodings encode multi-byte - * characters by having the first byte's high bit set. Subsequent bytes of the - * character can have the high bit not set. When scanning data in such an - * encoding to look for a match to a single-byte (ie ASCII) character, we must - * use the full pg_encoding_mblen() machinery to skip over multibyte - * characters, else we might find a false match to a trailing byte. In - * supported server encodings, there is no possibility of a false match, and - * it's faster to make useless comparisons to trailing bytes than it is to - * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is true - * when we have to do it the hard way. 
- */ -typedef struct CopyStateData -{ - /* low-level state data */ - CopyDest copy_dest; /* type of copy source/destination */ - FILE *copy_file; /* used if copy_dest == COPY_FILE */ - StringInfo fe_msgbuf; /* used for all dests during COPY TO, only for - * dest == COPY_NEW_FE in COPY FROM */ - bool is_copy_from; /* COPY TO, or COPY FROM? */ - bool reached_eof; /* true if we read to end of copy data (not - * all copy_dest types maintain this) */ - EolType eol_type; /* EOL type of input */ - int file_encoding; /* file or remote side's character encoding */ - bool need_transcoding; /* file encoding diff from server? */ - bool encoding_embeds_ascii; /* ASCII can be non-first byte? */ - - /* parameters from the COPY command */ - Relation rel; /* relation to copy to or from */ - QueryDesc *queryDesc; /* executable query to copy from */ - List *attnumlist; /* integer list of attnums to copy */ - char *filename; /* filename, or NULL for STDIN/STDOUT */ - bool is_program; /* is 'filename' a program to popen? */ - copy_data_source_cb data_source_cb; /* function for reading data */ - bool binary; /* binary format? */ - bool oids; /* include OIDs? */ - bool freeze; /* freeze rows on loading? */ - bool csv_mode; /* Comma Separated Value format? */ - bool header_line; /* CSV header line? */ - char *null_print; /* NULL marker string (server encoding!) */ - int null_print_len; /* length of same */ - char *null_print_client; /* same converted to file encoding */ - char *delim; /* column delimiter (must be 1 byte) */ - char *quote; /* CSV quote char (must be 1 byte) */ - char *escape; /* CSV escape char (must be 1 byte) */ - List *force_quote; /* list of column names */ - bool force_quote_all; /* FORCE_QUOTE *? */ - bool *force_quote_flags; /* per-column CSV FQ flags */ - List *force_notnull; /* list of column names */ - bool *force_notnull_flags; /* per-column CSV FNN flags */ - List *force_null; /* list of column names */ - bool *force_null_flags; /* per-column CSV FN flags */ - bool convert_selectively; /* do selective binary conversion? */ - List *convert_select; /* list of column names (can be NIL) */ - bool *convert_select_flags; /* per-column CSV/TEXT CS flags */ - - /* these are just for error messages, see CopyFromErrorCallback */ - const char *cur_relname; /* table name for error messages */ - uint64 cur_lineno; /* line number for error messages */ - const char *cur_attname; /* current att for error messages */ - const char *cur_attval; /* current att value for error messages */ - - /* - * Working state for COPY TO/FROM - */ - MemoryContext copycontext; /* per-copy execution context */ - - /* - * Working state for COPY TO - */ - FmgrInfo *out_functions; /* lookup info for output functions */ - MemoryContext rowcontext; /* per-row evaluation context */ - - /* - * Working state for COPY FROM - */ - AttrNumber num_defaults; - bool file_has_oids; - FmgrInfo oid_in_function; - Oid oid_typioparam; - FmgrInfo *in_functions; /* array of input functions for each attrs */ - Oid *typioparams; /* array of element types for in_functions */ - int *defmap; /* array of default att numbers */ - ExprState **defexprs; /* array of default att expressions */ - bool volatile_defexprs; /* is any of defexprs volatile? */ - List *range_table; - - /* Tuple-routing support info */ - PartitionTupleRouting *partition_tuple_routing; - - TransitionCaptureState *transition_capture; - - /* - * These variables are used to reduce overhead in textual COPY FROM. 
- * - * attribute_buf holds the separated, de-escaped text for each field of - * the current line. The CopyReadAttributes functions return arrays of - * pointers into this buffer. We avoid palloc/pfree overhead by re-using - * the buffer on each cycle. - */ - StringInfoData attribute_buf; - - /* field raw data pointers found by COPY FROM */ - - int max_fields; - char **raw_fields; - - /* - * Similarly, line_buf holds the whole input line being processed. The - * input cycle is first to read the whole line into line_buf, convert it - * to server encoding there, and then extract the individual attribute - * fields into attribute_buf. line_buf is preserved unmodified so that we - * can display it in error messages if appropriate. - */ - StringInfoData line_buf; - bool line_buf_converted; /* converted to server encoding? */ - bool line_buf_valid; /* contains the row being processed? */ - - /* - * Finally, raw_buf holds raw data read from the data source (file or - * client connection). CopyReadLine parses this data sufficiently to - * locate line boundaries, then transfers the data to line_buf and - * converts it. Note: we guarantee that there is a \0 at - * raw_buf[raw_buf_len]. - */ -#define RAW_BUF_SIZE 65536 /* we palloc RAW_BUF_SIZE+1 bytes */ - char *raw_buf; - int raw_buf_index; /* next byte to process */ - int raw_buf_len; /* total # of bytes stored */ -} CopyStateData; - -/* DestReceiver for COPY (query) TO */ -typedef struct -{ - DestReceiver pub; /* publicly-known function pointers */ - CopyState cstate; /* CopyStateData for the command */ - uint64 processed; /* # of tuples processed */ -} DR_copy; - - /* * These macros centralize code used to process line_buf and raw_buf buffers. * They are macros because they often do continue/break control and to avoid @@ -291,6 +128,10 @@ if (1) \ goto not_end_of_copy; \ } else ((void) 0) + +/* GPDB_91_MERGE_FIXME: passing through a global variable like this is ugly */ +static CopyStmt *glob_copystmt = NULL; + static const char BinarySignature[11] = "PGCOPY\n\377\r\n\0"; @@ -305,6 +146,7 @@ static CopyState BeginCopyTo(ParseState *pstate, Relation rel, RawStmt *query, List *attnamelist, List *options); static void EndCopyTo(CopyState cstate); static uint64 DoCopyTo(CopyState cstate); +static uint64 CopyToDispatch(CopyState cstate); static uint64 CopyTo(CopyState cstate); static void CopyOneRowTo(CopyState cstate, Oid tupleOid, Datum *values, bool *nulls); @@ -345,6 +187,81 @@ static void CopySendInt16(CopyState cstate, int16 val); static bool CopyGetInt16(CopyState cstate, int16 *val); +typedef struct +{ + /* + * First field that should be processed in the QE. Any fields before + * this will be included as Datums in the rows that follow. + */ + int16 first_qe_processed_field; +} copy_from_dispatch_header; + +typedef struct +{ + /* + * Information about this input line. + * + * 'relid' is the target relation's OID. Normally, the same as + * cstate->relid, but for a partitioned relation, it indicates the target + * partition. Note: this must be the first field, because InvalidOid means + * that this is actually a 'copy_from_dispatch_error' struct. + * + * 'lineno' is the input line number, for error reporting. 
+ */ + int64 lineno; + Oid relid; + + uint32 line_len; /* size of the included input line */ + uint32 residual_off; /* offset in the line, where QE should + * process remaining fields */ + bool delim_seen_at_end; /* conveys to QE if QD saw a delim at end + * of its processing */ + uint16 fld_count; /* # of fields that were processed in the + * QD. */ + + /* The input line follows. */ + + /* + * For each field that was parsed in the QD already, the following data follows: + * + * int16 fieldnum; + * + * + * NULL values are not included, any attributes that are not included in + * the message are implicitly NULL. + * + * For pass-by-value datatypes, the is the raw Datum. For + * simplicity, it is always sent as a full-width 8-byte Datum, regardless + * of the datatype's length. + * + * For other fixed width datatypes, is the datatype's value. + * + * For variable-length datatypes, begins with a 4-byte length field, + * followed by the data. Cstrings (typlen = -2) are also sent in this + * format. + */ +} copy_from_dispatch_row; + +/* Size of the struct, without padding at the end. */ +#define SizeOfCopyFromDispatchRow (offsetof(copy_from_dispatch_row, fld_count) + sizeof(uint16)) + +typedef struct +{ + int64 error_marker; /* constant -1, to mark that this is an error + * frame rather than 'copy_from_dispatch_row' */ + int64 lineno; + uint32 errmsg_len; + uint32 line_len; + bool line_buf_converted; + + /* 'errmsg' follows */ + /* 'line' follows */ +} copy_from_dispatch_error; + +/* Size of the struct, without padding at the end. */ +#define SizeOfCopyFromDispatchError (offsetof(copy_from_dispatch_error, line_buf_converted) + sizeof(bool)) + + /* * Send copy start/stop messages for frontend copies. These have changed * in past protocol redesigns. @@ -545,6 +462,77 @@ CopySendEndOfRow(CopyState cstate) resetStringInfo(fe_msgbuf); } +/* + * AXG: This one is equivalent to CopySendEndOfRow() besides that + * it doesn't send end of row - it just flushed the data. We need + * this method for the dispatcher COPY TO since it already has data + * with newlines (from the executors). + */ +static void +CopyToDispatchFlush(CopyState cstate) +{ + StringInfo fe_msgbuf = cstate->fe_msgbuf; + + switch (cstate->copy_dest) + { + case COPY_FILE: + + (void) fwrite(fe_msgbuf->data, fe_msgbuf->len, + 1, cstate->copy_file); + if (ferror(cstate->copy_file)) + { + if (cstate->is_program) + { + if (errno == EPIPE) + { + /* + * The pipe will be closed automatically on error at + * the end of transaction, but we might get a better + * error message from the subprocess' exit code than + * just "Broken Pipe" + */ + ClosePipeToProgram(cstate); + + /* + * If ClosePipeToProgram() didn't throw an error, + * the program terminated normally, but closed the + * pipe first. Restore errno, and throw an error. 
+ */ + errno = EPIPE; + } + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to COPY program: %m"))); + } + else + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not write to COPY file: %m"))); + } + break; + case COPY_OLD_FE: + + if (pq_putbytes(fe_msgbuf->data, fe_msgbuf->len)) + { + /* no hope of recovering connection sync, so FATAL */ + ereport(FATAL, + (errcode(ERRCODE_CONNECTION_FAILURE), + errmsg("connection lost during COPY to stdout"))); + } + break; + case COPY_NEW_FE: + + /* Dump the accumulated row as one CopyData message */ + (void) pq_putmessage('d', fe_msgbuf->data, fe_msgbuf->len); + break; + case COPY_CALLBACK: + elog(ERROR, "unexpected destination COPY_CALLBACK to flush data"); + break; + } + + resetStringInfo(fe_msgbuf); +} + /* * CopyGetData reads data from the source (file or frontend) * @@ -792,10 +780,14 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, { CopyState cstate; bool is_from = stmt->is_from; - bool pipe = (stmt->filename == NULL); + bool pipe = (stmt->filename == NULL || px_role == PX_ROLE_PX); Relation rel; Oid relid; RawStmt *query = NULL; + List *options; + + glob_copystmt = (CopyStmt *) stmt; + options = stmt->options; /* * Disallow COPY to/from file or program except to users with the @@ -1001,13 +993,23 @@ DoCopy(ParseState *pstate, const CopyStmt *stmt, } else { - cstate = BeginCopyTo(pstate, rel, query, relid, - stmt->filename, stmt->is_program, - stmt->attlist, stmt->options); - *processed = DoCopyTo(cstate); /* copy from database to file */ + PG_TRY(); + { + cstate = BeginCopyTo(pstate, rel, query, relid, + stmt->filename, stmt->is_program, + stmt->attlist, stmt->options); + *processed = DoCopyTo(cstate); /* copy from database to file */ + } + PG_CATCH(); + { + PG_RE_THROW(); + } + PG_END_TRY(); + EndCopyTo(cstate); } + /* * Close the relation. If reading, we can release the AccessShareLock we * got; if writing, we should hold the lock until end of transaction to @@ -1921,10 +1923,31 @@ DoCopyTo(CopyState cstate) if (fe_copy) SendCopyBegin(cstate); - processed = CopyTo(cstate); + /* + * We want to dispatch COPY TO commands only in the case that + * we are the dispatcher and we are copying from a user relation + * (a relation where data is distributed in the segment databases). + * Otherwize, if we are not the dispatcher *or* if we are + * doing COPY (SELECT) we just go straight to work, without + * dispatching COPY commands to executors. + */ + // if (px_role == PX_ROLE_QC && cstate->rel && cstate->rel->rd_pxpolicy && px_enable_copy) + if (px_role == PX_ROLE_QC && cstate->rel && px_enable_copy) + processed = CopyToDispatch(cstate); + else + processed = CopyTo(cstate); if (fe_copy) SendCopyEnd(cstate); + else if (px_role == PX_ROLE_PX) + { + /* + * For COPY ON SEGMENT command, switch back to front end + * before sending copy end which is "\." + */ + cstate->copy_dest = COPY_NEW_FE; + SendCopyEnd(cstate); + } } PG_CATCH(); { @@ -1933,6 +1956,9 @@ DoCopyTo(CopyState cstate) * okay to do this in all cases, since it does nothing if the mode is * not on. */ + if (px_role == PX_ROLE_PX && px_enable_copy) + cstate->copy_dest = COPY_NEW_FE; + pq_endcopyout(true); PG_RE_THROW(); } @@ -1960,6 +1986,167 @@ EndCopyTo(CopyState cstate) EndCopy(cstate); } +/* + * Copy FROM relation TO file, in the dispatcher. Starts a COPY TO command on + * each of the executors and gathers all the results and writes it out. 
+ */ +static uint64 +CopyToDispatch(CopyState cstate) +{ + CopyStmt *stmt = glob_copystmt; + TupleDesc tupDesc; + int num_phys_attrs; + int attr_count; + FormData_pg_attribute *attr; + PxCopy *pxCopy; + uint64 processed = 0; + + tupDesc = cstate->rel->rd_att; + attr = tupDesc->attrs; + num_phys_attrs = tupDesc->natts; + attr_count = list_length(cstate->attnumlist); + + /* We use fe_msgbuf as a per-row buffer regardless of copy_dest */ + cstate->fe_msgbuf = makeStringInfo(); + + pxCopy = makePxCopy(cstate, false); + + /* XXX: lock all partitions */ + + /* + * Start a COPY command in every db of every segment in Greenplum Database. + * + * From this point in the code we need to be extra careful + * about error handling. ereport() must not be called until + * the COPY command sessions are closed on the executors. + * Calling ereport() will leave the executors hanging in + * COPY state. + */ + elog(DEBUG5, "COPY command sent to segdbs"); + + PG_TRY(); + { + bool done; + + pxCopyStart(pxCopy, stmt, cstate->file_encoding); + + if (cstate->binary) + { + /* Generate header for a binary copy */ + int32 tmp; + + /* Signature */ + CopySendData(cstate, (char *) BinarySignature, 11); + /* Flags field */ + tmp = 0; + CopySendInt32(cstate, tmp); + /* No header extension */ + tmp = 0; + CopySendInt32(cstate, tmp); + } + + /* if a header has been requested send the line */ + if (cstate->header_line) + { + ListCell *cur; + bool hdr_delim = false; + + /* + * For non-binary copy, we need to convert null_print to client + * encoding, because it will be sent directly with CopySendString. + * + * MPP: in here we only care about this if we need to print the + * header. We rely on the segdb server copy out to do the conversion + * before sending the data rows out. We don't need to repeat it here + */ + if (cstate->need_transcoding) + cstate->null_print = (char *) + // pg_server_to_custom(cstate->null_print, + // strlen(cstate->null_print), + // cstate->file_encoding, + // cstate->enc_conversion_proc); + pg_server_to_any(cstate->null_print, + strlen(cstate->null_print), + cstate->file_encoding); + + foreach(cur, cstate->attnumlist) + { + int attnum = lfirst_int(cur); + char *colname; + + if (hdr_delim) + CopySendChar(cstate, cstate->delim[0]); + hdr_delim = true; + + colname = NameStr(attr[attnum - 1].attname); + + CopyAttributeOutCSV(cstate, colname, false, + list_length(cstate->attnumlist) == 1); + } + + /* add a newline and flush the data */ + CopySendEndOfRow(cstate); + } + + /* + * This is the main work-loop. In here we keep collecting data from the + * COPY commands on the segdbs, until no more data is available. We + * keep writing data out a chunk at a time. + */ + do + { + bool copy_cancel = (QueryCancelPending ? true : false); + + /* get a chunk of data rows from the QE's */ + done = pxCopyGetData(pxCopy, copy_cancel, &processed); + + /* send the chunk of data rows to destination (file or stdout) */ + if (pxCopy->copy_out_buf.len > 0) /* conditional is important! */ + { + /* + * in the dispatcher we receive chunks of whole rows with row endings. + * We don't want to use CopySendEndOfRow() b/c it adds row endings and + * also b/c it's intended for a single row at a time. Therefore we need + * to fill in the out buffer and just flush it instead. 
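+ * (CopyToDispatchFlush() writes fe_msgbuf to the destination as-is,
+ * without appending a row terminator.)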
+ */ + CopySendData(cstate, (void *) pxCopy->copy_out_buf.data, pxCopy->copy_out_buf.len); + CopyToDispatchFlush(cstate); + } + } while(!done); + + pxCopyEnd(pxCopy, NULL, NULL); + + /* now it's safe to destroy the whole dispatcher state */ + PxDispatchCopyEnd(pxCopy); + } + /* catch error from CopyStart, CopySendEndOfRow or CopyToDispatchFlush */ + PG_CATCH(); + { + MemoryContext oldcontext = MemoryContextSwitchTo(cstate->copycontext); + + pxCopyAbort(pxCopy); + + MemoryContextSwitchTo(oldcontext); + PG_RE_THROW(); + } + PG_END_TRY(); + + if (cstate->binary) + { + /* Generate trailer for a binary copy */ + CopySendInt16(cstate, -1); + /* Need to flush out the trailer */ + CopySendEndOfRow(cstate); + } + + /* we can throw the error now if QueryCancelPending was set previously */ + CHECK_FOR_INTERRUPTS(); + + pfree(pxCopy); + + return processed; +} + /* * Copy from relation or query TO file. */ diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c index 14db48dddc8..d8fd50da89e 100644 --- a/src/backend/nodes/outfuncs.c +++ b/src/backend/nodes/outfuncs.c @@ -2941,6 +2941,22 @@ _outDeclareCursorStmt(StringInfo str, const DeclareCursorStmt *node) WRITE_NODE_FIELD(query); } +// #ifndef COMPILING_BINARY_FUNCS +static void +_outCopyStmt(StringInfo str, const CopyStmt *node) +{ + WRITE_NODE_TYPE("COPYSTMT"); + + WRITE_NODE_FIELD(relation); + WRITE_NODE_FIELD(attlist); + WRITE_BOOL_FIELD(is_from); + WRITE_BOOL_FIELD(is_program); + WRITE_STRING_FIELD(filename); + WRITE_NODE_FIELD(options); + // WRITE_NODE_FIELD(sreh); +} +// #endif/* COMPILING_BINARY_FUNCS */ + static void _outSelectStmt(StringInfo str, const SelectStmt *node) { @@ -4592,6 +4608,9 @@ outNode(StringInfo str, const void *obj) case T_DeclareCursorStmt: _outDeclareCursorStmt(str, obj); break; + case T_CopyStmt: + _outCopyStmt(str, obj); + break; case T_SelectStmt: _outSelectStmt(str, obj); break; diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c index e90b37c851b..ce24b328dee 100644 --- a/src/backend/nodes/readfuncs.c +++ b/src/backend/nodes/readfuncs.c @@ -2978,6 +2978,21 @@ _readDMLActionExpr(void) READ_DONE(); } + +static CopyStmt * +_readCopyStmt(void) +{ + READ_LOCALS(CopyStmt); + + READ_NODE_FIELD(relation); + READ_NODE_FIELD(attlist); + READ_BOOL_FIELD(is_from); + READ_BOOL_FIELD(is_program); + READ_STRING_FIELD(filename); + READ_NODE_FIELD(options); + + READ_DONE(); +} /* POLAR end */ @@ -3265,6 +3280,8 @@ parseNodeString(void) return_value = _readSplitUpdate(); else if (MATCH("DMLACTIONEXPR", 13)) return_value = _readDMLActionExpr(); + else if (MATCH("COPYSTMT", 8)) + return_value = _readCopyStmt(); /* POLAR end */ else { diff --git a/src/backend/px/dispatcher/px_disp_query.c b/src/backend/px/dispatcher/px_disp_query.c index da92c1aeeb2..f4cc7da1c29 100644 --- a/src/backend/px/dispatcher/px_disp_query.c +++ b/src/backend/px/dispatcher/px_disp_query.c @@ -32,6 +32,7 @@ #include "utils/typcache.h" #include "px/px_conn.h" +#include "px/px_copy.h" #include "px/px_disp.h" #include "px/px_disp_query.h" #include "px/px_dispatchresult.h" @@ -99,6 +100,11 @@ static int fillSliceVector(SliceTable * sliceTable, static char *buildPXQueryString(DispatchCommandQueryParms *pQueryParms, int *finalLen); static DispatchCommandQueryParms *pxdisp_buildPlanQueryParms(struct QueryDesc *queryDesc, bool planRequiresTxn); +static DispatchCommandQueryParms *pxdisp_buildUtilityQueryParms(struct Node *stmt, int flags, List *oid_assignments); + +static void pxdisp_dispatchCommandInternal(DispatchCommandQueryParms 
*pQueryParms, + int flags, List *segments, + PxPgResults *cdb_pgresults); static void pxdisp_dispatchX(QueryDesc *queryDesc, bool planRequiresTxn, @@ -251,6 +257,75 @@ pxdisp_buildPlanQueryParms(struct QueryDesc *queryDesc, return pQueryParms; } +static DispatchCommandQueryParms * +pxdisp_buildUtilityQueryParms(struct Node *stmt, + int flags, + List *oid_assignments) +{ + char *serializedPlantree = NULL; + char *serializedQueryDispatchDesc = NULL; + char *sparams; + int serializedPlantree_len = 0; + int serializedQueryDispatchDesc_len = 0; + int sparams_len = 0; + QueryDispatchDesc *qddesc; + PlannedStmt *pstmt; + DispatchCommandQueryParms *pQueryParms; + Oid save_userid; + + Assert(stmt != NULL); + Assert(stmt->type < 1000); + Assert(stmt->type > 0); + + /* Wrap it in a PlannedStmt */ + pstmt = makeNode(PlannedStmt); + pstmt->commandType = CMD_UTILITY; + + /* + * We must set q->canSetTag = true. False would be used to hide a command + * introduced by rule expansion which is not allowed to return its + * completion status in the command tag (PQcmdStatus/PQcmdTuples). For + * example, if the original unexpanded command was SELECT, the status + * should come back as "SELECT n" and should not reflect other commands + * inserted by rewrite rules. True means we want the status. + */ + pstmt->canSetTag = true; + pstmt->utilityStmt = stmt; + pstmt->stmt_location = 0; + pstmt->stmt_len = 0; + + /* + * serialized the stmt tree, and create the sql statement: mppexec .... + */ + serializedPlantree = serializeNode((Node *) pstmt, &serializedPlantree_len, + NULL /* uncompressed_size */ ); + Assert(serializedPlantree != NULL); + + if (oid_assignments) + { + qddesc = makeNode(QueryDispatchDesc); + qddesc->oidAssignments = oid_assignments; + // GetUserIdAndSecContext(&save_userid, &qddesc->secContext); + serializedQueryDispatchDesc = serializeNode((Node *) qddesc, &serializedQueryDispatchDesc_len, + NULL /* uncompressed_size */ ); + } + + pQueryParms = palloc0(sizeof(*pQueryParms)); + pQueryParms->strCommand = PointerIsValid(debug_query_string) ? debug_query_string : ""; + // pQueryParms->serializedQuerytree = NULL; + // pQueryParms->serializedQuerytreelen = 0; + pQueryParms->serializedPlantree = serializedPlantree; + pQueryParms->serializedPlantreelen = serializedPlantree_len; + // pQueryParms->serializedParams = sparams; + // pQueryParms->serializedParamslen = sparams_len; + pQueryParms->serializedQueryDispatchDesc = serializedQueryDispatchDesc; + pQueryParms->serializedQueryDispatchDesclen = serializedQueryDispatchDesc_len; + // pQueryParms->serializedSnapshot = pxsn_get_serialized_snapshot(); + // pQueryParms->serializedSnapshotlen = pxsn_get_serialized_snapshot_size(); + + return pQueryParms; +} + /* * Three Helper functions for pxdisp_dispatchX: * @@ -968,3 +1043,192 @@ deserializeParamListInfo(const char *str, int slen) return paramLI; } + +/* + * PxDispatchCopyStart allocate a writer gang and + * dispatch the COPY command to segments. + * + * In COPY protocol, after a COPY command is dispatched, a response + * to this will be a PGresult object bearing a status code of + * PGRES_COPY_OUT or PGRES_COPY_IN, then client can use APIs like + * PQputCopyData/PQgetCopyData to copy in/out data. + * + * pxdisp_checkDispatchResult() will block until all connections + * has issued a PGRES_COPY_OUT/PGRES_COPY_IN PGresult response. 
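+ *
+ * The dispatcher state is kept alive in pxCopy->dispatcherState because the
+ * following PQputCopyData()/PQgetCopyData() calls still use those
+ * connections; it is destroyed later by PxDispatchCopyEnd().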
+ */ +void +PxDispatchCopyStart(struct PxCopy *pxCopy, Node *stmt, int flags) +{ + DispatchCommandQueryParms *pQueryParms; + char *queryText; + int queryTextLength; + PxDispatcherState *ds; + Gang *primaryGang; + ErrorData *error = NULL; + // bool needTwoPhase = flags & DF_NEED_TWO_PHASE; + + + // elogif(log_min_messages <= DEBUG5, LOG, + // "PxDispatchCopyStart: %s (needTwoPhase = %s)", + // (PointerIsValid(debug_query_string) ? debug_query_string : "\"\""), + // (needTwoPhase ? "true" : "false")); + + pQueryParms = pxdisp_buildUtilityQueryParms(stmt, flags, NULL); + + /* + * Dispatch the command. + */ + ds = pxdisp_makeDispatcherState(false); + + queryText = buildPXQueryString(pQueryParms, &queryTextLength); + + /* + * Allocate a primary QE for every available segDB in the system. + */ + primaryGang = AllocateGang(ds, GANGTYPE_PRIMARY_WRITER, pxCopy->seglist); + Assert(primaryGang); + + pxdisp_makeDispatchResults(ds, 1, flags & DF_CANCEL_ON_ERROR); + pxdisp_makeDispatchParams (ds, 1, queryText, queryTextLength); + + pxdisp_dispatchToGang(ds, primaryGang, -1); + // if ((flags & DF_NEED_TWO_PHASE) != 0 || isDtxExplicitBegin()) + // addToGxactDtxSegments(primaryGang); + + pxdisp_waitDispatchFinish(ds); + + pxdisp_checkDispatchResult(ds, DISPATCH_WAIT_NONE); + + if (!pxdisp_getDispatchResults(ds, &error)) + { + FlushErrorState(); + ReThrowError(error); + } + + /* + * Notice: Do not call pxdisp_finishCommand to destroy dispatcher state, + * following PQputCopyData/PQgetCopyData will be called on those connections + */ + pxCopy->dispatcherState = ds; +} + +void +PxDispatchCopyEnd(struct PxCopy *pxCopy) +{ + PxDispatcherState *ds; + + ds = pxCopy->dispatcherState; + pxCopy->dispatcherState = NULL; + pxdisp_destroyDispatcherState(ds); +} + + +/* + * PxDispatchUtilityStatement + * + * Dispatch an already parsed statement to all primary writer QEs, wait until + * all QEs finished successfully. If one or more QEs got error, + * throw an Error. + * + * -flags: + * Is the combination of DF_NEED_TWO_PHASE, DF_WITH_SNAPSHOT,DF_CANCEL_ON_ERROR + * + * -px_pgresults: + * Indicate whether return the pg_result for each QE connection. + * + */ +void +PxDispatchUtilityStatement(struct Node *stmt, + int flags, + List *oid_assignments, + PxPgResults *px_pgresults) +{ + DispatchCommandQueryParms *pQueryParms; + bool needTwoPhase = flags & DF_NEED_TWO_PHASE; + + // if (needTwoPhase) + // setupDtxTransaction(); + + // elogif((Debug_print_full_dtm || log_min_messages <= DEBUG5), LOG, + // "PxDispatchUtilityStatement: %s (needTwoPhase = %s)", + // (PointerIsValid(debug_query_string) ? debug_query_string : "\"\""), + // (needTwoPhase ? "true" : "false")); + + pQueryParms = pxdisp_buildUtilityQueryParms(stmt, flags, oid_assignments); + + return pxdisp_dispatchCommandInternal(pQueryParms, + flags, + pxcomponent_getPxComponentsList(), + px_pgresults); +} + +static void +pxdisp_dispatchCommandInternal(DispatchCommandQueryParms *pQueryParms, + int flags, + List *segments, + PxPgResults *px_pgresults) +{ + PxDispatcherState *ds; + Gang *primaryGang; + PxDispatchResults *pr; + ErrorData *qeError = NULL; + char *queryText; + int queryTextLength; + + /* + * Dispatch the command. + */ + ds = pxdisp_makeDispatcherState(false); + + /* + * Reader gangs use local snapshot to access catalog, as a result, it will + * not synchronize with the global snapshot from write gang which will lead + * to inconsistent visibilty of catalog table. Considering the case: + * + * select * from t, t t1; -- create a reader gang. 
+ * begin; + * create role r1; + * set role r1; -- set command will also dispatched to idle reader gang + * + * When set role command dispatched to reader gang, reader gang cannot see + * the new tuple t1 in catalog table pg_auth. + * To fix this issue, we should drop the idle reader gangs after each + * utility statement which may modify the catalog table. + */ + // ds->destroyIdleReaderGang = true; + + queryText = buildPXQueryString(pQueryParms, &queryTextLength); + + /* + * Allocate a primary QE for every available segDB in the system. + */ + primaryGang = AllocateGang(ds, GANGTYPE_PRIMARY_WRITER, segments); + Assert(primaryGang); + + pxdisp_makeDispatchResults(ds, 1, flags & DF_CANCEL_ON_ERROR); + pxdisp_makeDispatchParams (ds, 1, queryText, queryTextLength); + + pxdisp_dispatchToGang(ds, primaryGang, -1); + + // if ((flags & DF_NEED_TWO_PHASE) != 0 || isDtxExplicitBegin()) + // addToGxactDtxSegments(primaryGang); + + pxdisp_waitDispatchFinish(ds); + + pxdisp_checkDispatchResult(ds, DISPATCH_WAIT_NONE); + + pr = pxdisp_getDispatchResults(ds, &qeError); + + if (qeError) + { + FlushErrorState(); + ReThrowError(qeError); + } + + /* collect pgstat from QEs for current transaction level */ + // pgstat_combine_from_qe(pr, -1); + + pxdisp_returnResults(pr, px_pgresults); + + pxdisp_destroyDispatcherState(ds); +} diff --git a/src/backend/px/dispatcher/px_dispatchresult.c b/src/backend/px/dispatcher/px_dispatchresult.c index b87c40e8df9..9e56960e100 100644 --- a/src/backend/px/dispatcher/px_dispatchresult.c +++ b/src/backend/px/dispatcher/px_dispatchresult.c @@ -26,6 +26,9 @@ #include "px/px_vars.h" #include "utils/faultinjector.h" +static int pxdisp_snatchPGresults(PxDispatchResult *dispatchResult, + struct pg_result **pgresultptrs, int maxresults); + static void noTrailingNewlinePQ(PQExpBuffer buf) { @@ -731,6 +734,49 @@ pxdisp_resultEnd(PxDispatchResults *results, int sliceIndex) return &results->resultArray[si->resultEnd]; } +void +pxdisp_returnResults(PxDispatchResults *primaryResults, PxPgResults *px_pgresults) +{ + PxDispatchResult *dispatchResult; + int nslots; + int nresults = 0; + int i; + + if (!primaryResults || !px_pgresults) + return; + + /* + * Allocate result set ptr array. The caller must PQclear() each PGresult + * and free() the array. + */ + nslots = 0; + + for (i = 0; i < primaryResults->resultCount; ++i) + nslots += pxdisp_numPGresult(&primaryResults->resultArray[i]); + + px_pgresults->pg_results = (struct pg_result **) palloc0(nslots * sizeof(struct pg_result *)); + + /* + * Collect results from primary gang. + */ + for (i = 0; i < primaryResults->resultCount; ++i) + { + dispatchResult = &primaryResults->resultArray[i]; + + /* + * Take ownership of this QE's PGresult object(s). + */ + nresults += pxdisp_snatchPGresults(dispatchResult, + px_pgresults->pg_results + nresults, + nslots - nresults); + } + + Assert(nresults == nslots); + + /* tell the caller how many sets we're returning. */ + px_pgresults->numResults = nresults; +} + /* * used in the interconnect on the dispatcher to avoid error-cleanup deadlocks. 
*/ @@ -792,3 +838,59 @@ pxdisp_makeDispatchResults(PxDispatcherState *ds, ds->primaryResults = results; } + +void +pxdisp_clearPxPgResults(PxPgResults *px_pgresults) +{ + int i = 0; + + if (!px_pgresults) + return; + + for (i = 0; i < px_pgresults->numResults; i++) + PQclear(px_pgresults->pg_results[i]); + + if (px_pgresults->pg_results) + { + pfree(px_pgresults->pg_results); + px_pgresults->pg_results = NULL; + } + + px_pgresults->numResults = 0; +} + +/* + * Remove all of the PGresult ptrs from a PxDispatchResult object + * and place them into an array provided by the caller. The caller + * becomes responsible for PQclear()ing them. Returns the number of + * PGresult ptrs placed in the array. + */ +static int +pxdisp_snatchPGresults(PxDispatchResult *dispatchResult, + struct pg_result **pgresultptrs, int maxresults) +{ + PQExpBuffer buf = dispatchResult->resultbuf; + PGresult **begp = (PGresult **) buf->data; + PGresult **endp = (PGresult **) (buf->data + buf->len); + PGresult **p; + int nresults = 0; + + /* + * Snatch the PGresult objects. + */ + for (p = begp; p < endp; ++p) + { + Assert(*p != NULL); + Assert(nresults < maxresults); + pgresultptrs[nresults++] = *p; + *p = NULL; + } + + /* + * Empty our PGresult array. + */ + resetPQExpBuffer(buf); + dispatchResult->okindex = -1; + + return nresults; +} \ No newline at end of file diff --git a/src/backend/px/dispatcher/px_gang.c b/src/backend/px/dispatcher/px_gang.c index 8e5a0f608ea..8a2f24c273c 100644 --- a/src/backend/px/dispatcher/px_gang.c +++ b/src/backend/px/dispatcher/px_gang.c @@ -34,6 +34,7 @@ #include "utils/guc.h" #include "px/px_conn.h" /* PxWorkerDescriptor */ +#include "px/px_copy.h" #include "px/px_disp.h" #include "px/px_disp_query.h" #include "px/px_gang.h" /* me */ @@ -575,6 +576,23 @@ makePxProcess(PxWorkerDescriptor *pxWorkerDesc) return process; } +struct PxWorkerDescriptor * +getSegmentDescriptorFromGang(const Gang *gp, int seg) +{ + int i = 0; + + if (gp == NULL) + return NULL; + + for (i = 0; i < gp->size; i++) + { + if (gp->db_descriptors[i]->logicalWorkerInfo.idx == seg) + return gp->db_descriptors[i]; + } + + return NULL; +} + /* * Create a list of PxProcess and initialize with Gang information. * diff --git a/src/backend/px/px_copy.c b/src/backend/px/px_copy.c new file mode 100644 index 00000000000..5bdef8fec0f --- /dev/null +++ b/src/backend/px/px_copy.c @@ -0,0 +1,727 @@ +/*-------------------------------------------------------------------------- + * + * px_copy.c + * Provides routines that executed a COPY command on an MPP cluster. These + * routines are called from the backend COPY command whenever MPP is in the + * default dispatch mode. + * + * Usage: + * + * PxCopy pxCopy = makePxCopy(); + * + * PG_TRY(); + * { + * pxCopyStart(pxCopy, ...); + * + * // process each row + * while (...) + * { + * pxCopyGetData(pxCopy, ...) + * or + * pxCopySendData(pxCopy, ...) + * } + * pxCopyEnd(pxCopy); + * } + * PG_CATCH(); + * { + * pxCopyAbort(pxCopy); + * } + * PG_END_TRY(); + * + * + * makePxCopy() creates a struct to hold information about the on-going COPY. + * It does not change the state of the connection yet. + * + * pxCopyStart() puts the connections in the gang into COPY mode. If an error + * occurs during or after pxCopyStart(), you must call pxCopyAbort() to reset + * the connections to normal state! + * + * pxCopyGetData() and pxCopySendData() call libpq's PQgetCopyData() and + * PQputCopyData(), respectively. If an error occurs, it is thrown with ereport(). + * + * When you're done, call pxCopyEnd(). 
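+ *
+ * pxCopyEnd() drains the remaining PGresults from every executor, adds up
+ * the number of completed and rejected rows, and re-throws the first error
+ * reported by any executor.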
+ * + * Portions Copyright (c) 2005-2008, Greenplum inc + * Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates. + * + * + * IDENTIFICATION + * src/backend/px/px_copy.c +* +*-------------------------------------------------------------------------- +*/ + +#include "postgres.h" +#include "miscadmin.h" +#include "libpq-fe.h" +#include "libpq-int.h" +#include "access/xact.h" +#include "px/px_conn.h" +#include "px/px_copy.h" +#include "px/px_disp_query.h" +#include "px/px_dispatchresult.h" +// #include "px/px_fts.h" +#include "px/px_gang.h" +// #include "px/px_tm.h" +#include "px/px_vars.h" +#include "commands/copy.h" +#include "commands/defrem.h" +#include "mb/pg_wchar.h" +#include "nodes/makefuncs.h" +#include "pgstat.h" +#include "storage/pmsignal.h" +#include "tcop/tcopprot.h" +#include "utils/faultinjector.h" +#include "utils/relcache.h" +#include "utils/memutils.h" +#include "utils/rel.h" + +#include + +static void pxCopyEndInternal(PxCopy *c, char *abort_msg, + int64 *total_rows_completed_p, + int64 *total_rows_rejected_p); + +static Gang * +getPxCopyPrimaryGang(PxCopy *c) +{ + if (!c || !c->dispatcherState) + return NULL; + + return (Gang *)linitial(c->dispatcherState->allocatedGangs); +} + +/* + * Create a pxCopy object that includes all the px + * information and state needed by the backend COPY. + */ +PxCopy * +makePxCopy(CopyState cstate, bool is_copy_in) +{ + PxCopy *c; + PxPolicy *policy; + + /* initial replicated policy*/ + int numsegments = -1; + numsegments = pxnode_getPxNodes()->totalPxNodes + * polar_get_stmt_px_dop(); + policy = createReplicatedPolicy(numsegments); + Assert(policy); + + c = palloc0(sizeof(PxCopy)); + + /* fresh start */ + c->total_segs = 0; + c->copy_in = is_copy_in; + c->seglist = NIL; + c->dispatcherState = NULL; + initStringInfo(&(c->copy_out_buf)); + + + int i; + + c->total_segs = policy->numsegments; + + for (i = 0; i < c->total_segs; i++) + c->seglist = lappend_int(c->seglist, i); + + cstate->pxCopy = c; + + return c; +} + +/* + * starts a copy command on a specific segment database. + * + * may pg_throw via elog/ereport. + */ +void +pxCopyStart(PxCopy *c, CopyStmt *stmt, int file_encoding) +{ + int flags; + + stmt = copyObject(stmt); + + /* + * If the output needs to be in a different encoding, tell the segment. + * Normally, when we run normal queries, we keep the segment connections + * in database encoding, and do the encoding conversions in the QD, just + * before sending results to the client. But in COPY TO, we don't do + * any conversions to the data we receive from the segments, so they + * must produce the output in the correct encoding. + * + * We do this by adding "ENCODING 'xxx'" option to the options list of + * the CopyStmt that we dispatch. + */ + if (file_encoding != GetDatabaseEncoding()) + { + bool found; + ListCell *option; + + /* + * But first check if the encoding option is already in the options + * list (i.e the user specified it explicitly in the COPY command) + */ + found = false; + foreach(option, stmt->options) + { + DefElem *defel = (DefElem *) lfirst(option); + + if (strcmp(defel->defname, "encoding") == 0) + { + /* + * The 'file_encoding' came from the options, so they should match, but + * let's sanity-check. 
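+ * A mismatch would mean the file_encoding we were called with no longer
+ * agrees with the ENCODING option recorded in the statement being
+ * dispatched, so it is treated as an internal error below.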
+ */ + if (pg_char_to_encoding(defGetString(defel)) != file_encoding) + elog(ERROR, "encoding option in original COPY command does not match encoding being dispatched"); + found = true; + } + } + + if (!found) + { + const char *encname = pg_encoding_to_char(file_encoding); + + stmt->options = lappend(stmt->options, + makeDefElem("encoding", + (Node *) makeString(pstrdup(encname)), -1)); + } + } + + flags = DF_WITH_SNAPSHOT | DF_CANCEL_ON_ERROR; + if (c->copy_in) + flags |= DF_NEED_TWO_PHASE; + + PxDispatchCopyStart(c, (Node *) stmt, flags); + + SIMPLE_FAULT_INJECTOR("px_copy_start_after_dispatch"); +} + +/* + * sends data to a copy command on all segments. + */ +void +pxCopySendDataToAll(PxCopy *c, const char *buffer, int nbytes) +{ + Gang *px = getPxCopyPrimaryGang(c); + + Assert(px); + + // for (int i = 0; i < px->size; ++i) + // { + // int seg = px->db_descriptors[i]->segindex; + + // pxCopySendData(c, seg, buffer, nbytes); + // } +} + +/* + * sends data to a copy command on a specific segment (usually + * the hash result of the data value). + */ +void +pxCopySendData(PxCopy *c, int target_seg, const char *buffer, + int nbytes) +{ + PxWorkerDescriptor *q; + Gang *px; + int result; + + /* + * NOTE!! note that another DELIM was added, for the buf_converted in the + * code above. I didn't do it because it's broken right now + */ + + px = getPxCopyPrimaryGang(c); + Assert(px); + q = getSegmentDescriptorFromGang(px, target_seg); + + /* transmit the COPY data */ + result = PQputCopyData(q->conn, buffer, nbytes); + + if (result != 1) + { + if (result == 0) + { + /* We don't use blocking mode, so this shouldn't happen */ + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not send COPY data to segment %d, attempt blocked", + target_seg))); + } + else + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not send COPY data to segment %d: %s", + target_seg, PQerrorMessage(q->conn)))); + } +} + +/* + * gets a chunk of rows of data from a copy command. + * returns boolean true if done. Caller should still + * empty the leftovers in the outbuf in that case. + */ +bool +pxCopyGetData(PxCopy *c, bool copy_cancel, uint64 *rows_processed) +{ + PxWorkerDescriptor *q; + Gang *px; + int nbytes; + + /* clean out buf data */ + resetStringInfo(&c->copy_out_buf); + + px = getPxCopyPrimaryGang(c); + + /* + * MPP-7712: we used to issue the cancel-requests for each *row* we got + * back from each segment -- this is potentially millions of + * cancel-requests. Cancel requests consist of an out-of-band connection + * to the segment-postmaster, this is *not* a lightweight operation! 
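+ * Instead, when a cancel is pending we issue a single PQrequestCancel()
+ * per still-active segment here, before collecting the next chunk of rows.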
+ */ + if (copy_cancel) + { + ListCell *cur; + + /* iterate through all the segments that still have data to give */ + foreach(cur, c->seglist) + { + int source_seg = lfirst_int(cur); + + q = getSegmentDescriptorFromGang(px, source_seg); + + /* send a query cancel request to that segdb */ + PQrequestCancel(q->conn); + } + } + + /* + * Collect data rows from the segments that still have rows to give until + * chunk minimum size is reached + */ + while (c->copy_out_buf.len < COPYOUT_CHUNK_SIZE) + { + ListCell *cur; + + /* iterate through all the segments that still have data to give */ + foreach(cur, c->seglist) + { + int source_seg = lfirst_int(cur); + char *buffer; + + q = getSegmentDescriptorFromGang(px, source_seg); + + /* get 1 row of COPY data */ + nbytes = PQgetCopyData(q->conn, &buffer, false); + + /* + * SUCCESS -- got a row of data + */ + if (nbytes > 0 && buffer) + { + /* append the data row to the data chunk */ + appendBinaryStringInfo(&(c->copy_out_buf), buffer, nbytes); + + /* increment the rows processed counter for the end tag */ + (*rows_processed)++; + + PQfreemem(buffer); + } + + /* + * DONE -- Got all the data rows from this segment, or a cancel + * request. + * + * Remove the segment that completed sending data, from the list + * of in-progress segments. + * + * Note: After PQgetCopyData() returns -1, you need to call + * PGgetResult() to get any possible errors. But we don't do that + * here. That's done later, in the call to pxCopyEnd() (or + * pxCopyAbort(), if something went wrong.) + */ + else if (nbytes == -1) + { + c->seglist = list_delete_int(c->seglist, source_seg); + + if (list_length(c->seglist) == 0) + return true; /* all segments are done */ + + /* start over from first seg as we just changed the seg list */ + break; + } + /* + * ERROR! + */ + else + { + /* + * should never happen since we are blocking. Don't bother to + * try again, exit with error. + */ + if (nbytes == 0) + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not send COPY data to segment %d, attempt blocked", + source_seg))); + + if (nbytes == -2) + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not receive COPY data from segment %d: %s", + source_seg, PQerrorMessage(q->conn)))); + } + } + + if (c->copy_out_buf.len > COPYOUT_CHUNK_SIZE) + break; + } + + return false; +} + +/* + * Commands to end the pxCopy. + * + * If an error occurrs, or if an error is reported by one of the segments, + * pxCopyEnd() throws it with ereport(), after closing the COPY and cleaning + * up any resources associated with it. + * + * pxCopyAbort() usually does not throw an error. It is used in error-recovery + * codepaths, typically in a PG_CATCH() block, and the caller is about to + * re-throw the original error that caused the abortion. 
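+ *
+ * Both entry points go through pxCopyEndInternal(); pxCopyAbort() passes an
+ * abort message, which is forwarded to the executors via PQputCopyEnd().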
+ */ +void +pxCopyAbort(PxCopy *c) +{ + pxCopyEndInternal(c, "aborting COPY in PX due to error in QC", + NULL, NULL); +} + +/* + * End the copy command on all segment databases, + * and fetch the total number of rows completed by all QEs + */ +void +pxCopyEnd(PxCopy *c, + int64 *total_rows_completed_p, + int64 *total_rows_rejected_p) +{ + CHECK_FOR_INTERRUPTS(); + + pxCopyEndInternal(c, NULL, + total_rows_completed_p, + total_rows_rejected_p); +} + +static void +pxCopyEndInternal(PxCopy *c, char *abort_msg, + int64 *total_rows_completed_p, + int64 *total_rows_rejected_p) +{ + Gang *gp; + int num_bad_connections = 0; + int64 total_rows_completed = 0; /* total num rows completed by all + * QEs */ + int64 total_rows_rejected = 0; /* total num rows rejected by all + * QEs */ + ErrorData *first_error = NULL; + int seg; + struct pollfd *pollRead; + bool io_errors = false; + StringInfoData io_err_msg; + // List *oidList = NIL; + int nest_level; + + SIMPLE_FAULT_INJECTOR("px_copy_end_internal_start"); + + initStringInfo(&io_err_msg); + + /* + * Don't try to end a copy that already ended with the destruction of the + * writer gang. We know that this has happened if the PxCopy's + * primary_writer is NULL. + * + * GPDB_91_MERGE_FIXME: ugh, this is nasty. We shouldn't be calling + * pxCopyEnd twice on the same PxCopy in the first place! + */ + gp = getPxCopyPrimaryGang(c); + if (!gp) + { + if (total_rows_completed_p != NULL) + *total_rows_completed_p = 0; + if (total_rows_rejected_p != NULL) + *total_rows_completed_p = -1; + return; + } + + /* + * In COPY in mode, call PQputCopyEnd() to tell the segments that we're done. + */ + if (c->copy_in) + { + for (seg = 0; seg < gp->size; seg++) + { + PxWorkerDescriptor *q = gp->db_descriptors[seg]; + int result; + + elog(DEBUG1, "PQputCopyEnd seg %d ", q->logicalWorkerInfo.idx); + /* end this COPY command */ + result = PQputCopyEnd(q->conn, abort_msg); + + /* get command end status */ + if (result == -1) + { + /* error */ + appendStringInfo(&io_err_msg, + "Failed to send end-of-copy to segment %d: %s", + seg, PQerrorMessage(q->conn)); + io_errors = true; + } + if (result == 0) + { + /* attempt blocked */ + + /* + * CDB TODO: Can this occur? The libpq documentation says, "this + * case is only possible if the connection is in nonblocking + * mode... wait for write-ready and try again", i.e., the proper + * response would be to retry, not error out. + */ + appendStringInfo(&io_err_msg, + "primary segment %d, dbid %d, attempt blocked\n", + seg, q->pxNodeInfo->config->dbid); + io_errors = true; + } + } + } + + nest_level = GetCurrentTransactionNestLevel(); + + pollRead = (struct pollfd *) palloc(sizeof(struct pollfd)); + for (seg = 0; seg < gp->size; seg++) + { + PxWorkerDescriptor *q = gp->db_descriptors[seg]; + int result; + PGresult *res; + int64 segment_rows_completed = 0; /* # of rows completed by this QE */ + int64 segment_rows_rejected = 0; /* # of rows rejected by this QE */ + + pollRead->fd = PQsocket(q->conn); + pollRead->events = POLLIN; + pollRead->revents = 0; + + while (PQisBusy(q->conn) && PQstatus(q->conn) == CONNECTION_OK) + { + if ((px_role == PX_ROLE_QC) && IS_PX_NEED_CANCELED()) + { + PQrequestCancel(q->conn); + } + + if (poll(pollRead, 1, 200) > 0) + { + break; + } + } + + forwardPXNotices(); + + /* + * Fetch any error status existing on completion of the COPY command. + * It is critical that for any connection that had an asynchronous + * command sent thru it, we call PQgetResult until it returns NULL. 
+ * Otherwise, the next time a command is sent to that connection, it + * will return an error that there's a command pending. + */ + HOLD_INTERRUPTS(); + while ((res = PQgetResult(q->conn)) != NULL && PQstatus(q->conn) != CONNECTION_BAD) + { + elog(DEBUG1, "PQgetResult got status %d seg %d ", + PQresultStatus(res), q->logicalWorkerInfo.idx); + + forwardPXNotices(); + + /* if the COPY command had a data error */ + if (PQresultStatus(res) == PGRES_FATAL_ERROR) + { + /* + * Always append error from the primary. Append error from + * mirror only if its primary didn't have an error. + * + * For now, we only report the first error we get from the + * QE's. + * + * We get the error message in pieces so that we could append + * whoami to the primary error message only. + */ + if (!first_error) + first_error = pxdisp_get_PXerror(res); + } + + // pgstat_combine_one_qe_result(&oidList, res, nest_level, q->logicalWorkerInfo.idx); + + // if (q->conn->wrote_xlog) + // { + // MarkTopTransactionWriteXLogOnExecutor(); + + // /* + // * Reset the worte_xlog here. Since if the received pgresult not process + // * the xlog write message('x' message sends from QE in ReadyForQuery), + // * the value may still refer to previous dispatch statement. Which may + // * always mark current top transaction has wrote xlog on executor. + // */ + // q->conn->wrote_xlog = false; + // } + + /* + * If we are still in copy mode, tell QE to stop it. COPY_IN + * protocol has a way to say 'end of copy' but COPY_OUT doesn't. + * We have no option but sending cancel message and consume the + * output until the state transition to non-COPY. + */ + if (PQresultStatus(res) == PGRES_COPY_IN) + { + elog(LOG, "Segment still in copy in, retrying the putCopyEnd"); + result = PQputCopyEnd(q->conn, NULL); + } + else if (PQresultStatus(res) == PGRES_COPY_OUT) + { + char *buffer = NULL; + int ret; + + elog(LOG, "Segment still in copy out, canceling QE"); + + /* + * I'm a bit worried about sending a cancel, as if this is a + * success case the QE gets inconsistent state than QD. But + * this code path is mostly for error handling and in a + * success case we wouldn't see COPY_OUT here. It's not clear + * what to do if this cancel failed, since this is not a path + * we can error out. FATAL maybe the way, but I leave it for + * now. + */ + PQrequestCancel(q->conn); + + /* + * Need to consume data from the QE until cancellation is + * recognized. PQgetCopyData() returns -1 when the COPY is + * done, a non-zero result indicates data was returned and in + * that case we'll drop it immediately since we aren't + * interested in the contents. + */ + while ((ret = PQgetCopyData(q->conn, &buffer, false)) != -1) + { + if (ret > 0) + { + if (buffer) + PQfreemem(buffer); + continue; + } + + /* An error occurred, log the error and break out */ + if (ret == -2) + { + ereport(WARNING, + (errmsg("Error during cancellation: \"%s\"", + PQerrorMessage(q->conn)))); + break; + } + } + if (buffer) + PQfreemem(buffer); + } + + /* in SREH mode, check if this seg rejected (how many) rows */ + if (res->numRejected > 0) + segment_rows_rejected = res->numRejected; + + /* + * When COPY FROM, need to calculate the number of this + * segment's completed rows + */ + if (res->numCompleted > 0) + segment_rows_completed = res->numCompleted; + + /* free the PGresult object */ + PQclear(res); + } + RESUME_INTERRUPTS(); + + /* + * add up the number of rows completed and rejected from this segment + * to the totals. Only count from primary segs. 
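+ * The totals are handed back to the caller through total_rows_completed_p
+ * and total_rows_rejected_p once all segments have been processed.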
+ */ + if (segment_rows_rejected > 0) + total_rows_rejected += segment_rows_rejected; + if (segment_rows_completed > 0) + total_rows_completed += segment_rows_completed; + + /* Lost the connection? */ + if (PQstatus(q->conn) == CONNECTION_BAD) + { + /* command error */ + io_errors = true; + appendStringInfo(&io_err_msg, + "Primary segment %d, dbid %d, with error: %s\n", + seg, q->pxNodeInfo->config->dbid, + PQerrorMessage(q->conn)); + + /* Free the PGconn object. */ + PQfinish(q->conn); + q->conn = NULL; + + /* Let FTS deal with it! */ + num_bad_connections++; + } + } + + PxDispatchCopyEnd(c); + + // /* If lost contact with segment db, try to reconnect. */ + // if (num_bad_connections > 0) + // { + // elog(LOG, "error occurred while ending COPY: %s", io_err_msg.data); + // elog(LOG, "COPY signals FTS to probe segments"); + + // SendPostmasterSignal(PMSIGNAL_WAKEN_FTS); + // /* + // * Before error out, we need to reset the session. Gang will be cleaned up + // * when next transaction start, since it will find FTS version bump and + // * call pxcomponent_updatePxComponents(). + // */ + // resetSessionForPrimaryGangLoss(); + + // ereport(ERROR, + // (errcode(ERRCODE_PX_INTERCONNECTION_ERROR), + // (errmsg("MPP detected %d segment failures, system is reconnected", + // num_bad_connections)))); + // } + + /* + * Unless we are aborting the COPY, report any errors with ereport() + */ + if (!abort_msg) + { + /* errors reported by the segments */ + if (first_error) + { + FlushErrorState(); + ReThrowError(first_error); + } + + /* errors that occurred in the COPY itself */ + if (io_errors) + ereport(ERROR, + (errcode(ERRCODE_IO_ERROR), + errmsg("could not complete COPY on some segments"), + errdetail("%s", io_err_msg.data))); + } + + if (total_rows_completed_p != NULL) + *total_rows_completed_p = total_rows_completed; + if (total_rows_rejected_p != NULL) + *total_rows_rejected_p = total_rows_rejected; + return; +} diff --git a/src/backend/px/px_util.c b/src/backend/px/px_util.c index bcf570ba622..a37d666c394 100644 --- a/src/backend/px/px_util.c +++ b/src/backend/px/px_util.c @@ -730,3 +730,20 @@ getPxWorkerCount(void) numsegments = 1; return numsegments; } + +List * +pxcomponent_getPxComponentsList(void) +{ + PxNodes *px_nodes;; + List *segments = NIL; + int i; + + px_nodes = pxnode_getPxNodes(); + + for (i = 0; i < px_nodes->totalPxNodes; i++) + { + segments = lappend_int(segments, i); + } + + return segments; +} \ No newline at end of file diff --git a/src/backend/utils/misc/guc_px.c b/src/backend/utils/misc/guc_px.c index c3f8197b424..059c23112f7 100644 --- a/src/backend/utils/misc/guc_px.c +++ b/src/backend/utils/misc/guc_px.c @@ -60,6 +60,7 @@ bool px_optimizer_enable_relsize_collection = false; /* Optimizer related gucs */ bool polar_enable_px; +bool px_enable_copy; bool px_enable_executor; bool px_enable_join; bool px_enable_window_function; @@ -348,6 +349,16 @@ struct config_bool ConfigureNamesBool_px[] = NULL, NULL, NULL }, + { + {"polar_px_enable_copy", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enable px_enable_copy."), + NULL + }, + &px_enable_copy, + false, + NULL, NULL, NULL + }, + { {"polar_px_enable_executor", PGC_USERSET, QUERY_TUNING_METHOD, gettext_noop("Enable px_enable_executor."), diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h index f393e7e73d7..70e8fa455ff 100644 --- a/src/include/commands/copy.h +++ b/src/include/commands/copy.h @@ -14,15 +14,191 @@ #ifndef COPY_H #define COPY_H +#include "commands/trigger.h" #include "nodes/execnodes.h" 
#include "nodes/parsenodes.h" #include "parser/parse_node.h" +#include "executor/execPartition.h" #include "tcop/dest.h" +#include "px/px_hash.h" +#include "px/px_copy.h" + +/* + * Represents the different source/dest cases we need to worry about at + * the bottom level + */ +typedef enum CopyDest +{ + COPY_FILE, /* to/from file (or a piped program) */ + COPY_OLD_FE, /* to/from frontend (2.0 protocol) */ + COPY_NEW_FE, /* to/from frontend (3.0 protocol) */ + COPY_CALLBACK /* to/from callback function */ +} CopyDest; -/* CopyStateData is private in commands/copy.c */ -typedef struct CopyStateData *CopyState; typedef int (*copy_data_source_cb) (void *outbuf, int minread, int maxread); +/* + * Represents the end-of-line terminator type of the input + */ +typedef enum EolType +{ + EOL_UNKNOWN, + EOL_NL, + EOL_CR, + EOL_CRNL +} EolType; + +/* + * This struct contains all the state variables used throughout a COPY + * operation. For simplicity, we use the same struct for all variants of COPY, + * even though some fields are used in only some cases. + * + * Multi-byte encodings: all supported client-side encodings encode multi-byte + * characters by having the first byte's high bit set. Subsequent bytes of the + * character can have the high bit not set. When scanning data in such an + * encoding to look for a match to a single-byte (ie ASCII) character, we must + * use the full pg_encoding_mblen() machinery to skip over multibyte + * characters, else we might find a false match to a trailing byte. In + * supported server encodings, there is no possibility of a false match, and + * it's faster to make useless comparisons to trailing bytes than it is to + * invoke pg_encoding_mblen() to skip over them. encoding_embeds_ascii is true + * when we have to do it the hard way. + */ +typedef struct CopyStateData +{ + /* low-level state data */ + CopyDest copy_dest; /* type of copy source/destination */ + FILE *copy_file; /* used if copy_dest == COPY_FILE */ + StringInfo fe_msgbuf; /* used for all dests during COPY TO, only for + * dest == COPY_NEW_FE in COPY FROM */ + bool is_copy_from; /* COPY TO, or COPY FROM? */ + bool reached_eof; /* true if we read to end of copy data (not + * all copy_dest types maintain this) */ + EolType eol_type; /* EOL type of input */ + int file_encoding; /* file or remote side's character encoding */ + bool need_transcoding; /* file encoding diff from server? */ + bool encoding_embeds_ascii; /* ASCII can be non-first byte? */ + + /* parameters from the COPY command */ + Relation rel; /* relation to copy to or from */ + QueryDesc *queryDesc; /* executable query to copy from */ + List *attnumlist; /* integer list of attnums to copy */ + char *filename; /* filename, or NULL for STDIN/STDOUT */ + bool is_program; /* is 'filename' a program to popen? */ + copy_data_source_cb data_source_cb; /* function for reading data */ + bool binary; /* binary format? */ + bool oids; /* include OIDs? */ + bool freeze; /* freeze rows on loading? */ + bool csv_mode; /* Comma Separated Value format? */ + bool header_line; /* CSV header line? */ + char *null_print; /* NULL marker string (server encoding!) */ + int null_print_len; /* length of same */ + char *null_print_client; /* same converted to file encoding */ + char *delim; /* column delimiter (must be 1 byte) */ + char *quote; /* CSV quote char (must be 1 byte) */ + char *escape; /* CSV escape char (must be 1 byte) */ + List *force_quote; /* list of column names */ + bool force_quote_all; /* FORCE_QUOTE *? 
*/ + bool *force_quote_flags; /* per-column CSV FQ flags */ + List *force_notnull; /* list of column names */ + bool *force_notnull_flags; /* per-column CSV FNN flags */ + List *force_null; /* list of column names */ + bool *force_null_flags; /* per-column CSV FN flags */ + bool convert_selectively; /* do selective binary conversion? */ + List *convert_select; /* list of column names (can be NIL) */ + bool *convert_select_flags; /* per-column CSV/TEXT CS flags */ + + + /* these are just for error messages, see CopyFromErrorCallback */ + const char *cur_relname; /* table name for error messages */ + uint64 cur_lineno; /* line number for error messages */ + const char *cur_attname; /* current att for error messages */ + const char *cur_attval; /* current att value for error messages */ + + /* + * Working state for COPY TO/FROM + */ + MemoryContext copycontext; /* per-copy execution context */ + + /* + * Working state for COPY TO + */ + FmgrInfo *out_functions; /* lookup info for output functions */ + MemoryContext rowcontext; /* per-row evaluation context */ + + /* + * Working state for COPY FROM + */ + AttrNumber num_defaults; + bool file_has_oids; + FmgrInfo oid_in_function; + Oid oid_typioparam; + FmgrInfo *in_functions; /* array of input functions for each attrs */ + Oid *typioparams; /* array of element types for in_functions */ + int *defmap; /* array of default att numbers */ + ExprState **defexprs; /* array of default att expressions */ + bool volatile_defexprs; /* is any of defexprs volatile? */ + List *range_table; + + /* Tuple-routing support info */ + PartitionTupleRouting *partition_tuple_routing; + + TransitionCaptureState *transition_capture; + + /* + * These variables are used to reduce overhead in textual COPY FROM. + * + * attribute_buf holds the separated, de-escaped text for each field of + * the current line. The CopyReadAttributes functions return arrays of + * pointers into this buffer. We avoid palloc/pfree overhead by re-using + * the buffer on each cycle. + */ + StringInfoData attribute_buf; + + /* field raw data pointers found by COPY FROM */ + + int max_fields; + char **raw_fields; + + /* + * Similarly, line_buf holds the whole input line being processed. The + * input cycle is first to read the whole line into line_buf, convert it + * to server encoding there, and then extract the individual attribute + * fields into attribute_buf. line_buf is preserved unmodified so that we + * can display it in error messages if appropriate. + */ + StringInfoData line_buf; + bool line_buf_converted; /* converted to server encoding? */ + bool line_buf_valid; /* contains the row being processed? */ + + /* + * Finally, raw_buf holds raw data read from the data source (file or + * client connection). CopyReadLine parses this data sufficiently to + * locate line boundaries, then transfers the data to line_buf and + * converts it. Note: we guarantee that there is a \0 at + * raw_buf[raw_buf_len]. + */ +#define RAW_BUF_SIZE 65536 /* we palloc RAW_BUF_SIZE+1 bytes */ + char *raw_buf; + int raw_buf_index; /* next byte to process */ + int raw_buf_len; /* total # of bytes stored */ + + + /* Information on the connections to QEs. 
*/ + PxCopy *pxCopy; +} CopyStateData; + +typedef struct CopyStateData *CopyState; + +/* DestReceiver for COPY (query) TO */ +typedef struct +{ + DestReceiver pub; /* publicly-known function pointers */ + CopyState cstate; /* CopyStateData for the command */ + uint64 processed; /* # of tuples processed */ +} DR_copy; + + extern void DoCopy(ParseState *state, const CopyStmt *stmt, int stmt_location, int stmt_len, uint64 *processed); diff --git a/src/include/px/px_copy.h b/src/include/px/px_copy.h new file mode 100644 index 00000000000..9eaa1e48ce7 --- /dev/null +++ b/src/include/px/px_copy.h @@ -0,0 +1,55 @@ +/*-------------------------------------------------------------------------- + * + * px_copy.h + * Definitions and API functions for pxcopy.c + * These are functions that are used by the backend + * COPY command in Greenplum Database. + * + * Portions Copyright (c) 2005-2008, Greenplum inc + * Portions Copyright (c) 2012-Present Pivotal Software, Inc. + * + * + * IDENTIFICATION + * src/include/px/px_copy.h + * + *-------------------------------------------------------------------------- + */ + +#ifndef PXCOPY_H +#define PXCOPY_H + +#include "lib/stringinfo.h" +#include "px/px_gang.h" + +#define COPYOUT_CHUNK_SIZE 16 * 1024 + +struct PxDispatcherState; +struct CopyStateData; + +typedef struct PxCopy +{ + int total_segs; /* total number of segments in px */ + bool copy_in; /* direction: true for COPY FROM false for COPY TO */ + + StringInfoData copy_out_buf;/* holds a chunk of data from the database */ + + List *seglist; /* segs that currently take part in copy. + * for copy out, once a segment gave away all it's + * data rows, it is taken out of the list */ + struct PxDispatcherState *dispatcherState; +} PxCopy; + + + +/* global function declarations */ +extern PxCopy *makePxCopy(struct CopyStateData *cstate, bool copy_in); +extern void pxCopyStart(PxCopy *pxCopy, CopyStmt *stmt, int file_encoding); +extern void pxCopySendDataToAll(PxCopy *c, const char *buffer, int nbytes); +extern void pxCopySendData(PxCopy *c, int target_seg, const char *buffer, int nbytes); +extern bool pxCopyGetData(PxCopy *c, bool cancel, uint64 *rows_processed); +extern void pxCopyAbort(PxCopy *c); +extern void pxCopyEnd(PxCopy *c, + int64 *total_rows_completed_p, + int64 *total_rows_rejected_p); + +#endif /* PXCOPY_H */ \ No newline at end of file diff --git a/src/include/px/px_disp_query.h b/src/include/px/px_disp_query.h index ed5f7153d4e..dee0401bb0c 100644 --- a/src/include/px/px_disp_query.h +++ b/src/include/px/px_disp_query.h @@ -61,5 +61,13 @@ void PxDispatchPlan(struct QueryDesc *queryDesc, extern ParamListInfo deserializeParamListInfo(const char *str, int slen); +extern void PxDispatchUtilityStatement(struct Node *stmt, + int flags, + List *oid_assignments, + struct PxPgResults* cdb_pgresults); +extern void PxDispatchCopyStart(struct PxCopy *pxCopy, Node *stmt, int flags); +extern void PxDispatchCopyEnd(struct PxCopy *pxCopy); + + void px_log_querydesc(QueryDispatchDesc *ddesc); #endif /* PXDISP_QUERY_H */ diff --git a/src/include/px/px_dispatchresult.h b/src/include/px/px_dispatchresult.h index fed5c442575..39071dbf8a1 100644 --- a/src/include/px/px_dispatchresult.h +++ b/src/include/px/px_dispatchresult.h @@ -272,6 +272,9 @@ PxDispatchResult *pxdisp_resultBegin(PxDispatchResults *results, int sliceIndex) */ PxDispatchResult *pxdisp_resultEnd(PxDispatchResults *results, int sliceIndex); +void +pxdisp_returnResults(PxDispatchResults *primaryResults, PxPgResults *cdb_pgresults); + /* * used in the 
interconnect on the dispatcher to avoid error-cleanup deadlocks. */ @@ -287,6 +290,9 @@ void pxdisp_makeDispatchResults(struct PxDispatcherState *ds, int sliceCapacity, bool cancelOnError); +void +pxdisp_clearPxPgResults(PxPgResults* px_pgresults); + /* Px adaptive scan */ extern volatile bool px_adps_dispatch_wait; extern pg_atomic_uint32 px_adps_eno; diff --git a/src/include/px/px_gang.h b/src/include/px/px_gang.h index f4a01d10945..973b27a89dc 100644 --- a/src/include/px/px_gang.h +++ b/src/include/px/px_gang.h @@ -76,6 +76,8 @@ extern List *getPxProcessesForQC(int isPrimary); extern Gang *AllocateGang(struct PxDispatcherState *ds, enum GangType type, List *segments); extern void RecycleGang(Gang *gp, bool forceDestroy); +extern struct PxWorkerDescriptor *getSegmentDescriptorFromGang(const Gang *gp, int seg); + Gang *buildGangDefinition(List *segments, SegmentType segmentType); bool build_pxid_param(char *buf, int bufsz, int identifier, int icHtabSize); From 5a2bb4ac4d91de9c6985014e14abc2a632e20fc7 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Sat, 23 Jul 2022 21:28:14 +0000 Subject: [PATCH 08/12] complete copy to statement --- src/backend/commands/copy.c | 13 ++++------ src/backend/px/dispatcher/px_disp_query.c | 29 +++++++++++++---------- src/backend/px/px_copy.c | 27 +++++++++++++-------- src/backend/tcop/postgres_px.c | 3 ++- src/include/px/px_copy.h | 2 +- 5 files changed, 40 insertions(+), 34 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 05168c0f7a2..60a0150bb29 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -1531,7 +1531,10 @@ BeginCopy(ParseState *pstate, } /* plan the query */ - plan = pg_plan_query(query, CURSOR_OPT_PX_OK, NULL); + int cursor_options = CURSOR_OPT_PARALLEL_OK; + if (px_enable_copy) + cursor_options |= CURSOR_OPT_PX_OK; + plan = pg_plan_query(query, cursor_options, NULL); /* * With row level security and a user using "COPY relation TO", we @@ -1995,16 +1998,12 @@ CopyToDispatch(CopyState cstate) { CopyStmt *stmt = glob_copystmt; TupleDesc tupDesc; - int num_phys_attrs; - int attr_count; FormData_pg_attribute *attr; PxCopy *pxCopy; uint64 processed = 0; tupDesc = cstate->rel->rd_att; attr = tupDesc->attrs; - num_phys_attrs = tupDesc->natts; - attr_count = list_length(cstate->attnumlist); /* We use fe_msgbuf as a per-row buffer regardless of copy_dest */ cstate->fe_msgbuf = makeStringInfo(); @@ -2061,10 +2060,6 @@ CopyToDispatch(CopyState cstate) */ if (cstate->need_transcoding) cstate->null_print = (char *) - // pg_server_to_custom(cstate->null_print, - // strlen(cstate->null_print), - // cstate->file_encoding, - // cstate->enc_conversion_proc); pg_server_to_any(cstate->null_print, strlen(cstate->null_print), cstate->file_encoding); diff --git a/src/backend/px/dispatcher/px_disp_query.c b/src/backend/px/dispatcher/px_disp_query.c index f4cc7da1c29..3d6d25587c9 100644 --- a/src/backend/px/dispatcher/px_disp_query.c +++ b/src/backend/px/dispatcher/px_disp_query.c @@ -264,10 +264,8 @@ pxdisp_buildUtilityQueryParms(struct Node *stmt, { char *serializedPlantree = NULL; char *serializedQueryDispatchDesc = NULL; - char *sparams; int serializedPlantree_len = 0; int serializedQueryDispatchDesc_len = 0; - int sparams_len = 0; QueryDispatchDesc *qddesc; PlannedStmt *pstmt; DispatchCommandQueryParms *pQueryParms; @@ -305,23 +303,20 @@ pxdisp_buildUtilityQueryParms(struct Node *stmt, { qddesc = makeNode(QueryDispatchDesc); qddesc->oidAssignments = oid_assignments; - // 
GetUserIdAndSecContext(&save_userid, &qddesc->secContext); serializedQueryDispatchDesc = serializeNode((Node *) qddesc, &serializedQueryDispatchDesc_len, NULL /* uncompressed_size */ ); } pQueryParms = palloc0(sizeof(*pQueryParms)); pQueryParms->strCommand = PointerIsValid(debug_query_string) ? debug_query_string : ""; - // pQueryParms->serializedQuerytree = NULL; - // pQueryParms->serializedQuerytreelen = 0; + pQueryParms->serializedQuerytree = NULL; + pQueryParms->serializedQuerytreelen = 0; pQueryParms->serializedPlantree = serializedPlantree; pQueryParms->serializedPlantreelen = serializedPlantree_len; - // pQueryParms->serializedParams = sparams; - // pQueryParms->serializedParamslen = sparams_len; pQueryParms->serializedQueryDispatchDesc = serializedQueryDispatchDesc; pQueryParms->serializedQueryDispatchDesclen = serializedQueryDispatchDesc_len; - // pQueryParms->serializedSnapshot = pxsn_get_serialized_snapshot(); - // pQueryParms->serializedSnapshotlen = pxsn_get_serialized_snapshot_size(); + pQueryParms->serializedSnapshot = pxsn_get_serialized_snapshot(); + pQueryParms->serializedSnapshotlen = pxsn_get_serialized_snapshot_size(); return pQueryParms; } @@ -1065,8 +1060,12 @@ PxDispatchCopyStart(struct PxCopy *pxCopy, Node *stmt, int flags) PxDispatcherState *ds; Gang *primaryGang; ErrorData *error = NULL; - // bool needTwoPhase = flags & DF_NEED_TWO_PHASE; - + bool needTwoPhase = flags & DF_NEED_TWO_PHASE; + + // if (needTwoPhase) + // { + px_sql_wal_lsn = polar_px_max_valid_lsn(); + // } // elogif(log_min_messages <= DEBUG5, LOG, // "PxDispatchCopyStart: %s (needTwoPhase = %s)", @@ -1094,8 +1093,12 @@ PxDispatchCopyStart(struct PxCopy *pxCopy, Node *stmt, int flags) pxdisp_dispatchToGang(ds, primaryGang, -1); // if ((flags & DF_NEED_TWO_PHASE) != 0 || isDtxExplicitBegin()) // addToGxactDtxSegments(primaryGang); - - pxdisp_waitDispatchFinish(ds); + + // /* Start a background libpq thread */ + // pxdisp_startPqThread(ds); + // /* If libpq is not run in background*/ + // if (!pxdisp_isDsThreadRuning()) + // pxdisp_waitDispatchFinish(ds); pxdisp_checkDispatchResult(ds, DISPATCH_WAIT_NONE); diff --git a/src/backend/px/px_copy.c b/src/backend/px/px_copy.c index 5bdef8fec0f..e4e8c6c3b43 100644 --- a/src/backend/px/px_copy.c +++ b/src/backend/px/px_copy.c @@ -43,7 +43,7 @@ * * Portions Copyright (c) 2005-2008, Greenplum inc * Portions Copyright (c) 2012-Present VMware, Inc. or its affiliates. 
- * + * Portions Copyright (c) 2021, Alibaba Group Holding Limited * * IDENTIFICATION * src/backend/px/px_copy.c @@ -111,19 +111,28 @@ makePxCopy(CopyState cstate, bool is_copy_in) c = palloc0(sizeof(PxCopy)); /* fresh start */ + + c->total_segs = 0; c->copy_in = is_copy_in; c->seglist = NIL; c->dispatcherState = NULL; initStringInfo(&(c->copy_out_buf)); + + if (!is_copy_in) + { + c->total_segs = 1; + c->seglist = list_make1_int(px_session_id % c->total_segs); + } + else + { + int i; - - int i; - - c->total_segs = policy->numsegments; + c->total_segs = policy->numsegments; - for (i = 0; i < c->total_segs; i++) - c->seglist = lappend_int(c->seglist, i); + for (i = 0; i < c->total_segs; i++) + c->seglist = lappend_int(c->seglist, i); + } cstate->pxCopy = c; @@ -432,7 +441,7 @@ pxCopyEndInternal(PxCopy *c, char *abort_msg, struct pollfd *pollRead; bool io_errors = false; StringInfoData io_err_msg; - // List *oidList = NIL; + List *oidList = NIL; int nest_level; SIMPLE_FAULT_INJECTOR("px_copy_end_internal_start"); @@ -560,8 +569,6 @@ pxCopyEndInternal(PxCopy *c, char *abort_msg, first_error = pxdisp_get_PXerror(res); } - // pgstat_combine_one_qe_result(&oidList, res, nest_level, q->logicalWorkerInfo.idx); - // if (q->conn->wrote_xlog) // { // MarkTopTransactionWriteXLogOnExecutor(); diff --git a/src/backend/tcop/postgres_px.c b/src/backend/tcop/postgres_px.c index 7f367b5476a..ed7cc0aa325 100644 --- a/src/backend/tcop/postgres_px.c +++ b/src/backend/tcop/postgres_px.c @@ -193,7 +193,8 @@ exec_px_query(const char *query_string, if (plan->commandType != CMD_SELECT && plan->commandType != CMD_INSERT && plan->commandType != CMD_UPDATE && - plan->commandType != CMD_DELETE) + plan->commandType != CMD_DELETE && + plan->commandType != CMD_UTILITY) elog(ERROR, "POLARPX: received non-DML Plan"); commandType = plan->commandType; diff --git a/src/include/px/px_copy.h b/src/include/px/px_copy.h index 9eaa1e48ce7..c82d9e75a9c 100644 --- a/src/include/px/px_copy.h +++ b/src/include/px/px_copy.h @@ -7,7 +7,7 @@ * * Portions Copyright (c) 2005-2008, Greenplum inc * Portions Copyright (c) 2012-Present Pivotal Software, Inc. 
- * + * Portions Copyright (c) 2021, Alibaba Group Holding Limited * * IDENTIFICATION * src/include/px/px_copy.h From 7705d735c88ce56f99760930d5fca1b4120dfa2e Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Tue, 26 Jul 2022 08:10:27 +0000 Subject: [PATCH 09/12] update global setting to limit the distributed copy usage --- src/backend/commands/copy.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 60a0150bb29..c21f9b5176f 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -1532,7 +1532,7 @@ BeginCopy(ParseState *pstate, /* plan the query */ int cursor_options = CURSOR_OPT_PARALLEL_OK; - if (px_enable_copy) + if (px_enable_copy && polar_enable_px) cursor_options |= CURSOR_OPT_PX_OK; plan = pg_plan_query(query, cursor_options, NULL); From 6872895d15d675aec8d8d939892ee57ff5ae6726 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Mon, 22 Aug 2022 09:37:48 +0000 Subject: [PATCH 10/12] error: try to connect to the different process, but the server closed the connection unexpectedly --- src/backend/commands/copy.c | 498 +++++++++++++++++++++- src/backend/px/dispatcher/px_disp_query.c | 125 +----- src/backend/px/dispatcher/px_gang.c | 3 +- src/backend/px/px_copy.c | 10 +- src/backend/px/px_hash.c | 31 ++ src/include/commands/copy.h | 44 +- src/include/px/px_disp_query.h | 4 - src/include/px/px_hash.h | 1 + 8 files changed, 582 insertions(+), 134 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index c21f9b5176f..02e93778bbc 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -186,6 +186,19 @@ static bool CopyGetInt32(CopyState cstate, int32 *val); static void CopySendInt16(CopyState cstate, int16 val); static bool CopyGetInt16(CopyState cstate, int16 *val); +static PxDistributionData *InitDistributionData(CopyState cstate, EState *estate); +static void InitCopyFromDispatchSplit(CopyState cstate, PxDistributionData *distData, EState *estate); +static unsigned int GetTargetSeg(PxDistributionData *distData, TupleTableSlot *slot); +static void SendCopyFromForwardedTuple(CopyState cstate, + PxCopy *pxCopy, + bool toAll, + int target_seg, + Relation rel, + int64 lineno, + char *line, + int line_len, + Datum *values, + bool *nulls); typedef struct { @@ -2529,6 +2542,9 @@ CopyFrom(CopyState cstate) Size bufferedTuplesSize = 0; uint64 firstBufferedLineNo = 0; + PxCopy *pxCopy = NULL; + PxDistributionData *distData = NULL; /* distribution data used to compute target seg */ + Assert(cstate->rel); /* @@ -2798,6 +2814,104 @@ CopyFrom(CopyState cstate) errcallback.arg = (void *) cstate; errcallback.previous = error_context_stack; error_context_stack = &errcallback; + +/* + * Initialize information about distribution keys, needed to compute target + * segment for each row. + */ +if (cstate->dispatch_mode == COPY_DISPATCH) +{ + distData = InitDistributionData(cstate, estate); +} + +/* Determine which fields we need to parse in the QD. */ +if (cstate->dispatch_mode == COPY_DISPATCH) + InitCopyFromDispatchSplit(cstate, distData, estate); + + +if (cstate->dispatch_mode == COPY_DISPATCH || + cstate->dispatch_mode == COPY_EXECUTOR) +{ + /* + * Now split the attnumlist into the parts that are parsed in the QD, and + * in QE. 
+ */ + ListCell *lc; + int i = 0; + List *qd_attnumlist = NIL; + List *qe_attnumlist = NIL; + int first_qe_processed_field; + + first_qe_processed_field = cstate->first_qe_processed_field; + + // foreach(lc, cstate->attnumlist) + // { + // int attnum = lfirst_int(lc); + + // if (i < first_qe_processed_field) + // qd_attnumlist = lappend_int(qd_attnumlist, attnum); + // else + // qe_attnumlist = lappend_int(qe_attnumlist, attnum); + // i++; + // } + // cstate->qd_attnumlist = qd_attnumlist; + // cstate->qe_attnumlist = qe_attnumlist; +} + +if (cstate->dispatch_mode == COPY_DISPATCH) +{ + /* + * We are the QD node, and we are receiving rows from client, or + * reading them from a file. We are not writing any data locally, + * instead, we determine the correct target segment for row, + * and forward each to the correct segment. + */ + + /* + * pre-allocate buffer for constructing a message. + */ + cstate->dispatch_msgbuf = makeStringInfo(); + enlargeStringInfo(cstate->dispatch_msgbuf, SizeOfCopyFromDispatchRow); + + /* + * prepare to COPY data into segDBs: + * - set table partitioning information + * - set append only table relevant info for dispatch. + * - get the distribution policy for this table. + * - build a COPY command to dispatch to segdbs. + * - dispatch the modified COPY command to all segment databases. + * - prepare pxhash for hashing on row values. + */ + pxCopy = makePxCopy(cstate, true); + + /* + * Dispatch the COPY command. + * + * From this point in the code we need to be extra careful about error + * handling. ereport() must not be called until the COPY command sessions + * are closed on the executors. Calling ereport() will leave the executors + * hanging in COPY state. + * + * For errors detected by the dispatcher, we save the error message in + * pxcopy_err StringInfo, move on to closing all COPY sessions on the + * executors and only then raise an error. We need to make sure to TRY/CATCH + * all other errors that may be raised from elsewhere in the backend. All + * error during COPY on the executors will be detected only when we end the + * COPY session there, so we are fine there. + */ + elog(DEBUG5, "COPY command sent to segdbs"); + + pxCopyStart(pxCopy, glob_copystmt, cstate->file_encoding); + + /* + * Skip header processing if dummy file get from master for COPY FROM ON + * SEGMENT + */ + // if (!cstate->on_segment) + // { + // SendCopyFromForwardedHeader(cstate, pxCopy); + // } +} /* POLAR: delay dml if necessary, for once */ if (polar_delay_dml_option == POLAR_DELAY_DML_ONCE) @@ -2808,7 +2922,8 @@ CopyFrom(CopyState cstate) TupleTableSlot *slot; bool skip_tuple; Oid loaded_oid = InvalidOid; - + unsigned int target_seg = 0; /* result segment of pxhash */ + /* POLAR: delay dml if necessary, for multiple tuple */ if (polar_delay_dml_option == POLAR_DELAY_DML_MULTI) polar_delay_dml_wait(); @@ -2946,6 +3061,33 @@ CopyFrom(CopyState cstate) skip_tuple = false; + /* + * Compute which segment this row belongs to. + */ + if (cstate->dispatch_mode == COPY_DISPATCH) + { + /* In QD, compute the target segment to send this row to. */ + target_seg = GetTargetSeg(distData, myslot); + } + + if (cstate->dispatch_mode == COPY_DISPATCH) + { + // bool send_to_all = distData && + // PxPolicyIsReplicated(distData->policy); + bool send_to_all = false; + /* in the QD, forward the row to the correct segment(s). */ + SendCopyFromForwardedTuple(cstate, pxCopy, send_to_all, + send_to_all ? 
0 : target_seg, + resultRelInfo->ri_RelationDesc, + cstate->cur_lineno, + cstate->line_buf.data, + cstate->line_buf.len, + myslot->tts_values, + myslot->tts_isnull); + skip_tuple = true; + processed++; + } + /* BEFORE ROW INSERT Triggers */ if (resultRelInfo->ri_TrigDesc && resultRelInfo->ri_TrigDesc->trig_insert_before_row) @@ -3249,6 +3391,19 @@ BeginCopyFrom(ParseState *pstate, cstate = BeginCopy(pstate, true, rel, NULL, InvalidOid, attnamelist, options); oldcontext = MemoryContextSwitchTo(cstate->copycontext); + + /* + * Determine the mode + */ + if (px_enable_copy && polar_enable_px) + { + if (px_role == PX_ROLE_QC && cstate->rel) + cstate->dispatch_mode = COPY_DISPATCH; + // else if (px_role == PX_ROLE_QC) + // cstate->dispatch_mode = COPY_EXECUTOR; + } + else + cstate->dispatch_mode = COPY_DIRECT; /* Initialize state variables */ cstate->reached_eof = false; @@ -3551,7 +3706,35 @@ NextCopyFrom(CopyState cstate, ExprContext *econtext, bool file_has_oids = cstate->file_has_oids; int *defmap = cstate->defmap; ExprState **defexprs = cstate->defexprs; - + + /* + * Figure out what fields we're going to process in this process. + * + * In the QD, set 'stop_processing_at_field' so that we only those + * fields that are needed in the QD. + */ + // switch (cstate->dispatch_mode) + // { + // case COPY_DIRECT: + // stop_processing_at_field = -1; + // attnumlist = cstate->attnumlist; + // break; + + // case COPY_DISPATCH: + // stop_processing_at_field = cstate->first_qe_processed_field; + // attnumlist = cstate->qd_attnumlist; + // break; + + // case COPY_EXECUTOR: + // stop_processing_at_field = -1; + // attnumlist = cstate->qe_attnumlist; + // break; + + // default: + // elog(ERROR, "unexpected COPY dispatch mode %d", cstate->dispatch_mode); + // } + + tupDesc = RelationGetDescr(cstate->rel); num_phys_attrs = tupDesc->natts; attr_count = list_length(cstate->attnumlist); @@ -5103,3 +5286,314 @@ CreateCopyDestReceiver(void) return (DestReceiver *) self; } + +static PxDistributionData * +InitDistributionData(CopyState cstate, EState *estate) +{ + PxDistributionData *distData; + PxPolicy *policy; + PxHash *pxHash; + + /* + * A non-partitioned table, or all the partitions have identical + * distribution policies. + */ + int numsegments = -1; + numsegments = pxnode_getPxNodes()->totalPxNodes + * polar_get_stmt_px_dop(); + policy = createReplicatedPolicy(numsegments); + pxHash = makePxHashForRelation(cstate->rel); + + distData = palloc(sizeof(PxDistributionData)); + distData->policy = policy; + distData->pxHash = pxHash; + + return distData; +} + +/* + * Compute which fields need to be processed in the QC, and which ones can + * be delayed to the PX. + */ +static void +InitCopyFromDispatchSplit(CopyState cstate, PxDistributionData *distData, + EState *estate) +{ + int first_qe_processed_field = 0; + Bitmapset *needed_cols = NULL; + ListCell *lc; + + if (cstate->binary) + { + foreach(lc, cstate->attnumlist) + { + AttrNumber attnum = lfirst_int(lc); + needed_cols = bms_add_member(needed_cols, attnum); + first_qe_processed_field++; + } + } + else + { + int fieldno; + /* + * We need all the columns that form the distribution key. + */ + if (distData->policy) + { + for (int i = 0; i < distData->policy->nattrs; i++) + needed_cols = bms_add_member(needed_cols, distData->policy->attrs[i]); + } + + /* Get the max fieldno that contains one of the needed attributes. 
*/ + fieldno = 0; + foreach(lc, cstate->attnumlist) + { + AttrNumber attnum = lfirst_int(lc); + + if (bms_is_member(attnum, needed_cols)) + first_qe_processed_field = fieldno + 1; + fieldno++; + } + } + + cstate->first_qe_processed_field = first_qe_processed_field; + + // if (Test_copy_qd_qe_split) + // { + // if (first_qe_processed_field == list_length(cstate->attnumlist)) + // elog(INFO, "all fields will be processed in the QD"); + // else + // elog(INFO, "first field processed in the QE: %d", first_qe_processed_field); + // } +} + +/* + * Inlined versions of appendBinaryStringInfo and enlargeStringInfo, for + * speed. + * + * NOTE: These versions don't NULL-terminate the string. We don't need + * it here. + */ +#define APPEND_MSGBUF_NOCHECK(buf, ptr, datalen) \ + do { \ + memcpy((buf)->data + (buf)->len, ptr, (datalen)); \ + (buf)->len += (datalen); \ + } while(0) + +#define APPEND_MSGBUF(buf, ptr, datalen) \ + do { \ + if ((buf)->len + (datalen) >= (buf)->maxlen) \ + enlargeStringInfo((buf), (datalen)); \ + memcpy((buf)->data + (buf)->len, ptr, (datalen)); \ + (buf)->len += (datalen); \ + } while(0) + +#define ENLARGE_MSGBUF(buf, needed) \ + do { \ + if ((buf)->len + (needed) >= (buf)->maxlen) \ + enlargeStringInfo((buf), (needed)); \ + } while(0) + +/* + * This is the sending counterpart of NextCopyFromExecute. Used in the QD, + * to send a row to a QE. + */ +static void +SendCopyFromForwardedTuple(CopyState cstate, + PxCopy *pxCopy, + bool toAll, + int target_seg, + Relation rel, + int64 lineno, + char *line, + int line_len, + Datum *values, + bool *nulls) +{ + TupleDesc tupDesc; + FormData_pg_attribute *attr; + copy_from_dispatch_row *frame; + StringInfo msgbuf; + int num_sent_fields; + AttrNumber num_phys_attrs; + int i; + + if (!OidIsValid(RelationGetRelid(rel))) + elog(ERROR, "invalid target table OID in COPY"); + + tupDesc = RelationGetDescr(rel); + attr = tupDesc->attrs; + num_phys_attrs = tupDesc->natts; + + /* + * Reset the message buffer, and reserve enough space for the header, + * the OID if any, and the residual line. + */ + msgbuf = cstate->dispatch_msgbuf; + ENLARGE_MSGBUF(msgbuf, SizeOfCopyFromDispatchRow + sizeof(Oid) + cstate->line_buf.len); + + /* the header goes to the beginning of the struct, but it will be filled in later. */ + msgbuf->len = SizeOfCopyFromDispatchRow; + + /* + * Next, any residual text that we didn't process in the QD. + */ + APPEND_MSGBUF_NOCHECK(msgbuf, cstate->line_buf.data, cstate->line_buf.len); + + /* + * Append attributes to the buffer. + */ + num_sent_fields = 0; + for (i = 0; i < num_phys_attrs; i++) + { + int16 attnum = i + 1; + + /* NULLs are simply left out of the message. */ + if (nulls[i]) + continue; + + /* + * Make sure we have room for the attribute number. While we're at it, + * also reserve room for the Datum, if it's a by-value datatype, or for + * the length field, if it's a varlena. Allocating both in one call + * saves one size-check. 
+ */ + ENLARGE_MSGBUF(msgbuf, sizeof(int16) + sizeof(Datum)); + + /* attribute number comes first */ + APPEND_MSGBUF_NOCHECK(msgbuf, &attnum, sizeof(int16)); + + if (attr[i].attbyval) + { + /* we already reserved space for this above, so we can just memcpy */ + APPEND_MSGBUF_NOCHECK(msgbuf, &values[i], sizeof(Datum)); + } + else + { + if (attr[i].attlen > 0) + { + APPEND_MSGBUF(msgbuf, DatumGetPointer(values[i]), attr[i].attlen); + } + else if (attr[i].attlen == -1) + { + int32 len; + char *ptr; + + /* For simplicity, varlen's are always transmitted in "long" format */ + Assert(!VARATT_IS_SHORT(values[i])); + len = VARSIZE(values[i]); + ptr = VARDATA(values[i]); + + /* we already reserved space for this int */ + APPEND_MSGBUF_NOCHECK(msgbuf, &len, sizeof(int32)); + APPEND_MSGBUF(msgbuf, ptr, len - VARHDRSZ); + } + else if (attr[i].attlen == -2) + { + /* + * These attrs are NULL-terminated in memory, but we send + * them length-prefixed (like the varlen case above) so that + * the receiver can preallocate a data buffer. + */ + int32 len; + size_t slen; + char *ptr; + + ptr = DatumGetPointer(values[i]); + slen = strlen(ptr); + + if (slen > PG_INT32_MAX) + { + elog(ERROR, "attribute %d is too long (%lld bytes)", + attnum, (long long) slen); + } + + len = (int32) slen; + + APPEND_MSGBUF_NOCHECK(msgbuf, &len, sizeof(int32)); + APPEND_MSGBUF(msgbuf, ptr, len); + } + else + { + elog(ERROR, "attribute %d has invalid length %d", + attnum, attr[i].attlen); + } + } + + num_sent_fields++; + } + + /* + * Fill in the header. We reserved room for this at the beginning of the + * buffer. + */ + frame = (copy_from_dispatch_row *) msgbuf->data; + frame->lineno = lineno; + frame->relid = RelationGetRelid(rel); + frame->line_len = cstate->line_buf.len; + frame->residual_off = cstate->line_buf.cursor; + frame->fld_count = num_sent_fields; + // frame->delim_seen_at_end = cstate->stopped_processing_at_delim; + + // if (toAll) + // pxCopySendDataToAll(pxCopy, msgbuf->data, msgbuf->len); + // else + pxCopySendData(pxCopy, target_seg, msgbuf->data, msgbuf->len); +} + +static unsigned int +GetTargetSeg(PxDistributionData *distData, TupleTableSlot *slot) +{ + unsigned int target_seg; + PxHash *pxHash = distData->pxHash; + PxPolicy *policy = distData->policy; /* the partitioning policy for this table */ + AttrNumber p_nattrs; /* num of attributes in the distribution policy */ + + /* + * These might be NULL, if we're called with a "main" GpDistributionData, + * for a partitioned table with heterogenous partitions. The caller + * should've used GetDistributionPolicyForPartition() to get the right + * distdata object for the partition. + */ + if (!policy) + elog(ERROR, "missing distribution policy."); + if (!pxHash) + elog(ERROR, "missing pxhash"); + + /* + * At this point in the code, baseValues[x] is final for this + * data row -- either the input data, a null or a default + * value is in there, and constraints applied. + * + * Perform a pxhash on this data row. Perform a hash operation + * on each attribute. + */ + p_nattrs = policy->nattrs; + if (p_nattrs > 0) + { + pxhashinit(pxHash); + + for (int i = 0; i < p_nattrs; i++) + { + /* current attno from the policy */ + AttrNumber h_attnum = policy->attrs[i]; + Datum d; + bool isnull; + + d = slot_getattr(slot, h_attnum, &isnull); + + pxhash(pxHash, i + 1, d, isnull); + } + + target_seg = pxhashreduce(pxHash); /* hash result segment */ + } + else + { + /* + * Randomly distributed. Pick a segment at random. 
+ */ + target_seg = pxhashrandomseg(policy->numsegments); + } + + return target_seg; +} \ No newline at end of file diff --git a/src/backend/px/dispatcher/px_disp_query.c b/src/backend/px/dispatcher/px_disp_query.c index 3d6d25587c9..b55b4e4eff5 100644 --- a/src/backend/px/dispatcher/px_disp_query.c +++ b/src/backend/px/dispatcher/px_disp_query.c @@ -1062,15 +1062,7 @@ PxDispatchCopyStart(struct PxCopy *pxCopy, Node *stmt, int flags) ErrorData *error = NULL; bool needTwoPhase = flags & DF_NEED_TWO_PHASE; - // if (needTwoPhase) - // { px_sql_wal_lsn = polar_px_max_valid_lsn(); - // } - - // elogif(log_min_messages <= DEBUG5, LOG, - // "PxDispatchCopyStart: %s (needTwoPhase = %s)", - // (PointerIsValid(debug_query_string) ? debug_query_string : "\"\""), - // (needTwoPhase ? "true" : "false")); pQueryParms = pxdisp_buildUtilityQueryParms(stmt, flags, NULL); @@ -1097,8 +1089,8 @@ PxDispatchCopyStart(struct PxCopy *pxCopy, Node *stmt, int flags) // /* Start a background libpq thread */ // pxdisp_startPqThread(ds); // /* If libpq is not run in background*/ - // if (!pxdisp_isDsThreadRuning()) - // pxdisp_waitDispatchFinish(ds); + if (!pxdisp_isDsThreadRuning()) + pxdisp_waitDispatchFinish(ds); pxdisp_checkDispatchResult(ds, DISPATCH_WAIT_NONE); @@ -1123,115 +1115,4 @@ PxDispatchCopyEnd(struct PxCopy *pxCopy) ds = pxCopy->dispatcherState; pxCopy->dispatcherState = NULL; pxdisp_destroyDispatcherState(ds); -} - - -/* - * PxDispatchUtilityStatement - * - * Dispatch an already parsed statement to all primary writer QEs, wait until - * all QEs finished successfully. If one or more QEs got error, - * throw an Error. - * - * -flags: - * Is the combination of DF_NEED_TWO_PHASE, DF_WITH_SNAPSHOT,DF_CANCEL_ON_ERROR - * - * -px_pgresults: - * Indicate whether return the pg_result for each QE connection. - * - */ -void -PxDispatchUtilityStatement(struct Node *stmt, - int flags, - List *oid_assignments, - PxPgResults *px_pgresults) -{ - DispatchCommandQueryParms *pQueryParms; - bool needTwoPhase = flags & DF_NEED_TWO_PHASE; - - // if (needTwoPhase) - // setupDtxTransaction(); - - // elogif((Debug_print_full_dtm || log_min_messages <= DEBUG5), LOG, - // "PxDispatchUtilityStatement: %s (needTwoPhase = %s)", - // (PointerIsValid(debug_query_string) ? debug_query_string : "\"\""), - // (needTwoPhase ? "true" : "false")); - - pQueryParms = pxdisp_buildUtilityQueryParms(stmt, flags, oid_assignments); - - return pxdisp_dispatchCommandInternal(pQueryParms, - flags, - pxcomponent_getPxComponentsList(), - px_pgresults); -} - -static void -pxdisp_dispatchCommandInternal(DispatchCommandQueryParms *pQueryParms, - int flags, - List *segments, - PxPgResults *px_pgresults) -{ - PxDispatcherState *ds; - Gang *primaryGang; - PxDispatchResults *pr; - ErrorData *qeError = NULL; - char *queryText; - int queryTextLength; - - /* - * Dispatch the command. - */ - ds = pxdisp_makeDispatcherState(false); - - /* - * Reader gangs use local snapshot to access catalog, as a result, it will - * not synchronize with the global snapshot from write gang which will lead - * to inconsistent visibilty of catalog table. Considering the case: - * - * select * from t, t t1; -- create a reader gang. - * begin; - * create role r1; - * set role r1; -- set command will also dispatched to idle reader gang - * - * When set role command dispatched to reader gang, reader gang cannot see - * the new tuple t1 in catalog table pg_auth. 
- * To fix this issue, we should drop the idle reader gangs after each - * utility statement which may modify the catalog table. - */ - // ds->destroyIdleReaderGang = true; - - queryText = buildPXQueryString(pQueryParms, &queryTextLength); - - /* - * Allocate a primary QE for every available segDB in the system. - */ - primaryGang = AllocateGang(ds, GANGTYPE_PRIMARY_WRITER, segments); - Assert(primaryGang); - - pxdisp_makeDispatchResults(ds, 1, flags & DF_CANCEL_ON_ERROR); - pxdisp_makeDispatchParams (ds, 1, queryText, queryTextLength); - - pxdisp_dispatchToGang(ds, primaryGang, -1); - - // if ((flags & DF_NEED_TWO_PHASE) != 0 || isDtxExplicitBegin()) - // addToGxactDtxSegments(primaryGang); - - pxdisp_waitDispatchFinish(ds); - - pxdisp_checkDispatchResult(ds, DISPATCH_WAIT_NONE); - - pr = pxdisp_getDispatchResults(ds, &qeError); - - if (qeError) - { - FlushErrorState(); - ReThrowError(qeError); - } - - /* collect pgstat from QEs for current transaction level */ - // pgstat_combine_from_qe(pr, -1); - - pxdisp_returnResults(pr, px_pgresults); - - pxdisp_destroyDispatcherState(ds); -} +} \ No newline at end of file diff --git a/src/backend/px/dispatcher/px_gang.c b/src/backend/px/dispatcher/px_gang.c index 8a2f24c273c..94584762b14 100644 --- a/src/backend/px/dispatcher/px_gang.c +++ b/src/backend/px/dispatcher/px_gang.c @@ -240,7 +240,8 @@ buildGangDefinition(List *segments, SegmentType segmentType) { workerId = lfirst_int(lc); newGangDefinition->db_descriptors[i] = - pxnode_allocateIdlePX(workerId, totalPxNodes, segmentType); + // pxnode_allocateIdlePX(workerId, totalPxNodes, segmentType); + pxnode_allocateIdlePX(RW_SEGMENT, totalPxNodes, segmentType); } } PG_CATCH(); diff --git a/src/backend/px/px_copy.c b/src/backend/px/px_copy.c index e4e8c6c3b43..bb47e9126af 100644 --- a/src/backend/px/px_copy.c +++ b/src/backend/px/px_copy.c @@ -60,9 +60,7 @@ #include "px/px_copy.h" #include "px/px_disp_query.h" #include "px/px_dispatchresult.h" -// #include "px/px_fts.h" #include "px/px_gang.h" -// #include "px/px_tm.h" #include "px/px_vars.h" #include "commands/copy.h" #include "commands/defrem.h" @@ -118,8 +116,12 @@ makePxCopy(CopyState cstate, bool is_copy_in) c->seglist = NIL; c->dispatcherState = NULL; initStringInfo(&(c->copy_out_buf)); - - if (!is_copy_in) + + /* + * COPY replicated table TO file, pick only one replica, otherwise, duplicate + * rows will be copied. + */ + if (!is_copy_in) { c->total_segs = 1; c->seglist = list_make1_int(px_session_id % c->total_segs); diff --git a/src/backend/px/px_hash.c b/src/backend/px/px_hash.c index 964f474e7d2..f84dec64cb5 100644 --- a/src/backend/px/px_hash.c +++ b/src/backend/px/px_hash.c @@ -123,6 +123,37 @@ makePxHash(int numsegs, int natts, Oid *hashfuncs) return h; } +/* + * Convenience routine, to create a PxHash according to a relation's + * distribution policy. 
+ */ +PxHash * +makePxHashForRelation(Relation rel) +{ + // PxPolicy *policy = rel->rd_pxpolicy; + PxPolicy *policy; + int numsegments = getPxWorkerCount(); + policy = createReplicatedPolicy(numsegments); + Oid *hashfuncs; + int i; + TupleDesc desc = RelationGetDescr(rel); + + hashfuncs = palloc(policy->nattrs * sizeof(Oid)); + + for (i = 0; i < policy->nattrs; i++) + { + AttrNumber attnum = policy->attrs[i]; + Oid typeoid = TupleDescAttr(desc, attnum - 1)->atttypid; + Oid opfamily; + + opfamily = get_opclass_family(policy->opclasses[i]); + + hashfuncs[i] = px_hashproc_in_opfamily(opfamily, typeoid); + } + + return makePxHash(policy->numsegments, policy->nattrs, hashfuncs); +} + /* * Initialize PxHash for hashing the next tuple values. */ diff --git a/src/include/commands/copy.h b/src/include/commands/copy.h index 70e8fa455ff..a7811daf013 100644 --- a/src/include/commands/copy.h +++ b/src/include/commands/copy.h @@ -48,6 +48,32 @@ typedef enum EolType EOL_CRNL } EolType; +/* + * + * COPY FROM modes (from file/client to table) + * + * 1. "normal", direct, mode. This means ON SEGMENT running on a segment, or + * utility mode, or non-distributed table in QD. + * 2. Dispatcher mode. We are reading from file/client, and forwarding all data to QEs, + * or vice versa. + * 3. Executor mode. We are receiving pre-processed data from QD, and inserting to table. + * + * COPY TO modes (table/query to file/client) + * + * 1. Direct. This can mean ON SEGMENT running on segment, or utility mode, or + * non-distributed table in QD. Or COPY TO running on segment. + * 2. Dispatcher mode. We are receiving pre-formatted data from segments, and forwarding + * it all to to the client. + * 3. Executor mode. Not used. + */ + +typedef enum +{ + COPY_DIRECT, + COPY_DISPATCH, + COPY_EXECUTOR +} CopyDispatchMode; + /* * This struct contains all the state variables used throughout a COPY * operation. For simplicity, we use the same struct for all variants of COPY, @@ -118,6 +144,7 @@ typedef struct CopyStateData /* * Working state for COPY TO/FROM */ + CopyDispatchMode dispatch_mode; MemoryContext copycontext; /* per-copy execution context */ /* @@ -145,6 +172,9 @@ typedef struct CopyStateData TransitionCaptureState *transition_capture; + StringInfo dispatch_msgbuf; /* used in COPY_DISPATCH mode, to construct message + * to send to QE. */ + /* * These variables are used to reduce overhead in textual COPY FROM. * @@ -184,8 +214,10 @@ typedef struct CopyStateData int raw_buf_len; /* total # of bytes stored */ + int first_qe_processed_field; /* Information on the connections to QEs. */ - PxCopy *pxCopy; + PxCopy *pxCopy; + } CopyStateData; typedef struct CopyStateData *CopyState; @@ -217,4 +249,14 @@ extern uint64 CopyFrom(CopyState cstate); extern DestReceiver *CreateCopyDestReceiver(void); +/* + * This is used to hold information about the target's distribution policy, + * during COPY FROM. 
+ */ +typedef struct PxDistributionData +{ + PxPolicy *policy; /* partitioning policy for this table */ + PxHash *pxHash; /* corresponding CdbHash object */ +} PxDistributionData; + #endif /* COPY_H */ diff --git a/src/include/px/px_disp_query.h b/src/include/px/px_disp_query.h index dee0401bb0c..66329ccb3b0 100644 --- a/src/include/px/px_disp_query.h +++ b/src/include/px/px_disp_query.h @@ -61,10 +61,6 @@ void PxDispatchPlan(struct QueryDesc *queryDesc, extern ParamListInfo deserializeParamListInfo(const char *str, int slen); -extern void PxDispatchUtilityStatement(struct Node *stmt, - int flags, - List *oid_assignments, - struct PxPgResults* cdb_pgresults); extern void PxDispatchCopyStart(struct PxCopy *pxCopy, Node *stmt, int flags); extern void PxDispatchCopyEnd(struct PxCopy *pxCopy); diff --git a/src/include/px/px_hash.h b/src/include/px/px_hash.h index 764f43684a1..a1ce88712a7 100644 --- a/src/include/px/px_hash.h +++ b/src/include/px/px_hash.h @@ -49,6 +49,7 @@ typedef struct PxHash * Create and initialize a PxHash in the current memory context. */ extern PxHash *makePxHash(int numsegs, int natts, Oid *typeoids); +extern PxHash *makePxHashForRelation(Relation rel); /* * Initialize PxHash for hashing the next tuple values. From 261b48c3f7e31de8a25ed7c512c8719553527ab9 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Tue, 23 Aug 2022 11:06:38 +0000 Subject: [PATCH 11/12] complet copy from with some bugs --- src/backend/commands/copy.c | 16 ++-- src/backend/px/dispatcher/px_dispatchresult.c | 81 ------------------- src/backend/px/dispatcher/px_gang.c | 4 +- src/backend/px/px_copy.c | 7 +- src/backend/px/px_hash.c | 3 +- src/include/px/px_dispatchresult.h | 3 - 6 files changed, 18 insertions(+), 96 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 02e93778bbc..725a365f812 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -3067,7 +3067,7 @@ if (cstate->dispatch_mode == COPY_DISPATCH) if (cstate->dispatch_mode == COPY_DISPATCH) { /* In QD, compute the target segment to send this row to. */ - target_seg = GetTargetSeg(distData, myslot); + target_seg = GetTargetSeg(distData, tuple); } if (cstate->dispatch_mode == COPY_DISPATCH) @@ -3082,8 +3082,8 @@ if (cstate->dispatch_mode == COPY_DISPATCH) cstate->cur_lineno, cstate->line_buf.data, cstate->line_buf.len, - myslot->tts_values, - myslot->tts_isnull); + values, + slot->tts_isnull); skip_tuple = true; processed++; } @@ -3232,6 +3232,10 @@ if (cstate->dispatch_mode == COPY_DISPATCH) MemoryContextSwitchTo(oldcontext); + pxCopyEnd(pxCopy, NULL, NULL); + + PxDispatchCopyEnd(pxCopy); + /* * In the old protocol, tell pqcomm that we can process normal protocol * messages again. @@ -3716,7 +3720,7 @@ NextCopyFrom(CopyState cstate, ExprContext *econtext, // switch (cstate->dispatch_mode) // { // case COPY_DIRECT: - // stop_processing_at_field = -1; + // stop_processing_at_field = -1; // attnumlist = cstate->attnumlist; // break; @@ -5449,8 +5453,8 @@ SendCopyFromForwardedTuple(CopyState cstate, int16 attnum = i + 1; /* NULLs are simply left out of the message. */ - if (nulls[i]) - continue; + // if (nulls[i]) + // continue; /* * Make sure we have room for the attribute number. 
While we're at it, diff --git a/src/backend/px/dispatcher/px_dispatchresult.c b/src/backend/px/dispatcher/px_dispatchresult.c index 9e56960e100..d7fb97839c1 100644 --- a/src/backend/px/dispatcher/px_dispatchresult.c +++ b/src/backend/px/dispatcher/px_dispatchresult.c @@ -26,9 +26,6 @@ #include "px/px_vars.h" #include "utils/faultinjector.h" -static int pxdisp_snatchPGresults(PxDispatchResult *dispatchResult, - struct pg_result **pgresultptrs, int maxresults); - static void noTrailingNewlinePQ(PQExpBuffer buf) { @@ -734,48 +731,6 @@ pxdisp_resultEnd(PxDispatchResults *results, int sliceIndex) return &results->resultArray[si->resultEnd]; } -void -pxdisp_returnResults(PxDispatchResults *primaryResults, PxPgResults *px_pgresults) -{ - PxDispatchResult *dispatchResult; - int nslots; - int nresults = 0; - int i; - - if (!primaryResults || !px_pgresults) - return; - - /* - * Allocate result set ptr array. The caller must PQclear() each PGresult - * and free() the array. - */ - nslots = 0; - - for (i = 0; i < primaryResults->resultCount; ++i) - nslots += pxdisp_numPGresult(&primaryResults->resultArray[i]); - - px_pgresults->pg_results = (struct pg_result **) palloc0(nslots * sizeof(struct pg_result *)); - - /* - * Collect results from primary gang. - */ - for (i = 0; i < primaryResults->resultCount; ++i) - { - dispatchResult = &primaryResults->resultArray[i]; - - /* - * Take ownership of this QE's PGresult object(s). - */ - nresults += pxdisp_snatchPGresults(dispatchResult, - px_pgresults->pg_results + nresults, - nslots - nresults); - } - - Assert(nresults == nslots); - - /* tell the caller how many sets we're returning. */ - px_pgresults->numResults = nresults; -} /* * used in the interconnect on the dispatcher to avoid error-cleanup deadlocks. @@ -858,39 +813,3 @@ pxdisp_clearPxPgResults(PxPgResults *px_pgresults) px_pgresults->numResults = 0; } - -/* - * Remove all of the PGresult ptrs from a PxDispatchResult object - * and place them into an array provided by the caller. The caller - * becomes responsible for PQclear()ing them. Returns the number of - * PGresult ptrs placed in the array. - */ -static int -pxdisp_snatchPGresults(PxDispatchResult *dispatchResult, - struct pg_result **pgresultptrs, int maxresults) -{ - PQExpBuffer buf = dispatchResult->resultbuf; - PGresult **begp = (PGresult **) buf->data; - PGresult **endp = (PGresult **) (buf->data + buf->len); - PGresult **p; - int nresults = 0; - - /* - * Snatch the PGresult objects. - */ - for (p = begp; p < endp; ++p) - { - Assert(*p != NULL); - Assert(nresults < maxresults); - pgresultptrs[nresults++] = *p; - *p = NULL; - } - - /* - * Empty our PGresult array. 
- */ - resetPQExpBuffer(buf); - dispatchResult->okindex = -1; - - return nresults; -} \ No newline at end of file diff --git a/src/backend/px/dispatcher/px_gang.c b/src/backend/px/dispatcher/px_gang.c index 94584762b14..0e1d4bf5241 100644 --- a/src/backend/px/dispatcher/px_gang.c +++ b/src/backend/px/dispatcher/px_gang.c @@ -240,8 +240,8 @@ buildGangDefinition(List *segments, SegmentType segmentType) { workerId = lfirst_int(lc); newGangDefinition->db_descriptors[i] = - // pxnode_allocateIdlePX(workerId, totalPxNodes, segmentType); - pxnode_allocateIdlePX(RW_SEGMENT, totalPxNodes, segmentType); + pxnode_allocateIdlePX(workerId, totalPxNodes, segmentType); + // pxnode_allocateIdlePX(RW_SEGMENT, totalPxNodes, segmentType); } } PG_CATCH(); diff --git a/src/backend/px/px_copy.c b/src/backend/px/px_copy.c index bb47e9126af..f8898e171e8 100644 --- a/src/backend/px/px_copy.c +++ b/src/backend/px/px_copy.c @@ -101,8 +101,9 @@ makePxCopy(CopyState cstate, bool is_copy_in) /* initial replicated policy*/ int numsegments = -1; - numsegments = pxnode_getPxNodes()->totalPxNodes - * polar_get_stmt_px_dop(); + // numsegments = pxnode_getPxNodes()->totalPxNodes + // * polar_get_stmt_px_dop(); + numsegments = polar_get_stmt_px_dop(); policy = createReplicatedPolicy(numsegments); Assert(policy); @@ -133,7 +134,7 @@ makePxCopy(CopyState cstate, bool is_copy_in) c->total_segs = policy->numsegments; for (i = 0; i < c->total_segs; i++) - c->seglist = lappend_int(c->seglist, i); + c->seglist = lappend_int(c->seglist, RW_SEGMENT); } cstate->pxCopy = c; diff --git a/src/backend/px/px_hash.c b/src/backend/px/px_hash.c index f84dec64cb5..fed3e10ee34 100644 --- a/src/backend/px/px_hash.c +++ b/src/backend/px/px_hash.c @@ -132,7 +132,8 @@ makePxHashForRelation(Relation rel) { // PxPolicy *policy = rel->rd_pxpolicy; PxPolicy *policy; - int numsegments = getPxWorkerCount(); + // int numsegments = getPxWorkerCount(); + int numsegments = polar_get_stmt_px_dop(); policy = createReplicatedPolicy(numsegments); Oid *hashfuncs; int i; diff --git a/src/include/px/px_dispatchresult.h b/src/include/px/px_dispatchresult.h index 39071dbf8a1..ed849889f2e 100644 --- a/src/include/px/px_dispatchresult.h +++ b/src/include/px/px_dispatchresult.h @@ -272,9 +272,6 @@ PxDispatchResult *pxdisp_resultBegin(PxDispatchResults *results, int sliceIndex) */ PxDispatchResult *pxdisp_resultEnd(PxDispatchResults *results, int sliceIndex); -void -pxdisp_returnResults(PxDispatchResults *primaryResults, PxPgResults *cdb_pgresults); - /* * used in the interconnect on the dispatcher to avoid error-cleanup deadlocks. */ From 3a5e32f273fa697aa63d2d6897b2f1b26ac607d3 Mon Sep 17 00:00:00 2001 From: Weijun-H Date: Mon, 17 Apr 2023 19:16:04 +0800 Subject: [PATCH 12/12] meet the following error when I tested the copy to, server closed the connection unexpectedly This probably means the server terminated abnormally before or while processing the request. --- src/backend/commands/copy.c | 90 +++++++++++++++++++++++++++++++++++-- 1 file changed, 86 insertions(+), 4 deletions(-) diff --git a/src/backend/commands/copy.c b/src/backend/commands/copy.c index 725a365f812..4b9043a36c7 100644 --- a/src/backend/commands/copy.c +++ b/src/backend/commands/copy.c @@ -200,6 +200,68 @@ static void SendCopyFromForwardedTuple(CopyState cstate, Datum *values, bool *nulls); +/* ========================================================================== + * The following macros aid in major refactoring of data processing code (in + * CopyFrom(+Dispatch)). 
We use macros because in some cases the code must be in + * line in order to work (for example elog_dismiss() in PG_CATCH) while in + * other cases we'd like to inline the code for performance reasons. + * + * NOTE that an almost identical set of macros exists in fileam.c. If you make + * changes here you may want to consider taking a look there as well. + * ========================================================================== + */ + +#define RESET_LINEBUF \ +cstate->line_buf.len = 0; \ +cstate->line_buf.data[0] = '\0'; \ +cstate->line_buf.cursor = 0; + +#define RESET_ATTRBUF \ +cstate->attribute_buf.len = 0; \ +cstate->attribute_buf.data[0] = '\0'; \ +cstate->attribute_buf.cursor = 0; + +#define RESET_LINEBUF_WITH_LINENO \ +line_buf_with_lineno.len = 0; \ +line_buf_with_lineno.data[0] = '\0'; \ +line_buf_with_lineno.cursor = 0; + +/* + * When doing a COPY FROM through the dispatcher, the QD reads the input from + * the input file (or stdin or program), and forwards the data to the QE nodes, + * where they will actually be inserted. + * + * Ideally, the QD would just pass through each line to the QE as is, and let + * the QEs to do all the processing. Because the more processing the QD has + * to do, the more likely it is to become a bottleneck. + * + * However, the QD needs to figure out which QE to send each row to. For that, + * it needs to at least parse the distribution key. The distribution key might + * also be a DEFAULTed column, in which case the DEFAULT value needs to be + * evaluated in the QD. In that case, the QD must send the computed value + * to the QE - we cannot assume that the QE can re-evaluate the expression and + * arrive at the same value, at least not if the DEFAULT expression is volatile. + * + * Therefore, we need a flexible format between the QD and QE, where the QD + * processes just enough of each input line to figure out where to send it. + * It must send the values it had to parse and evaluate to the QE, as well + * as the rest of the original input line, so that the QE can parse the rest + * of it. + * + * The 'copy_from_dispatch_*' structs are used in the QD->QE stream. For each + * input line, the QD constructs a 'copy_from_dispatch_row' struct, and sends + * it to the QE. Before any rows, a QDtoQESignature is sent first, followed by + * a 'copy_from_dispatch_header'. When QD encounters a recoverable error that + * needs to be logged in the error log (LOG ERRORS SEGMENT REJECT LIMIT), it + * sends the erroneous raw to a QE, in a 'copy_from_dispatch_error' struct. + * + * + * COPY TO is simpler: The QEs form the output rows in the final form, and the QD + * just collects and forwards them to the client. The QD doesn't need to parse + * the rows at all. + */ +static const char QDtoQESignature[] = "PGCOPY-QD-TO-QE\n\377\r\n"; + typedef struct { /* @@ -3234,7 +3296,7 @@ if (cstate->dispatch_mode == COPY_DISPATCH) pxCopyEnd(pxCopy, NULL, NULL); - PxDispatchCopyEnd(pxCopy); + // PxDispatchCopyEnd(pxCopy); /* * In the old protocol, tell pqcomm that we can process normal protocol @@ -3403,8 +3465,8 @@ BeginCopyFrom(ParseState *pstate, { if (px_role == PX_ROLE_QC && cstate->rel) cstate->dispatch_mode = COPY_DISPATCH; - // else if (px_role == PX_ROLE_QC) - // cstate->dispatch_mode = COPY_EXECUTOR; + else if (px_role != PX_ROLE_QC) + cstate->dispatch_mode = COPY_EXECUTOR; } else cstate->dispatch_mode = COPY_DIRECT; @@ -3570,7 +3632,27 @@ BeginCopyFrom(ParseState *pstate, /* must rely on user to tell us... 
*/ cstate->file_has_oids = cstate->oids; } - else + else if (cstate->dispatch_mode == COPY_EXECUTOR && cstate->copy_dest != COPY_CALLBACK) + { + /* Read special header from QD */ + static const size_t sigsize = sizeof(QDtoQESignature); + char readSig[sizeof(QDtoQESignature)]; + copy_from_dispatch_header header_frame; + + if (CopyGetData(cstate, &readSig, sigsize, sigsize) != sigsize || + memcmp(readSig, QDtoQESignature, sigsize) != 0) + ereport(ERROR, + (errcode(ERRCODE_INTERNAL_ERROR), + errmsg("QC->PX COPY communication signature not recognized"))); + + if (CopyGetData(cstate, &header_frame, sizeof(header_frame), sizeof(header_frame)) != sizeof(header_frame)) + ereport(ERROR, + (errcode(ERRCODE_BAD_COPY_FILE_FORMAT), + errmsg("invalid QC->PX COPY communication header"))); + + cstate->first_qe_processed_field = header_frame.first_qe_processed_field; + } + else if (!cstate->binary) { /* Read and verify binary header */ char readSig[11];
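
Note on the QC->PX handshake above: the PX-side reader expects the QDtoQESignature bytes followed by a fixed-size copy_from_dispatch_header before any row frames arrive. The QC-side sender is not part of this hunk (SendCopyFromForwardedHeader stays commented out in CopyFrom), so the following is only a sketch of what such a sender could look like; it assumes pxCopySendDataToAll() broadcasts raw bytes to every PX in the PxCopy seglist and that copy_from_dispatch_header carries nothing beyond first_qe_processed_field.

/*
 * Sketch only: QC-side counterpart of the PX-side handshake read in
 * BeginCopyFrom(). The real header layout and sender may differ.
 */
static void
SendCopyFromForwardedHeaderSketch(CopyState cstate, PxCopy *pxCopy)
{
	copy_from_dispatch_header header_frame;

	/* Signature first, so a PX can reject a mismatched peer early. */
	pxCopySendDataToAll(pxCopy, (char *) QDtoQESignature,
						sizeof(QDtoQESignature));

	/* Then the fixed-size header that the PX reads before any row frames. */
	memset(&header_frame, 0, sizeof(header_frame));
	header_frame.first_qe_processed_field = cstate->first_qe_processed_field;
	pxCopySendDataToAll(pxCopy, (char *) &header_frame, sizeof(header_frame));
}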
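
The row frames themselves, as built by SendCopyFromForwardedTuple() in this patch, put an int16 attribute number in front of each non-NULL value, followed by the raw Datum for by-value types, the fixed attlen payload for fixed-length by-reference types, or an int32 length plus payload for varlena and cstring attributes. The PX-side decoder is not included in this section; the sketch below shows how one by-value or varlena attribute could be pulled back out of such a frame, assuming the frame body sits in a StringInfo with the cursor positioned at the next attribute (ReadForwardedAttributeSketch is an illustrative name, not a function from the patch).

/*
 * Sketch only: decode one forwarded attribute written by
 * SendCopyFromForwardedTuple(). Fixed-length by-reference and cstring
 * attributes are omitted for brevity.
 */
static int16
ReadForwardedAttributeSketch(StringInfo buf, Form_pg_attribute att,
							 Datum *value)
{
	int16		attnum;

	/* The attribute number always comes first. */
	memcpy(&attnum, buf->data + buf->cursor, sizeof(int16));
	buf->cursor += sizeof(int16);

	if (att->attbyval)
	{
		/* By-value types travel as a raw Datum. */
		memcpy(value, buf->data + buf->cursor, sizeof(Datum));
		buf->cursor += sizeof(Datum);
	}
	else if (att->attlen == -1)
	{
		/* Varlena: int32 full VARSIZE, then the payload without the header. */
		int32		len;
		struct varlena *v;

		memcpy(&len, buf->data + buf->cursor, sizeof(int32));
		buf->cursor += sizeof(int32);

		v = (struct varlena *) palloc(len);
		SET_VARSIZE(v, len);
		memcpy(VARDATA(v), buf->data + buf->cursor, len - VARHDRSZ);
		buf->cursor += len - VARHDRSZ;
		*value = PointerGetDatum(v);
	}

	return attnum;
}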