Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add functions to substrait extension #1

Draft
wants to merge 26 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
05a5363
pointed submodules to my own repositories
drin Feb 24, 2024
bfffd8d
switched duckdb submodule to coop-decomp branch
drin Feb 24, 2024
be6ee3f
switched substrait submodule to mohair branch
drin Feb 24, 2024
f7e17d9
set branches of duckdb and substrait submodules
drin Feb 24, 2024
62c207f
task: reorganized substrait code
drin Mar 6, 2024
31210d6
task: added code for transpilation of substrait
drin Mar 6, 2024
c022dc5
task: updated CMakeLists.txt to build new sources
drin Mar 6, 2024
e141f4c
minor: mysterious style changes
drin Mar 11, 2024
0b2aa67
task: added methods to DuckDBEnginePlan
drin Mar 11, 2024
d1b5b00
task: updates to `TranspilePlanMessage`
drin Mar 11, 2024
7582f9a
task: updated binding for "transpile_mohair"
drin Mar 11, 2024
3d03e5c
task: dropped transpile sources from build
drin Mar 11, 2024
a64b4f9
minor: updated comment that was copy-pasta
drin Mar 11, 2024
dac982a
task: Removed unnecessary code
drin Mar 13, 2024
1d0e8a1
task: removed unused function
drin Mar 13, 2024
53cc483
task: updated transpilation and execution
drin Mar 13, 2024
fc09368
minor: updated cmake version to my current
drin Mar 13, 2024
014e730
minor: removed tabs
drin Mar 13, 2024
42f9e3b
minor: removed unused code
drin Mar 22, 2024
e4bb0fe
task: purely aesthetic refactoring
drin Mar 22, 2024
87c7be9
task: moved engine specific code to engine_duckdb
drin Mar 22, 2024
5e41f34
task: moved methods into DuckDBTranslator
drin Mar 22, 2024
772c698
task: updated new TableFunctions
drin Mar 22, 2024
db21341
task: update duckdb to v0.10.1
drin Mar 27, 2024
02cd80a
task: updating duckdb dependency
drin Apr 3, 2024
27cadc7
minor: downgraded cmake minimum version
drin Apr 23, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
6 changes: 4 additions & 2 deletions .gitmodules
@@ -1,9 +1,11 @@
[submodule "duckdb"]
path = duckdb
url = https://github.com/duckdb/duckdb
url = https://github.com/drin/duckdb
branch = coop-decomp
[submodule "duckdb-r"]
path = duckdb-r
url = https://github.com/duckdb/duckdb-r
[submodule "substrait"]
path = substrait
url = https://github.com/substrait-io/substrait
url = https://github.com/drin/substrait
branch = mohair
21 changes: 18 additions & 3 deletions CMakeLists.txt
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 2.8.12)
cmake_minimum_required(VERSION 3.25.1)

# Set extension name here
set(TARGET_NAME substrait)
Expand Down Expand Up @@ -94,12 +94,27 @@ set(SUBSTRAIT_SOURCES
third_party/substrait/substrait/type_expressions.pb.cc
third_party/substrait/substrait/extensions/extensions.pb.cc)

set(EXTENSION_SOURCES
src/to_substrait.cpp
# custom sources for mohair integration
set(MOHAIR_SOURCES
src/plans.cpp
src/engine_duckdb.cpp
src/translation/duckdb_expressions.cpp
src/translation/duckdb_operators.cpp)
#src/transpilation/duckdb_expressions.cpp
#src/transpilation/duckdb_operators.cpp)

# official sources for substrait integration
set(SUBSTRAIT_EXT_SOURCES
src/from_substrait.cpp
src/to_substrait.cpp)

# primary sources first, then others
set(EXTENSION_SOURCES
src/substrait_extension.cpp
src/custom_extensions.cpp
src/custom_extensions_generated.cpp
${MOHAIR_SOURCES}
${SUBSTRAIT_EXT_SOURCES}
${SUBSTRAIT_SOURCES}
${PROTOBUF_SOURCES})

Expand Down
2 changes: 1 addition & 1 deletion duckdb
Submodule duckdb updated 1285 files
190 changes: 190 additions & 0 deletions src/engine_duckdb.cpp
@@ -0,0 +1,190 @@
// ------------------------------
// License
//
// Copyright 2024 Aldrin Montana
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.


// ------------------------------
// Dependencies

#include "engine_duckdb.hpp"


// ------------------------------
// Functions

// >> DuckDB-specific function renaming and validation

namespace duckdb {

// >> Static data and related functions for mapping functions from substrait -> duckdb

// Rename table consulted by RemapFunctionName: each entry pairs a substrait
// function name (left) with the duckdb name to use in its place (right).
// Names that do not appear here are used unchanged.
static FunctionRenameMap engine_remapped_functions {
  { "modulus",     "mod"       },
  { "std_dev",     "stddev"    },
  { "starts_with", "prefix"    },
  { "ends_with",   "suffix"    },
  { "substring",   "substr"    },
  { "char_length", "length"    },
  { "is_nan",      "isnan"     },
  { "is_finite",   "isfinite"  },
  { "is_infinite", "isinf"     },
  { "like",        "~~"        },
  { "extract",     "date_part" },
};

// Returns the part of `function_name` that precedes its first ':' character.
// When the name contains no ':', the whole name is returned; when it starts
// with ':', the result is empty. The argument itself is not modified.
string RemoveExtension(string &function_name) {
  // find() yields npos when there is no ':', and substr(0, npos) is the full string
  const auto separator_pos = function_name.find(':');
  return function_name.substr(0, separator_pos);
}

string RemapFunctionName(string &function_name) {
string name { RemoveExtension(function_name) };

auto it = engine_remapped_functions.find(name);
if (it != engine_remapped_functions.end()) { name = it->second; }

return name;
}


// >> Static data and related functions for extraction of date subfields

// Date subfield names accepted by AssertValidDateSubfield (compared without
// regard to case, per the set type).
//
// Every name from the original list is kept. The correctly spelled
// "millennium" and the missing singular/plural counterparts ("millisecond",
// "microseconds") are added so that a subfield is not rejected merely because
// of which spelling the producer of the plan happened to use.
static case_insensitive_set_t engine_date_subfields {
   "year"        , "month"       , "day"
  ,"decade"      , "century"
  ,"millenium"   , "millennium"
  ,"quarter"
  ,"microsecond" , "microseconds"
  ,"millisecond" , "milliseconds"
  ,"second"      , "minute"      , "hour"
};

// Checks, through D_ASSERT, that `subfield` is one of the names listed in
// engine_date_subfields. Nothing is returned; the function exists only for
// the assertion.
void AssertValidDateSubfield(const string& subfield) {
  D_ASSERT(engine_date_subfields.find(subfield) != engine_date_subfields.end());
}

} // namespace: duckdb


namespace duckdb {

//! Constructor for DuckDBTranslator.
//! Keeps a reference to the client context `ctxt`, opens a Connection of its
//! own on that context's database (`t_conn`), and allocates a fresh
//! mohair::SubstraitFunctionMap (`functions_map`).
DuckDBTranslator::DuckDBTranslator(ClientContext &ctxt): context(ctxt) {
  t_conn        = make_uniq<Connection>(*ctxt.db);
  functions_map = make_uniq<mohair::SubstraitFunctionMap>();

  // Fetch the HTTP state tied to the new connection's context and reset it.
  // NOTE(review): the purpose of this reset is not documented by the original
  // author -- confirm it is still required.
  HTTPState::TryGetState(*(t_conn->context))->Reset();
}

// >> Entry points for substrait plan (json or binary) -> duckdb logical plan
// Turns the Relation stored in `sys_plan->engine` into a LogicalOperator tree.
// Returns a DuckLogicalPlan built from `sys_plan->substrait` and the new tree.
shared_ptr<DuckLogicalPlan>
DuckDBTranslator::TranspilePlanMessage(shared_ptr<DuckSystemPlan> sys_plan) {
  shared_ptr<Relation> relation = sys_plan->engine;

  // The Planner consumes SQL statements, so the relation's QueryNode is placed
  // inside a SelectStatement first
  auto select_stmt  = make_uniq<SelectStatement>();
  select_stmt->node = relation->GetQueryNode();

  // SQLStatement -> LogicalOperator; the planner leaves its output in `plan`
  Planner query_planner { context };
  query_planner.CreatePlan(std::move(select_stmt));

  // Move the planner's output into a shared_ptr for the returned wrapper
  shared_ptr<LogicalOperator> planned_ops { std::move(query_planner.plan) };
  return make_shared<DuckLogicalPlan>(sys_plan->substrait, planned_ops);
}

// Produces a physical plan from the logical plan held in `engine_plan->engine`.
// The logical plan is copied first, so the plan owned by `engine_plan` is not
// the one handed to the optimizer or the physical plan generator.
// When `optimize` is true the copy is run through the Optimizer beforehand.
// Returns a DuckPhysicalPlan built from `engine_plan->substrait` and the
// generated physical operator tree.
shared_ptr<DuckPhysicalPlan>
DuckDBTranslator::TranslateLogicalPlan( shared_ptr<DuckLogicalPlan> engine_plan
                                       ,bool optimize) {
  // Work on a private copy; later steps take ownership of it via std::move
  auto plan_copy = engine_plan->engine->Copy(context);

  if (optimize) {
    // The Optimizer needs a Binder; one is created just for this pass
    shared_ptr<Binder> opt_binder { Binder::CreateBinder(context) };
    Optimizer plan_optimizer { *opt_binder, context };

    plan_copy = plan_optimizer.Optimize(std::move(plan_copy));
  }

  // Logical -> physical
  PhysicalPlanGenerator generator { context };
  shared_ptr<PhysicalOperator> physical_ops {
    generator.CreatePlan(std::move(plan_copy))
  };

  return make_shared<DuckPhysicalPlan>(engine_plan->substrait, physical_ops);
}

// Decides whether an execution loop should carry on after receiving
// `exec_result`.
//   - RESULT_NOT_READY / RESULT_READY: continue, nothing is printed
//   - BLOCKED / NO_TASKS_AVAILABLE:    continue, a status line goes to stdout
//   - EXECUTION_ERROR / anything else: stop, a message goes to stderr
// Returns true to continue and false to stop.
bool ShouldKeepExecuting(PendingExecutionResult& exec_result) {
  if (   exec_result == PendingExecutionResult::RESULT_NOT_READY
      || exec_result == PendingExecutionResult::RESULT_READY) {
    return true;
  }

  if (exec_result == PendingExecutionResult::BLOCKED) {
    std::cout << "\t[Executor]: blocked" << std::endl;
    return true;
  }

  if (exec_result == PendingExecutionResult::NO_TASKS_AVAILABLE) {
    std::cout << "\t[Executor]: waiting for tasks" << std::endl;
    return true;
  }

  if (exec_result == PendingExecutionResult::EXECUTION_ERROR) {
    std::cerr << "\t[Executor]: execution error" << std::endl;
    return false;
  }

  // Any value not handled above is treated as a reason to stop
  std::cerr << "\t[Executor]: unknown execution result type" << std::endl;
  return false;
}

// Executes the plan described by `plan_data` and returns its result.
//
// A new Executor is created on `context` and initialized with the result
// collector obtained for (`context`, `plan_data`). Tasks are then executed one
// at a time until the executor reports RESULT_READY, or until
// ShouldKeepExecuting says to stop, in which case the executor's error
// message is written to stderr.
//
// Returns the executor's result when execution ended with RESULT_READY and
// the executor has a result collector; returns nullptr otherwise.
unique_ptr<QueryResult> DuckDBExecutor::Execute() {
  constexpr bool dry_run { false };
  Executor plan_executor { context };

  plan_executor.Initialize(
    PhysicalResultCollector::GetResultCollector(context, plan_data)
  );

  auto exec_result = plan_executor.ExecuteTask(dry_run);
  while (exec_result != PendingExecutionResult::RESULT_READY) {
    if (!ShouldKeepExecuting(exec_result)) {
      std::cerr << "\t\t" << plan_executor.GetError().Message() << std::endl;
      break;
    }

    exec_result = plan_executor.ExecuteTask(dry_run);
  }

  const bool has_result = (
       exec_result == PendingExecutionResult::RESULT_READY
    && plan_executor.HasResultCollector()
  );

  // GetResult() already yields a temporary, so it is returned directly;
  // wrapping it in std::move would only get in the way of copy elision.
  if (has_result) { return plan_executor.GetResult(); }

  return nullptr;
}

} // namespace: duckdb