Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,7 @@ add_library(flapi-lib STATIC
src/auth_middleware.cpp
src/cache_manager.cpp
src/database_manager_cache_adapter.cpp
src/duckdb_embed_fs.cpp
src/config_loader.cpp
src/endpoint_repository.cpp
src/config_validator.cpp
Expand Down
255 changes: 255 additions & 0 deletions src/duckdb_embed_fs.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,255 @@
#include "duckdb_embed_fs.hpp"

#include "database_manager.hpp"
#include "vfs_adapter.hpp"

#include "duckdb.hpp"
#include "duckdb/common/file_system.hpp"
#include "duckdb/common/helper.hpp"
#include "duckdb/main/capi/capi_internal.hpp"

#include <algorithm>
#include <cstring>
#include <regex>
#include <stdexcept>
#include <string>
#include <utility>

namespace flapi {

namespace {

// File handle holding a non-owning pointer to bytes in ArchiveEntries
// plus a streaming position. The owning shared_ptr lives in the
// EmbeddedFileSystem instance, so the data is valid as long as the
// filesystem outlives all its handles -- guaranteed for DuckDB
// sub-systems registered with RegisterSubSystem.
class EmbeddedFileHandle : public ::duckdb::FileHandle {
public:
    // `data` is borrowed, not owned: the handle only stores the pointer it
    // is given. The read cursor starts at offset 0.
    EmbeddedFileHandle(EmbeddedFileSystem& fs,
                       std::string path,
                       ::duckdb::FileOpenFlags flags,
                       const std::vector<std::uint8_t>* data)
        : ::duckdb::FileHandle(fs, std::move(path), flags), data_(data) {}

    // Nothing to release: the handle owns no resource of its own.
    void Close() override {}

    // Current cursor offset, in bytes from the start of the entry.
    ::duckdb::idx_t position() const { return position_; }

    // Overwrites the cursor; callers are responsible for bounds checking.
    void set_position(::duckdb::idx_t new_position) { position_ = new_position; }

    // Borrowed pointer to the entry's bytes (may be null).
    const std::vector<std::uint8_t>* data() const { return data_; }

private:
    const std::vector<std::uint8_t>* data_;
    ::duckdb::idx_t position_{0};
};

// Converts a generic DuckDB handle reference into the concrete handle type
// defined in this file, via FileHandle::Cast.
EmbeddedFileHandle& AsEmbedded(::duckdb::FileHandle& h) {
    auto& embedded = h.Cast<EmbeddedFileHandle>();
    return embedded;
}

// True when `path` begins with the kEmbedScheme prefix.
bool HasEmbedScheme(const std::string& path) {
    // rfind with a start index of 0 can only report a match located at the
    // very beginning of the string, which makes it a prefix test.
    return path.rfind(kEmbedScheme, 0) == 0;
}

// Returns `path` without its leading kEmbedScheme prefix. A path that does
// not carry the prefix is returned unchanged.
std::string StripEmbedScheme(const std::string& path) {
    if (!HasEmbedScheme(path)) {
        return path;
    }
    return path.substr(std::strlen(kEmbedScheme));
}

// True when `s` contains at least one of the characters '*', '?' or '['.
bool ContainsGlobMeta(const std::string& s) {
    return s.find_first_of("*?[") != std::string::npos;
}

// Translates a simple glob into a std::regex:
//   '*'  -> any run of characters other than '/'
//   '?'  -> exactly one character other than '/'
// Every other character matches itself; characters that are special in a
// regex are backslash-escaped so they stay literal.
std::regex GlobToRegex(const std::string& pattern) {
    static const std::string kRegexSpecials = ".+^$()[]{}|\\";

    std::string translated;
    translated.reserve(pattern.size() * 2);
    for (const char ch : pattern) {
        if (ch == '*') {
            translated += "[^/]*";
        } else if (ch == '?') {
            translated += "[^/]";
        } else {
            if (kRegexSpecials.find(ch) != std::string::npos) {
                translated += '\\';
            }
            translated += ch;
        }
    }
    return std::regex(translated);
}

} // namespace

// Takes shared ownership of the archive contents.
// Throws std::invalid_argument when `entries` is null.
EmbeddedFileSystem::EmbeddedFileSystem(std::shared_ptr<const ArchiveEntries> entries)
    : _entries(std::move(entries)) {
    const bool have_entries = static_cast<bool>(_entries);
    if (have_entries) {
        return;
    }
    throw std::invalid_argument(
        "EmbeddedFileSystem requires non-null ArchiveEntries");
}

// This filesystem claims exactly the paths that carry the embed:// prefix.
bool EmbeddedFileSystem::CanHandleFile(const std::string& fpath) {
    const bool is_embed_path = HasEmbedScheme(fpath);
    return is_embed_path;
}

// Opens the archive entry named by `path` (the embed:// prefix is removed
// before the lookup) and returns a handle that borrows the entry's bytes.
//
// path:   entry path, with or without the embed:// prefix.
// flags:  forwarded to the handle. When the caller asked for "null if not
//         exists" semantics, a missing entry yields nullptr instead of an
//         exception, matching what callers of FileSystem::OpenFile expect
//         when they set that flag.
// opener: unused.
//
// Throws ::duckdb::IOException when the entry is missing and the
// null-if-not-exists flag is not set.
::duckdb::unique_ptr<::duckdb::FileHandle> EmbeddedFileSystem::OpenFile(
    const std::string& path,
    ::duckdb::FileOpenFlags flags,
    ::duckdb::optional_ptr<::duckdb::FileOpener> /*opener*/) {
    const std::string key = StripEmbedScheme(path);
    const auto it = _entries->find(key);
    if (it == _entries->end()) {
        if (flags.ReturnNullIfNotExists()) {
            return nullptr;
        }
        throw ::duckdb::IOException("embed://: no such entry: " + path);
    }
    return ::duckdb::make_uniq<EmbeddedFileHandle>(*this, path, flags, &it->second);
}

// Streaming read: copies up to `nr_bytes` bytes from the handle's current
// position into `buffer` and advances the position by the amount copied.
//
// Returns the number of bytes copied; 0 when the handle has no data, when
// the position is at or past the end of the entry, or when `nr_bytes` is 0.
// Throws ::duckdb::IOException when `nr_bytes` is negative (the positional
// overload rejects negative sizes the same way).
int64_t EmbeddedFileSystem::Read(::duckdb::FileHandle& handle, void* buffer, int64_t nr_bytes) {
    auto& h = AsEmbedded(handle);
    if (h.data() == nullptr) {
        return 0;
    }
    // A negative count would survive std::min below and turn into an
    // enormous size_t in memcpy, so reject it before touching the buffer.
    if (nr_bytes < 0) {
        throw ::duckdb::IOException(
            "embed://: negative read size for " + h.GetPath());
    }
    const auto& bytes = *h.data();
    const ::duckdb::idx_t pos = h.position();
    if (pos >= bytes.size()) {
        return 0;
    }
    const ::duckdb::idx_t remaining = bytes.size() - pos;
    const int64_t to_copy =
        std::min<int64_t>(nr_bytes, static_cast<int64_t>(remaining));
    std::memcpy(buffer, bytes.data() + pos, static_cast<std::size_t>(to_copy));
    h.set_position(pos + static_cast<::duckdb::idx_t>(to_copy));
    return to_copy;
}

// Positional read: copies exactly `nr_bytes` bytes starting at `location`
// into `buffer`. The handle's own position is left untouched.
// Throws ::duckdb::IOException when the handle has no data or when the
// requested range does not lie entirely inside the entry.
void EmbeddedFileSystem::Read(::duckdb::FileHandle& handle, void* buffer,
                              int64_t nr_bytes, ::duckdb::idx_t location) {
    auto& embedded = AsEmbedded(handle);
    const auto* contents = embedded.data();
    if (contents == nullptr) {
        throw ::duckdb::IOException("embed://: handle has no data");
    }

    // Checks run left to right, so the addition is only evaluated once the
    // start offset and the byte count are individually known to be sane.
    const bool in_bounds =
        location <= contents->size() &&
        nr_bytes >= 0 &&
        location + static_cast<::duckdb::idx_t>(nr_bytes) <= contents->size();
    if (!in_bounds) {
        throw ::duckdb::IOException(
            "embed://: positional read out of bounds for " + embedded.GetPath());
    }

    std::memcpy(buffer, contents->data() + location, static_cast<std::size_t>(nr_bytes));
    // No set_position() call here: a positional read does not move the cursor.
}

// Returns the byte length of the entry behind `handle`, or -1 when the
// handle carries no data pointer.
int64_t EmbeddedFileSystem::GetFileSize(::duckdb::FileHandle& handle) {
    const auto* contents = AsEmbedded(handle).data();
    return contents != nullptr ? static_cast<int64_t>(contents->size()) : -1;
}

// Moves the handle's cursor to `location`. Seeking exactly to the end of
// the entry is allowed; anything beyond it is rejected.
// Throws ::duckdb::IOException when the handle has no data or when
// `location` lies past the end of the entry.
void EmbeddedFileSystem::Seek(::duckdb::FileHandle& handle, ::duckdb::idx_t location) {
    auto& embedded = AsEmbedded(handle);
    const auto* contents = embedded.data();
    if (contents == nullptr) {
        throw ::duckdb::IOException("embed://: handle has no data");
    }
    const bool past_end = location > contents->size();
    if (past_end) {
        throw ::duckdb::IOException(
            "embed://: seek beyond EOF for " + embedded.GetPath());
    }
    embedded.set_position(location);
}

// Reports the handle's current cursor offset.
::duckdb::idx_t EmbeddedFileSystem::SeekPosition(::duckdb::FileHandle& handle) {
    const auto& embedded = AsEmbedded(handle);
    return embedded.position();
}

// Always false: handles created by this filesystem read from a byte vector
// held in memory, never from a file on disk.
bool EmbeddedFileSystem::OnDiskFile(::duckdb::FileHandle& /*handle*/) {
    constexpr bool kBackedByDisk = false;
    return kBackedByDisk;
}

// True when the archive holds an entry whose name equals `filename` with
// any embed:// prefix removed. The opener argument is unused.
bool EmbeddedFileSystem::FileExists(const std::string& filename,
                                    ::duckdb::optional_ptr<::duckdb::FileOpener> /*opener*/) {
    const auto match = _entries->find(StripEmbedScheme(filename));
    const bool found = match != _entries->end();
    return found;
}

// Expands `path` against the archive's entry names.
//
// - Without any glob character, the result is `path` itself when the entry
//   exists and empty otherwise.
// - With glob characters, only embed:// paths are expanded (anything else
//   yields an empty result). Matching entries are returned with the
//   embed:// prefix re-attached, in sorted order.
//
// The opener argument is unused.
::duckdb::vector<::duckdb::OpenFileInfo> EmbeddedFileSystem::Glob(
    const std::string& path,
    ::duckdb::FileOpener* /*opener*/) {
    ::duckdb::vector<::duckdb::OpenFileInfo> matches;

    // Literal path: a plain existence check is all that is needed.
    if (!ContainsGlobMeta(path)) {
        if (_entries->find(StripEmbedScheme(path)) != _entries->end()) {
            matches.emplace_back(path);
        }
        return matches;
    }

    // Patterns are only meaningful inside the embed:// scheme.
    if (!HasEmbedScheme(path)) {
        return matches;
    }

    const std::regex matcher = GlobToRegex(StripEmbedScheme(path));
    const std::string scheme(kEmbedScheme);

    for (auto it = _entries->begin(); it != _entries->end(); ++it) {
        const auto& entry_name = it->first;
        if (std::regex_match(entry_name, matcher)) {
            matches.emplace_back(scheme + entry_name);
        }
    }

    std::sort(matches.begin(), matches.end());
    return matches;
}

bool RegisterEmbeddedFileSystem() {
auto entries = FileProviderFactory::GetBundleContents();
if (entries == nullptr) {
return false;
}

auto dbm = DatabaseManager::getInstance();
if (!dbm || !dbm->isInitialized()) {
return false;
}

auto conn = dbm->getConnection();
if (conn == nullptr) {
return false;
}

auto* conn_wrapper = reinterpret_cast<::duckdb::Connection*>(conn);
if (conn_wrapper == nullptr || conn_wrapper->context == nullptr) {
duckdb_disconnect(&conn);
return false;
}

auto& fs = ::duckdb::FileSystem::GetFileSystem(*conn_wrapper->context);
fs.RegisterSubSystem(::duckdb::make_uniq<EmbeddedFileSystem>(entries));

duckdb_disconnect(&conn);
return true;
}

} // namespace flapi
68 changes: 68 additions & 0 deletions src/include/duckdb_embed_fs.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
#pragma once

#include "archive_io.hpp"

#include "duckdb/common/file_system.hpp"
#include "duckdb/common/file_open_flags.hpp"

#include <cstdint>
#include <memory>
#include <string>

namespace flapi {

// Path prefix that marks a path as belonging to EmbeddedFileSystem.
constexpr const char* kEmbedScheme = "embed://";

// DuckDB FileSystem implementation that serves entries from an
// in-memory ZIP via the `embed://` scheme. Sibling to
// `EmbeddedArchiveFileProvider` (#43) -- the two share one
// `std::shared_ptr<const ArchiveEntries>` so there is exactly one
// decompressed copy in memory.
//
// The spike's hard-won lesson is encoded in this override set:
// `OpenFile` / `Read` / `Seek` / `GetFileSize` alone are not enough
// for `read_csv()`. We also override `Glob` (path expansion happens
// before opening) and `SeekPosition` (the base class throws
// "not implemented" rather than no-oping for either).
class EmbeddedFileSystem : public ::duckdb::FileSystem {
public:
// Shares ownership of `entries`. Throws std::invalid_argument when
// `entries` is null.
explicit EmbeddedFileSystem(std::shared_ptr<const ArchiveEntries> entries);
~EmbeddedFileSystem() override = default;

// Returns a handle over the in-memory bytes of the entry named by `path`
// (the embed:// prefix is removed before the lookup). `opener` is unused.
::duckdb::unique_ptr<::duckdb::FileHandle> OpenFile(
const std::string& path,
::duckdb::FileOpenFlags flags,
::duckdb::optional_ptr<::duckdb::FileOpener> opener = nullptr) override;

// Streaming read: copies up to `nr_bytes` from the handle's current
// position, advances the position, and returns the number of bytes copied.
int64_t Read(::duckdb::FileHandle& handle, void* buffer, int64_t nr_bytes) override;
// Positional read: copies exactly `nr_bytes` starting at `location` without
// moving the handle's position; throws ::duckdb::IOException when the range
// falls outside the entry.
void Read(::duckdb::FileHandle& handle, void* buffer, int64_t nr_bytes,
::duckdb::idx_t location) override;

// Byte length of the entry behind `handle`.
int64_t GetFileSize(::duckdb::FileHandle& handle) override;

// Moves the handle's position to `location`; throws ::duckdb::IOException
// when `location` is past the end of the entry.
void Seek(::duckdb::FileHandle& handle, ::duckdb::idx_t location) override;
// Current position of `handle`.
::duckdb::idx_t SeekPosition(::duckdb::FileHandle& handle) override;

// Expands `*` and `?` (neither matches '/') against the entry names and
// returns the matching embed:// paths in sorted order. A path without glob
// characters is returned as-is when the entry exists. `opener` is unused.
::duckdb::vector<::duckdb::OpenFileInfo> Glob(
const std::string& path,
::duckdb::FileOpener* opener = nullptr) override;

// True when `fpath` starts with kEmbedScheme.
bool CanHandleFile(const std::string& fpath) override;
// True when the archive contains the entry named by `filename` (embed://
// prefix removed). `opener` is unused.
bool FileExists(const std::string& filename,
::duckdb::optional_ptr<::duckdb::FileOpener> opener = nullptr) override;
// Always false: entries are served from memory.
bool OnDiskFile(::duckdb::FileHandle& handle) override;
// Seeking is supported (see Seek / SeekPosition).
bool CanSeek() override { return true; }

// Name under which this filesystem identifies itself.
std::string GetName() const override { return "embed"; }

private:
// Shared archive contents; non-null after construction because the
// constructor rejects a null pointer.
std::shared_ptr<const ArchiveEntries> _entries;
};

// Registers an EmbeddedFileSystem on the global DuckDB instance.
// No-op when no bundle has been set via FileProviderFactory or when
// DatabaseManager isn't initialised yet. Returns true if a filesystem
// was registered.
bool RegisterEmbeddedFileSystem();

} // namespace flapi
8 changes: 8 additions & 0 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include "bundle_locator.hpp"
#include "config_manager.hpp"
#include "database_manager.hpp"
#include "duckdb_embed_fs.hpp"
#include "flapi_telemetry.hpp"
#include "rate_limit_middleware.hpp"
#include "config_token_utils.hpp"
Expand Down Expand Up @@ -412,6 +413,13 @@ int main(int argc, char* argv[])

initializeDatabase(config_manager);

// If a bundle was detected at startup, register the embed:// FS
// on the DuckDB instance so `read_csv('embed://...')` and similar
// calls inside SQL templates can resolve to the in-memory archive.
if (RegisterEmbeddedFileSystem()) {
CROW_LOG_INFO << "Registered embed:// DuckDB filesystem";
}

// Configure cloud credentials in DuckDB after database is initialized
configureCloudCredentialsInDuckDB();

Expand Down
1 change: 1 addition & 0 deletions test/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ add_executable(flapi_tests
config_service_test.cpp
config_service_filesystem_test.cpp
cors_policy_test.cpp
duckdb_embed_fs_test.cpp
config_service_parameters_test.cpp
config_service_slug_test.cpp
config_service_template_lookup_test.cpp
Expand Down
Loading
Loading