Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Flexible std::strings. #744

Merged
merged 1 commit into from
Oct 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 3 additions & 1 deletion include/highfive/bits/H5Attribute_misc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -87,8 +87,10 @@ inline void Attribute::read(T& array) const {
read(r.getPointer(), buffer_info.data_type);
// re-arrange results
r.unserialize(array);
auto t = create_datatype<typename details::inspector<T>::base_type>();

auto t = buffer_info.data_type;
auto c = t.getClass();

if (c == DataTypeClass::VarLen || t.isVariableStr()) {
#if H5_VERSION_GE(1, 12, 0)
// This one has been created in 1.12.0
Expand Down
270 changes: 268 additions & 2 deletions include/highfive/bits/H5Converter_misc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,23 @@
namespace HighFive {
namespace details {

template <class T>
struct is_std_string {
static constexpr bool value =
std::is_same<typename inspector<T>::base_type, std::string>::value;
};

/// SFINAE helper derived from `std::enable_if`: the nested `type` (equal to
/// `V`) only exists when the condition in the base class is true.
///
/// NOTE(review): two definitions follow the template header. This looks like
/// the removed and the added side of the diff hunk rendered one after the
/// other -- confirm against the real header. The first requires only
/// `inspector<T>::is_trivially_copyable`; the second additionally requires
/// `!is_std_string<T>::value`.
template <class T, class V = void>
struct enable_shallow_copy: public std::enable_if<inspector<T>::is_trivially_copyable, V> {};
struct enable_shallow_copy
: public std::enable_if<!is_std_string<T>::value && inspector<T>::is_trivially_copyable, V> {};

/// SFINAE helper derived from `std::enable_if`: the nested `type` (equal to
/// `V`) only exists when the condition in the base class is true.
///
/// NOTE(review): two definitions follow the template header. This looks like
/// the removed and the added side of the diff hunk rendered one after the
/// other -- confirm against the real header. The first requires only
/// `!inspector<T>::is_trivially_copyable`; the second additionally requires
/// `!is_std_string<T>::value`.
template <class T, class V = void>
struct enable_deep_copy: public std::enable_if<!inspector<T>::is_trivially_copyable, V> {};
struct enable_deep_copy
: public std::enable_if<!is_std_string<T>::value && !inspector<T>::is_trivially_copyable, V> {};

/// \brief SFINAE helper: the nested `type` (equal to `V`) exists only when
///        `is_std_string<T>::value` is true.
template <class T, class V = void>
struct enable_string_copy : std::enable_if<is_std_string<T>::value, V> {};


template <typename T, bool IsReadOnly>
struct ShallowCopyBuffer {
Expand Down Expand Up @@ -85,6 +97,243 @@
std::vector<size_t> dims;
};

/// \brief Compile-time tag used as the `buffer_mode` template argument of
///        `StringBuffer`.
enum class BufferMode {
    Read,
    Write,
};


///
/// \brief String length in bytes excluding the `\0`.
///
/// Scans `str` for the first null character among its first
/// `max_string_length` bytes.
///
/// \param str               Pointer to the first byte of the string.
/// \param max_string_length Upper bound on the returned length.
/// \return Index of the first `\0` found, or `max_string_length` if none of
///         the first `max_string_length` bytes is a null character.
///
/// The byte at index `max_string_length` is deliberately not inspected: the
/// result is `max_string_length` whether or not that byte is `\0`, so only
/// `max_string_length` bytes of `str` need to be readable.
inline size_t char_buffer_size(char const* const str, size_t max_string_length) {
    for (size_t i = 0; i < max_string_length; ++i) {
        if (str[i] == '\0') {
            return i;
        }
    }

    return max_string_length;
}


///
/// \brief A buffer for reading/writing strings.
///
/// A string in HDF5 can be represented as a fixed or variable length string.
/// The important difference for this buffer is that `H5D{read,write}` expects
/// different input depending on whether the strings are fixed or variable length.
/// For fixed length strings, it expects an array of chars, i.e. one string
/// packed after the other contiguously. While for variable length strings it
/// expects a list of pointers to the beginning of each string. Variable length
/// strings must be null-terminated, because that is how their length is
/// determined.
///
/// This buffer hides the difference between fixed and variable length strings
/// by having internal data structures available for both cases at compile time.
/// The choice which internal buffer to use is made at runtime.
///
/// Consider an HDF5 dataset with N fixed-length strings, each of which is M
/// characters long. Then the in-memory strings are copied into an internal
/// buffer of size N*M. If null- or space-padded the buffer should be filled
/// with the appropriate character. This is important if the in-memory strings
/// are less than M characters long.
///
/// An HDF5 dataset with N variable-length strings (all null-terminated) uses
/// the internal list of pointers to the beginning of each string. Those
/// pointers can either point to the in-memory strings themselves, if those
/// strings are known to be null-terminated. Otherwise the in-memory strings are
/// copied to an internal buffer of null-terminated strings; and the pointer
/// points to the start of the string in the internal buffer.
///
/// This class is responsible for arranging the strings properly before passing
/// the buffers to HDF5. To keep this class generic, it provides a generic
/// read/write interface to the internal strings, i.e. a pointer with a size.
/// For reading from the buffer the proxy is called `StringConstView`. This
/// proxy object is to be used by the `inspector` to copy from the buffer into
/// the final destination, e.g. an `std::string`. Similarly, there's a proxy
/// object for serializing into the buffer, i.e. the `StringView`. Again the
/// `inspector` is responsible for obtaining the pointer, size and padding of
/// the string.
///
/// Nomenclature:
/// - size of a string is the number of bytes required to store the string,
/// including the null character for null-terminated strings.
///
/// - length of a string is the number of bytes without the null character.
///
/// Note: both 'length' and 'size' are counted in number of bytes, not number
/// of symbols or characters. Even for UTF8 strings.
template <typename T, BufferMode buffer_mode>
struct StringBuffer {
using type = unqualified_t<T>;
using hdf5_type = typename inspector<type>::hdf5_type;

/// \brief Write proxy for the string at index `i` of a `StringBuffer`.
///
/// Holds a reference to the buffer and the index of the string; `assign`
/// stores an in-memory string at that index.
class StringView {
public:
StringView(StringBuffer<T, buffer_mode>& _buffer, size_t _i)
: buffer(_buffer)
, i(_i) {}

///
/// \brief Assign the in-memory string to the buffer.
///
/// This method copies the in-memory string to the appropriate
/// internal buffer as needed.
///
/// The `length` is the length of the string in bytes.
///
/// \param data    Pointer to the first byte of the in-memory string.
/// \param length  Number of bytes of `data` that belong to the string.
/// \param padding Padding of the in-memory string `data`.
///
/// \throws std::invalid_argument if the buffer holds fixed-length strings
///         and `length` exceeds `buffer.string_length`.
///
/// If the buffer reports neither a variable-length nor a fixed-length
/// string, nothing is stored.
void assign(char const* data, size_t length, StringPadding padding) {
if (buffer.isVariableLengthString()) {
if (padding == StringPadding::NullTerminated) {
// No copy is made: the buffer keeps the caller's pointer, so `data`
// has to stay valid for as long as the pointer list is in use.
buffer.variable_length_pointers[i] = data;
} else {
// Copy into an owned `std::string` and point at that copy instead
// of at the caller's memory.
buffer.variable_length_buffer[i] = std::string(data, length);
buffer.variable_length_pointers[i] = buffer.variable_length_buffer[i].data();

Check warning on line 188 in include/highfive/bits/H5Converter_misc.hpp

View check run for this annotation

Codecov / codecov/patch

include/highfive/bits/H5Converter_misc.hpp#L187-L188

Added lines #L187 - L188 were not covered by tests
}
} else if (buffer.isFixedLengthString()) {
// If the buffer is fixed-length and null-terminated, then
// `buffer.string_length` doesn't include the null-character.
if (length > buffer.string_length) {
throw std::invalid_argument("String length too big.");
}

// Only `length` bytes are written; the remaining bytes of this slot keep
// the pad character the `StringBuffer` constructor filled the buffer with.
memcpy(&buffer.fixed_length_buffer[i * buffer.string_size], data, length);
}
}

private:
StringBuffer<T, buffer_mode>& buffer;
size_t i;
};


/// \brief Read-only proxy for the string at index `i` of a `StringBuffer`.
///
/// Holds a const reference to the buffer and the index of the string, and
/// exposes the string as a pointer plus a length in bytes.
class StringConstView {
  public:
    StringConstView(const StringBuffer<T, buffer_mode>& _buffer, size_t _i)
        : buffer(_buffer)
        , i(_i) {}

    /// \brief Pointer to the first byte of the string.
    ///
    /// The valid indices for this pointer are: 0, ..., length() - 1.
    char const* data() const {
        if (buffer.isVariableLengthString()) {
            return buffer.variable_length_pointers[i];
        } else {
            return &buffer.fixed_length_buffer[i * buffer.string_size];
        }
    }

    /// \brief Length of the string in bytes.
    ///
    /// Note that for null-terminated strings the "length" doesn't include
    /// the null character. Hence, if storing this string as a
    /// null-terminated string, the destination buffer needs to be at least
    /// `length() + 1` bytes long.
    ///
    /// For variable-length strings the length is always found by scanning for
    /// the terminating null character, whatever padding the datatype reports.
    /// `buffer.string_length` is derived from the `size_t(-1)` placeholder in
    /// that case and must never be returned as a length.
    size_t length() const {
        const bool scan_for_terminator = buffer.isVariableLengthString() ||
                                         buffer.isNullTerminated();
        if (scan_for_terminator) {
            return char_buffer_size(data(), buffer.string_length);
        } else {
            return buffer.string_length;
        }
    }

  private:
    const StringBuffer<T, buffer_mode>& buffer;
    size_t i;
};


/// \brief Index-based cursor over the strings of a `StringBuffer`.
///
/// Dereferencing a non-const iterator yields a `StringView`; dereferencing a
/// const iterator yields a `StringConstView`. Both refer to the string at the
/// iterator's current position.
class Iterator {
  public:
    Iterator(StringBuffer<T, buffer_mode>& _buffer, size_t _pos)
        : buffer(_buffer)
        , pos(_pos) {}

    /// Returns an iterator `n_strings` positions past this one; `*this` is
    /// left untouched.
    Iterator operator+(size_t n_strings) const {
        Iterator advanced(*this);
        advanced += n_strings;
        return advanced;
    }

    /// Moves this iterator forward by `n_strings` positions.
    void operator+=(size_t n_strings) {
        pos = pos + n_strings;
    }

    /// Writable proxy for the string at the current position.
    StringView operator*() {
        return StringView{buffer, pos};
    }

    /// Read-only proxy for the string at the current position.
    StringConstView operator*() const {
        return StringConstView{buffer, pos};
    }

  private:
    StringBuffer<T, buffer_mode>& buffer;
    size_t pos;
};

/// \brief Set up the internal storage for the strings described by `_dims`
///        and `_file_datatype`.
///
/// \param _dims          Dimensions of the array of strings; stored in `dims`.
/// \param _file_datatype Datatype of the strings; converted with
///                       `asStringType()` and stored in `file_datatype`.
///
/// \throws DataTypeException if the string size is zero while the padding is
///         null-terminated.
///
/// The initializers for `padding`, `string_size` and `string_length` read
/// members named earlier in the list.
StringBuffer(std::vector<size_t> _dims, const DataType& _file_datatype)
: file_datatype(_file_datatype.asStringType())
, padding(file_datatype.getPadding())
// `size_t(-1)` is a placeholder: variable-length strings have no fixed size.
, string_size(file_datatype.isVariableStr() ? size_t(-1) : file_datatype.getSize())
// One byte is subtracted for the null character of null-terminated strings.
, string_length(string_size - size_t(isNullTerminated()))
, dims(_dims) {
if (string_size == 0 && isNullTerminated()) {
throw DataTypeException(
"Fixed-length, null-terminated need at least one byte to store the "
"null-character.");

Check warning on line 280 in include/highfive/bits/H5Converter_misc.hpp

View check run for this annotation

Codecov / codecov/patch

include/highfive/bits/H5Converter_misc.hpp#L280

Added line #L280 was not covered by tests
}

// presumably the number of strings described by `dims` -- confirm against
// `compute_total_size`.
auto n_strings = compute_total_size(dims);
if (isVariableLengthString()) {
// One owned string and one pointer per entry.
variable_length_buffer.resize(n_strings);
variable_length_pointers.resize(n_strings);
} else {
// Pre-fill the contiguous buffer with the pad character so that bytes
// not overwritten later already hold the padding.
char pad = padding == StringPadding::SpacePadded ? ' ' : '\0';
fixed_length_buffer.assign(n_strings * string_size, pad);
}
}

/// \brief Whether `file_datatype` reports a variable-length string.
bool isVariableLengthString() const {
    const bool is_variable = file_datatype.isVariableStr();
    return is_variable;
}

/// \brief Whether `file_datatype` reports a fixed-length string.
bool isFixedLengthString() const {
    const bool is_fixed = file_datatype.isFixedLenStr();
    return is_fixed;
}

bool isNullTerminated() const {
return file_datatype.getPadding() == StringPadding::NullTerminated;
}


/// \brief Untyped pointer to the internal storage currently in use.
///
/// \return For variable-length strings, the start of the pointer list;
///         otherwise the start of the contiguous character buffer.
void* getPointer() {
    const bool uses_pointer_list = file_datatype.isVariableStr();
    if (!uses_pointer_list) {
        return fixed_length_buffer.data();
    }
    return variable_length_pointers.data();
}

/// \brief Iterator positioned at the first string (index 0) of this buffer.
Iterator begin() {
    return Iterator(*this, size_t{0});
}

/// \brief Copy the strings held by this buffer into `val`.
///
/// Delegates to `inspector<type>::unserialize`, handing it an iterator to the
/// first string together with the stored dimensions.
///
/// \param val Destination object.
void unserialize(T& val) {
inspector<type>::unserialize(begin(), dims, val);
}

private:
// Datatype of the strings, as passed to the constructor and converted with
// `asStringType()`.
StringType file_datatype;
// Padding reported by `file_datatype` at construction time.
StringPadding padding;
size_t string_size; // Size of buffer required to store the string.
// Meaningful for fixed length strings only.
// Set to `size_t(-1)` for variable length strings.
size_t string_length; // Semantic length of string.
// `string_size`, minus one if null-terminated.
// Dimensions of the array of strings.
std::vector<size_t> dims;

// Contiguous storage used when the strings are not variable-length:
// `string_size` bytes per string.
std::vector<char> fixed_length_buffer;
// Owned copies of strings, used when a variable-length string is assigned
// with a padding other than null-terminated.
std::vector<std::string> variable_length_buffer;
// One pointer per variable-length string. The pointee is `const char` in
// `BufferMode::Write` and `char` otherwise.
std::vector<
typename std::conditional<buffer_mode == BufferMode::Write, const char, char>::type*>
variable_length_pointers;
};


template <typename T, typename Enable = void>
struct Writer;
Expand All @@ -107,6 +356,14 @@
}
};

/// \brief `Writer` specialization selected by `enable_string_copy<T>`.
///
/// Builds a write-mode `StringBuffer` sized from `inspector<T>::getDimensions(val)`
/// and immediately serializes `val` into it.
template <typename T>
struct Writer<T, typename enable_string_copy<T>::type>: public StringBuffer<T, BufferMode::Write> {
  private:
    using buffer_type = StringBuffer<T, BufferMode::Write>;

  public:
    /// \param val            Object whose strings are serialized into the buffer.
    /// \param _file_datatype Datatype forwarded to the `StringBuffer` constructor.
    explicit Writer(const T& val, const DataType& _file_datatype)
        : buffer_type(inspector<T>::getDimensions(val), _file_datatype) {
        inspector<T>::serialize(val, buffer_type::begin());
    }
};

template <typename T, typename Enable = void>
struct Reader;

Expand All @@ -133,6 +390,15 @@
};


/// \brief `Reader` specialization selected by `enable_string_copy<T>`.
///
/// Owns a `StringBuffer` with the requested dimensions. The buffer is
/// instantiated in `BufferMode::Read`, so the internal list of
/// variable-length pointers has element type `char*` rather than
/// `const char*`: in the read direction the buffer is the destination that
/// gets filled, not a view of caller-owned constant strings.
///
/// Only the read-side interface (`getPointer`, `unserialize`) is meant to be
/// used on this type.
template <typename T>
struct Reader<T, typename enable_string_copy<T>::type>: public StringBuffer<T, BufferMode::Read> {
  public:
    /// \param _dims          Dimensions of the array of strings to read.
    /// \param _file_datatype Datatype forwarded to the `StringBuffer` constructor.
    ///
    /// The second parameter is unused.
    explicit Reader(const std::vector<size_t>& _dims,
                    const T& /* val */,
                    const DataType& _file_datatype)
        : StringBuffer<T, BufferMode::Read>(_dims, _file_datatype) {}
};

struct data_converter {
template <typename T>
static Writer<T> serialize(const typename inspector<T>::type& val,
Expand Down
2 changes: 1 addition & 1 deletion include/highfive/bits/H5DataType_misc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
#include <half.hpp>
#endif

#include "H5Converter_misc.hpp"
#include "H5Inspector_misc.hpp"

namespace HighFive {

Expand Down
18 changes: 12 additions & 6 deletions include/highfive/bits/H5Inspector_misc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
#include <type_traits>
#include <cstring>
#include <cassert>
#include <vector>
#include <array>
#include <string>
#include <numeric>

#include "../H5Reference.hpp"
Expand All @@ -28,7 +31,9 @@
#include <Eigen/Eigen>
#endif


namespace HighFive {

namespace details {

inline bool checkDimensions(const std::vector<size_t>& dims, size_t n_dim_requested) {
Expand Down Expand Up @@ -260,14 +265,15 @@ struct inspector<std::string>: type_helper<std::string> {
throw DataSpaceException("A std::string cannot be written directly.");
}

static void serialize(const type& val, hdf5_type* m) {
*m = val.c_str();
/// \brief Store `val` in the slot `m` refers to.
///
/// Dereferences `m` and calls `assign` on the result with the bytes and the
/// byte count of `val`, declaring them null-terminated.
template <class It>
static void serialize(const type& val, It m) {
    auto&& slot = *m;
    slot.assign(val.data(), val.size(), StringPadding::NullTerminated);
}

static void unserialize(const hdf5_type* vec,
const std::vector<size_t>& /* dims */,
type& val) {
val = vec[0];
/// \brief Copy the string `vec` refers to into `val`.
///
/// Dereferences `vec` and assigns `length()` bytes starting at `data()` of
/// the result to `val`. The dimensions argument is unused.
template <class It>
static void unserialize(const It& vec, const std::vector<size_t>& /* dims */, type& val) {
    const auto& source = *vec;
    const char* const first_byte = source.data();
    const size_t n_bytes = source.length();
    val.assign(first_byte, n_bytes);
}
};

Expand Down