diff --git a/cpp/src/arrow/sparse_tensor.cc b/cpp/src/arrow/sparse_tensor.cc
index f3cfd42b1bec5..f8f8bf425e39d 100644
--- a/cpp/src/arrow/sparse_tensor.cc
+++ b/cpp/src/arrow/sparse_tensor.cc
@@ -419,6 +419,154 @@ class SparseTensorConverter<TYPE, SparseCSCIndex>
   inline Status CheckMaximumValue(const uint64_t) const { return Status::OK(); }
 };
 
+// ----------------------------------------------------------------------
+// SparseTensorConverter for SparseCSFIndex
+
+template <typename TYPE>
+class SparseTensorConverter<TYPE, SparseCSFIndex>
+    : private SparseTensorConverterBase<TYPE> {
+ public:
+  using BaseClass = SparseTensorConverterBase<TYPE>;
+  using typename BaseClass::NumericTensorType;
+  using typename BaseClass::value_type;
+
+  SparseTensorConverter(const NumericTensorType& tensor,
+                        const std::shared_ptr<DataType>& index_value_type,
+                        MemoryPool* pool)
+      : BaseClass(tensor, index_value_type, pool) {}
+
+  // IndexValueType is accepted for dispatch only; this version always
+  // materializes the CSF index as int64, matching the int64 COO coords
+  // and the int64() index tensors built below.
+  template <typename IndexValueType>
+  Status Convert() {
+    const int64_t indices_elsize = sizeof(int64_t);
+
+    std::shared_ptr<SparseCOOTensor> sparse_coo_tensor;
+    RETURN_NOT_OK(SparseCOOTensor::Make(tensor_, &sparse_coo_tensor));
+    std::shared_ptr<SparseCOOIndex::CoordsTensor> coords =
+        arrow::internal::checked_pointer_cast<SparseCOOIndex>(
+            sparse_coo_tensor->sparse_index())
+            ->indices();
+
+    // Convert SparseCOOTensor to long CSF buffers
+    const int64_t ndim = tensor_.ndim();
+    const int64_t nonzero_count = sparse_coo_tensor->non_zero_length();
+
+    std::vector<int64_t> counts(ndim);
+    std::fill_n(counts.begin(), ndim, static_cast<int64_t>(0));
+
+    std::vector<int64_t> axis_order(ndim);
+    for (int64_t i = 0; i < ndim; ++i) axis_order[i] = i;
+
+    std::shared_ptr<Buffer> indices_buffer;
+    std::shared_ptr<Buffer> indptr_buffer;
+    RETURN_NOT_OK(
+        AllocateBuffer(pool_, indices_elsize * ndim * nonzero_count, &indices_buffer));
+    RETURN_NOT_OK(AllocateBuffer(
+        pool_, indices_elsize * (ndim - 1) * (nonzero_count + 1), &indptr_buffer));
+    int64_t* indices = reinterpret_cast<int64_t*>(indices_buffer->mutable_data());
+    int64_t* indptr = reinterpret_cast<int64_t*>(indptr_buffer->mutable_data());
+
+    for (int64_t row = 0; row < nonzero_count; ++row) {
+      bool tree_split = false;
+      for (int64_t column = 0; column < ndim; ++column) {
+        // Guard the comparison so the first row never reads coords at row -1.
+        bool change = row > 0 && coords->Value({row, column}) !=
+                                     coords->Value({row - 1, column});
+
+        if (row == 0 || tree_split || change) {
+          // Once a level splits, every deeper level starts a new node.
+          tree_split = true;
+
+          indices[column * nonzero_count + counts[column]] =
+              coords->Value({row, column});
+          // Only the first ndim - 1 levels carry indptr.
+          if (column < ndim - 1) {
+            indptr[column * (nonzero_count + 1) + counts[column]] = counts[column + 1];
+          }
+          ++counts[column];
+        }
+      }
+    }
+
+    // Close each indptr level with its final offset.
+    for (int64_t column = 0; column < ndim - 1; ++column) {
+      indptr[column * (nonzero_count + 1) + counts[column]] = counts[column + 1];
+    }
+
+    // Compact the per-level arrays so the levels become contiguous.
+    int64_t total_size = counts[0];
+    for (int64_t column = 1; column < ndim; ++column) {
+      for (int64_t i = 0; i < counts[column] + 1; ++i) {
+        if (column < ndim - 1)
+          indptr[total_size + column + i] = indptr[column * (nonzero_count + 1) + i];
+        if (i < counts[column])
+          indices[total_size + i] = indices[column * nonzero_count + i];
+      }
+      total_size += counts[column];
+    }
+
+    // Copy CSF index data into smaller buffers
+    std::shared_ptr<Buffer> out_indices_buffer;
+    std::shared_ptr<Buffer> out_indptr_buffer;
+    RETURN_NOT_OK(
+        AllocateBuffer(pool_, indices_elsize * total_size, &out_indices_buffer));
+    RETURN_NOT_OK(AllocateBuffer(
+        pool_, indices_elsize * (total_size - nonzero_count + ndim - 1),
+        &out_indptr_buffer));
+    int64_t* out_indices =
+        reinterpret_cast<int64_t*>(out_indices_buffer->mutable_data());
+    int64_t* out_indptr = reinterpret_cast<int64_t*>(out_indptr_buffer->mutable_data());
+
+    for (int64_t i = 0; i < total_size; ++i) out_indices[i] = indices[i];
+
+    for (int64_t i = 0; i < total_size - nonzero_count + ndim - 1; ++i)
+      out_indptr[i] = indptr[i];
+
+    // Construct SparseCSFTensor
+    std::vector<int64_t> out_indptr_shape({total_size - nonzero_count + ndim - 1});
+    std::shared_ptr<Tensor> out_indptr_tensor =
+        std::make_shared<Tensor>(int64(), out_indptr_buffer, out_indptr_shape);
+
+    std::vector<int64_t> out_indices_shape({total_size});
+    std::shared_ptr<Tensor> out_indices_tensor =
+        std::make_shared<Tensor>(int64(), out_indices_buffer, out_indices_shape);
+
+    std::vector<int64_t> indptr_offsets(ndim - 1);
+    std::vector<int64_t> indices_offsets(ndim);
+    std::fill_n(indptr_offsets.begin(), ndim - 1, static_cast<int64_t>(0));
+    std::fill_n(indices_offsets.begin(), ndim, static_cast<int64_t>(0));
+
+    for (int64_t i = 0; i < ndim - 2; ++i)
+      indptr_offsets[i + 1] = indptr_offsets[i] + counts[i] + 1;
+
+    // Stop at ndim - 1 so the last write stays inside indices_offsets.
+    for (int64_t i = 0; i < ndim - 1; ++i)
+      indices_offsets[i + 1] = indices_offsets[i] + counts[i];
+
+    sparse_index =
+        std::make_shared<SparseCSFIndex>(out_indptr_tensor, out_indices_tensor,
+                                         indptr_offsets, indices_offsets, axis_order);
+    data = sparse_coo_tensor->data();
+
+    return Status::OK();
+  }
+
+#define CALL_TYPE_SPECIFIC_CONVERT(TYPE_CLASS) \
+  case TYPE_CLASS##Type::type_id:              \
+    return Convert<TYPE_CLASS##Type>();
+
+  Status Convert() {
+    switch (index_value_type_->id()) {
+      ARROW_GENERATE_FOR_ALL_INTEGER_TYPES(CALL_TYPE_SPECIFIC_CONVERT);
+      // LCOV_EXCL_START: The following invalid type causes program failure.
+      default:
+        return Status::TypeError("Unsupported SparseTensor index value type");
+        // LCOV_EXCL_STOP
+    }
+  }
+
+#undef CALL_TYPE_SPECIFIC_CONVERT
+
+  std::shared_ptr<SparseCSFIndex> sparse_index;
+  std::shared_ptr<Buffer> data;
+
+ private:
+  using BaseClass::index_value_type_;
+  using BaseClass::pool_;
+  using BaseClass::tensor_;
+};
+
 // ----------------------------------------------------------------------
 // Instantiate templates
 
@@ -502,7 +650,8 @@ Status MakeSparseTensorFromTensor(const Tensor& tensor,
       return MakeSparseTensorFromTensor<SparseCSCIndex>(tensor, index_value_type, pool,
                                                         out_sparse_index, out_data);
     case SparseTensorFormat::CSF:
-      return Status::Invalid("Unsupported Tensor value type");
+      return MakeSparseTensorFromTensor<SparseCSFIndex>(tensor, index_value_type, pool,
+                                                        out_sparse_index, out_data);
 
     // LCOV_EXCL_START: ignore program failure
     default:
@@ -812,7 +961,7 @@ SparseCSFIndex::SparseCSFIndex(const std::shared_ptr<Tensor>& indptr,
                                const std::vector<int64_t>& indptr_offsets,
                                const std::vector<int64_t>& indices_offsets,
                                const std::vector<int64_t>& axis_order)
-    : SparseIndexBase(indices->shape()[0] - indices_offsets.back()),
+    : SparseIndexBase(indices->size() - indices_offsets.back()),
      indptr_(indptr),
      indices_(indices),
      indptr_offsets_(indptr_offsets),
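For orientation: the converter walks the lexicographically sorted COO coordinates once and emits a node at a level whenever that level's coordinate differs from the previous row's, or whenever an outer level has already split, so the leaf level always ends up with exactly one index per nonzero. The standalone sketch below reproduces just that counting step on a hard-coded coordinate list; it is illustrative only and not part of the patch:

    // csf_count_sketch.cc -- illustrative only, not part of the patch.
    #include <cstdint>
    #include <iostream>
    #include <vector>

    int main() {
      // Four nonzeros of a 3-D tensor; each row is one coordinate triple,
      // sorted lexicographically (the order the COO conversion yields).
      const std::vector<std::vector<int64_t>> coords = {
          {0, 0, 1}, {0, 1, 0}, {1, 0, 0}, {1, 0, 2}};
      const int64_t ndim = 3;

      std::vector<int64_t> counts(ndim, 0);
      for (std::size_t row = 0; row < coords.size(); ++row) {
        bool tree_split = false;
        for (int64_t dim = 0; dim < ndim; ++dim) {
          const bool change = row > 0 && coords[row][dim] != coords[row - 1][dim];
          // Emit a node when this level's coordinate changes or an outer
          // level already split; the first row seeds every level.
          if (row == 0 || tree_split || change) {
            tree_split = true;
            ++counts[dim];
          }
        }
      }

      // Prints "level 0: 2", "level 1: 3", "level 2: 4"; the leaf count
      // equals the number of nonzeros.
      for (int64_t dim = 0; dim < ndim; ++dim) {
        std::cout << "level " << dim << ": " << counts[dim] << " indices\n";
      }
      return 0;
    }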
diff --git a/cpp/src/arrow/sparse_tensor_test.cc b/cpp/src/arrow/sparse_tensor_test.cc
index 91588ac27a70b..5496df8b0033c 100644
--- a/cpp/src/arrow/sparse_tensor_test.cc
+++ b/cpp/src/arrow/sparse_tensor_test.cc
@@ -982,4 +982,32 @@ TEST_F(TestSparseCSFTensor, TestToTensor) {
   ASSERT_TRUE(tensor.Equals(*dense_tensor));
 }
 
+TEST_F(TestSparseCSFTensor, CreationFromTensor) {
+  std::vector<int64_t> values = {
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 4, 0, 5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 7, 8};
+  std::vector<int64_t> shape({3, 3, 3, 4});
+  std::vector<std::string> dim_names({"a", "b", "c", "d"});
+  std::shared_ptr<Buffer> buffer = Buffer::Wrap(values);
+  Tensor tensor(int64(), buffer, shape, {}, dim_names);
+
+  std::shared_ptr<SparseCSFTensor> st;
+  ASSERT_OK(SparseCSFTensor::Make(tensor, &st));
+
+  ASSERT_EQ(8, st->non_zero_length());
+  ASSERT_TRUE(st->is_mutable());
+
+  ASSERT_EQ(dim_names, st->dim_names());
+  ASSERT_EQ("a", st->dim_name(0));
+  ASSERT_EQ("b", st->dim_name(1));
+  ASSERT_EQ("c", st->dim_name(2));
+  ASSERT_EQ("d", st->dim_name(3));
+
+  std::shared_ptr<Tensor> dt;
+  ASSERT_OK(st->ToTensor(&dt));
+  ASSERT_TRUE(tensor.Equals(*dt));
+}
+
 }  // namespace arrow
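A minimal usage sketch of the conversion this patch enables, assuming only the calls exercised by the test above (SparseCSFTensor::Make with the status-and-out-parameter convention, non_zero_length(), and ToTensor()); the program and its file name are illustrative, not part of the patch:

    // csf_roundtrip_sketch.cc -- illustrative only, not part of the patch.
    #include <iostream>
    #include <memory>
    #include <vector>

    #include "arrow/buffer.h"
    #include "arrow/sparse_tensor.h"
    #include "arrow/status.h"
    #include "arrow/tensor.h"

    arrow::Status RoundTrip() {
      // A mostly-zero 2x3x4 int64 tensor with two nonzeros.
      std::vector<int64_t> values(24, 0);
      values[5] = 7;   // coordinate (0, 1, 1)
      values[21] = 9;  // coordinate (1, 2, 1)
      std::shared_ptr<arrow::Buffer> buffer = arrow::Buffer::Wrap(values);
      arrow::Tensor dense(arrow::int64(), buffer, {2, 3, 4});

      // Compress to CSF; the new converter routes through SparseCOOTensor.
      std::shared_ptr<arrow::SparseCSFTensor> sparse;
      ARROW_RETURN_NOT_OK(arrow::SparseCSFTensor::Make(dense, &sparse));
      std::cout << "non-zeros: " << sparse->non_zero_length() << std::endl;

      // Expand back to a dense tensor and verify the round trip.
      std::shared_ptr<arrow::Tensor> roundtripped;
      ARROW_RETURN_NOT_OK(sparse->ToTensor(&roundtripped));
      return dense.Equals(*roundtripped)
                 ? arrow::Status::OK()
                 : arrow::Status::Invalid("round trip mismatch");
    }

    int main() {
      arrow::Status st = RoundTrip();
      if (!st.ok()) {
        std::cerr << st.ToString() << std::endl;
        return 1;
      }
      return 0;
    }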