-
Notifications
You must be signed in to change notification settings - Fork 14k
[libc] utf8 to 32 CharacterConverter #143973
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Conversation
@llvm/pr-subscribers-libc Author: None (sribee8) ChangesImplemented push and pop for utf8 to 32 conversion and tests. Full diff: https://github.com/llvm/llvm-project/pull/143973.diff 5 Files Affected:
diff --git a/libc/src/__support/wchar/character_converter.cpp b/libc/src/__support/wchar/character_converter.cpp
index 3cdb8ca83b7f0..9c2fde3134837 100644
--- a/libc/src/__support/wchar/character_converter.cpp
+++ b/libc/src/__support/wchar/character_converter.cpp
@@ -18,15 +18,79 @@ namespace internal {
CharacterConverter::CharacterConverter(mbstate *mbstate) { state = mbstate; }
-bool CharacterConverter::isComplete() {}
+bool CharacterConverter::isComplete() {
+ return state->bytes_processed == state->total_bytes;
+}
-int CharacterConverter::push(char8_t utf8_byte) {}
+int CharacterConverter::push(char8_t utf8_byte) {
+ // Checking the first byte if first push
+ if (state->bytes_processed == 0 && state->total_bytes == 0) {
+ // 1 byte total
+ if ((utf8_byte & 128) == 0) {
+ state->total_bytes = 1;
+ state->bytes_processed = 1;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // 2 bytes total
+ else if ((utf8_byte & 0xE0) == 0xC0) {
+ state->total_bytes = 2;
+ state->bytes_processed = 1;
+ utf8_byte &= 0x1F;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // 3 bytes total
+ else if ((utf8_byte & 0xF0) == 0xE0) {
+ state->total_bytes = 3;
+ state->bytes_processed = 1;
+ utf8_byte &= 0x0F;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // 4 bytes total
+ else if ((utf8_byte & 0xF8) == 0xF0) {
+ state->total_bytes = 4;
+ state->bytes_processed = 1;
+ utf8_byte &= 0x07;
+ state->partial = static_cast<char32_t>(utf8_byte);
+ return 0;
+ }
+ // Invalid
+ else {
+ state->bytes_processed++;
+ return -1;
+ }
+ }
+ // Any subsequent push
+ if ((utf8_byte & 0xC0) == 0x80) {
+ state->partial = state->partial << 6;
+ char32_t byte = utf8_byte & 0x3F;
+ state->partial |= byte;
+ state->bytes_processed++;
+ return 0;
+ }
+ state->bytes_processed++;
+ return -1;
+}
-int CharacterConverter::push(char32_t utf32) {}
+int CharacterConverter::push(char32_t utf32) {
+ return utf32;
+}
-utf_ret<char8_t> CharacterConverter::pop_utf8() {}
+utf_ret<char8_t> CharacterConverter::pop_utf8() {
+ utf_ret<char8_t> utf8;
+ utf8.error = 0;
+ utf8.out = 0;
+ return utf8;
+}
-utf_ret<char32_t> CharacterConverter::pop_utf32() {}
+utf_ret<char32_t> CharacterConverter::pop_utf32() {
+ utf_ret<char32_t> utf32;
+ utf32.error = 0;
+ utf32.out = state->partial;
+ return utf32;
+}
} // namespace internal
} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/__support/wchar/mbstate.h b/libc/src/__support/wchar/mbstate.h
index cb8950374de41..d33ee354a5443 100644
--- a/libc/src/__support/wchar/mbstate.h
+++ b/libc/src/__support/wchar/mbstate.h
@@ -18,7 +18,7 @@ namespace internal {
struct mbstate {
char32_t partial;
- uint8_t bits_processed;
+ uint8_t bytes_processed;
uint8_t total_bytes;
};
diff --git a/libc/test/src/__support/CMakeLists.txt b/libc/test/src/__support/CMakeLists.txt
index 4fb0dae86e5ca..8905ac2127620 100644
--- a/libc/test/src/__support/CMakeLists.txt
+++ b/libc/test/src/__support/CMakeLists.txt
@@ -275,3 +275,4 @@ add_subdirectory(fixed_point)
add_subdirectory(HashTable)
add_subdirectory(time)
add_subdirectory(threads)
+add_subdirectory(wchar)
diff --git a/libc/test/src/__support/wchar/CMakeLists.txt b/libc/test/src/__support/wchar/CMakeLists.txt
new file mode 100644
index 0000000000000..cf8e615a4fd59
--- /dev/null
+++ b/libc/test/src/__support/wchar/CMakeLists.txt
@@ -0,0 +1,11 @@
+add_custom_target(libc-support-wchar-tests)
+
+add_libc_test(
+ utf8_to_32_test
+ SUITE
+ libc-support-tests
+ SRCS
+ utf8_to_32_test.cpp
+ DEPENDS
+ libc.src.__support.wchar.character_converter
+)
\ No newline at end of file
diff --git a/libc/test/src/__support/wchar/utf8_to_32_test.cpp b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
new file mode 100644
index 0000000000000..aef9cfc557549
--- /dev/null
+++ b/libc/test/src/__support/wchar/utf8_to_32_test.cpp
@@ -0,0 +1,125 @@
+//===-- Unittests for character_converter utf8->3 -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/__support/wchar/character_converter.h"
+#include "src/__support/wchar/mbstate.h"
+#include "src/__support/wchar/utf_ret.h"
+#include "test/UnitTest/Test.h"
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, OneByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ char ch = 'A';
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ EXPECT_EQ(err, 0);
+ EXPECT_EQ(wch.error, 0);
+ EXPECT_EQ(static_cast<int>(wch.out), 65);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, TwoBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char *ch = "�"; // hex 0xC2, 0x8E
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 142);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, ThreeBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char *ch = "∑"; // hex 0xE2, 0x88, 0x91
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ char_conv.push(static_cast<char8_t>(ch[2]));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 8721);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, FourBytes) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char *ch = "🤡"; // hex 0xF0, 0x9F, 0xA4, 0xA1
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ char_conv.push(static_cast<char8_t>(ch[0]));
+ char_conv.push(static_cast<char8_t>(ch[1]));
+ char_conv.push(static_cast<char8_t>(ch[2]));
+ char_conv.push(static_cast<char8_t>(ch[3]));
+ LIBC_NAMESPACE::internal::utf_ret<char32_t> wch = char_conv.pop_utf32();
+
+ ASSERT_EQ(wch.error, 0);
+ ASSERT_EQ(static_cast<int>(wch.out), 129313);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch = static_cast<char>(0x80); // invalid starting bit sequence
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch));
+
+ ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMultiByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[4] = {static_cast<char>(0x80), static_cast<char>(0x00),
+ static_cast<char>(0x00),
+ static_cast<char>(0x00)}; // All bytes are invalid
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[3]));
+ ASSERT_EQ(err, -1);
+}
+
+TEST(LlvmLibcCharacterConverterUTF8To32Test, InvalidMiddleByte) {
+ LIBC_NAMESPACE::internal::mbstate state;
+ state.bytes_processed = 0;
+ state.total_bytes = 0;
+ const char ch[4] = {static_cast<char>(0xF1), static_cast<char>(0xC0),
+ static_cast<char>(0x80),
+ static_cast<char>(0x80)}; // invalid second byte
+
+ LIBC_NAMESPACE::internal::CharacterConverter char_conv(&state);
+ int err = char_conv.push(static_cast<char8_t>(ch[0]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[1]));
+ ASSERT_EQ(err, -1);
+ err = char_conv.push(static_cast<char8_t>(ch[2]));
+ ASSERT_EQ(err, 0);
+ err = char_conv.push(static_cast<char8_t>(ch[3]));
+ ASSERT_EQ(err, 0);
+}
|
✅ With the latest revision this PR passed the C/C++ code formatter. |
Implemented push and pop for utf8 to 32 conversion and tests.
3e18ffb
to
9561ab5
Compare
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Looks mostly good to me!
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
for constants that are just simple numbers that can be known an compile time, it's better to use constexpr
instead of const
.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
pretty much done, just a couple style things
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
This generally looks good, though I have a number of detail-level comments....
LIBC_NAMESPACE::internal::mbstate state; | ||
state.bytes_processed = 0; | ||
state.total_bytes = 0; | ||
const char ch[2] = {static_cast<char>(0xC2), |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Dumb question: Are these static casts required? I would have expected them to be implicit.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It is unfortunately not :( it wouldn't build without the casting
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Minor nit, but I think this PR is basically done. After both this and the other half of the conversions land I think we should plan to have a cleanup patch to unify their implementations a bit (deduplicate shared constants, match on table vs bitshifts, etc.) but that can be done later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
LGTM, you can merge once the presubmits are done
LLVM Buildbot has detected a new failure on builder Full details are available at: https://lab.llvm.org/buildbot/#/builders/131/builds/24134 Here is the relevant piece of the build log for the reference
|
Reverts #143973 This merge broke the build and I'm currently looking into the issue to fix it.
Reverts llvm/llvm-project#143973 This merge broke the build and I'm currently looking into the issue to fix it.
Implemented push and pop for utf8 to 32 conversion and tests.