diff --git a/src/main/java/com/timgroup/statsd/NonBlockingDirectStatsDClient.java b/src/main/java/com/timgroup/statsd/NonBlockingDirectStatsDClient.java index eec0c48b..67a926d2 100644 --- a/src/main/java/com/timgroup/statsd/NonBlockingDirectStatsDClient.java +++ b/src/main/java/com/timgroup/statsd/NonBlockingDirectStatsDClient.java @@ -64,10 +64,10 @@ public final boolean writeTo(StringBuilder builder, int capacity, String contain private int metadataSize(StringBuilder builder, String containerID) { if (metadataSize == -1) { final int previousLength = builder.length(); - final int previousEncodedLength = builder.toString().getBytes(UTF_8).length; + final int previousEncodedLength = Utf8.encodedLength(builder); writeHeadMetadata(builder); writeTailMetadata(builder, containerID); - metadataSize = builder.toString().getBytes(UTF_8).length - previousEncodedLength; + metadataSize = Utf8.encodedLength(builder) - previousEncodedLength; builder.setLength(previousLength); } return metadataSize; diff --git a/src/main/java/com/timgroup/statsd/Utf8.java b/src/main/java/com/timgroup/statsd/Utf8.java new file mode 100644 index 00000000..5c2b91c0 --- /dev/null +++ b/src/main/java/com/timgroup/statsd/Utf8.java @@ -0,0 +1,96 @@ +/* + * Copyright (C) 2013 The Guava Authors + * + * Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except + * in compliance with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. 
+ */ + +package com.timgroup.statsd; + +import static java.lang.Character.MAX_SURROGATE; +import static java.lang.Character.MIN_SURROGATE; + +import java.nio.charset.StandardCharsets; + +/** + * This class is a partial copy of the {@code com.google.common.base.Utf8} class from the + * Guava library, copied here to avoid a dependency on Guava. + * In this copy an unpaired surrogate is sized as the UTF-8 encoder's replacement bytes, + * so {@link #encodedLength} accepts ill-formed UTF-16 input. + */ +final class Utf8 { + + private static final int UTF8_REPLACEMENT_LENGTH = StandardCharsets.UTF_8.newEncoder().replacement().length; + + private Utf8() { + } + + /** + * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this + * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both + * time and space. + * + * @throws IllegalArgumentException if the UTF-8 length does not fit in an {@code int}; an + * unpaired surrogate does not throw and is counted as the encoder's replacement bytes + */ + public static int encodedLength(CharSequence sequence) { + // Warning to maintainers: this implementation is highly optimized. + int utf16Length = sequence.length(); + int utf8Length = utf16Length; + int index = 0; + + // This loop optimizes for pure ASCII. + while (index < utf16Length && sequence.charAt(index) < 0x80) { + index++; + } + + // This loop optimizes for chars less than 0x800. + for (; index < utf16Length; index++) { + char character = sequence.charAt(index); + if (character < 0x800) { + utf8Length += ((0x7f - character) >>> 31); // branch free! 
+ } else { + utf8Length += encodedLengthGeneral(sequence, index); + break; + } + } + + if (utf8Length < utf16Length) { + // Necessary and sufficient condition for overflow because of maximum 3x expansion + throw new IllegalArgumentException( + "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32))); + } + return utf8Length; + } + + private static int encodedLengthGeneral(CharSequence sequence, int start) { + int utf16Length = sequence.length(); + int utf8Length = 0; + for (int index = start; index < utf16Length; index++) { + char character = sequence.charAt(index); + if (character < 0x800) { + utf8Length += (0x7f - character) >>> 31; // branch free! + } else { + utf8Length += 2; + // jdk7+: if (Character.isSurrogate(character)) { + if (MIN_SURROGATE <= character && character <= MAX_SURROGATE) { + // Check that we have a well-formed surrogate pair. + if (Character.codePointAt(sequence, index) == character) { + // Unpaired surrogate: cancel the +2 above and the 1 byte pre-counted per char, then add the replacement length + utf8Length += -2 + UTF8_REPLACEMENT_LENGTH - 1; + } else { + index++; + } + } + } + } + return utf8Length; + } +} diff --git a/src/test/java/com/timgroup/statsd/Utf8Test.java b/src/test/java/com/timgroup/statsd/Utf8Test.java new file mode 100644 index 00000000..1a215301 --- /dev/null +++ b/src/test/java/com/timgroup/statsd/Utf8Test.java @@ -0,0 +1,32 @@ +package com.timgroup.statsd; + +import org.junit.Test; + +import java.nio.ByteBuffer; +import java.nio.CharBuffer; +import java.nio.charset.CharacterCodingException; +import java.nio.charset.CharsetEncoder; +import java.nio.charset.CodingErrorAction; +import java.nio.charset.StandardCharsets; + +import static java.lang.Character.MIN_SURROGATE; +import static org.hamcrest.MatcherAssert.assertThat; +import static org.hamcrest.Matchers.equalTo; + +public class Utf8Test { + + @Test + public void should_handle_malformed_inputs() throws CharacterCodingException { + shouldHandleMalformedInput("foo" + MIN_SURROGATE + "bar"); 
+ shouldHandleMalformedInput("🍻☀️😎🏖️" + MIN_SURROGATE + "🍻☀️😎🏖️"); + } + + private static void shouldHandleMalformedInput(String malformedInput) throws CharacterCodingException { + CharsetEncoder utf8Encoder = StandardCharsets.UTF_8.newEncoder() + .onMalformedInput(CodingErrorAction.REPLACE) + .onUnmappableCharacter(CodingErrorAction.REPLACE); + ByteBuffer encoded = utf8Encoder.encode(CharBuffer.wrap(malformedInput)); + + assertThat(Utf8.encodedLength(malformedInput), equalTo(encoded.limit())); + } +}