Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #236 from blemale/bastien.lemale/perf_do_not_allocate_computing_metadata_size
perf: Do not allocate when computing metadata size
- Loading branch information
Showing 3 changed files with 130 additions and 2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,96 @@ | ||
/* | ||
* Copyright (C) 2013 The Guava Authors | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except | ||
* in compliance with the License. You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software distributed under the License | ||
* is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express | ||
* or implied. See the License for the specific language governing permissions and limitations under | ||
* the License. | ||
*/ | ||
|
||
package com.timgroup.statsd; | ||
|
||
import static java.lang.Character.MAX_SURROGATE; | ||
import static java.lang.Character.MIN_SURROGATE; | ||
|
||
import java.nio.charset.StandardCharsets; | ||
|
||
/**
 * This class is a partial copy of the {@code com.google.common.base.Utf8}
 * <a href="https://github.com/google/guava/blob/v33.0.0/guava/src/com/google/common/base/Utf8.java">class</a>
 * from the Guava library.
 * It is copied here to avoid a dependency on Guava.
 *
 * <p>In this copy an unpaired surrogate does not raise an exception: it is counted as the
 * replacement byte sequence of the JDK UTF-8 encoder.
 */
final class Utf8 {

    /** Number of bytes the JDK UTF-8 encoder substitutes for a character it cannot encode. */
    private static final int UTF8_REPLACEMENT_LENGTH =
            StandardCharsets.UTF_8.newEncoder().replacement().length;

    private Utf8() {
    }

    /**
     * Returns the number of bytes in the UTF-8-encoded form of {@code sequence}. For a string, this
     * method is equivalent to {@code string.getBytes(UTF_8).length}, but is more efficient in both
     * time and space.
     *
     * <p>Ill-formed UTF-16 (an unpaired surrogate) is not rejected; each unpaired surrogate is
     * counted as {@code UTF8_REPLACEMENT_LENGTH} bytes.
     *
     * @param sequence the UTF-16 text to measure
     * @return the length in bytes of the UTF-8 encoding of {@code sequence}
     * @throws IllegalArgumentException if the encoded length does not fit in an {@code int}
     */
    public static int encodedLength(CharSequence sequence) {
        // Warning to maintainers: this implementation is highly optimized.
        int utf16Length = sequence.length();
        // Start from one byte per char; the loops below only add the extra bytes.
        int utf8Length = utf16Length;
        int index = 0;

        // This loop optimizes for pure ASCII.
        while (index < utf16Length && sequence.charAt(index) < 0x80) {
            index++;
        }

        // This loop optimizes for chars less than 0x800.
        for (; index < utf16Length; index++) {
            char character = sequence.charAt(index);
            if (character < 0x800) {
                // Adds 1 when character >= 0x80 (sign bit of the difference), otherwise 0.
                utf8Length += ((0x7f - character) >>> 31); // branch free!
            } else {
                // First char needing 3+ bytes: hand the rest of the sequence to the general loop.
                utf8Length += encodedLengthGeneral(sequence, index);
                break;
            }
        }

        if (utf8Length < utf16Length) {
            // Necessary and sufficient condition for overflow because of maximum 3x expansion
            throw new IllegalArgumentException(
                    "UTF-8 length does not fit in int: " + (utf8Length + (1L << 32)));
        }
        return utf8Length;
    }

    /**
     * Returns the number of bytes to add on top of one byte per char for the chars of
     * {@code sequence} from {@code start} to its end.
     *
     * @param sequence the UTF-16 text being measured
     * @param start index of the first char to examine
     * @return the extra UTF-8 bytes needed beyond one byte per examined char
     */
    private static int encodedLengthGeneral(CharSequence sequence, int start) {
        int utf16Length = sequence.length();
        int utf8Length = 0;
        for (int index = start; index < utf16Length; index++) {
            char character = sequence.charAt(index);
            if (character < 0x800) {
                utf8Length += (0x7f - character) >>> 31; // branch free!
            } else if (MIN_SURROGATE <= character && character <= MAX_SURROGATE) {
                // codePointAt returns the char itself when it is not the start of a valid pair.
                if (Character.codePointAt(sequence, index) == character) {
                    // Unpaired surrogate: one byte is already counted for this char, so only
                    // add what the replacement sequence needs beyond that.
                    utf8Length += UTF8_REPLACEMENT_LENGTH - 1;
                } else {
                    // Well-formed pair: 4 bytes for 2 chars. Add 2 here and skip the low
                    // surrogate so it contributes only its one base byte.
                    utf8Length += 2;
                    index++;
                }
            } else {
                // Any other char >= 0x800 encodes to 3 bytes.
                utf8Length += 2;
            }
        }
        return utf8Length;
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
package com.timgroup.statsd; | ||
|
||
import org.junit.Test; | ||
|
||
import java.nio.ByteBuffer; | ||
import java.nio.CharBuffer; | ||
import java.nio.charset.CharacterCodingException; | ||
import java.nio.charset.CharsetEncoder; | ||
import java.nio.charset.CodingErrorAction; | ||
import java.nio.charset.StandardCharsets; | ||
|
||
import static java.lang.Character.MIN_SURROGATE; | ||
import static org.hamcrest.MatcherAssert.assertThat; | ||
import static org.hamcrest.Matchers.equalTo; | ||
|
||
public class Utf8Test { | ||
|
||
@Test | ||
public void should_handle_malformed_inputs() throws CharacterCodingException { | ||
shouldHandleMalformedInput("foo" + MIN_SURROGATE + "bar"); | ||
shouldHandleMalformedInput("🍻☀️😎🏖️" + MIN_SURROGATE + "🍻☀️😎🏖️"); | ||
} | ||
|
||
private static void shouldHandleMalformedInput(String malformedInput) throws CharacterCodingException { | ||
CharsetEncoder utf8Encoder = StandardCharsets.UTF_8.newEncoder() | ||
.onMalformedInput(CodingErrorAction.REPLACE) | ||
.onUnmappableCharacter(CodingErrorAction.REPLACE); | ||
ByteBuffer encoded = utf8Encoder.encode(CharBuffer.wrap(malformedInput)); | ||
|
||
assertThat(Utf8.encodedLength(malformedInput), equalTo(encoded.limit())); | ||
} | ||
} |