Skip to content

Commit

Permalink
Handle masking of unicode keys (#141)
Browse files Browse the repository at this point in the history
* Handle masking of unicode keys

* Fix condition on ValueMaskers.withTextFunction

* Added invalid surrogate test into exclusions

* Actually switch back to not masking the key instead

* Move all unicode tests into java code to avoid jackson parsing all the unicode

* Polish

* Added comments for key matching, inverted if condition to go from 1 to 4 bytes
  • Loading branch information
gavlyukovskiy committed Jun 12, 2024
1 parent d3422b0 commit 899bacd
Show file tree
Hide file tree
Showing 5 changed files with 250 additions and 159 deletions.
77 changes: 70 additions & 7 deletions src/main/java/dev/blaauwendraad/masker/json/KeyMatcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import dev.blaauwendraad.masker.json.config.JsonMaskingConfig;
import dev.blaauwendraad.masker.json.config.KeyMaskingConfig;
import dev.blaauwendraad.masker.json.util.Utf8Util;
import org.jspecify.annotations.Nullable;

import java.nio.charset.StandardCharsets;
Expand Down Expand Up @@ -37,8 +38,6 @@ final class KeyMatcher {
private static final int SKIP_KEY_LOOKUP = -1;
private final JsonMaskingConfig maskingConfig;
private final TrieNode root;
// used for an optimization to remember all key length and return early if the key length is not known
private final boolean[] knownKeyLengthsInBytes = new boolean[256];

public KeyMatcher(JsonMaskingConfig maskingConfig) {
this.maskingConfig = maskingConfig;
Expand Down Expand Up @@ -70,7 +69,6 @@ public KeyMatcher(JsonMaskingConfig maskingConfig) {
private void insert(String word, boolean negativeMatch) {
boolean caseInsensitive = !maskingConfig.caseSensitiveTargetKeys();
byte[] bytes = word.getBytes(StandardCharsets.UTF_8);
knownKeyLengthsInBytes[Math.min(bytes.length, 255)] = true;
byte[] lowerBytes = null;
byte[] upperBytes = null;
if (caseInsensitive) {
Expand Down Expand Up @@ -178,14 +176,79 @@ KeyMaskingConfig getMaskConfigIfMatched(byte[] bytes, int keyOffset, int keyLeng

@Nullable
private TrieNode searchNode(byte[] bytes, int offset, int length) {
if (!knownKeyLengthsInBytes[Math.min(length, 255)]) {
return null;
}
TrieNode node = root;

for (int i = offset; i < offset + length; i++) {
byte b = bytes[i];
node = node.child(b);
// every character of the input key can be escaped as \\uXXXX, but the KeyMatcher trie stores the UTF-8 byte
// representation of the non-escaped key (e.g. 'key' -> [107, 101, 121]), so every escaped character
// has to be decoded and re-encoded as UTF-8 bytes before it can be matched against
// the trie.
// A single escape (6 bytes of the input) decodes to 1 to 3 UTF-8 bytes, and a surrogate pair (two escapes,
// 12 bytes of the input) decodes to 4 UTF-8 bytes; each of those bytes has to be matched against the trie to return a TrieNode
if (b == '\\' && bytes[i + 1] == 'u' && i <= offset + length - 6) {
char unicodeHexBytesAsChar = Utf8Util.unicodeHexToChar(bytes, i + 2);
i += 6;
if (unicodeHexBytesAsChar < 0x80) {
// < 128 (in decimal) fits in 7 bits which is 1 byte of data in UTF-8
node = node.child((byte) unicodeHexBytesAsChar); // check 1st byte
} else if (unicodeHexBytesAsChar < 0x800) { // 2048 in decimal,
// < 2048 (in decimal) fits in 11 bits which is 2 bytes of data in UTF-8
node = node.child((byte) (0xc0 | (unicodeHexBytesAsChar >> 6))); // check 1st byte
if (node == null) {
return null;
}
node = node.child((byte) (0x80 | (unicodeHexBytesAsChar & 0x3f))); // check 2nd byte
} else if (!Character.isSurrogate(unicodeHexBytesAsChar)) {
// dealing with characters with values between 2048 and 65536 which
// equals to 2^16 or 16 bits, which is 3 bytes of data in UTF-8 encoding
node = node.child((byte) (0xe0 | (unicodeHexBytesAsChar >> 12))); // check 1st byte
if (node == null) {
return null;
}
node = node.child((byte) (0x80 | ((unicodeHexBytesAsChar >> 6) & 0x3f))); // check 2nd byte
if (node == null) {
return null;
}
node = node.child((byte) (0x80 | (unicodeHexBytesAsChar & 0x3f))); // check 3rd byte
} else {
// decoding non-BMP characters in UTF-16 using a pair of high and low
// surrogates which together form one unicode character.
int codePoint = -1;
if (Character.isHighSurrogate(unicodeHexBytesAsChar) // first surrogate must be the high surrogate
&& i <= offset + length - 6 /* -6 for all bytes of the byte encoded unicode character (\\u + 4 hex bytes) to prevent possible ArrayIndexOutOfBoundsExceptions */
&& bytes[i] == '\\' // the high surrogate must be followed by a low surrogate (starting with \\u)
&& bytes[i + 1] == 'u'
) {
char lowSurrogate = Utf8Util.unicodeHexToChar(bytes, i + 2);
if (Character.isLowSurrogate(lowSurrogate)) {
codePoint = Character.toCodePoint(unicodeHexBytesAsChar, lowSurrogate);
}
}
if (codePoint < 0) {
// the key contains invalid surrogate pair and won't be matched
return null;
} else {
node = node.child((byte) (0xf0 | (codePoint >> 18))); // check 1st byte
if (node == null) {
return null;
}
node = node.child((byte) (0x80 | ((codePoint >> 12) & 0x3f))); // check 2nd byte
if (node == null) {
return null;
}
node = node.child((byte) (0x80 | ((codePoint >> 6) & 0x3f))); // check 3rd byte
if (node == null) {
return null;
}
node = node.child((byte) (0x80 | (codePoint & 0x3f))); // check 4th byte
}
i += 6;
}
i--; // to offset loop increment
} else {
node = node.child(b);
}

if (node == null) {
return null;
Expand Down
18 changes: 9 additions & 9 deletions src/main/java/dev/blaauwendraad/masker/json/ValueMaskers.java
Original file line number Diff line number Diff line change
Expand Up @@ -389,12 +389,18 @@ public static ValueMasker.AnyValueMasker withTextFunction(Function<String, @Null
// < 2048 (in decimal) fits in 11 bits which is 2 bytes of data in UTF-8
decodedBytes[decodedIndex++] = (byte) (0xc0 | (unicodeHexBytesAsChar >> 6));
decodedBytes[decodedIndex++] = (byte) (0x80 | (unicodeHexBytesAsChar & 0x3f));
} else if (Character.isSurrogate(unicodeHexBytesAsChar)) {
} else if (!Character.isSurrogate(unicodeHexBytesAsChar)) {
// dealing with characters with values between 2048 and 65536 which
// equals to 2^16 or 16 bits, which is 3 bytes of data in UTF-8 encoding
decodedBytes[decodedIndex++] = (byte) (0xe0 | (unicodeHexBytesAsChar >> 12));
decodedBytes[decodedIndex++] = (byte) (0x80 | ((unicodeHexBytesAsChar >> 6) & 0x3f));
decodedBytes[decodedIndex++] = (byte) (0x80 | (unicodeHexBytesAsChar & 0x3f));
} else {
// decoding non-BMP characters in UTF-16 using a pair of high and low
// surrogates which together form one unicode character.
int codePoint = -1;
if (Character.isHighSurrogate(unicodeHexBytesAsChar) // first surrogate must be the high surrogate
&& encodedIndex < context.byteLength() - 6 /* -6 for all bytes of
&& encodedIndex <= context.byteLength() - 6 /* -6 for all bytes of
the byte encoded unicode character (\\u + 4 hex bytes) to prevent possible ArrayIndexOutOfBoundsExceptions */
&& context.getByte(encodedIndex) == '\\' // the high surrogate must be followed by a low surrogate (starting with \\u)
&& context.getByte(encodedIndex + 1) == 'u'
Expand All @@ -412,7 +418,7 @@ the byte encoded unicode character (\\u + 4 hex bytes) to prevent possible Array
}
if (codePoint < 0) {
// default String behaviour is to replace invalid surrogate pairs
// with the character '?', but from the JSON perspective,
// with the character '', but from the JSON perspective,
// it's better to throw an InvalidJsonException
throw context.invalidJson("Invalid surrogate pair '%s'"
.formatted(context.asString(valueStartIndex, encodedIndex - valueStartIndex)), valueStartIndex);
Expand All @@ -422,12 +428,6 @@ the byte encoded unicode character (\\u + 4 hex bytes) to prevent possible Array
decodedBytes[decodedIndex++] = (byte) (0x80 | ((codePoint >> 6) & 0x3f));
decodedBytes[decodedIndex++] = (byte) (0x80 | (codePoint & 0x3f));
}
} else {
// dealing with characters with values between 2048 and 65536 which
// equals to 2^16 or 16 bits, which is 3 bytes of data in UTF-8 encoding
decodedBytes[decodedIndex++] = (byte) (0xe0 | (unicodeHexBytesAsChar >> 12));
decodedBytes[decodedIndex++] = (byte) (0x80 | ((unicodeHexBytesAsChar >> 6) & 0x3f));
decodedBytes[decodedIndex++] = (byte) (0x80 | (unicodeHexBytesAsChar & 0x3f));
}
} catch (IllegalArgumentException | IndexOutOfBoundsException e) {
throw context.invalidJson(Objects.requireNonNull(e.getMessage()), valueStartIndex);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,10 @@ public static char unicodeHexToChar(byte b1, byte b2, byte b3, byte b4) {
return (char) value;
}

/**
 * Convenience overload that reads four consecutive bytes from {@code array}, starting at
 * {@code offset}, and delegates to {@link #unicodeHexToChar(byte, byte, byte, byte)}.
 *
 * @param array  buffer holding the four bytes to decode
 * @param offset index of the first of the four bytes
 * @return the char produced by the four-byte overload
 * @throws ArrayIndexOutOfBoundsException if fewer than four bytes are available from {@code offset}
 */
public static char unicodeHexToChar(byte[] array, int offset) {
    // Read the bytes left to right so an out-of-range index fails on the first missing byte.
    final byte first = array[offset];
    final byte second = array[offset + 1];
    final byte third = array[offset + 2];
    final byte fourth = array[offset + 3];
    return unicodeHexToChar(first, second, third, fourth);
}

private static byte validateHex(byte hexByte) {
if (hexByte >= 48 && hexByte <= 57) {
return hexByte; // a digit from 0 to 9
Expand Down
171 changes: 167 additions & 4 deletions src/test/java/dev/blaauwendraad/masker/json/UnicodeCharacterTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,182 @@
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.MethodSource;

import java.io.IOException;
import java.util.Set;
import java.util.stream.Stream;

import static org.assertj.core.api.Assertions.assertThat;

class UnicodeCharacterTest {

/**
 * Masks the input of each supplied test instance with that instance's own masker and
 * checks that the result equals the instance's expected output.
 */
@ParameterizedTest
@MethodSource("unicodeCharacterFile")
@MethodSource("unicodeCharacters")
void unicodeCharacter(JsonMaskerTestInstance testInstance) {
// the masker, input and expected output all come from the test instance itself
assertThat(testInstance.jsonMasker().mask(testInstance.input())).isEqualTo(testInstance.expectedOutput());
}

private static Stream<JsonMaskerTestInstance> unicodeCharacterFile() throws IOException {
return JsonMaskerTestUtil.getJsonMaskerTestInstancesFromFile("test-unicode-characters.json").stream();
private static Stream<JsonMaskerTestInstance> unicodeCharacters() {
return Stream.of(
new JsonMaskerTestInstance("""
{
"targetKey2": {
"targetKey3": {}
},
"kh”b\\u0007 ": true,
"\\u001C\\u000F": true,
"=E\\u0018Xi=": {
":": "\\u000F\\u0017\\u0017\\u000Bs\\b\\u0014Xƒ",
"targetKey2": [],
"targetKey4": "kA—=Đ-"
}
}
""",
"""
{
"targetKey2": {
"targetKey3": {}
},
"kh”b\\u0007 ": true,
"\\u001C\\u000F": true,
"=E\\u0018Xi=": {
":": "\\u000F\\u0017\\u0017\\u000Bs\\b\\u0014Xƒ",
"targetKey2": [],
"targetKey4": "kA—=Đ-"
}
}
""",
JsonMasker.getMasker(Set.of("targetKey1", "targetKey2"))),
new JsonMaskerTestInstance("""
{
"someKey": "\\u2020",
"otherKey": null
}
""",
"""
{
"someKey": "***",
"otherKey": null
}
""",
JsonMasker.getMasker(Set.of("someKey"))),
new JsonMaskerTestInstance("""
{
"someKey": "a\\u2020b",
"otherKey": null
}
""",
"""
{
"someKey": "***",
"otherKey": null
}
""",
JsonMasker.getMasker(Set.of("someKey"))),
new JsonMaskerTestInstance("""
{
"someKey": "a\\\\\\u2020b"
}
""",
"""
{
"someKey": "***"
}
""",
JsonMasker.getMasker(Set.of("someKey"))),
new JsonMaskerTestInstance("""
{
"someKey": [
{
"someKey": "\\u0003\\u0015",
"otherKey": null
}
]
}
""",
"""
{
"someKey": [
{
"someKey": "***",
"otherKey": null
}
]
}
""",
JsonMasker.getMasker(Set.of("someKey"))),
new JsonMaskerTestInstance("""
{
"someKey": "\\u0014…"
}
""",
"""
{
"someKey": "***"
}
""",
JsonMasker.getMasker(Set.of("someKey"))),
new JsonMaskerTestInstance("""
{
"someKey": "\\u0014\\u0085"
}
""",
"""
{
"someKey": "***"
}
""",
JsonMasker.getMasker(Set.of("someKey"))),
new JsonMaskerTestInstance("""
{
"someKey": "\\u0085"
}
""",
"""
{
"someKey": "***"
}
""",
JsonMasker.getMasker(Set.of("someKey"))),
new JsonMaskerTestInstance("""
{
"maskMe": "secret",
"̀": "secret",
"€": "secret",
"†": "secret",
"䀀": "secret",
"𐍈": "secret",
"💩": "secret",
"encoded": {
"\\u006D\\u0061\\u0073\\u006B\\u004D\\u0065": "secret",
"\\u0300": "secret",
"\\u20AC": "secret",
"\\u2020": "secret",
"\\u4000": "secret",
"\\uD800\\uDF48": "secret",
"\\uD83D\\uDCA9": "secret"
}
}
""", """
{
"maskMe": "***",
"̀": "***",
"€": "***",
"†": "***",
"䀀": "***",
"𐍈": "***",
"💩": "***",
"encoded": {
"\\u006D\\u0061\\u0073\\u006B\\u004D\\u0065": "***",
"\\u0300": "***",
"\\u20AC": "***",
"\\u2020": "***",
"\\u4000": "***",
"\\uD800\\uDF48": "***",
"\\uD83D\\uDCA9": "***"
}
}
""",
JsonMasker.getMasker(Set.of("maskMe", "̀", "�", "€", "†", "䀀", "𐍈", "💩"))
)
);
}
}
Loading

0 comments on commit 899bacd

Please sign in to comment.