Handle masking of unicode keys (#141)

* Handle masking of unicode keys
* Fix condition on ValueMaskers.withTextFunction
* Added invalid surrogate test into exclusions
* Actually switch back to not masking the key instead
* Move all unicode tests into java code to avoid jackson parsing all the unicode
* Polish
* Added comments for key matching, inverted if condition to go from 1 to 4 bytes
Breus · Jun 12, 2024 · 899bacd · 899bacd
1 parent d3422b0
commit 899bacd
Show file tree

Hide file tree

Showing 5 changed files with 250 additions and 159 deletions.
diff --git a/src/main/java/dev/blaauwendraad/masker/json/KeyMatcher.java b/src/main/java/dev/blaauwendraad/masker/json/KeyMatcher.java
@@ -2,6 +2,7 @@
 
 import dev.blaauwendraad.masker.json.config.JsonMaskingConfig;
 import dev.blaauwendraad.masker.json.config.KeyMaskingConfig;
+import dev.blaauwendraad.masker.json.util.Utf8Util;
 import org.jspecify.annotations.Nullable;
 
 import java.nio.charset.StandardCharsets;
@@ -37,8 +38,6 @@ final class KeyMatcher {
     private static final int SKIP_KEY_LOOKUP = -1;
     private final JsonMaskingConfig maskingConfig;
     private final TrieNode root;
-    // used for an optimization to remember all key length and return early if the key length is not known
-    private final boolean[] knownKeyLengthsInBytes = new boolean[256];
 
     public KeyMatcher(JsonMaskingConfig maskingConfig) {
         this.maskingConfig = maskingConfig;
@@ -70,7 +69,6 @@ public KeyMatcher(JsonMaskingConfig maskingConfig) {
     private void insert(String word, boolean negativeMatch) {
         boolean caseInsensitive = !maskingConfig.caseSensitiveTargetKeys();
         byte[] bytes = word.getBytes(StandardCharsets.UTF_8);
-        knownKeyLengthsInBytes[Math.min(bytes.length, 255)] = true;
         byte[] lowerBytes = null;
         byte[] upperBytes = null;
         if (caseInsensitive) {
@@ -178,14 +176,79 @@ KeyMaskingConfig getMaskConfigIfMatched(byte[] bytes, int keyOffset, int keyLeng
 
     @Nullable
     private TrieNode searchNode(byte[] bytes, int offset, int length) {
-        if (!knownKeyLengthsInBytes[Math.min(length, 255)]) {
-            return null;
-        }
         TrieNode node = root;
 
         for (int i = offset; i < offset + length; i++) {
             byte b = bytes[i];
-            node = node.child(b);
+            // every character of the input key can be escaped as \\uXXXX, but since the KeyMatcher stores the byte
+            // representation of the non-escaped characters of the key (e.g. 'key' -> [107, 101, 121]) in UTF-8,
+            // we need to transform each escaped character into its UTF-8 bytes before matching them against
+            // the trie.
+            // A single escaped character (6 bytes of the input) represents 1 to 3 bytes of the unescaped key;
+            // a surrogate pair (two escapes, 12 bytes of the input) represents 4 bytes.
+            // Each of those bytes has to be matched against the trie to return a TrieNode
+            if (b == '\\' && bytes[i + 1] == 'u' && i <= offset + length - 6) {
+                char unicodeHexBytesAsChar = Utf8Util.unicodeHexToChar(bytes, i + 2);
+                i += 6;
+                if (unicodeHexBytesAsChar < 0x80) {
+                    // < 128 (in decimal) fits in 7 bits which is 1 byte of data in UTF-8
+                    node = node.child((byte) unicodeHexBytesAsChar); // check 1st byte
+                } else if (unicodeHexBytesAsChar < 0x800) { // 2048 in decimal,
+                    // < 2048 (in decimal) fits in 11 bits which is 2 bytes of data in UTF-8
+                    node = node.child((byte) (0xc0 | (unicodeHexBytesAsChar >> 6))); // check 1st byte
+                    if (node == null) {
+                        return null;
+                    }
+                    node = node.child((byte) (0x80 | (unicodeHexBytesAsChar & 0x3f))); // check 2nd byte
+                } else if (!Character.isSurrogate(unicodeHexBytesAsChar)) {
+                    // dealing with characters with values between 2048 and 65536 which
+                    // equals to 2^16 or 16 bits, which is 3 bytes of data in UTF-8 encoding
+                    node = node.child((byte) (0xe0 | (unicodeHexBytesAsChar >> 12))); // check 1st byte
+                    if (node == null) {
+                        return null;
+                    }
+                    node = node.child((byte) (0x80 | ((unicodeHexBytesAsChar >> 6) & 0x3f))); // check 2nd byte
+                    if (node == null) {
+                        return null;
+                    }
+                    node = node.child((byte) (0x80 | (unicodeHexBytesAsChar & 0x3f))); // check 3rd byte
+                } else {
+                    // decoding non-BMP characters in UTF-16 using a pair of high and low
+                    // surrogates which together form one unicode character.
+                    int codePoint = -1;
+                    if (Character.isHighSurrogate(unicodeHexBytesAsChar) // first surrogate must be the high surrogate
+                        && i <= offset + length - 6 /* -6 for all bytes of the byte encoded unicode character (\\u + 4 hex bytes) to prevent possible ArrayIndexOutOfBoundsExceptions */
+                        && bytes[i] == '\\' // the high surrogate must be followed by a low surrogate (starting with \\u)
+                        && bytes[i + 1] == 'u'
+                    ) {
+                        char lowSurrogate = Utf8Util.unicodeHexToChar(bytes, i + 2);
+                        if (Character.isLowSurrogate(lowSurrogate)) {
+                            codePoint = Character.toCodePoint(unicodeHexBytesAsChar, lowSurrogate);
+                        }
+                    }
+                    if (codePoint < 0) {
+                        // the key contains invalid surrogate pair and won't be matched
+                        return null;
+                    } else {
+                        node = node.child((byte) (0xf0 | (codePoint >> 18))); // check 1st byte
+                        if (node == null) {
+                            return null;
+                        }
+                        node = node.child((byte) (0x80 | ((codePoint >> 12) & 0x3f))); // check 2nd byte
+                        if (node == null) {
+                            return null;
+                        }
+                        node = node.child((byte) (0x80 | ((codePoint >> 6) & 0x3f))); // check 3rd byte
+                        if (node == null) {
+                            return null;
+                        }
+                        node = node.child((byte) (0x80 | (codePoint & 0x3f))); // check 4th byte
+                    }
+                    i += 6;
+                }
+                i--; // to offset loop increment
+            } else {
+                node = node.child(b);
+            }
 
             if (node == null) {
                 return null;

diff --git a/src/main/java/dev/blaauwendraad/masker/json/ValueMaskers.java b/src/main/java/dev/blaauwendraad/masker/json/ValueMaskers.java
@@ -389,12 +389,18 @@ public static ValueMasker.AnyValueMasker withTextFunction(Function<String, @Null
                                                 // < 2048 (in decimal) fits in 11 bits which is 2 bytes of data in UTF-8
                                                 decodedBytes[decodedIndex++] = (byte) (0xc0 | (unicodeHexBytesAsChar >> 6));
                                                 decodedBytes[decodedIndex++] = (byte) (0x80 | (unicodeHexBytesAsChar & 0x3f));
-                                            } else if (Character.isSurrogate(unicodeHexBytesAsChar)) {
+                                            } else if (!Character.isSurrogate(unicodeHexBytesAsChar)) {
+                                                // dealing with characters with values between 2048 and 65536 which
+                                                // equals to 2^16 or 16 bits, which is 3 bytes of data in UTF-8 encoding
+                                                decodedBytes[decodedIndex++] = (byte) (0xe0 | (unicodeHexBytesAsChar >> 12));
+                                                decodedBytes[decodedIndex++] = (byte) (0x80 | ((unicodeHexBytesAsChar >> 6) & 0x3f));
+                                                decodedBytes[decodedIndex++] = (byte) (0x80 | (unicodeHexBytesAsChar & 0x3f));
+                                            } else {
                                                 // decoding non-BMP characters in UTF-16 using a pair of high and low
                                                 // surrogates which together form one unicode character.
                                                 int codePoint = -1;
                                                 if (Character.isHighSurrogate(unicodeHexBytesAsChar) // first surrogate must be the high surrogate
-                                                        && encodedIndex < context.byteLength() - 6 /* -6 for all bytes of
+                                                        && encodedIndex <= context.byteLength() - 6 /* -6 for all bytes of
                                                        the byte encoded unicode character (\\u + 4 hex bytes) to prevent possible ArrayIndexOutOfBoundsExceptions */
                                                         && context.getByte(encodedIndex) == '\\' // the high surrogate must be followed by a low surrogate (starting with \\u)
                                                         && context.getByte(encodedIndex + 1) == 'u'
@@ -412,7 +418,7 @@ the byte encoded unicode character (\\u + 4 hex bytes) to prevent possible Array
                                                 }
                                                 if (codePoint < 0) {
                                                     // default String behaviour is to replace invalid surrogate pairs
-                                                    // with the character '?', but from the JSON perspective,
+                                                    // with the character '�', but from the JSON perspective,
                                                     // it's better to throw an InvalidJsonException
                                                     throw context.invalidJson("Invalid surrogate pair '%s'"
                                                             .formatted(context.asString(valueStartIndex, encodedIndex - valueStartIndex)), valueStartIndex);
@@ -422,12 +428,6 @@ the byte encoded unicode character (\\u + 4 hex bytes) to prevent possible Array
                                                     decodedBytes[decodedIndex++] = (byte) (0x80 | ((codePoint >> 6) & 0x3f));
                                                     decodedBytes[decodedIndex++] = (byte) (0x80 | (codePoint & 0x3f));
                                                 }
-                                            } else {
-                                                // dealing with characters with values between 2048 and 65536 which
-                                                // equals to 2^16 or 16 bits, which is 3 bytes of data in UTF-8 encoding
-                                                decodedBytes[decodedIndex++] = (byte) (0xe0 | (unicodeHexBytesAsChar >> 12));
-                                                decodedBytes[decodedIndex++] = (byte) (0x80 | ((unicodeHexBytesAsChar >> 6) & 0x3f));
-                                                decodedBytes[decodedIndex++] = (byte) (0x80 | (unicodeHexBytesAsChar & 0x3f));
                                             }
                                         } catch (IllegalArgumentException | IndexOutOfBoundsException e) {
                                             throw context.invalidJson(Objects.requireNonNull(e.getMessage()), valueStartIndex);

diff --git a/src/main/java/dev/blaauwendraad/masker/json/util/Utf8Util.java b/src/main/java/dev/blaauwendraad/masker/json/util/Utf8Util.java
@@ -50,6 +50,10 @@ public static char unicodeHexToChar(byte b1, byte b2, byte b3, byte b4) {
         return (char) value;
     }
 
+    public static char unicodeHexToChar(byte[] array, int offset) {
+        return unicodeHexToChar(array[offset], array[offset + 1], array[offset + 2], array[offset + 3]);
+    }
+
     private static byte validateHex(byte hexByte) {
         if (hexByte >= 48 && hexByte <= 57) {
             return hexByte; // a digit from 0 to 9

diff --git a/src/test/java/dev/blaauwendraad/masker/json/UnicodeCharacterTest.java b/src/test/java/dev/blaauwendraad/masker/json/UnicodeCharacterTest.java
@@ -3,19 +3,182 @@
 import org.junit.jupiter.params.ParameterizedTest;
 import org.junit.jupiter.params.provider.MethodSource;
 
-import java.io.IOException;
+import java.util.Set;
 import java.util.stream.Stream;
 
 import static org.assertj.core.api.Assertions.assertThat;
 
 class UnicodeCharacterTest {
+
     @ParameterizedTest
-    @MethodSource("unicodeCharacterFile")
+    @MethodSource("unicodeCharacters")
     void unicodeCharacter(JsonMaskerTestInstance testInstance) {
         assertThat(testInstance.jsonMasker().mask(testInstance.input())).isEqualTo(testInstance.expectedOutput());
     }
 
-    private static Stream<JsonMaskerTestInstance> unicodeCharacterFile() throws IOException {
-        return JsonMaskerTestUtil.getJsonMaskerTestInstancesFromFile("test-unicode-characters.json").stream();
+    private static Stream<JsonMaskerTestInstance> unicodeCharacters() {
+        return Stream.of(
+                new JsonMaskerTestInstance("""
+                       {
+                         "targetKey2": {
+                           "targetKey3": {}
+                         },
+                         "khb\\u0007 ": true,
+                         "\\u001C\\u000F": true,
+                         "=E\\u0018Xi=": {
+                           ":": "\\u000F\\u0017\\u0017\\u000Bs\\b\\u0014X",
+                           "targetKey2": [],
+                           "targetKey4": "kA=Đ-"
+                         }
+                       }
+                       """,
+                        """
+                       {
+                         "targetKey2": {
+                           "targetKey3": {}
+                         },
+                         "khb\\u0007 ": true,
+                         "\\u001C\\u000F": true,
+                         "=E\\u0018Xi=": {
+                           ":": "\\u000F\\u0017\\u0017\\u000Bs\\b\\u0014X",
+                           "targetKey2": [],
+                           "targetKey4": "kA=Đ-"
+                         }
+                       }
+                       """,
+                        JsonMasker.getMasker(Set.of("targetKey1", "targetKey2"))),
+                new JsonMaskerTestInstance("""
+                       {
+                         "someKey": "\\u2020",
+                         "otherKey": null
+                       }
+                       """,
+                        """
+                       {
+                         "someKey": "***",
+                         "otherKey": null
+                       }
+                       """,
+                        JsonMasker.getMasker(Set.of("someKey"))),
+                new JsonMaskerTestInstance("""
+                       {
+                         "someKey": "a\\u2020b",
+                         "otherKey": null
+                       }
+                       """,
+                        """
+                       {
+                         "someKey": "***",
+                         "otherKey": null
+                       }
+                       """,
+                        JsonMasker.getMasker(Set.of("someKey"))),
+                new JsonMaskerTestInstance("""
+                       {
+                         "someKey": "a\\\\\\u2020b"
+                       }
+                       """,
+                        """
+                       {
+                         "someKey": "***"
+                       }
+                       """,
+                        JsonMasker.getMasker(Set.of("someKey"))),
+                new JsonMaskerTestInstance("""
+                       {
+                         "someKey": [
+                           {
+                             "someKey": "\\u0003\\u0015",
+                             "otherKey": null
+                           }
+                         ]
+                       }
+                       """,
+                        """
+                       {
+                         "someKey": [
+                           {
+                             "someKey": "***",
+                             "otherKey": null
+                           }
+                         ]
+                       }
+                       """,
+                        JsonMasker.getMasker(Set.of("someKey"))),
+                new JsonMaskerTestInstance("""
+                       {
+                         "someKey": "\\u0014"
+                       }
+                       """,
+                        """
+                       {
+                         "someKey": "***"
+                       }
+                       """,
+                        JsonMasker.getMasker(Set.of("someKey"))),
+                new JsonMaskerTestInstance("""
+                       {
+                         "someKey": "\\u0014\\u0085"
+                       }
+                       """,
+                        """
+                       {
+                         "someKey": "***"
+                       }
+                       """,
+                        JsonMasker.getMasker(Set.of("someKey"))),
+                new JsonMaskerTestInstance("""
+                       {
+                         "someKey": "\\u0085"
+                       }
+                       """,
+                        """
+                       {
+                         "someKey": "***"
+                       }
+                       """,
+                        JsonMasker.getMasker(Set.of("someKey"))),
+                new JsonMaskerTestInstance("""
+                        {
+                          "maskMe": "secret",
+                          "̀": "secret",
+                          "€": "secret",
+                          "†": "secret",
+                          "䀀": "secret",
+                          "𐍈": "secret",
+                          "💩": "secret",
+                          "encoded": {
+                            "\\u006D\\u0061\\u0073\\u006B\\u004D\\u0065": "secret",
+                            "\\u0300": "secret",
+                            "\\u20AC": "secret",
+                            "\\u2020": "secret",
+                            "\\u4000": "secret",
+                            "\\uD800\\uDF48": "secret",
+                            "\\uD83D\\uDCA9": "secret"
+                          }
+                        }
+                        """, """
+                        {
+                          "maskMe": "***",
+                          "̀": "***",
+                          "€": "***",
+                          "†": "***",
+                          "䀀": "***",
+                          "𐍈": "***",
+                          "💩": "***",
+                          "encoded": {
+                            "\\u006D\\u0061\\u0073\\u006B\\u004D\\u0065": "***",
+                            "\\u0300": "***",
+                            "\\u20AC": "***",
+                            "\\u2020": "***",
+                            "\\u4000": "***",
+                            "\\uD800\\uDF48": "***",
+                            "\\uD83D\\uDCA9": "***"
+                          }
+                        }
+                        """,
+                        JsonMasker.getMasker(Set.of("maskMe", "̀", "�", "€", "†", "䀀", "𐍈", "💩"))
+                )
+        );
     }
 }