Dealing with a lot of problems escaping characters in PythonRegex. See Issue #19.
Aunsiels · Jan 25, 2024 · 4dd02ef · 4dd02ef · github-actions · Jan 25, 2024
1 parent 81d69bd
commit 4dd02ef
Show file tree

Hide file tree

Showing 4 changed files with 141 additions and 7 deletions.
diff --git a/pyformlang/regular_expression/python_regex.py b/pyformlang/regular_expression/python_regex.py
@@ -4,6 +4,7 @@
 
 import re
 import string
+import unicodedata
 
 # pylint: disable=cyclic-import
 from pyformlang.regular_expression import regex, MisformedRegexError
@@ -25,6 +26,14 @@
     '\\': '\\\\'
 }
 
+# Escaped tokens (a backslash followed by a letter) mapped to the control
+# character they are replaced with once the regex has been tokenized.
+RECOMBINE = {
+    "\\" + letter: control
+    for letter, control in (
+        ("b", "\b"),
+        ("n", "\n"),
+        ("r", "\r"),
+        ("t", "\t"),
+        ("f", "\f"),
+    )
+}
+
 ESCAPED_PRINTABLES = [TRANSFORMATIONS.get(x, x)
                       for x in PRINTABLES
                       if TRANSFORMATIONS.get(x, x)]
@@ -40,6 +49,9 @@
     r"\w": "[a-zA-Z0-9_]"
 }
 
+# Characters accepted as hexadecimal digits after an escape such as \x.
+# Python's re module accepts both cases, so lower-case digits are included.
+HEXASTRING = "0123456789ABCDEFabcdef"
+# Characters accepted as octal digits
+OCTAL = "01234567"
+# Tokens made of a backslash followed by one octal digit
+ESCAPED_OCTAL = ["\\" + digit for digit in OCTAL]
 
 class PythonRegex(regex.Regex):
     """ Represents a regular expression as used in Python.
@@ -87,6 +99,7 @@ def __init__(self, python_regex):
             python_regex = python_regex.pattern
         else:
             re.compile(python_regex)  # Check if it is valid
+
         self._python_regex = python_regex
         self._replace_shortcuts()
         self._escape_in_brackets()
@@ -95,6 +108,7 @@ def __init__(self, python_regex):
         self._preprocess_optional()
         self._preprocess_dot()
         self._separate()
+        self._python_regex = self._python_regex.lstrip('\b')
         super().__init__(self._python_regex)
 
     def _separate(self):
@@ -104,6 +118,7 @@ def _separate(self):
                 regex_temp[-1] += symbol
             else:
                 regex_temp.append(symbol)
+        regex_temp = self._recombine(regex_temp)
         self._python_regex = " ".join(regex_temp)
 
     def _preprocess_brackets(self):
@@ -134,14 +149,68 @@ def _preprocess_brackets(self):
                 in_brackets -= 1
                 in_brackets_temp.pop()
             elif in_brackets > 0:
-                in_brackets_temp[-1].append(symbol)
+                if self._should_escape_next_symbol(in_brackets_temp[-1]):
+                    in_brackets_temp[-1][-1] += symbol
+                elif symbol == "|":
+                    in_brackets_temp[-1].append("\\|")
+                else:
+                    in_brackets_temp[-1].append(symbol)
             else:
                 if self._should_escape_next_symbol(regex_temp):
                     regex_temp[-1] += symbol
                 else:
                     regex_temp.append(symbol)
         self._python_regex = "".join(regex_temp)
 
+    @staticmethod
+    def _recombine(regex_to_recombine):
+        """ Merges escape sequences that are spread over several tokens
+
+        The following sequences are replaced by the single character they
+        denote: hexadecimal escapes (``\\x`` followed by two hexadecimal
+        digits, in upper or lower case), octal escapes (a backslash followed
+        by three octal digits), named characters (``\\N{name}``) and unicode
+        escapes (``\\u`` with four digits, ``\\U`` with eight digits). An
+        escape that is not followed by the expected tokens is kept as it is.
+        Finally, the tokens listed in RECOMBINE are replaced by the
+        corresponding control characters.
+
+        Parameters
+        ----------
+        regex_to_recombine : list of str or str
+            The tokens to process. A plain string is read character by
+            character.
+
+        Returns
+        ----------
+        recombined : list of str
+            The tokens after recombination
+
+        Raises
+        ----------
+        KeyError
+            When the name given to ``\\N{...}`` is not a known character name
+        """
+        size = len(regex_to_recombine)
+        def digits_after(position, count, alphabet):
+            # Returns the ``count`` tokens following ``position`` joined
+            # together when each of them is exactly one character of
+            # ``alphabet``, and None otherwise. The length check matters: a
+            # bare ``in`` on a string would also accept empty or
+            # multi-character tokens that happen to be substrings.
+            following = regex_to_recombine[position + 1:position + 1 + count]
+            if len(following) != count:
+                return None
+            for token in following:
+                if len(token) != 1 or token not in alphabet:
+                    return None
+            return "".join(following)
+        temp = []
+        idx = 0
+        while idx < size:
+            current = regex_to_recombine[idx]
+            decoded = None
+            step = 1
+            if current == "\\x":
+                digits = digits_after(idx, 2, string.hexdigits)
+                if digits is not None:
+                    decoded = chr(int(digits, 16))
+                    step = 3
+            elif current in ESCAPED_OCTAL:
+                digits = digits_after(idx, 2, OCTAL)
+                if digits is not None:
+                    # The first octal digit is part of the escaped token
+                    decoded = chr(int(current[1:] + digits, 8))
+                    step = 3
+            elif current == "\\N":
+                idx_end = idx
+                while idx_end < size and regex_to_recombine[idx_end] != "}":
+                    idx_end += 1
+                # Without a closing brace there is no name to look up, so
+                # the token is kept as a literal instead of reading past
+                # the end of the sequence.
+                if idx_end < size:
+                    # idx + 1 holds the opening brace, the name starts after
+                    name = "".join(regex_to_recombine[idx + 2:idx_end])
+                    decoded = unicodedata.lookup(name)
+                    step = idx_end + 1 - idx
+            elif current in ("\\u", "\\U"):
+                n_digits = 4 if current == "\\u" else 8
+                digits = digits_after(idx, n_digits, string.hexdigits)
+                if digits is not None:
+                    decoded = chr(int(digits, 16))
+                    step = n_digits + 1
+            if decoded is None:
+                temp.append(current)
+            else:
+                temp.append(TRANSFORMATIONS.get(decoded, decoded))
+            idx += step
+        return [RECOMBINE.get(token, token) for token in temp]
+
     def _preprocess_brackets_content(self, bracket_content):
         bracket_content_temp = []
         previous_is_valid_for_range = False
@@ -153,8 +222,9 @@ def _preprocess_brackets_content(self, bracket_content):
                     bracket_content_temp.append("-")
                     previous_is_valid_for_range = True
                 else:
-                    for j in range(ord(bracket_content[i - 1]) + 1,
-                                   ord(bracket_content[i + 1])):
+                    bracket_content[i - 1] = self._recombine(bracket_content[i - 1])
+                    for j in range(ord(bracket_content[i - 1][-1]) + 1,
+                                   ord(bracket_content[i + 1][-1])):
                         next_char = chr(j)
                         if next_char in TRANSFORMATIONS:
                             bracket_content_temp.append(TRANSFORMATIONS[next_char])
@@ -171,7 +241,19 @@ def _preprocess_brackets_content(self, bracket_content):
                     previous_is_valid_for_range = False
                 else:
                     previous_is_valid_for_range = True
-        return "|".join(bracket_content_temp)
+        bracket_content_temp = self._insert_or(bracket_content_temp)
+        bracket_content_temp = self._recombine(bracket_content_temp)
+        return bracket_content_temp
+
+    @staticmethod
+    def _insert_or(l_to_modify):
+        """ Builds a new list with a "|" token between consecutive elements
+
+        Parameters
+        ----------
+        l_to_modify : iterable of str
+            The elements to separate
+
+        Returns
+        ----------
+        separated : list of str
+            The elements in their original order, separated by "|". It is \
+            empty when there is no element.
+        """
+        separated = []
+        for position, element in enumerate(l_to_modify):
+            if position > 0:
+                separated.append("|")
+            separated.append(element)
+        return separated
 
     def _find_previous_opening_parenthesis(self, split_sequence):
         counter = 0

diff --git a/pyformlang/regular_expression/tests/test_python_regex.py b/pyformlang/regular_expression/tests/test_python_regex.py
@@ -199,7 +199,7 @@ def test_shortcut_word(self):
     def _test_compare(self, regex, s_test):
         r_pyformlang = PythonRegex(regex)
         r_python = re.compile(regex)
-        self.assertEqual(r_pyformlang.accepts(s_test), r_python.match(s_test) is not None)
+        self.assertEqual(r_python.match(s_test) is not None, r_pyformlang.accepts(s_test))
 
     def test_backslash(self):
         self._test_compare(".*", "]")
@@ -209,10 +209,11 @@ def test_escape_dot(self):
         self._test_compare("\\.", ".")
 
     def test_brackets(self):
+        self._test_compare(r"[{-}]", "}")
         self._test_compare(r"[{}]", "{")
         self._test_compare(r"[{-}]", "{")
         self._test_compare(r"[{-}]", "-")
-        self._test_compare(r"[{-}]", "}")
+        self._test_compare(r"[{-}]", "|")
 
     def test_brackets_escape(self):
         self._test_compare(r"[\[]", "[")
@@ -226,3 +227,49 @@ def test_brackets_escape(self):
         self._test_compare(r"[Z-\]]", "]")
         self._test_compare(r"[\]-a]", "]")
         self._test_compare(r"[\]-a]", "a")
+
+    def test_brackets_end_range_escaped(self):
+        """ Compares bracket expressions whose closing brace is escaped """
+        cases = [
+            (r"[{-\}]", "|"),
+            (r"[{\}]", "{"),
+            (r"[{-\}]", "{"),
+            (r"[{-\}]", "-"),
+            (r"[{-\}]", "}"),
+        ]
+        for pattern, candidate in cases:
+            self._test_compare(pattern, candidate)
+
+    def test_brackets_backslash_middle(self):
+        """ Compares bracket expressions with a backslash between symbols """
+        cases = [
+            (r"[a\b]", "b"),
+            (r"[a\b]", "\b"),
+            (r"[a\\b]", "a"),
+            (r"[a\\b]", "b"),
+            (r"[a\\b]", "\\"),
+            (r"[a\b]", "a"),
+            (r"[a\b]", "\\b"),
+            (r"[a\b]", "\\"),
+        ]
+        for pattern, candidate in cases:
+            self._test_compare(pattern, candidate)
+
+    def test_escaped_tab(self):
+        """ Compares an escaped tabulation, alone and inside a union """
+        # Named distinctly from the test_backslash method already present in
+        # this file: a second definition with the same name would replace
+        # the first one, which would then never be executed.
+        for pattern in (r"\t", r"(a | \t)"):
+            for candidate in ("t", "\t", "\\t"):
+                self._test_compare(pattern, candidate)
+
+    def test_octal(self):
+        """ Compares one hexadecimal and one octal escape sequence """
+        for pattern, candidate in ((r"\x10", "\x10"), (r"\110", "\110")):
+            self._test_compare(pattern, candidate)
+
+    def test_backspace(self):
+        """ Compares patterns using the b escape inside or before brackets """
+        cases = [
+            (r"a[b\b]", "ab"),
+            (r"a[b\b]", "a\b"),
+            (r"\ba[b\b]", "ab"),
+            (r"\ba[b\b]", "a\b"),
+            (r"a[b|\b]", "ab"),
+            (r"a[b|\b]", "a|"),
+        ]
+        for pattern, candidate in cases:
+            self._test_compare(pattern, candidate)
+
+    def test_unicode_name(self):
+        """ Compares a literal space with the named character for a space """
+        cases = [
+            (r" ", " "),
+            (r"\N{space}", " "),
+            (r"\N{space}", "a"),
+        ]
+        for pattern, candidate in cases:
+            self._test_compare(pattern, candidate)
+
+    def test_unicode(self):
+        """ Compares four-digit and eight-digit unicode escapes """
+        cases = [
+            (r"\u1111", "\u1111"),
+            (r"\U00001111", "\U00001111"),
+        ]
+        for pattern, candidate in cases:
+            self._test_compare(pattern, candidate)
diff --git a/pyformlang/regular_expression/tests/test_regex.py b/pyformlang/regular_expression/tests/test_regex.py
@@ -285,3 +285,8 @@ def test_priority(self):
         self.assertTrue(Regex('b a* | a').accepts('a'))
         self.assertTrue(Regex('b a* | a').accepts('b'))
         self.assertTrue(Regex('(b a*) | a').accepts('a'))
+
+    def test_backslash_b(self):
+        """ Tests a union between the symbol a and a backspace character """
+        pattern = "( a | \b )"
+        accepts_backspace = Regex(pattern).accepts("\b")
+        self.assertTrue(accepts_backspace)
+        accepts_a = Regex(pattern).accepts("a")
+        self.assertTrue(accepts_a)
+        accepts_b = Regex(pattern).accepts("b")
+        self.assertFalse(accepts_b)
diff --git a/setup.py b/setup.py
@@ -11,7 +11,7 @@
 
 setuptools.setup(
     name='pyformlang',
-    version='1.0.5',
+    version='1.0.6',
     #scripts=['pyformlang'] ,
     author="Julien Romero",
     author_email="romerojulien34@gmail.com",
File	Stmts	Miss	Cover	Missing
pyformlang
__init__.py	1	0	100%
pyformlang/cfg
__init__.py	7	0	100%
cfg.py	550	0	100%
cfg_object.py	10	0	100%
cyk_table.py	72	0	100%
epsilon.py	6	0	100%
llone_parser.py	156	2	2	99%
parse_tree.py	62	1	1	98%
pda_object_creator.py	31	0	100%
production.py	33	0	100%
recursive_decent_parser.py	56	1	1	98%
set_queue.py	14	0	100%
terminal.py	15	0	100%
utils.py	10	0	100%
utils_cfg.py	25	0	100%
variable.py	26	0	100%
pyformlang/cfg/tests
__init__.py	0	0	100%
test_cfg.py	598	1	1	99%
test_llone_parser.py	118	1	1	99%
test_production.py	22	0	100%
test_recursive_decent_parser.py	25	0	100%
test_terminal.py	19	0	100%
test_variable.py	16	0	100%
pyformlang/fcfg
__init__.py	4	0	100%
fcfg.py	117	1	1	99%
feature_production.py	26	0	100%
feature_structure.py	191	3	3	98%
state.py	35	0	100%
pyformlang/fcfg/tests
__init__.py	0	0	100%
test_fcfg.py	123	0	100%
test_feature_structure.py	159	0	100%
pyformlang/finite_automaton
__init__.py	10	0	100%
deterministic_finite_automaton.py	207	2	2	99%
doubly_linked_list.py	34	0	100%
doubly_linked_node.py	15	0	100%
epsilon.py	10	0	100%
epsilon_nfa.py	372	0	100%
finite_automaton.py	162	0	100%
finite_automaton_object.py	10	0	100%
hopcroft_processing_list.py	22	0	100%
nondeterministic_finite_automaton.py	22	0	100%
nondeterministic_transition_function.py	50	0	100%
partition.py	36	0	100%
regexable.py	16	0	100%
state.py	15	0	100%
symbol.py	11	0	100%
transition_function.py	51	1	1	98%
pyformlang/finite_automaton/tests
__init__.py	0	0	100%
test_deterministic_finite_automaton.py	261	0	100%
test_epsilon.py	10	0	100%
test_epsilon_nfa.py	621	0	100%
test_nondeterministic_finite_automaton.py	93	0	100%
test_nondeterministic_transition_function.py	61	0	100%
test_state.py	28	0	100%
test_symbol.py	27	0	100%
test_transition_function.py	60	0	100%
pyformlang/fst
__init__.py	2	0	100%
fst.py	242	0	100%
pyformlang/fst/tests
__init__.py	0	0	100%
test_fst.py	160	0	100%
pyformlang/indexed_grammar
__init__.py	7	0	100%
consumption_rule.py	34	0	100%
duplication_rule.py	30	0	100%
end_rule.py	30	0	100%
indexed_grammar.py	257	2	2	99%
production_rule.py	32	0	100%
reduced_rule.py	25	0	100%
rule_ordering.py	70	0	100%
rules.py	69	0	100%
pyformlang/indexed_grammar/tests
__init__.py	0	0	100%
test_indexed_grammar.py	225	0	100%
test_rules.py	36	0	100%
pyformlang/pda
__init__.py	6	0	100%
cfg_variable_converter.py	67	4	4	94%
epsilon.py	4	0	100%
pda.py	309	0	100%
stack_symbol.py	16	0	100%
state.py	18	0	100%
symbol.py	14	0	100%
transition_function.py	46	0	100%
utils.py	36	0	100%
pyformlang/pda/tests
__init__.py	0	0	100%
test_pda.py	246	0	100%
pyformlang/regular_expression
__init__.py	4	0	100%
python_regex.py	200	3	3	98%
regex.py	143	0	100%
regex_objects.py	79	0	100%
regex_reader.py	160	4	4	98%
pyformlang/regular_expression/tests
__init__.py	0	0	100%
test_python_regex.py	222	2	2	99%
test_regex.py	249	0	100%
pyformlang/rsa
__init__.py	3	0	100%
box.py	38	6	6	84%
recursive_automaton.py	87	6	6	93%
pyformlang/rsa/tests
__init__.py	0	0	100%
test_rsa.py	51	0	100%
pyformlang/tests
__init__.py	0	0	100%
TOTAL	7948	40	99%