Skip to content

Commit

Permalink
Dealing with a lot of problems escaping characters in PythonRegex. See Issue #19.
Browse files Browse the repository at this point in the history
  • Loading branch information
Aunsiels committed Jan 25, 2024
1 parent 81d69bd commit 4dd02ef
Show file tree
Hide file tree
Showing 4 changed files with 141 additions and 7 deletions.
90 changes: 86 additions & 4 deletions pyformlang/regular_expression/python_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import re
import string
import unicodedata

# pylint: disable=cyclic-import
from pyformlang.regular_expression import regex, MisformedRegexError
Expand All @@ -25,6 +26,14 @@
'\\': '\\\\'
}

# Two-character textual escapes (a backslash followed by a letter) and the
# single control character each one stands for.
RECOMBINE = {
    r"\b": "\b",  # backspace
    r"\n": "\n",  # line feed
    r"\r": "\r",  # carriage return
    r"\t": "\t",  # horizontal tab
    r"\f": "\f",  # form feed
}

ESCAPED_PRINTABLES = [TRANSFORMATIONS.get(x, x)
for x in PRINTABLES
if TRANSFORMATIONS.get(x, x)]
Expand All @@ -40,6 +49,9 @@
r"\w": "[a-zA-Z0-9_]"
}

# Hexadecimal digits in both cases: Python's ``re`` accepts ``\x1f`` as well
# as ``\x1F``, so lower-case digits must be recognised too.
HEXASTRING = "0123456789ABCDEFabcdef"
# Octal digits.
OCTAL = "01234567"
# A backslash followed by one octal digit: the first token of an octal escape.
ESCAPED_OCTAL = ["\\" + digit for digit in OCTAL]

class PythonRegex(regex.Regex):
""" Represents a regular expression as used in Python.
Expand Down Expand Up @@ -87,6 +99,7 @@ def __init__(self, python_regex):
python_regex = python_regex.pattern
else:
re.compile(python_regex) # Check if it is valid

self._python_regex = python_regex
self._replace_shortcuts()
self._escape_in_brackets()
Expand All @@ -95,6 +108,7 @@ def __init__(self, python_regex):
self._preprocess_optional()
self._preprocess_dot()
self._separate()
self._python_regex = self._python_regex.lstrip('\b')
super().__init__(self._python_regex)

def _separate(self):
Expand All @@ -104,6 +118,7 @@ def _separate(self):
regex_temp[-1] += symbol
else:
regex_temp.append(symbol)
regex_temp = self._recombine(regex_temp)
self._python_regex = " ".join(regex_temp)

def _preprocess_brackets(self):
Expand Down Expand Up @@ -134,14 +149,68 @@ def _preprocess_brackets(self):
in_brackets -= 1
in_brackets_temp.pop()
elif in_brackets > 0:
in_brackets_temp[-1].append(symbol)
if self._should_escape_next_symbol(in_brackets_temp[-1]):
in_brackets_temp[-1][-1] += symbol
elif symbol == "|":
in_brackets_temp[-1].append("\\|")
else:
in_brackets_temp[-1].append(symbol)
else:
if self._should_escape_next_symbol(regex_temp):
regex_temp[-1] += symbol
else:
regex_temp.append(symbol)
self._python_regex = "".join(regex_temp)

@staticmethod
def _recombine(regex_to_recombine):
    r""" Merges tokens that together form one escape sequence

    The input is a sequence of tokens (a list of strings, or a plain string
    that is then read character by character). The following sequences are
    collapsed into the single character they denote:

    * ``\x`` followed by two hexadecimal digits (either case)
    * ``\0`` to ``\7`` followed by two octal digits
    * ``\N`` followed by ``{name}``, resolved with ``unicodedata.lookup``
    * ``\u`` followed by four hexadecimal digits
    * ``\U`` followed by eight hexadecimal digits

    Each decoded character is passed through ``TRANSFORMATIONS`` (falling
    back to the character itself). A sequence that is incomplete, e.g.
    ``\x`` without two hexadecimal digits after it or ``\N`` without a
    closing ``}``, is left untouched instead of raising. Finally, every
    token found in ``RECOMBINE`` is replaced by its control character.

    Parameters
    ----------
    regex_to_recombine : list of str or str
        The tokens to process

    Returns
    ----------
    recombined : list of str
        The tokens with the escape sequences merged
    """
    size = len(regex_to_recombine)

    def digits_at(start, count, alphabet):
        """ Gives the ``count`` tokens from ``start`` joined together when
        each of them is one character of ``alphabet``, None otherwise """
        if start + count > size:
            return None
        tokens = regex_to_recombine[start:start + count]
        # The length check matters: ``in`` on a string is a substring
        # test and would otherwise accept "" or multi-character tokens.
        if all(len(token) == 1 and token in alphabet for token in tokens):
            return "".join(tokens)
        return None

    def escaped(char):
        """ Applies TRANSFORMATIONS to a decoded character """
        return TRANSFORMATIONS.get(char, char)

    temp = []
    idx = 0
    while idx < size:
        token = regex_to_recombine[idx]
        # Default: keep the token as it is and move to the next one.
        replacement = token
        consumed = 1
        if token == "\\x":
            digits = digits_at(idx + 1, 2, string.hexdigits)
            if digits is not None:
                replacement = escaped(chr(int(digits, 16)))
                consumed = 3
        elif token in ESCAPED_OCTAL:
            digits = digits_at(idx + 1, 2, OCTAL)
            if digits is not None:
                # token[1] is the first octal digit, after the backslash
                replacement = escaped(chr(int(token[1] + digits, 8)))
                consumed = 3
        elif token == "\\N":
            idx_end = idx + 1
            while idx_end < size and regex_to_recombine[idx_end] != "}":
                idx_end += 1
            if idx_end < size:
                # idx + 1 holds the opening brace, the name starts after it
                name = "".join(regex_to_recombine[idx + 2:idx_end])
                replacement = escaped(unicodedata.lookup(name))
                consumed = idx_end + 1 - idx
        elif token == "\\u":
            digits = digits_at(idx + 1, 4, string.hexdigits)
            if digits is not None:
                replacement = escaped(chr(int(digits, 16)))
                consumed = 5
        elif token == "\\U":
            digits = digits_at(idx + 1, 8, string.hexdigits)
            if digits is not None:
                replacement = escaped(chr(int(digits, 16)))
                consumed = 9
        temp.append(replacement)
        idx += consumed
    return [RECOMBINE.get(item, item) for item in temp]

def _preprocess_brackets_content(self, bracket_content):
bracket_content_temp = []
previous_is_valid_for_range = False
Expand All @@ -153,8 +222,9 @@ def _preprocess_brackets_content(self, bracket_content):
bracket_content_temp.append("-")
previous_is_valid_for_range = True
else:
for j in range(ord(bracket_content[i - 1]) + 1,
ord(bracket_content[i + 1])):
bracket_content[i - 1] = self._recombine(bracket_content[i - 1])
for j in range(ord(bracket_content[i - 1][-1]) + 1,
ord(bracket_content[i + 1][-1])):
next_char = chr(j)
if next_char in TRANSFORMATIONS:
bracket_content_temp.append(TRANSFORMATIONS[next_char])
Expand All @@ -171,7 +241,19 @@ def _preprocess_brackets_content(self, bracket_content):
previous_is_valid_for_range = False
else:
previous_is_valid_for_range = True
return "|".join(bracket_content_temp)
bracket_content_temp = self._insert_or(bracket_content_temp)
bracket_content_temp = self._recombine(bracket_content_temp)
return bracket_content_temp

@staticmethod
def _insert_or(l_to_modify):
res = []
for x in l_to_modify:
res.append(x)
res.append("|")
if res:
return res[:-1]
return res

def _find_previous_opening_parenthesis(self, split_sequence):
counter = 0
Expand Down
51 changes: 49 additions & 2 deletions pyformlang/regular_expression/tests/test_python_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ def test_shortcut_word(self):
def _test_compare(self, regex, s_test):
r_pyformlang = PythonRegex(regex)
r_python = re.compile(regex)
self.assertEqual(r_pyformlang.accepts(s_test), r_python.match(s_test) is not None)
self.assertEqual(r_python.match(s_test) is not None, r_pyformlang.accepts(s_test))

def test_backslash(self):
self._test_compare(".*", "]")
Expand All @@ -209,10 +209,11 @@ def test_escape_dot(self):
self._test_compare("\\.", ".")

def test_brackets(self):
self._test_compare(r"[{-}]", "}")
self._test_compare(r"[{}]", "{")
self._test_compare(r"[{-}]", "{")
self._test_compare(r"[{-}]", "-")
self._test_compare(r"[{-}]", "}")
self._test_compare(r"[{-}]", "|")

def test_brackets_escape(self):
self._test_compare(r"[\[]", "[")
Expand All @@ -226,3 +227,49 @@ def test_brackets_escape(self):
self._test_compare(r"[Z-\]]", "]")
self._test_compare(r"[\]-a]", "]")
self._test_compare(r"[\]-a]", "a")

def test_brackets_end_range_escaped(self):
    """ Brackets holding an escaped closing curly brace, with and
    without a range, must behave like Python's re """
    cases = [
        (r"[{-\}]", "|"),
        (r"[{\}]", "{"),
        (r"[{-\}]", "{"),
        (r"[{-\}]", "-"),
        (r"[{-\}]", "}"),
    ]
    for pattern, text in cases:
        self._test_compare(pattern, text)

def test_brackets_backslash_middle(self):
    """ A backslash in the middle of brackets, escaping either a letter
    or itself, must behave like Python's re """
    cases = [
        (r"[a\b]", "b"),
        (r"[a\b]", "\b"),
        (r"[a\\b]", "a"),
        (r"[a\\b]", "b"),
        (r"[a\\b]", "\\"),
        (r"[a\b]", "a"),
        (r"[a\b]", "\\b"),
        (r"[a\b]", "\\"),
    ]
    for pattern, text in cases:
        self._test_compare(pattern, text)

def test_escaped_tab(self):
    """ An escaped tab, alone or inside a union, must behave like
    Python's re

    This method used to be called ``test_backslash``. That name is already
    taken by an earlier test of the same file, which was therefore
    overridden and never executed.
    """
    cases = [
        (r"\t", "t"),
        (r"\t", "\t"),
        (r"\t", "\\t"),
        (r"(a | \t)", "t"),
        (r"(a | \t)", "\t"),
        (r"(a | \t)", "\\t"),
    ]
    for pattern, text in cases:
        self._test_compare(pattern, text)

def test_octal(self):
    """ Hexadecimal and octal escapes must behave like Python's re """
    for pattern, text in ((r"\x10", "\x10"), (r"\110", "\110")):
        self._test_compare(pattern, text)

def test_backspace(self):
    """ Backslash-b inside and outside brackets must behave like
    Python's re """
    cases = [
        (r"a[b\b]", "ab"),
        (r"a[b\b]", "a\b"),
        (r"\ba[b\b]", "ab"),
        (r"\ba[b\b]", "a\b"),
        (r"a[b|\b]", "ab"),
        (r"a[b|\b]", "a|"),
    ]
    for pattern, text in cases:
        self._test_compare(pattern, text)

def test_unicode_name(self):
    """ A character given by its unicode name must behave like
    Python's re """
    cases = [
        (r" ", " "),
        (r"\N{space}", " "),
        (r"\N{space}", "a"),
    ]
    for pattern, text in cases:
        self._test_compare(pattern, text)

def test_unicode(self):
    """ Four and eight digit unicode escapes must behave like
    Python's re """
    for pattern, text in ((r"\u1111", "\u1111"),
                          (r"\U00001111", "\U00001111")):
        self._test_compare(pattern, text)
5 changes: 5 additions & 0 deletions pyformlang/regular_expression/tests/test_regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,3 +285,8 @@ def test_priority(self):
self.assertTrue(Regex('b a* | a').accepts('a'))
self.assertTrue(Regex('b a* | a').accepts('b'))
self.assertTrue(Regex('(b a*) | a').accepts('a'))

def test_backslash_b(self):
    """ A union between the letter a and the backspace character
    accepts exactly these two symbols """
    # Not a raw string on purpose: the pattern contains a real backspace.
    pattern = "( a | \b )"
    self.assertTrue(Regex(pattern).accepts("\b"))
    self.assertTrue(Regex(pattern).accepts("a"))
    self.assertFalse(Regex(pattern).accepts("b"))
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

setuptools.setup(
name='pyformlang',
version='1.0.5',
version='1.0.6',
#scripts=['pyformlang'] ,
author="Julien Romero",
author_email="romerojulien34@gmail.com",
Expand Down

1 comment on commit 4dd02ef

@github-actions
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Coverage

Coverage Report
FileStmtsMissCoverMissing
pyformlang
   __init__.py10100% 
pyformlang/cfg
   __init__.py70100% 
   cfg.py5500100% 
   cfg_object.py100100% 
   cyk_table.py720100% 
   epsilon.py60100% 
   llone_parser.py15622 99%
   parse_tree.py6211 98%
   pda_object_creator.py310100% 
   production.py330100% 
   recursive_decent_parser.py5611 98%
   set_queue.py140100% 
   terminal.py150100% 
   utils.py100100% 
   utils_cfg.py250100% 
   variable.py260100% 
pyformlang/cfg/tests
   __init__.py00100% 
   test_cfg.py59811 99%
   test_llone_parser.py11811 99%
   test_production.py220100% 
   test_recursive_decent_parser.py250100% 
   test_terminal.py190100% 
   test_variable.py160100% 
pyformlang/fcfg
   __init__.py40100% 
   fcfg.py11711 99%
   feature_production.py260100% 
   feature_structure.py19133 98%
   state.py350100% 
pyformlang/fcfg/tests
   __init__.py00100% 
   test_fcfg.py1230100% 
   test_feature_structure.py1590100% 
pyformlang/finite_automaton
   __init__.py100100% 
   deterministic_finite_automaton.py20722 99%
   doubly_linked_list.py340100% 
   doubly_linked_node.py150100% 
   epsilon.py100100% 
   epsilon_nfa.py3720100% 
   finite_automaton.py1620100% 
   finite_automaton_object.py100100% 
   hopcroft_processing_list.py220100% 
   nondeterministic_finite_automaton.py220100% 
   nondeterministic_transition_function.py500100% 
   partition.py360100% 
   regexable.py160100% 
   state.py150100% 
   symbol.py110100% 
   transition_function.py5111 98%
pyformlang/finite_automaton/tests
   __init__.py00100% 
   test_deterministic_finite_automaton.py2610100% 
   test_epsilon.py100100% 
   test_epsilon_nfa.py6210100% 
   test_nondeterministic_finite_automaton.py930100% 
   test_nondeterministic_transition_function.py610100% 
   test_state.py280100% 
   test_symbol.py270100% 
   test_transition_function.py600100% 
pyformlang/fst
   __init__.py20100% 
   fst.py2420100% 
pyformlang/fst/tests
   __init__.py00100% 
   test_fst.py1600100% 
pyformlang/indexed_grammar
   __init__.py70100% 
   consumption_rule.py340100% 
   duplication_rule.py300100% 
   end_rule.py300100% 
   indexed_grammar.py25722 99%
   production_rule.py320100% 
   reduced_rule.py250100% 
   rule_ordering.py700100% 
   rules.py690100% 
pyformlang/indexed_grammar/tests
   __init__.py00100% 
   test_indexed_grammar.py2250100% 
   test_rules.py360100% 
pyformlang/pda
   __init__.py60100% 
   cfg_variable_converter.py6744 94%
   epsilon.py40100% 
   pda.py3090100% 
   stack_symbol.py160100% 
   state.py180100% 
   symbol.py140100% 
   transition_function.py460100% 
   utils.py360100% 
pyformlang/pda/tests
   __init__.py00100% 
   test_pda.py2460100% 
pyformlang/regular_expression
   __init__.py40100% 
   python_regex.py20033 98%
   regex.py1430100% 
   regex_objects.py790100% 
   regex_reader.py16044 98%
pyformlang/regular_expression/tests
   __init__.py00100% 
   test_python_regex.py22222 99%
   test_regex.py2490100% 
pyformlang/rsa
   __init__.py30100% 
   box.py3866 84%
   recursive_automaton.py8766 93%
pyformlang/rsa/tests
   __init__.py00100% 
   test_rsa.py510100% 
pyformlang/tests
   __init__.py00100% 
TOTAL79484099% 

Tests Skipped Failures Errors Time
267 0 💤 0 ❌ 0 🔥 3.403s ⏱️

Please sign in to comment.