In [1]:
from typing import Optional

import regex as just_kidding_we_wont_use_it
from enum import Enum

import json

def print_as_json(obj):
    def default_serializer(o):
        return o.__dict__

    json_str = json.dumps(obj, default=default_serializer, indent=4)
    print(json_str)

In [2]:
class State:
    terminal = False
    transitions: list['Transition'] = None
    def __init__(self, terminal: bool = False, transitions: list['Transition'] = None):
        if transitions is None:
            transitions = []
        self.isTerminal = terminal
        self.transitions = transitions

class Transition:
    action = None
    state = None
    def __init__(self, action, state: 'State'):
        self.action = action
        self.state = state

class Action:
    type = None
    value = None
    def __init__(self, type, value):
        self.type = type
        self.value = value

In [6]:
class NodeType(Enum):
    Root = 0
    Char = 1
    Group = 2
    CharacterSet = 3
    Range = 4
    Any = 5
    UNKNOWN = -1

class ExprNode:
    type:   NodeType = NodeType.UNKNOWN
    value: Optional[str] = None
    params: dict[str, any] = {}
    children: list[list['ExprNode']] = []


PARAM_ONE_OR_ZERO = '?'
PARAM_ONE_OR_MORE = '+'
PARAM_ANY_OR_MORE = '*'
PARAM_RANGE_START = 'start'
PARAM_RANGE_END   = 'end'

QUANTIFIES = ['?', '+', '*']
SPECIAL_CHARS = ['(', ')', '[', ']', '|', '?', '+' , '*']

class RegularExpression:
    ast: ExprNode
    expr_str: str
    _position: int = 0
    def __init__(self, expr_str: str):
        self.exprStr = expr_str
        self.ast = self._parse(NodeType.Root)
        self.ast.type = NodeType.Root

    def _eat(self, char: Optional[str] = None) -> bool:
        if self._position >= len(self.exprStr):
            return False

        if char is None:
            self._position = self._position + 1
            return True

        for i in range(len(char)):
            if self.exprStr[self._position + i] != char[i]:
                return False
        self._position = self._position + 1
        print(f"Eat OK: {char}, Pos = {self._position}")
        return True

    def _peak(self):
        if self._position >= len(self.exprStr):
            return None
        return self.exprStr[self._position]

    def _done(self):
        return self._position >= len(self.exprStr)

    def _parse(self, curr_type: NodeType = None) -> ExprNode:
        if curr_type == NodeType.Root or curr_type == NodeType.Group:
            return self._parseGroup(isRoot=curr_type == NodeType.Root)
        elif curr_type == NodeType.CharacterSet:
            return self._parseSet()
        elif curr_type == NodeType.Char:
            return self._parseChar()
        elif curr_type == NodeType.Any:
            return self._parseAny()
        else:
            raise Exception('Unknown node type')

    def _assignQuantifier(self, node: ExprNode):
        if self._eat(PARAM_ONE_OR_ZERO):
            node.params['quantifier'] = PARAM_ONE_OR_ZERO
        elif self._eat(PARAM_ONE_OR_MORE):
            node.params['quantifier'] = PARAM_ONE_OR_MORE
        elif self._eat(PARAM_ANY_OR_MORE):
            node.params['quantifier'] = PARAM_ANY_OR_MORE

    def _parseGroup(self, isRoot: bool = False) -> ExprNode:
        curr_node = ExprNode()
        curr_node.type = NodeType.Group
        curr_node.children.append([])
        curr_or = 0
        while not self._done():
            if self._eat('('): # should parse a group
                node = self._parse(NodeType.Group)
                curr_node.children[curr_or].append(node)
                if not self._eat(')'):
                    raise Exception(f'Expected ")" at position {self._position}')
                self._assignQuantifier(node)

            elif self._eat('['): # Characters set
                node = self._parse(NodeType.CharacterSet)
                curr_node.children[curr_or].append(node)
                if not self._eat(']'):
                    raise Exception(f'Expected "]" at position {self._position}')
                self._assignQuantifier(node)

            elif self._peak() == '\\': # match next char
                self._eat()
                node = self._parse(NodeType.Char)
                curr_node.children[curr_or].append(node)
                self._assignQuantifier(node)

            elif self._peak() == '.': # match any char
                node = self._parse(NodeType.Any)
                curr_node.children[curr_or].append(node)
                self._assignQuantifier(node)

            elif self._peak() == '|': # logical or
                curr_node.children.append([])
                curr_or = curr_or + 1

            elif self._peak() not in SPECIAL_CHARS: # normal char
                node = self._parse(NodeType.Char)
                curr_node.children[curr_or].append(node)
                self._assignQuantifier(node)
            elif self._peak() == ')':
                if isRoot:
                    raise Exception(f'Unexpected "{self._peak()}" at position {self._position}')
                break
            else:
                raise Exception(f'Unexpected "{self._peak()}" at position {self._position}')

        return curr_node

    def _parseChar(self) -> ExprNode:
        top = self._peak()
        self._eat(top)
        node = ExprNode()
        node.type = NodeType.Char
        node.value = top
        return node

    def _parseAny(self):
        self._eat('.')
        node = ExprNode()
        node.type = NodeType.Any
        return node


test = "1234(55)"
re = RegularExpression(test)

print(len(re.ast.children[0]))
print_as_json(re)

print("ok")


Eat OK: 1, Pos = 1
Eat OK: 2, Pos = 2
Eat OK: 3, Pos = 3
Eat OK: 4, Pos = 4
Eat OK: (, Pos = 5
Eat OK: 5, Pos = 6
Eat OK: 5, Pos = 7
Eat OK: ), Pos = 8
7


AttributeError: 'mappingproxy' object has no attribute '__dict__'