In [12]:
import re
from functools import wraps
from typing import Generator, Literal, NamedTuple, TypeAlias

EOF = object()

def coroutine(fn):
	'''
	Wraps a generator which is intended to be used as a pure coroutine by
	.send()ing it values. The only thing that the wrapper does is calling
	.next() for the first time which is required by Python generator protocol.
	'''
	@wraps(fn)
	def wrapper(*args, **kwargs):
		g = fn(*args, **kwargs)
		next(g)
		return g
	return wrapper

LEXEMERE = re.compile(r"""\s*(?:
	# Empty string
	((?:''|""|``)(?!$)|(?:'{6}|"{6}|`{6}))|
	# String with content
	('''|\"""|```|['"`](?!$))|
	# Unquoted string
	((?:\\.|[^\\\n:,\(\)\[\]\{\}'"`]+?)+)|
	(\S) # Other
)""", re.X)
UNQUOTED_RE = re.compile(r"""(?:\\.|[^\\\n:,\(\)\[\]\{\}'"`]+?)+""")

'''
In LMON, all values are strings or aggregates. Defer parsing to the user
Inspirations:
* ijson
* HJSON
* Strict YAML (https://hitchdev.com/strictyaml/features-removed/ - no implicit typing)
'''

class Lexemere(NamedTuple):
	'''
	Lexeme + -mere, a part of a lexeme. Originally SubLexeme, but then I saw the regex
	name LEXEME_RE and "lexemere" just makes so much sense.
	'''
	kind: Literal['atom', 'start', 'cont', 'end']
	value: str

@coroutine
def lexer(target):
	try:
		buf = yield
	except GeneratorExit:
		buf = ''
	while True:
		print(buf)
		if m := LEXEMERE.match(buf):
			# Empty string
			if m[1]:
				target.send(Lexemere('atom', m[1]))
				buf = buf[m.end():]
			# Quoted string
			elif q := m[2]:
				STREND_RE = re.compile(rf"(?:\\.|(?!{q})[^\\]+?)*{q}")
				
				buf = buf[m.end():])
				
				# Check if the string is fully consumed
				if m := STREND_RE.match(buf):
					target.send(Lexemere('atom', q + m[0]))
					buf = buf[m.end():]
					continue
				
				target.send(Lexemere('start', q + buf))
				
				# Incomplete string, wait for more data
				while True:
					try:
						buf = yield
					except GeneratorExit:
						buf = ''
					
					if m := STREND_RE.match(buf):
						target.send(Lexemere('end', q + m[1]))
						buf = buf[m.end():]
						break
					
					target.send(Lexemere('cont', buf))
				continue
			# Unquoted string
			elif m[3]:
				target.send(Lexemere('start', m[3]))
				if buf := buf[m.end():]:
					continue
				
				while True:
					try:
						buf = yield
					except GeneratorExit:
						buf = ''
					
					if m := UNQUOTED_RE.match(buf):
						target.send(Lexemere('cont', m[0]))
						if buf := buf[m.end():]:
							break
					else:
						break
				continue
			# Punctuators
			elif buf[4] in '()[]{}:,':
				target.send(Lexemere('atom', buf[0]))
				buf = buf[1:]
				continue
			# ???
			else:
				raise ValueError(f"Unknown sublexeme {m[0]!r}")
		# Nothing matched, consume more data
		else:
			if buf:
				try:
					buf = yield
				except GeneratorExit:
					buf = ''
			
			if not buf:
				try:
					target.send(Lexemere('atom', EOF))
				except StopIteration:
					pass
				break

@coroutine
def print_coro():
    while True:
        x = yield
        print(x)
coro = lexer(print_coro())
coro.send('{"a": hello world}')

SyntaxError: unmatched ')' (1432857271.py, line 64)