In [1]:
from pycco.tokenizers import *
from pycco.ast import *
from pycco.parsers import *
from inspect import signature
from functools import reduce

"""Uses Parsers to transform a token stream into an AST"""
from typing import List, Optional
from pycco.parser import match, Parser, parser, S, T, U, ParserResult
from pycco.tokens import Token, TokenKind, BinaryOperator, UnaryOperator, OtherOperator
from pycco import ast

# utility to group sub-list items
group = lambda l: [l]

# Define token matchers
type = match(TokenKind.TYPE)
name = match(TokenKind.IDENTIFIER)
open_paren = match(Token(TokenKind.SYMBOL, "("))
close_paren = match(Token(TokenKind.SYMBOL, ")"))
open_bracket = match(Token(TokenKind.SYMBOL, "{"))
close_bracket = match(Token(TokenKind.SYMBOL, "}"))
comma = match(Token(TokenKind.SYMBOL, ","))
semicolon = match(Token(TokenKind.SYMBOL, ";"))
eq = match(Token(TokenKind.OPERATOR, "="))
star = match(Token(TokenKind.OPERATOR, "*"))

# forward declarations
expression = Parser()
statement = Parser()

# Grammar Components with Map Functions

def map_ident(token: Token)->ast.Ident:
    return ast.Ident(token.value).set_tokens([token])

ident = name @ map_ident

# Variable Declaration
def map_variable_decl(tokens: List[Token]) -> ast.VariableDecl:
    """
    Map tokens to a VariableDecl.
    Assumes tokens are in the order: type [*] identifier ;
    """
    type = ast.Type(tokens[0].value).set_tokens([tokens[0]])
    pointer = semicolon = False
    if len(tokens) > 2:
        pointer = tokens[1].value == "*"
        semicolon = tokens[-1].value == ';'
    
    name_token = next(token for token in tokens[1:] if token.kind == TokenKind.IDENTIFIER)
    name = ast.Ident(name_token.value).set_tokens([name_token])
    type.pointer = pointer
    return ast.VariableDecl(type=type, name=name, semicolon=semicolon).set_tokens(tokens)


def map_number(token: Token)->ast.Number:
    type = ast.Type('int')
    if '.' in token.value:
        type = ast.Type('float')
    return ast.Number(token.value, type).set_tokens([token])
    
number = match(Token(TokenKind.NUMBER)) @ map_number

def map_string(token: Token)->ast.StringLiteral:
    return ast.StringLiteral(token.value).set_tokens([token])
    
string = match(Token(TokenKind.STRING_LITERAL)) @ map_string

variable_ = type + ~star + name
variable = variable_ @ map_variable_decl
# type with optional * with an identifier and optional ;
variable_decl = (variable_ + semicolon) @ map_variable_decl

args = open_paren >> variable.sep_by(Token(TokenKind.SYMBOL, ',')) << close_paren



def map_variable_assign(elements: List[Token|ast.Node])->ast.Assign:
    return ast.Assign(*elements)
    
variable_assign = (((name@map_ident | variable_@map_variable_decl) << Token(TokenKind.OPERATOR, '='))+expression<<semicolon)
variable_assign@=map_variable_assign

def map_return(expr: ast.Expression)->ast.Return:
    return ast.Return(expr)

return_ = match(Token(TokenKind.KEYWORD, 'return')) >> expression << semicolon
return_ @= map_return

def map_function(nodes: List[ast.Node])->ast.Function:
    var, args, *nodes = nodes
    ret = None
    if nodes and isinstance(nodes[-1], ast.Return):
        body, ret = nodes[:-1], nodes[-1]
    else:
        body = nodes
    return ast.Function(var, args, body, ret)

# need to declare in sta
function = variable + (args @ group)
function += open_bracket >> (statement.many() + ~return_) << close_bracket
function @= map_function

# Unary Operator Parser
def map_unary_op(elements: List[Token | ast.Expression]) -> ast.UnaryOp:
    op, operand = elements
    return ast.UnaryOp(operator=op.value, operand=operand)

unary_op = (any_of_enum(UnaryOperator) + expression) @ map_unary_op

In [2]:
# 1) Our base expression (numbers, strings, ident)
paren_expr = open_paren >> expression << close_paren
base_expr = (number | string | ident | paren_expr).describe("base_expr")


# 2) The binop chain
def map_binops(parsed_list: List) -> ast.Expression:
    """
    parsed_list = [left_expr, [ (op, right_expr), (op, right_expr), ... ]]
    """
    left = parsed_list[0]
    pairs = parsed_list[1]
    for (op, right_expr) in pairs:
        left = ast.BinaryOp(left, op.value, right_expr)
    return left

binop = (
    base_expr + 
    ( any_of_enum(BinaryOperator) + base_expr ).many()
) @ map_binops

# 3) The unary_op (already defined)
def map_unary_op(elements: List[Token | ast.Expression]) -> ast.UnaryOp:
    op, operand = elements
    return ast.UnaryOp(operator=op.value, operand=operand)

unary_op = (
    any_of_enum(UnaryOperator) + expression
) @ map_unary_op

# 4) Final expression definition
expression.define(
    unary_op | binop
)

# Now 'expression' references a parser that
# - tries unary_op first, or
# - tries binop (chain of base_expr with binary operators).


Parser((((one of UnaryOperator members + None) @ map_unary_op) | ((base_expr + (one of BinaryOperator members + base_expr)*) @ map_binops)))

In [None]:
@parser
def statement(stream: List[Token], index: int)-> ParserResult[ast.Statement]:
    return (variable_decl | variable_assign).parse_fn(stream, index) 

@parser
def expression(stream: List[Token], index: int)-> ParserResult[ast.Expression]:
    return ( number | string | ident).parse_fn(stream, index) 

In [8]:
.result

RecursionError: maximum recursion depth exceeded

In [13]:
# Primary Expressions: Numbers, Strings, Identifiers, and Grouped Expressions
@parser
def primary_expression(stream: List[Token], index: int) -> ParserResult[ast.Expression]:
    return (
        number
        | string
        | ident
        | (open_paren >> expression << close_paren)  # Grouped expressions: (x + y)
    ).parse_fn(stream, index)

# Unary Operations
unary_op = (any_of_enum(UnaryOperator) + primary_expression) @ map_unary_op

# Binary Operations (Left-Associative with Precedence)
def map_binop(elements: List[ast.Node | Token]) -> ast.BinaryOp:
    left, op, right = elements
    return ast.BinaryOp(left=left, operator=op.value, right=right)

binary_operator = any_of_enum(BinaryOperator)

@parser
def binop(stream: List[Token], index: int) -> ParserResult[ast.Expression]:
    # Start by parsing the left operand (unary or primary expression)
    import pdb; pdb.set_trace()
    left_result = unary_op.parse_fn(stream, index)
    current_index = left_result.index  # Correct attribute
    left = left_result.result

    while True:
        try:
            # Parse the operator
            op_result = binary_operator.parse_fn(stream, current_index)
            operator = op_result.result
            current_index = op_result.index  # Correct attribute

            # Parse the right-hand operand
            right_result = unary_op.parse_fn(stream, current_index)
            right = right_result.result
            current_index = right_result.index  # Correct attribute

            # Combine into a BinaryOp
            left = ast.BinaryOp(left=left, operator=operator.value, right=right)

        except Exception:
            # If parsing fails, break the loop
            break

    return ParserResult(index=current_index, result=left)


# Function Calls
def map_function_call(elements: List[ast.Node | Token]) -> ast.FunctionCall:
    name, args = elements
    return ast.FunctionCall(name=name, args=args)

function_call = (
    ident
    + open_paren
    >> expression.sep_by(comma)
    << close_paren
) @ map_function_call

# Array Indexing
def map_array_index(elements: List[ast.Node]) -> ast.ArrayIndex:
    array, index = elements
    return ast.ArrayIndex(array=array, index=index)

array_index = ident + open_bracket >> expression << close_bracket @ map_array_index

# Struct Access
def map_struct_access(elements: List[ast.Node | Token]) -> ast.StructAccess:
    obj, op, field = elements
    return ast.StructAccess(obj=obj, operator=op.value, field=field)

struct_access = (
    ident
    + any_of(Symbol.DOT, Symbol.ARROW)
    + ident
) @ map_struct_access

# Comprehensive Expression Parser
@parser
def expression(stream: List[Token], index: int) -> ParserResult[ast.Expression]:
    return (
        struct_access
        | array_index
        | function_call
        | binop
        | unary_op
        | primary_expression
    ).parse_fn(stream, index)


In [14]:
# If-Else Parser
def map_if_else(elements: List[Token | ast.Node]) -> ast.IfStatement:
    condition, then_body, else_body = elements
    return ast.IfStatement(condition, then_body, else_body)

if_else = (
    match(Token(TokenKind.KEYWORD, "if"))
    >> open_paren
    >> expression
    << close_paren
    + open_bracket
    >> statement.many()
    << close_bracket
    + (match(Token(TokenKind.KEYWORD, "else")) >> open_bracket >> statement.many() << close_bracket).optional()
) @ map_if_else

In [37]:
expression(tokenize('x>0 ')).result

> [0;32m/var/folders/vl/0mv20zzj0ld26z8h0ngg3wn00000gn/T/ipykernel_38709/2568700022.py[0m(19)[0;36mmap_binop[0;34m()[0m
[0;32m     17 [0;31m[0;34m[0m[0m
[0m[0;32m     18 [0;31m[0;32mdef[0m [0mmap_binop[0m[0;34m([0m[0melems[0m[0;34m)[0m[0;34m:[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m---> 19 [0;31m    [0;32mimport[0m [0mpdb[0m[0;34m;[0m [0mpdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     20 [0;31m    [0mleft[0m[0;34m,[0m [0mop[0m[0;34m,[0m [0mright[0m [0;34m=[0m [0melems[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     21 [0;31m    [0;32mreturn[0m [0mast[0m[0;34m.[0m[0mBinaryOp[0m[0;34m([0m[0mleft[0m[0;34m=[0m[0mleft[0m[0;34m,[0m [0moperator[0m[0;34m=[0m[0mop[0m[0;34m.[0m[0mvalue[0m[0;34m,[0m [0mright[0m[0;34m=[0m[0mright[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0m


ipdb>  elems


[Ident(name='x')]


ipdb>  q


In [19]:
(match(Token(TokenKind.KEYWORD, "if"))
    >> open_paren
>> expression)(tokenize('''
if (x > 0) {
    y = 1;
} else {
    y = -1;
}
'''))

RecursionError: maximum recursion depth exceeded

In [None]:
# While Loop Parser
def map_while(elements: List[Token | ast.Node]) -> ast.WhileLoop:
    condition, body = elements
    return ast.WhileLoop(condition, body)

while_loop = (
    match(Token(TokenKind.KEYWORD, "while"))
    >> open_paren
    >> expression
    << close_paren
    + open_bracket
    >> statement.many()
    << close_bracket
) @ map_while

# For Loop Parser
def map_for(elements: List[Token | ast.Node]) -> ast.For:
    init, condition, increment, body = elements
    return ast.For(init, condition, increment, body)

for_loop = (
    match(Token(TokenKind.KEYWORD, "for"))
    >> open_paren
    >> variable_assign
    << semicolon
    + expression
    << semicolon
    + variable_assign
    << close_paren
    + open_bracket
    >> statement.many()
    << close_bracket
) @ map_for

# Function Call Parser
def map_function_call(elements: List[Token | ast.Node]) -> ast.FunctionCall:
    name, args = elements
    return ast.FunctionCall(name, args)

function_call = (
    ident
    + open_paren
    >> expression.sep_by(comma)
    << close_paren
) @ map_function_call

# Array Index Parser
def map_array_index(elements: List[Token | ast.Node]) -> ast.ArrayIndex:
    array, index = elements
    return ast.ArrayIndex(array=array, index=index)

array_index = ident + open_bracket >> expression << close_bracket @ map_array_index

# Struct Access Parser
def map_struct_access(elements: List[Token | ast.Node]) -> ast.StructAccess:
    obj, operator, field = elements
    return ast.StructAccess(obj=obj, operator=operator.value, field=field)

struct_access = (
    ident
    + any_of_enum(OtherOperator)
    .filter(lambda op: op in {OtherOperator.DOT, OtherOperator.ARROW})
    + ident
) @ map_struct_access

# Update Statement Parser
@parser
def statement(stream: List[Token], index: int) -> ParserResult[ast.Statement]:
    return (
        if_else | while_loop | for_loop | variable_decl | variable_assign | return_
    ).parse_fn(stream, index)

# Update Expression Parser
@parser
def expression(stream: List[Token], index: int) -> ParserResult[ast.Expression]:
    return (
        struct_access | array_index | function_call | unary_op | binop | ident | number | string
    ).parse_fn(stream, index)

AttributeError: module 'pycco.ast' has no attribute 'For'

In [48]:
binop(tokenize('2+"2" ')).result

BinaryOp(left=Number(value='2', type=Type(name='int', pointer=False)), operator='+', right=StringLiteral(value='2'))

In [30]:
close_bracket

Parser("}")

In [31]:
statement.many(1)(tokenize('return 2;')).result

In [32]:
(statement.until(close_bracket)@map_return)(tokenize('return 2;}')).result

Return(value=[Token(kind=<TokenKind.KEYWORD: 2>, value='return', start=6, end=11), Token(kind=<TokenKind.NUMBER: 7>, value='2', start=8, end=8), Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=9, end=9)])

In [33]:
statement(tokenize('return 2;'))

ParserResult(index=-1, result=None, description=None)

In [34]:
r=function(tokenize('''
int main(char* x){
int *x=
3;
    return 2;
}''')).result

In [35]:
r.ret

Return(value=Number(value='2', type=Type(name='int', pointer=False)))

In [36]:
print(r)


int main(char* x)
{
	int* x = 3;
	return 2;
}




In [19]:
r=(((name@map_ident | variable_@map_variable_decl) << Token(TokenKind.OPERATOR, '='))+expression<<semicolon)(tokenize('int x=3;')).result

In [20]:
map_variable_assign(r)

Assign(var=VariableDecl(type=Type(name='int', pointer=False), name=Ident(name='x'), semicolon=False), to=Number(value='3', type=Type(name='int', pointer=False)))

In [None]:
# Binary Expression
def map_binary_expression(tokens: List[Token]) -> ast.BinaryExpression:
    """
    Map tokens to a BinaryExpression.
    Assumes tokens are in the order: left operator right
    """
    left = tokens[0]
    operator = tokens[1].value
    right = tokens[2]
    return ast.BinaryExpression(
        left=ast.Ident(left.value) if left.kind == TokenKind.IDENTIFIER else left,
        operator=operator,
        right=ast.Ident(right.value) if right.kind == TokenKind.IDENTIFIER else right,
    )

expression = Parser()  # Forward declaration for expressions
binary_expression = (expression + TokenKind.OPERATOR + expression) @ map_binary_expression

# If Statement
def map_if_statement(tokens: List[Token]) -> ast.IfStatement:
    """
    Map tokens to an IfStatement.
    Assumes tokens are in the order: if (condition) { then_branch } [else { else_branch }]
    """
    condition = tokens[0]  # First token after `if` and `(`
    then_branch = tokens[1]  # Tokens within the `{}` following the condition
    else_branch = tokens[2] if len(tokens) > 2 else None  # Optional `else` branch
    return ast.IfStatement(condition=condition, then_branch=then_branch, else_branch=else_branch)

if_statement = (
    match(Token(TokenKind.KEYWORD, "if"))
    >> open_paren
    >> expression
    << close_paren
    + open_bracket
    >> variable_decl.until(close_bracket)
    << close_bracket
    + (
        match(Token(TokenKind.KEYWORD, "else"))
        >> open_bracket
        >> variable_decl.until(close_bracket)
        << close_bracket
    ).optional()
) @ map_if_statement

# While Loop
def map_while_loop(tokens: List[Token]) -> ast.WhileLoop:
    """
    Map tokens to a WhileLoop.
    Assumes tokens are in the order: while (condition) { body }
    """
    condition = tokens[0]  # First token after `while` and `(`
    body = tokens[1]  # Tokens within the `{}` following the condition
    return ast.WhileLoop(condition=condition, body=body)

while_loop = (
    match(Token(TokenKind.KEYWORD, "while"))
    >> open_paren
    >> expression
    << close_paren
    + open_bracket
    >> variable_decl.until(close_bracket)
    << close_bracket
) @ map_while_loop

# Function
def map_function(tokens: List[Token]) -> ast.Function:
    """
    Map tokens to a Function.
    Assumes tokens are in the order: return_type name(args) { body }
    """
    return_type = ast.Type(tokens[0].value)
    name = ast.Ident(tokens[1].value)
    args = [map_variable_decl(arg) for arg in tokens[2]]  # Tokens within `()`
    body = tokens[3]  # Tokens within `{}`
    return ast.Function(
        var=ast.VariableDecl(type=return_type, name=name),
        args=args,
        body=body,
    )

# Forward declaration for statement
statement = Parser()

# Function parser
def map_function(tokens: List[Token]) -> ast.Function:
    """
    Map tokens to a Function.
    Assumes tokens are in the order: return_type name(args) { body }
    """
    return_type = ast.Type(tokens[0].value)
    name = ast.Ident(tokens[1].value)
    args = [map_variable_decl(arg) for arg in tokens[2]]  # Tokens within `()`
    body = tokens[3]  # Tokens within `{}`
    return ast.Function(
        var=ast.VariableDecl(type=return_type, name=name),
        args=args,
        body=body,
    )

function = (
    type
    + name
    + (open_paren >> variable.until(comma.optional() + close_paren))
    << close_paren
    + open_bracket
    >> statement.until(close_bracket)
    << close_bracket
) @ map_function

# Define the statement parser
statement.define(variable_decl | if_statement | while_loop | binary_expression)


# Update the `_expression` placeholder to include binary_expression
expression.define(binary_expression | variable)


# Entry Point for Parsing a Program
program = function.until(TokenKind.EOF)

In [59]:
source_code = """
int* main(args) {
    int* x = 42;

    return 0;
}
"""

In [60]:
tokens = tokenize(source_code)

In [61]:
function(tokens)

ParserResult(index=-1, result=None, description=(((((("TYPE" + "*"?) + "IDENTIFIER") @ map_variable_decl) + ((("(" >> (((("TYPE" + "*"?) + "IDENTIFIER") @ map_variable_decl) sep_by ",")) << ")") @ <lambda>)) + (("{" >> (None* + ((("return" >> None) << ";") @ map_return)?)) << "}")) @ map_function))

In [12]:
variable = type + star.optional() + name
variable_decl = (variable + semicolon.optional()) @ map_variable_decl

In [13]:
variable_decl(tokenize('int main'))

AttributeError: 'str' object has no attribute 'value'

In [7]:
variable_decl(tokenize('int main'))

ParserResult(index=2, result=[Token(kind=<TokenKind.TYPE: 3>, value='int', start=3, end=5), Token(kind=<TokenKind.IDENTIFIER: 1>, value='main', start=8, end=11)], description=((("TYPE" + "*"?) + "IDENTIFIER") + ";"?))

In [9]:
parse(tokens)

ParseError: Parsing failed due to an unexpected error.
At token index 0:
int main ( args )
^
Token causing issue: int (kind: TYPE)

In [3]:
r=(variable@variable_map)(tokenize('int *main'))

NameError: name 'variable_map' is not defined

In [6]:
r.result.tokens

[Token(kind=<TokenKind.TYPE: 3>, value='int', start=3, end=5),
 Token(kind=<TokenKind.OPERATOR: 5>, value='*', start=5, end=5),
 Token(kind=<TokenKind.IDENTIFIER: 1>, value='main', start=9, end=12)]

In [4]:
(type + name)(tokenizer(('int main')))

ParserResult(index=-1, result=None, description=("TYPE" + "IDENTIFIER"))

In [6]:
(type + name + (open_paren >> variable.until(comma.optional() + close_paren)))(source_code)

ParserResult(index=-1, result=None, description=(("TYPE" + "IDENTIFIER") + ("(" >> ((("TYPE" + "*"?) + "IDENTIFIER") until (","? + ")")))))

In [3]:
type(tokenize('int'))

ParserResult(index=1, result=int, description="TYPE")

In [4]:
variable_decl(tokenize('int* x;'))

ParserResult(index=4, result=[Token(kind=<TokenKind.TYPE: 3>, value='int', start=3, end=5), Token(kind=<TokenKind.OPERATOR: 5>, value='*', start=4, end=4), Token(kind=<TokenKind.IDENTIFIER: 1>, value='x', start=6, end=6), Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=7, end=7)], description=((("TYPE" + "*"?) + "IDENTIFIER") + ";"))

In [9]:
function(tokenize(source_code))

ParserResult(index=21, result=[Token(kind=<TokenKind.TYPE: 3>, value='int', start=25, end=27), Token(kind=<TokenKind.OPERATOR: 5>, value='*', start=26, end=26), Token(kind=<TokenKind.IDENTIFIER: 1>, value='x', start=28, end=28), Token(kind=<TokenKind.OPERATOR: 5>, value='=', start=30, end=30), Token(kind=<TokenKind.NUMBER: 7>, value='42', start=33, end=34), Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=34, end=34), Token(kind=<TokenKind.IDENTIFIER: 1>, value='printf', start=85, end=90), Token(kind=<TokenKind.SYMBOL: 4>, value='(', start=86, end=86), Token(kind=<TokenKind.STRING_LITERAL: 6>, value='Hello, World!', start=101, end=113), Token(kind=<TokenKind.SYMBOL: 4>, value=')', start=102, end=102), Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=103, end=103), Token(kind=<TokenKind.KEYWORD: 2>, value='return', start=114, end=119), Token(kind=<TokenKind.NUMBER: 7>, value='0', start=116, end=116), Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=117, end=117)], description=((((

In [10]:
tokenize(source_code)

[Token(kind=<TokenKind.TYPE: 3>, value='int', start=4, end=6),
 Token(kind=<TokenKind.IDENTIFIER: 1>, value='main', start=9, end=12),
 Token(kind=<TokenKind.SYMBOL: 4>, value='(', start=10, end=10),
 Token(kind=<TokenKind.IDENTIFIER: 1>, value='args', start=14, end=17),
 Token(kind=<TokenKind.SYMBOL: 4>, value=')', start=15, end=15),
 Token(kind=<TokenKind.SYMBOL: 4>, value='{', start=17, end=17),
 Token(kind=<TokenKind.TYPE: 3>, value='int', start=25, end=27),
 Token(kind=<TokenKind.OPERATOR: 5>, value='*', start=26, end=26),
 Token(kind=<TokenKind.IDENTIFIER: 1>, value='x', start=28, end=28),
 Token(kind=<TokenKind.OPERATOR: 5>, value='=', start=30, end=30),
 Token(kind=<TokenKind.NUMBER: 7>, value='42', start=33, end=34),
 Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=34, end=34),
 Token(kind=<TokenKind.IDENTIFIER: 1>, value='printf', start=85, end=90),
 Token(kind=<TokenKind.SYMBOL: 4>, value='(', start=86, end=86),
 Token(kind=<TokenKind.STRING_LITERAL: 6>, value='Hello, Worl

In [11]:
bad_code = """
int main() {
    int x = 42;
    /*
    multiline comment
    */
    printf(Hello, World!");
    return 0;
}
"""

In [12]:
tokenize(bad_code)

ValueError: Tokenization Error:
Unexpected token at line 7, column 25:

    multiline comment
    */
    printf(Hello, World!");
                        ^
    return 0;
}
Expected one of whitespace, comment, keywords, keywords, (one of {, }, (, ), ;, ,, [, ], ., -> @ create_token), (one of +, -, *, /, %, =, ==, !=, <, >, <=, >=, &&, ||, !, &, |, ^, <<, >> @ create_token), number, string literal, identifier.


In [13]:
tokenizer

Parser(one of whitespace, comment, keywords, keywords, (one of {, }, (, ), ;, ,, [, ], ., -> @ create_token), (one of +, -, *, /, %, =, ==, !=, <, >, <=, >=, &&, ||, !, &, |, ^, <<, >> @ create_token), number, string literal, identifier)