In [1]:
from pycco.tokenizers import *
from pycco.ast import *
from pycco.parsers import *
from inspect import signature

In [14]:
"""Uses Parsers to transform a token stream into an AST"""
from typing import List, Optional
from pycco.parser import match, Parser, parser, S, T, U, ParserResult
from pycco.tokens import Token, TokenKind
from pycco import ast

# Define token matchers
type = match(TokenKind.TYPE)
name = match(TokenKind.IDENTIFIER)
open_paren = match(Token(TokenKind.SYMBOL, "("))
close_paren = match(Token(TokenKind.SYMBOL, ")"))
open_bracket = match(Token(TokenKind.SYMBOL, "{"))
close_bracket = match(Token(TokenKind.SYMBOL, "}"))
comma = match(Token(TokenKind.SYMBOL, ","))
semicolon = match(Token(TokenKind.SYMBOL, ";"))
eq = match(Token(TokenKind.OPERATOR, "="))
star = match(Token(TokenKind.OPERATOR, "*"))

# Grammar Components with Map Functions

# Variable Declaration
def map_variable_decl(tokens: List[Token]) -> ast.VariableDecl:
    """
    Map tokens to a VariableDecl.
    Assumes tokens are in the order: type [*] identifier ;
    """
    type = ast.Type(tokens[0].value).set_tokens([tokens[0]])
    pointer = semicolon = False
    if len(tokens) > 2:
        pointer = tokens[1].value == "*"
        semicolon = tokens[-1].value == ';'
    
    name_token = next(token for token in tokens[1:] if token.kind == TokenKind.IDENTIFIER)
    name = ast.Ident(name_token.value).set_tokens([name_token])
    type.pointer = pointer
    return ast.VariableDecl(type=type, name=name, semicolon=semicolon).set_tokens(tokens)


def map_number(token: Token)->ast.Number:
    type = ast.Type('int')
    if '.' in token.value:
        type = ast.Type('float')
    return ast.Number(token.value, type).set_tokens([token])
    
number = match(Token(TokenKind.NUMBER)) @ map_number

def map_string(token: Token)->ast.StringLiteral:
    return ast.StringLiteral(token.value).set_tokens([token])
    
string = match(Token(TokenKind.STRING_LITERAL)) @ map_string

variable_ = type + ~star + name
variable = variable_ @ map_variable_decl
# type with optional * with an identifier and optional ;
variable_decl = (variable_ + semicolon) @ map_variable_decl

args = open_paren >> variable.sep_by(Token(TokenKind.SYMBOL, ',')) << close_paren

@parser
def statement(stream: List[Token], index: int)-> ParserResult[ast.Statement]:
    return (return_ | variable_decl).parse_fn(stream, index) 

@parser
def expression(stream: List[Token], index: int)-> ParserResult[ast.Expression]:
    return (number | string | expression).parse_fn(stream, index) 

def map_return(expr: ast.Expression)->ast.Return:
    return ast.Return(expr)

return_ = match(Token(TokenKind.KEYWORD, 'return')) >> expression << semicolon
return_ @= map_return


# need to declare in sta
function = variable + args
function += open_bracket >> statement.until(close_bracket) << close_bracket

In [15]:
expression(tokenize('2 ')).result

Number(value='2', type=Type(name='int', pointer=False))

In [16]:
close_bracket

Parser("}")

In [17]:
tokenize('return 2;}')

[Token(kind=<TokenKind.KEYWORD: 2>, value='return', start=6, end=11),
 Token(kind=<TokenKind.NUMBER: 7>, value='2', start=8, end=8),
 Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=9, end=9),
 Token(kind=<TokenKind.SYMBOL: 4>, value='}', start=10, end=10),
 Token(kind=<TokenKind.EOF: 10>, value=None, start=10, end=10)]

In [18]:
(statement.until(close_bracket)@map_return)(tokenize('return 2;}')).result

Return(value=[Token(kind=<TokenKind.KEYWORD: 2>, value='return', start=6, end=11), Token(kind=<TokenKind.NUMBER: 7>, value='2', start=8, end=8), Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=9, end=9)])

In [20]:
statement(tokenize('return 2;'))

ParserResult(index=3, result=return 2;, description=None)

In [12]:
(open_bracket >> statement.until(close_bracket) << close_bracket)(tokenize('''
{
    return 2;
}''')).result

[Token(kind=<TokenKind.KEYWORD: 2>, value='return', start=13, end=18),
 Token(kind=<TokenKind.NUMBER: 7>, value='2', start=15, end=15),
 Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=16, end=16)]

In [None]:
# Binary Expression
def map_binary_expression(tokens: List[Token]) -> ast.BinaryExpression:
    """
    Map tokens to a BinaryExpression.
    Assumes tokens are in the order: left operator right
    """
    left = tokens[0]
    operator = tokens[1].value
    right = tokens[2]
    return ast.BinaryExpression(
        left=ast.Ident(left.value) if left.kind == TokenKind.IDENTIFIER else left,
        operator=operator,
        right=ast.Ident(right.value) if right.kind == TokenKind.IDENTIFIER else right,
    )

expression = Parser()  # Forward declaration for expressions
binary_expression = (expression + TokenKind.OPERATOR + expression) @ map_binary_expression

# If Statement
def map_if_statement(tokens: List[Token]) -> ast.IfStatement:
    """
    Map tokens to an IfStatement.
    Assumes tokens are in the order: if (condition) { then_branch } [else { else_branch }]
    """
    condition = tokens[0]  # First token after `if` and `(`
    then_branch = tokens[1]  # Tokens within the `{}` following the condition
    else_branch = tokens[2] if len(tokens) > 2 else None  # Optional `else` branch
    return ast.IfStatement(condition=condition, then_branch=then_branch, else_branch=else_branch)

if_statement = (
    match(Token(TokenKind.KEYWORD, "if"))
    >> open_paren
    >> expression
    << close_paren
    + open_bracket
    >> variable_decl.until(close_bracket)
    << close_bracket
    + (
        match(Token(TokenKind.KEYWORD, "else"))
        >> open_bracket
        >> variable_decl.until(close_bracket)
        << close_bracket
    ).optional()
) @ map_if_statement

# While Loop
def map_while_loop(tokens: List[Token]) -> ast.WhileLoop:
    """
    Map tokens to a WhileLoop.
    Assumes tokens are in the order: while (condition) { body }
    """
    condition = tokens[0]  # First token after `while` and `(`
    body = tokens[1]  # Tokens within the `{}` following the condition
    return ast.WhileLoop(condition=condition, body=body)

while_loop = (
    match(Token(TokenKind.KEYWORD, "while"))
    >> open_paren
    >> expression
    << close_paren
    + open_bracket
    >> variable_decl.until(close_bracket)
    << close_bracket
) @ map_while_loop

# Function
def map_function(tokens: List[Token]) -> ast.Function:
    """
    Map tokens to a Function.
    Assumes tokens are in the order: return_type name(args) { body }
    """
    return_type = ast.Type(tokens[0].value)
    name = ast.Ident(tokens[1].value)
    args = [map_variable_decl(arg) for arg in tokens[2]]  # Tokens within `()`
    body = tokens[3]  # Tokens within `{}`
    return ast.Function(
        var=ast.VariableDecl(type=return_type, name=name),
        args=args,
        body=body,
    )

# Forward declaration for statement
statement = Parser()

# Function parser
def map_function(tokens: List[Token]) -> ast.Function:
    """
    Map tokens to a Function.
    Assumes tokens are in the order: return_type name(args) { body }
    """
    return_type = ast.Type(tokens[0].value)
    name = ast.Ident(tokens[1].value)
    args = [map_variable_decl(arg) for arg in tokens[2]]  # Tokens within `()`
    body = tokens[3]  # Tokens within `{}`
    return ast.Function(
        var=ast.VariableDecl(type=return_type, name=name),
        args=args,
        body=body,
    )

function = (
    type
    + name
    + (open_paren >> variable.until(comma.optional() + close_paren))
    << close_paren
    + open_bracket
    >> statement.until(close_bracket)
    << close_bracket
) @ map_function

# Define the statement parser
statement.define(variable_decl | if_statement | while_loop | binary_expression)


# Update the `_expression` placeholder to include binary_expression
expression.define(binary_expression | variable)


# Entry Point for Parsing a Program
program = function.until(TokenKind.EOF)

In [12]:
source_code = """
int* main(args) {
    int* x = 42;
        /*
    multiline comment
    */
    printf("Hello, World!");
    return 0;
}
"""

In [13]:
tokens = tokenize(source_code)

In [12]:
variable = type + star.optional() + name
variable_decl = (variable + semicolon.optional()) @ map_variable_decl

In [13]:
variable_decl(tokenize('int main'))

AttributeError: 'str' object has no attribute 'value'

In [7]:
variable_decl(tokenize('int main'))

ParserResult(index=2, result=[Token(kind=<TokenKind.TYPE: 3>, value='int', start=3, end=5), Token(kind=<TokenKind.IDENTIFIER: 1>, value='main', start=8, end=11)], description=((("TYPE" + "*"?) + "IDENTIFIER") + ";"?))

In [9]:
parse(tokens)

ParseError: Parsing failed due to an unexpected error.
At token index 0:
int main ( args )
^
Token causing issue: int (kind: TYPE)

In [3]:
r=(variable@variable_map)(tokenize('int *main'))

NameError: name 'variable_map' is not defined

In [6]:
r.result.tokens

[Token(kind=<TokenKind.TYPE: 3>, value='int', start=3, end=5),
 Token(kind=<TokenKind.OPERATOR: 5>, value='*', start=5, end=5),
 Token(kind=<TokenKind.IDENTIFIER: 1>, value='main', start=9, end=12)]

In [4]:
(type + name)(tokenizer(('int main')))

ParserResult(index=-1, result=None, description=("TYPE" + "IDENTIFIER"))

In [6]:
(type + name + (open_paren >> variable.until(comma.optional() + close_paren)))(source_code)

ParserResult(index=-1, result=None, description=(("TYPE" + "IDENTIFIER") + ("(" >> ((("TYPE" + "*"?) + "IDENTIFIER") until (","? + ")")))))

In [3]:
type(tokenize('int'))

ParserResult(index=1, result=int, description="TYPE")

In [4]:
variable_decl(tokenize('int* x;'))

ParserResult(index=4, result=[Token(kind=<TokenKind.TYPE: 3>, value='int', start=3, end=5), Token(kind=<TokenKind.OPERATOR: 5>, value='*', start=4, end=4), Token(kind=<TokenKind.IDENTIFIER: 1>, value='x', start=6, end=6), Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=7, end=7)], description=((("TYPE" + "*"?) + "IDENTIFIER") + ";"))

In [9]:
function(tokenize(source_code))

ParserResult(index=21, result=[Token(kind=<TokenKind.TYPE: 3>, value='int', start=25, end=27), Token(kind=<TokenKind.OPERATOR: 5>, value='*', start=26, end=26), Token(kind=<TokenKind.IDENTIFIER: 1>, value='x', start=28, end=28), Token(kind=<TokenKind.OPERATOR: 5>, value='=', start=30, end=30), Token(kind=<TokenKind.NUMBER: 7>, value='42', start=33, end=34), Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=34, end=34), Token(kind=<TokenKind.IDENTIFIER: 1>, value='printf', start=85, end=90), Token(kind=<TokenKind.SYMBOL: 4>, value='(', start=86, end=86), Token(kind=<TokenKind.STRING_LITERAL: 6>, value='Hello, World!', start=101, end=113), Token(kind=<TokenKind.SYMBOL: 4>, value=')', start=102, end=102), Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=103, end=103), Token(kind=<TokenKind.KEYWORD: 2>, value='return', start=114, end=119), Token(kind=<TokenKind.NUMBER: 7>, value='0', start=116, end=116), Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=117, end=117)], description=((((

In [10]:
tokenize(source_code)

[Token(kind=<TokenKind.TYPE: 3>, value='int', start=4, end=6),
 Token(kind=<TokenKind.IDENTIFIER: 1>, value='main', start=9, end=12),
 Token(kind=<TokenKind.SYMBOL: 4>, value='(', start=10, end=10),
 Token(kind=<TokenKind.IDENTIFIER: 1>, value='args', start=14, end=17),
 Token(kind=<TokenKind.SYMBOL: 4>, value=')', start=15, end=15),
 Token(kind=<TokenKind.SYMBOL: 4>, value='{', start=17, end=17),
 Token(kind=<TokenKind.TYPE: 3>, value='int', start=25, end=27),
 Token(kind=<TokenKind.OPERATOR: 5>, value='*', start=26, end=26),
 Token(kind=<TokenKind.IDENTIFIER: 1>, value='x', start=28, end=28),
 Token(kind=<TokenKind.OPERATOR: 5>, value='=', start=30, end=30),
 Token(kind=<TokenKind.NUMBER: 7>, value='42', start=33, end=34),
 Token(kind=<TokenKind.SYMBOL: 4>, value=';', start=34, end=34),
 Token(kind=<TokenKind.IDENTIFIER: 1>, value='printf', start=85, end=90),
 Token(kind=<TokenKind.SYMBOL: 4>, value='(', start=86, end=86),
 Token(kind=<TokenKind.STRING_LITERAL: 6>, value='Hello, Worl

In [11]:
bad_code = """
int main() {
    int x = 42;
    /*
    multiline comment
    */
    printf(Hello, World!");
    return 0;
}
"""

In [12]:
tokenize(bad_code)

ValueError: Tokenization Error:
Unexpected token at line 7, column 25:

    multiline comment
    */
    printf(Hello, World!");
                        ^
    return 0;
}
Expected one of whitespace, comment, keywords, keywords, (one of {, }, (, ), ;, ,, [, ], ., -> @ create_token), (one of +, -, *, /, %, =, ==, !=, <, >, <=, >=, &&, ||, !, &, |, ^, <<, >> @ create_token), number, string literal, identifier.


In [13]:
tokenizer

Parser(one of whitespace, comment, keywords, keywords, (one of {, }, (, ), ;, ,, [, ], ., -> @ create_token), (one of +, -, *, /, %, =, ==, !=, <, >, <=, >=, &&, ||, !, &, |, ^, <<, >> @ create_token), number, string literal, identifier)