In [1]:
import lark

In [2]:
type_map = {"integer": "int", "real": "float", "boolean": "bool", "char": "char"}
relop_map = {"=": "==", "<>": "!=", "<": "<", "<=": "<=", ">": ">", ">=": ">="}
addop_map = {"+": "+", "-": "-", "or": "||"}
mulop_map = {"*": "*", "/": "/", "div": "/", "mod": "%", "and": "&&"}
assignop_map = {":=": "="}
uminus_map = {"-": "-"}

In [3]:
test_rule=r"""
id                     : IDENTIFIER_TOKEN
IDENTIFIER_TOKEN : /[a-zA-Z_][a-zA-Z0-9_]*/
"""
type_map={
    "integer":"int",
    "real":"float",
    "boolean":"bool",
    "char":"char"
}
parser=lark.Lark(test_rule, start="id")
test_str="hello"
tree=parser.parse(test_str)

In [4]:
def visit_id(self, node):
    return [node.children[0].value]

In [5]:
def visit_num(node):
    tokens=[]
    tokens.append(node.children[0].value)
    return tokens

In [6]:
test_code="214.4"
with open("mp2c/MiniPascal.lark") as f:
    grammar=f.read()
parser=lark.Lark(grammar, start="num")

In [7]:
parser.parse(test_code)
def visit_optional_fraction(node):
    return node.children[0].value

def visit_num(node):
    tokens=[]
    for child in node.children:
        # check if the child is a token
        if isinstance(child, lark.lexer.Token):
            tokens.append(child.value)
        elif child.data=="optional_fraction":
            optional_fraction_part=visit_optional_fraction(child)
            tokens[-1]+="."
            tokens[-1]+=optional_fraction_part
        
    return tokens


visit_num(parser.parse(test_code))

['214.4']

In [8]:
def visit_period(node):
    periods=[]
    current_period=[]
    for children in node.children:
        if isinstance(children, lark.lexer.Token):
            current_period.append(int(children.value))
        elif children.data == "period":
            current_period=visit_period(children)
            periods.append(current_period)
            current_period=[]
    periods.append(current_period)
    return periods

parser=lark.Lark(grammar, start="period")
test_code="1..2,4..5"
visit_period(parser.parse(test_code))

[[[1, 2]], [4, 5]]

In [9]:
def visit_basic_type(node):
    return node.children[0].value
parser=lark.Lark(grammar, start="basic_type")
test_code="integer"
visit_basic_type(parser.parse(test_code))

'integer'

In [10]:
def visit_id(node):
    return node.children[0].value
def visit_idlist(node):
    ids = []
    for child in node.children:
        if child.data == "id":
            ids.append(visit_id(child))
        elif child.data == "idlist":
            ids.extend(visit_idlist(child))
    return ids

parser=lark.Lark(grammar, start="idlist")
test_code="a,b,c"
visit_idlist(parser.parse(test_code))

['a', 'b', 'c']

In [11]:
def visit_basic_type(node):
    return type_map[node.children[0].value]


def visit_type(node):
    type = {"basic_type": None, "is_array": False, "period": None}
    for child in node.children:
        if child.data == "basic_type":
            type["basic_type"] = visit_basic_type(child)
        elif child.data == "period":
            type["period"] = visit_period(child)
            type["is_array"] = True
    return type

def visit_period(node):
    periods = []
    current_period = []
    for children in node.children:
        if isinstance(children, lark.lexer.Token):
            current_period.append(int(children.value))
        elif children.data == "period":
            current_period = visit_period(children)
            periods.append(current_period)
            current_period = []
    periods.append(current_period)
    return periods
parser=lark.Lark(grammar, start="type")
test_code="array [1..2] of integer"
visit_type(parser.parse(test_code))

{'basic_type': 'int', 'is_array': True, 'period': [[1, 2]]}

In [12]:
def visit_value_parameter(node):
    ids = []
    type = None
    for child in node.children:
        if child.data == "idlist":
            ids = visit_idlist(child)
        elif child.data == "basic_type":
            type = visit_basic_type(child)
    return {"ids": ids, "type": type}

def visit_var_parameter(node):
    tokens=[]
    value_parameter=visit_value_parameter(node.children[0])
    tokens.append(value_parameter["type"])
    first = True
    for id in value_parameter["ids"]:
        if first:
            tokens.append(id)
            first = False
        else:
            tokens.append(",")
            tokens.append(id)
    tokens.append(";")
    return tokens

parser=lark.Lark(grammar, start="var_parameter")
test_code="var a,b:integer"
tokens_list=visit_var_parameter(parser.parse(test_code))
"   ".join(tokens_list)

'int   a   ,   b   ;'

In [13]:
type_map={
    "integer":"int",
    "real":"float",
    "boolean":"bool",
    "char":"char"
}
def visit_parameter(node):
    tokens=[]
    for child in node.children:
        if child.data == "var_parameter":
            return visit_var_parameter(child)
        elif child.data=="value_parameter":
            value_parameter=visit_value_parameter(child)
            tokens.append(value_parameter["type"])
            first = True
            for id in value_parameter["ids"]:
                if first:
                    tokens.append(id)
                    first = False
                else:
                    tokens.append(",")
                    tokens.append(id)
    return tokens
def visit_value_parameter(node):
    ids = []
    type = None
    for child in node.children:
        if child.data == "idlist":
            ids = visit_idlist(child)
        elif child.data == "basic_type":
            type = visit_basic_type(child)
    return {"ids": ids, "type": type}

def visit_var_parameter(node):
    tokens=[]
    value_parameter=visit_value_parameter(node.children[0])
    first = True
    for id in value_parameter["ids"]:
        if first:
            first = False
        else:
            tokens.append(",")
        tokens.append(value_parameter["type"])
        tokens.append(id)
    return tokens

def visit_parameter_list(node):
    tokens=[]
    first =True
    for child in node.children:
        assert child.data == "parameter"
        if first:
            first = False
        else:
            tokens.append(",")
        tokens.extend(visit_parameter(child))
    return tokens

parser=lark.Lark(grammar, start="parameter_list")
test_code="var a,b:integer; c:real"
visit_parameter_list(parser.parse(test_code))

['int', 'a', ',', 'int', 'b', ',', 'float', 'c']

In [14]:
def visit_formal_parameter(node):
    tokens = []
    tokens.append("(")
    parameter_list = visit_parameter_list(node.children[0])
    tokens.extend(parameter_list)
    tokens.append(")")
    return tokens
parser=lark.Lark(grammar, start="formal_parameter")
test_code="(var a,b:integer; c:real)"
tokens=visit_formal_parameter(parser.parse(test_code))
for token in tokens:
    print(token)

(
int
a
,
int
b
,
float
c
)


In [15]:
def visit_subprogram_head(node):
    tokens = []
    basic_type = None
    id = None
    formal_parameter = None
    for child in node.children:
        if child.data == "basic_type":
            basic_type = visit_basic_type(child)
        elif child.data == "id":
            id = visit_id(child)
        elif child.data == "formal_parameter":
            formal_parameter = visit_formal_parameter(child)
    if basic_type:
        tokens.append(basic_type)
    else:
        tokens.append("void")
    tokens.append(id)
    tokens.extend(formal_parameter)
    return tokens

parser=lark.Lark(grammar, start="subprogram_head")
test_code="function f(var a,b:integer; c:real):integer"
tokens=visit_subprogram_head(parser.parse(test_code))
for token in tokens:
    print(token)
    


int
f
(
int
a
,
int
b
,
float
c
)


In [16]:
def visit_id_varpart(node):
    tokens = []
    for child in node.children:
        if child.data == "empty":
            return tokens
        elif child.data == "expression_list":
            tokens.append("[")
            expression_list = visit_expression_list(child)
            tokens.extend(expression_list)
            tokens.append("]")
    return tokens

def visit_variable(node):
    tokens = []
    for child in node.children:
        if child.data == "id":
            tokens.append(visit_id(child))
        elif child.data == "expression":
            tokens.append("[")
            expression_tokens = visit_expression(child)
            tokens.extend(expression_tokens)
            tokens.append("]")
    return tokens

def visit_variable_list(node):
    tokens=[]
    first=True
    for child in node.children:
        if first:
            first=False
        else:
            tokens.append(",")
        tokens.extend(visit_variable(child))
    return tokens


def visit_func_id(node):
    tokens = []
    for child in node.children:
        id=visit_id(child)
    return tokens


def visit_factor(node):
    tokens = []
    for child in node.children:
        if isinstance(child, lark.lexer.Token):
            token_type = child.type
            token_value = child.value
            if token_type == "NOT":
                tokens.append("!")
            elif token_type == "UMINUS":
                tokens.append(uminus_map[token_value])
        elif child.data == "num":
            num_token = visit_num(child)
            tokens.extend(num_token)
        elif child.data == "id":
            id_token = visit_id(child)
            tokens.extend(id_token)
        elif child.data == "expression":
            tokens.append("(")
            expression_token = visit_expression(child)
            tokens.extend(expression_token)
            tokens.append(")")
        elif child.data == "factor":
            factor_token = visit_factor(child)
            tokens.extend(factor_token)
        elif child.data == "func_id":
            func_id_token = visit_func_id(child)
            tokens.extend(func_id_token)
        elif child.data == "expression_list":
            tokens.append("(")
            expression_list_token = visit_expression_list(child)
            tokens.extend(expression_list_token)
            tokens.append(")")
        elif child.data == "variable":
            variable_token = visit_variable(child)
            tokens.extend(variable_token)
        else: 
            raise Exception("Unknown factor child data: {}".format(child.data))
    return tokens


def visit_term(node):
    tokens = []
    for child in node.children:
        if isinstance(child, lark.lexer.Token):
            tokens.append(mulop_map[child.value])
        elif child.data == "factor":
            factor_token = visit_factor(child)
            tokens.extend(factor_token)
        elif child.data == "term":
            term_token = visit_term(child)
            tokens.extend(term_token)
    return tokens


def visit_simple_expression(node):
    tokens = []
    for child in node.children:
        if isinstance(child, lark.lexer.Token):
            tokens.append(addop_map[child.value])
        elif child.data == "term":
            term_token = visit_term(child)
            tokens.extend(term_token)
        elif child.data == "simple_expression":
            simple_expression_token = visit_simple_expression(child)
            tokens.extend(simple_expression_token)
    return tokens


def visit_expression(node):
    tokens = []
    for child in node.children:
        if isinstance(child, lark.lexer.Token):
            tokens.append(relop_map[child.value])
        elif child.data == "simple_expression":
            simple_expression_token = visit_simple_expression(child)
            tokens.extend(simple_expression_token)
        elif child.data == "expression":
            expression_token = visit_expression(child)
            tokens.extend(expression_token)
        else:
            raise Exception("Unknown expression child data: {}".format(child.data))
    return tokens


def visit_expression_list(node):
    tokens = []
    first = True
    for child in node.children:
        if first:
            first = False
        else:
            tokens.append(",")
        expression_tokens=visit_expression(child)
        tokens.extend(expression_tokens)
    return tokens

parser=lark.Lark(grammar, start="expression_list")
test_code="1<2,3>4"
tokens=visit_expression_list(parser.parse(test_code))
for token in tokens:
    print(token)

1
<
2
,
3
>
4


In [17]:
def visit_assign_statement(node):
    tokens = []
    for child in node.children:
        if isinstance(child, lark.lexer.Token):
            tokens.append(assignop_map[child.value])
        elif child.data == "expression":
            expression_tokens = visit_expression(child)
            tokens.extend(expression_tokens)
        elif child.data == "variable":
            variable_tokens = visit_variable(child)
            tokens.extend(variable_tokens)
        else:
            raise Exception(
                "Unknown assignment_statement child data: {}".format(child.data)
            )
    return tokens


def visit_if_else_statement(node):
    tokens = []
    for child in node.children:
        if child.data == "expression":
            tokens.append("if")
            tokens.append("(")
            expression_tokens = visit_expression(child)
            tokens.extend(expression_tokens)
            tokens.append(")")
        elif child.data == "statement":
            tokens.append("{")
            statement_tokens = visit_statement(child)
            tokens.extend(statement_tokens)
            tokens.append("}")
        elif child.data == "else_part":
            tokens.apend("{")
            else_part_tokens = visit_else_part(child)
            tokens.extend(else_part_tokens)
            tokens.append("}")
        else:
            raise Exception(
                "Unknown if_else_statement child data: {}".format(child.data)
            )
    return tokens


def visit_else_part(node):
    tokens = []
    for child in node.children:
        if child.data == "empyt":
            return tokens
        elif child.data == "statement":
            tokens.append("else")
            tokens.append("{")
            statement_tokens = visit_statement(child)
            tokens.extend(statement_tokens)
            tokens.append("}")
        else:
            raise Exception("Unknown else_part child data: {}".format(child.data))
    return tokens


def visit_procedure_call(node):
    tokens = []
    for child in node.children:
        if child.data == "id":
            tokens.append(visit_id(child))
        elif child.data == "expression_list":
            tokens.append("(")
            expression_list_tokens = visit_expression_list(child)
            tokens.extend(expression_list_tokens)
            tokens.append(")")
        else:
            raise Exception("Unknown procedure_call child data: {}".format(child.data))
    return tokens

def visit_statement_list(node):
    tokens = []
    for child in node.children:
        assert child.data == "statement"
        statement_tokens = visit_statement(child)
        tokens.extend(statement_tokens)
        tokens.append(";")
    return tokens

def visit_compound_statement(node):
    tokens = []
    tokens.append("{")
    assert node.children[0].data == "statement_list"
    statement_list_tokens = visit_statement_list(node.children[0])
    tokens.extend(statement_list_tokens)
    
    tokens.append("}")
    return tokens

def visit_statement(node):
    tokens = []
    for child in node.children:
        if child.data == "assign_statement":
            assign_statement_tokens = visit_assign_statement(child)
            tokens.extend(assign_statement_tokens)
        elif child.data == "if_else_statement":
            if_else_statement_tokens = visit_if_else_statement(child)
            tokens.extend(if_else_statement_tokens)
        elif child.data == "procedure_call":
            procedure_call_tokens = visit_procedure_call(child)
            tokens.extend(procedure_call_tokens)
        elif child.data == "compound_statement":
            compound_statement_tokens = visit_compound_statement(child)
            tokens.extend(compound_statement_tokens)
        else:
            raise Exception("Unknown statement child data: {}".format(child.data))
    return tokens
parser=lark.Lark(grammar, start="statement")
test_code="begin a:=1; b:=2 end"
tokens=visit_statement(parser.parse(test_code))

In [18]:
def visit_const_value(node):
    tokens = []
    for child in node.children:
        if isinstance(child, lark.lexer.Token):
            if child.type == "PLUS":
                tokens.append("+")
            elif child.type == "MINUS":
                tokens.append("-")
            elif child.type == "LETTER":
                tokens.append("'" + child.value + "'")
            else:
                raise Exception("Unknown const_value child type: {}".format(child.type))

        elif child.data == "num":
            num_tokens = visit_num(child)
            tokens.extend(num_tokens)
        else:
            raise Exception("Unknown const_value child data: {}".format(child.data))

    return tokens


def visit_const_declaration(node):
    tokens = []
    for child in node.children:
        if child.data == "id":
            tokens.append("const")
            tokens.append("int")
            tokens.append(visit_id(child))
        elif child.data == "const_value":
            tokens.append("=")
            tokens.extend(visit_const_value(child))
            tokens.append(";")
        elif child.data == "const_declaration":
            tokens.extend(visit_const_declaration(child))
        else:
            raise Exception(
                "Unknown const_declaration child data: {}".format(child.data)
            )
    return tokens


def visit_const_declarations(node):
    tokens=[]
    for child in node.children:
        if child.data == "const_declaration":
            tokens.extend(visit_const_declaration(child))
        elif child.data == "empty":
            return tokens
        else:
            raise Exception(
                "Unknown const_declarations child data: {}".format(child.data)
            )
    return tokens


def visit_var_declaration(node):
    tokens = []
    idlist = []
    id_type = None
    for child in node.children:
        if child.data == "idlist":
            idlist = visit_idlist(child)
        elif child.data == "type":
            id_type = visit_type(child)
        elif child.data == "var_declaration":
            tokens.extend(visit_var_declaration(child))
        else:
            raise Exception("Unknown var_declaration child data: {}".format(child.data))

    for id in idlist:
        tokens.append(id_type["basic_type"])
        tokens.append(id)
        if id_type["is_array"]:
            for period in id_type["period"]:
                tokens.append("[")
                tokens.append(str(period[0]))
                tokens.append("]")
                tokens.append("[")
                tokens.append(str(period[1]))
                tokens.append("]")
        tokens.append(";")
    return tokens


def visit_var_declarations(node):
    tokens = []
    for child in node.children:
        if child.data == "var_declaration":
            tokens.extend(visit_var_declaration(child))
        elif child.data == "empty":
            return tokens
        else:
            raise Exception(
                "Unknown var_declarations child data: {}".format(child.data)
            )
    return tokens


parser = lark.Lark(grammar, start="const_declarations")
test_code = "const a = 1; b = 2;"
tokens=visit_const_declarations(parser.parse(test_code))
for token in tokens:
    print(token)

const
int
a
=
1
;
const
int
b
=
2
;


In [19]:
parser=lark.Lark(grammar, start="var_declarations")
test_code="var a:integer; b:real;"
tokens=visit_var_declarations(parser.parse(test_code))
for token in tokens:
    print(token)

int
a
;
float
b
;


In [21]:
from mp2c import MP2CParser
parser=MP2CParser()
with open("example.pas") as f:
    code=f.read()
tree,tokens,result_string=parser(code)

In [22]:
print(result_string)

#include "mp2c.h"
int gcd(int a, int b);
int x;
int y;
int main() {
  scanf();
  printf();
  ;
}
int gcd(int a, int b) {
  int _gcd;
  if (b == 0) {
    _gcd = a
  } else {
    _gcd = gcd(b, a % b)
  };
  ;
  return _gcd;
}


In [None]:
print(result_string)

#include "mp2c.h"
int gcd(int a, int b);
int x;
int y;
int main() {
  scanf();
  printf();
  ;
}
int gcd(int a, int b) {
  int _gcd;
  if (b == 0) {
    _gcd = a
  } else {
    _gcd = _gcd(b, a % b)
  };
  ;
  return _gcd;
}


In [None]:
tokens

['#include "mp2c.h"',
 'int',
 'gcd',
 '(',
 'int',
 'a',
 ',',
 'int',
 'b',
 ')',
 ';',
 'int',
 'x',
 ';',
 'int',
 'y',
 ';',
 'int main()',
 '{',
 'scanf',
 '(',
 ')',
 ';',
 'printf',
 '(',
 ')',
 ';',
 ';',
 '}',
 'int',
 'gcd',
 '(',
 'int',
 'a',
 ',',
 'int',
 'b',
 ')',
 '{',
 'int',
 '_gcd',
 ';',
 'if',
 '(',
 'b',
 '==',
 '0',
 ')',
 '{',
 '_gcd',
 '=',
 'a',
 '}',
 'else',
 '{',
 '_gcd',
 '=',
 '_gcd',
 '(',
 'b',
 ',',
 'a',
 '%',
 'b',
 ')',
 '}',
 ';',
 ';',
 'return',
 '_gcd',
 ';',
 '}']

In [None]:
print(tree.pretty())

programstruct
  program_head
    id	example
    idlist
      idlist
        id	input
      id	output
  program_body
    const_declarations
      empty	


    var_declarations
      var_declaration
        idlist
          idlist
            id	x
          id	y
        type
          basic_type	integer
    subprogram_declarations
      subprogram_declarations
        empty
      subprogram
        subprogram_head
          id	gcd
          formal_parameter
            parameter_list
              parameter
                value_parameter
                  idlist
                    idlist
                      id	a
                    id	b
                  basic_type	integer
          basic_type	integer
        subprogram_body
          const_declarations
            empty
          var_declarations
            empty	

          compound_statement
            statement_list
              statement
                if_else_statement
                  expression
                    simple

In [None]:
hellow_world_code=r"""
program Answer;
{Answer to the Ultimate Question of Life, the Universe, and Everything}

begin
  X := 42; {Set X to 42}
  writeln(X) {Write X to the screen}
end.
"""

from mp2c import preprocess

In [None]:
x=preprocess(hellow_world_code)

In [None]:
parser=MP2CParser()
tree,tokens,result_string=parser(x)

In [None]:
print(tree.pretty())

programstruct
  program_head
    id	answer
  program_body
    const_declarations
      empty
    var_declarations
      empty
    subprogram_declarations
      empty	



    compound_statement
      statement_list
        statement
          assign_statement
            variable
              id	x
              id_varpart
                empty
            :=
            expression
              simple_expression
                term
                  factor
                    num	42
        statement
          procedure_call
            id	writeln
            expression_list
              expression
                simple_expression
                  term
                    factor
                      variable
                        id	x
                        id_varpart
                          empty



In [None]:
# write tree.pretty() to file
with open("tree.txt", "w") as f:
    f.write(tree.pretty())

In [None]:
tokens

['#include "mp2c.h"',
 'int main()',
 '{',
 'x',
 '=',
 '42',
 ';',
 'writeln',
 '(',
 'x',
 ')',
 ';',
 '}']

In [None]:
print(result_string)

#include "mp2c.h"
int main() {
  x = 42;
  writeln(x);
}


In [None]:
with open("example.pas", "r") as f:
    code=f.read()
tree,tokens,result_string=parser(code)

In [None]:
print(result_string)

#include "mp2c.h"
int gcd(int a, int b);
int x;
int y;
int main() {
  scanf();
  printf();
  ;
}
int gcd(int a, int b) {
  int _gcd;
  if (b == 0) {
    _gcd = a
  } else {
    _gcd = _gcd(b, a % b)
  };
  ;
  return _gcd;
}


In [None]:
print(tree.pretty())

programstruct
  program_head
    id	example
    idlist
      idlist
        id	input
      id	output
  program_body
    const_declarations
      empty	


    var_declarations
      var_declaration
        idlist
          idlist
            id	x
          id	y
        type
          basic_type	integer
    subprogram_declarations
      subprogram_declarations
        empty
      subprogram
        subprogram_head
          id	gcd
          formal_parameter
            parameter_list
              parameter
                value_parameter
                  idlist
                    idlist
                      id	a
                    id	b
                  basic_type	integer
          basic_type	integer
        subprogram_body
          const_declarations
            empty
          var_declarations
            empty	

          compound_statement
            statement_list
              statement
                if_else_statement
                  expression
                    simple

In [None]:
test_code="gcd := gcd(b, a mod b);"
parser=lark.Lark(grammar, start="assign_statement")
x=parser.parse(test_code)

UnexpectedCharacters: No terminal matches ';' in the current parser context, at line 1 col 23

gcd := gcd(b, a mod b);
                      ^
Expected one of: 
	* MULOP
	* ADDOP
	* RELOP
