src/parser/grammar.pg

# Copyright (C) 2007-2008, The Perl Foundation.
# $Id$

=begin overview

This is the rules portion of the grammar for the Rakudo compiler,
an implementation of Perl 6 on Parrot.  This grammar is modeled
after the STD.pm grammar that Larry Wall and others are developing,
available from L<http://svn.pugscode.org/pugs/src/perl6/STD.pm>.

Our ultimate goal is to have this grammar and STD.pm converge
with each other, to form an "official" Perl 6 grammar.  But
there's a lot to do between here and that goal.  For one, Parrot
doesn't yet have a rules engine that understands all of the
constructs that appear in STD.pm, such as protoregexes .
Another challenge is that the language specification itself
changes from time to time as the various implementations progress.
So, we can't just blindly copy STD.pm .

When adding a new construct or feature to this grammar,
be sure to look at STD.pm first to see how it achieves the
result.  If STD.pm's approach can be copied directly, do that.
If not, then try to get as close as possible (e.g., by using
STD.pm's names).  And yes, there are times when STD.pm may
adopt things done here.  But we want to keep them as close
as we can.

In each of the rules below, the special notation C<{*}>
marks a point in the rule where a corresponding method
from Perl6::Grammar::Actions (F<src/parser/actions.pl>)
is invoked.  These actions will then construct the abstract
syntax tree nodes as the source program is being parsed.

The C<#=> markers at the ends of lines look like comments,
but they're used to distinguish multiple C<{*}> actions
within a rule.  (This is how STD.pm is organized, also.)
The value following any C<#=> marker is passed as a
'key' argument to the action method invoked by C<{*}>
earlier in the line.

Rules with only one action need no #= comment.

=end overview

grammar Perl6::Grammar is PCT::Grammar;

##  Grammar entry point.  Sets the '$begin_compunit' global to
##  Bool 'True', calls MARK_STATEMENT_END, parses a <statement_block>,
##  and panics with "Syntax error" unless end of input was reached.
token TOP {
    {{ $P0 = get_hll_global ['Bool'], 'True'
       set_global '$begin_compunit', $P0 }}
    <.MARK_STATEMENT_END>
    <statement_block>
    [ $ || <panic: Syntax error> ]
    {*}
}


####  whitespace, comments, pod ####

##  The <ws> token is used to match "whitespace", which includes
##  things like spaces, comments, and pod comments.  It also
##  memoizes the last whitespace token matched into C<$!ws>,
##  and short circuits if we are at the same position as the
##  last ws token matched.

##  First alternative: a PIR assertion that succeeds when the saved
##  '$!ws' match ends at the current position.  Otherwise it stores
##  the current match object in '$!ws' and fails, so that the second
##  alternative performs the actual scan (not inside a word, then any
##  run of unspace, vertical whitespace, or <unv>).
token ws {
    ## short circuit
    [ <?{{ $P0 = get_global '$!ws'
           if null $P0 goto noshort
           $P1 = $P0.'to'()
           $P2 = match.'to'()
           if $P1 != $P2 goto noshort
           .return (1)
         noshort:
           set_global '$!ws', match
           .return (0)
      }}>
    | <!ww>
      [
      | <.unsp>
      | \v+
      | <.unv>
      ]*
    ]
}

##  "Unspace": a backslash that is immediately followed (lookahead)
##  by whitespace or '#', then any run of vertical whitespace or <unv>.
token unsp {
    \\ <.before [\s|'#']>
    [ \v | <.unv> ]*
}

##  unspacey: an optional <unsp>.
##  nofun: zero-width check that the next characters are not '(',
##  '.(' or a backslash.
token unspacey { <.unsp>? }
token nofun { <!before '(' | '.(' | '\\' > }

##  Tries, in order: a run of horizontal whitespace, a pod comment
##  starting at the beginning of a line, or a '#' comment running to
##  the end of the line.
token unv {
    || \h+
    || ^^ <.pod_comment>
    || '#' \N*
}

##  The <afterws> rule returns true if we're immediately after
##  a set of whitespace.

##  The closure returns 1 only when the current position equals the
##  'to' offset of the saved '$!ws' match AND differs from its 'from'
##  offset (i.e. the whitespace match was non-empty).  In every other
##  case control reaches 'end:' and the rule fails through <fail>.
##
##  The second comparison must be made against $P2, which at that
##  point holds the 'from' offset; comparing against $P1 would test
##  the position against the match object itself.
token afterws {
    ##  <?{ $¢ == $!ws_to != $!ws_from }>
    {{  $P0 = match.'to'()
        $P1 = get_global '$!ws'
        $P2 = $P1.'to'()
        if $P0 != $P2 goto end
        $P2 = $P1.'from'()
        if $P0 == $P2 goto end
        .return (1)
      end:
    }}
    <fail>
}

##  Matches a pod directive at the start of a line ('=' plus optional
##  unspace).  Three forms are recognized:
##    tagged -- 'begin <identifier>' up to an '=end' line that repeats
##              the same identifier;
##    anon   -- a bare 'begin' line up to the next '=end' line;
##    misc   -- any other directive, consumed to the end of the line.
##  Each form fires its keyed action, followed by one unkeyed action.
token pod_comment {
    ^^ '=' <.unsp>?
    [
    | begin \h+ <identifier> .*? \n
      '=' <.unsp>? 'end' \h+ $<identifier> >> \N*   {*}         #= tagged
    | begin \h* \n .*? \n
      '=' <.unsp>? 'end' >> \N*                     {*}         #= anon
    | \N*                                           {*}         #= misc
    ]
    {*}
}


##  A single apostrophe or hyphen character (used by <identifier> to
##  join identifier pieces).
token apostrophe {
    <[ ' \- ]>
}

##  An <ident>, optionally extended by further <ident> pieces that are
##  each joined with an <apostrophe> character.
token identifier {
    <.ident> [ <.apostrophe> <.ident> ]*
}


##  STD.pm doesn't have a statement_block rule -- we have one
##  to distinguish lists of statements that produce blocks
##  from those that don't.

##  Fires the 'open' action, parses a <statementlist>, then fires the
##  'close' action.
rule statement_block {
    {*}                                          #= open
    <statementlist>
    {*}                                          #= close
}


##  The pointy-block introducers: '->' or '<->'.
token lambda { '->' | '<->' }

##  A <block>, optionally preceded by a <lambda> and a <signature>.
token pblock { [ <lambda> <.ws> <signature> ]? <.ws> <block> {*} }

##  An expression followed by a <pblock> (used by the control
##  statements below for their condition-plus-body part).
token xblock { <EXPR> <.ws> <pblock> {*} }


##  Blocks can also have an implied statement end if the
##  closing brace is the last non-ws thing on the line.

##  A brace-delimited <statement_block>, optionally followed by a
##  <BLOCK_STATEMENT_END> (closing brace ending the line).
token block {
    '{' ~ '}' <statement_block>
    <.BLOCK_STATEMENT_END>?
    {*}
}

##  Matches optional horizontal whitespace, an optional <unv>, and a
##  newline, then calls MARK_STATEMENT_END to record the position.
token BLOCK_STATEMENT_END {
    [ \h* <.unv>? \n <.MARK_STATEMENT_END> ]
}

##  Zero or more statements, each of which must be followed by a
##  successful <eat_terminator>.
rule statementlist {
    [<statement><.eat_terminator> ]*
    {*}
}

##  The eat_terminator detects when we're at a valid
##  statement termination point.  A semicolon always acts as
##  a valid statement end, as does the presence of any expression
##  terminator.  The MARK_STATEMENT_END subrule is used by other
##  rules to indicate a valid statement end when a terminator
##  isn't present -- e.g., a closing '}' at the end of a line
##  for a <block>.

##  Matches an expression terminator: one of ';', ')', ']', '}', the
##  tokens '!!' or '-->', or a statement-modifier keyword that ends at
##  a word boundary and passes the <nofun> check.
token terminator { 
    |  <[ ; ) \] } ]> 
    | '!!' 
    | '-->' 
    | [ if | unless | while | until | for | given | when ] >> <.nofun>
}

##  Zero-width check that a <terminator> is next in the input.
token stdstopper {
    <?terminator>
}

##  Tried in order: consume a ';'; or succeed without consuming when a
##  <terminator> is next, when <AT_STATEMENT_END> holds, or at end of
##  input.  If none applies, panic.
token eat_terminator {
    || ';'
    || <?terminator>
    || <?AT_STATEMENT_END>
    || $
    || <.panic: "Statement not terminated properly">
}

##  Stores a clone of the current match 'to' position in the
##  '$!endstmt' global, then matches <ws>.
token MARK_STATEMENT_END {
    {{  $P0 = match.'to'()
        $P0 = clone $P0
        set_global '$!endstmt', $P0
    }}
    <.ws>
}

##  Zero-width assertion: true when the position saved in '$!endstmt'
##  equals the 'from' offset of the match saved in '$!ws'.
token AT_STATEMENT_END {
    <?{{ 
          $P0 = get_global '$!endstmt'
          $P1 = get_global '$!ws'
          $P2 = $P1.'from'()
          $I0 = iseq $P0, $P2
          .return ($I0)
    }}>
}
    

##  Parse a single statement, which may be either a bare block
##  or an expression.  Any statement termination is handled by
##  the calling rule.
##  Alternatives: a control statement; an expression optionally
##  followed by a loop modifier, or by a conditional modifier plus an
##  optional loop modifier; or a null statement (a ';' is next).
##  Afterwards the '$begin_compunit' global is set to Bool 'False'.
token statement {
    [
    | <statement_control> {*}                    #= control
    | <expr=EXPR> <.ws>
        [
        || <statement_mod_loop> {*}              #= mod_loop
        || <statement_mod_cond> <.ws>
           <statement_mod_loop>?
           {*}                                   #= mod_cond
        || {*}                                   #= expr
        ]
    | <?before ';'> {*}                          #= null
    ]
    {{ $P0 = get_hll_global ['Bool'], 'False'
       set_global '$begin_compunit', $P0 }}
}

##  Dispatches to one of the individual control-statement rules; the
##  action key names the rule that matched.
rule statement_control {
    | <if_statement> {*}                         #= if_statement
    | <unless_statement> {*}                     #= unless_statement
    | <repeat_statement> {*}                     #= repeat_statement
    | <while_statement> {*}                      #= while_statement
    | <given_statement> {*}                      #= given_statement
    | <when_statement> {*}                       #= when_statement
    | <default_statement> {*}                    #= default_statement
    | <loop_statement> {*}                       #= loop_statement
    | <for_statement> {*}                        #= for_statement
    | <use_statement> {*}                        #= use_statement
    | <begin_statement> {*}                      #= begin_statement
    | <end_statement> {*}                        #= end_statement
    | <catch_statement> {*}                      #= catch_statement
    | <control_statement> {*}                    #= control_statement
    | <no_statement> {*}                         #= no_statement
}

##  'if' (passing <nofun>) with its <xblock>, any number of 'elsif'
##  <xblock> clauses, and an optional 'else' <pblock>.
rule if_statement {
    $<sym>=[if<.nofun>]
    <xblock>
    [ 'elsif' <xblock> ]*
    [ 'else' <pblock> ]?
    {*}
}

##  'unless' (passing <nofun>) with its <xblock>.  The action fires
##  right after the <xblock>; a following 'else' triggers a panic.
rule unless_statement {
    $<sym>=[unless<.nofun>]
    <xblock> {*}
    [ <!before 'else'> || <.panic: "unless does not take \"else\" in Perl 6; please rewrite using \"if\""> ]
}

##  'repeat' loop in either form: the while/until condition before
##  the block, or the block followed by the while/until condition.
##  The keyword is captured as $<loop>.
rule repeat_statement {
    $<sym>=[repeat]
    [ $<loop>=[while|until] <EXPR> <block>
    | <block> $<loop>=[while|until] <EXPR>
    ]
    {*}
}

##  'while' or 'until' (passing <nofun>) followed by an <xblock>.
rule while_statement {
    $<sym>=[[while|until]<.nofun>]
    <xblock>
    {*}
}

##  'given' (passing <nofun>) followed by an <xblock>.
rule given_statement {
    $<sym>=[given<.nofun>]
    <xblock>
    {*}
}

##  'when' (passing <nofun>) followed by an expression and a plain
##  <block> (not an <xblock>).
rule when_statement {
    $<sym>=[when<.nofun>]
    <EXPR> <block>
    {*}
}

##  'default' followed by a <block>.
rule default_statement {
    $<sym>=[default]
    <block>
    {*}
}

##  'loop' with an optional parenthesized header of three optional
##  expressions separated by ';' (captured as e1, e2, e3, and as a
##  whole in $<eee>), followed by a <block>.
rule loop_statement {
    $<sym>=[loop]
    $<eee>=[
        '('
            <e1=EXPR>? ';'
            <e2=EXPR>? ';'
            <e3=EXPR>?
        ')'
    ]?
    <block>
    {*}
}

##  'for' (passing <nofun>) followed by an <xblock>.
rule for_statement {
    $<sym>=[for<.nofun>]
    <xblock>
    {*}
}

##  'use' followed by a <name> and an optional expression.
rule use_statement {
    $<sym>=[use] <name>
    <EXPR>?
    {*}
}

##  'BEGIN' followed by a <block>.
rule begin_statement {
    $<sym>=[BEGIN]
    <block>
    {*}
}

##  'END' followed by a <block>.
rule end_statement {
    $<sym>=[END]
    <block>
    {*}
}

##  'CATCH' followed by a <block>.
rule catch_statement {
    $<sym>=[CATCH]
    <block>
    {*}
}

##  'CONTROL' followed by a <block>.
rule control_statement {
    $<sym>=[CONTROL]
    <block>
    {*}
}

##  'no' followed by a <module_name> and an optional <arglist>.
##  Note there is no whitespace between <module_name> and <arglist>
##  in the rule, so no implicit <ws> is matched between them.
rule no_statement {
    'no' <module_name><arglist>? {*}
}

##  Looping statement modifier: while, until, for or given (passing
##  <nofun>) followed by an expression.
rule statement_mod_loop {
    $<sym>=[[while|until|for|given]<.nofun>] <EXPR> {*}
}

##  Conditional statement modifier: if, unless or when (passing
##  <nofun>) followed by an expression.
rule statement_mod_cond {
    $<sym>=[[if|unless|when]<.nofun>] <EXPR> {*}
}

##  One of the prefix keywords (do, try, gather, contend, async, lazy)
##  followed by a <statement>.
rule statement_prefix {
    $<sym>=[do|try|gather|contend|async|lazy]
    <statement>
    {*}
}


#### Subroutine and method definitions ####

##  Either multi/proto/only followed by a <declarator> (falling back
##  to a bare <routine_def>), or just a <declarator>.
rule multi_declarator {
    [
    | $<sym>=[multi|proto|only] [ <declarator> || <routine_def> ]
    | <declarator>
    ]
    {*}
}

##  Matches a routine keyword and its definition: 'submethod' or
##  'method' with a <method_def>, or 'sub' with a <routine_def>.
##
##  'submethod' is listed before 'sub' on purpose.  With alternatives
##  tried in order, listing 'sub' first would let it match the prefix
##  of "submethod"; <routine_def> would then be attempted on the rest
##  and end in its "Malformed routine definition" panic before the
##  'submethod' alternative could ever be tried.
token routine_declarator {
    | $<sym>='submethod' <method_def> {*}        #= submethod
    | $<sym>='sub'       <routine_def> {*}       #= sub
    | $<sym>='method'    <method_def> {*}        #= method
}

##  A parenthesized <signature>, optionally introduced by ':'.
rule multisig {
    ':'?'(' ~ ')' <signature>
    {*}
}

##  Body of a 'sub' declaration: optional name (captured as
##  deflongname), any mix of <multisig> and <trait>, then a <block>.
##  If that sequence fails to match, panic.
rule routine_def {
    [ <deflongname=identifier> ]? [ <multisig> | <trait> ]*
    <block>
    {*}
    || <.panic: "Malformed routine definition">
}

##  Body of a method/submethod declaration.  The header is either a
##  name (optionally prefixed with '!', captured as $<longname>) with
##  any mix of <multisig> and <trait>, or a <multisig> with traits,
##  or nothing at all; a <block> follows.  On failure, panic.
rule method_def {
    [
    | $<longname>=[<[ ! ]>?<name>] [ <multisig> | <trait> ]*
    | <multisig> <trait>*
    | ::
    ]
    <block>
    {*}
    || <.panic: "Malformed method definition">
}

##  A trait: either a <trait_auxiliary> or a <trait_verb>.
rule trait {
    [
    | <trait_auxiliary>
    | <trait_verb>
    ]
    {*}
}

##  Auxiliary traits:
##    is   <name> with an optional <postcircumfix>
##    does <name> with an optional bracketed expression
##    will <identifier> <block>
rule trait_auxiliary {
    [
    | $<sym>=[is] <name><postcircumfix>?
    | $<sym>=[does] <name>['['<EXPR>?']']?
    | $<sym>=[will] <identifier> <block>
    ]
    {*}
}

##  Verb traits: 'of' or 'returns' with a <typename>, or 'handles'
##  with an expression.
rule trait_verb {
    [
    | $<sym>=[of|returns] <typename>
    | $<sym>=[handles] <EXPR>
    ]
    {*}
}

##  Capture term: a backslash and '(' , a <capture>, and ')'.
token capterm {
    '\\(' <capture> ')'
    {*}
}

##  Contents of a capture term: a single expression.
rule capture {
    <EXPR>
    {*}
}

##  Signature term: a <signature> enclosed in ':(' and ')'.
token sigterm {
    ':(' ~ ')' <signature> {*}
}

##  Separator between parameters in a signature: ',', ':', ';;' or
##  ';' (';;' is listed before ';' so the longer form is tried first).
rule param_sep { (','|':'|';;'|';') }

##  Fires the 'open' action, then parses a list of <parameter>s
##  separated by <param_sep>, then fires the 'close' action.  A
##  parameter slot may be empty when a '-->', ')', ']', '{' or a
##  single ':' comes next.  The '** 1' on the first group is a
##  workaround flagged in the source as a PGE bug.
token signature {
    {*} #= open
    <.ws>
    [
    | <parameter>
    | <?before '-->' | ')' | ']' | '{' | ':'<!before ':' > >
    ] ** 1 ## PGE bug
    [ <param_sep>
        [
        | <parameter>
        | <?before '-->' | ')' | ']' | '{' | ':'<!before ':' > >
        ]
    ]*
    <.ws>
    {*} #= close
}

##  'subset' declaration: a <name>, an optional 'of <fulltypename>',
##  and a mandatory 'where' expression.  As soon as the name is
##  parsed, its text is passed to the match object's 'add_type'
##  method (presumably so later <typename> checks recognize it --
##  'add_type' is defined outside this file).
rule type_declarator {
    'subset'
    <name>
    {{
        $P0 = match['name']
        $S0 = $P0.'text'()
        match.'add_type'($S0)
    }}
    [ of <fulltypename> ]?
    where <EXPR>
    {*}
}

# XXX STD.pm also has value in here?
##  A type constraint: a <fulltypename>, or 'where' followed by an
##  expression parsed with the 'm=' argument to EXPR.
rule type_constraint {
    [
    | <fulltypename>
    | where <EXPR: 'm='>               # XXX <EXPR(item %chaining)>
    ]
    {*}
}

##  A trailing 'where' constraint; the expression is parsed with the
##  'm=' argument to EXPR.
rule post_constraint {
    where <EXPR: 'm='> {*}             # XXX <EXPR(item %chaining)>
}

##  Parameter variable: a sigil, an optional twigil, and an optional
##  name that is either an <identifier> or one of '/' and '!'
##  (captured under the same 'identifier' key).
token param_var {
    <sigil> <twigil>? [
        || <identifier>
        || $<identifier>=['/'|'!']
    ]?
    {*}
}

##  A single parameter: any number of type constraints, then one of
##    - a slurpy '*' followed by a <param_var>;
##    - an optional ':' (captured as $<named>), a <param_var>, and an
##      optional '?' or '!' (captured as $<quant>);
##    - nothing, allowed only when the PIR assertion finds the
##      integer value of the 'type_constraint' capture greater than 0
##      (presumably: at least one type constraint matched);
##  followed by traits, post constraints and an optional default.
token parameter {
    <type_constraint>*
    [
    |   $<quant>=['*'] <param_var>
    |   $<named>=[':'?]
        <param_var>
        $<quant>=[ <[ ? ! ]>? ]
    |   <?{{
            $I0 = match['type_constraint']
            $I0 = $I0 > 0
            .return ($I0)
        }}> <?>
    ]
    <trait>*
    <post_constraint>*
    <default_value>?
    {*}
}

##  Parameter default: '=' followed by an expression parsed with the
##  'i=' argument to EXPR.  This rule has no action.
rule default_value {
    '=' <EXPR: 'i='>
}


#### Terms ####

##  A term as seen by the operator precedence parser: a <noun> with
##  any number of <post> operations, or a bare '*'.
token expect_term {
    | <noun> <post>* {*}                         #= noun
    | '*' {*}                                    #= *
}


##  A postfix operation on a noun.  It must not directly follow
##  whitespace (<!afterws>); an optional <unsp> or lone backslash may
##  precede the <dotty> or <postcircumfix>.
token post {
    <!afterws>
    [ <.unsp> || \\ ]?
    [
    | <dotty> {*}                                #= dotty
    | <postcircumfix> {*}                        #= postcircumfix
    ]
}


##  Method-call style postfixes: '.VAR'; '.' plus a <dottyop>; '.'
##  followed by one of + * ? ^ : (captured positionally) plus a
##  <dottyop>; or '!' plus a <methodop>.
token dotty {
    [
    | '.VAR' {*}                                      #= VAR
    | '.' <.unspacey> <dottyop> {*}                   #= .
    | ('.' <[+*?^:]>) <.unspacey> <dottyop> {*}       #= .*
    | '!' <methodop> {*}                              #= !
    ]
}


##  What may follow the dot in <dotty>: a <methodop> or a
##  <postcircumfix>.
token dottyop {
    | <methodop> {*}                                  #= methodop
    | <postcircumfix> {*}                             #= postcircumfix
}


##  A method to call and its arguments.  The method is given as a
##  <name>, as a <variable> starting with '$' or '@', or as a <quote>
##  starting with a single or double quote.  The arguments are a
##  parenthesized <semilist> (optionally preceded by '.'), or ':'
##  followed by whitespace and an <arglist>, or absent.
token methodop {
    [
    | <name>
    | <?before '$' | '@' > <variable>
    | <?before <[ ' " ]> > <quote>
#       { $<quote> ~~ /\W/ or .panic("Useless use of quotes") }
    ] <.unsp>?

    [
    | '.'? <.unsp>? '(' <semilist> ')' {*}            #= semilist
    | ':' <?before \s> <arglist> {*}                  #= arglist
    | {*}                                             #= null
    ]
}


##  Postcircumfix operators: a <semilist> in ( ), [ ] or { }, or a
##  '<'-delimited quote parsed via quote_expression with :w :q.
token postcircumfix {
    | '(' <semilist> ')' {*}                          #= ( )
    | '[' <semilist> ']' {*}                          #= [ ]
    | '{' <semilist> '}' {*}                          #= { }
    | <?before '<' > <quote_expression: :w :q> {*}    #= < >
}


# XXX Note that 'self' here should be a term.
##  The set of things that can stand as a noun in an expression.
##  Each alternative fires the action with a key naming the subrule
##  that matched.  <multi_declarator> is only attempted when one of
##  multi/proto/only is next in the input.
token noun {
    | <fatarrow> {*}                             #= fatarrow
    | <variable> {*}                             #= variable
    | <package_declarator> {*}                   #= package_declarator
    | <scope_declarator> {*}                     #= scope_declarator
    | <routine_declarator> {*}                   #= routine_declarator
    | <?before multi|proto|only> <multi_declarator> {*}  #= multi_declarator
    | <regex_declarator> {*}                     #= regex_declarator
    | <type_declarator> {*}                      #= type_declarator
    | <enum_declarator> {*}                      #= enum_declarator
    | <circumfix> {*}                            #= circumfix
    | <statement_prefix> {*}                     #= statement_prefix
    | <dotty> {*}                                #= dotty
    | <value> {*}                                #= value
    | 'self' >> {*}                              #= self
    | <term> {*}                                 #= term
    | <capterm> {*}                              #= capterm
    | <sigterm> {*}                              #= sigterm
    | <colonpair> {*}                            #= colonpair
}


##  Named terms and calls:
##    - 'VAR(' <variable> ')';
##    - a <named_0ary> name (captured as 'name'), with or without a
##      parenthesized <semilist>;
##    - a <typename>;
##    - a <name> with <args>, or with no arguments;
##    - a <sigil>, one whitespace character, and an <arglist>;
##    - a literal '...'.
token term {
    [
    | 'VAR(' <variable> ')' {*}                  #= VAR
    | <name=named_0ary>
        [
        | <.unsp>? '.'? '(' <semilist> ')' {*}   #= func args
        | :: {*}                                 #= noarg
        ]
    | <typename> {*}                             #= typename
    | <name>
        [
        | <args> {*}                             #= args
        | :: {*}                                 #= noarg
        ]
    | <sigil> \s <arglist> {*}                   #= sigil
    | '...' {*}                                  #= ...
    ]
}


##  Arguments after a name: one whitespace character and an <arglist>
##  (listop form), or a parenthesized <semilist> optionally preceded
##  by unspace and/or '.' (function-call form).
token args {
    | \s <arglist> {*}                           #= listop args
    | <.unsp>? '.'? '(' <semilist> ')' {*}       #= func args
}

##  XXX: cheat until we get term:pi, term:rand, term:undef, etc.
##  Hard-coded list of names treated as zero-argument terms: the
##  listed words (which must end at a word boundary) or one of the
##  symbols '...', '???', '!!!', '=<>'.
token named_0ary {
    | [pi|rand|undef|nothing|time|next|last|continue|break|Inf|NaN] >>
    | ['...'|'???'|'!!!'|'=<>']
}

##  A package keyword (class, grammar, module, package, role): fires
##  the 'open' action right after the keyword and the 'package_def'
##  action after the <package_def>.  Alternatively, 'does' followed
##  by a <typename>.
rule package_declarator {
    | $<sym>=[class|grammar|module|package|role] {*}       #= open
      <package_def> {*}                                    #= package_def
    | 'does' <typename> {*}                                #= does
}


##  Package definition.  If a <module_name> is present, the text of
##  its <name> is passed to the match object's 'add_type' method.
##  After the 'open' action and any traits, the body is one of:
##    - ';' plus a <statement_block>, allowed only while the
##      '$begin_compunit' global is true;
##    - a <block>;
##    - nothing, in which case the action is fired with key 'panic'.
rule package_def {
    [
        <module_name>
        {{
            $P0 = match['module_name']
            $P0 = $P0[0]
            $P0 = $P0['name']
            $S0 = $P0.'text'()
            match.'add_type'($S0)
        }}
    ]?
    {*}                                                  #= open
    <trait>*
    [
    | <?{{ $P0 = get_global '$begin_compunit'
           .return ($P0) }}>
      ';' <statement_block> {*}                          #= statement_block
    | <block> {*}                                        #= block
    | {*}                                                #= panic
    ]
}


##  'enum' declaration with an optional <name>; when present, the
##  name's text is passed to the match object's 'add_type' method.
##  The values are given either as a <circumfix> starting with '[' or
##  as a <quote> starting with '<' or '«'.
rule enum_declarator {
    'enum'
    [
        <name> ::
        {{
            $P0 = match['name']
            $P0 = $P0[0]
            $S0 = $P0.'text'()
            match.'add_type'($S0)
        }}
    ]?

    [
    | <?before '[' > <circumfix> {*}             #= circumfix
    | <?before '<' | '«' > <quote> {*}          #= quote
    ]
}

##  A scope keyword (my, our, state, constant, has) followed by a
##  <scoped> declaration.
rule scope_declarator {
    $<sym>=[my|our|state|constant|has]
    <scoped>
    {*}
}

##  What may follow a scope keyword: a <declarator>, a
##  <routine_declarator>, or one or more <fulltypename>s followed by
##  a <multi_declarator>.  If none matches, panic.
rule scoped {
    [
    | <declarator>
    | <routine_declarator>
    | <fulltypename>+ <multi_declarator>
    ] {*}
    || <.panic: "Malformed declaration"> # STD.pm: <.panic: "Malformed \"$+SCOPE\" declaration">
}

##  A declaration: a variable declarator, a parenthesized <signature>
##  with optional traits, or a routine, regex or type declarator.
token declarator {
    [
    | <variable_declarator>
    | '(' ~ ')' <signature> <trait>*
    | <routine_declarator>
    | <regex_declarator>
    | <type_declarator>
    ]
    {*}
}

##  A <variable> followed by whitespace, any number of traits, and
##  any number of post constraints.
token variable_declarator {
    <variable>
    <.ws>
    <trait>*
    <post_constraint>*
    {*}
}

##  A variable, which must start with a sigil.  Forms: sigil with an
##  optional twigil and a <desigilname>; a <special_variable>; a
##  sigil plus digits (captured as $<matchidx>); or a sigil plus a
##  '<'-style <postcircumfix>.
token variable {
    <?sigil>
    [
    | <sigil> <twigil>? <desigilname> {*}                #= desigilname
    | <special_variable> {*}                             #= special_variable
    | <sigil> $<matchidx>=[\d+] {*}                      #= $0
    | <sigil> <?before '<'> <postcircumfix> {*}          #= $<>
    ]
}

##  The variable sigils: '@@', '$', '@', '%', '&'.
##
##  '@@' is listed before '@' on purpose.  With alternatives tried in
##  order inside a non-backtracking token, listing '@' first would
##  always take the first '@' of "@@" and leave the '@@' alternative
##  unreachable.
##  NOTE(review): confirm the actions handle a '@@' sigil before
##  relying on it.
token sigil { '@@' | '$' | '@' | '%' | '&' }

##  A single twigil character: one of . ! ^ : * + ? =
token twigil { <[.!^:*+?=]> }

##  The part of a variable after sigil and twigil: another <variable>
##  when a '$' is next, otherwise a <name> (captured as 'longname').
token desigilname {
    [
    | <?before '$' > <variable>
    | <longname=name>
    ]
}

##  One of the special variables '$/', '$!' or '$¢', which must not
##  be followed by a word character.
token special_variable {
    $<sym>=[ '$/' | '$!' | '$¢' ] <!before \w> {*}
}

##  Circumfix terms: a <statementlist> in ( ) or [ ]; a <pblock>
##  when a '{' or a <lambda> is next; or a sigil followed by a
##  parenthesized <semilist>.
token circumfix {
    | '(' <statementlist> ')' {*}                #= ( )
    | '[' <statementlist> ']' {*}                #= [ ]
    | <?before '{' | <lambda> > <pblock> {*}     #= { }
    | <sigil> '(' <semilist> ')' {*}             #= $( )
}

##  A <name>, optionally followed by a bracketed <signature>.  The
##  bracketed part is only attempted when the PIR assertion finds
##  that element 0 of the '@?PKGDECL' global (looked up in the
##  Perl6::Grammar::Actions namespace) equals 'role'.
token module_name {
    <name>
    [
        :dba('generic role')
        <?{{
            ## ($+PKGDECL//'') eq 'role' (more like (@?PKGDECL[0]//'') eq 'role')
            $P0 = get_hll_global ['Perl6'; 'Grammar'; 'Actions'], '@?PKGDECL'
            $S0 = $P0[0]
            $I0 = $S0 == 'role'
            .return ($I0)
        }}>
        '[' ~ ']' <signature>
    ]?
}

##  A possibly package-qualified name: an <identifier> with any
##  number of <morename> parts, or one or more <morename> parts alone.
token name {
    | <identifier> <morename>*
    | <morename>+
}

##  A '::' name component.  If a '(' or an alphabetic character
##  follows, it continues with an <identifier> or a parenthesized
##  expression; otherwise the bare '::' is matched.
token morename {
    '::'
    [
        <?before '(' | <alpha> >
        [ 
        | <identifier>
        | '(' <EXPR> ')' 
        ]
    ]?
}

##  A literal value: a <quote> or a <number>.
token value {
    | <quote> {*}                                #= quote
    | <number> {*}                               #= number
}

##  A <name> that is accepted only when the match object's 'is_type'
##  method returns true for the name's text ('is_type' is defined
##  outside this file).
token typename {
    <name>
    <?{{
        $P0 = match['name']
        $S0 = $P0.'text'()
        .tailcall match.'is_type'($S0)
    }}>
    {*}
}

##  A <typename>, optionally followed by 'of' and a nested
##  <fulltypename>.
rule fulltypename {
    <typename>
    [ of <fulltypename> ]?
    {*}
}


##  Quoting is tricky -- the <quote_concat> subrule is in
##  F<src/parser/quote_expression.pir> .
##  Selects the quoting flags passed to quote_expression from the
##  opening delimiter or quote keyword:
##    '      -> :q            "    -> :qq
##    <<     -> :ww :qq       <    -> :w :q
##    /      -> :regex        m    -> :regex
##    q      -> :q            qq   -> :qq
##    qw     -> :q :w         q:PIR -> :PIR
##    Q with an optional ':' and one of PIR, q, qq, b -> that flag;
##    a bare Q passes no flags.
token quote {
    [
    | <.before \'>     <quote_expression: :q>
    | <.before '"' >   <quote_expression: :qq>
    | <.before '<<' >  <quote_expression: :ww :qq>
    | <.before '<' >   <quote_expression: :w :q>
    # | <.before '«' > <quote_expression: :ww :qq>  FIXME: unicode
    | <.before '/'>    <quote_expression: :regex>
    | m   <.ws>        <quote_expression: :regex>
    | q 
        [ q <.ws>      <quote_expression: :qq>
        | w <.ws>      <quote_expression: :q :w>
        | ':PIR' <.ws> <quote_expression: :PIR>
        |   <.ws>      <quote_expression: :q>
        ]
    | Q [ <.ws> ':' ]?
        [ 'PIR' <.ws>  <quote_expression: :PIR>
        | 'q'   <.ws>  <quote_expression: :q>
        | 'qq'  <.ws>  <quote_expression: :qq>
        | 'b'   <.ws>  <quote_expression: :b>
        |       <.ws>  <quote_expression: >
        ]
    ]
    {*}
}


##  A numeric literal.  <dec_number> is listed before <integer>, and
##  <rad_number> comes last.
token number {
    [ <dec_number> {*}                           #= dec_number
    | <integer> {*}                              #= integer
    | <rad_number> {*}                           #= rad_number
    ]
}

##  Integer literal.  After a leading 0 it accepts a b/o/x/d prefix
##  with the matching digit set, or further decimal digits (in which
##  case a "Leading 0" message is printed with 'say').  Otherwise it
##  matches plain decimal digits.  Digit groups may be separated by
##  single underscores.
token integer {
    [
    | 0 [ b <[01]>+           [ _ <[01]>+ ]*
        | o <[0..7]>+         [ _ <[0..7]>+ ]*
        | x <[0..9a..fA..F]>+ [ _ <[0..9a..fA..F]>+ ]*
        | d \d+               [ _ \d+]*
        | \d+[_\d+]*
            {{ say "Leading 0 does not indicate octal in Perl 6" }}  # FIXME
        ]
    | \d+[_\d+]*
    ]
    {*}
}

##  Exponent part of a decimal number: 'E' or 'e', an optional sign,
##  and decimal digits (underscore-separated groups allowed).
token escale {
    <[Ee]> <[+\-]>? \d+[_\d+]*
}

# careful to distinguish from both integer and 42.method
##  Decimal number in one of three shapes: '.digits' with an optional
##  exponent; 'digits.digits' with an optional exponent; or 'digits'
##  with a mandatory exponent.
token dec_number {
    [
    |            '.' \d+[_\d+]* <.escale>?
    | \d+[_\d+]* '.' \d+[_\d+]* <.escale>?
    | \d+[_\d+]*                <.escale>
    ]
    {*}
}

##  Integer used for the base and exponent inside radix notation
##  (see rad_number): either a plain <integer>, or a nested
##  ':RADIX<DIGITS>' form with an optional '*base**exp' part.
##
##  The unspace between the radix and '<' is optional, exactly as in
##  rad_number.  Requiring it would reject the ordinary form
##  (e.g. :16<FF>) and contradict the note below that this
##  alternative is a subset of rad_number.
token radint {
    [
    | <integer> {*}                              #= integer
    | # this alternation is a subset of rad_number
      ':' $<radix>=[\d+] <.unsp>?
      ::  '<'
          $<intpart>=[<[0..9 a..z A..Z _]>]+
          [ '*' <base=radint> '**' <exp=radint> ]?
          '>'
      {*}                                        #= rad_number
    ]
}

##  Radix notation: ':' and the radix digits (captured as $<radix>),
##  optional unspace, then one of
##    - '<' integer part, optional '.' fraction part, optional
##      '*base**exp' (each a <radint>), and '>';
##    - a <postcircumfix> starting with '[';
##    - a <postcircumfix> starting with '('.
token rad_number {
    ':' $<radix>=[\d+] <.unsp>?
    ::
    [
    ||  '<'
        $<intpart>=[<[0..9 a..z A..Z _]>]+
        $<fracpart>=[ '.' <[0..9 a..z A..Z _]>+ ]?
        [ '*' <base=radint> '**' <exp=radint> ]?
        '>'
    || <?before '['> <postcircumfix>
    || <?before '('> <postcircumfix>
    ]
    {*}
}


##  One of the keywords regex, token or rule, followed by a
##  <regex_def>.
rule regex_declarator {
    $<sym>=[regex|token|rule] <regex_def>
    {*}
}

##  An optional name (captured as 'deflongname') followed by a
##  <regex_block>.  If that fails to match, panic.
rule regex_def {
    <deflongname=name>?
    <regex_block> {*}
    || <.panic: "Malformed regex definition">
}

##  A regex body: requires a '{' next, then hands parsing to
##  quote_expression with :regex.  As with <block>, an optional
##  <BLOCK_STATEMENT_END> may follow.
token regex_block {
    <?before '{'> <quote_expression: :regex>
    <.BLOCK_STATEMENT_END>?
    {*}
}


##  S05 shows semilist as being a list of statements, in order
##  to support multidimensional argument lists.  For now we
##  just handle a single-dimensional argument list.
##  Currently just a single optional expression.
rule semilist {
    <EXPR>?
    {*}
}

##  An argument list: empty when a <terminator> is next (nothing is
##  consumed), otherwise an expression parsed with the 'd=' argument
##  to EXPR.
token arglist {
    [
    | <?terminator>
    | <EXPR: 'd='>                                 # EXPR(%list_assignment)
    ]
    {*}
}

##  A pair written with '=>': an <identifier> key (captured as
##  'key'), optional horizontal whitespace, '=>', and a value
##  expression (captured as 'val') parsed with the 'i=' argument.
token fatarrow {
    <key=identifier> \h* '=>' <val=EXPR: 'i='>     # EXPR(%item_assignment)
    {*}
}

##  A pair written with a leading ':'.  Forms:
##    false      -- '!' and an <identifier>
##    value      -- an <identifier> with an optional <postcircumfix>
##                  (captured, with any unspace, as $<val>)
##    structural -- a bare <postcircumfix>
##    varname    -- a sigil, optional twigil and <desigilname>
token colonpair {
    ':'
    [
    | '!' <identifier>                                   {*}    #= false
    | <identifier> $<val>=[ <.unsp>? <postcircumfix> ]?  {*}    #= value
    | <postcircumfix>                                    {*}    #= structural
    | <sigil> <twigil>? <desigilname>                    {*}    #= varname
    ]
}

#### expressions and operators ####

##  The EXPR rule is our entry point into the operator
##  precedence parser.  At the moment the operator
##  tokens are defined in F<src/parser/grammar-oper.pg>,
##  using a prototype function syntax (because PGE doesn't
##  yet support protoregexes).  When the operator precedence
##  parser needs a term, it gets it by calling the 'expect_term'
##  token above.

## rule EXPR is optable { ... }

##  Declares the 'term:' entry for the operator precedence parser at
##  precedence 'z=', parsed by the expect_term token.
proto 'term:' is precedence('z=')
    is parsed(&expect_term)
    { ... }

##  Declares a 'term:->' entry with the same precedence as 'term:',
##  also parsed by expect_term.
##  NOTE(review): skipkey(0) presumably keeps the '->' key text in
##  the input so expect_term sees it -- confirm against the PGE
##  operator table documentation.
proto 'term:->' is equiv(term:)
    is parsed(&expect_term)
    is skipkey(0)
    { ... }

##  Declares a 'close:<->' entry with the same precedence as 'term:'.
##  NOTE(review): presumably this makes '<->' stop the operator
##  parser so it can be handled by <lambda> -- confirm.
proto 'close:<->' is equiv(term:) { ... }