Skip to content

Commit

Permalink
mediawiki grammar recovery project: first public release
Browse files Browse the repository at this point in the history
git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@1103 ab42f6e0-554d-0410-b580-99e487e6eeb2
  • Loading branch information
grammarware committed Jul 22, 2011
1 parent d8b5fe3 commit 2f6d19c
Show file tree
Hide file tree
Showing 44 changed files with 4,603 additions and 0 deletions.
67 changes: 67 additions & 0 deletions topics/grammars/wiki/mediawiki-bnf/Makefile
@@ -0,0 +1,67 @@
# extract: recover the MediaWiki grammar from the manually fixed wiki sources.
#
# Pipeline, in recipe order:
#   1. concatenate the BNF-flavoured wiki sources and normalise their
#      <source lang="bnf"> tags to the unquoted form;
#   2. run ${hunter} once per notation (arguments as used below: wiki source,
#      .edd notation file, output .bgf) to obtain raw.1.bgf .. raw.5.bgf;
#   3. merge the raw grammars into ext.01.bgf;
#   4. apply the .xbgf scripts one after another, each step reading
#      ext.NN.bgf and writing the next-numbered file (ext.02 .. ext.21);
#   5. extract the final grammar.bgf, render it as view.html and check it.
#
# ${hunter} and ${tooldir} are not defined in this file; presumably they come
# from ../../Makefile.grammar, which is included at the bottom.
#
# The target names an action, not a file, so it is declared phony: a stray
# file called "extract" must not make the target look up to date.
.PHONY: extract
extract:
	cat src.article.title.wiki src.article.part1.manually.fixed.wiki src.links.manually.fixed.wiki src.magic.links.wiki src.special.block.part1.manually.fixed.wiki src.inline.text.part1.manually.fixed.wiki > src.bnf.prepared.wiki
	perl -pi -w -e 's/<source lang="bnf">/<source lang=bnf>/g;' src.bnf.prepared.wiki

# One extraction per notation; every source has its own .edd file.
	${hunter} src.article.part2.manually.fixed.wiki metawiki.edd raw.1.bgf
	${hunter} src.noparse.block.wiki noparse.edd raw.2.bgf
	${hunter} src.special.block.part2.manually.fixed.wiki special.edd raw.3.bgf
	${hunter} src.inline.text.part2.manually.fixed.wiki inline.edd raw.4.bgf
	${hunter} src.bnf.prepared.wiki main.edd raw.5.bgf

	${tooldir}/mergebgf raw.1.bgf raw.2.bgf raw.3.bgf raw.4.bgf raw.5.bgf ext.01.bgf

# Transformation chain: the order matters, each step consumes the previous
# step's output.
	${tooldir}/xbgf utilise-plus.xbgf ext.01.bgf ext.02.bgf
	${tooldir}/xbgf remove-concatenation.xbgf ext.02.bgf ext.03.bgf
	${tooldir}/xbgf remove-extension-points.xbgf ext.03.bgf ext.04.bgf
	${tooldir}/xbgf remove-php-legacy.xbgf ext.04.bgf ext.05.bgf
	${tooldir}/xbgf deyaccify.xbgf ext.05.bgf ext.06.bgf
	${tooldir}/xbgf remove-comments.xbgf ext.06.bgf ext.07.bgf
	${tooldir}/xbgf remove-lookahead.xbgf ext.07.bgf ext.08.bgf
	${tooldir}/xbgf remove-duplicates.xbgf ext.08.bgf ext.09.bgf
	${tooldir}/xbgf dehtmlify.xbgf ext.09.bgf ext.10.bgf
	${tooldir}/xbgf utilise-question.xbgf ext.10.bgf ext.11.bgf
	${tooldir}/xbgf utilise-star.xbgf ext.11.bgf ext.12.bgf
	${tooldir}/xbgf fix-markup.xbgf ext.12.bgf ext.13.bgf
	${tooldir}/xbgf define-special-symbols.xbgf ext.13.bgf ext.14.bgf
	${tooldir}/xbgf fake-exclusion.xbgf ext.14.bgf ext.15.bgf
	${tooldir}/xbgf remove-postfix-case.xbgf ext.15.bgf ext.16.bgf
	${tooldir}/xbgf fix-names.xbgf ext.16.bgf ext.17.bgf
	${tooldir}/xbgf unify-whitespace.xbgf ext.17.bgf ext.18.bgf
	${tooldir}/xbgf connect-grammar.xbgf ext.18.bgf ext.19.bgf
	${tooldir}/xbgf refactor-repetition.xbgf ext.19.bgf ext.20.bgf
	${tooldir}/xbgf define-lexicals.xbgf ext.20.bgf ext.21.bgf

# NOTE(review): ext.22.bgf (ext.21 merged with the config grammar add.bgf) is
# produced here but never read; the subgrammar step below still starts from
# ext.21.bgf. Confirm whether grammar.bgf was meant to be cut from ext.22.bgf.
	${hunter} mediawiki.config.wiki plain.edd add.bgf
	${tooldir}/mergebgf ext.21.bgf add.bgf ext.22.bgf
	${tooldir}/subgrammar ext.21.bgf wiki-page grammar.bgf

# Render and check the final grammar; the freshly merged ext.01.bgf is
# checked as well.
	${tooldir}/bgf2html grammar.bgf view.html
	${tooldir}/checkbgf grammar.bgf
	${tooldir}/checkbgf ext.01.bgf

# PLOT_DEST: directory (or file name) the finished PDF is copied to.
# Defaults to the location that used to be hard-coded in the recipe; override
# it on the command line, e.g. `make plot PLOT_DEST=/tmp`.
PLOT_DEST ?= ~/projects/personal/papers/mediawiki

# plot: render grammar.bgf as a PDF graph with Graphviz.
#
# Steps:
#   1. bgf2dot converts grammar.bgf into grammar.dot;
#   2. one in-place perl pass rewrites characters in the generated text
#      ('?' becomes 'q', '-' becomes '_', after which the edge arrows, now
#      "_>", are restored to "->"), puts every ';'-terminated statement on
#      its own line and strips the "digraph generated{" header;
#   3. the header is written to grammar2.dot and the sorted, de-duplicated
#      statements are appended to it;
#   4. dot renders grammar2.dot to grammar.pdf, which is copied to PLOT_DEST.
#
# The target names an action, not a file, so it is declared phony.
.PHONY: plot
plot:
	${tooldir}/bgf2dot grammar.bgf grammar.dot
# The substitutions run in the listed order on every input line. The literal
# '{' is escaped because newer perl versions warn about, or reject, an
# unescaped left brace in a pattern.
	perl -pi -w \
		-e 's/\?/q/g;' \
		-e 's/-/_/g;' \
		-e 's/_>/->/g;' \
		-e 's/;/;\n/g;' \
		-e 's/digraph generated\{//g;' \
		grammar.dot
	echo 'digraph generated{' > grammar2.dot
# LC_ALL=C pins byte-order collation so the output does not depend on the
# caller's locale. Presumably the graph's closing brace has to stay after the
# edge statements, which byte order guarantees for names made of letters,
# digits and '_' -- verify against the bgf2dot output.
	LC_ALL=C sort -u grammar.dot >> grammar2.dot
	dot -Tpdf grammar2.dot >grammar.pdf
	cp grammar.pdf $(PLOT_DEST)

# MW_CURL: command used to download one raw wiki page.
#   -f  fail with a non-zero exit status on HTTP errors instead of saving the
#       server's error page as if it were grammar source;
#   -L  follow redirects instead of saving the redirect stub.
# The former -k flag is dropped: it only affects TLS connections, so it did
# nothing for these http:// URLs, and together with -L it would switch off
# certificate verification on any https:// redirect target. It can be
# restored with `make get MW_CURL='curl -k -f -L'` if really needed.
MW_CURL ?= curl -f -L

# get: download the pinned revisions (oldid=...) of the MediaWiki markup
# specification pages. README.txt receives the spec URL followed by the raw
# text of the overview page; every sub-page goes to its own src.*.wiki file,
# and src.bnf.wiki is the concatenation of all sub-pages.
#
# The target names an action, not a file, so it is declared phony.
.PHONY: get
get:
	echo 'http://www.mediawiki.org/wiki/Markup_spec/BNF' > README.txt
	$(MW_CURL) 'http://www.mediawiki.org/w/index.php?title=Markup_spec/BNF&action=raw&oldid=281673' >> README.txt
	$(MW_CURL) 'http://www.mediawiki.org/w/index.php?title=Markup_spec/BNF/Article_title&action=raw&oldid=295042' > src.article.title.wiki
	$(MW_CURL) 'http://www.mediawiki.org/w/index.php?title=Markup_spec/BNF/Article&action=raw&oldid=281674' > src.article.wiki
	$(MW_CURL) 'http://www.mediawiki.org/w/index.php?title=Markup_spec/BNF/Noparse-block&action=raw&oldid=372814' > src.noparse.block.wiki
	$(MW_CURL) 'http://www.mediawiki.org/w/index.php?title=Markup_spec/BNF/Links&action=raw&oldid=376721' > src.links.wiki
	$(MW_CURL) 'http://www.mediawiki.org/w/index.php?title=Markup_spec/BNF/Magic_links&action=raw&oldid=269783' > src.magic.links.wiki
	$(MW_CURL) 'http://www.mediawiki.org/w/index.php?title=Markup_spec/BNF/Special_block&action=raw&oldid=281676' > src.special.block.wiki
	$(MW_CURL) 'http://www.mediawiki.org/w/index.php?title=Markup_spec/BNF/Inline_text&action=raw&oldid=295055' > src.inline.text.wiki
	$(MW_CURL) 'http://www.mediawiki.org/w/index.php?title=Markup_spec/BNF/Fundamental_elements&action=raw&oldid=212918' > src.fundamental.elements.wiki
	cat src.article.title.wiki src.article.wiki src.noparse.block.wiki src.links.wiki src.magic.links.wiki src.special.block.wiki src.inline.text.wiki src.fundamental.elements.wiki > src.bnf.wiki

include ../../Makefile.grammar
44 changes: 44 additions & 0 deletions topics/grammars/wiki/mediawiki-bnf/README.txt
@@ -0,0 +1,44 @@
http://www.mediawiki.org/wiki/Markup_spec/BNF
{{Grammar nav}}
This is an attempt to describe the MediaWiki markup in [[w:Backus-Naur form|Backus-Naur form]]. This is not strictly possible, for at least the following two reasons:
#The logic for parsing combinations of bold, italics and apostrophes
#The logic for parsing nested lists.

Therefore, this grammar will necessarily be incomplete. Where it is incomplete, strategies to parse the non-EBNF parts of the grammar will be described.

The primary goal is to define the MediaWiki parser ''as it currently works''. However, in certain cases, such as where truly bizarre syntax is technically tolerated, the grammar may be restricted to a more useful subset. For example, the following code:

<nowiki>#REDireCTnon%^sense[[foo|and this is parsed as article content</nowiki>

is technically a valid way to write a redirect, but is neither useful nor likely to be in current use. Such deviations from the actual implemented grammar will be noted as they occur.

== About the definitions ==
The definitions are in [[w:Backus-Naur form|Backus-Naur form]] for the moment. We may find the need to use the extended form (EBNF) to stop things getting too complicated (though it should be noted that EBNF expressions can ''always'' be written as BNF).

In general, when parsing a page using this grammar, the matching should be a top-down non-greedy match; i.e., it will always try earlier rules before later ones (always starting from a single point: <wiki-page> in the case of wiki pages) but take the minimum number of characters needed to satisfy the complete rule. Exceptions to this should be noted (and avoided if possible).

Bear in mind that these are translation rules, which will be used to convert from one format to another (e.g. wiki-text to HTML), so the grammar may need to include elements that are technically redundant so that they can be referenced in the conversion rules.

All terminals (literal strings) are '''case-insensitive''' unless mentioned otherwise.

== The basics ==
These pages describe basic elements that are used throughout the description, and which are fairly generic.

* [[/Fundamental elements/]]

== Wiki pages ==
This is the basic high-level structure of a wiki-page. Follow the links for the details. This section is incredibly incomplete at the moment.

* [[/Article#Wiki-page|<wiki-page>]] constitutes the '''start symbol''' (a.k.a. "top-level element") describing the wiki page,
** [[/Article#Wiki-page|<redirect>]]
** [[/Article#Article|<article>]]
*** [[/Links/]]
*** [[/Magic links/]] (e.g. RFC, ISBN, etc.)
*** [[/Nowiki|&lt;nowiki&gt;]]

== Other data formats ==
The following types of data are external to the wiki page (e.g. the Article's title).

* [[/Article title/]]

[[Category:Parser| BNF]]
119 changes: 119 additions & 0 deletions topics/grammars/wiki/mediawiki-bnf/connect-grammar.xbgf
@@ -0,0 +1,119 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
connect-grammar.xbgf: an XBGF transformation sequence.
The steps below run in document order: they define nonterminals, set the
root, unite differently named nonterminals, and add, remove or eliminate
individual productions.
NOTE(review): the behaviour of each xbgf operator is taken from its name and
arguments as written here; confirm details against the XBGF manual.
-->
<xbgf:sequence xmlns:xbgf="http://planet-sl.org/xbgf" xmlns:bgf="http://planet-sl.org/bgf">
<!-- Noparse block: define "characters" as one or more "character" -->
<xbgf:define>
<bgf:production>
<nonterminal>characters</nonterminal>
<bgf:expression>
<plus>
<bgf:expression>
<nonterminal>character</nonterminal>
</bgf:expression>
</plus>
</bgf:expression>
</bgf:production>
</xbgf:define>
<!-- Beautification: make "wiki-page" the root of the grammar -->
<xbgf:reroot>
<root>wiki-page</root>
</xbgf:reroot>
<!-- Digits: define "digits" as one or more "digit" ... -->
<xbgf:define>
<bgf:production>
<nonterminal>digits</nonterminal>
<bgf:expression>
<plus>
<bgf:expression>
<nonterminal>digit</nonterminal>
</bgf:expression>
</plus>
</bgf:expression>
</bgf:production>
</xbgf:define>
<!-- ... then unite both spellings, "digit" and "DIGIT", with "decimal-digit" -->
<xbgf:unite>
<add>digit</add>
<to>decimal-digit</to>
</xbgf:unite>
<xbgf:unite>
<add>DIGIT</add>
<to>decimal-digit</to>
</xbgf:unite>
<!--
ImageOtherParameter: add "image-other-parameter" as one more production of
"image-option". The add is bracketed by vertical/horizontal on the same
nonterminal (presumably splitting its choice into separate productions and
merging them back afterwards; confirm against the XBGF manual).
-->
<xbgf:vertical>
<nonterminal>image-option</nonterminal>
</xbgf:vertical>
<xbgf:add>
<vertical>
<bgf:production>
<nonterminal>image-option</nonterminal>
<bgf:expression>
<nonterminal>image-other-parameter</nonterminal>
</bgf:expression>
</bgf:production>
</vertical>
</xbgf:add>
<xbgf:horizontal>
<nonterminal>image-option</nonterminal>
</xbgf:horizontal>
<!-- Eliminate the nonterminals "BlockHTML" and "newlines" from the grammar -->
<xbgf:eliminate>
<nonterminal>BlockHTML</nonterminal>
</xbgf:eliminate>
<xbgf:eliminate>
<nonterminal>newlines</nonterminal>
</xbgf:eliminate>
<!--
The two rules below are quoted from the source grammar; they are the places
where PositiveInteger and PositiveNumber are used. Both names are united
with "digits" right after this comment.
mw("img_upright") ::= "upright" [, ["=",] PositiveInteger]
mw("img_width") ::= PositiveNumber "px" ;
-->
<xbgf:unite>
<add>PositiveInteger</add>
<to>digits</to>
</xbgf:unite>
<xbgf:unite>
<add>PositiveNumber</add>
<to>digits</to>
</xbgf:unite>
<!--
Removing a French idiosyncrasy: drop the "open-guillemet" and
"close-guillemet" productions of "text-with-formatting". The two removals
are bracketed by vertical/horizontal on the same nonterminal, as for
"image-option" above.
-->
<xbgf:vertical>
<nonterminal>text-with-formatting</nonterminal>
</xbgf:vertical>
<xbgf:remove>
<vertical>
<bgf:production>
<nonterminal>text-with-formatting</nonterminal>
<bgf:expression>
<nonterminal>open-guillemet</nonterminal>
</bgf:expression>
</bgf:production>
</vertical>
</xbgf:remove>
<xbgf:remove>
<vertical>
<bgf:production>
<nonterminal>text-with-formatting</nonterminal>
<bgf:expression>
<nonterminal>close-guillemet</nonterminal>
</bgf:expression>
</bgf:production>
</vertical>
</xbgf:remove>
<xbgf:horizontal>
<nonterminal>text-with-formatting</nonterminal>
</xbgf:horizontal>
<!-- ImageModeParameter: unite "ImageModeThumb" with "image-mode-auto-thumb" -->
<xbgf:unite>
<add>ImageModeThumb</add>
<to>image-mode-auto-thumb</to>
</xbgf:unite>
<!-- top level link, currently under discussion, last comment in 2006; fold "link" -->
<xbgf:fold>
<nonterminal>link</nonterminal>
</xbgf:fold>
<!-- crucial and nontrivial: unite "category" with "category-link" -->
<xbgf:unite>
<add>category</add>
<to>category-link</to>
</xbgf:unite>
</xbgf:sequence>
66 changes: 66 additions & 0 deletions topics/grammars/wiki/mediawiki-bnf/define-lexicals.xbgf
@@ -0,0 +1,66 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
define-lexicals.xbgf: an XBGF transformation sequence that supplies
definitions for lexical nonterminals: the whitespace characters TAB, CR and
LF, and several "any character / any text" nonterminals.
-->
<xbgf:sequence xmlns:xbgf="http://planet-sl.org/xbgf" xmlns:bgf="http://planet-sl.org/bgf">
<!--
Mainstream: TAB, CR and LF, each defined as a single terminal.
NOTE(review): each terminal's text is a backslash followed by a letter;
whether downstream tools read that as an escape sequence is not visible in
this file.
-->
<xbgf:define>
<bgf:production>
<nonterminal>TAB</nonterminal>
<bgf:expression>
<terminal>\t</terminal>
</bgf:expression>
</bgf:production>
</xbgf:define>
<xbgf:define>
<bgf:production>
<nonterminal>CR</nonterminal>
<bgf:expression>
<terminal>\r</terminal>
</bgf:expression>
</bgf:production>
</xbgf:define>
<xbgf:define>
<bgf:production>
<nonterminal>LF</nonterminal>
<bgf:expression>
<terminal>\n</terminal>
</bgf:expression>
</bgf:production>
</xbgf:define>
<!--
Any: "random-character" is given the "any" expression as its definition.
This step uses redefine, unlike the define steps around it (presumably
because the nonterminal already has a definition at this point; confirm
against the input grammar).
-->
<xbgf:redefine>
<bgf:production>
<nonterminal>random-character</nonterminal>
<bgf:expression>
<any/>
</bgf:expression>
</bgf:production>
</xbgf:redefine>
<!-- "any-text" is defined as zero or more "unicode-character" -->
<xbgf:define>
<bgf:production>
<nonterminal>any-text</nonterminal>
<bgf:expression>
<star>
<bgf:expression>
<nonterminal>unicode-character</nonterminal>
</bgf:expression>
</star>
</bgf:expression>
</bgf:production>
</xbgf:define>
<!-- "sort-key" is simply "any-text": not precisely correct, but for now it'll do -->
<xbgf:define>
<bgf:production>
<nonterminal>sort-key</nonterminal>
<bgf:expression>
<nonterminal>any-text</nonterminal>
</bgf:expression>
</bgf:production>
</xbgf:define>
<!-- "any-supported-unicode-character" is defined as the "any" expression -->
<xbgf:define>
<bgf:production>
<nonterminal>any-supported-unicode-character</nonterminal>
<bgf:expression>
<any/>
</bgf:expression>
</bgf:production>
</xbgf:define>
</xbgf:sequence>
90 changes: 90 additions & 0 deletions topics/grammars/wiki/mediawiki-bnf/define-special-symbols.xbgf
@@ -0,0 +1,90 @@
<?xml version="1.0" encoding="UTF-8"?>
<!--
define-special-symbols.xbgf: an XBGF transformation sequence that replaces
two productions made of the symbol sequence "? HTML ... attributes ?" by a
reference to a single, properly named nonterminal.
-->
<xbgf:sequence xmlns:xbgf="http://planet-sl.org/xbgf" xmlns:bgf="http://planet-sl.org/bgf">
<!-- what follows is actually not defining special symbols, but dealing with them somehow anyway -->
<!--
Special block special symbols, part 1: TableCellParameter.
The production to be removed is the five-symbol sequence
"? HTML cell attributes ?", in which every word, including the question
marks, appears as a nonterminal (presumably an artefact of how the special
symbol was extracted; confirm against the source page). The production
"html-cell-attributes" is added in its place. Both steps are bracketed by
vertical/horizontal on TableCellParameter.
-->
<xbgf:vertical>
<nonterminal>TableCellParameter</nonterminal>
</xbgf:vertical>
<xbgf:remove>
<vertical>
<bgf:production>
<nonterminal>TableCellParameter</nonterminal>
<bgf:expression>
<sequence>
<bgf:expression>
<nonterminal>?</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>HTML</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>cell</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>attributes</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>?</nonterminal>
</bgf:expression>
</sequence>
</bgf:expression>
</bgf:production>
</vertical>
</xbgf:remove>
<xbgf:add>
<vertical>
<bgf:production>
<nonterminal>TableCellParameter</nonterminal>
<bgf:expression>
<nonterminal>html-cell-attributes</nonterminal>
</bgf:expression>
</bgf:production>
</vertical>
</xbgf:add>
<xbgf:horizontal>
<nonterminal>TableCellParameter</nonterminal>
</xbgf:horizontal>
<!--
Special block special symbols, part 2: TableParameters.
Same steps as for TableCellParameter: the sequence
"? HTML table attributes ?" is removed and "html-table-attributes" is added
in its place.
-->
<xbgf:vertical>
<nonterminal>TableParameters</nonterminal>
</xbgf:vertical>
<xbgf:remove>
<vertical>
<bgf:production>
<nonterminal>TableParameters</nonterminal>
<bgf:expression>
<sequence>
<bgf:expression>
<nonterminal>?</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>HTML</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>table</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>attributes</nonterminal>
</bgf:expression>
<bgf:expression>
<nonterminal>?</nonterminal>
</bgf:expression>
</sequence>
</bgf:expression>
</bgf:production>
</vertical>
</xbgf:remove>
<xbgf:add>
<vertical>
<bgf:production>
<nonterminal>TableParameters</nonterminal>
<bgf:expression>
<nonterminal>html-table-attributes</nonterminal>
</bgf:expression>
</bgf:production>
</vertical>
</xbgf:add>
<xbgf:horizontal>
<nonterminal>TableParameters</nonterminal>
</xbgf:horizontal>
</xbgf:sequence>

0 comments on commit 2f6d19c

Please sign in to comment.