ISO C++ grammar added
git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@794 ab42f6e0-554d-0410-b580-99e487e6eeb2
grammarware committed May 7, 2010
1 parent 6ffba02 commit 6cdf991
Showing 8 changed files with 5,533 additions and 21 deletions.
4 changes: 3 additions & 1 deletion topics/extraction/bnf2bgf/lll2bgf.py
@@ -15,6 +15,7 @@
('**','DOUBLESTAR'),
('*=','MULTIPLICATIONASSIGNMENT'),
('*','STAR'),
('.*','DOTSTAR'),
('++','DOUBLEPLUS'),
('+=','ADDITIONASSIGNMENT'),
('+','PLUS'),
@@ -23,7 +24,8 @@
('(','LEFTPARENTHESIS'),
(')','RIGHTPARENTHESIS'),
('{','LEFTCURLYBRACKET'),
('}','RIGHTCURLYBRACKET')
('}','RIGHTCURLYBRACKET'),
('->*','ARROWSTAR')
)

# these special symbols get transformed into HTML entities
104 changes: 84 additions & 20 deletions topics/extraction/bnf2bgf/pdf2lll.py
@@ -9,18 +9,30 @@
double = {}
current = ''
keys=[]
reported = ['identifier','keyword','literal']
ignored = ['identifier','keyword','literal','string-literal']
reported = []
punctuators = [[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[],[]]


#bannedLines = ('44','45','46',"Annex A","SPECIFICATION","A.2.")
bannedLines = []
knownTerminals = []

def quote(x):
return '"'+x+'"'

def unquote(x):
if x[0]=='"' and x[-1]=='"':
return x[1:-1]
else:
return x

def assignNewCurrent(c):
global current
if c not in keys:
keys.append(c)
current = c
d = performReplacements(c)
if d not in keys:
keys.append(d)
current = d

def readBannedLinesList(f):
lst = open(f,'r')
@@ -34,56 +46,91 @@ def readTerminalsList(f):
for kw in ' '.join(lst.readlines()).split():
knownTerminals.append(kw)
lst.close()
print knownTerminals
#print knownTerminals
for kw in knownTerminals:
if not kw.isalpha():
try:
punctuators[len(kw)-1].append(kw)
except IndexError,e:
print 'index error with',kw,len(kw)
punctuators.reverse()
print 'Punctuators:',punctuators

knownPostfixes = ('+','*','?')

knownReplacements = \
(
('opt',' OPTIONALITYMETASYMBOL'),
('–','"-"')
('–','"-"'),
('˜','"~"'),
('ﬁ','fi'),
('[',' ['),
('(',' ('),
(']',' ]'),
(')',' )'),
)

oneof = False

def processline(line):
global oneof
global current
global oneof
rline = line.strip()
if rline == '':
return ''
if rline[-1]==':' and rline[-2].isalpha():
oneof = False
assignNewCurrent(rline[:-1])
# getting rid of leading stuff (perhaps labels)
assignNewCurrent(rline[:-1].split()[-1])
if current in grammar.keys():
#print 'Warning: double declaration of',current
double[current] = grammar[current][:]
grammar[current]=[]
grammar[current] = []
return
if rline.find('one of')>0:
oneof = True
assignNewCurrent(rline.replace('one of','').strip()[:-1])
assignNewCurrent(rline.replace('one of','').strip()[:-1].split()[-1])
if current in grammar.keys():
#print 'Warning: double declaration of',current,': the first one',grammar[current],'discarded'
double[current] = grammar[current][:]
grammar[current]=[]
grammar[current] = []
return
if oneof:
for t in processLineTokens(rline):
grammar[current].append(t)
for t in rline.split():
grammar[current].append(' '.join(processLineTokens(t)))
#for t in processLineTokens(rline):
# grammar[current].append(t)
else:
grammar[current].append(' '.join(processLineTokens(rline)))
return

def processLineTokens(rline):
iline = rline[:]
def performReplacements(line):
for x,y in knownReplacements:
iline = iline.replace(x,y)
line = line.replace(x,y)
return line

def processLineTokens(rline):
iline = performReplacements(rline)
tokens = iline.split()
for i in range(0,len(tokens)):
if tokens[i] in knownTerminals:
tokens[i] = '"'+tokens[i]+'"'
tokens[i] = quote(tokens[i])
tokens[i] = splitLeading(tokens[i],punctuators)
return tokens

def splitLeading(t,arrays):
for ps in arrays:
for p in ps:
if t.find(p)==0:
t = quote(p)+' '+' '.join(processLineTokens(t[len(p):]))
return t

def splitTrailing(t,array):
for p in array:
if t.find(p)>-1 and t.find(p)==len(t)-len(p):
#print 'Found',p,'in',t,'at',t.find(p),'- result at {'+(t[:-len(p)])+' "'+p+'"'+'}'
t = t[:-len(p)]+' '+quote(p)
return t

def readLines(f):
print 'Reading the PDF lines...'
@@ -126,7 +173,7 @@ def massageGrammarRule(context,nt):
tokens = context[nt][i].split()
# special case: a postfix metasymbol (e.g., *) occurs at the beginning of the line
if tokens[0] in knownPostfixes:
tokens[0] = '"'+tokens[0]+'"'
tokens[0] = quote(tokens[0])
# special case: arithmetic operations versus context metasymbols
if len(tokens) == 3 and tokens[1] == '*' and tokens[0]+' "/" '+tokens[2] in context[nt]:
print 'A suspicious metasymbol * converted to an arithmetic operator'
@@ -143,9 +190,26 @@
# REPORTING undefined nonterminals
if tokens[j][0] != '"'\
and tokens[j] not in grammar.keys()\
and tokens[j] not in ignored\
and tokens[j] not in reported:
print 'Warning: nonterminal',tokens[j],'undefined!'
reported.append(tokens[j])
ts = splitLeading(tokens[j],[knownTerminals])
if unquote(ts.split()[-1]) in grammar.keys():
# false positive in nonterminal -> terminal conversion
tss = ts.split()
tss[-1] = unquote(tss[-1])
ts = ' '.join(tss)
if ts.find(' ')>-1 and (ts.split()[-1] in grammar.keys() or ts.split()[-1] in ignored):
print 'L-Splitting',tokens[j],'into',ts
tokens[j] = ts
else:
print 'NOT L-splitting',tokens[j],'into',ts
ts = splitTrailing(tokens[j],knownTerminals)
if ts.find(' ')>-1 and (ts.split()[0] in grammar.keys() or ts.split()[-1] in ignored):
print 'T-Splitting',tokens[j],'into',ts
tokens[j] = ts
else:
print 'Warning: nonterminal',tokens[j],'undefined, but used in',nt
reported.append(tokens[j])
#if tokens[j] not in knownNonterminals:
# tokens[j]='"'+tokens[j]+'"'
# nt2t += 1
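
To make the new leading/trailing repair above easier to follow, here is a minimal standalone sketch (not part of the commit) of how splitLeading and splitTrailing are meant to behave on fused PDF tokens. The terminal list is only an assumed sample, the real splitLeading takes the length-grouped punctuator table, and both real functions additionally recurse through processLineTokens:

def quote(x):
    return '"'+x+'"'

def splitLeading(t,terminals):
    # peel one known terminal off the front of a fused token, e.g. '{namespace-body'
    for p in terminals:
        if t.startswith(p) and len(t)>len(p):
            return quote(p)+' '+t[len(p):]
    return t

def splitTrailing(t,terminals):
    # peel one known terminal off the end of a fused token, e.g. 'expression;'
    for p in terminals:
        if t.endswith(p) and len(t)>len(p):
            return t[:-len(p)]+' '+quote(p)
    return t

sample = ['namespace',';','{','}']              # assumed subset of keywords.lst and the punctuators
print(splitLeading('{namespace-body',sample))   # "{" namespace-body
print(splitTrailing('expression;',sample))      # expression ";"
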
13 changes: 13 additions & 0 deletions topics/grammars/cpp/iso-14882-1998/Makefile
@@ -0,0 +1,13 @@
build:
../../../../shared/tools/pdf2bgf iso-is-annex.txt iso-output.bgf banned-lines.lst keywords.lst
../../../../shared/tools/checkxml bgf iso-output.bgf
../../../../shared/tools/normbgf iso-output.bgf iso-is-grammar.bgf
../../../../shared/tools/bgf2bnf iso-is-grammar.bgf iso-is.bnf
rm -f iso-output.bgf

clean:
rm -f iso-output.bgf iso-is-grammar.bgf iso-is.bnf

test:
make build
../../../../shared/tools/gdts iso-is-grammar.bgf extracted-grammar.bgf
32 changes: 32 additions & 0 deletions topics/grammars/cpp/iso-14882-1998/README.txt
@@ -0,0 +1,32 @@
INTERNATIONAL STANDARD ISO/IEC 14882
First edition
1998-09-01

Programming languages — C++

Reference number
ISO/IEC 14882:1998(E)

© ISO/IEC 1999
All rights reserved. Unless otherwise specified, no part of this publication may be reproduced or utilized in any form or by any means, electronic
or mechanical, including photocopying and microfilm, without permission in writing from either ISO at the address below or ISO's member body
in the country of the requester.
ISO copyright office
Case postale 56 • CH-1211 Geneva 20
Tel. + 41 22 749 01 11
Fax + 41 22 734 10 79
E-mail copyright@iso.ch
Web www.iso.ch
Printed in Switzerland

Fixes that were necessary for extraction:

namespaceidentifier{namespace-body }
was changed to
namespaceidentifier {namespace-body }
(the extractor cannot be expected to split this into a sequence
of a terminal, a reserved nonterminal, a non-alphanum terminal and a defined nonterminal)

There are various layout problems (say, ";" formatted in italics),
but we don't even care because we copy-paste the raw text anyway.
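
A small self-contained illustration (assumed names, not code from the repository) of why the extra space is enough: after the line is split on whitespace, each remaining fused token needs only one leading terminal peeled off, whereas the original token would need several splits in a row:

KEYWORDS = ['namespace']                        # assumed sample of keywords.lst
PUNCTUATORS = ['{','}']                         # assumed sample of the punctuator table
NONTERMINALS = ['identifier','namespace-body']  # nonterminals known to the grammar

def recover(token):
    # a single leading split, in the spirit of splitLeading in pdf2lll.py
    for p in KEYWORDS+PUNCTUATORS:
        if token.startswith(p) and token[len(p):] in NONTERMINALS:
            return '"'+p+'" '+token[len(p):]
    return token   # give up: the token stays as-is and is later reported as undefined

print(recover('namespaceidentifier'))                  # "namespace" identifier
print(recover('{namespace-body'))                      # "{" namespace-body
print(recover('namespaceidentifier{namespace-body'))   # unchanged: too many pieces in one token
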

5 changes: 5 additions & 0 deletions topics/grammars/cpp/iso-14882-1998/banned-lines.lst
@@ -0,0 +1,5 @@
14882:1998
A.
[gram.
67
68
