Skip to content

Commit

Permalink
changed the algorithm behind STEP 5 (decomposing symbols), added test…
Browse files Browse the repository at this point in the history
… cases that helped uncover subtle bugs

git-svn-id: https://slps.svn.sourceforge.net/svnroot/slps@1086 ab42f6e0-554d-0410-b580-99e487e6eeb2
  • Loading branch information
grammarware committed Jun 9, 2011
1 parent 8944eaa commit df76bbf
Show file tree
Hide file tree
Showing 33 changed files with 579 additions and 40 deletions.
1 change: 1 addition & 0 deletions topics/grammars/ada/lncs-4348/Makefile
@@ -1,5 +1,6 @@
extract:
cp src.syntax.summary.txt src.prepared.txt
cat src.semantics.txt >> src.prepared.txt
perl -pi -w -e 's/{\|/{TERMINALBAR/g;' src.prepared.txt
perl -pi -w -e 's/–/-/g;' src.prepared.txt
perl -pi -w -e 's/J\./0\./g;' src.prepared.txt
Expand Down
7 changes: 7 additions & 0 deletions topics/grammars/ada/lncs-4348/README.txt
Expand Up @@ -10,3 +10,10 @@ Annex P (informative): Syntax Summary, pages 675..692

The PDF is of rather poor quality; one needs to pay attention to the order of the lines when copy-pasting.
Some whitespace also gets messed up, but the extractor copes with it.


------------------------
src.semantics.txt was semi-automatically created to encompass the following rule from §1.1.4 (p.6,l.14):

If the name of any syntactic category starts with an italicized part, it is equivalent to the category name without the italicized part.
The italicized part is intended to convey some semantic information. For example subtype_name and task_name are both equivalent to name alone.
99 changes: 99 additions & 0 deletions topics/grammars/ada/lncs-4348/src.semantics.txt
@@ -0,0 +1,99 @@
675.1: non_quotation_mark_graphic_character ::= graphic_character

675.2: non_end_of_line_character ::= character

676.1: pragma_argument_identifier ::= identifier

676.2: subtype_name ::= name

677.1: static_expression ::= expression

677.2: parent_subtype_indication ::= subtype_indication

677.3: static_simple_expression ::= simple_expression

678.1: discrete_subtype_indication ::= subtype_indication

678.2: discriminant_selector_name ::= selector_name

678.3: discriminant_direct_name ::= direct_name

679.1: interface_subtype_mark ::= subtype_mark

680.1: component_selector_name ::= selector_name

682.1: label_statement_identifier ::= statement_identifier

682.2: variable_name ::= name

682.3: boolean_expression ::= expression

682.4: loop_statement_identifier ::= statement_identifier

682.5: loop_identifier ::= identifier

683.1: block_statement_identifier ::= statement_identifier

683.2: block_identifier ::= identifier

683.3: loop_name ::= name

683.4: label_name ::= name

684.1: procedure_name ::= name

684.2: procedure_prefix ::= prefix

684.3: function_name ::= name

684.4: function_prefix ::= prefix

684.5: formal_parameter_selector_name ::= selector_name

685.1: ancestor_subtype_indication ::= subtype_indication

685.2: package_name ::= name

685.3: object_name ::= name

685.4: exception_name ::= name

685.5: callable_entity_name ::= name

685.6: generic_package_name ::= name

685.7: generic_procedure_name ::= name

685.8: generic_function_name ::= name

685.9: task_identifier ::= identifier

686.1: protected_identifier ::= identifier

686.2: entry_direct_name ::= direct_name

686.3: entry_identifier ::= identifier

687.1: entry_name ::= name

687.2: delay_expression ::= expression

688.1: task_name ::= name

689.1: library_unit_name ::= name

689.2: string_expression ::= expression

690.1: generic_formal_parameter_selector_name ::= selector_name

690.2: subprogram_name ::= name

690.3: package_instance_name ::= name

691.1: first_subtype_local_name ::= local_name

692.1: component_local_name ::= local_name

692.2: restriction_identifier ::= identifier

692.3: restriction_parameter_identifier ::= identifier
25 changes: 13 additions & 12 deletions topics/recovery/csharp/README
@@ -1,15 +1,16 @@
See COPYING for copyright matters.
See http://www.cs.vu.nl/grammarware/browsable/CSharp for an on-line version.
See http://userpages.uni-koblenz.de/~zaytsev/text/toosharp.pdf for the paper:
See http://grammarware.net/text/2005/toosharp.pdf for the paper:

@inproceedings{TooSharp2005,
author = "Vadim Zaytsev",
title = "{Correct C\# Grammar too Sharp for ISO}",
booktitle = "{Pre-proceedings of the International Summer School on Generative and Transformational Techniques in Software Engineering (GTTSE 2005), Part II, Participants Workshop}",
year = 2005,
pages = "154--155",
address = "Braga, Portugal",
month = "July",
publisher = "Technical Report, TR-CCTC/DI-36, Universidade do Minho",
note = "Extended abstract",
}

@inproceedings{TooSharp,
author = "Vadim Zaytsev",
title = "{Correct C\# Grammar too Sharp for ISO}",
booktitle = "{International Summer School on Generative and Transformational Techniques in Software Engineering (GTTSE 2005), Part II, Participants Workshop}",
year = 2005,
pages = "154--155",
organization = "{Springer}",
address = "Braga, Portugal",
month = "July",
note = "Extended abstract",
}
128 changes: 100 additions & 28 deletions topics/recovery/hunter/hunter.py
Expand Up @@ -162,6 +162,41 @@ def readConfig(f):
if debug:
print('Ok',config)

# Frequency count (histogram) of the items of a sequence
def calculateFrequencies(arr):
    """Count how many times each item occurs in *arr*.

    arr may be any iterable of hashable items, including one-shot
    iterators such as the result of map() or filter().

    Returns a plain dict mapping every distinct item to its number of
    occurrences (on Python 3.7+ the keys keep first-occurrence order).
    """
    # Function-scope import keeps this block self-contained.
    from collections import Counter
    # Convert back to a plain dict so that looking up a missing key still
    # raises KeyError instead of silently yielding 0 as a Counter would.
    return dict(Counter(arr))

def bestFrequency(fs):
    """Return the most frequent key of *fs* together with its count.

    fs maps items to their occurrence counts. When several keys share the
    highest count, the one that comes first in the dict's iteration order
    is returned.

    Returns a (key, count) pair, or (None, 0) when fs is empty.
    """
    if not fs:
        # max() raises ValueError on an empty sequence; report "nothing
        # found" instead so that callers can test the returned key.
        return None, 0
    # max() yields the first maximal element, i.e. the first key (in
    # iteration order) that carries the highest count.
    bestk = max(fs, key=fs.get)
    return bestk, fs[bestk]

# Use terminator symbol to distinguish productions
def useTerminatorSymbol(ts,t):
    """Guess the defining symbol by slicing the token stream at a terminator.

    ts -- the token stream (an iterable of tokens)
    t  -- the token to be treated as the terminator symbol

    The stream is cut into chunks at every occurrence of t. The second
    token of each chunk is taken as a candidate defining symbol, and the
    most frequent candidate wins.

    Returns a pair (prob, symbol), where prob is the percentage of
    non-empty chunks whose second token is symbol. Returns (None, None)
    when the configuration declares label symbols (not supported yet) or
    when no chunk offers a candidate.
    """
    if 'start-label-symbol' in config or 'end-label-symbol' in config:
        print('STEP 4: guessing defining-symbol in a grammar with labels not implemented yet!')
        return None,None
    # Slice the stream: every terminator closes the current chunk.
    chunks = [[]]
    for x in ts:
        if x == t:
            chunks.append([])
        else:
            chunks[-1].append(x)
    # Empty chunks (e.g. after a trailing terminator) are not productions.
    chunks = [c for c in chunks if c]
    # A chunk needs at least two tokens to have a candidate symbol in
    # second position; empty-string tokens are not considered candidates.
    candidates = [c[1] for c in chunks if len(c) > 1 and c[1] != '']
    if not candidates:
        # Nothing to vote on: without this guard the frequency analysis
        # below would fail on an empty collection.
        return None,None
    bestds,bestdsvalue = bestFrequency(calculateFrequencies(candidates))
    # Certainty is measured against all non-empty chunks, including the
    # ones that were too short to contribute a candidate.
    prob = bestdsvalue * 100.0 / len(chunks)
    return prob,bestds

# Use defining symbol to distinguish productions
def useDefiningSymbol(ts,d):
poss = []
Expand Down Expand Up @@ -228,12 +263,7 @@ def useDefinitionSeparatorSymbol(ts,d):

def findMostProbableTail(ps):
# bucket sort
ss = {}
for s in map(lambda x:x[-1],ps):
if s not in ss.keys():
ss[s] = 1
else:
ss[s] += 1
ss = calculateFrequencies(map(lambda x:x[-1],ps))
# at least 80% has the same end symbol?
# TODO: describe the heuristic
vs = list(ss.values())
Expand Down Expand Up @@ -654,12 +684,14 @@ def assembleQualifiedNumbers(ts):
ds.append(x)
return ds

def splitString(s,kw):
def splitString(s,kw,defd):
# split s according to any kws while preserving them
if len(kw)==0:
return [s]
elif s in defd:
return [s]
elif s.find(kw[0])<0:
return splitString(s,kw[1:])
return splitString(s,kw[1:],defd)
else:
ss = s.split(kw[0])
done = []
Expand All @@ -673,19 +705,20 @@ def splitString(s,kw):
done = done[1:]
res = []
for a in done:
res.extend(splitString(a,kw[1:]))
res.extend(splitString(a,kw[1:],defd))
reject = False
if min(map(len,res))<2:
reject = True
if 'nonterminal-if-contains' in config.keys():
for y in res:
if y in kw or y.find(config['nonterminal-if-contains']) < 0:
if y in defd or y.find(config['nonterminal-if-contains']) < 0:
continue
else:
reject = True
#print('!!!!! Have to reject',res,'of',s)
#print(kw)
if reject:
return splitString(s,kw[1:])
return splitString(s,kw[1:],defd)
else:
return res

Expand Down Expand Up @@ -713,17 +746,20 @@ def decomposeSymbols(p,defd):
# q.append(x)
# continue
# none of the above: it is a nonterminal, it's not defined and we have no way to dismiss it
var = splitString(x,defd)
var = splitString(x,defd,defd)
#print(var)
if len(var)==1:
q.append(x)
continue
#print(x,'-->',var)
pos = True
for y in var:
if y in defd or ('nonterminal-if-contains' in config.keys() and y.find(config['nonterminal-if-contains']) < 0):
continue
else:
pos = False
# TODO: inspection! strange dead code follows
#for y in var:
# if y in defd:
# # or ('nonterminal-if-contains' in config.keys() and y.find(config['nonterminal-if-contains']) > -1):
# continue
# else:
# pos = False
if pos:
print('STEP 8:',x,'matches as',var)
q.extend(var)
Expand Down Expand Up @@ -979,20 +1015,51 @@ def considerIndentation(ts):
if 'nonterminals-may-contain-spaces' in config.keys() and 'concatenate-symbol' in config.keys():
# can only treat them together, because spaces in names without concatenation symbol are highly ambiguous
# and concatenation symbols are never used if nonterminal names do not have spaces
# TODO: not necessarily true, if you have both start-nonterminal-symbol and end-nonterminal-symbol!
tokens = reconsiderSpaces(tokens,config['concatenate-symbol'],config.values())
if 'defining-symbol' in config.keys():
prods = useDefiningSymbol(tokens,config['defining-symbol'])
else:
print('STEP 4 skipped, sorry: defining-symbol is not specified.')
# TODO
# STEP 4a.1: [sanity check] Infer terminator-symbol
if debug:
print('The grammar is perceived like this:')
for p in prods:
print('\t',p[1],'is defined as',p[2:])
need2fix = []
if 'defining-symbol' not in config.keys():
# STEP 4: we do not have defining-symbol, too bad
if 'terminator-symbol' in config.keys():
# STEP 4: at least the terminator-symbol is here, can work with that
print('STEP 4: using terminator-symbol to slice token stream into productions.')
prob,ds = useTerminatorSymbol(tokens,config['terminator-symbol'])
if ds:
if prob == 100:
print('STEP 4: inferred defining symbol is',repr(ds)+'.')
else:
print('STEP 4: the most probable defining symbol is',repr(ds),'with',str(int(prob))+'% certainty.')
config['defining-symbol'] = ds
else:
print('STEP 4 skipped, sorry: could not infer defining-symbol.')
print(ds,tokens)
sys.exit(-1)
else:
# STEP 4: we're screwed
print('STEP 4 in a pinch: neither defining-symbol nor terminator-symbol are specified!')
popular = calculateFrequencies(tokens)
highest = max(popular.values())
solution = ['','',0]
for sym in popular.keys():
if popular[sym]>2:
# TODO: threshold justification
prob,ds = useTerminatorSymbol(tokens,sym)
if ds:
print('STEP 4 could have gone for terminator-symbol',repr(sym),'('+str(int(100.0*popular[sym]/highest))+'%) and defining-symbol',repr(ds),'('+str(int(prob))+'%)...')
if prob*popular[sym]/highest > solution[2]:
solution = sym,ds,prob*popular[sym]/highest
if solution[2] < 50:
print('STEP 4 skipped, sorry: inference failed.')
sys.exit(-1)
else:
config['defining-symbol'] = solution[1]
print('STEP 4 assumes defining-symbol is',repr(solution[1])+'.')
# STEP 4: we do now have defining-symbol, yay!
print('STEP 4: using defining-symbol to slice token stream into productions.')
prods = useDefiningSymbol(tokens,config['defining-symbol'])
print('STEP 4: inferring terminator-symbol by looking at the productions.')
if 'terminator-symbol' in config.keys():
# we do have the terminator, but suppose we also had definition symbol!
# we do have the terminator, but suppose we also had defining symbol!
# TODO otherwise
ts = findCommonTail(prods[:-1])
if ts:
Expand Down Expand Up @@ -1024,6 +1091,11 @@ def considerIndentation(ts):
print('%40s'%p[1],'>>>>>>',p[-2:])
config['terminator-symbol'] = ''
need2fix = []
# STEP 4a.1: [sanity check] Infer terminator-symbol
if debug:
print('The grammar is perceived like this:')
for p in prods:
print('\t',p[1],'is defined as',p[2:])
# STEP 4a.2: adjusting the terminator-symbol on the unfit productions
poststep4 = 0
if debug:
Expand Down
24 changes: 24 additions & 0 deletions topics/recovery/hunter/tests/3-qualified.bgf
@@ -0,0 +1,24 @@
<?xml version="1.0" encoding="UTF-8"?>
<bgf:grammar xmlns:bgf="http://planet-sl.org/bgf">
<bgf:production>
<label>1.1.1</label>
<nonterminal>foo</nonterminal>
<bgf:expression>
<nonterminal>bar</nonterminal>
</bgf:expression>
</bgf:production>
<bgf:production>
<label>1.1.2</label>
<nonterminal>bar</nonterminal>
<bgf:expression>
<nonterminal>wez</nonterminal>
</bgf:expression>
</bgf:production>
<bgf:production>
<label>2.1.3</label>
<nonterminal>wez</nonterminal>
<bgf:expression>
<nonterminal>foo</nonterminal>
</bgf:expression>
</bgf:production>
</bgf:grammar>
6 changes: 6 additions & 0 deletions topics/recovery/hunter/tests/3-qualified.edd
@@ -0,0 +1,6 @@
<?xml version="1.0" encoding="UTF-8"?>
<edd:config xmlns:edd="http://planet-sl.org/edd">
<end-label-symbol>:\n</end-label-symbol>
<defining-symbol>::=</defining-symbol>
<terminator-symbol>END\n\n</terminator-symbol>
</edd:config>
9 changes: 9 additions & 0 deletions topics/recovery/hunter/tests/3-qualified.src
@@ -0,0 +1,9 @@
1.1.1:
foo ::= bar END

1.1.2:
bar ::= wez END

2.1.3:
wez ::= foo END

0 comments on commit df76bbf

Please sign in to comment.