diff --git a/topics/extraction/rascal/extract.py b/topics/extraction/rascal/extract.py index f1548f8b..5a6b0ff4 100755 --- a/topics/extraction/rascal/extract.py +++ b/topics/extraction/rascal/extract.py @@ -32,8 +32,12 @@ def parseGroup(g): grammar = {} # in tokens nt = '' cx = 0 - for line in rsc.readlines(): - line = line.strip() + lines = rsc.readlines() + discardBracket = 0 + grammarkeys = [] + endOfAlt = False + while cx < len(lines): + line = lines[cx].strip() cx += 1 if line == '': continue @@ -46,53 +50,89 @@ def parseGroup(g): print '['+str(cx)+']','Found first line, but of what nonterminal?' continue tokens = line[1:].split() - if len(tokens[-1])>1 and tokens[-1][-1] == ';': + if len(tokens) == 0: + continue + endOfAlt = True + if len(tokens[-1])>0 and tokens[-1][-1] == ';': tokens[-1] = tokens[-1][:-1] tokens.append(';') for i in range(0,len(tokens)): if tokens[i].find('[')>-1 and tokens[i].find(']')>-1: # treating a parametrised nonterminal as a base nonterminal tokens[i] = tokens[i][:tokens[i].index('[')] + tokens[i][tokens[i].index(']')+1:] - grammar[nt].append(tokens) - continue + #grammar[nt].append(tokens) + #continue + # fall down elif line[0] == '#': # "follow" - pass + continue elif line[0] == '-': # "reject" - pass + continue elif line[0:2] == '//': # comment continue else: tokens = line.split() - if tokens[0] == 'module': - print '['+str(cx)+']','Parsing module',tokens[1] - # do something with the module name! + # main branching down, fall down point here + if len(tokens) == 0: + continue + if tokens[0] == 'module': + print '['+str(cx)+']','Parsing module',tokens[1] + # do something with the module name! + continue + if tokens[0] == 'start': + start.append(tokens[-1]) + tokens = tokens[1:] + # fall through + if tokens[0] in ('syntax','layout'): + nt = tokens[1] + if nt[0] == '"': + print '['+str(cx)+']','Cannot include lexical restriction information about',nt + nt = '' + while cx < len(lines) and lines[cx].strip() != ';': + cx += 1 continue - if tokens[0] == 'start': - start.append(tokens[-1]) - tokens = tokens[1:] - # fall through - if tokens[0] in ('syntax','layout'): - nt = tokens[1] - if nt[0] == '"': - print '['+str(cx)+']','Cannot include lexical restriction information about',nt - nt = '' - continue - if nt.find('[')>-1 and nt.find(']')>-1: - # treating a parametrised nonterminal as a base nonterminal - nt = nt[:nt.index('[')] + nt[nt.index(']')+1:] - print '['+str(cx)+']','Starting to treat nonterminal',nt + if nt.find('[')>-1 and nt.find(']')>-1: + # treating a parametrised nonterminal as a base nonterminal + nt = nt[:nt.index('[')] + nt[nt.index(']')+1:] + print '['+str(cx)+']','Starting to treat nonterminal',nt + if nt in grammarkeys: + print '['+str(cx)+']','Duplicate or partial definition of nonterminal',nt + endOfAlt = True + else: grammar[nt] = [] + grammarkeys.append(nt) + # in case there are more tokens on the same line, we proceed + if len(tokens) > 3: + tokens = tokens[3:] + else: continue - if tokens[0] == ';' and len(tokens) == 1: - continue - # give up - print '['+str(cx)+']','What is',line.split(),'?' - if nt: + if len(tokens) == 1 and tokens[0] == ';': + continue + while len(tokens) > 0 and tokens[0] in ('left','right','non-assoc'): + tokens = tokens[1:] + print 'Skipped a modifier',tokens[0],'at',nt + if len(tokens) > 0: + if tokens[0] == '(': + discardBracket += 1 + tokens = tokens [1:] + elif tokens[0] == ')' and discardBracket > 0: + discardBracket -= 1 + tokens = tokens [1:] + if len(tokens) == 0: + continue + # give up + print '['+str(cx)+']','What is',tokens,'- a part of',nt,'?' + if nt: + if endOfAlt: grammar[nt].append(tokens) - pass + endOfAlt = False + elif len(grammar[nt]) > 0: + grammar[nt][-1].extend(tokens) + else: + grammar[nt].append(tokens) + pass # NOW TO PROCESS TOKENS #print 'Command:' #for s in grammar['Command']: @@ -100,7 +140,11 @@ def parseGroup(g): bgf = BGF.Grammar() prevline = [] curly = 0 - for nt in grammar.keys(): + # going through the sorted list of nonterminals + print grammarkeys + for nt in grammarkeys: + print nt,'::=',grammar[nt] + for nt in grammarkeys: for alt in grammar[nt]: if prevline: # dead code yet @@ -116,6 +160,7 @@ def parseGroup(g): alt = [] else: alt = alt[1:] + print 'left:',alt if not alt: continue while len(alt)>0 and alt[-1] in (';',''): @@ -146,7 +191,10 @@ def parseGroup(g): sym = None #print '['+str(cx)+']',alt while cx0: if alt[cx] == '{': curly += 1 @@ -221,6 +269,7 @@ def parseGroup(g): term = alt[cx][1:-1].replace('\\\\','\\').replace('\\>','>').replace('\\<','<').replace('\\\'','\'').replace('\\"','"') sym.setName(term) cx +=1 + print 'Found a terminal',term continue if alt[cx][0] == '[' or alt[cx][:2] == '![': # not quite correct @@ -285,6 +334,7 @@ def parseGroup(g): prod.setExpr(seq) #print str(prod) bgf.addProd(prod) - #print str(bgf) + #for p in bgf.prods: + # print str(p) ET.ElementTree(bgf.getXml()).write(sys.argv[2]) sys.exit(0) diff --git a/topics/recovery/hunter/hunter.py b/topics/recovery/hunter/hunter.py index 35d5d17f..53eb59c0 100755 --- a/topics/recovery/hunter/hunter.py +++ b/topics/recovery/hunter/hunter.py @@ -2,7 +2,7 @@ # -*- coding: utf-8 -*- import os, sys import xml.etree.ElementTree as ET -sys.path.append(os.getcwd().split('slps')[0]+'slps/shared/python') +sys.path.append(os.getcwd().split('projects')[0]+'projects/slps/shared/python') import BGF3 from functools import reduce @@ -17,6 +17,7 @@ ignore_tokens = [] ignore_lines = [] nonterminals_alphabet = ['-','_'] +nonterminals_start = [] multiples = [] aliases = {} @@ -44,6 +45,7 @@ ] specials = \ [ + 'POSSIBLE-TERMINATOR-SYMBOL', 'CONCATENATE-SYMBOL', 'LINE-CONTINUATION-SYMBOL' 'START-TERMINAL-SYMBOL', @@ -108,11 +110,14 @@ def removeComments(ts,s,e): return ts def splitTokenStreamByAlphas(s): + global debug ts = [s[0]] i = 1 alpha = isAlphaNum(s[0]) inQuotes = False while (i-1 and ts[j] in ignore_tokens: j -= 1 - if isAlphaNum(ts[j]) or (ts[j][0]==config['start-terminal-symbol'] and ts[j][-1]==config['end-terminal-symbol'] and isAlphaNum(ts[j][1:-1])): + if isAlphaNum(ts[j]) \ + or (ts[j][0]==config['start-terminal-symbol'] and ts[j][-1]==config['end-terminal-symbol'] and isAlphaNum(ts[j][1:-1]))\ + or 'start-nonterminal-symbol' in config.keys() and 'end-nonterminal-symbol' in config.keys() and (ts[j][0]==config['start-nonterminal-symbol'] and ts[j][-1]==config['end-nonterminal-symbol'] and isAlphaNum(ts[j][1:-1])): poss.append(i) poss.append(len(ts)+1) - #print('Positions:',poss) + if debug: + print('Positions:',poss) for i in range(0,len(poss)-1): if 'end-label-symbol' in config.keys(): if ts[poss[i]-2] == config['end-label-symbol']: @@ -328,7 +346,7 @@ def useDefinitionSeparatorSymbol(ts,d): alts.append(ts[poss[i]+1:poss[i+1]]) return alts -def findMostProbableTail(ps): +def findMostProbableTail(ps,known): # bucket sort ss = calculateFrequencies(map(lambda x:x[-1],ps)) # at least 80% has the same end symbol? @@ -337,17 +355,17 @@ def findMostProbableTail(ps): m = max(vs) vs.remove(m) m2 = max(vs) + for k in ss.keys(): + if ss[k] == m: + break #print('m=',m,'m2=',m2,'len(ps)=',len(ps)) - if m < max(0.25*len(ps),2*m2): + if k != known and m < max(0.25*len(ps),2*m2): possibles = [] for i in ss.keys(): if ss[i]>1: possibles.append((i,ss[i])) print('Candidates were:',possibles,'with total',len(ps)) return None,None,None - for k in ss.keys(): - if ss[k] == m: - break n2f = [] fps = [] cx = 0 @@ -380,6 +398,18 @@ def assembleBracketedSymbols(ts,start,end,preserveSpace): tss[-1] += ts[i] if ts[i] == end: inside = False + elif ts[i] == start: + # we do not allow nested bracketed symbols + print('STEP x ERROR: unbalanced bracketed metasymbols',repr(start),'and',repr(end)) + if preserveSpace: + last = tss[-1].split(' ') + tss[-1] = last[0] + tss[-1] += end + tss.extend(last[1:]) + #tss.append(ts[i]) + else: + tss[-1] += end + tss.append(ts[i]) else: tss.append(ts[i]) if ts[i] == start: @@ -635,6 +665,8 @@ def map2expr(ss): if ss[i][1:-1] == '': print('Serialisation error: empty terminal, replaced with ""!') e.setName('""') + elif ss[i] == config['start-terminal-symbol']+'EPSILON'+config['end-terminal-symbol']: + e = BGF3.Epsilon() else: e.setName(ss[i][1:-1]) es.append(e) @@ -1056,7 +1088,10 @@ def useTerminatorToFixProds(ps,ts): nps = [] for p in ps: if ts not in p: - print('STEP 4 warning: a production is disregarded due to the lack of terminator symbol:',p) + #print('STEP 4 warning: a production is disregarded due to the lack of terminator symbol:',p) + print('STEP 4 warning: a production for',p[1].strip(),'without terminator-symbol, appended one.') + p.append(ts) + print(p) while ts in p: i = p.index(ts) nps.append(p[:i]) @@ -1067,7 +1102,7 @@ def useTerminatorToFixProds(ps,ts): while x in tail: tail.remove(x) if len(tail)>0: - print('STEP 4 problem: terminator symbol without proper defining symbol context.',tail) + print('STEP 4 problem: terminator-symbol without proper defining-symbol context.',tail) return nps else: p = tail @@ -1115,6 +1150,10 @@ def considerIndentation(ts): def convertNonalphanumerics2Terminals(p): q = p[:2] for x in p[2:]: + #print('Checking',repr(x)) + if x == '' or x in ignore_tokens: + # if we end up here, it probably indicates a bug elsewhere + continue if 'consider-indentation' in config.keys(): # TODO: make compatible with consider-indentation q.append(x) @@ -1138,9 +1177,13 @@ def convertNonalphanumerics2Terminals(p): # none of the above if x[0]==' ' or x[-1]==' ': x = x.strip() + if x in ('_','-') or x.isdigit(): + print('STEP 5 warning:',repr(x),'is assumed to be an invalid nonterminal name, converted to a terminal symbol.') + q.append(config['start-terminal-symbol'] + x + config['end-terminal-symbol']) + continue string = x[0] alpha = isAlphaNum(x[0]) - if alpha and not x[0].isalpha(): + if alpha and not (x[0].isalpha() or x[0] in nonterminals_start): print('STEP 5 warning: the first letter of',x,'does not seem right, will be separated.') alpha = False for s in x[1:]: @@ -1220,7 +1263,7 @@ def processLine(line,inside,chunks): if line.find(config['end-grammar-symbol'])>-1: inside = False line = line[:line.index(config['end-grammar-symbol'])] - if line.strip() != '': + if line != '': line,inside,chunks = processLine(line,True,chunks) return line,False,chunks else: @@ -1229,7 +1272,7 @@ def processLine(line,inside,chunks): if line.find(config['start-grammar-symbol'])>-1: inside = True line = line[line.index(config['start-grammar-symbol'])+len(config['start-grammar-symbol']):] - if line.strip() != '': + if line != '': return processLine(line,inside,chunks) return (line,inside,chunks) @@ -1257,6 +1300,9 @@ def processLine(line,inside,chunks): print('STEP 0 found',len(lines),'in grammar chunks between designated delimiters.') if debug: print('Perceived lines:',lines) + if len(lines) == 0: + print('FINAL STEP: premature exit due to the lack of any grammar chunks.') + sys.exit(0) if 'line-continuation-symbol' in config.keys(): if 'concatenate-symbol' in config.keys(): sep = config['concatenate-symbol'] @@ -1435,12 +1481,15 @@ def processLine(line,inside,chunks): if 'terminator-symbol' in config.keys(): # we do have the terminator, but suppose we also had defining symbol! # TODO otherwise - ts = findCommonTail(prods[:-1]) + if len(prods) > 1: + ts = findCommonTail(prods[:-1]) + else: + ts = prods[0][-1] if ts: need2fix = [-1] prob = 100 else: - (need2fix,ts,prob) = findMostProbableTail(prods) + (need2fix,ts,prob) = findMostProbableTail(prods,config['terminator-symbol']) if ''.join(ts) == config['terminator-symbol']: print('STEP 4 confirmed terminator-symbol, congratulations!') else: @@ -1454,7 +1503,7 @@ def processLine(line,inside,chunks): config['terminator-symbol'] = ts need2fix = [-1] else: - (need2fix,ts,prob) = findMostProbableTail(prods) + (need2fix,ts,prob) = findMostProbableTail(prods,'') if ts: print('STEP 4 successful: inferred the most probable terminator-symbol:',repr(ts[0]),',','%i'%prob+'% sure') config['terminator-symbol'] = ts[0] @@ -1491,6 +1540,18 @@ def processLine(line,inside,chunks): prods = [list(filter(lambda y:y!=x,p)) for p in prods] if poststep4 > 0: print('STEP 4 also adjusted',poststep4,'productions that did not quite fit the expectations.') + if 'possible-terminator-symbol' in config.keys(): + no = yes = 0 + for i in range(0,len(prods)): + j = len(prods[i])-1 + while prods[i][j] in ignore_tokens: + j -= 1 + if prods[i][j] == config['possible-terminator-symbol']: + yes += 1 + prods[i] = prods[i][:j] + else: + no += 1 + print('STEP 4 found',yes,'productions using possible terminator symbol and',no,'productions not using it.') if debug: print('The grammar is perceived like this:') for p in prods: @@ -1623,7 +1684,10 @@ def processLine(line,inside,chunks): p = BGF3.Production() if 'disregard-labels' not in config.keys() and q[0]: p.setLabel(q[0]) - p.setNT(q[1]) + if 'start-nonterminal-symbol' in config.keys() and 'end-nonterminal-symbol' in config.keys(): + p.setNT(q[1][len(config['start-nonterminal-symbol']):-len(config['end-nonterminal-symbol'])]) + else: + p.setNT(q[1]) p.setExpr(map2expr(q[2:])) bgf.addProd(p) ET.ElementTree(bgf.getXml()).write(sys.argv[3]) diff --git a/topics/recovery/hunter/tests/2-start-end.src b/topics/recovery/hunter/tests/2-start-end.src index 92409089..48da22f9 100644 --- a/topics/recovery/hunter/tests/2-start-end.src +++ b/topics/recovery/hunter/tests/2-start-end.src @@ -1,3 +1,3 @@ -foo : “x y z” -bar : (x y z) -wez : “foo” +(foo) : “x y z” +(bar) : (x y z) +(wez) : “foo”