Skip to content

Commit

Permalink
Match identical and equivalent objects
Browse files Browse the repository at this point in the history
  • Loading branch information
grammarware committed Feb 24, 2013
1 parent cb2c4cb commit 584f417
Showing 1 changed file with 122 additions and 4 deletions.
126 changes: 122 additions & 4 deletions Apeldoorn/extract.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,25 @@

db = []

f = open('monumentenlijst_20120117.txt', 'r')
def allnumbersof(s):
    """Split a house-number list into individual house-number strings.

    ``s`` is cut on ',', '-' and '/' (all three are treated as plain
    separators; a '-' is NOT expanded into a numeric range) and every piece
    is stripped of surrounding whitespace.

    A piece that starts with digits immediately followed by two or more
    lowercase letters is taken to be one number shared by several letter
    suffixes and is expanded: '12ab' becomes '12a' and '12b'.  Every other
    piece (plain numbers, single-suffix numbers such as '7a', full
    addresses, empty strings) is returned unchanged.

    The result is never empty: ``str.split`` always yields at least one
    piece and each piece contributes at least one entry, so empty pieces
    are kept on purpose rather than filtered out.

    Unlike earlier revisions this helper no longer prints debug output; it
    only returns the list.
    """
    # Local import so the helper does not depend on the file's import block.
    import re

    numbers = []
    for piece in s.replace(',', '/').replace('-', '/').split('/'):
        piece = piece.strip()
        # re.match anchors at the start only, exactly like the original test.
        if not re.match(r'[0-9]+[a-z][a-z]+', piece):
            numbers.append(piece)
            continue
        # Accumulate the digits seen so far and emit one entry per non-digit
        # character, each prefixed with those digits.
        digits = ''
        for ch in piece:
            if ch.isdigit():
                digits += ch
            else:
                numbers.append(digits + ch)
    return numbers

f = open('tampered-monumentenlijst.txt', 'r')
cur = ''
for line in f.readlines():
if not line.strip():
Expand All @@ -18,7 +36,7 @@
f.close()
print 'Found',len(db),'entries'

f = open('monuments.txt', 'w')
ws = []
# "gm# aluminiumweg 31 1 object met nr 33 Apeldoorn",
# towns = []
for e in db:
Expand All @@ -28,7 +46,8 @@
if status in ('k#', 'K#'):
continue
rest = e[len(status):].strip()
town = addr = ''
town = ''
addr = []
# TODO: ' ' -> ' '; WenumWiesel -> Wenum-Wiesel
# town = rest.strip().split(' ')[-1]
# if town and town.isalpha() and town[0].isupper():
Expand All @@ -46,7 +65,106 @@
town = 'Radio Kootwijk'
# print 'What is a city of',rest
# sys.exit(1)
w = [status, addr, rest, town]
# Split the remaining entry text (`rest`) into an address part and an object
# description part, then pull an optional "1 object met ..." cross-reference
# out of the description.
# NOTE(review): indentation was lost in this copy of the file; the nesting
# described in these comments is inferred from the statements themselves.
obj = []
# `a` becomes True once the first all-digit word has been consumed.  Because
# the `continue` below is taken before the toggle is reached again, it never
# flips back: words up to and including the first number go to `addr`, every
# later word goes to `obj`.
a = False
for word in rest.split():
if a:
obj.append(word)
continue
addr.append(word)
if word.isdigit():
a = not a
# Quick fix: a single alphanumeric character right after the number is glued
# onto the last address word and removed from the object words, unless the
# next word is 'object'.
# NOTE(review): presumably this is a house-number suffix such as "31 a";
# confirm against the data.
if len(obj)>0 and len(obj[0])==1 and obj[0].isalnum() and (len(obj)<2 or obj[1]!='object'):
addr[-1] += obj[0]
obj = obj[1:]
# print 'Fixed %s and %s.' % (addr,obj)
addr = ' '.join(addr)
obj = ' '.join(obj)
# Normalise spelling variants of the phrase "object met" ("mer", "men",
# "i object", "obj.met") to the canonical "1 object met" wording.
obj = obj.replace('object mer','object met').replace('object men','object met').replace('i object','1 object').replace('obj.met','object met')
# Extract the cross-reference that follows "1 object met" into `match`.
match = ''
if obj.find('1 object met')>-1:
# NOTE(review): the two-name unpacking raises ValueError when the phrase
# occurs more than once in `obj`.
obj,match = obj.split('1 object met')
# Drop the first character of the reference (presumably the separating space).
match = match[1:]
obj = obj.strip()
if obj.endswith(','):
obj = obj[:-1]
# Text after the first '.' in the reference is moved back into the
# description; only the part before it stays in `match`.
if match.find('.')>-1:
ax = match.split('.')
match = ax[0]
obj += ' ' + '.'.join(ax[1:])
# print '"%s" ==> "%s"' % (obj,match)
# if obj[-5:-2] == ['1','object','met']:
# print 'MATCH!'
# print obj[-5:]
# 'object' occurs somewhere after position 0 but not as the canonical phrase:
# report it for manual inspection.
elif obj.find('object')>0:
print '???',obj
# Record layout: [status, address, description, cross-reference, town].
w = [status, addr, obj, match, town]
# Only entries whose status is 'gm#' are collected in `ws`.
if status == 'gm#':
ws.append(w)

# rewriting to match
# Repeatedly scan `ws` for a record whose cross-reference field (index 3) is
# non-empty, resolve that reference against the addresses (index 1) of the
# other records, merge what is found, and start over.  The outer loop ends
# after a full scan in which no record needed processing.
# Record layout is [status, address, description, cross-reference, town].
# NOTE(review): indentation was lost in this copy of the file; the nesting
# described in these comments is inferred from the statements themselves.
fixed = False
while not fixed:
# `done` is set once a record has been processed in this pass.
done = False
for w in ws:
if done:
# `ws` was modified while being iterated (remove/append below), so the scan
# is abandoned here and restarted by the enclosing while loop.
break
if w[3]!='':
print ' ',w
look = w[3].lower()
# "nr <n>": substitute the street of this record (its address minus the last
# word) to obtain a full address to look for.
# NOTE(review): str.replace substitutes every occurrence of 'nr', not only
# the leading one, and the substituted street is not lowercased although the
# comparison further down is against a lowercased address - confirm.
if look.startswith('nr '):
look = look.replace('nr',' '.join(w[1].split()[:-1]))
# Only the part before the first ' en ' is kept.
if look.find(' en ')>-1:
look = look.split(' en ')[0]
# "nrs <list>": expand the list into one full address per listed number.
if look.startswith('nrs '):
# default street
s = ' '.join(w[1].split()[:-1])
lookfor = []
for x in allnumbersof(look[3:].strip()):
x = x.strip()
# All-digit items and items shorter than 5 characters are treated as bare
# house numbers on the current default street.
if x.isdigit() or len(x)<5:
lookfor.append('%s %s' % (s,x))
else:
# Anything else is used as a complete address, and its street (all but the
# last space-separated word) becomes the default for the following items.
lookfor.append(x)
s = ' '.join(x.split(' ')[:-1])
print lookfor
# lookfor = [look]
else:
lookfor = [look]
# Work on a shallow copy and take the original out of the list; the merged
# copy is appended again below.
neww = w[:]
ws.remove(w)
for look in lookfor:
# Find a record whose lowercased address equals the wanted address; when
# several match, the last one wins because the loop does not break.
fnd = None
for u in ws:
if u[1].lower() == look:
fnd = u
if fnd:
print 'Found',fnd
# Status, description and town are expected to agree; a mismatch is only
# reported, the merge still happens.
if fnd[0]!=neww[0] or fnd[2]!=neww[2] or fnd[4]!=neww[4]:
print 'Do not match, proceeding with caution'
# sys.exit()
# Absorb the found record: drop it from the list and append its address.
ws.remove(fnd)
neww[1] += '/' + fnd[1]
neww[3] = ''
else:
print 'Not found: "%s"' % look
# fake it till you make it!
# No such record: append the looked-for address anyway and clear the
# reference so this record is not picked up again.
neww[1] += '/' + look
neww[3] = ''
# fixed = True
# sys.exit()
ws.append(neww)
done = True
# A complete pass without processing any record means nothing is left to do.
if not done:
fixed = True

# saving
# Write every record in `ws` to monuments.txt, one per line, formatted as the
# record's list representation wrapped in double quotes and followed by a
# comma.  The `with` block closes the file even if a write raises.
with open('monuments.txt', 'w') as f:
    for w in ws:
        f.write('"%s",\n' % w)

Expand Down

0 comments on commit 584f417

Please sign in to comment.