In [1]:
import xml.etree.cElementTree as ET
from collections import defaultdict
import re
import pprint

- audit the OSMFILE and change the variable 'mapping' to reflect the changes needed to fix 
    the unexpected street types to the appropriate ones in the expected list.
    You have to add mappings only for the actual problems you find in this OSMFILE,
    not a generalized solution, since that may and will depend on the particular area you are auditing.
- write the update_name function, to actually fix the street name.
    The function takes a string with street name as an argument and should return the fixed name
   

In [2]:
OSMFILE = "cuxhaven_sample_50.osm"
#street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)
street_type_re = re.compile(r'\b', re.IGNORECASE)

# UPDATE THIS VARIABLE
streetmapping = { "St": "Street",
            "St.": "Street",
            "Ave": "Avenue",
            "Rd.": "Road"
            }


def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type = m.group()
        street_types[street_type].add(street_name)


def is_website(elem):
    return (elem.attrib['k'] == "website")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_website(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

#pprint.pprint(dict(audit(OSMFILE))) # print the existing names

def update_name(name, streetmapping):
    m = street_type_re.search(name)
    better_name = name
    if m:
        better_street_type = streetmapping[m.group()]
        better_name = street_type_re.sub(better_street_type, name)
    return better_name

In [3]:
pprint.pprint(dict(audit(OSMFILE))) # print the existing names

{'': {'http://conath-immobilien.de',
      'http://cuxpedia.de/index.php?title=Johannesbrunnen',
      'http://denkmalpflege.bremen.de/sixcms/detail.php?template=20_denkmal_wrapper_d&obj=00001632',
      'http://denkmalpflege.bremen.de/sixcms/detail.php?template=20_denkmal_wrapper_d&obj=00003086',
      'http://denkmalpflege.bremen.de/sixcms/detail.php?template=20_denkmal_wrapper_d&obj=00003279',
      'http://hallopizza.de/',
      'http://opi-buxtehude.de/',
      'http://www.aal-kate.de/',
      'http://www.apotheke-buxtehude.de/index.html',
      'http://www.automarkt-wulsdorf.de/',
      'http://www.aventer.biz',
      'http://www.batavia-wedel.de',
      'http://www.bestenbostel.de/',
      'http://www.bobrink.de/',
      'http://www.bohn-segel.de/',
      'http://www.cafelohmann.de/',
      'http://www.chalet-bremerhaven.de/',
      'http://www.cuxcam.de/cams.php?cam=1',
      'http://www.der-brillenladen-cuxhaven.de',
      'http://www.elbstrand-resort.de/de/Restaurant',
      

Street names in german are set together in a quite complicated way. The oficial rules are listed in this reference:

https://www.duden.de/sprachwissen/rechtschreibregeln/strassennamen

In  nutshell, you can expect all kind of settings, like:
"Hauptstraße" (general case)
"Leipziger Straße" (Leipzig is a city, causing an expection to seperate "Straße") 
"Georg-Büchner-Straße" (Georg Büchner is a person, causing an expection to use "-")

To automatize a correction function to check for the applicable rule for a certain street name is very ambicious and would cetainly exceed the time related to this project.

Anyhow, we can check for more obvious mistakes, like abbreviations.
Abbreviations are in general not allowed in German Streetnames, of course with a few exceptions.
The openstreetmap abbreviation list can be found here:

https://wiki.openstreetmap.org/wiki/Name_finder:Abbreviations#Deutsch_-_German

One can see that abbreviations are either followed by a "." (excepection Bhf = Bahnhof = railway station) or written in capital latters. Multiple grouped capital letters are in german only used for this purpose to my knowledge.

In [9]:
#Search for post codes not matching the 5-digits format:
street_type_re = re.compile(r'[^\d]{5}', re.IGNORECASE)
pprint.pprint(dict(audit(OSMFILE))) # print the existing names

{}


As written above, 2 or more capital letters in a row a very unusual in german and might be caused by a mistype.

In [19]:
#update_street = audit(OSMFILE) 
# print the updated names
#for street_type, ways in update_street.items():
#    for name in ways:
#        better_name = update_name(name, streetmapping)
#print (name, "=>", better_name)

In [22]:
print (update_name("Arsch St.",streetmapping))

Arsch Street
