In [1]:
import xml.etree.ElementTree as ET  # we can use cElementTree or lxml if too slow
import pprint
from collections import defaultdict
import re
import csv
import codecs
import cerberus
import schema
import sqlite3

OSM_FILE = "stockholm_sweden.osm"  
SAMPLE_FILE = "sample.osm"

k = 10 # Parameter: take every k-th top level element! I started with k=10

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

Parse data-set and identify different tags, using iterative parsing.

In [2]:
def count_all_tags(samplefile):
        all_tags=ET.iterparse(samplefile)
        nodes= defaultdict(int)
        for each_node in all_tags:
            nodes[each_node[1].tag] +=1
        return dict(nodes)           
    
def different_tags():
    
    tags = count_all_tags(SAMPLE_FILE)
    pprint.pprint(tags)

if __name__ == "__main__":
    different_tags()

{'member': 19825,
 'nd': 745157,
 'node': 610544,
 'osm': 1,
 'relation': 1012,
 'tag': 216022,
 'way': 69782}


Number of Unique users contributed to the map in the Stockholm area:

In [3]:
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if "uid" in element.attrib:
            users.add(element.get('uid'))

    return users

def test():

    users = process_map(SAMPLE_FILE)
    pprint.pprint(len(users))  #print number of Unique users

if __name__ == "__main__":
    test()

1883


# Auditing 
One of the usual problems in openstreetmap dataset is from the street name abbreviation which is also very culture oriented. However, only by looking at the osm file I could'nt identify an particular problem. I used a street_type auditing code to find out how well collected the data is.
Steps i took:
1-Building the regular expression to match the last element in the string, where usually the street type is based. 
2-Then based on the street abbreviation, create a mapping that finally needs to be cleaned up.


I tried all sort of changes in my code, however the result looks pretty good(Swedes are really good at documentation afterall ;). It is worth noting that Swedish wording is differnet in many ways. in other words, for a street-name+street_type it usually only one world 'namestreet'; for example 'axfordstreet' as just one word. therefore, if the code recognises the last word not detected as expected will return it as an unvalid name to be updated. In this case a majority of valid and correct street names/types will be printed out. To avoid the confusion and to avoid printing 2G worth of streettypes, I wrote a code to update lower case street types.

In [4]:
street_types = defaultdict(set)
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = [ "Väg", "Gatan", "Alle","Allé", "väg","torg","gatan","alle", "Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons", "Cove", "Alley", "Park", "Way", "Walk" "Circle", "Highway",
            "Plaza", "Path", "Center", "Mission", "Kyrka", "kyrka"]

mapping = { "väg": "Väg" ,
           "torg": "Torg",
            "gata":"Gatan",
            "gatan": "Gatan" ,
            "allé" :"Alle",
           "boulevard":"Boulevard",
           
            }

def audit_street_type(street_types, street_name):

    pattern = street_type_re.search(street_name) #finds the pattern of last words
    if pattern:
        street_type = pattern.group() #returns the last word

        if street_type in expected:  ## here is my own interpretation of expected (for english speaking countries i would use #"if street_type in expected: )#

            street_types[street_type].add(street_name)
            print street_types

def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    for event, elem in ET.iterparse(osmfile, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types

if __name__ == "__main__":
    
    audit(SAMPLE_FILE)
#    pprint.pprint(audit(SAMPLE_FILE) )

#    for name, street in street_types.items():

#        print("/nSet:", name, "Entries:"),
#        for item in street:

#            print (item)




defaultdict(<type 'set'>, {'torg': set(['Valla torg'])})
defaultdict(<type 'set'>, {u'gatan': set([u'Gr\xf6na gatan']), 'torg': set(['Valla torg'])})
defaultdict(<type 'set'>, {u'gatan': set([u'Gr\xf6na gatan']), 'torg': set(['Valla torg'])})
defaultdict(<type 'set'>, {u'gatan': set([u'Gr\xf6na gatan']), 'torg': set(['Valla torg'])})
defaultdict(<type 'set'>, {u'gatan': set([u'Gr\xf6na gatan']), 'torg': set(['Valla torg'])})
defaultdict(<type 'set'>, {u'gatan': set([u'Gr\xf6na gatan']), 'torg': set(['Valla torg'])})
defaultdict(<type 'set'>, {u'gatan': set([u'Gr\xf6na gatan']), 'torg': set(['Valla torg', 'Kista torg'])})
defaultdict(<type 'set'>, {u'gatan': set([u'Gr\xf6na gatan']), 'Gatan': set(['Tysta Gatan']), 'torg': set(['Valla torg', 'Kista torg'])})
defaultdict(<type 'set'>, {u'gatan': set([u'Gr\xf6na gatan']), 'Gatan': set(['Tysta Gatan']), 'torg': set(['Valla torg', 'Kista torg'])})
defaultdict(<type 'set'>, {u'gatan': set([u'Gr\xf6na gatan']), 'Gatan': set(['Tysta Gatan']), '

In [5]:
def update_name(name, mapping, regex): #mapping the street type and updating them
    nm = regex.search(name)
    if nm:
        st_type = nm.group()
        if st_type in mapping:
            name = re.sub(regex, mapping[st_type], name)
    return name
for street_type, ways in street_types.iteritems():
    for name in ways:
        better_name = update_name(name, mapping, street_type_re)
        print name, "is updated to:", better_name

Lugna gatan is updated to: Lugna Gatan
Gröna gatan is updated to: Gröna Gatan
Finska gatan is updated to: Finska Gatan
Östra Ågatan is updated to: Östra ÅGatan
Västra Ågatan is updated to: Västra ÅGatan
Breda Gatan is updated to: Breda Gatan
Långa Gatan is updated to: Långa Gatan
Tysta Gatan is updated to: Tysta Gatan
Gustaf de Lavals torg is updated to: Gustaf de Lavals Torg
Valla torg is updated to: Valla Torg
Kista torg is updated to: Kista Torg
Gustav III:s Boulevard is updated to: Gustav III:s Boulevard


Checking ‘k’ value for each tag. creating a dictionary of the different tags. 

In [6]:
#Regular expressions:
lower = re.compile(r'^([a-z]|_)*$') #lower is for valid only-lowercase-letter tags.
lower_and_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$') #lower_and_colon is for valid tags with a colon in the value. 
troublemaker = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]') #troublemaker is for tags with odd characters.


def key_type(element, keys):
    if element.tag == "tag":
        if re.match(lower, element.attrib['k']):
            keys["lower"] += 1
        elif re.match(lower_and_colon, element.attrib['k']):
            keys["lower_and_colon"] += 1
        elif re.search(troublemaker, element.attrib['k']):
            keys["troublemaker"] += 1
        else:
            keys['other'] += 1
    return keys


def process_map(filename):
    keys = {"lower": 0, "lower_and_colon": 0, "troublemaker": 0, "other": 0}
    for  event , elem in ET.iterparse(filename):
        keys = key_type(elem, keys)

    return keys

stockholm_all_tags = process_map(SAMPLE_FILE)
pprint.pprint (stockholm_all_tags)

{'lower': 148584, 'lower_and_colon': 65754, 'other': 1683, 'troublemaker': 1}


Auditing postal codes. The first two digit of postal codes in stockholm is 72.

In [7]:
def is_this_zipcode(elem):
    return (elem.attrib['k'] == "addr:postcode")

def audit_zipcode(invalid_zipcodes, zipcode):
    two_digits_code = zipcode[0:2]
    
    if two_digits_code != 72 or not two_digits_code.isdigit():
        invalid_zipcodes[two_digits_code].add(zipcode)


def audit_main(osmfile):
    osm_file = open(osmfile, "r")
    invalid_zipcodes = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):
        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_this_zipcode(tag):
                    audit_zipcode(invalid_zipcodes,tag.attrib['v'])

    return invalid_zipcodes

stokholm_zipcode = audit_main(SAMPLE_FILE)
#pprint.pprint(dict(stokholm_zipcode))


Problems Encountered: Inconsistent postal codes! 
Important note: Although I indicated invalid zipcode in a broad set, many of the above zipcodes are valid and have no promlebs. In Stockholm area zip codes all begin with “72” or “41”, however some of zip codes were outside this region.
In the following code, I modify the function to clean zip code, change xxx xx-xxxx format into 5 digits format, to remove the blank in the middle and create a consistant zipcode. 

In [8]:
def update_zipcode(zipcode):
    zipcode= zipcode.replace(" ","")
    zipcodeChar = re.findall('[a-zA-Z]*', zipcode)
    if zipcodeChar:
        zipcodeChar = zipcodeChar[0]
        zipcodeChar.strip()  #removes all whitespace at the start and end,including spaces,tabs,newlines and carriage returns
        
        return ((re.findall(r'\d+', zipcode))[0]) 
    
for street_type, ways in stokholm_zipcode.iteritems():
    for zipcodez in ways:
        update_name = update_zipcode(zipcodez)           
#        print zipcodez, "updated to:", update_name        

After auditing is completed it is the time to create tables of data to be inserted into a sql database. 1)parse the data 2)transforme data from document format to tabular format 3)create csv files for each table

In [None]:
OSM_PATH = "sample.osm"

NODES_PATH = "nodes.csv"
NODE_TAGS_PATH = "nodes_tags.csv"
WAYS_PATH = "ways.csv"
WAY_NODES_PATH = "ways_nodes.csv"
WAY_TAGS_PATH = "ways_tags.csv"

LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
                     
troublemaker = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

SCHEMA = schema.schema

NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']


def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=troublemaker, default_tag_type='regular'):

    node_attribs = {}
    way_attribs = {}
    way_nodes = []
    tags = [] 

    if element.tag == 'node':
        for attrib, value in element.attrib.iteritems():
            if attrib in node_attr_fields:
                node_attribs[attrib] = value
        
        for secondary in element.iter():
            if secondary.tag == 'tag':
                if problem_chars.match(secondary.attrib['k']) is not None:
                    continue
                else:
                    new = new_tagDict(element, secondary, default_tag_type)
                    tags.append(new)
        return {'node': node_attribs, 'node_tags': tags}
    elif element.tag == 'way':
        for attrib, value in element.attrib.iteritems():
            if attrib in way_attr_fields:
                way_attribs[attrib] = value
                
        counter = 0
        for secondary in element.iter():
            if secondary.tag == 'tag':
                if problem_chars.match(secondary.attrib['k']) is not None:
                    continue
                else:
                    new = new_tagDict(element, secondary, default_tag_type)
                    tags.append(new)
            if secondary.tag == 'nd':
                something_new = {}
                something_new['id'] = element.attrib['id']
                something_new['node_id'] = secondary.attrib['ref']
                something_new['position'] = counter
                counter += 1
                way_nodes.append(something_new)
        
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}


def new_tagDict(element, secondary, default_tag_type): #Load a new tag dict to go into the list of dicts for way_tags, node_tags
    
    new = {}
    new['id'] = element.attrib['id']
    if ":" not in secondary.attrib['k']:
        new['key'] = secondary.attrib['k']
        new['type'] = default_tag_type
    else:
        post_colon = secondary.attrib['k'].index(":") + 1
        new['key'] = secondary.attrib['k'][post_colon:]
        new['type'] = secondary.attrib['k'][:post_colon - 1]
    new['value'] = secondary.attrib['v']
    print "!23123"
    print secondary.attrib['v']
    print"!2312"
    return new



def get_element(osm_file, tags=('node', 'way', 'relation')): # if it is the right type of tag then Yield the element
    

    context = ET.iterparse(osm_file, events=('start', 'end'))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()

class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input"""

    def writerow(self, row):
        super(UnicodeDictWriter, self).writerow({
            k: (v.encode('utf-8') if isinstance(v, unicode) else v) for k, v in row.items()
        })

    def writerows(self, rows):
        for row in rows:
            self.writerow(row)
            
            
def validate_element(element, validator, schema=SCHEMA): #Raise ValidationError if element does not match schema"""
    
    if validator.validate(element, schema) is not True:
        field, errors = next(validator.errors.iteritems())
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)

        raise Exception(message_string.format(field, error_string))



def processing_map(file_in, validate):

    with codecs.open(NODES_PATH, 'w') as nodes_file, codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
         codecs.open(WAYS_PATH, 'w') as ways_file, codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
         codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        ways_writer.writeheader()
        way_nodes_writer.writeheader()
        way_tags_writer.writeheader()
        nodes_writer.writeheader()
        node_tags_writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            element1 = shape_element(element)
            if element1:
                if validate is True:
                    validate_element(element1, validator)

                if element.tag == 'node':
                    nodes_writer.writerow(element1['node'])
                    node_tags_writer.writerows(element1['node_tags'])
                elif element.tag == 'way':
                    ways_writer.writerow(element1['way'])
                    way_nodes_writer.writerows(element1['way_nodes'])
                    way_tags_writer.writerows(element1['way_tags'])


if __name__ == '__main__':
    processing_map(OSM_PATH, validate=False)

!23123
traffic_signals
!2312
!23123
180
!2312
!23123
Trafikplats Danderyds k:a
!2312
!23123
motorway_junction
!2312
!23123
traffic_signals
!2312
!23123
Västberga ind omr
!2312
!23123
traffic_signals
!2312
!23123
crossing
!2312
!23123
traffic_signals
!2312
!23123
turning_circle
!2312
!23123
turning_circle
!2312
!23123
crossing
!2312
!23123
crossing
!2312
!23123
crossing
!2312
!23123
zebra
!2312
!23123
traffic_signals
!2312
!23123
traffic_signals
!2312
!23123
yes
!2312
!23123
Mapillary
!2312
!23123
bus_stop
!2312
!23123
stop_position
!2312
!23123
Hovslagarkurvan
!2312
!23123
143
!2312
!23123
Trafikplats Saltskog
!2312
!23123
motorway_junction
!2312
!23123
E 20
!2312
!23123
crossing
!2312
!23123
turning_circle
!2312
!23123
yes
!2312
!23123
Brunnsholm vägskäl
!2312
!23123
bus_stop
!2312
!23123
stop_position
!2312
!23123
132
!2312
!23123
Trafikplats Kungsladugården
!2312
!23123
motorway_junction
!2312
!23123
traffic_signals
!2312
!23123
traffic_signals
!2312
!23123
yes
!2312
!23123
bus_stop

In [11]:
sqlite_file = 'mydb.db' # name of the sqlite database file to be created

# Connect to the database
conn = sqlite3.connect(sqlite_file)
cur = conn.cursor()
cur.execute('DROP TABLE IF EXISTS ways')
conn.commit()
cur.execute("CREATE TABLE ways (id,user,uid,version,changeset,timestamp);") 
conn.commit()
with open('ways.csv','rb') as fin:
    delimiter = csv.DictReader(fin) # comma is default delimiter
    to_db = [(i['id'].decode("utf_8"), i['user'].decode("utf_8"), i['uid'].decode("utf_8"), i['version'].decode("utf_8"), i['changeset'].decode("utf_8"),i['timestamp'].decode("utf_8")) for i in delimiter]

cur.executemany("INSERT INTO ways (id,user,uid,version,changeset,timestamp) VALUES (?, ?,?,?, ?,?);", to_db)
conn.commit()

cur.execute("SELECT * FROM ways")
all_rows=cur.fetchall()
#print('1):')
#print (all_rows)

In [12]:
cur.execute('DROP TABLE IF EXISTS nodes')
conn.commit()
cur.execute("CREATE TABLE nodes (id,lat,lon,user,uid,version,changeset,timestamp);") 
conn.commit()
with open('nodes.csv','rb') as fin: 
    dr = csv.DictReader(fin) # comma is default delimiter
    to_db = [(i['id'].decode("utf_8"), i['lat'].decode("utf_8"),i['lon'].decode("utf_8"),i['user'].decode("utf_8"),i['uid'].decode("utf_8"),i['version'].decode("utf_8"),i['changeset'].decode("utf_8"),i['timestamp'].decode("utf_8")) for i in dr]

cur.executemany("INSERT INTO nodes (id,lat,lon,user,uid,version,changeset,timestamp) VALUES (?, ?,?,?,?,?,?,?);", to_db)
conn.commit()

cur.execute("SELECT * FROM nodes")
all_rows=cur.fetchall()
#print('1):')
#pprint.pprint (all_rows)

In [13]:
cur.execute('DROP TABLE IF EXISTS nodes_tags')
conn.commit()
cur.execute("CREATE TABLE nodes_tags (id INTEGER,key TEXT,value TEXT,type TEXT)") 
conn.commit()
with open('nodes_tags.csv','rb') as fin: 
    dr = csv.DictReader(fin)
    to_db = [(i['id'].decode("utf_8"), i['key'].decode("utf_8"),i['value'].decode("utf_8"),i['type'].decode("utf_8")) for i in dr]

cur.executemany("INSERT INTO nodes_tags (id,key,value,type) VALUES (?, ?,?,?);", to_db)
conn.commit()

In [14]:
cur.execute('DROP TABLE IF EXISTS ways_tags')
conn.commit()
cur.execute("CREATE TABLE ways_tags (id INTEGER,key TEXT,value TEXT,type TEXT)") 
conn.commit()
with open('ways_tags.csv','rb') as fin: # `with` statement available in 2.5+
    # csv.DictReader uses first line in file for column headings by default
    dr = csv.DictReader(fin) # comma is default delimiter
    to_db = [(i['id'].decode("utf_8"), i['key'].decode("utf_8"),i['value'].decode("utf_8"),i['type'].decode("utf_8")) for i in dr]

cur.executemany("INSERT INTO ways_tags (id,key,value,type) VALUES (?, ?,?,?);", to_db)
conn.commit()

In [15]:
cur.execute('DROP TABLE IF EXISTS ways_nodes')
conn.commit()
cur.execute("CREATE TABLE ways_nodes (id , node_id, position)") 
conn.commit()
with open('ways_nodes.csv','rb') as fin: # `__with` statement available in 2.5+
    # csv.DictReader uses first line in file for column headings by default
    dr = csv.DictReader(fin) # comma is default delimiter
    to_db = [(i['id'].decode("utf_8"), i['node_id'].decode("utf_8"),i['position'].decode("utf_8")) for i in dr]

cur.executemany("INSERT INTO ways_nodes (id,node_id,position) VALUES (?, ?,?);", to_db)
conn.commit()

In [16]:
cur.execute("SELECT * FROM nodes_tags WHERE id IN (SELECT DISTINCT(id) FROM nodes_tags WHERE key='road_ref' AND value='E 20')")
road_ref=cur.fetchall()
print('1):')
pprint.pprint (road_ref)

1):
[(311955, u'ref', u'143', u'regular'),
 (311955, u'name', u'Trafikplats Saltskog', u'regular'),
 (311955, u'highway', u'motorway_junction', u'regular'),
 (311955, u'road_ref', u'E 20', u'exit')]


Additional ideas:
List of top 20 Amenities in Stockholm

In [17]:
cur.execute("SELECT value, COUNT(*) as num \
            FROM nodes_tags \
           WHERE key='amenity' \
           GROUP BY value \
           ORDER BY num DESC \
           LIMIT 20;")
amenity= cur.fetchall()
print('20 top amenities in stockholm:')
pprint.pprint (amenity)

20 top amenities in stockholm:
[(u'bench', 227),
 (u'restaurant', 209),
 (u'fast_food', 114),
 (u'cafe', 92),
 (u'post_box', 92),
 (u'parking', 87),
 (u'recycling', 71),
 (u'waste_basket', 67),
 (u'shelter', 60),
 (u'bicycle_parking', 50),
 (u'toilets', 37),
 (u'fuel', 36),
 (u'pharmacy', 28),
 (u'bank', 26),
 (u'school', 25),
 (u'atm', 22),
 (u'pub', 22),
 (u'ferry_terminal', 21),
 (u'kindergarten', 20),
 (u'bbq', 14)]


Overview of the data
This section contains basic statistics about the dataset, the SQL queries used to gather them, and some additional ideas about the data in context.
File Size:

Stockholm.osm: 1.29 GB
nodes_csv: 49.5 MB
nodes_tags.csv: 163.4 KB
ways_csv: 4.1 MB
ways_nodes.csv: 17.9 MB
ways_tags.csv: 5.3 MB

Number of Nodes
Number of Ways
Number of unique users
Top 5 contrinuters 

In [18]:
cur.execute("SELECT COUNT(*) FROM nodes")
nodes= cur.fetchall()
print('number of nodes:')
print nodes[0][0]

number of nodes:
610544


In [19]:
cur.execute("SELECT COUNT(*) FROM ways")
ways= cur.fetchall()
print('number of ways:')
print ways[0][0]

number of ways:
69782


In [20]:
cur.execute("SELECT COUNT(DISTINCT(e.uid)) FROM (SELECT uid FROM nodes UNION SELECT uid FROM ways) e")
unique= cur.fetchall()
print( 'unique users:')
print unique[0][0]

unique users:
1880


In [21]:
cur.execute("SELECT e.user, COUNT(*) as num FROM (SELECT user FROM nodes UNION ALL SELECT user FROM ways) e GROUP BY e.user ORDER BY num DESC LIMIT 5;")
unique= cur.fetchall()
print( '5 top contributors:')
print unique
conn.close()

5 top contributors:
[(u'MichaelCollinson', 68249), (u'Fringillus', 65250), (u'emj', 42343), (u'huven', 39093), (u'jordgubbe', 26045)]


Conclusion
From the process of auditing it is notable that the dataset is fairly well-cleaned even though there are some minor error such as inconsistent postal codes. Since there are thousands of contributing human users, so it is inevitable to have many human input error and because OpenStreetMaps is an open source project, there’re still a lot of areas either missed or outdated. This is applicable alos for my favourite city Stockholm. So I hope OpenStreetMaps can obtain these data from other open data sources. As an example I am very inspired by a very cool startup based in Malmö/Sweden called 'Mapillary'(a service for sharing geotagged photos).