Project Summary:
In this project, I use data wrangling techniques, such as assessing the quality of the data for validity, accuracy, completeness, consistency and uniformity, to clean OpenStreetMap data. Then I convert the dataset from XML to CSV format, import the cleaned .csv files into database, conduct SQL queries to provide a statistical overview of the dataset. Finally, I give some additional suggestions for improving and analyzing the data.

Map Area:
Copenhagen,/Malmö


Split osm file into a smaller sample (SAMPLE_FILE). The original file (Copenhagen/Malmö) is 2GB.

In [8]:
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "copenhagen_denmark.osm"  # Replace this with your osm file
SAMPLE_FILE = "sample.osm"

k = 10 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

Parse data-set and identify different tags.

In [9]:
import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict

def count_tags(filename):
        all_tags=ET.iterparse(filename)
        nodes= defaultdict(int)
        for node in all_tags:
            nodes[node[1].tag] +=1
        return dict(nodes)           
    
def test():

    tags = count_tags(SAMPLE_FILE)
    pprint.pprint(tags)

if __name__ == "__main__":
    test()

{'member': 5045,
 'nd': 396835,
 'node': 366941,
 'osm': 1,
 'relation': 498,
 'tag': 698563,
 'way': 50188}


Unique users have contributed to the map in this particular area!

In [10]:
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if "uid" in element.attrib:
            users.add(element.get('uid'))

    return users

def test():

    users = process_map(SAMPLE_FILE)
    pprint.pprint(len(users))
#    assert len(users) == 6

if __name__ == "__main__":
    test()

1317


I still am not sure if i need the following code!

In [15]:
import re

street_types= defaultdict(set)
expected= ['Street', 'Avenue', 'Boulevard', 'Drive', 'highway']
street_type_re = re.compile(r'∖b∖S+∖.?$', re.IGNORECASE)
def audit_street_type(street_types, street_name):
    m = street_type_re.search(street_name)
    if m:
        street_type= m.group()
        if street_type not in expected:
            street_types[street_type].add[street_name]
    
def is_street_name(elem):
    return( elem.attrib['k'] == "highway")

def audit():
    for event, elem in ET.iterparse(SAMPLE_FILE, events=("start",)):
        if elem.tag== 'k':
            for tag in elem.iter('tag'):
                if is_street_name(tag):
                    audit_street_type (street_types, tag.attrib['v'])
    pprint.pprint(dict(street_types)) 

if __name__ == "__main__":
    audit()

{}


In [17]:
#select id, name, operator, railway, construction
#from planet_osm_point
#where name = 'Trafikplats Södra Sallerup'

SELECT tags.value, COUNT(*) as count 
FROM (SELECT * FROM nodes_tags 
      UNION ALL 
      SELECT * FROM ways_tags) tags
WHERE tags.key='postcode'
GROUP BY tags.value
ORDER BY count DESC;

SyntaxError: invalid syntax (<ipython-input-17-400804ca3a15>, line 5)

Checking ‘k’ value for each tag. creating a dictionary of the three different tags.
Regular expressions: lower is for valid only-lowercase-letters tags. 
lower_colon is for other valid tags with a colon in the value. 
problemchars is for tags with problematic characters. 

In [18]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        if re.match(lower, element.attrib['k']):
            keys["lower"] += 1
        elif re.match(lower_colon, element.attrib['k']):
            keys["lower_colon"] += 1
        elif re.search(problemchars, element.attrib['k']):
            keys["problemchars"] += 1
        else:
            keys['other'] += 1
    return keys


def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

sf_all_keys = process_map(SAMPLE_FILE)
print sf_all_keys

{'problemchars': 27, 'lower': 153243, 'other': 531, 'lower_colon': 544762}
