Project Summary:
In this project, I use data wrangling techniques, such as assessing the quality of the data for validity, accuracy, completeness, consistency and uniformity, to clean OpenStreetMap data. Then I convert the dataset from XML to CSV format, import the cleaned .csv files into database, conduct SQL queries to provide a statistical overview of the dataset. Finally, I give some additional suggestions for improving and analyzing the data.

Map Area:
Copenhagen,/Malmö


Split osm file into a smaller sample (SAMPLE_FILE). The original file (Copenhagen/Malmö) is 2GB.
Challenges: activating python2 via source activate py2 to be able to run the following code.
I stared with k=30

In [1]:
import xml.etree.ElementTree as ET  # Use cElementTree or lxml if too slow

OSM_FILE = "copenhagen_denmark.osm"  # Replace this with your osm file
SAMPLE_FILE = "sample.osm"

k = 30 # Parameter: take every k-th top level element

def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Yield element if it is the right type of tag

    Reference:
    http://stackoverflow.com/questions/3095434/inserting-newlines-in-xml-file-generated-via-xml-etree-elementtree-in-python
    """
    context = iter(ET.iterparse(osm_file, events=('start', 'end')))
    _, root = next(context)
    for event, elem in context:
        if event == 'end' and elem.tag in tags:
            yield elem
            root.clear()


with open(SAMPLE_FILE, 'wb') as output:
    output.write('<?xml version="1.0" encoding="UTF-8"?>\n')
    output.write('<osm>\n  ')

    # Write every kth top level element
    for i, element in enumerate(get_element(OSM_FILE)):
        if i % k == 0:
            output.write(ET.tostring(element, encoding='utf-8'))

    output.write('</osm>')

Parse data-set and identify different tags, using iterative parsing.

In [3]:

import xml.etree.cElementTree as ET
import pprint
from collections import defaultdict

def count_tags(filename):
        all_tags=ET.iterparse(filename)
        nodes= defaultdict(int)
        for node in all_tags:
            nodes[node[1].tag] +=1
        return dict(nodes)           
    
def test():

    tags = count_tags(SAMPLE_FILE)
    pprint.pprint(tags)

if __name__ == "__main__":
    test()

{'member': 2017,
 'nd': 130507,
 'node': 122314,
 'osm': 1,
 'relation': 166,
 'tag': 232601,
 'way': 16729}


Unique users contributed to the map in this particular area:

In [4]:
def process_map(filename):
    users = set()
    for _, element in ET.iterparse(filename):
        if "uid" in element.attrib:
            users.add(element.get('uid'))

    return users

def test():

    users = process_map(SAMPLE_FILE)
    pprint.pprint(len(users))
#    assert len(users) == 6

if __name__ == "__main__":
    test()

985


# auditing 
One of the usual problems in openstreetmap dataset is from the street name abbreviation. However, I have not found any problems by only looking at the osm file. Here I will try to find something via my code.
1-Building the regular expression to match the last element in the string, where usually the street type is based. 
2-Then based on the street abbreviation, create a mapping that need to be cleaned.

In [25]:
import re

street_types= defaultdict(set)
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road", 
            "Trail", "Parkway", "Commons", "Cove", "Alley", "Park", "Way", "Walk" "Circle", "Highway", 
            "Plaza", "Path", "Center", "Mission"]

mapping = { "Ave": "Avenue",
            "Ave.": "Avenue",
            "avenue": "Avenue",
            "ave": "Avenue",
            "Blvd": "Boulevard",
            "Blvd.": "Boulevard",
            "Blvd,": "Boulevard",
            "Boulavard": "Boulevard",
            "Boulvard": "Boulevard",
            "Ct": "Court",
            "Dr": "Drive",
            "Dr.": "Drive",
            "E": "East",
            "Hwy": "Highway",
            "Ln": "Lane",
            "Ln.": "Lane",
            "Pl": "Place",
            "Plz": "Plaza",
            "Rd": "Road",
            "Rd.": "Road",
            "St": "Street",
            "St.": "Street",
            "st": "Street",
            "street": "Street",
            "square": "Square",
            "parkway": "Parkway"
            }


def audit_street_type(street_types, street_name):
    
    print street_name #only to see street names
   
    m = street_type_re.search(street_name) #finds the pattern 
    if m:
        street_type = m.group() #returns the last word
        if street_type not in expected: 
            street_types[street_type].add(street_name)

            
def is_street_name(elem):
    return (elem.attrib['k'] == "addr:street")


def audit(osmfile):
    osm_file = open(osmfile, "r")
    street_types = defaultdict(set)
    for event, elem in ET.iterparse(osm_file, events=("start",)):

        if elem.tag == "node" or elem.tag == "way":
            for tag in elem.iter("tag"):
                if is_street_name(tag):
                    audit_street_type(street_types, tag.attrib['v'])
    osm_file.close()
    return street_types


def update_name(name, mapping, regex):
    m = regex.search(name)
    if m:
        st_type = m.group()
        if st_type in mapping:
            name = re.sub(regex, mapping[st_type], name)
    return name
pprint.pprint(dict(street_types)) 
   

if __name__ == "__main__":
    audit(SAMPLE_FILE)

{}
Gasværksvej
Nøjsomhedsvej
Vallensbæk Stationstorv
Kastrupvej
Godthåbsvej
Vanløse Torv
Abel Cathrines Gade
Abildgaardsgade
Abildgaardsgade
Absalonsgade
Adelgade
Adelgade
Adriansvej
Agerlandsvej
Agerlandsvej
Aggersvoldvej
Aggervej
Ahlefeldtsgade
Ahornsgade
Ahrenkildes Allé
Aldersrogade
Sigynsgade
Sigynsgade
Allersgade
Alstedvej
Amager Boulevard
Amagerbrogade
Amagerbrogade
Amagerbrogade
Amagerbrogade
Amagerbrogade
Amagerbrogade
Amagerbrogade
Amagerbrogade
Amagerbrogade
Amagerfælledvej
Amagerfælledvej
Amagerfælledvej
Amagergade
Amager Strandvej
Amager Strandvej
Amagertorv
Amaliegade
Amaliegade
Amaliegade
Amalie Skrams Allé
Amsterdamvej
Amsterdamvej
H.C. Andersens Boulevard
Andreas Bjørns Gade
Annebergvej
Ansgars Allé
Apollovej
Applebys Plads
Arabiensvej
Arendalsgade
Arildsgård
Arkaderne
Arkonagade
Armeniensvej
Arnestedet
Arnesvej
Arresøgade
Artillerivej
Artillerivej
Asminderødgade
Astersvej
Astrupvej
Astrupvej
Asylgade
Augustagade
Australiensvej
Axeltorv
Azaleagangen
Azaleagangen
Backer

In [5]:
#select id, name, operator, railway, construction
#from planet_osm_point
#where name = 'Trafikplats Södra Sallerup'


Checking ‘k’ value for each tag. creating a dictionary of the different tags.
Regular expressions: lower is for valid only-lowercase-letter tags. 
lower_colon is for other valid tags with a colon in the value. 
problemchars is for tags with problematic characters. 

In [6]:
lower = re.compile(r'^([a-z]|_)*$')
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')


def key_type(element, keys):
    if element.tag == "tag":
        if re.match(lower, element.attrib['k']):
            keys["lower"] += 1
        elif re.match(lower_colon, element.attrib['k']):
            keys["lower_colon"] += 1
        elif re.search(problemchars, element.attrib['k']):
            keys["problemchars"] += 1
        else:
            keys['other'] += 1
    return keys


def process_map(filename):
    keys = {"lower": 0, "lower_colon": 0, "problemchars": 0, "other": 0}
    for _, element in ET.iterparse(filename):
        keys = key_type(element, keys)

    return keys

sf_all_keys = process_map(SAMPLE_FILE)
print sf_all_keys

{'problemchars': 27, 'lower': 153243, 'other': 531, 'lower_colon': 544762}
