# In [10]:  (Jupyter notebook cell prompt left over from the export)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# Import libraries
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET
import cerberus

# Import Schema for validation
import schema

# OSM file path (input map extract read by process_map)
OSM_PATH = "city_of_london.osm"

# CSV path names (one output file per kind of row written by process_map)
NODES_PATH = "city_of_london_nodes.csv"
NODE_TAGS_PATH = "city_of_london_nodes_tags.csv"
WAYS_PATH = "city_of_london_ways.csv"
WAY_NODES_PATH = "city_of_london_ways_nodes.csv"
WAY_TAGS_PATH = "city_of_london_ways_tags.csv"

# Regular expressions
# LOWER_COLON: string starts with lowercase letters/underscores, a colon, then
# more lowercase letters/underscores (no end anchor, so longer keys also match).
LOWER_COLON = re.compile(r'^([a-z]|_)+:([a-z]|_)+')
# PROBLEMCHARS: a single character from the listed punctuation/whitespace set.
PROBLEMCHARS = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# street_type1_re: whitespace + "st" + optional "." at the very end (any case).
street_type1_re = re.compile(r'\sst\.?$', re.IGNORECASE)
# street_type2_re: a comma anywhere in the string.
street_type2_re = re.compile(r',', re.IGNORECASE)
# postal_code_re: any of the three postal code shapes below, anywhere in the string.
postal_code_re = re.compile(r'[A-Z]{1,2}[0-9]{1,2}\s[0-9][A-Z]{1,2}|[A-Z]{1,2}[0-9][A-Z]\s[0-9][A-Z]{1,2}|[A-Z]{1,2}[0-9]{1,2}')
# Shape 1: 1-2 letters, 1-2 digits, whitespace, digit, 1-2 letters.
postal_code_type_1_re = re.compile(r'([A-Z]{1,2}[0-9]{1,2}\s[0-9][A-Z]{1,2})')
# Shape 2: 1-2 letters, digit, letter, whitespace, digit, 1-2 letters.
postal_code_type_2_re = re.compile(r'([A-Z]{1,2}[0-9][A-Z]\s[0-9][A-Z]{1,2})')
# Shape 3: 1-2 letters followed by 1-2 digits (first half only).
postal_code_type_3_re = re.compile(r'[A-Z]{1,2}[0-9]{1,2}')

# Expected street values (substrings update_street looks for in comma-separated parts)
expected = ['Street','Avenue','Road','Lane','STREET','AVENUE','ROAD','LANE']

# Schema (taken from the local `schema` module; used as validate_element's default)
SCHEMA = schema.schema

# CSV fields (column order for each output file)
NODE_FIELDS = ['id', 'lat', 'lon', 'user', 'uid', 'version', 'changeset', 'timestamp']
NODE_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_FIELDS = ['id', 'user', 'uid', 'version', 'changeset', 'timestamp']
WAY_TAGS_FIELDS = ['id', 'key', 'value', 'type']
WAY_NODES_FIELDS = ['id', 'node_id', 'position']

# NOTE(review): not referenced anywhere in the visible code -- confirm whether
# this counter is still needed.
counterNone = {'nod':0 , 'nod_tags':0 , 'wy':0 , 'wy_tag':0, 'way_nod':0}
# Clean and shape node or way XML element to Python dict
def shape_element(element, node_attr_fields=NODE_FIELDS, way_attr_fields=WAY_FIELDS,
                  problem_chars=PROBLEMCHARS, default_tag_type='regular'):
    """Convert a ``node`` or ``way`` XML element into CSV-ready row dicts.

    Args:
        element: ElementTree element; only its ``tag``, ``attrib`` and its
            ``tag`` / ``nd`` descendants are read.
        node_attr_fields: kept for interface compatibility; currently unused.
        way_attr_fields: kept for interface compatibility; currently unused.
        problem_chars: compiled regex; a tag whose key contains a match
            anywhere is dropped.
        default_tag_type: ``type`` assigned to tags whose key has no
            ``prefix:`` part.

    Returns:
        ``{'node': attribs, 'node_tags': [...]}`` for a node,
        ``{'way': attribs, 'way_nodes': [...], 'way_tags': [...]}`` for a way,
        and ``None`` for any other element.

    Raises:
        KeyError: if a required attribute (anything except ``user``/``uid``)
            is missing.
        ValueError: if a required numeric attribute cannot be converted.
    """
    attrs = element.attrib

    if element.tag == 'node':
        node_attribs = {
            'id': int(attrs['id']),
            'lat': float(attrs['lat']),
            'lon': float(attrs['lon']),
            'version': attrs['version'],
            'changeset': int(attrs['changeset']),
            'timestamp': attrs['timestamp'],
        }
        node_attribs.update(_shape_user(attrs))
        tags = _shape_tags(element, node_attribs['id'], problem_chars,
                           default_tag_type)
        return {'node': node_attribs, 'node_tags': tags}

    if element.tag == 'way':
        way_attribs = {
            'id': int(attrs['id']),
            'version': attrs['version'],
            'changeset': int(attrs['changeset']),
            'timestamp': attrs['timestamp'],
        }
        # Same fallback as for nodes, so a way without user/uid does not
        # abort the whole conversion.
        way_attribs.update(_shape_user(attrs))
        way_id = way_attribs['id']
        tags = _shape_tags(element, way_id, problem_chars, default_tag_type)
        # 'position' is the zero-based order of each <nd> inside the way.
        way_nodes = [
            {'id': way_id, 'node_id': int(nd.attrib['ref']), 'position': position}
            for position, nd in enumerate(element.iter('nd'))
        ]
        return {'way': way_attribs, 'way_nodes': way_nodes, 'way_tags': tags}

    # Any other element type is not shaped.
    return None


def _shape_user(attrs):
    """Return the ``user``/``uid`` pair, or placeholders when unavailable.

    The placeholders ("unknown", -1) are used when either attribute is
    missing or ``uid`` is not an integer.
    """
    try:
        return {'user': attrs['user'], 'uid': int(attrs['uid'])}
    except (KeyError, ValueError):
        return {'user': "unknown", 'uid': -1}


def _shape_tags(element, element_id, problem_chars, default_tag_type):
    """Build one row dict per usable ``<tag>`` found under ``element``."""
    rows = []
    for tag in element.iter('tag'):
        raw_key = tag.attrib['k']
        raw_value = tag.attrib['v']

        # search(), not match(): a problematic character anywhere in the key
        # disqualifies the tag, not only one in the first position.
        if problem_chars.search(raw_key):
            continue

        if LOWER_COLON.match(raw_key):
            # Split on the first colon only, so "a:b:c" keeps key "b:c"
            # instead of silently losing ":c".
            tag_type, key = raw_key.split(':', 1)
        else:
            tag_type, key = default_tag_type, raw_key

        if key == "street":
            value = update_street(raw_value)
        elif key == "postal_code":
            value = update_postal_code(raw_value)
        else:
            value = raw_value

        rows.append({'id': element_id, 'key': key, 'value': value,
                     'type': tag_type})
    return rows
            
    
# ================================================== #
#               Helper Functions                     #
# ================================================== #

# Classify a postal code string and return its cleaned value
def update_postal_code(postal_code):
    """Return ``postal_code`` when it looks complete, else a marker string.

    Matching uses ``re.match``, so only the beginning of the string is
    checked against each pattern.

    Returns:
        The input unchanged when it starts with a full code (letters+digits,
        whitespace, digit+letters; or letters+digit+letter, whitespace,
        digit+letters); ``'Postal code incomplete'`` when only the leading
        letters+digits part is present; ``'Not a postal code'`` otherwise.
    """
    looks_complete = (postal_code_type_1_re.match(postal_code)
                      or postal_code_type_2_re.match(postal_code))
    if looks_complete:
        return postal_code

    # Only the first half of the code is present
    if postal_code_type_3_re.match(postal_code):
        return 'Postal code incomplete'

    return 'Not a postal code'
    
# Normalise a street value
def update_street(street_name):
    """Return a cleaned-up version of ``street_name``.

    The first rule that applies wins:

    1. Name ends in " st" / " st." (any case): the suffix becomes " Street".
    2. Name contains a comma: the last comma-separated part that contains one
       of the ``expected`` words is returned, stripped; if no part qualifies
       the name is returned unchanged.
    3. Name contains something shaped like a postal code: ``''`` is returned.
    4. Otherwise the name is returned unchanged.
    """
    # Rule 1: abbreviation at the end of the name
    if street_type1_re.search(street_name):
        return street_type1_re.sub(' Street', street_name)

    # Rule 2: full address separated by commas
    if street_type2_re.search(street_name):
        chosen = street_name
        for part in street_name.split(','):
            if any(word in part for word in expected):
                # Later qualifying parts override earlier ones
                chosen = part.strip()
        return chosen

    # Rule 3: a postal code was entered instead of a street
    if postal_code_re.search(street_name):
        return ''

    # Rule 4: nothing to clean
    return street_name
    
        
def get_element(osm_file, tags=('node', 'way', 'relation')):
    """Lazily yield every completely parsed element whose tag is in ``tags``.

    The document is parsed incrementally; after the consumer is done with a
    yielded element, the root element is cleared.
    """
    events = ET.iterparse(osm_file, events=('start', 'end'))
    # The first event is the start of the root element.
    root = next(events)[1]
    for kind, node in events:
        if kind != 'end' or node.tag not in tags:
            continue
        yield node
        root.clear()


def validate_element(element, validator, schema=SCHEMA):
    """Raise ``Exception`` if ``element`` does not match ``schema``.

    Args:
        element: the shaped dict to check.
        validator: object exposing ``validate(document, schema)`` and an
            ``errors`` mapping (process_map passes a ``cerberus.Validator``).
        schema: schema handed to ``validator.validate``.

    Raises:
        Exception: with a message naming the first reported field and its
            pretty-printed errors.
    """
    if validator.validate(element, schema) is not True:
        # items() works on both Python 2 and 3, unlike iteritems().
        field, errors = next(iter(validator.errors.items()))
        message_string = "\nElement of type '{0}' has the following errors:\n{1}"
        error_string = pprint.pformat(errors)

        raise Exception(message_string.format(field, error_string))


class UnicodeDictWriter(csv.DictWriter, object):
    """Extend csv.DictWriter to handle Unicode input.

    On Python 2 the csv module writes byte strings, so ``unicode`` values are
    encoded as UTF-8 first. On Python 3 the csv module accepts text directly
    and rows are passed through unchanged.
    """

    def writerow(self, row):
        """Write one row dict, encoding text values when running on Python 2."""
        if str is bytes:
            # Only true on Python 2, where the name `unicode` exists.
            row = {
                k: (v.encode('utf-8') if isinstance(v, unicode) else v)  # noqa: F821
                for k, v in row.items()
            }
        super(UnicodeDictWriter, self).writerow(row)

    def writerows(self, rows):
        """Write every row through ``writerow`` so each one gets encoded."""
        for row in rows:
            self.writerow(row)


# ================================================== #
#               Main Function                        #
# ================================================== #
def process_map(file_in, validate):
    """Stream the OSM file and write node/way rows to the five CSV files.

    Args:
        file_in: path (or file object) handed to ``get_element``.
        validate: when exactly ``True``, every shaped element is checked with
            ``validate_element`` before it is written.
    """
    with codecs.open(NODES_PATH, 'w') as nodes_file, \
            codecs.open(NODE_TAGS_PATH, 'w') as nodes_tags_file, \
            codecs.open(WAYS_PATH, 'w') as ways_file, \
            codecs.open(WAY_NODES_PATH, 'w') as way_nodes_file, \
            codecs.open(WAY_TAGS_PATH, 'w') as way_tags_file:

        nodes_writer = UnicodeDictWriter(nodes_file, NODE_FIELDS)
        node_tags_writer = UnicodeDictWriter(nodes_tags_file, NODE_TAGS_FIELDS)
        ways_writer = UnicodeDictWriter(ways_file, WAY_FIELDS)
        way_nodes_writer = UnicodeDictWriter(way_nodes_file, WAY_NODES_FIELDS)
        way_tags_writer = UnicodeDictWriter(way_tags_file, WAY_TAGS_FIELDS)

        # Every CSV starts with its header line.
        for writer in (nodes_writer, node_tags_writer, ways_writer,
                       way_nodes_writer, way_tags_writer):
            writer.writeheader()

        validator = cerberus.Validator()

        for element in get_element(file_in, tags=('node', 'way')):
            shaped = shape_element(element)
            if not shaped:
                continue

            if validate is True:
                validate_element(shaped, validator)

            kind = element.tag
            if kind == 'node':
                nodes_writer.writerow(shaped['node'])
                node_tags_writer.writerows(shaped['node_tags'])
            elif kind == 'way':
                ways_writer.writerow(shaped['way'])
                way_nodes_writer.writerows(shaped['way_nodes'])
                way_tags_writer.writerows(shaped['way_tags'])
            


# Script entry point: convert OSM_PATH into the CSV files with validation off.
if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    process_map(OSM_PATH, validate=False)