In [18]:
import csv
import codecs
import pprint
import re
import xml.etree.cElementTree as ET
from collections import defaultdict

# OSM fil path
OSM_PATH = "city_of_london.osm"

# Dictionaries to store data types
node_field_types = defaultdict(set)
node_tag_field_types = defaultdict(set)
way_field_types = defaultdict(set)
way_tag_field_types = defaultdict(set)
way_node_field_types = defaultdict(set)
# Data structure used to store wrong coordinates
coord_out_area = {}
# Data structure to store postal codes by pattern
node_postal_code_types = defaultdict(set)
way_postal_code_types = defaultdict(set)
# Data structure to store street types
node_street_types = defaultdict(set)
way_street_types = defaultdict(set)

subway_stations = []

# Counter for postal code patterns
node_counter_postal_code_types = {'AADD DAA': 0, 'AADA DAA': 0, 'AADD': 0, 'unkown': 0}
way_counter_postal_code_types = {'AADD DAA': 0, 'AADA DAA': 0, 'AADD': 0, 'unkown': 0}
# Counter for street patterns
node_street_type_counter = {'abbreviation': 0, 'commas': 0, 'postal_code': 0}
way_street_type_counter = {'abbreviation': 0, 'commas': 0, 'postal_code': 0}



# Regular Expressions for postal codes
# AADD DDA pattern
postal_code_type_1_re = re.compile(r'([A-Z]{1,2}[0-9]{1,2}\s[0-9][A-Z]{1,2})')
# AADA DAA pattern
postal_code_type_2_re = re.compile(r'([A-Z]{1,2}[0-9][A-Z]\s[0-9][A-Z]{1,2})')
# AADD
postal_code_type_3_re = re.compile(r'[A-Z]{1,2}[0-9]{1,2}')

# Regular expressions for street patterns
# St, st, St.
street_type1_re = re.compile(r'\sst\.?$', re.IGNORECASE)
street_type2_re = re.compile(r',', re.IGNORECASE)



# A function to print the output
def print_nice(dictionary):
    for entry in dictionary:
        print entry + ": " + str(dictionary[entry])
# A function to detect whether the values is an integer or a float
def is_number(v):
    try:
        int(v)
        return True
    except ValueError:
        try:
            float(v)
            return True
        except ValueError:
            return False

# A function to audit the type of the attributes of an element
def audit_attribute_type(types_dictionary, attributes):
    
    for attribute in attributes:
        value = attributes[attribute]
        if value == "NULL" or value == "" or value == None or value == type(None):
            types_dictionary[attribute].add(type(None)) 
        elif value.startswith("{") and value.endswith("}"):
            types_dictionary[attribute].add(type([]))
        elif is_number(value):
            try:
                int(value)
                types_dictionary[attribute].add(type(1))
            except ValueError:
                float(value)
                types_dictionary[attribute].add(type(1.1))
        else:
            types_dictionary[attribute].add(type("a"))
            
# A function to audit coordinates
def audit_coordinates(coord_out_area, element_attributes):
    node_id = element_attributes['id']
    lati = float(element_attributes['lat'])
    longi = float(element_attributes['lon'])
    # Evaluates if the latitude and longitude fall outside the area of interest
    if not(lati > 51.4425602 and lati < 51.5785612) or not (longi > -0.21698 and longi < 0.0164795):
        coord_out_area[node_id] = (lati,longi)

# Audit potal code function
def audit_postal_code(counter_postal_code_types, postal_code_types, child_attributes):
    if child_attributes['k'] == 'postal_code':
        postal_code = child_attributes['v']
        if postal_code_type_1_re.match(postal_code):
            postal_code_types['AADD DAA'].add(postal_code)
            counter_postal_code_types['AADD DAA'] += 1
        elif postal_code_type_2_re.match(postal_code):
            postal_code_types['AADA DAA'].add(postal_code)
            counter_postal_code_types['AADA DAA'] += 1
        elif postal_code_type_3_re.match(postal_code):
            postal_code_types['AADD'].add(postal_code)
            counter_postal_code_types['AADD'] += 1
        else:
            postal_code_types['unknown'].add(postal_code)
            counter_postal_code_types['unkown'] += 1        

# Function to audit streets
def audit_street(street_type_counter, street_types, child_attributes): 
    if child_attributes['k'] == 'addr:street':
            
        street_name = child_attributes['v']
        m1 = street_type1_re.search(street_name)
        m2 = street_type2_re.search(street_name)
        m3 = postal_code_type_1_re.search(street_name)
        m4 = postal_code_type_2_re.search(street_name)
        m5 = postal_code_type_3_re.search(street_name)
        if m1:
            street_type = m1.group()
            street_types[street_type].add(street_name)
            street_type_counter['abbreviation'] += 1
        elif m2:
            street_type = m2.group()
            street_types[street_type].add(street_name)
            street_type_counter['commas'] += 1
        elif m3:
            street_type = m3.group()
            street_types[street_type].add(street_name)
            street_type_counter['postal_code'] += 1
        elif m4:
            street_type = m4.group()
            street_types[street_type].add(street_name)
            street_type_counter['postal_code'] += 1
        elif m5:
            street_type = m5.group()
            street_types[street_type].add(street_name)
            street_type_counter['postal_code'] += 1
 
        
            
# A function to audit nodes and their tags               
def audit_nodes(element): 
    
    # get element attributes
    element_attributes = element.attrib
    # check element attribute types
    audit_attribute_type(node_field_types, element_attributes)
    # Audit coordinates
    audit_coordinates(coord_out_area, element_attributes)
              
    # get children (tag) 
    children = element.iter('tag')
    for child in children:
        # get children attributes
        child_attributes = child.attrib
        # Audit child type
        audit_attribute_type(node_tag_field_types, child_attributes)
        # Audit child postal code
        audit_postal_code(node_counter_postal_code_types, node_postal_code_types, child_attributes)
        # Audit streets names
        audit_street(node_street_type_counter, node_street_types, child_attributes)
        

# A function to audit way elements and their tags and nodes        
def audit_ways(element):
    # get element attributes
    element_attributes = element.attrib
    # check element attribute types
    audit_attribute_type(way_field_types, element_attributes)
    
    # get children (tag) 
    children_tags = element.iter('tag')
    for child in children_tags:
        # get children attributes
        child_attributes = child.attrib
        # audit child type
        audit_attribute_type(way_tag_field_types, child_attributes)
        ## Audit child postal code
        audit_postal_code(way_counter_postal_code_types, way_postal_code_types, child_attributes)
        ## Audit streets names
        audit_street(way_street_type_counter, way_street_types, child_attributes)
        
        

    # get children (nd) 
    children_nodes = element.iter('nd')
    for child in children_nodes:
        # get children attributes
        child_attributes = child.attrib
        audit_attribute_type(way_node_field_types, child_attributes)
        

        
if __name__ == '__main__':
    # Note: Validation is ~ 10X slower. For the project consider using a small
    # sample of the map when validating.
    
    #ITERATE PARSE THE OSM
    context = ET.iterparse(OSM_PATH)
    context.next()
    for _, element in context:
        if element.tag == 'node':
            audit_nodes(element)
        
        elif element.tag == 'way':
            audit_ways(element)
    
    # PRINT RESULTS    
    print "Auditing nodes!"    
    print_nice(node_field_types)
    print
    print "Auditing node coordinates"
    print print_nice(coord_out_area)
    print
    print "Auditing node tags!" 
    print_nice(node_tag_field_types)
    print
    print "Auditing node postal codes"
    print_nice(node_counter_postal_code_types)
    print
    print "Auditing node street names"
    print_nice(node_street_type_counter)
    print_nice(node_street_types)
    print
    print "Auditing ways!"
    print_nice(way_field_types)
    print
    print "Auditing way tags!"
    print_nice(way_tag_field_types)
    print
    print "Auditing node postal codes"
    print_nice(way_counter_postal_code_types)
    print
    print "Auditing node street names"
    print_nice(way_street_type_counter)
    print_nice(way_street_types)
    print
    print "Auditing way nodes!"
    print_nice(way_node_field_types) 
 

Auditing nodes!
changeset: set([<type 'int'>])
uid: set([<type 'int'>])
timestamp: set([<type 'str'>])
lon: set([<type 'float'>])
version: set([<type 'int'>])
user: set([<type 'str'>, <type 'int'>])
lat: set([<type 'float'>])
id: set([<type 'int'>])

Auditing node coordinates
None

Auditing node tags!
k: set([<type 'str'>])
v: set([<type 'str'>, <type 'float'>, <type 'int'>])

Auditing node postal codes
AADD DAA: 368
AADA DAA: 381
unkown: 0
AADD: 104

Auditing node street names
abbreviation: 11
postal_code: 1
commas: 47
N1 1LX: set(['N1 1LX'])
 st: set(['Peartree st'])
,: set(['Eccleston Square, Westminster', 'Pinnacle Way, Limehouse Basin', 'Bride Lane, Fleet Street', 'Northways Parade, Finchley Road', 'High Street, Lewisham', 'The Circle, Saint Pancras Station', 'Parkland Walk, Stroud Green Road', 'Ebury Street, Semley Place', 'Tower Bridge House, St. Katharine Docks', 'High Road, London, Leyton, Greater London E10 6QE', 'The Square, High Road', 'Half Moon Street, Mayfair', '4 Bryans