# OpenStreetMap Data Case Study
### Portland, Oregon
#### source: https://mapzen.com/data/metro-extracts/metro/portland_oregon/

## PART I: Data Exploration

In [1]:
import xml.etree.cElementTree as ET 
import re as re
from collections import defaultdict
import pprint as pprint
import codecs
import json

In [2]:
# Input file paths used by the cells below: the Portland OSM extract
# and a sample dataset.
portland = "portland.osm"
sample = "sample.osm"

#### Counting tags in the dataset
We will use iterative parsing to process the map file and find out which tags exist and how many of each there are.
The function `count_tags` returns a dictionary with the tag name as the key and the number of times that tag occurs in the map as the value.

In [3]:
# count tags in the dataset
def count_tags(filename):
    """Count how many times each element tag occurs in an XML document.

    The document is parsed iteratively, so it is never held in memory as
    a whole.

    Args:
        filename: Path of the XML file, or an open binary file object.

    Returns:
        A dict mapping each tag name to the number of elements with
        that tag.
    """
    tags = {}
    for _, elem in ET.iterparse(filename):
        # dict.get replaces the deprecated dict.has_key (gone in Python 3).
        tags[elem.tag] = tags.get(elem.tag, 0) + 1
        # Only the tag name is needed; drop the element's attributes and
        # children so memory use stays flat on multi-gigabyte extracts.
        elem.clear()
    return tags

In [4]:
# Count every element tag in the Portland extract and pretty-print the totals.
portland_tags = count_tags(portland)
pprint.pprint(portland_tags)

{'bounds': 1,
 'member': 65447,
 'nd': 7312666,
 'node': 6385864,
 'osm': 1,
 'relation': 6117,
 'tag': 4878233,
 'way': 822892}


#### Exploring the dataset further using regular expressions
We will now explore the dataset further using regular expressions to check for the following:
- **lower** for tags that contain only lowercase letters and are valid
- **lower_colon** for otherwise valid tags with a colon in their names
- **problemchars** for tags with problematic characters
- **other** for other tags that do not fall into the other three categories

In [5]:
# regular expressions used to classify the "k" attribute of <tag> elements
# lower: the whole key consists of lowercase ASCII letters and underscores
# (the pattern also accepts an empty key).
lower = re.compile(r'^([a-z]|_)*$')
# lower_colon: two such runs separated by exactly one colon, e.g. "addr:street".
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# problemchars: a single character from the listed set of punctuation and
# whitespace characters.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

In [6]:
def key_type(element, keys):
    """Classify one element's key and bump the matching counter.

    Elements other than ``tag`` leave the counters untouched. For a
    ``tag`` element the ``k`` attribute is tested against
    ``problemchars``, ``lower_colon`` and ``lower`` in that order; the
    first pattern found decides the category, otherwise it is ``other``.

    Args:
        element: Parsed XML element.
        keys: Dict of counters with the keys ``lower``, ``lower_colon``,
            ``problemchars`` and ``other``; updated in place.

    Returns:
        The same ``keys`` dict.
    """
    if element.tag != "tag":
        return keys

    k_value = element.attrib['k']
    if problemchars.search(k_value):
        category = 'problemchars'
    elif lower_colon.search(k_value):
        category = 'lower_colon'
    elif lower.search(k_value):
        category = 'lower'
    else:
        category = 'other'

    keys[category] += 1
    return keys

In [7]:
def process_map(filename):
    """Tally the tag-key categories found in an OSM XML file.

    Every element produced by iterative parsing is handed to
    ``key_type``, which updates the running counters.

    Args:
        filename: Path of the XML file to scan.

    Returns:
        Dict with the counters ``lower``, ``lower_colon``,
        ``problemchars`` and ``other``.
    """
    counts = dict.fromkeys(("lower", "lower_colon", "problemchars", "other"), 0)
    for _event, elem in ET.iterparse(filename):
        counts = key_type(elem, counts)
    return counts

In [8]:
# Tally the tag-key categories for the Portland extract; the path literal
# is the same value as the `portland` variable.
portland_keys = process_map('portland.osm')

In [9]:
# Show the per-category key counts.
pprint.pprint(portland_keys)

{'lower': 2400745, 'lower_colon': 2441784, 'other': 35704, 'problemchars': 0}


#### Number of unique contributors
The following functions determine the number of unique users who have contributed to the Portland dataset.

In [10]:
def get_user(element):
    """Return the contributor id recorded on an element.

    Args:
        element: Parsed XML element.

    Returns:
        The value of the element's ``uid`` attribute, or None when the
        element has no such attribute.
    """
    # Previously an empty stub that always returned None.
    return element.get('uid')

def process_map(filename):
    """Collect the distinct ``uid`` values found in an XML file.

    Args:
        filename: Path of the XML file, or an open file object.

    Returns:
        Set of the ``uid`` attribute values of all elements that carry one.
    """
    return set(
        elem.attrib['uid']
        for _event, elem in ET.iterparse(filename)
        if 'uid' in elem.attrib
    )

In [11]:
# Collect the distinct contributor ids in the Portland extract and show
# how many there are.
unique_users = process_map(portland)
len(unique_users)

1167

## PART II: Auditing

In [12]:
# Matches the final run of non-whitespace characters in a street name
# (starting at a word boundary), e.g. "Rd" in "SE Foster Rd" or "100" in
# "... Rd #100". The pattern contains no letters, so re.IGNORECASE has no
# effect on what it matches.
street_type_re = re.compile(r'\b\S+\.?$', re.IGNORECASE)

# Street types that are already spelled out; names ending in one of these
# are not reported by the audit.
expected = ["Street", "Avenue", "Boulevard", "Drive", "Court", "Place", "Square", "Lane", "Road",
            "Trail", "Parkway", "Commons"]

# Abbreviation -> full street type, with and without a trailing period.
mapping = { "St" : "Street",
            "St." : "Street",
            "Ave" : "Avenue",
            "Ave." : "Avenue",
            "Blvd" : "Boulevard",
            "Blvd." : "Boulevard",
            "Dr" : "Drive",
            "Dr." : "Drive",
            "Ct" : "Court",
            "Ct." : "Court",
            "Pl" : "Place",
            "Pl." : "Place",
            "Sq" : "Square",
            "Sq." : "Square",
            "Ln" : "Lane",
            "Ln." : "Lane",
            "Rd" : "Road",
            "Rd." : "Road",
            "Trl" : "Trail",
            "Trl." : "Trail",
            "Pkwy" : "Parkway",
            "Pkwy." : "Parkway",
            "Cmns" : "Commons",
            "Cmns." : "Commons"}

In [13]:
def audit_street_type(street_types, street_name):
    """Record a street name under its trailing token if that token is unexpected.

    Args:
        street_types: Mapping from trailing token to a set of street
            names; a missing key must yield a new set (for example a
            ``defaultdict(set)``). Updated in place.
        street_name: Street name to inspect.

    Returns:
        None.
    """
    found = street_type_re.search(street_name)
    if found is None:
        return

    suffix = found.group()
    if suffix in expected:
        return

    street_types[suffix].add(street_name)

In [14]:
def is_street_name(elem):
    """Tell whether an element's ``k`` attribute is exactly ``addr:street``.

    Args:
        elem: Parsed XML element that has a ``k`` attribute.

    Returns:
        True if the key equals ``addr:street``, otherwise False.

    Raises:
        KeyError: If the element has no ``k`` attribute.
    """
    key = elem.attrib['k']
    return key == "addr:street"

In [15]:
def audit(osmfile):
    """Group unexpected street-name endings found in an OSM XML file.

    Every ``addr:street`` tag below a ``node`` or ``way`` element is
    passed to ``audit_street_type``.

    Args:
        osmfile: Path of the OSM XML file.

    Returns:
        ``defaultdict(set)`` mapping each unexpected trailing token to
        the set of street names that end with it.
    """
    street_types = defaultdict(set)
    # Binary mode lets the XML parser apply the encoding declared in the
    # document; the with-block closes the file even if parsing fails.
    with open(osmfile, "rb") as osm_file:
        # Children are only guaranteed to be present on "end" events; on
        # "start" events an element's <tag> children may not be parsed yet.
        for _, elem in ET.iterparse(osm_file, events=("end",)):
            if elem.tag == "node" or elem.tag == "way":
                for tag in elem.iter("tag"):
                    if is_street_name(tag):
                        audit_street_type(street_types, tag.attrib['v'])
                # The element is fully processed; release its children
                # and attributes to keep memory use bounded.
                elem.clear()
    return street_types

In [16]:
# Run the street-name audit over the Portland extract.
portland_audit = audit(portland)

In [17]:
# Convert the defaultdict to a plain dict before pretty-printing it.
pprint.pprint(dict(portland_audit))

{'100': set(['SW Upper Boones Ferry Rd #100']),
 '101': set(['Northwest Hoyt Street #101']),
 '155th': set(['Southwest 155th']),
 '156th': set(['Southwest 156th']),
 '157th': set(['Southwest 157th']),
 '158th': set(['Southwest 158th']),
 '160th': set(['Southwest 160th']),
 '163rd': set(['Southwest 163rd']),
 '165th': set(['Southwest 165th']),
 '170': set(['South Highway 170']),
 '211': set(['Highway 211', 'South Highway 211', 'Southeast Highway 211']),
 '212': set(['SE Highway 212', 'Southeast Highway 212']),
 '213': set(['Highway 213', 'South Highway 213']),
 '224': set(['Northwest Highway 224',
             'South Highway 224',
             'Southeast Highway 224',
             'Southwest Highway 224']),
 '26': set(['Highway 26', 'Southeast Highway 26']),
 '273rd': set(['Northwest 273rd']),
 '330': set(['Northeast 95th Street #330']),
 '41st': set(['41st']),
 '426': set(['Southwest 3rd Avenue #426']),
 '4616': set(['4616']),
 '4637': set(['4637']),
 '47': set(['Northwest Highway 47',

In [18]:
def update_name(name, mapping):
    """Expand an abbreviated street type in a street name.

    The name is split on spaces and scanned from the right; the first
    token that is a key of ``mapping`` is replaced by its full form.
    Only whole tokens are compared, so letters inside a longer word (the
    "St" in "Stark", the "Dr" in "Drake") are never rewritten, and a
    trailing unit such as "#100" does not hide the street type before it.

    Args:
        name: Street name, e.g. ``"SE Foster Rd"``.
        mapping: Dict from abbreviation to the full street type.

    Returns:
        The name with its rightmost abbreviated token expanded, or the
        unchanged name when no token is an abbreviation.
    """
    tokens = name.split(" ")
    # Scan right-to-left: the street type normally sits at or near the
    # end, while a leading "St" is more likely part of the street's name.
    for index in range(len(tokens) - 1, -1, -1):
        replacement = mapping.get(tokens[index])
        if replacement is not None:
            tokens[index] = replacement
            return " ".join(tokens)
    return name

In [19]:
# Show the effect of update_name on every street name collected by the audit.
# Each value of portland_audit is the set of street names that share one
# unexpected ending.
for street_type, ways in portland_audit.iteritems():
    for name in ways:
        better_name = update_name(name, mapping)
        print name, "=>", better_name

North Mississippi => North Mississippi
Arney Road Northeast => Arney Road Northeast
Donald Road Northeast => Donald Road Northeast
SE Highway 212 => SE Highway 212
Southeast Highway 212 => Southeast Highway 212
Highway 213 => Highway 213
South Highway 213 => South Highway 213
Highway 211 => Highway 211
South Highway 211 => South Highway 211
Southeast Highway 211 => Southeast Highway 211
North Highway 99w => North Highway 99w
Southeast Belmore Heights => Southeast Belmore Heights
SE Foster Rd => SE Foster Road
SW Nicol Rd => SW Nicol Road
SW Rock Creek Rd => SW Rock Creek Road
6710 McEwan Rd => 6710 McEwan Road
SW Farmington Rd => SW Farmington Road
NW Laidlaw Rd => NW Laidlaw Road
Old Portland Rd => Old Portland Road
NW Cornell Rd => NW Cornell Road
SW Bany Rd => SW Bany Road
SW Brockman Rd => SW Brockman Road
SE Sunnyside Rd => SE Sunnyside Road
SW Walker Rd => SW Walker Road
NW Saltzman Rd => NW Saltzman Road
SW Scholls Ferry Rd => SW Scholls Ferry Road
E Dartmouth => E Dartmouth
Sou

## PART III: Prep data for MongoDB

We want to wrangle the data and transform its shape so that it can be used with pymongo:

In [20]:
# Key-shape patterns (same definitions as in the exploration cells above).
# lower: lowercase ASCII letters and underscores only.
lower = re.compile(r'^([a-z]|_)*$')
# lower_colon: two such runs separated by exactly one colon.
lower_colon = re.compile(r'^([a-z]|_)*:([a-z]|_)*$')
# problemchars: one character from the listed punctuation/whitespace set.
problemchars = re.compile(r'[=\+/&<>;\'"\?%#$@\,\. \t\r\n]')

# Element attributes that shape_element groups under the "created" key.
CREATED = [ "version", "changeset", "timestamp", "user", "uid"]

In [21]:
def shape_element(element):
    """Convert a ``node`` or ``way`` element into a JSON-ready dict.

    The resulting dict has this layout:

    * ``type``: the element tag (``"node"`` or ``"way"``).
    * ``created``: dict of the attributes listed in ``CREATED``.
    * ``pos``: ``[lat, lon]`` as floats, when both attributes are present.
    * every other attribute is copied as a top-level key.
    * ``address``: dict built from child tags whose key starts with
      ``addr:``; keys with a further colon (``addr:street:type``) are
      skipped.
    * every other child tag is copied as ``key: value``; a tag whose key
      equals an existing field name overwrites that field.
    * ``node_refs``: list of the ``ref`` values of child ``nd`` elements.

    Child tags whose key contains a character matched by
    ``problemchars`` are ignored.

    Args:
        element: Parsed XML element.

    Returns:
        The dict described above, or None for any element that is not a
        ``node`` or ``way``.
    """
    # process only 2 types of top level tags: "node" and "way"
    if element.tag not in ("node", "way"):
        return None

    node = {"type": element.tag}

    for key, val in element.attrib.items():
        if key in CREATED:
            # setdefault creates the sub-dict AND stores this value; the
            # earlier if/else dropped whichever attribute came first.
            node.setdefault("created", {})[key] = val
        elif key not in ("lat", "lon"):
            node[key] = val

    # Build "pos" once, and only when both coordinates exist, so a
    # missing attribute cannot reach float() as None.
    lat, lon = element.get("lat"), element.get("lon")
    if lat is not None and lon is not None:
        node["pos"] = [float(lat), float(lon)]

    # Child tags are processed once per element (not once per attribute).
    address = {}
    for tag in element.iter("tag"):
        key_t, val_t = tag.attrib['k'], tag.attrib['v']
        # search() checks the whole key; match() only looked at its
        # first character.
        if problemchars.search(key_t):
            continue
        if key_t.startswith("addr:"):
            addr_key = key_t[len("addr:"):]
            # A second colon marks a sub-field such as "street:type".
            if ":" in addr_key:
                continue
            address[addr_key] = val_t
        else:
            node[key_t] = val_t
    if address:
        # Attached after the loop so a plain "address" tag cannot turn
        # this field into a string midway through.
        node["address"] = address

    # The "ref" attribute lives on each <nd> child, not on the way itself.
    node_refs = [nd.attrib["ref"] for nd in element.iter("nd")
                 if "ref" in nd.attrib]
    if node_refs:
        node["node_refs"] = node_refs

    return node

In [22]:
def process_map(file_in, pretty = False):
    """Shape every element of an OSM file and write the results as JSON lines.

    The output goes to ``<file_in>.json``. Each element for which
    ``shape_element`` returns a non-empty dict is written as one JSON
    document followed by a newline.

    Args:
        file_in: Path of the OSM XML file.
        pretty: When true, indent each JSON document by two spaces.

    Returns:
        List of all shaped elements, in document order.
    """
    file_out = "{0}.json".format(file_in)
    # json.dumps treats indent=None the same as leaving indent out.
    indent = 2 if pretty else None
    data = []
    with codecs.open(file_out, "w") as fo:
        for _event, element in ET.iterparse(file_in):
            shaped = shape_element(element)
            if not shaped:
                continue
            data.append(shaped)
            fo.write(json.dumps(shaped, indent=indent) + "\n")
    return data

In [None]:
# Convert the Portland extract to JSON; the output is written to
# "portland.osm.json".
process_map(portland)

## PART IV: Data Exploration

In [None]:
from pymongo import MongoClient

In [None]:
# Connect with MongoClient's default arguments and select the database
# named "portland".
client = MongoClient()
db = client.portland