# Extract OSM XML data
Here we extract the OSM data from the XML file (already downloaded from [here](http://download.geofabrik.de/north-america/us/puerto-rico-latest.osm.bz2)), because the XML apparently contains more information than the .shp files, including neighborhoods.

For each non-null node, we want the following features:

name, geometry_type, feature_class, osm_id, lon, lat, centroid, all_points, shp_type

In [2]:
import pandas as pd
from xml.etree import ElementTree

## Load data

In [37]:
xml_file = '../../data/geo_files/PR_OSM/xml_files/puerto-rico-latest.osm'
# Loading the whole tree at once is too much data:
# e = ElementTree.parse(xml_file).getroot()
# Instead, stream the file and peek at the first `cutoff` parsed elements.
cutoff = 100
ctr = 0
with open(xml_file, 'r') as xml_input:
    tree = ElementTree.iterparse(xml_input)
    # iterparse yields (event, element) pairs; only the element is of interest
    for _event, element in tree:
        print(element.tag)
        print(element.items())
        ctr += 1
        if ctr >= cutoff:
            break

bounds
[('minlat', '17.515728'), ('maxlon', '-65.092622'), ('minlon', '-68.310244'), ('maxlat', '18.81272')]
tag
[('k', 'name'), ('v', 'San Juan')]
tag
[('k', 'place'), ('v', 'city')]
tag
[('k', 'capital'), ('v', 'yes')]
tag
[('k', 'name:ar'), ('v', u'\u0633\u0627\u0646 \u062e\u0648\u0627\u0646\u060c \u0628\u0648\u0631\u062a\u0648\u0631\u064a\u0643\u0648')]
tag
[('k', 'name:az'), ('v', 'San Xuan')]
tag
[('k', 'name:be'), ('v', u'\u0421\u0430\u043d-\u0425\u0443\u0430\u043d')]
tag
[('k', 'name:bg'), ('v', u'\u0421\u0430\u043d \u0425\u0443\u0430\u043d')]
tag
[('k', 'name:el'), ('v', u'\u03a3\u03b1\u03bd \u03a7\u03bf\u03c5\u03ac\u03bd')]
tag
[('k', 'name:eo'), ('v', 'San-Juano')]
tag
[('k', 'name:fa'), ('v', u'\u0633\u0627\u0646 \u062e\u0648\u0622\u0646')]
tag
[('k', 'name:gl'), ('v', u'San Xo\xe1n, Porto Rico')]
tag
[('k', 'name:he'), ('v', u'\u05e1\u05df \u05d7\u05d5\u05d0\u05df')]
tag
[('k', 'name:hu'), ('v', 'San Juan')]
tag
[('k', 'name:hy'), ('v', u'\u054d\u0561\u0576 \u053d\u0578\u0

In [3]:
from collections import defaultdict
def extract_all_nodes(xml_file, keep_ways_and_relations=False):
    """Stream an OSM XML file and collect the data attached to each node.

    The file is parsed incrementally. Child elements (``tag``, ``nd``,
    ``member``, ...) finish parsing before their parent does, so their data
    is buffered and attached to the next ``node``/``way``/``relation`` whose
    closing tag is reached. Processed elements are released immediately so
    that memory use does not grow with the size of the file's XML tree.

    Parameters
    ----------
    xml_file : str
        Path to the OSM XML file.
    keep_ways_and_relations : bool, optional
        When False (the default) only nodes are stored and the returned way
        and relation dictionaries stay empty. When True, ways and relations
        are stored in the same format as nodes.

    Returns
    -------
    tuple
        ``(node_dict, way_dict, relation_dict)``; each is a
        ``defaultdict(list)`` mapping the element's ``id`` attribute to a
        list of ``(key, value)`` pairs: the buffered child data first,
        followed by the element's own ``id``/``lat``/``lon`` attributes
        (whichever are present).

    Raises
    ------
    KeyError
        If a node, way or relation element has no ``id`` attribute.
    """
    node_dict = defaultdict(list)
    way_dict = defaultdict(list)
    relation_dict = defaultdict(list)
    valid_keys = ('id', 'lat', 'lon')
    top_level_tags = ('node', 'way', 'relation')
    ctr = 0
    curr_data = []
    root = None
    # binary mode lets the XML parser apply the encoding declared in the file
    with open(xml_file, 'rb') as xml_input:
        parser = ElementTree.iterparse(xml_input, events=('start', 'end'))
        for event, elem in parser:
            if event == 'start':
                # the first element opened is the document root; remember it
                # so that finished children can be detached from it later
                if root is None:
                    root = elem
                continue
            tag_i = elem.tag
            if tag_i == 'bounds' or tag_i == 'osm':
                continue
            attrib_i = elem.attrib
            if tag_i not in top_level_tags:
                if tag_i == 'tag' and 'k' in attrib_i and 'v' in attrib_i:
                    # look attributes up by name instead of by position, so
                    # the result does not depend on attribute ordering
                    curr_data.append((attrib_i['k'], attrib_i['v']))
                # handling relations
                elif tag_i == 'member':
                    curr_data.append(('ref', attrib_i.get('ref')))
                else:
                    curr_data += list(elem.items())
            # if we've hit a node/way/relation, update dict and flush data
            else:
                # restrict to important data
                curr_data += [pair for pair in elem.items()
                              if pair[0] in valid_keys]
                element_id = attrib_i['id']
                if tag_i == 'node':
                    node_dict[element_id] = curr_data
                elif keep_ways_and_relations:
                    if tag_i == 'way':
                        way_dict[element_id] = curr_data
                    else:
                        relation_dict[element_id] = curr_data
                curr_data = []
                # release the finished element and everything still hanging
                # off the root; otherwise the parser keeps the whole document
                # in memory
                elem.clear()
                root.clear()
            ctr += 1
            if ctr % 1000000 == 0:
                print('processed %d items'%(ctr))
    # TODO: connect ways and relations to lat/lon points using ref numbers
    # TODO: remove all nameless nodes, ways and relations
    # TODO: convert to rows
    return node_dict, way_dict, relation_dict

In [4]:
# Sanity-check the extraction on a small sample file and show each result.
test_file = 'test.xml'
node_dict, way_dict, relation_dict = extract_all_nodes(test_file)
for label, contents in (('nodes', node_dict),
                        ('ways', way_dict),
                        ('relations', relation_dict)):
    print(label)
    print(contents)

nodes
defaultdict(<type 'list'>, {'3749363865': [('name', 'El Suarito'), ('amenity', 'restaurant'), ('cuisine', 'regional'), ('smoking', 'no'), ('delivery', 'no'), ('takeaway', 'yes'), ('addr:city', 'Guayama'), ('addr:street', 'Calle Derkes'), ('addr:postcode', '00785'), ('lon', '-66.1143987'), ('lat', '17.9847791'), ('id', '3749363865')]})
ways
defaultdict(<type 'list'>, {})
relations
defaultdict(<type 'list'>, {})


In [None]:
# Run the extraction over the full Puerto Rico OSM file; the function prints
# a progress line every 1,000,000 processed elements.
xml_file = '../../data/geo_files/PR_OSM/xml_files/puerto-rico-latest.osm'
node_dict, way_dict, relation_dict = extract_all_nodes(xml_file)

processed 1000000 items
processed 2000000 items
processed 3000000 items
processed 4000000 items
processed 5000000 items
processed 6000000 items
processed 7000000 items
processed 8000000 items
processed 9000000 items
processed 10000000 items
processed 11000000 items
processed 12000000 items
processed 13000000 items
processed 14000000 items
processed 15000000 items


Every time we try to run this we get a memory timeout.

Next best thing: let's extract all nodes with names.

In [7]:
from xml.etree import ElementTree
from collections import defaultdict
def extract_all_named_nodes(xml_file):
    """Stream an OSM XML file and collect only the nodes that have a name.

    The file is parsed incrementally. ``tag`` children finish parsing before
    their parent element does, so their key/value pairs are buffered until
    the parent's closing tag is reached. A ``node`` is kept only if one of
    its buffered tags has the key ``name``; ways, relations and unnamed
    nodes just reset the buffer. Processed elements are released immediately
    so that memory use does not grow with the size of the file's XML tree.

    Parameters
    ----------
    xml_file : str
        Path to the OSM XML file.

    Returns
    -------
    defaultdict(list)
        Maps each named node's ``id`` attribute to a list of ``(key, value)``
        pairs: the buffered child data first, followed by the node's own
        ``id``/``lat``/``lon`` attributes (whichever are present).

    Raises
    ------
    KeyError
        If a named node has no ``id`` attribute.
    """
    node_dict = defaultdict(list)
    valid_keys = ('id', 'lat', 'lon')
    top_level_tags = ('node', 'way', 'relation')
    ctr = 0
    curr_data = []
    has_name = False
    root = None
    # binary mode lets the XML parser apply the encoding declared in the file
    with open(xml_file, 'rb') as xml_input:
        parser = ElementTree.iterparse(xml_input, events=('start', 'end'))
        for event, elem in parser:
            if event == 'start':
                # the first element opened is the document root; remember it
                # so that finished children can be detached from it later
                if root is None:
                    root = elem
                continue
            tag_i = elem.tag
            if tag_i == 'bounds' or tag_i == 'osm':
                continue
            attrib_i = elem.attrib
            if tag_i not in top_level_tags:
                if tag_i == 'tag' and 'k' in attrib_i and 'v' in attrib_i:
                    # look attributes up by name instead of by position, so
                    # the result does not depend on attribute ordering
                    k = attrib_i['k']
                    has_name = has_name or k == 'name'
                    curr_data.append((k, attrib_i['v']))
                # handling relations
                elif tag_i == 'member':
                    curr_data.append(('ref', attrib_i.get('ref')))
                else:
                    curr_data += list(elem.items())
            else:
                if tag_i == 'node' and has_name:
                    # restrict to important data
                    curr_data += [pair for pair in elem.items()
                                  if pair[0] in valid_keys]
                    node_dict[attrib_i['id']] = curr_data
                # named or not, start a fresh buffer for the next element
                curr_data = []
                has_name = False
                # release the finished element and everything still hanging
                # off the root; otherwise the parser keeps the whole document
                # in memory
                elem.clear()
                root.clear()
            ctr += 1
            if ctr % 1000000 == 0:
                print('processed %d items'%(ctr))
    # TODO: convert to rows
    return node_dict

In [None]:
# Run the named-node extraction over the full Puerto Rico OSM file; the
# function prints a progress line every 1,000,000 processed elements.
xml_file = '../../data/geo_files/PR_OSM/xml_files/puerto-rico-latest.osm'
node_dict = extract_all_named_nodes(xml_file=xml_file)

processed 1000000 items
processed 2000000 items
processed 3000000 items
processed 4000000 items
processed 5000000 items
processed 6000000 items
processed 7000000 items
processed 8000000 items
processed 9000000 items
processed 10000000 items
processed 11000000 items
processed 12000000 items
processed 13000000 items
processed 14000000 items
processed 15000000 items
processed 16000000 items
processed 17000000 items
processed 18000000 items
processed 19000000 items
processed 20000000 items
processed 21000000 items
processed 22000000 items
processed 23000000 items
processed 24000000 items
processed 25000000 items
processed 26000000 items
processed 27000000 items


In [10]:
# mapping ref IDs to lat/lon
for way_id, way_data in way_dict.iteritems():
    ref_ids = map(lambda x: x[1], filter(lambda y: y[0]=='ref', way_data))
    non_ref_data = filter(lambda x: x[0]!='ref', way_data)
    ref_lats = [[x for x in node_dict[ref_id] if x[0]=='lat'][0] for ref_id in ref_ids]
    ref_lons = [[x for x in node_dict[ref_id] if x[0]=='lon'][0] for ref_id in ref_ids]
    ref_lat_lons = zip(ref_lats, ref_lons)
    non_ref_data.append(('lat_lons', ref_lat_lons))
    way_dict[way_id] = non_ref_data
for relation_id, relation_data in relation_dict.iteritems():
    ref_ids = map(lambda x: x[1], filter(lambda y: y[0]=='ref', x), relation_data)
    non_ref_data = filter(lambda x: x[0]!='ref', relation_data)
    ref_lats = [[x for x in node_dict[ref_id] if x[0]=='lat'][0] for ref_id in ref_ids]
    ref_lons = [[x for x in node_dict[ref_id] if x[0]=='lon'][0] for ref_id in ref_ids]
    ref_lat_lons = zip(ref_lats, ref_lons)
    non_ref_data.append(('lat_lons', ref_lat_lons))
    relation_dict[relation_id] = non_ref_data
# remove all nameless nodes, ways and relations
node_dict = {k : v for k,v in node_dict if 'name' in map(lambda x: x[0])}
way_dict = {k : v for k,v in way_dict if 'name' in map(lambda x: x[0])}
relation_dict = {k : v for k,v in relation_dict if 'name' in map(lambda x: x[0])}

IndexError: list index out of range