# Test OpenStreetMap data
In which we look at Puerto Rico's OSM data, downloaded from [here](http://download.geofabrik.de/north-america/us/puerto-rico.html).

In [13]:
import fiona
import json
import os

## Load data

In [5]:
test_file = '../../data/geo_files/PR_OSM/shp_files/gis.osm_places_a_free_1.shp'
c = fiona.open(test_file, 'r')
print(c)

<open Collection '../../data/geo_files/PR_OSM/shp_files/gis.osm_places_a_free_1.shp:gis.osm_places_a_free_1', mode 'r' at 0x7f92fc136850>


In [6]:
c_list = list(c)

In [9]:
print(json.dumps(c_list[0], indent=2, sort_keys=True))

{
  "geometry": {
    "coordinates": [
      [
        [
          -65.3848792, 
          18.1089953
        ], 
        [
          -65.3847676, 
          18.1095664
        ], 
        [
          -65.3845959, 
          18.1098356
        ], 
        [
          -65.3843985, 
          18.1098764
        ], 
        [
          -65.3841239, 
          18.1092646
        ], 
        [
          -65.3837634, 
          18.108669
        ], 
        [
          -65.3833514, 
          18.1076901
        ], 
        [
          -65.3831626, 
          18.107119
        ], 
        [
          -65.3828536, 
          18.1064501
        ], 
        [
          -65.3825617, 
          18.1060014
        ], 
        [
          -65.3827334, 
          18.1060014
        ], 
        [
          -65.3830081, 
          18.1061401
        ], 
        [
          -65.3833514, 
          18.1059769
        ], 
        [
          -65.3836775, 
          18.1057566
        ], 
        [
       

In [12]:
print('\n'.join(map(lambda x: json.dumps(x['properties'], indent=2, sort_keys=True), c_list)))

{
  "code": 1020, 
  "fclass": "island", 
  "name": "Isla Chiva", 
  "osm_id": "28332664", 
  "population": 0
}
{
  "code": 1020, 
  "fclass": "island", 
  "name": "", 
  "osm_id": "28332672", 
  "population": 0
}
{
  "code": 1020, 
  "fclass": "island", 
  "name": "Cayo Rat\u00f3n", 
  "osm_id": "44026832", 
  "population": 0
}
{
  "code": 1020, 
  "fclass": "island", 
  "name": "Cayos Geniqu\u00ed", 
  "osm_id": "44026835", 
  "population": 0
}
{
  "code": 1020, 
  "fclass": "island", 
  "name": "Pelaita", 
  "osm_id": "44026838", 
  "population": 0
}
{
  "code": 1020, 
  "fclass": "island", 
  "name": "Cayo Lobo", 
  "osm_id": "44026839", 
  "population": 0
}
{
  "code": 1020, 
  "fclass": "island", 
  "name": "Los Gemelos", 
  "osm_id": "44026842", 
  "population": 0
}
{
  "code": 1020, 
  "fclass": "island", 
  "name": "Cayo Botella", 
  "osm_id": "44026845", 
  "population": 0
}
{
  "code": 1020, 
  "fclass": "island", 
  "name": "Cayos Geniqu\u00ed", 
  "osm_id": "44026847", 
  

Cool! Looks like I found the "island and marine area" file.

What are all shape files?

In [14]:
all_shape_files = sorted(os.listdir('../../data/geo_files/PR_OSM/shp_files/'))
print('\n'.join(all_shape_files))

gis.osm_buildings_a_free_1.cpg
gis.osm_buildings_a_free_1.dbf
gis.osm_buildings_a_free_1.prj
gis.osm_buildings_a_free_1.shp
gis.osm_buildings_a_free_1.shx
gis.osm_landuse_a_free_1.cpg
gis.osm_landuse_a_free_1.dbf
gis.osm_landuse_a_free_1.prj
gis.osm_landuse_a_free_1.shp
gis.osm_landuse_a_free_1.shx
gis.osm_natural_a_free_1.cpg
gis.osm_natural_a_free_1.dbf
gis.osm_natural_a_free_1.prj
gis.osm_natural_a_free_1.shp
gis.osm_natural_a_free_1.shx
gis.osm_natural_free_1.cpg
gis.osm_natural_free_1.dbf
gis.osm_natural_free_1.prj
gis.osm_natural_free_1.shp
gis.osm_natural_free_1.shx
gis.osm_places_a_free_1.cpg
gis.osm_places_a_free_1.dbf
gis.osm_places_a_free_1.prj
gis.osm_places_a_free_1.shp
gis.osm_places_a_free_1.shx
gis.osm_places_free_1.cpg
gis.osm_places_free_1.dbf
gis.osm_places_free_1.prj
gis.osm_places_free_1.shp
gis.osm_places_free_1.shx
gis.osm_pofw_a_free_1.cpg
gis.osm_pofw_a_free_1.dbf
gis.osm_pofw_a_free_1.prj
gis.osm_pofw_a_free_1.shp
gis.osm_pofw_a_free_1.shx
gis.osm_pofw_free_1.

According to [this manual](http://download.geofabrik.de/osm-data-in-gis-formats-free.pdf), the `poi` files contain points of interest, which seems to be more of what we're looking for. Let's open that up!

In [15]:
poi_file = '../../data/geo_files/PR_OSM/shp_files/gis.osm_pois_free_1.shp'
poi_collection = fiona.open(poi_file)
poi_list = list(poi_collection)
print('%d POIs'%(len(poi_list)))

6502 POIs


In [18]:
def pretty_print(x):
    return json.dumps(x, indent=2, sort_keys=True)

In [19]:
print('\n'.join(map(lambda x: pretty_print(x), poi_list[:10])))

{
  "geometry": {
    "coordinates": [
      -66.063993, 
      17.961996
    ], 
    "type": "Point"
  }, 
  "id": "0", 
  "properties": {
    "code": 2723, 
    "fclass": "monument", 
    "name": "", 
    "osm_id": "237846268"
  }, 
  "type": "Feature"
}
{
  "geometry": {
    "coordinates": [
      -66.0473769, 
      18.2401997
    ], 
    "type": "Point"
  }, 
  "id": "1", 
  "properties": {
    "code": 2082, 
    "fclass": "school", 
    "name": "Escuela Pepita Arenas", 
    "osm_id": "238054139"
  }, 
  "type": "Feature"
}
{
  "geometry": {
    "coordinates": [
      -66.0515189, 
      18.11136
    ], 
    "type": "Point"
  }, 
  "id": "2", 
  "properties": {
    "code": 2950, 
    "fclass": "tower", 
    "name": "WSRA-FM (Guayama)", 
    "osm_id": "238073429"
  }, 
  "type": "Feature"
}
{
  "geometry": {
    "coordinates": [
      -66.6407374, 
      18.0348221
    ], 
    "type": "Point"
  }, 
  "id": "3", 
  "properties": {
    "code": 2012, 
    "fclass": "community_centre",

Good! These are examples of entities that don't show up in Wikipedia but still exist (e.g., `Escuela Luis M Santiago`).

Let's check for some of the more obscure entities that we found during annotation:

- `Villa Caribe` (restaurant)
- `Urbanización Villa Rosa` (apartments)
- `Coliseo Max Sánchez` (stadium)

In [22]:
poi_names = sorted(map(lambda x: x['properties']['name'], poi_list))
test_names = ['Villa Caribe', 'Urbanización Villa Rosa', 'Coliseo Max Sánchez']
print(set(poi_names) & set(test_names))

set([])


In [35]:
test_strs = ['Villa', 'Urb', 'Coliseo']
for test_str in test_strs:
    print('**%s**'%(test_str))
    matching_names = [p for p in poi_names if test_str in p]
    print(u'\n'.join(matching_names))

**Villa**
Escuela Anselmo Villarrubia
Escuela Villa Capri
Escuela Villa Granada (elemental)
Escuela Villa Granada Intermedia
Escuela Villa Marisol
Escuela Villa Marína
La Villa del Norte Bakery
Mirador Villalba Orocovis
Ortodoncistas Dr. C. méndez Villamil, Dr. M. Torres Díaz
Parador Villa Antonio
Parador Villa del Mar
Superfarmacia Villa Toledo
Tasty Villa Chan
Villa Carmen Econo Supermarket
Villa Fontana Park
Villa Montaña Beach Resort
Villa Taina
Villa Verde Inn
Villa Vigía
Village Spa
Villas de Sotomayor
**Urb**
Escuela Elemental Urbana
Escuela Elemental Urbana (escuela nueva)
Escuela Elemental Urbana K-6
Escuela Elemental Urbana Nueva
Escuela Elemental Urbana Nueva
Escuela Intermedia Urbana
Escuela Joséfa Vélez Bauza (Superior Urbana)
Escuela Nueva Elemental Urbana De Guaynabo
Escuela Nueva Urbana De Ciales
Escuela Superior Urbana
Escuela Superior Urbana Nueva
Nomada Urban Beach Hostel
Parque Urbano Dora Colón Clavell
Sabor Urbano
Urban San Juan
**Coliseo**
Coliseo Angel Mercado V

OK! So maybe some of these are too obscure even for OSM. But we do see the major entities coming out.

What's in the other POI list?

In [38]:
poi_file_2 = '../../data/geo_files/PR_OSM/shp_files/gis.osm_pois_a_free_1.shp'
poi_collection_2 = fiona.open(poi_file_2)
poi_list_2 = list(poi_collection_2)
print('%d POIs'%(len(poi_list_2)))

7218 POIs


In [40]:
print('\n'.join(map(pretty_print, poi_list_2[:10])))

{
  "geometry": {
    "coordinates": [
      [
        [
          -66.0490623, 
          18.4025492
        ], 
        [
          -66.0490312, 
          18.4029692
        ], 
        [
          -66.0485621, 
          18.4029378
        ], 
        [
          -66.0485584, 
          18.4029878
        ], 
        [
          -66.0484696, 
          18.4029819
        ], 
        [
          -66.0484771, 
          18.4028803
        ], 
        [
          -66.0484569, 
          18.4028789
        ], 
        [
          -66.0484822, 
          18.4025378
        ], 
        [
          -66.0485024, 
          18.4025392
        ], 
        [
          -66.0485137, 
          18.4023866
        ], 
        [
          -66.0486474, 
          18.4023956
        ], 
        [
          -66.0486381, 
          18.4025208
        ], 
        [
          -66.0490623, 
          18.4025492
        ]
      ]
    ], 
    "type": "Polygon"
  }, 
  "id": "0", 
  "properties": {
    "cod

Maybe the combined list has the missing entities!!

In [41]:
poi_names_2 = sorted(map(lambda x: x['properties']['name'], poi_list_2))
poi_names_combined = list(set(poi_names + poi_names_2))
test_names = ['Villa Caribe', 'Urbanización Villa Rosa', 'Coliseo Max Sánchez']
print(set(poi_names_combined) & set(test_names))

set([])


Still nope! These names must be extremely obscure.

How about other lists?

## Traffic

In [44]:
traffic_file_name = '../../data/geo_files/PR_OSM/shp_files/gis.osm_traffic_free_1.shp'
traffic_collection_1 = fiona.open(traffic_file_name)
traffic_list_1 = list(traffic_collection_1)
print('%d traffic points'%(len(traffic_list_1)))

2454 traffic points


In [46]:
print('\n'.join(map(pretty_print, traffic_list_1[:10])))

{
  "geometry": {
    "coordinates": [
      -66.0166962, 
      18.4358875
    ], 
    "type": "Point"
  }, 
  "id": "0", 
  "properties": {
    "code": 5206, 
    "fclass": "motorway_junction", 
    "name": "", 
    "osm_id": "114570246"
  }, 
  "type": "Feature"
}
{
  "geometry": {
    "coordinates": [
      -66.0222767, 
      18.4331861
    ], 
    "type": "Point"
  }, 
  "id": "1", 
  "properties": {
    "code": 5206, 
    "fclass": "motorway_junction", 
    "name": "", 
    "osm_id": "114596759"
  }, 
  "type": "Feature"
}
{
  "geometry": {
    "coordinates": [
      -66.0763998, 
      18.4282765
    ], 
    "type": "Point"
  }, 
  "id": "2", 
  "properties": {
    "code": 5206, 
    "fclass": "motorway_junction", 
    "name": "", 
    "osm_id": "217540324"
  }, 
  "type": "Feature"
}
{
  "geometry": {
    "coordinates": [
      -66.0467208, 
      18.4080458
    ], 
    "type": "Point"
  }, 
  "id": "3", 
  "properties": {
    "code": 5206, 
    "fclass": "motorway_junction", 

Unnamed motorways. Not interesting.

## Roads

In [48]:
road_file_1 = '../../data/geo_files/PR_OSM/shp_files/gis.osm_roads_free_1.shp'
road_collection_1 = fiona.open(road_file_1)
road_list_1 = list(road_collection_1)
print('%d road entities'%(len(road_list_1)))

159560 road entities


In [49]:
print('\n'.join(map(pretty_print, road_list_1[:10])))

{
  "geometry": {
    "coordinates": [
      [
        -66.0159198, 
        18.4362435
      ], 
      [
        -66.0160557, 
        18.4362188
      ], 
      [
        -66.016188, 
        18.4361882
      ], 
      [
        -66.0163615, 
        18.4360911
      ], 
      [
        -66.0165417, 
        18.4359771
      ], 
      [
        -66.0166962, 
        18.4358875
      ]
    ], 
    "type": "LineString"
  }, 
  "id": "0", 
  "properties": {
    "bridge": "F", 
    "code": 5111, 
    "fclass": "motorway", 
    "layer": 0, 
    "maxspeed": 0, 
    "name": "", 
    "oneway": "F", 
    "osm_id": "12558468", 
    "ref": "", 
    "tunnel": "F"
  }, 
  "type": "Feature"
}
{
  "geometry": {
    "coordinates": [
      [
        -66.0364784, 
        18.4462158
      ], 
      [
        -66.0365735, 
        18.4462263
      ], 
      [
        -66.0369382, 
        18.4462657
      ], 
      [
        -66.0375344, 
        18.4463144
      ], 
      [
        -66.0413925, 
     

These roads do have names! Do we have any repeat offenders?

In [53]:
from collections import Counter
road_names_1 = map(lambda x: x['properties']['name'], road_list_1)
road_counter = Counter(road_names_1)
top_k = 50
print('\n'.join(map(lambda x: '%s,%d'%(x[0],x[1]), road_counter.most_common(top_k))))

,106345
Calle 1,1126
Calle 2,1052
Calle 3,976
Calle 4,875
Calle 5,745
Calle 6,654
Calle 7,606
Calle 8,530
Calle A,521
Calle B,448
Calle 9,425
Calle 10,398
Autopista Luis A. Ferré,365
Calle C,334
Calle 11,282
Autopista José de Diego,282
Calle 12,274
Calle D,272
Calle 13,250
Calle 14,210
Calle Marginal,195
Carretera 2,195
Carretera 3,187
Calle E,186
Calle 15,186
Autopista José Celso Barbosa,172
Calle F,159
Expreso Roberto Sánchez Vilella,156
Calle 16,151
Calle 17,143
Calle Principal,140
Calle 19,128
Calle 18,126
Calle 20,115
Calle 21,111
Carretera 1,107
Calle Amapola,105
Calle G,104
Calle 22,103
Expreso Cruz Ortíz Stella,102
Calle 23,101
Calle Luis Muñoz Rivera,100
Autopista Roberto Sánchez Vilella,94
Calle Gardenia,86
Calle H,81
Calle 24,77
Calle Margarita,76
Calle Ceiba,72
Calle 25,71


OK! The most common street markers are the most obvious ones, stuff like "1st Street", "2nd Street," etc. These streets are probably common across locations, which makes them good candidates for disambiguation!

## Natural

We should make a quick summary function to extract all names from a shape file.

In [54]:
def get_all_location_names(file_name):
    location_iter = fiona.open(file_name)
    locations = list(location_iter)
    location_names = map(lambda x: x['properties']['name'], locations)
    return location_names

In [56]:
natural_file_name = '../../data/geo_files/PR_OSM/shp_files/gis.osm_natural_free_1.shp'
natural_location_names_1 = get_all_location_names(natural_file_name)
natural_location_name_counts = Counter(natural_location_names_1)
top_k = 50
print_count = lambda x: '%s,%d'%(x[0],x[1])
print('\n'.join(map(print_count, natural_location_name_counts.most_common(top_k))))

,9006
Palma,31
Palmas,17
Palma Areca,10
Roble Rosado,9
Plátano,9
Mangó,8
roble,7
Palm,7
Mango,7
Roble,7
Roble Lila,6
Flamboyán,6
Cerro Gordo,5
Roble rosado,5
Cerro Avispa,4
María,4
Roble Amarillo,4
Cerro de las Cuevas,3
Dracaena,3
Cerro Magueyes,3
Cerro Vigía,2
Ficus,2
Jatropha,2
Palma Navidad,2
Piedra Blanca,2
Playa Grande,2
Cerro Lucero,2
La Ceiba,2
Cerro Miraflores,2
Acacia Blanca,2
Croto,2
Cerro El Gato,2
Higuereta,2
Cerro de la Bandera,2
Úcar,2
Cerro Santo Domingo,2
Cerro Malo,2
Cerro San José,2
Cerro Pichón,2
Cerro de las Avispas,2
Arbol de María,2
Pterocarpus,2
La Torrecilla,2
Cerro del Muerto,2
Almácigo,2
Trinitarias,2
palm,2
úcar,2
Cerro Farallón,2


The natural locations have a long tail and seem to focus on trees (`palma` => "palm", `roble` => "oak") and physical landmarks (`cerro` => "hill").

## Buildings

In [57]:
building_file_name = '../../data/geo_files/PR_OSM/shp_files/gis.osm_buildings_a_free_1.shp'
building_location_names_1 = get_all_location_names(building_file_name)
building_location_name_counts = Counter(building_location_names_1)
top_k = 50
print_count = lambda x: '%s,%d'%(x[0],x[1])
print('\n'.join(map(print_count, building_location_name_counts.most_common(top_k))))

,1428913
Burger King,31
McDonald's,28
Walgreens,26
ruins,18
Caparra Country Club,14
Wendy's,13
Church's Chicken,13
KFC,13
Escuela Rosalina C. MartÃ­Â­nez,11
Pizza Hut,11
Banco Popular,9
Caseta Guardia,8
Shell,8
ruinss,8
Mini Almacenes,7
Villa Rosa,7
Scotiabank,7
Condominio Rexville Park,7
unknown,7
Cond. Plaza del Mar,7
Total,7
Cond. The Falls,6
Museo Casa Blanca,6
Iglesia Metodista,6
Juncos Apartment,6
Taco Bell,6
Villas del Sol,6
Apartamentos,6
CVS,5
First Bank,5
Escuela Superior Tomás Carrión Maduro,5
Puma,5
Walmart,5
Amigo,5
Econo,5
Sam's Club,5
ruinsss,5
Cabanillas Industrial Park,4
Centro de Usos Multiples,4
AutoZone,4
Academia Sally Olsen,4
Oficina,4
smokestack,4
Club de los Telefónicos,4
Dr. Facundo Bueso,4
Subestacion AEE,4
Banco Popular de Puerto Rico,4
Terminal de Carros Públicos,4
Casa Alcaldía,4


Lots of businesses (`McDonald's`) and a long tail of residential stuff (`Juncos Apartment`).

## Test OSM lexicon
Does using the OSM lexicon make more sense than using the CrossWikis lexicon??

In [60]:
import re
shape_file_dir = '../../data/geo_files/PR_OSM/shp_files/'
shape_type_matcher = re.compile('(?<=osm_)[a-zA-Z]+(?=_)')
unique_shape_types = sorted(set([shape_type_matcher.findall(f)[0] for f in os.listdir(shape_file_dir)]))
print(unique_shape_types)

['buildings', 'landuse', 'natural', 'places', 'pofw', 'pois', 'railways', 'roads', 'traffic', 'transport', 'water', 'waterways']


To start, let's use the land-based shape types.

In [73]:
shape_types = ['buildings', 'landuse', 'natural', 'places', 'pofw', 'pois', 'roads', 'traffic']
file_base = os.path.join(shape_file_dir, 'gis.osm_%s_free_1.shp')
# get relevant files
shape_file_matcher = re.compile('gis.osm_(%s).*.shp'%('|'.join(shape_types)))
# shape_files = [file_base%(s) for s in shape_types]
shape_files = filter(lambda x: shape_file_matcher.match(x), os.listdir(shape_file_dir))
print(shape_files)

['gis.osm_buildings_a_free_1.shp', 'gis.osm_landuse_a_free_1.shp', 'gis.osm_natural_a_free_1.shp', 'gis.osm_natural_free_1.shp', 'gis.osm_places_a_free_1.shp', 'gis.osm_places_free_1.shp', 'gis.osm_pofw_a_free_1.shp', 'gis.osm_pofw_free_1.shp', 'gis.osm_pois_a_free_1.shp', 'gis.osm_pois_free_1.shp', 'gis.osm_roads_free_1.shp', 'gis.osm_traffic_a_free_1.shp', 'gis.osm_traffic_free_1.shp']


In [82]:
import numpy as np
import pandas as pd
def collect_tuples(coord_list):
    """
    Collect all coordinate tuples in list by flattening list.
    """
    full_coord_list = []
    if(type(coord_list[0]) is not list):
        return coord_list
    else:
        for l in coord_list:
            full_coord_list += collect_tuples(l)
    return full_coord_list

def get_geotag_info(f_name):
    """
    Extract name, location, and feature class of 
    OSM items in file.
    
    Parameters:
    -----------
    f_name : str
    
    Returns:
    --------
    geotag_info : pandas.DataFrame
    """
    shape_collection = fiona.open(f_name)
    geotag_info = []
    for i, item in enumerate(shape_collection):
        geometry = item['geometry']
        geometry_type = geometry['type']
        coords = geometry['coordinates']
        if(geometry_type == 'Point'):
            # get point estimate
            lat, lon = coords
        elif(geometry_type == 'LineString' or geometry_type == 'Polygon'):
            # compute average of coord list
            lat_lon_tuples = collect_tuples(coords)
#             print(lat_lon_tuples)
            lat_vals, lon_vals = zip(*lat_lon_tuples)
            lat = np.mean(lat_vals)
            lon = np.mean(lon_vals)
        properties = item['properties']
        name = properties['name']
        osm_id = properties['osm_id']
        feature_class = properties['fclass']
        geo_item = [name, geometry_type, feature_class, osm_id, lat, lon]
        geotag_info.append(geo_item)
    geotag_cols = ['name', 'geometry_type', 'feature_class', 'osm_id', 'lat', 'lon']
    geotag_info = pd.DataFrame(geotag_info, columns=geotag_cols)
    return geotag_info

In [84]:
geotag_info = []
for s_file in shape_files:
    print('processing shape file %s'%(s_file))
    s_file_full = os.path.join(shape_file_dir, s_file)
    geotag_info_s = get_geotag_info(s_file_full)
    geotag_info.append(geotag_info_s)
geotag_info_df = pd.concat(geotag_info, axis=0)

processing shape file gis.osm_buildings_a_free_1.shp
processing shape file gis.osm_landuse_a_free_1.shp
processing shape file gis.osm_natural_a_free_1.shp
processing shape file gis.osm_natural_free_1.shp
processing shape file gis.osm_places_a_free_1.shp
processing shape file gis.osm_places_free_1.shp
processing shape file gis.osm_pofw_a_free_1.shp
processing shape file gis.osm_pofw_free_1.shp
processing shape file gis.osm_pois_a_free_1.shp
processing shape file gis.osm_pois_free_1.shp
processing shape file gis.osm_roads_free_1.shp
processing shape file gis.osm_traffic_a_free_1.shp
processing shape file gis.osm_traffic_free_1.shp


In [86]:
print(geotag_info_df.head())
print(geotag_info_df.shape)

                                               name geometry_type  \
0                                  Multipisos UPRRP       Polygon   
1  Escuela de Arquitecura y Escuela de Bellas Artes       Polygon   
2                           Eugenio María de Hostos       Polygon   
3                                        Teatro UPR       Polygon   
4            Complejo Deportivo Cosme Beitía Sálamo       Polygon   

  feature_class    osm_id        lat        lon  
0      building  20885732 -66.044319  18.402406  
1      building  20887004 -66.046165  18.402295  
2      building  20887047 -66.047953  18.402395  
3      building  20887143 -66.048651  18.402701  
4      building  20887609 -66.046724  18.407594  
(1626213, 6)


In [87]:
# save to file
geotag_info_file = '../../data/geo_files/PR_OSM/shp_files/combined_shp_data.tsv'
geotag_info_df.to_csv(geotag_info_file, sep='\t', index=False, encoding='utf-8')

We should test the precision/recall of the OSM lexicon to determine how much improvement we might get.

In [90]:
OSM_lexicon = sorted(geotag_info_df.loc[:, 'name'].unique().tolist())
print('%d phrases in lexicon'%(len(OSM_lexicon)))
print('\n'.join(OSM_lexicon[100:200]))

26308 phrases in lexicon
A4
AAA
AAA - Planta de Filtración Miradero
AAA Cofee Break Services
ABC Pharmacy
ABC Veterinary
ABC Veterinary Clinic
AEE - NEOS
AEE- Juan Ruiz Velez Bldg.
AFDA
AIB
ALTURAS DE PIEDRAS BLANCAS
AM Cafe
AMA Busway
AOR Building Supplies
AR Cellular services
AT&T
AT&T Office
ATC
ATT Mobility PR inc.
Aarti
Abandonado
Abandoned School
Abbvie Biologics
Abitore
Abitto
Abracadabra
Abracadabra Counter Café
Abraham Lincoln Statue
Acacia Blanca
Academia Adventista del Oeste
Academia Barbara Ann Roessler
Academia Carlota Alfaro
Academia Cooperativa De Integración Social (ACIS)
Academia Cristiana Logos
Academia Edusapien
Academia Maria Reina
Academia Perpetuo Socorro
Academia Perpetuo Socorro - High School Building
Academia Rev. Isidro Díaz López
Academia Sagrado Corazón
Academia Sally Olsen
Academia San Agustin
Academia San Antonio
Academia San José
Academia Santa Teresita
Academia Santo Tomas de Aquino
Academia de Baile
Academia de Musica
Academia de Música
Academia de la P

In [93]:
test_word = 'Urb'
test_phrases = list(filter(lambda x: test_word in x, OSM_lexicon))
print('%d test phrases'%(len(test_phrases)))
print('\n'.join(test_phrases))

274 test phrases
Avenida David Urbina
Bahía Urbana
Bosque Urbano Doña Inés Mendoza
Bosque Urbano de San Patricio
Bosque Urbano del Nuevo Milenio
Calle Cecilio Urbina
Calle Paseo Largo Urb. Paseo Alta Vista
Calle Urb Monserrate
Calle Urbano Ramírez
Cancha Urb. Jardines
Centro Urbano del Condado
Escuela Elemental Urbana
Escuela Elemental Urbana (escuela nueva)
Escuela Elemental Urbana K-6
Escuela Elemental Urbana Nueva
Escuela Intermedia Urbana
Escuela Joséfa Vélez Bauza (Superior Urbana)
Escuela Nueva Elemental Urbana De Guaynabo
Escuela Nueva Urbana De Ciales
Escuela Superior Urbana
Escuela Superior Urbana Nueva
Lavander Urban Vintage Boutique
Nomada Urban Beach Hostel
Parque Urb. Sol y Mar
Parque Urbano Dora Colón Clavell
Sabor Urbano
Tren Urbano
Urb Alturas De San Jose
Urb Colinas 3T
Urb Colinas De Villa Rosa
Urb Colinas Verdes
Urb Country Club
Urb Cuevas
Urb El Arrendado
Urb Estancias Villa Alba
Urb Extension Jardinesvilla Alba
Urb Extension San Jose
Urb Extension San Jose II
Urb Ex

This looks good! It includes a lot of housing complexes that were unavailable in the Wiki lexicon.

We should still filter out the short stuff.

In [116]:
min_char_len = 4
OSM_lexicon = filter(lambda x: len(x) >= min_char_len, OSM_lexicon)

## Test on annotations

Let's apply it to the whole annotated dataset and see what happens.

In [95]:
import codecs
import data_helpers
reload(data_helpers)
from data_helpers import extract_annotations
annotated_file_name = '../../data/facebook-maria/1773209126315380_post_sample_annotated_wiki.txt'
N = 249
all_annotations = extract_annotations(annotated_file_name)
gold_annotations = all_annotations[:N]
print(gold_annotations[:10])

[[(u'Guayama', u'https://en.wikipedia.org/wiki/Guayama,_Puerto_Rico')], [(u'olimpo', u'NIL'), (u'calle 6', u'NIL'), (u'calle 8', u'NIL')], [(u'Guayama', u'https://en.wikipedia.org/wiki/Guayama,_Puerto_Rico'), (u'residencial Villamar', u'NIL')], [(u'Guayama', u'https://en.wikipedia.org/wiki/Guayama,_Puerto_Rico'), (u'Carite', u'NIL')], [(u'calle 54', u'NIL')], [(u'Guayma', u'https://en.wikipedia.org/wiki/Guayama,_Puerto_Rico'), (u'villa Rosa', u'NIL'), (u'calle 2', u'NIL')], [(u'Guayama URB', u'https://en.wikipedia.org/wiki/Guayama,_Puerto_Rico'), (u'Hacienda AS', u'NIL'), (u'19 Calle', u'NIL')], [], [], [(u'barrio mosquito', u'https://es.wikipedia.org/wiki/Mosquito_(Vieques)')]]


Convert raw text to DTM for easy computation.

In [100]:
raw_file_name = '../../data/facebook-maria/1773209126315380_post_sample_txt.txt'
raw_txt = [l.strip() for l in codecs.open(raw_file_name, 'r', encoding='utf-8')][:N]

In [105]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize.casual import TweetTokenizer
min_df = 1
tokenizer = TweetTokenizer()
ngram_range = (1,5)
cv = CountVectorizer(min_df=min_df, tokenizer=tokenizer.tokenize, ngram_range=ngram_range, lowercase=True)
dtm = cv.fit_transform(raw_txt)
ivoc = {v : k for k,v in cv.vocabulary_.iteritems()}
print(dtm.shape)

(249, 45120)


In [112]:
from itertools import izip
from collections import defaultdict
nonzero_row_cols = dtm.tocoo().nonzero()
ngrams_per_line = defaultdict(list)
for row, col in izip(nonzero_row_cols[0], nonzero_row_cols[1]):
    col_entity = ivoc[col]
    ngrams_per_line[row].append(col_entity)
# convert to list of lists
ngrams_per_line = [ngrams_per_line[n] for n in range(N)]

In [117]:
OSM_lexicon_lower = set(map(lambda x: x.lower(), OSM_lexicon))
print('%d lowercased OSM lexicon'%(len(OSM_lexicon_lower)))

26072 lowercased OSM lexicon


In [120]:
test_word = 'olimpo'
print(filter(lambda x: test_word in x, OSM_lexicon_lower))

[u'olimpo court hotel', u'calle olimpo', u'calle monte olimpo', u'olimpo court hotel parking']


In [118]:
# look for overlap
OSM_lexicon_per_line = [list(OSM_lexicon_lower & set(l)) for l in ngrams_per_line]
print(OSM_lexicon_per_line[:10])

[[u'guayama'], [u'calle', u'calle 8', u'calle 6', u'arroyo'], [u'villamar', u'guayama'], [u'claro', u'guayama'], [u'calle', u'la hacienda', u'calle 54', u'marta'], [u'calle 2', u'calle', u'villa rosa', u'acosta'], [u'calle', u'calle 46', u'guayama'], [u'mar\xeda'], [], []]


What is the precision/recall of these extracted locations?

In [128]:
gold_annotations_txt = map(lambda x: map(lambda y: y[0], x), gold_annotations)
gold_annotations_txt_lower = map(lambda x: map(lambda y: y.lower(), x), gold_annotations_txt)
print(gold_annotations_txt_lower[:10])

[[u'guayama'], [u'olimpo', u'calle 6', u'calle 8'], [u'guayama', u'residencial villamar'], [u'guayama', u'carite'], [u'calle 54'], [u'guayma', u'villa rosa', u'calle 2'], [u'guayama urb', u'hacienda as', u'19 calle'], [], [], [u'barrio mosquito']]


In [129]:
print(len(gold_annotations_txt))
print(len(OSM_lexicon_per_line))

249
249


In [130]:
import data_helpers
reload(data_helpers)
from data_helpers import test_precision_recall
false_tags, missed_tags, precision, recall = test_precision_recall(OSM_lexicon_per_line, gold_annotations_txt_lower)
print('OSM lexicon gets precision=%.3f, recall=%.3f'%(precision, recall))

OSM lexicon gets precision=0.347, recall=0.452


Performance is between the earlier NER and lexicon results that we found (lexicon had high recall and low precision, NER had low recall and higher precision).

In [132]:
print(false_tags[:100])
print(missed_tags[:100])

[[], [u'calle', u'arroyo'], [u'villamar'], [u'claro'], [u'calle', u'la hacienda', u'marta'], [u'calle', u'acosta'], [u'calle', u'calle 46', u'guayama'], [u'mar\xeda'], [], [], [], [], [u'la principal', u'mar\xeda', u'principal'], [], [u'margarita'], [u'mercado', u'pueblo', u'pl\xe1tano'], [], [u'guayama'], [], [], [], [u'pueblo'], [], [u'calle'], [], [], [u'tel\xe9fonos'], [], [], [u'jardines'], [u'guayama'], [], [], [], [u'guayama'], [], [u'jardines'], [], [u'palmas'], [u'san juan', u'marginal', u'calle 615', u'los dominicos', u'flamboy\xe1n', u'rexville', u'calle aracibo', u'calle morales', u'ponce', u'pharmacy', u'calle gautier ben\xedtez', u'puerto rico', u'jardines', u'calle navarra', u'carolina', u'buildings', u'calle 2', u'encantada', u'san jose', u'calle igualdad', u'pueblo', u'vidal', u'plaza victoria', u'calle casia', u'farmacia', u'burger king', u'calle', u'hostos', u'calle obispado', u'calle villa', u'aguadilla'], [], [], [u'mercado'], [u'school'], [], [], [u'calle', u'entr

At first glance, it looks like the lexicon overgenerated the obvious locations (`Guayama`) and undergenerated some highly obscure locations (`calle san millan`) as well as coarse-grained entities like neighborhoods (`santa ana`) and countries (`pr`).

## Utility of OSM data?
Ideally we would use the OSM data as targets for the entities, to compensate for the lack of fine-grained entities in Wikipedia. We might need to test on a larger dataset that has more labels, because 250 is not a big number and we need properly agreed-upon labels from annotators.