# Extract Geonames data
In which we extract the data from the Geonames dump and convert it to the same .tsv format as the OSM entity data.

In [1]:
import pandas as pd

## Load data

In [25]:
# Column names for the GeoNames "geoname" dump table, in file order.
# The dump stores latitude BEFORE longitude, so 'lat' must precede 'lon' here;
# listing them the other way round silently labels latitudes as 'lon' and
# feeds swapped coordinates to the later ['lon', 'lat'] municipality lookup.
geonames_col_names = ['geoname_id', 'name', 'ascii_name', 'alternate_names', 'lat', 'lon', 'feature_class',
                      'feature_code', 'country_code', 'cc2', 
                      'admin1_code', 'admin2_code', 'admin3_code', 'admin4_code',
                      'population', 'elevation', 'dem', 'timezone', 'mod_date']
# The dump has no header row; read it raw and attach the names afterwards so a
# column-count mismatch raises instead of being silently absorbed.
geonames_data = pd.read_csv('../../data/geo_files/PR_geonames_gazetteer/PR.txt', sep='\t', index_col=False, header=None)
geonames_data.columns = geonames_col_names
print(geonames_data.head())

   geoname_id             name       ascii_name  alternate_names       lat  \
0     4049880         Teresita         Teresita              NaN  18.40967   
1     4049900      Rio Cristal      Rio Cristal              NaN  18.35328   
2     4049922  Quebrada Rincón  Quebrada Rincon  Quebrada Rincon  18.28551   
3     4050022        Lago Gely        Lago Gely              NaN  18.04719   
4     4050076         Glenview         Glenview              NaN  18.04608   

        lon feature_class feature_code country_code  cc2  admin1_code  \
0 -66.17933             P          PPL           PR  NaN         21.0   
1 -66.00322             P          PPL           PR  NaN        139.0   
2 -65.69461             T          VAL           PR  NaN         37.0   
3 -66.57184             H           LK           PR  NaN        113.0   
4 -66.59378             P          PPL           PR  NaN        113.0   

   admin2_code  admin3_code  admin4_code  population  elevation  dem  \
0    7268466.0      

In [3]:
# Readable label for each single-letter feature class code.
feature_class_mapper = {
    'A': 'country/state/region',
    'H': 'stream/lake',
    'L': 'parks/area',
    'P': 'city/village',
    'R': 'road/railroad',
    'S': 'spot/building/farm',
    'T': 'mountain/hill/rock',
    'U': 'undersea',
    'V': 'forest/heath',
}
# Number of entries per feature class, most frequent first.
geonames_feature_class_counts = (
    geonames_data['feature_class']
    .value_counts()
    .sort_values(ascending=False)
)
print(geonames_feature_class_counts)

P    2504
S    2280
T    1441
A    1136
H     562
L     259
R      37
V      16
U       1
Name: feature_class, dtype: int64


Mostly city-level toponyms, which is good for us because that's what OSM was missing!

In [26]:
# Feature-code descriptions, downloaded from
# http://download.geonames.org/export/dump/featureCodes_en.txt
feature_code_df = pd.read_csv(
    '../../data/geo_files/PR_geonames_gazetteer/featureCodes_en.txt',
    sep='\t',
    index_col=False,
    header=None,
)
# Keep only the rows whose first column contains a '.'.
feature_code_df = feature_code_df[feature_code_df.iloc[:, 0].map(lambda code: '.' in str(code))]
# First column: keep the segment after the first '.'.
feature_code_df.iloc[:, 0] = feature_code_df.iloc[:, 0].map(lambda code: str(code).split('.')[1])
# Second column: replace spaces with underscores.
feature_code_df.iloc[:, 1] = feature_code_df.iloc[:, 1].map(lambda text: text.replace(' ', '_'))
# Lookup from the stripped code to its underscore-joined description.
feature_code_mapper = dict(zip(
    feature_code_df.iloc[:, 0].values,
    feature_code_df.iloc[:, 1].values,
))

In [27]:
# Add readable versions of the feature class / feature code columns.
# dict.get is used as the mapping function, so unknown codes become None.
geonames_data.loc[:, 'feature_class_full'] = geonames_data['feature_class'].map(feature_class_mapper.get)
geonames_data.loc[:, 'feature_code_full'] = geonames_data['feature_code'].map(feature_code_mapper.get)
print(geonames_data.head())

   geoname_id             name       ascii_name  alternate_names       lat  \
0     4049880         Teresita         Teresita              NaN  18.40967   
1     4049900      Rio Cristal      Rio Cristal              NaN  18.35328   
2     4049922  Quebrada Rincón  Quebrada Rincon  Quebrada Rincon  18.28551   
3     4050022        Lago Gely        Lago Gely              NaN  18.04719   
4     4050076         Glenview         Glenview              NaN  18.04608   

        lon feature_class feature_code country_code  cc2        ...         \
0 -66.17933             P          PPL           PR  NaN        ...          
1 -66.00322             P          PPL           PR  NaN        ...          
2 -65.69461             T          VAL           PR  NaN        ...          
3 -66.57184             H           LK           PR  NaN        ...          
4 -66.59378             P          PPL           PR  NaN        ...          

   admin2_code  admin3_code  admin4_code  population  elevatio

## Extract municipalities

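Assign each Geonames entry to a Puerto Rico municipality by testing its coordinates against the municipality boundary shapes from the US county shapefile.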

In [30]:
import fiona
from data_helpers import containment_test
from shapely.geometry import shape
# Value matched against each county record's STATEFP property to keep only
# Puerto Rico records.
PR_state_FP = "72"
# Open the shapefile in a `with` block so the handle is closed as soon as all
# records have been read into memory (the original left it open).
with fiona.open('../../data/geo_files/county_shape_files/cb_2016_us_county_500k.shp') as county_shapes:
    county_shape_list = list(county_shapes)
# A list comprehension (rather than filter()) yields a real, reusable list on
# both Python 2 and Python 3.
PR_county_shape_list = [c for c in county_shape_list if c['properties']['STATEFP'] == PR_state_FP]
# (name, raw geometry mapping) per municipality ...
municipality_geoms = [(c['properties']['NAME'], c['geometry']) for c in PR_county_shape_list]
# ... converted to (name, shapely geometry) pairs for the containment test.
municipality_shapes = [(n, shape(g)) for n, g in municipality_geoms]

In [36]:
# Show the Python type of the first value in the 'lat' column.
print(type(geonames_data['lat'].values[0]))

<type 'numpy.float64'>


In [38]:
# Preview the coordinate columns; apply() runs column-wise here (default axis),
# converting each column's values to a plain list.
geonames_data[['lat', 'lon']].apply(lambda column: column.values.tolist()).head()

Unnamed: 0,lat,lon
0,18.40967,-66.17933
1,18.35328,-66.00322
2,18.28551,-65.69461
3,18.04719,-66.57184
4,18.04608,-66.59378


In [39]:
def _lookup_municipality(coord_row):
    """Pass one row's [lon, lat] values and the municipality shapes to containment_test."""
    return containment_test(coord_row.values.tolist(), municipality_shapes)


# Row-wise lookup over the (lon, lat) columns; the result is stored as a new column.
municipality_list = geonames_data[['lon', 'lat']].apply(_lookup_municipality, axis=1)
geonames_data.loc[:, 'municipality'] = municipality_list

In [40]:
# Distinct values assigned to the 'municipality' column.
print(geonames_data['municipality'].unique())

[u'Bayam\xf3n' u'Trujillo Alto' u'Ceiba' u'Ponce' u'Cabo Rojo' u'Lo\xedza'
 u'Arecibo' u'Jayuya' u'Mayag\xfcez' u'Guayanilla' u'San Juan'
 u'Can\xf3vanas' u'Guaynabo' u'Moca' u'Las Mar\xedas' u'Adjuntas'
 u'Aguadilla' u'Santa Isabel' u'Culebra' u'Dorado' u'Humacao' u'Salinas'
 u'Aguada' None u'Aguas Buenas' u'Aibonito' u'Fajardo' u'Juana D\xedaz'
 u'Luquillo' u'Caguas' u'Camuy' u'Guayama' u'Naguabo' u'Yauco' u'Toa Baja'
 u'Manat\xed' u'Maricao' u'Utuado' u'Ciales' u'Corozal' u'Florida' u'Lajas'
 u'Pe\xf1uelas' u'Sabana Grande' u'San Lorenzo' u'Quebradillas'
 u'A\xf1asco' u'Las Piedras' u'Patillas' u'R\xedo Grande' u'Gu\xe1nica'
 u'Vieques' u'Cata\xf1o' u'Arroyo' u'Cayey' u'Carolina' u'San Germ\xe1n'
 u'Coamo' u'Isabela' u'Morovis' u'Barceloneta' u'Barranquitas' u'Hatillo'
 u'San Sebasti\xe1n' u'Cidra' u'Juncos' u'Vega Alta' u'Rinc\xf3n'
 u'Orocovis' u'Lares' u'Vega Baja' u'Toa Alta' u'Gurabo' u'Comer\xedo'
 u'Yabucoa' u'Naranjito' u'Villalba' u'Hormigueros' u'Maunabo']


## Write to file

In [43]:
# Show every column currently on the frame before choosing the output columns.
print(geonames_data.columns)

Index([u'geoname_id', u'name', u'ascii_name', u'alternate_names', u'lat',
       u'lon', u'feature_class', u'feature_code', u'country_code', u'cc2',
       u'admin1_code', u'admin2_code', u'admin3_code', u'admin4_code',
       u'population', u'elevation', u'dem', u'timezone', u'mod_date',
       u'feature_class_full', u'feature_code_full', u'municipality'],
      dtype='object')


In [48]:
# Columns written to the output .tsv.
output_col_names = ['name', 'geometry_type', 'feature_class', 'geoname_id', 'lat', 'lon', 'shp_type', 'municipality', 'osm_id', 'feature_code']
# Output columns with no counterpart in the Geonames frame; filled with empty strings.
null_col_names = ['geometry_type', 'shp_type', 'osm_id']
# Work on a new frame: drop the raw class/code columns, give the readable
# versions their names, and add the empty placeholder columns.
geonames_data_df = (
    geonames_data
    .drop(['feature_class', 'feature_code'], axis=1)
    .rename(columns={'feature_class_full': 'feature_class',
                     'feature_code_full': 'feature_code'})
    .assign(**{col_name: '' for col_name in null_col_names})
)
# Keep only the output columns, in alphabetical order, and write them out.
geonames_data_df = geonames_data_df.loc[:, sorted(output_col_names)]
out_file_name = '../../data/geo_files/PR_geonames_gazetteer/geonames_data.tsv'
geonames_data_df.to_csv(out_file_name, sep='\t', index=False, encoding='utf-8')