In [1]:
import json
import ogr
import pandas as pd

def getAttributes(inds):
    # read the attributes of a vector (OGR) dataset and
    # returns it as a Pandas DataFrame, including the
    # geometry, which is formatted for Bokeh (xs, ys)
    
    def getFielddef(lyr):
        # function to get the field definitions
        # of the attributes
        
        lyrdef = lyr.GetLayerDefn()
        
        fielddef = {}
        
        for i in range(lyrdef.GetFieldCount()):
            fdef = lyrdef.GetFieldDefn(i)
            fielddef[fdef.GetName()] = (i, fdef.GetTypeName())
            
        return fielddef
    
    # open dataset and layer
    ds = ogr.Open(inds)
    lyr = ds.GetLayer(0)
    
    lyr.ResetReading()
    ft = lyr.GetNextFeature()

    features = []
    
    fielddef = getFielddef(lyr)

    # loop over all features
    while ft:
        
        ft_atts = {}

        # read all atributes for the feature
        for att_name, (i, att_type) in fielddef.items():
            if att_type == 'String':
                ft_atts[att_name] = ft.GetFieldAsString(att_name)
            elif att_type == 'Integer':
                ft_atts[att_name] = ft.GetFieldAsInteger(att_name)
            elif att_type == 'Real':
                ft_atts[att_name] = ft.GetFieldAsDouble(att_name)
            elif att_type == 'Date':
                ft_atts[att_name] = pd.datetools.parse(ft.GetField(i))
            elif att_type == 'DateTime':
                ft_atts[att_name] = pd.datetools.parse(ft.GetFieldAsDateTime(att_name)).to_datetime()

        # get the geometry
        geom = ft.GetGeometryRef()

        geom_json = json.loads(geom.ExportToJson())

        geom_type = geom.GetGeometryType()

        if geom_type == ogr.wkbMultiPolygon:
            # append NaN after each part of a multipolygon
            ys = [list(zip(*sum(poly, [])))[1] for poly in geom_json['coordinates']]
            ys = sum([list(part) + [float('NaN')] for part in ys], [])

            xs = [list(zip(*sum(poly, [])))[0] for poly in geom_json['coordinates']]
            xs = sum([list(part) + [float('NaN')] for part in xs], [])

        elif geom_type == ogr.wkbPolygon:
            denest = list(zip(*sum(geom_json['coordinates'], [])))

            ys = denest[1]
            xs = denest[0]

        ft_atts['xs'] = xs
        ft_atts['ys'] = ys

        sr = pd.Series(ft_atts)
        sr.name = ft.GetFID()
        features.append(sr)

        ft = lyr.GetNextFeature()

    attributes = pd.concat(features, axis=1).T
    attributes.index.name = 'FID'
    
    ds = None

    return attributes

In [3]:
admin_1 = getAttributes('../data/ne_10m_admin_1_states_provinces/ne_10m_admin_1_states_provinces.shp')
admin_1.head()

Unnamed: 0_level_0,OBJECTID_1,abbrev,adm0_a3,adm0_label,adm0_sr,adm1_cod_1,adm1_code,admin,area_sqkm,check_me,...,sov_a3,sub_code,type,type_en,wikipedia,woe_id,woe_label,woe_name,xs,ys
FID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,3604,,ABW,6,3,ABW-5150,ABW-5150,Aruba,0,0,...,NL1,,,,,23424736,,Aruba,"(-69.99693762899992, -69.93639075399994, -69.9...","(12.577582098000036, 12.53172435100005, 12.519..."
1,2003,,AFG,2,1,AFG-1741,AFG-1741,Afghanistan,0,20,...,AFG,,Velayat,Province,,2344550,"Badghis, AF, Afghanistan",Badghis,"(64.3062353857544, 64.32468387226288, 64.33584...","(35.39721955024203, 35.40176707648942, 35.4027..."
2,2004,,AFG,2,1,AFG-1742,AFG-1742,Afghanistan,0,20,...,AFG,,Velayat,Province,,2344558,"Herat, AF, Afghanistan",Hirat,"(61.36393355300004, 61.365483846000075, 61.367...","(35.59824167900007, 35.59850006100007, 35.5979..."
3,2005,,AFG,2,1,AFG-1743,AFG-1743,Afghanistan,0,20,...,AFG,,Velayat,Province,,2344552,"Bamian, AF, Afghanistan",Bamyan,"(67.74390669074114, 67.75475874242204, 67.8090...","(35.443418281328576, 35.44411591241942, 35.459..."
4,2006,,AFG,2,1,AFG-1744,AFG-1744,Afghanistan,0,20,...,AFG,,Velayat,Province,,2344575,"Balkh, AF, Afghanistan",Balkh,"(67.25912927200011, 67.2814534910001, 67.31959...","(37.18514740000006, 37.18866139800011, 37.2084..."


In [4]:
admin_1.to_hdf('../data/ne_10m_admin_1_states_provinces_converted.hdf', 'df')

your performance may suffer as PyTables will pickle object types that it cannot
map directly to c-types [inferred_type->mixed-integer,key->block0_values] [items->['OBJECTID_1', 'abbrev', 'adm0_a3', 'adm0_label', 'adm0_sr', 'adm1_cod_1', 'adm1_code', 'admin', 'area_sqkm', 'check_me', 'code_hasc', 'code_local', 'datarank', 'diss_me', 'featurecla', 'fips', 'fips_alt', 'gadm_level', 'geonunit', 'gn_a1_code', 'gn_id', 'gn_level', 'gn_name', 'gn_region', 'gns_adm1', 'gns_id', 'gns_lang', 'gns_level', 'gns_name', 'gns_region', 'gu_a3', 'hasc_maybe', 'iso_3166_2', 'iso_a2', 'labelrank', 'latitude', 'longitude', 'mapcolor13', 'mapcolor9', 'name', 'name_alt', 'name_len', 'name_local', 'note', 'postal', 'provnum_ne', 'region', 'region_cod', 'region_sub', 'sameascity', 'scalerank', 'sov_a3', 'sub_code', 'type', 'type_en', 'wikipedia', 'woe_id', 'woe_label', 'woe_name', 'xs', 'ys']]

