In [1]:
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize
from geopy.geocoders import Nominatim
import geopy.distance
import geopandas
import json
import time
import random

In [3]:
data = pd.read_csv('BikeSharingData.csv')

In [4]:
data.head()

Unnamed: 0,Duration,Start date,End date,Start station number,Start station,End station number,End station,Bike number,Member type
0,206,2019-02-01 00:00:20,2019-02-01 00:03:47,31509,New Jersey Ave & R St NW,31636,New Jersey Ave & N St NW/Dunbar HS,W21713,Member
1,297,2019-02-01 00:04:40,2019-02-01 00:09:38,31203,14th & Rhode Island Ave NW,31519,1st & O St NW,E00013,Member
2,165,2019-02-01 00:06:34,2019-02-01 00:09:20,31303,Tenleytown / Wisconsin Ave & Albemarle St NW,31308,39th & Veazey St NW,W21703,Member
3,176,2019-02-01 00:06:49,2019-02-01 00:09:45,31400,Georgia & New Hampshire Ave NW,31401,14th St & Spring Rd NW,W21699,Member
4,105,2019-02-01 00:10:41,2019-02-01 00:12:27,31270,8th & D St NW,31256,10th & E St NW,W21710,Member


In [5]:
# make a list of all unique stations
stations = list(set(data['Start station'].values.tolist() + data['End station'].values.tolist()))

In [6]:
# use wrapper for the open street map "Nominatim" api to get locaitons of bike stations
geolocator = Nominatim(user_agent="data science 1 project")

In [7]:
# test to see what will be returned
loc = geolocator.geocode(stations[4], 
                         addressdetails=True, 
                         extratags=True, 
                         namedetails=True)

print('station being searched:', stations[4])
print(json.dumps(loc.raw, indent=2))

station being searched: Rhode Island Ave & V St NE
{
  "place_id": 71674408,
  "licence": "Data \u00a9 OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright",
  "osm_type": "node",
  "osm_id": 6285398246,
  "boundingbox": [
    "38.9181128",
    "38.9182128",
    "-77.0047983",
    "-77.0046983"
  ],
  "lat": "38.9181628",
  "lon": "-77.0047483",
  "display_name": "Rhode Island Ave and V St NE, Rhode Island Avenue Northeast, Shaw, Washington, District of Columbia, 20260, United States of America",
  "class": "amenity",
  "type": "bicycle_rental",
  "importance": 0.6010000000000001,
  "address": {
    "amenity": "Rhode Island Ave and V St NE",
    "road": "Rhode Island Avenue Northeast",
    "neighbourhood": "Shaw",
    "city": "Washington",
    "county": "Washington",
    "state": "District of Columbia",
    "postcode": "20260",
    "country": "United States of America",
    "country_code": "us"
  },
  "extratags": {
    "OWNER": "DC",
    "network": "CapitalBikeShare",
    "

In [8]:
def location_data(row):
    '''
    Function to find OSM info on BikeSharing location
    input: row of DataFrame
    output: Series of location information
    '''
    
    # some delay to not overwhelm the server
    time.sleep(round(random.uniform(1, 5), 1))
    
    # error "handling" in case a location can't be found
    try:
        loc = geolocator.geocode('{} washington d.c.'.format(row['name']), 
                         addressdetails=True, 
                         extratags=True, 
                         namedetails=True)
        grade = 1
        
        if loc == None:
            loc = geolocator.geocode('{} maryland'.format(row['name']), 
                         addressdetails=True, 
                         extratags=True, 
                         namedetails=True)
            grade = 1
            
        if loc == None:
            loc = geolocator.geocode('{} virginia'.format(row['name']), 
                         addressdetails=True, 
                         extratags=True, 
                         namedetails=True)
            grade = 1
            
        if loc == None:
            loc = geolocator.geocode('{}'.format(row['name']), 
                         addressdetails=True, 
                         extratags=True, 
                         namedetails=True)
            grade = 2
        
        # "flattens" json and converts to DataFrame
        temp = json_normalize(loc.raw)
        # add original name back in
        temp['original name'] = row['name']
        # for later check on validity
        # 1: likely in the Washington D.C. area
        # 2: found, but could be elsewhere
        # 3: not found (only in Exception case)
        temp['Nominatim grade'] = grade
        
        # convert to Series so each info gets a column in result
        return temp.iloc[0]
    
    except:
        # print to see where something went wrong
        print(row['name'], 'was not found')
        
        return pd.DataFrame([[row['name'], 3]], columns=['original name', 'Nominatim grade']).iloc[0]

In [9]:
# construct Dataframe from list for better handling
stations_df = pd.DataFrame(stations)
stations_df.columns = ['name']
stations_df.head()

Unnamed: 0,name
0,Takoma Metro
1,12th St & Pennsylvania Ave SE
2,Ward Circle / American University
3,Wilson Blvd & N Troy St
4,Rhode Island Ave & V St NE


In [333]:
## locations = stations_df[:].apply(location_data, axis=1)

Potomac & Swann Ave was not found
Lee Hwy & N Kirkwood Rd was not found
Union Market/6th St & Neal Pl NE was not found
31st & Woodrow St S was not found
Radford & Osage St was not found
Smithsonian-National Mall / Jefferson Dr & 12th St SW was not found
Iwo Jima Memorial/N Meade & 14th St N was not found
24th & R St NE / National Arboretum was not found
Carlin Springs Rd & N Thomas St was not found
Saint Asaph & Madison St was not found
Crystal Dr & 15th St S was not found
S Joyce & Army Navy Dr was not found
Arlington Blvd & S George Mason Dr/NFATC was not found
Eisenhower Ave & Mill Race Ln was not found
Pentagon City Metro / 12th & S Hayes St was not found
Potomac Greens Dr & Slaters Ln was not found
Columbus Ave & Tribeca St was not found
Wilson Blvd & N Uhle St was not found
14th & D St NW / Ronald Reagan Building was not found
Lee Hwy & N Cleveland St was not found
US Dept of State / Virginia Ave & 21st St NW was not found
Reston Town Center Transit Station was not found
Wilson B

In [37]:
locations[['original name', 'Nominatim grade', 'lat', 'lon']].head()

NameError: name 'locations' is not defined

In [38]:
## locations.to_csv('test2.csv', index=False)

In [59]:
locations = pd.read_csv('test2.csv')

In [60]:
locations.columns

Index(['Nominatim grade', 'address.amenity', 'address.borough',
       'address.building', 'address.city', 'address.commercial',
       'address.country', 'address.country_code', 'address.county',
       'address.hamlet',
       ...
       'namedetails.name:uk', 'namedetails.name:zh',
       'namedetails.official_name', 'namedetails.old_name', 'namedetails.ref',
       'original name', 'osm_id', 'osm_type', 'place_id', 'type'],
      dtype='object', length=125)

In [61]:
# select parts of data that can most conveniently be used 
# to verify the matches and select those that have to be worked on
# and convert to fitting data types
cols = ['original name', 'lat', 'lon', 'osm_id', 'type', 'Nominatim grade']
locations_small = locations[cols].astype({'lat': 'float64', 'lon': 'float64'})

def conv_id(x):
    try:
        return int(x)
    except:
        return 0

locations_small['osm_id'] = locations_small['osm_id'].apply(conv_id)

In [62]:
locations_small.tail()

Unnamed: 0,original name,lat,lon,osm_id,type,Nominatim grade
522,13th St & New York Ave NW,38.900283,-77.029822,2276095138,bicycle_rental,1
523,S Troy St & 26th St S,38.847985,-77.075106,6285398267,bicycle_rental,1
524,Wiehle Ave & Reston Station Blvd,38.949208,-77.336802,6285398585,bicycle_rental,1
525,17th St & Independence Ave SW,,,0,,3
526,15th & Euclid St NW,38.923382,-77.035112,2581188789,bicycle_rental,1


The center of Washington D.C. is roughly located at (38.89511, -77.03637). 

Assumption: A match should be within roughly ±1deg on both dimensions. Those outside this margin should be reevaluated.

In [63]:
locations_small[(locations_small['lat'] < 38) | 
                (locations_small['lat'] > 40) | 
                (locations_small['lon'] > -76.5)| 
                (locations_small['lon'] < -77.5)]

Unnamed: 0,original name,lat,lon,osm_id,type,Nominatim grade
11,20th St & Virginia Ave NW,33.653502,-85.898091,6260263,house,2
81,Spring Hill Metro,14.283014,120.901307,92214594,residential,1
83,Campus Commons,38.416405,-75.562547,380627906,retail,1
127,Lee Hwy & N Monroe St,38.916778,-94.383106,18538583,residential,2
168,Washington Blvd & N Frederick St,39.613748,-77.710818,712330800,primary,2
206,Jefferson Dr & 14th St SW,33.494582,-86.857528,7779385,residential,2
211,Commonwealth & E Monroe Ave,40.598333,-79.567139,12099109,residential,2
266,Washington Blvd & Walter Reed Dr,36.082582,-79.82332,16595394,unclassified,2


In [47]:
# how many locations have not been found at all
len(locations_small[locations_small['lat'].isna()])

48

## This means there are $8+48=56$ locations left. That is $\frac{8 + 48 }{526} = 10.6\%$. So that is pretty good, but some more work might help things.

### Idea: Get all Bike Sharing locations in the area from overpass.

Go to url http://overpass-turbo.eu/

Zoom the map to the area (greater Washington D.C.) which will be used as the bounding box ("bbox") and enter the query
```xml
<query type="node">
  <has-kv k="amenity" v="bicycle_rental"/>
  <bbox-query {{bbox}}/>
</query>
<print/>
```
Finally download the .geojson information on all "bicycle rental" stations in the viewbox.

In [4]:
overpass_data = geopandas.read_file('export.geojson')

In [5]:
overpass_data.head()

Unnamed: 0,id,@id,amenity,name,network,wheelchair,capacity,operator,dcgis:dataset,dcgis:gis_id,...,fixme,layer,OWNER,payment:credit_cards,addr:city,addr:postcode,addr:state,addr:street,alt_name,geometry
0,node/479610905,node/479610905,bicycle_rental,Big Wheel Bikes,,,,,,,...,,,,,,,,,,POINT (-77.09846 38.97869)
1,node/1254079635,node/1254079635,bicycle_rental,Capital Bikeshare,Capital Bikeshare,no,,,,,...,,,,,,,,,,POINT (-77.04178 38.90537)
2,node/1323661174,node/1323661174,bicycle_rental,,,,15.0,Capital Bikeshare,,,...,,,,,,,,,,POINT (-77.35982 38.95704)
3,node/1827176287,node/1827176287,bicycle_rental,Convention Center / 7th and M St NW,Capital Bikeshare,,19.0,,CapitalBikeSharePt,14751586.0,...,,,,,,,,,,POINT (-77.02226 38.90572)
4,node/1837934148,node/1837934148,bicycle_rental,9th and Upshur St NW,CapitalBikeShare,,15.0,,CapitalBikeSharePt,14751642.0,...,,,,,,,,,,POINT (-77.02510 38.94176)


This data is not in the same format, so we will have to convert it to work with it.

In [7]:
overpass_data['geometry'][0:1]

0    POINT (-77.09846 38.97869)
Name: geometry, dtype: geometry

In [8]:
# convert the relevant matching information to matchable format
overpass_names = overpass_data[['id', 'name', 'geometry']][overpass_data['network'].astype(str).str.contains('apital')]

def conv_overpass_id(x):
    try:
        return int(x[5:])
    except:
        return 0

def conv_coords(x):
    try:
        x = str(x['geometry'])
        coords = x[x.find("(")+1:x.find(")")]
        coords = coords.split(' ')
        return pd.DataFrame([[coords[0], coords[1]]], columns=['lon', 'lat']).iloc[0]
    except:
        return pd.DataFrame([[0, 0]], columns=['lat', 'lon']).iloc[0]
    
overpass_names['id'] = overpass_names['id'].apply(conv_overpass_id)
overpass_names[['lat', 'lon']] = overpass_names.apply(conv_coords, axis=1)

In [9]:
overpass_names.tail(10)

Unnamed: 0,id,name,geometry,lat,lon
554,6285398636,Fenton St and Ellsworth Dr,POINT (-77.02561 38.99704),-77.0256103,38.9970408
555,6285398637,Georgia Ave and Spring St,POINT (-77.03156 38.99940),-77.0315573,38.9993958
556,6285398638,Piccard and W Gude Dr,POINT (-77.17709 39.10222),-77.1770934,39.1022198
557,6285398639,Fleet St and Ritchie Pkwy,POINT (-77.14138 39.07634),-77.1413803,39.0763388
558,6285398640,Taft St and E Gude Dr,POINT (-77.13296 39.09411),-77.1329563,39.0941108
559,6285398641,Broschart and Blackwell Rd,POINT (-77.20032 39.10211),-77.2003244,39.1021068
560,6285398642,Norfolk and Rugby Ave,POINT (-77.10024 38.99065),-77.1002413,38.9906468
561,6285398643,Battery Ln and Trolley Trail,POINT (-77.10011 38.99238),-77.1001063,38.9923828
563,6506526001,,POINT (-77.23075 38.87860),-77.2307493,38.8785952
564,6506526002,,POINT (-77.22830 38.88278),-77.2282975,38.8827836


In [436]:
# make two dataframes with same columns, join them and
# remove all entries that existed in both
l = locations[['original name', 'lat', 'lon', 'osm_id']]
l['nominatim'] = True
l.columns = ['name', 'lat', 'lon', 'id', 'nominatim']

m = overpass_names[['name', 'lat', 'lon', 'id']]
m['nominatim'] = 'False'

n = pd.concat([l, m])
n.drop_duplicates(['id'], keep=False, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [495]:
n[n['nominatim'] == 'False']

Unnamed: 0,name,lat,lon,id,nominatim
1,Capital Bikeshare,-77.0417844,38.9053748,1.254080e+09,False
18,Eastern Market Metro/Pennsylvania Ave & 7th St...,-76.995397,38.8839997,2.275496e+09,False
21,Minnesota Ave Metro/DOES,-76.9476765,38.8970883,2.276073e+09,False
25,14th and D St SE,-76.98609999999999,38.8839997,2.276073e+09,False
34,15th and Hayes St,-77.0603,38.8603997,2.276095e+09,False
36,12th and Army Navy Dr,-77.0528,38.8628997,2.276095e+09,False
43,20th and Crystal Dr,-77.0492,38.8563997,2.276095e+09,False
44,20th and Bell St,-77.05119999999999,38.8560997,2.276095e+09,False
45,15th and Crystal Dr,-77.0496649,38.8601822,2.276095e+09,False
55,Capital BikeShare,-77.046567,38.8924587,2.276095e+09,False


Well this did not turn out the way I hoped. This doesn't really make matching the rest to data much easier ...

## Idea: just google them

Take the location name, search for it, look at 'landmarks' and search for them in Nominatim. In almost every case there was a 'bike'-logo at the location, some had an openstreetmap id, many didn't. Coordinates were taken from most accurate location.

```python
'Potomac & Swann Ave' = (38.82958,-77.04785)
'Lee Hwy & N Kirkwood Rd' = (38.89530,-77.09746)
'Union Market/6th St & Neal Pl NE' = (38.90777,-76.99703)
'31st & Woodrow St S' = (38.83763,-77.09477)
'Radford & Osage St' = (38.494233, -77.050979)
'Smithsonian-National Mall / Jefferson Dr & 12th St SW' = (38.88865,-77.02862)
'Iwo Jima Memorial/N Meade & 14th St N' = (38.89001,-77.07138)
'24th & R St NE / National Arboretum' = (38.91258,-76.97197)
'Carlin Springs Rd & N Thomas St' = (38.87666,-77.11308)
'Saint Asaph & Madison St' = (38.81268,-77.04396)
'Crystal Dr & 15th St S' = (38.86013,-77.04970)
'S Joyce & Army Navy Dr' = (38.86573,-77.06310)
'Arlington Blvd & S George Mason Dr/NFATC' = (38.86941,-77.10452)
'Eisenhower Ave & Mill Race Ln' = (38.80107,-77.06894)
'Pentagon City Metro / 12th & S Hayes St' = (38.86284,-77.06000)
'Potomac Greens Dr & Slaters Ln' = (38.82168,-77.04759)
'Columbus Ave & Tribeca St' = (39.12596,-77.16728)
'Wilson Blvd & N Uhle St' = (38.89049,-77.08480)
'14th & D St NW / Ronald Reagan Building' = (38.894466445916,-77.031685709953)
'Lee Hwy & N Cleveland St' = (38.8949456,-77.0917386)
'US Dept of State / Virginia Ave & 21st St NW' = (38.8949187,-77.046587)
'Reston Town Center Transit Station' = (38.9571419273019,-77.35981608691958)
'Wilson Blvd & N Quincy St' = (38.8801507,-77.107673)
'Pleasant St & MLK Ave SE' = (38.8638975,-76.9899338)
'Ripley & Bonifant St' = (38.992610503515,-77.029255628586)
'Washington-Lee High School / N Stafford St & Generals Way' = (38.8883351,-77.1109799)
'Westover Library / Washington Blvd & N McKinley Rd' = (38.88589485,-77.14143896053996)
'Crystal Dr & 27th St S' = (38.8484407,-77.051516)
'Crystal Dr & Potomac Ave' = (38.8479677,-77.0513808240339)
'Ballenger Ave & Dulaney St' = (38.80222465,-77.06368074294338)
'New Dominion Pkwy & Fountain Dr' = (38.960511420411144,-77.3563682704622)
'26th & S Clark St' = (38.850573614633774,-77.05149894240314)
'Columbus Ave & Gramercy Blvd' = (39.1233557,-77.1647983)
'28th St S & S Meade St' = (38.8462901,-77.069354)
'Reston Pkwy & Spectrum Dr' = (38.964906763642,-77.35380608654)
'36th & Calvert St NW / Glover Park' = (38.9225768,-77.0703627)
'Walter Reed Community Center / Walter Reed Dr & 16th St S' = (38.857689,-77.0866412)
'Lee Hwy & N Adams St' = (38.8959297,-77.089006)
'East Falls Church Metro/Sycamore St & 19th St N' = (38.88542,-77.15649)
'Vermont Ave & I St NW' = (38.90110,-77.03448)
'S Kenmore & 24th St S' = (38.848505,-77.0849189)
'King St Metro North / Cameron St' = (38.80695585380585,-77.05984255418232)
'Shady Grove Metro East' = (39.1211875,-77.1648847)
'Rockville Pike & Old Georgetown Rd' = (39.04981,-77.11366)
'USDA / 12th & C St SW' = (38.8871972,-77.0282895)
'Kennebec St & 11th St N' = (38.88101193420522,-77.13530392912494)
'GMU / Fairfax Dr & Kenmore St' = (38.88459,-77.10115)
'17th St & Independence Ave SW' = (38.88808,-77.03861)

'20th St & Virginia Ave NW' = (38.89465,-77.04510)
'Spring Hill Metro' = (38.92945,-77.24077)
'Campus Commons' = (38.94468,-77.33545)
'Lee Hwy & N Monroe St' = (38.89636,-77.10464)
'Washington Blvd & N Frederick St' = (38.88479,-77.12761)
'Jefferson Dr & 14th St SW' = (38.88855,-77.03261)
'Commonwealth & E Monroe Ave' = (38.82019,-77.06303)
'Washington Blvd & Walter Reed Dr' = (38.87330,-77.08211)
```

5 hours later ...

In [64]:
# make this into a list so we can get it into the DataFrame
test = [['Potomac & Swann Ave',38.82958,-77.04785],
['Lee Hwy & N Kirkwood Rd',38.89530,-77.09746],
['Union Market/6th St & Neal Pl NE',38.90777,-76.99703],
['31st & Woodrow St S',38.83763,-77.09477],
['Radford & Osage St',38.494233, -77.050979],
['Smithsonian-National Mall / Jefferson Dr & 12th St SW',38.88865,-77.02862],
['Iwo Jima Memorial/N Meade & 14th St N',38.89001,-77.07138],
['24th & R St NE / National Arboretum',38.91258,-76.97197],
['Carlin Springs Rd & N Thomas St',38.87666,-77.11308],
['Saint Asaph & Madison St',38.81268,-77.04396],
['Crystal Dr & 15th St S',38.86013,-77.04970],
['S Joyce & Army Navy Dr',38.86573,-77.06310],
['Arlington Blvd & S George Mason Dr/NFATC',38.86941,-77.10452],
['Eisenhower Ave & Mill Race Ln',38.80107,-77.06894],
['Pentagon City Metro / 12th & S Hayes St',38.86284,-77.06000],
['Potomac Greens Dr & Slaters Ln',38.82168,-77.04759],
['Columbus Ave & Tribeca St',39.12596,-77.16728],
['Wilson Blvd & N Uhle St',38.89049,-77.08480],
['14th & D St NW / Ronald Reagan Building',38.894466445916,-77.031685709953],
['Lee Hwy & N Cleveland St',38.8949456,-77.0917386],
['US Dept of State / Virginia Ave & 21st St NW',38.8949187,-77.046587],
['Reston Town Center Transit Station',38.9571419273019,-77.35981608691958],
['Wilson Blvd & N Quincy St',38.8801507,-77.107673],
['Pleasant St & MLK Ave SE',38.8638975,-76.9899338],
['Ripley & Bonifant St',38.992610503515,-77.029255628586],
['Washington-Lee High School / N Stafford St & Generals Way',38.8883351,-77.1109799],
['Westover Library / Washington Blvd & N McKinley Rd',38.88589485,-77.14143896053996],
['Crystal Dr & 27th St S',38.8484407,-77.051516],
['Crystal Dr & Potomac Ave',38.8479677,-77.0513808240339],
['Ballenger Ave & Dulaney St',38.80222465,-77.06368074294338],
['New Dominion Pkwy & Fountain Dr',38.960511420411144,-77.3563682704622],
['26th & S Clark St',38.850573614633774,-77.05149894240314],
['Columbus Ave & Gramercy Blvd',39.1233557,-77.1647983],
['28th St S & S Meade St',38.8462901,-77.069354],
['Reston Pkwy & Spectrum Dr',38.964906763642,-77.35380608654],
['36th & Calvert St NW / Glover Park',38.9225768,-77.0703627],
['Walter Reed Community Center / Walter Reed Dr & 16th St S',38.857689,-77.0866412],
['Lee Hwy & N Adams St',38.8959297,-77.089006],
['East Falls Church Metro/Sycamore St & 19th St N',38.88542,-77.15649],
['Vermont Ave & I St NW',38.90110,-77.03448],
['S Kenmore & 24th St S',38.848505,-77.0849189],
['King St Metro North / Cameron St',38.80695585380585,-77.05984255418232],
['Shady Grove Metro East',39.1211875,-77.1648847],
['Rockville Pike & Old Georgetown Rd',39.04981,-77.11366],
['USDA / 12th & C St SW',38.8871972,-77.0282895],
['Kennebec St & 11th St N',38.88101193420522,-77.13530392912494],
['GMU / Fairfax Dr & Kenmore St',38.88459,-77.10115],
['17th St & Independence Ave SW',38.88808,-77.03861],
['20th St & Virginia Ave NW',38.89465,-77.04510],
['Spring Hill Metro',38.92945,-77.24077],
['Campus Commons',38.94468,-77.33545],
['Lee Hwy & N Monroe St',38.89636,-77.10464],
['Washington Blvd & N Frederick St',38.88479,-77.12761],
['Jefferson Dr & 14th St SW',38.88855,-77.03261],
['Commonwealth & E Monroe Ave',38.82019,-77.06303],
['Washington Blvd & Walter Reed Dr',38.87330,-77.08211]]

In [65]:
searched_locations = pd.DataFrame(data=np.array(test), columns=['original name', 'lat', 'lon'])

In [68]:
def replace_coords(row):
    '''
    Function that takes coordinate data in DataFrame, if it is 
    NaN the looked up coordinate will be used, else it is kept
    '''
    if pd.isna(row['lat_old']):
        a = searched_locations[searched_locations['original name'] == row['original name']]
        return a[['lat', 'lon']].iloc[0]
    else:
        return pd.DataFrame([[row['lat_old'], row['lon_old']]], columns=['lat', 'lon']).iloc[0]

In [69]:
# replace 'lat' and 'lon' column names so new ones can be added in their place
locations_small.columns = ['original name', 'lat_old', 'lon_old', 'osm_id', 'type', 'Nominatim grade']
locations_small[['lat', 'lon']] = locations_small.apply(replace_coords, axis=1)

In [71]:
locations_small.head()

Unnamed: 0,original name,lat_old,lon_old,osm_id,type,Nominatim grade,lat,lon
0,12th & U St NW,38.916926,-77.009634,6058113,house,1,38.9169,-77.0096
1,Randle Circle & Minnesota Ave SE,38.873902,-76.969789,203022902,secondary,2,38.8739,-76.9698
2,15th & W St NW,38.919039,-77.034473,6285398473,bicycle_rental,1,38.919,-77.0345
3,Potomac & Swann Ave,,,0,,3,38.82958,-77.04785
4,Stanton Square SE,38.855012,-76.984514,6285398409,bicycle_rental,1,38.855,-76.9845


In [72]:
# the above method only replaced NaN locations, but there were 8 more
# which were found, just nowhere near the target
off_locations = [['20th St & Virginia Ave NW',38.89465,-77.04510],
['Spring Hill Metro',38.92945,-77.24077],
['Campus Commons',38.94468,-77.33545],
['Lee Hwy & N Monroe St',38.89636,-77.10464],
['Washington Blvd & N Frederick St',38.88479,-77.12761],
['Jefferson Dr & 14th St SW',38.88855,-77.03261],
['Commonwealth & E Monroe Ave',38.82019,-77.06303],
['Washington Blvd & Walter Reed Dr',38.87330,-77.08211]]

indxs = [11, 81, 83, 127, 168, 206, 211, 266]

for i, idx in enumerate(indxs):
    locations_small.loc[idx, 'lat'] = off_locations[i][1]
    locations_small.loc[idx, 'lon'] = off_locations[i][2]

In [73]:
# check if there are still locations left that are off
locations_small = locations_small.astype({'lat': 'float64', 'lon': 'float64'})

locations_small[(locations_small['lat'] < 38) | 
                (locations_small['lat'] > 40) | 
                (locations_small['lon'] > -76.5)| 
                (locations_small['lon'] < -77.5)]

Unnamed: 0,original name,lat_old,lon_old,osm_id,type,Nominatim grade,lat,lon


So the locations that were too far away from Washington D.C. have been replaced as well.

## This might be it for locating all bike sharing stations

## Idea: Add more information to the stations that might be useful later

There is data on Washington D.C. available at https://opendata.dc.gov/ which has been collected in geojson files at https://github.com/benbalter/dc-maps. The bike sharing data is not very recent and the dates of collection from opendata can not always be determined, however the data selected should be more or less stable over time (metro locations or schools do rarely move).

In [74]:
bus_stations = geopandas.read_file('commuter-bus-locations.geojson')
metro_stations = geopandas.read_file('metro-stations-regional.geojson')
public_schools = geopandas.read_file('public-schools.geojson')
unis_colleges = geopandas.read_file('universities-and-colleges.geojson')

In [76]:
# look at what the data looks like
bus_stations.head(3)

Unnamed: 0,OBJECTID,STATUS,SCORE,SIDE,STAN_ADDR,ARC_STREET,LOCATION,NUMTRIPS,SHELTER,SERVICEPROVIDER,TIME_,BUSSTOP,BUSROUTE,DIRECTION,FREQUENCY,geometry
0,1,M,85,,| | 4TH | ST | NW | | | MADISON | AVE | NW,4TH STREET NW & MADISON AVENUE NW,4th Street & Madison Avenue,13,No,PRTCAM,Morning,National Gallery of Art,,,,POINT (-77.01803 38.95874)
1,2,M,85,,| | 4TH | ST | | | | INDEPENDENCE | AVE | SW |,4th Street & Independence Avenue SW,4th Street & Independence Avenue SW,13,No,"MTA, PRTC AM/PM",Morning,,,,,POINT (-77.01755 38.88758)
2,3,T,69,,| | INDEPENDENCE | AVE | | | | 6TH | ST | |,Independence Avenue & 6th Street,Independence Avenue & 6th Street,13,Yes,PRTCAM,Morning,National Air & Space Museum,,,,POINT (-77.01991 38.88757)


In [77]:
def conv_coords(x):
    '''
    Convert "geometry" data from "POINT (lon lat)" to seperate lat, lon columns
    '''
    try:
        x = str(x['geometry'])
        coords = x[x.find("(")+1:x.find(")")]
        coords = coords.split(' ')
        return pd.DataFrame([[coords[0], coords[1]]], columns=['lon', 'lat']).iloc[0]
    except:
        return pd.DataFrame([[0, 0]], columns=['lat', 'lon']).iloc[0]

In [78]:
bus_stations_list = np.array(bus_stations[['geometry']].apply(conv_coords, axis=1)).astype('float64')

In [79]:
bus_stations_list[:3] # quick look if it worked

array([[-77.01803466,  38.958744  ],
       [-77.01754919,  38.88757906],
       [-77.01990962,  38.88757482]])

In [81]:
# look at what the data looks like
metro_stations.head(3)

Unnamed: 0,OBJECTID,GIS_ID,NAME,WEB_URL,ADDRESS,LINE,geometry
0,1,mstn_1,College Park-U of Md,http://www.wmata.com/rail/station_detail.cfm?s...,"4931 CALVERT ROAD, COLLEGE PARK, MD","green, yellow",POINT (-76.92813 38.97864)
1,2,mstn_2,Capitol Heights,http://www.wmata.com/rail/station_detail.cfm?s...,"133 CENTRAL AVENUE, CAPITOL HEIGHTS, MD","blue, orange, silver",POINT (-76.91181 38.88947)
2,3,mstn_3,Morgan Boulevard,http://www.wmata.com/rail/station_detail.cfm?s...,"300 GARRETT MORGAN BLVD., LANDOVER, MD","blue, orange, silver",POINT (-76.86808 38.89384)


In [80]:
metro_stations_list = np.array(metro_stations[['geometry']].apply(conv_coords, axis=1)).astype('float64')
metro_stations_list[:3] # quick look if it worked

array([[-76.92812725,  38.97864144],
       [-76.91181037,  38.88947365],
       [-76.86807701,  38.89384272]])

In [82]:
# look at what the data looks like
public_schools.head(3)

Unnamed: 0,OBJECTID,NAME,ADDRESS,FACUSE,LEVEL_,STATUS,PHONE,UNGRADED,TOTAL_STUD,SSL,...,SQUARE_FOOTAGE,POPULATION_PLAN,POPULATION_ENROLLED_2008,X,Y,ADDRID,ZIP_CODE,LONGITUDE,LATITUDE,geometry
0,1,Oyster-Adams Bilingual School (Adams),2020 19TH STREET NW,Middle School,MS,Active,202-673-7311,0.0,321.0,,...,,,,396151.48,138827.7,294534,20009,-77.044377,38.91731,POINT (-77.04438 38.91732)
1,2,"Phelps Architecture, Construction, and Enginee...",704 26TH STREET NE,High School/Specialized,HS,Active,202-729-4360,,329.0,,...,205776.0,600.0,,402437.64,137126.77,294495,20002,-76.971898,38.901992,POINT (-76.97190 38.90200)
2,3,Washington Metropolitan High School,300 BRYANT STREET NW,Youth Engagement,YE,Active,202-671-1788,,253.0,,...,,,,398648.513,139173.29,294475,20001,-77.015581,38.92043,POINT (-77.01559 38.92044)


In [83]:
public_schools_list = np.array(public_schools[['geometry']].apply(conv_coords, axis=1)).astype('float64')
public_schools_list[:3] # quick look if it worked

array([[-77.04437955,  38.91731699],
       [-76.97189991,  38.90199948],
       [-77.01558696,  38.92043757]])

In [84]:
# look at what the data looks like
unis_colleges.head(3)

Unnamed: 0,OBJECTID,NAME,ADDRESS,WEB_URL,GIS_ID,XCOORD,YCOORD,ADDR_ID,geometry
0,1,Howard University,2400 6TH STREET NW,http://www.howard.edu/,Univ_007,398129.26,139473.11,284379,POINT (-77.02072 38.92129)
1,2,American University,4400 MASSACHUSETTS AVENUE NW,http://www.american.edu/,Univ_001,392334.76,141247.38,223994,POINT (-77.08830 38.93834)
2,3,George Washington University,2121 I STREET NW,http://www.gwu.edu/,Univ_005,395865.51,137012.82,242496,POINT (-77.04802 38.89976)


In [85]:
unis_colleges_list = np.array(unis_colleges[['geometry']].apply(conv_coords, axis=1)).astype('float64')
unis_colleges_list[:3] # quick look if it worked

array([[-77.02071582,  38.9212851 ],
       [-77.08830309,  38.93833545],
       [-77.04801964,  38.89975954]])

In [None]:
import geopy.distance

print(locations_small.iloc[0]['lat'])

coords_1 = (locations_small.iloc[0]['lat'], locations_small.iloc[0]['lon'])
coords_2 = (locations_small.iloc[1]['lat'], locations_small.iloc[1]['lon'])

print(geopy.distance.distance(coords_1, coords_2).km)

In [87]:
def calc_distance(row, comp):
    '''
    Calculates the distance between bike sharing location and the other
    provided kind of location and returns the distance to the closest
    (should be pretty accurate, but at these distances the shape of 
    the earth should not matter too much)
    '''
    dist = []
    
    for station in comp:
        dist.append(geopy.distance.distance((row.lat, row.lon), (station[1], station[0])).km)
    return min(dist)
    
locations_small['dist_bus'] = locations_small[:].apply(calc_distance, args=(bus_stations_list,), axis=1)
locations_small['dist_metro'] = locations_small[:].apply(calc_distance, args=(metro_stations_list,), axis=1)
locations_small['dist_schools'] = locations_small[:].apply(calc_distance, args=(public_schools_list,), axis=1)
locations_small['dist_uni'] = locations_small[:].apply(calc_distance, args=(unis_colleges_list,), axis=1)

In [88]:
locations_small.head(3)

Unnamed: 0,original name,lat_old,lon_old,osm_id,type,Nominatim grade,lat,lon,dist_bus,dist_metro,dist_schools,dist_uni
0,12th & U St NW,38.916926,-77.009634,6058113,house,1,38.916926,-77.009634,1.331605,1.131517,0.378479,1.076064
1,Randle Circle & Minnesota Ave SE,38.873902,-76.969789,203022902,secondary,2,38.873902,-76.969789,2.100803,1.557394,0.469413,3.126127
2,15th & W St NW,38.919039,-77.034473,6285398473,bicycle_rental,1,38.919039,-77.034473,0.822723,0.645445,0.550312,0.956607


In [89]:
locations_small.columns

Index(['original name', 'lat_old', 'lon_old', 'osm_id', 'type',
       'Nominatim grade', 'lat', 'lon', 'dist_bus', 'dist_metro',
       'dist_schools', 'dist_uni'],
      dtype='object')

In [90]:
# save the most relevant information to file
locations_small[['original name', 'osm_id', 'type', 'lat', 'lon', 'dist_bus', 'dist_metro',
       'dist_schools', 'dist_uni']].to_csv('locations_small.csv', index=False)

In [91]:
a = pd.read_csv('locations_small.csv')

In [92]:
a.head()

Unnamed: 0,original name,osm_id,type,lat,lon,dist_bus,dist_metro,dist_schools,dist_uni
0,12th & U St NW,6058113,house,38.916926,-77.009634,1.331605,1.131517,0.378479,1.076064
1,Randle Circle & Minnesota Ave SE,203022902,secondary,38.873902,-76.969789,2.100803,1.557394,0.469413,3.126127
2,15th & W St NW,6285398473,bicycle_rental,38.919039,-77.034473,0.822723,0.645445,0.550312,0.956607
3,Potomac & Swann Ave,0,,38.82958,-77.04785,2.760321,1.785772,3.039215,5.024466
4,Stanton Square SE,6285398409,bicycle_rental,38.855012,-76.984514,1.760505,1.094119,0.228504,2.879923


## So this should be a good starting point for an analysis.