In [1]:
import pandas as pd
import numpy as np

#pd.set_option('display.max_colwidth', -1)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

import requests
import geocoder
import pickle
import time

### Load ambulance data and prepare for geooding

In [7]:
# load cleaned df
ambulance_df = pd.read_csv('.././Data/20170616_ambulancecalls_2013-2017_cleaned.csv', index_col='date_time')

Unnamed: 0_level_0,date,time,address,descr,urgency
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-05-15 18:50:25,2017-05-15 00:00:00,1900-01-01 18:50:25,"Amstelveen, Frans Halslaan 10-18, 1181TL",Ambulance besteld vervoer B1: 13178 Rit 56159...,Ambulance besteld vervoer B1
2017-05-15 18:49:48,2017-05-15 00:00:00,1900-01-01 18:49:48,"Amsterdam, Van Hogendorpplein 20-28, 1051AX",Ambulance met hoge spoed: 13102 Rit 56158 Ams...,Ambulance met hoge spoed
2017-05-15 18:40:16,2017-05-15 00:00:00,1900-01-01 18:40:16,"Amsterdam, De Boelelaan , 1081HV",Ambulance besteld vervoer B1: 13405 Rit 56157...,Ambulance besteld vervoer B1
2017-05-15 18:37:55,2017-05-15 00:00:00,1900-01-01 18:37:55,"Amsterdam, Marnixstraat 1-9, 1017PJ",Ambulance met hoge spoed: 13116 Rit 56156 Ams...,Ambulance met hoge spoed
2017-05-15 18:31:50,2017-05-15 00:00:00,1900-01-01 18:31:50,"Amsterdam, Walmolen 100-108, 1035BP",Ambulance met hoge spoed: 13159 Rit 56155 Ams...,Ambulance met hoge spoed


In [8]:
# split into df per urgency category
urgency_types = ambulance_df['urgency'].unique()
dict_of_urgencydfs = {urgency: data for urgency, data in ambulance_df.groupby('urgency')}

In [10]:
# Urgent calls first
A1_df = dict_of_urgencydfs[' Ambulance met hoge spoed']

Unnamed: 0_level_0,date,time,address,descr,urgency
date_time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2017-05-15 18:49:48,2017-05-15 00:00:00,1900-01-01 18:49:48,"Amsterdam, Van Hogendorpplein 20-28, 1051AX",Ambulance met hoge spoed: 13102 Rit 56158 Ams...,Ambulance met hoge spoed
2017-05-15 18:37:55,2017-05-15 00:00:00,1900-01-01 18:37:55,"Amsterdam, Marnixstraat 1-9, 1017PJ",Ambulance met hoge spoed: 13116 Rit 56156 Ams...,Ambulance met hoge spoed
2017-05-15 18:31:50,2017-05-15 00:00:00,1900-01-01 18:31:50,"Amsterdam, Walmolen 100-108, 1035BP",Ambulance met hoge spoed: 13159 Rit 56155 Ams...,Ambulance met hoge spoed
2017-05-15 18:27:52,2017-05-15 00:00:00,1900-01-01 18:27:52,"Diemen, Muiderstraatweg 20-28, 1111PS",Ambulance met hoge spoed: 13110 Rit 56154 Diemen,Ambulance met hoge spoed
2017-05-15 18:21:49,2017-05-15 00:00:00,1900-01-01 18:21:49,"Amsterdam, Ruijterkade",Ambulance met hoge spoed: 13104 Rit 56150 De ...,Ambulance met hoge spoed


In [11]:
# split A1 data into sets of 1000
bins_begin = range(0,len(A1_df),1000)
bins_end = range(1000,len(A1_df)+1000,1000)

dict_of_A1dfs = {}
bin_number = 0
for begin, end in zip(bins_begin, bins_end):
    df_bin = A1_df.iloc[begin:end, :]
    dict_of_A1dfs[bin_number] = df_bin
    bin_number += 1
    
# save the dict
pickle.dump(dict_of_A1dfs, open('.././Data/A1dfs.p','wb'))

### Geocoding

In [4]:
def geocode_df(df_dict, start_df, number_of_dfs, provider_name, api_key=None):
    """Geocoding per df in a dict of dfs.
    Returns .csv files per geocoded df &
    Returns a pickle with last number of last df that was geocoded."""
    range_of_dfs = range(start_df, start_df+number_of_dfs, 1)
    method = getattr(geocoder, provider_name)
    for i in range_of_dfs:
        df = df_dict[i]
        lat = []
        lng = []
        with requests.Session() as session:
            for address in df['address']:
                try:
                    geocode = method(address, session=session, key=api_key).json
                except:
                    print("Can't geocode address: {}".format(address))
                    lat.append(np.nan)
                    lng.append(np.nan)
                else:
                    try:
                        lat.append(geocode['lat'])
                        lng.append(geocode['lng'])
                    except KeyError:
                        print("No lat or lng key: {}".format(address))
                        lat.append(np.nan)
                        lng.append(np.nan)

        df['lat'] = lat
        df['lng'] = lng
        print("Df {} is geocoded".format(i))
        print("Number of addresses not geocoded: {}".format(df['lat'].isnull().sum()))
        
        # save df
        df_name = './GeoResults/GeocodedDF_' + str(i) + '.csv'
        df.to_csv(df_name)
        
        # save which dfs are done
        wherearewe = i
        pickle.dump(wherearewe, open('wherearewe.p','wb'))
        

In [12]:
len(dict_of_A1dfs)

282

In [6]:
# geocoding using bing, 125.000 a year
start = time.time()
geocode_df(A1dfs_dict, 1, 282, 'bing', api_key='AhLOoucAst5-HHqGVjKjH26-igr7eZf3t7tIqB8hVjmJfcAoFNKRcMY1SuujDpZn')
end = time.time()
print((end-start)/60)

Df 200 is geocoded
Number of addresses not geocoded: 0
5.394613381226858
