In [1]:
#source for python googlemaps api code is from here: 
# https://www.shanelynn.ie/batch-geocoding-in-python-with-google-geocoding-api/

In [2]:
# Inputs 

# file_name is the pathway to the file of interest
file_name = r'web_scrape_v3.csv'

# Specify the column name in your input data that contains addresses here
address_column_name = "address"
# Specify the column name in your input data that contains ID here
id_column_name = "ID"

old_cols = [id_column_name, address_column_name]

# Set your Google API key here. 
# Even if using the free 2500 queries a day, its worth getting an API key since the rate limit is 50 / second.
# With API_KEY = None, you will run into a 2 second delay every 10 requests or so.
# With a "Google Maps Geocoding API" key from https://console.developers.google.com/apis/, 
# the daily limit will be 2500, but at a much faster rate.
# Example: API_KEY = 'AIzaSyC9azed9tLdjpZNjg2_kVePWvMIBq154eA
# Google map API key
API_KEY = 'AIzaSyA2ItSloJjstNAt-Jjn-0n7QMUkFmF1vCE'

In [3]:
import pandas as pd
import requests
import logging
import time

data=pd.read_csv(file_name)

In [4]:
logger = logging.getLogger("root")
logger.setLevel(logging.DEBUG)
# create console handler
ch = logging.StreamHandler()
ch.setLevel(logging.DEBUG)
logger.addHandler(ch)

In [5]:
#------------------ CONFIGURATION -------------------------------
# Backoff time sets how many minutes to wait between google pings when your API limit is hit
BACKOFF_TIME = 30

# Return Full Google Results? If True, full JSON results from Google are included in output
RETURN_FULL_RESULTS = False

In [6]:

#------------------ DATA LOADING --------------------------------

if address_column_name not in data.columns:
	raise ValueError("Missing Address column in input data")

# Form a list of addresses for geocoding:
# Make a big list of all of the addresses to be processed.
addresses = data[address_column_name].tolist()

In [7]:
#------------------	FUNCTION DEFINITIONS ------------------------

def get_google_results(address, api_key=None, return_full_response=False):
    """
    Get geocode results from Google Maps Geocoding API.
    
    Note, that in the case of multiple google geocode reuslts, this function returns details of the FIRST result.
    
    @param address: String address as accurate as possible. For Example "18 Grafton Street, Dublin, Ireland"
    @param api_key: String API key if present from google. 
                    If supplied, requests will use your allowance from the Google API. If not, you
                    will be limited to the free usage of 2500 requests per day.
    @param return_full_response: Boolean to indicate if you'd like to return the full response from google. This
                    is useful if you'd like additional location details for storage or parsing later.
    """
    # Set up your Geocoding url
    geocode_url = "https://maps.googleapis.com/maps/api/geocode/json?address={}".format(address)
    if api_key is not None:
        geocode_url = geocode_url + "&key={}".format(api_key)
        
    # Ping google for the reuslts:
    results = requests.get(geocode_url)
    # Results will be in JSON format - convert to dict using requests functionality
    results = results.json()
    
    # if there's no results or an error, return empty results.
    if len(results['results']) == 0:
        output = {
            "formatted_address": None,
            "latitude": None,
            "longitude": None,
            "accuracy": None,
            "google_place_id": None,
            "type": None,
            "postcode": None
        }
    else:    
        answer = results['results'][0]
        output = {
            "formatted_address": answer.get('formatted_address'),
            "latitude": answer.get('geometry').get('location').get('lat'),
            "longitude": answer.get('geometry').get('location').get('lng'),
            "accuracy": answer.get('geometry').get('location_type'),
            "google_place_id": answer.get("place_id"),
            "type": ",".join(answer.get('types')),
            "postcode": ",".join([x['long_name'] for x in answer.get('address_components') 
                                  if 'postal_code' in x.get('types')])
        }
        
    # Append some other details:
    output['input_string'] = address
    output['number_of_results'] = len(results['results'])
    output['status'] = results.get('status')
    if return_full_response is True:
        output['response'] = results
    
    return output

In [8]:
# Create a list to hold results
results = []
# Go through each address in turn
for address in addresses:
    # While the address geocoding is not finished:
    geocoded = False
    while geocoded is not True:
        # Geocode the address with google
        try:
            geocode_result = get_google_results(address, API_KEY, return_full_response=RETURN_FULL_RESULTS)
        except Exception as e:
            logger.exception(e)
            logger.error("Major error with {}".format(address))
            logger.error("Skipping!")
            geocoded = True
            
        # If we're over the API limit, backoff for a while and try again later.
        if geocode_result['status'] == 'OVER_QUERY_LIMIT':
            logger.info("Hit Query Limit! Backing off for a bit.")
            time.sleep(BACKOFF_TIME * 60) # sleep for 30 minutes
            geocoded = False
        else:
            # If we're ok with API use, save the results
            # Note that the results might be empty / non-ok - log this
            if geocode_result['status'] != 'OK':
                logger.warning("Error geocoding {}: {}".format(address, geocode_result['status']))
            logger.debug("Geocoded: {}: {}".format(address, geocode_result['status']))
            results.append(geocode_result)           
            geocoded = True

    # Print status every 20 addresses
    if len(results) % 20 == 0:
    	logger.info("Completed {} of {} address".format(len(results), len(addresses)))

# All done
logger.info("Finished geocoding all addresses")

Geocoded: Phnom Penh: OK
Error geocoding #132D St 135, Phsar Doem Thkov, Chamcar Morn, Phnom Penh: INVALID_REQUEST
Geocoded: #132D St 135, Phsar Doem Thkov, Chamcar Morn, Phnom Penh: INVALID_REQUEST
Error geocoding #8, St. 468, Toul Tompung II, Phnom Penh: INVALID_REQUEST
Geocoded: #8, St. 468, Toul Tompung II, Phnom Penh: INVALID_REQUEST
Error geocoding #37B Street 113, Boeung Keng Kang II, Chamcar Morn, Phnom Penh: INVALID_REQUEST
Geocoded: #37B Street 113, Boeung Keng Kang II, Chamcar Morn, Phnom Penh: INVALID_REQUEST
Error geocoding #28, St. 80, S/k Sras Chork, Phnom Penh: INVALID_REQUEST
Geocoded: #28, St. 80, S/k Sras Chork, Phnom Penh: INVALID_REQUEST
Error geocoding #370 Road 2, Takdol village, Takdol commune, Takhmao district, Kankal Province: INVALID_REQUEST
Geocoded: #370 Road 2, Takdol village, Takdol commune, Takhmao district, Kankal Province: INVALID_REQUEST
Error geocoding #1BEo, St. 86, Sras Chork, Phnom Penh: INVALID_REQUEST
Geocoded: #1BEo, St. 86, Sras Chork, Phnom P

In [9]:
new_data = pd.DataFrame(results)
keep_cols = ['formatted_address', 'input_string','latitude', 'longitude', 'status']
new_data = new_data[keep_cols]

data = data[old_cols]

rename_cols = {"input_string":address_column_name, "formatted_address":"Google valid address"}
new_data.rename(columns = rename_cols, inplace = True)

output_data = pd.merge(data, new_data, on = [address_column_name], how = "outer")

In [11]:
output_data

Unnamed: 0,ID,address,Google valid address,latitude,longitude,status,Similarity
0,10744,Phnom Penh,"Phnom Penh, Cambodia",11.556374,104.92821,OK,0.666667
1,4131,"#132D St 135, Phsar Doem Thkov, Chamcar Morn, ...",,,,INVALID_REQUEST,
2,9288,"#8, St. 468, Toul Tompung II, Phnom Penh",,,,INVALID_REQUEST,
3,6620,"#37B Street 113, Boeung Keng Kang II, Chamcar ...",,,,INVALID_REQUEST,
4,10816,"#28, St. 80, S/k Sras Chork, Phnom Penh",,,,INVALID_REQUEST,
5,7225,"#370 Road 2, Takdol village, Takdol commune, T...",,,,INVALID_REQUEST,
6,4410,"#1BEo, St. 86, Sras Chork, Phnom Penh",,,,INVALID_REQUEST,
7,5583,"#18 Street 604/311, Boeung Kak II, Toul Kork,P...",,,,INVALID_REQUEST,
8,4548,#244AE0 St. 376 Sangkat BengKengkang 3 Khan Ch...,,,,INVALID_REQUEST,
9,10861,"#71B, St.115, Veal Vong, 7 Makara, Phnom Penh",,,,INVALID_REQUEST,


In [12]:
# To calculate the similarity between the input addresses and google valid addresses
from difflib import SequenceMatcher

def similar(a, b):
    return SequenceMatcher(None, a, b).ratio()

num = len(output_data)
for i in range(num):
    try:
        output_data.loc[i, "Similarity"] = similar(output_data.loc[i, "address"], output_data.loc[i, "Google valid address"])
    except:
        pass

In [14]:
output_data.to_csv("Geocoded countries.csv")