# South Africa Universe Geocoding Application #

You might need to install simplejson.  If so, use *conda install -c auto simplejson* in your terminal window. Please note, I am working in Python 2.  Also using geopy: *conda install -c davidbgonzalez geopy=1.10.0*.  If you want mapping, then: *conda install -c ioos folium=0.2.0*. 

In [300]:
import os, codecs, simplejson, math, time
from urllib import urlopen
from urllib import quote
from geopy.distance import vincenty
import folium

Set up a few global variables to save space.  We are going to work with Google's API for geocoding and reverse geocoding, passing our query up through a URL and getting a JSON object back.

In [301]:
GEOCODE  = 'https://maps.google.com/maps/api/geocode/json?address='
GEOCODE2 = 'https://maps.googleapis.com/maps/api/geocode/json?latlng='
KEY = 'AIzaSyCf5T9qpELsOHqXir6VON1nOC_NZU9q38o'
DEBUG = False
MAPPING = False

Note working directory and open files.  Currently hard-wired.  Note we are working in UTF-16.

In [302]:
if DEBUG == True:
    print(os.getcwd())
    os.chdir("/Users/timothybanks/Documents")
    print(os.getcwd())
    
input_file = "univ_input1.txt"
output_file = "univ_output2.txt"

workdir = os.getcwd()
input_path = '{dir}/{file}'.format(dir=workdir, file=input_file)
output_path = '{dir}/{file}'.format(dir=workdir, file=output_file)

#get handler to the file
fRead = codecs.open(input_file,'r')
fWrite = codecs.open(output_file,'w','utf-16')

The lines in the South Africa universe file are fairly simple --- just four tab-separated fields (ID, text address, latitude, longitude):
* 37988609	
* "132 MABANDLA STREET,KWA NOBUHLE,UITENHAGE, South Africa, 6242"	
* -33.8052558	
* 25.38133222

The excitingly named variable *i* is just to read a limited number of lines from the input file for this test. *skip_lines* allows us to vary the starting point. If you want to start from the begining, set it to 0.

In [303]:
i = 0
skip_lines = 1000
line_num_to_be_read= 10

with fRead as f_in:
    for line in f_in:
        i = i + 1
        if i <= skip_lines: continue
        if i == skip_lines + line_num_to_be_read:
            break # a clumsy break
        
        # Read line by line through the file, break line into the four elements noted above.
        
        ss = line.split('\t')
        url_address = ss[1]
        QueryAddr = url_address
        print "%d: %s, %s" % (i, ss[0], ss[1])
        
        #Query Google using the address information and report status of query
        
        url = GEOCODE + quote(url_address)  + "&sensor=false" + "&key=" + KEY
        time.sleep(0.2)
        result = simplejson.load(urlopen(url))
        print "\tGoogle response: %s" % result['status']
        
        #Get the derived coordinates if these are available
        
        if result['status'] == "ZERO_RESULTS":
            derived_latitude = "ZERO_RESULTS"
            derived_longitude = "ZERO_RESULTS"
        else:
            derived_latitude = simplejson.dumps([s['geometry']['location']['lat'] for s in result['results']][0], indent=2)
            derived_longitude = simplejson.dumps([s['geometry']['location']['lng'] for s in result['results']][0], indent=2)      
        
        #Extract the longitude and latitude from string number 23. This will
        #be broken into four variables, e.g. N49 04.166 E16 27.973
        
        latitude =""
        longitude=""
        address=""
        if (len(ss[2]) > 0) or ( int(ss[2]) != 0 and int(ss[3]) != 0):
            latitude = ss[2]
            longitude = ss[3][:-2]
            url = GEOCODE2 + str(latitude) +","+ str(longitude)  + "&key=" + KEY
            time.sleep(0.2)
            result = simplejson.load(urlopen(url))
            if result['status'] == "ZERO_RESULTS":
                address = "ZERO RESULT"   
            else:
                address = simplejson.dumps([s['formatted_address']for s in result['results']][0], ensure_ascii=False, indent=2)
        
        #If we don't have coordinates from Google, then just write what we have to the output file.  Else, calculate
        #a distance between them using the geopy library.
        
        if (len(ss[2]) <= 0):
            fWrite.write(QueryAddr + '; ' + str(latitude) + '; ' + str(longitude) + '; '
                         + derived_latitude +'; ' + derived_longitude + '; ; \n')
        else:
            if (derived_longitude != "ZERO_RESULTS") & (latitude != 'ZERO_RESULTS'):
                distance = round((vincenty((longitude, latitude), (derived_longitude, derived_latitude)).meters/1000),2)
                print("\tDistance between the declared and derived coords is %d m" % (distance*1000))
                fWrite.write(QueryAddr + '; ' + str(latitude) + '; ' + str(longitude) + '; '
                             + derived_latitude +'; ' + derived_longitude + '; ' + address + '; ' +
                             str(distance) + '\n')
                
                #If we have field and derived co-ordinates, and if we want maps, then a map showing the two sets
                #of coordinates will be generated for each input line and written to /tmp.  File name will be the
                #ID number from the input file
                
                if MAPPING == True:
                    map_1 = folium.Map(location=[latitude, longitude],zoom_start=14,)
                    folium.Marker([latitude, longitude], popup='DA Coordinates').add_to(map_1)
                    folium.Marker([derived_latitude, derived_longitude], popup='Derived Coordinates',icon=folium.Icon(color='green')).add_to(map_1)
                    map_1.save('/tmp/maps/'+str(ss[0])+'.html')   
            else:
                fWrite.write(QueryAddr + '; ' + str(latitude) + '; ' + str(longitude) + '; '
                             + derived_latitude +'; ' + derived_longitude + '; ' +
                             address + '; \n')               



1001: 38588873, "255 , South Africa, 1129"
	Google response: OK
	Distance between the declared and derived coords is 4147390 m
1002: 38588879, "16 , South Africa, 1129"
	Google response: OK
	Distance between the declared and derived coords is 4254630 m
1003: 38589310, "3447 , South Africa, 9430"
	Google response: OK
	Distance between the declared and derived coords is 188670 m
1004: 38590564, " THOLO STREET, South Africa, 1754"
	Google response: OK
	Distance between the declared and derived coords is 244260 m
1005: 38591295, " , South Africa, 9480"
	Google response: OK
	Distance between the declared and derived coords is 9360 m
1006: 38591429, "10854 IKAGENG, South Africa, 1055"
	Google response: OK
	Distance between the declared and derived coords is 630 m
1007: 38591622, "5231 , South Africa, 9430"
	Google response: OK
	Distance between the declared and derived coords is 188100 m
1008: 38616939, "72 Luthuli, South Africa, 699"
	Google response: OK
	Distance between the declared and d

In [304]:
print("Closing shop now...")
fRead.close()
fWrite.close()

Closing shop now...


Just an example showing *Folium* in action.

In [305]:
import folium
map_1 = folium.Map(location=[latitude, longitude],zoom_start=14,)
folium.Marker([latitude, longitude], popup='DA Coordinates').add_to(map_1)
folium.Marker([derived_latitude, derived_longitude], popup='Derived Coordinates',
              icon=folium.Icon(color='green')).add_to(map_1)
map_1


In [306]:
ss

['38617636',
 '"10 Polokwa, South Africa, 699"',
 '-23.85136256',
 '29.40088176\r\n']