# Chianson Siu
## Final project

This script downloads and parses homeless shelter data from https://www.kingcounty.gov/depts/community-human-services/housing/services/homeless-housing/coordinated-entry/access-points.aspx and food bank data from
https://www.homelessshelterdirectory.org/cgi-bin/id/countyfb.cgi?county=King-County&state=WA. It then uses the LocationIQ
API to geocode these addresses and writes them to a GeoJSON file.

In [1]:
# import packages and define the workspace folder name
import urllib2
import urllib
import requests
import lxml
from lxml import html
import geopandas
import folium
import os
import shapely
import shapely.geometry
import fiona
import fiona.crs
import json
import time
import re
# Workspace folder name (relative, Windows-style path). It is only stored in
# this variable; none of the visible cells changes the working directory
# with it.
workspace = "geog458\\Final_copy_lab_3"

In [2]:
def saveHtml(link, fileName):
    """Download a web page and store it as ``<fileName>.html``.

    The file is written relative to the current working directory. Its
    absolute path is printed and returned.

    Args:
        link: URL of the page to download.
        fileName: Output file name without the ".html" extension.

    Returns:
        Absolute path of the saved file.
    """
    filePath = os.path.abspath(fileName + ".html")
    response = urllib2.urlopen(link)
    try:
        pageBytes = response.read()
    finally:
        # Close the connection even if the read fails
        response.close()
    # Binary mode stores the downloaded bytes unchanged; text mode would
    # rewrite line endings on Windows. The with-block closes the file even
    # if the write fails.
    with open(filePath, "wb") as localFile:
        localFile.write(pageBytes)
    print(filePath)
    return filePath

In [3]:
# Download the King County coordinated-entry page that lists the homeless
# shelter access points and keep the path of the local copy
homelessUrl = ("https://www.kingcounty.gov/depts/community-human-services/"
               "housing/services/homeless-housing/coordinated-entry/"
               "access-points.aspx")
homelessPath = saveHtml(homelessUrl, "homless_shelter_data")

C:\Users\cjms2\geog458\Final_copy_lab_3\homless_shelter_data.html


In [4]:
# Read the raw bytes of the local html copy. Opening the file directly
# (instead of going through a "file:///" URL) lets the with-block close the
# handle as soon as the read is done.
with open(homelessPath, "rb") as homelessFile:
    homelessText = homelessFile.read()

In [5]:
# Parse the downloaded page into an lxml HTML tree; homelessRoot is the
# element that the panel lookup in the next cell searches
homelessRoot = html.document_fromstring(homelessText)

In [6]:
# Collect every accordion panel that holds shelter names and addresses
homelessPanels = homelessRoot.find_class("panel-accordion-primary")
# The veterans' panel (position 5) is laid out differently from the others,
# so take it out of the list and keep it aside for separate handling
veteranHome = homelessPanels.pop(5)

In [7]:
homelessAddress = []  # utf-8 encoded shelter addresses
homelessTitle = []  # utf-8 encoded shelter names

# Walk every panel body: anchor tags carry the addresses and strong tags
# carry the names. Each text is stripped of surrounding whitespace and
# utf-8 encoded before it is stored.
for panel in homelessPanels:
    body = panel.find_class("panel-body")[0]
    anchors = body.cssselect("a")
    strongs = body.cssselect("strong")
    homelessAddress.extend(
        anchor.text_content().strip().encode("utf-8") for anchor in anchors)
    homelessTitle.extend(
        strong.text_content().strip().encode("utf-8") for strong in strongs)

# Clean up the scraped addresses. The positions below are hard-coded for the
# page as it was scraped, and the order of the steps matters because every
# deletion shifts the later positions.
addressCount = len(homelessAddress)
del homelessAddress[addressCount - 4:addressCount]  # last 4 entries are not addresses
del homelessAddress[9]  # position 9 holds an email
homelessAddress[5] += homelessAddress[6]  # one address was split across positions 5 and 6
del homelessAddress[6]  # drop the second half that was just merged
del homelessAddress[15]  # position 15 holds an empty string

# The veterans' panel keeps name and address together in the text of its
# sixth paragraph: the last 43 characters are the address and everything up
# to the last 45 characters is the name.
vetHome = veteranHome.cssselect("p")[5].text_content()
vetLength = len(vetHome)
homelessAddress.append(vetHome[vetLength - 43:vetLength])
homelessTitle.append(vetHome[0:vetLength - 45])

In [8]:
# Positions in the names list that hold something other than a shelter name
indices = [1, 2, 3, 5, 6, 7, 9, 10, 11, 13, 14, 15, 18, 19, 20, 21, 22, 23, 26, 28, 29, 30, 31, 32, 37, 42]

# Remove those entries from the highest position down so that the positions
# still to be deleted stay valid
for position in reversed(sorted(indices)):
    del homelessTitle[position]

# Two names were split over neighbouring entries; join them back together
homelessTitle[4] += homelessTitle[5]  # halves at positions 4 and 5
homelessTitle[6] += " " + homelessTitle[7]  # halves at positions 6 and 7

# Drop the leftover second halves, higher position first
del homelessTitle[7]
del homelessTitle[5]


In [9]:
def getLocation(searchString, timeout=30):
    """Geocode one address with the LocationIQ search API.

    Args:
        searchString: Address (free-form text) to look up.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The ``requests`` response of the GET request. Callers inspect
        ``status_code`` and ``text`` themselves.
    """
    # The key can be supplied through the LOCATIONIQ_API_KEY environment
    # variable; the literal stays as a fallback so existing runs keep working.
    # NOTE(review): a key committed to source should be rotated.
    geocodingApiKey = os.environ.get("LOCATIONIQ_API_KEY", "9af8ae63239de6")
    url = "https://us1.locationiq.org/v1/search.php"  # search endpoint
    query = {
        "key": geocodingApiKey,
        "q": searchString,
        "format": "json",  # desired return format
    }
    # requests percent-encodes the query parameters itself, and the timeout
    # keeps a stalled connection from blocking the notebook indefinitely.
    return requests.get(url, params=query, timeout=timeout)

In [10]:
homelessData = []
homelessAddressEmpty = []

# Geocode every shelter address. Each success appends
# [name, address, lon, lat, point geometry] to homelessData; the index of
# every address that could not be geocoded goes to homelessAddressEmpty so it
# can be fixed by hand later. homelessTitle and homelessAddress are assumed
# to be aligned position by position.
for i, searchString in enumerate(homelessAddress):
    response = getLocation(searchString)  # get request for this address
    time.sleep(1)  # delay each request

    # A 404 status is treated as "no match": retry once without the last 6
    # characters (the trailing zip code and the space before it)
    if response.status_code == 404:
        searchString = searchString[0:len(searchString) - 6]
        response = getLocation(searchString)
        time.sleep(1)  # the retry is a request too, so pause after it as well

    # Only a 200 response carries results. Any other status, or an empty
    # result list, is recorded as a failure instead of being indexed.
    matches = json.loads(response.text) if response.status_code == 200 else []
    if matches:
        jsonAsDict = matches[0]  # first match returned by the service
        lon = float(jsonAsDict["lon"])
        lat = float(jsonAsDict["lat"])
        coordinateTuple = [lon, lat]  # x, y order for the geometry column
        homelessData.append([homelessTitle[i],
                             homelessAddress[i],
                             lon,
                             lat,
                             shapely.geometry.Point(coordinateTuple)])
    else:  # remember the index of the failed request for later clean-up
        homelessAddressEmpty.append(i)

In [11]:
# Hand-written replacement addresses for the requests that failed; each one
# was tested and returned a proper geocoding result
homelessAddressFix = [
    "11061 NE 2nd St. Bellevue, King County",
    "11920 NE 80th St. Kirkland, King County",
    "2709 3rd Ave. Seattle. King County",
    "16225 NE 87th Street. Redmond, King County",
    "419 S 2nd Street #2. Renton, King County",
]

# No working replacement was found for the failed entry at position 2
# (Therapeutic Health Services, 1901 Martin Luther King Jr. Way S,
# Seattle, WA 98144), so it is removed from the list of failures and is left
# out of the geocoded data
del homelessAddressEmpty[2]


In [12]:
# Geocode the hand-fixed addresses and append their rows to the end of
# homelessData. homelessAddressFix[i] replaces the failed request whose
# original index is homelessAddressEmpty[i]; the stored name and address are
# the original ones, only the coordinates come from the fixed address.
for i, fixedAddress in enumerate(homelessAddressFix):
    response = getLocation(fixedAddress)
    time.sleep(1)  # same pause between requests as the main geocoding loop
    # Stop with a clear HTTP error instead of an obscure indexing error when
    # the service rejects the request
    response.raise_for_status()
    jsonAsDict = json.loads(response.text)[0]  # first match
    index = homelessAddressEmpty[i]  # original position of this name and address
    lon = float(jsonAsDict["lon"])
    lat = float(jsonAsDict["lat"])
    coordinateTuple = [lon, lat]
    homelessData.append([homelessTitle[index],
                         homelessAddress[index],
                         lon,
                         lat,
                         shapely.geometry.Point(coordinateTuple)])

In [13]:
# Build a GeoDataFrame from the collected rows, naming the five columns,
# then display it
homelessColumns = ["name", "address", "long", "lat", "geometry"]
geopandas_df = geopandas.GeoDataFrame(homelessData, columns=homelessColumns)
geopandas_df

Unnamed: 0,name,address,long,lat,geometry
0,Catholic Community Services - Seattle,"100 23rd Ave. S., Seattle, WA 98144",-122.301566,47.601189,POINT (-122.30156565 47.6011886)
1,Multi-Service Center- Federal Way,"1200 S. 336th St., Federal Way, WA 98003",-122.318042,47.300769,POINT (-122.318042154362 47.3007686442953)
2,YWCA- Renton,"1010 S. 2nd St., Renton, WA 98057",-122.203563,47.481409,POINT (-122.203563458549 47.4814093)
3,Solid Ground - North Seattle,"9600 College Way N. Seattle, WA 98103",-122.332552,47.698708,POINT (-122.332551801326 47.69870805)
4,YMCA Young Adult Services Drop in Center,"2100 24th Ave S, Seattle, WA 98144",-122.301253,47.584188,POINT (-122.301253 47.5841884)
5,YouthCare’s James W. Ray Orion Center,"1828 Yale Avenue, Seattle, WA 98101",-122.330389,47.618233,POINT (-122.3303895 47.6182332)
6,Peace for the Streets by Kids from the Streets...,"1609 19th Avenue, Seattle, WA 98122",-122.307734,47.615584,POINT (-122.3077338 47.6155844)
7,Nexus Youth & Families,"915 H Street SE, Auburn, WA 98002",-122.218838,47.299578,POINT (-122.218837787879 47.2995776464646)
8,Teen Feed,"4740 B University Way NE, Seattle, WA 98105",-122.312734,47.664324,POINT (-122.31273440028 47.66432385)
9,University District Youth Center,"4516 15th Avenue NE, Seattle, WA 98105",-122.31167,47.661859,POINT (-122.311670474999 47.66185935)


In [14]:
# save the GeoDataFrame to a shapefile
# geopandas_df.to_file("homelessData.shp", driver = "ESRI Shapefile")

In [15]:
# Export the shelter table to homeless_data.csv (utf-8, without the index column)
geopandas_df.to_csv("homeless_data.csv", index = False, encoding='utf-8')

In [16]:
# Save geopandas dataframe to geojson
# geopandas_df.to_file("homelessData.geojson", driver = "GeoJSON")

In [17]:
# Download the King County food bank directory page and keep a local copy
foodbankPath = saveHtml("https://www.homelessshelterdirectory.org/cgi-bin/id/countyfb.cgi?county=King-County&state=WA", "food_bank_data")
# Read the local html copy straight from disk; the with-block closes the
# handle as soon as the read is done
with open(foodbankPath, "rb") as foodbankFile:
    foodbankText = foodbankFile.read()
# Set the root of the html tree for parsing
foodbankRoot = html.document_fromstring(foodbankText)

C:\Users\cjms2\geog458\Final_copy_lab_3\food_bank_data.html


In [18]:
# Select the table elements of the page
foodbTable = foodbankRoot.cssselect("table")
# Collect every link on the page. The XPath starts with "//", so it searches
# the whole document and not only the first table.
allLinks = list(foodbTable[0].xpath('//a/@href'))
# The first 14 links are not related to food banks, and every food bank link
# after them appears twice in a row: skip 14, then keep every other link.
# A page with fewer than 14 links yields an empty list.
foodbList = allLinks[14::2]

In [19]:
# Download and parse one single food bank page (saved as auburn_food_bank).
# NOTE(review): presumably an exploratory check; auburnFbankRoot is not used
# by the other visible cells.
auburnFbankPath = saveHtml("https://www.homelessshelterdirectory.org/cgi-bin/id/foodbank.cgi?foodbank=36", "auburn_food_bank")
print(auburnFbankPath)  # saveHtml prints the path as well, so it shows twice
# Read the local html copy straight from disk; the with-block closes the
# handle as soon as the read is done
with open(auburnFbankPath, "rb") as auburnFbankFile:
    auburnFbankText = auburnFbankFile.read()
# Set the root of the html tree for parsing
auburnFbankRoot = html.document_fromstring(auburnFbankText)

C:\Users\cjms2\geog458\Final_copy_lab_3\auburn_food_bank.html
C:\Users\cjms2\geog458\Final_copy_lab_3\auburn_food_bank.html


In [20]:
def getFoodBankData(website):
    """Scrape the name and address of one food bank from its own web page.

    The page is downloaded to a temporary ``temp.html`` file with
    ``saveHtml``, read back, and the temporary file is removed again.

    Args:
        website: URL of the food bank's page.

    Returns:
        A two-item list ``[name, address]``, both utf-8 encoded.
    """
    thisWebPath = saveHtml(website, "temp")
    try:
        with open(thisWebPath, "rb") as thisWebFile:
            thisWebText = thisWebFile.read()
    finally:
        # Delete the temp file even when reading it fails
        os.remove(thisWebPath)

    # Set the root of the html tree for parsing
    thisWebRoot = html.document_fromstring(thisWebText)

    # Select the DOM elements containing the title and the address
    thisWebContent = thisWebRoot.find_class("entry_content")
    thisWebTitle = thisWebRoot.find_class("entry_title")
    thisWebAddress = thisWebContent[0].cssselect("p")

    # Name: keep the text before the first hyphen and drop the standalone
    # word "the". The word boundaries leave names that merely contain those
    # letters (for example "Lutheran") intact.
    rawTitle = thisWebTitle[0].text.split("-", 1)[0]
    name = re.sub(r"\bthe\b", "", rawTitle).strip().encode("utf-8")

    # Address: the first part is the paragraph's own text, the second part is
    # the tail of its first child element (a <br> tag, per the original
    # notes). The parentheses make the encoding apply to the whole address
    # and not only to the second part.
    firstParagraph = thisWebAddress[0]
    address = (firstParagraph.text.strip() + " "
               + firstParagraph[0].tail.strip()).encode("utf-8")

    return [name, address]

In [21]:
# Scrape every food bank page in turn; each call contributes one
# [name, address] entry to the list
foodbData = []
foodbData.extend(getFoodBankData(pageUrl) for pageUrl in foodbList)

C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html
C:\Users\cjms2\geog458\Final_copy_lab_3\temp.html


In [22]:
foodbLocationData = []
foodbEmpty = []

# Geocode every food bank address. Each success appends
# [name, address, lon, lat, point geometry] to foodbLocationData; the index
# of every food bank that could not be geocoded goes to foodbEmpty so it can
# be fixed by hand later.
for i, entry in enumerate(foodbData):
    searchString = entry[1]  # address
    response = getLocation(searchString)  # get request for this address
    time.sleep(1)  # delay each request

    # A 404 status is treated as "no match": retry once without the last 6
    # characters (the trailing zip code and the space before it)
    if response.status_code == 404:
        searchString = searchString[0:len(searchString) - 6]
        response = getLocation(searchString)
        time.sleep(1)  # the retry is a request too, so pause after it as well

    # Only a 200 response carries results. Any other status, or an empty
    # result list, is recorded as a failure instead of being indexed.
    matches = json.loads(response.text) if response.status_code == 200 else []
    if matches:
        jsonAsDict = matches[0]  # first match returned by the service
        lon = float(jsonAsDict["lon"])
        lat = float(jsonAsDict["lat"])
        coordinateTuple = [lon, lat]  # x, y order for the geometry column
        foodbLocationData.append([entry[0],
                                  entry[1],
                                  lon,
                                  lat,
                                  shapely.geometry.Point(coordinateTuple)])
    else:  # remember the index of the failed request for later clean-up
        foodbEmpty.append(i)

In [23]:
# Gather the food bank entries whose geocoding request failed. The entries
# are the same list objects that foodbData holds, not copies.
missingLocations = [foodbData[failedIndex] for failedIndex in foodbEmpty]

In [24]:
# Replacement addresses for the food banks that could not be geocoded. They
# are hard-coded after trying several get requests with variants of the
# original address; the comment on each line names the food bank it is for.
fixedAddresses = [
    "1201 1st Ave S Seattle, WA 98134",  # rotary
    "711 Cherry St, Seattle, WA 98104",  # northwest harvest
    "12044 25th Ave NE Seattle, WA 98125",  # North Seattle Neighbors In Need Food Bank Association
    "515 W Harrison St #107, Kent, WA",  # Kent Food Bank And Emergency Services
    "35422 25th Ave Sw Federal Way, WA 98023",  # International Food Bank
    "4731 15th Ave NE, Seattle, WA 98105",  # Food Resource Network
    "815 S 96th St, Seattle, WA 98108",  # Food Lifeline
    "930 18th Pl NE, Auburn, WA 98002",  # Auburn Food Bank
    "10030 SW 210th St #4, Vashon, WA",  # Vashon Maury Community Food Bank
]

In [25]:
# The entry at position 3 could not be geo-encoded, so drop it
del missingLocations[3]
# Overwrite the address of each remaining entry with its hand-fixed version.
# fixedAddresses is assumed to be in the same order as missingLocations.
for position, fixedAddress in enumerate(fixedAddresses):
    missingLocations[position][1] = fixedAddress

In [26]:
# Geocode the previously failed food banks again, now using their entries in
# missingLocations, and append each success to foodbLocationData as
# [name, address, lon, lat, point geometry].
for i, entry in enumerate(missingLocations):
    searchString = entry[1]  # address
    response = getLocation(searchString)  # get request for this address
    time.sleep(1)  # delay each request

    # A 404 status is treated as "no match": retry once without the last 6
    # characters (the trailing zip code and the space before it)
    if response.status_code == 404:
        searchString = searchString[0:len(searchString) - 6]
        response = getLocation(searchString)
        time.sleep(1)  # the retry is a request too, so pause after it as well

    # Only a 200 response carries results. Any other status, or an empty
    # result list, is recorded as a failure instead of being indexed.
    matches = json.loads(response.text) if response.status_code == 200 else []
    if matches:
        jsonAsDict = matches[0]  # first match returned by the service
        lon = float(jsonAsDict["lon"])
        lat = float(jsonAsDict["lat"])
        coordinateTuple = [lon, lat]  # x, y order for the geometry column
        foodbLocationData.append([entry[0],
                                  entry[1],
                                  lon,
                                  lat,
                                  shapely.geometry.Point(coordinateTuple)])
    else:
        # NOTE(review): i is a position in missingLocations here, while the
        # indices stored in foodbEmpty by the first geocoding pass are
        # positions in foodbData. Kept as in the original; confirm before
        # relying on foodbEmpty after this cell.
        foodbEmpty.append(i)

In [27]:
# Build the food bank GeoDataFrame from the collected rows, naming the five columns
foodbColumns = ["name", "address", "long", "lat", "geometry"]
foodbGeoPanda = geopandas.GeoDataFrame(foodbLocationData, columns=foodbColumns)

In [None]:
# Export the food bank table to foodBankLocationDataFrame.csv (utf-8, without the index column)
foodbGeoPanda.to_csv("foodBankLocationDataFrame.csv", index = False, encoding = "utf-8")

In [None]:
# Write the food bank GeoDataFrame to foodBankLocationGeoJson.geojson using the GeoJSON driver
foodbGeoPanda.to_file("foodBankLocationGeoJson.geojson", driver = "GeoJSON")