# Scraping the web for YMCA locations
## Assumptions:
   > 1. Selenium enters a zip code on the 'Find your Y' page and submits it to reach the results page (searching by state is also possible, but the site will only list the 20 'closest' locations).
   > 2. The results page shows four of the closest 20 locations; Beautiful Soup scrapes/parses the addresses, which are then stored in a DataFrame.

In [144]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import urllib.request, urllib.parse, urllib.error
import selenium 
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
import os
import pickle 
import sys
sys.setrecursionlimit(10000)

# Directory holding the pickled zipcode files (<STATE>_zips.pkl); the scraped
# results are pickled into the same directory further down.
# It must end with a path separator, because file names are appended to it by
# plain string concatenation.
path = '/Volumes/ext200/Dropbox/metis/pklfiles/'  

In [84]:
def get_zips(state_name):
    """Load the pickled zipcode data for one state.
    -----------
    IN: state abbreviation; the file read is <path><state_name>_zips.pkl
    OUT: whatever object was pickled into that file

    NOTE: unpickling can execute arbitrary code, so only point `path` at
    files you created yourself.
    """
    zip_file = path + state_name + "_zips.pkl"
    with open(zip_file, 'rb') as handle:
        zips = pickle.load(handle)
    return zips

In [95]:
def _parse_location(item):
    """Turn one result element into a dict of plain strings, or None if malformed.
    -------------------
    IN: BeautifulSoup element for a single location entry
    OUT: dict with keys zipcode/state/city/adds/name, or None when the entry
         does not have the expected link + two text lines
    """
    link = item.find('a')
    if link is None:
        return None
    try:
        # The street line and the "City, ST 12345" line are each two siblings
        # further on (the node in between is skipped, presumably a <br>).
        street = link.next_sibling.next_sibling
        city_line = street.next_sibling.next_sibling
        tokens = city_line.split()
        return {
            'zipcode': str(tokens[-1])[0:5],  # keep the 5-digit part of ZIP+4
            'state': tokens[-2],
            'city': city_line.split(',')[0],
            # str() detaches the text from the parse tree; a NavigableString
            # would keep the whole page alive and make pickling recurse deeply.
            'adds': str(street),
            'name': link.text,
        }
    except (AttributeError, IndexError, TypeError):
        # Missing siblings, a tag where text was expected, or too few tokens:
        # skip this entry instead of aborting the whole scrape.
        return None


def get_ymca_locations(state_name):
    """Scrape ymca.net for every zipcode of a state and collect the locations found.
    -------------------
    IN: 2-letter abbreviation of a state (also used to find the zipcode pickle
        via get_zips)
    OUT: data frame of ymca locations (zipcode, state, city, adds, name,
         locations) whose parsed state equals state_name; 'locations' is
         always 1 so rows can be summed later. Each street address appears
         at most once.

    One Chrome session is reused for all zipcodes and is always shut down,
    even if scraping fails part-way through.
    """
    # Local import: By-based lookup works on both Selenium 3 and Selenium 4,
    # whereas find_element_by_id was removed in Selenium 4.
    from selenium.webdriver.common.by import By

    columns = ['zipcode', 'state', 'city', 'adds', 'name', 'locations']
    records = []
    seen_addresses = set()  # O(1) duplicate check on the street address

    # all zipcodes for the selected state
    zipcodes = get_zips(state_name)

    if len(zipcodes) > 0:
        chromedriver = "/Applications/chromedriver"
        os.environ["webdriver.chrome.driver"] = chromedriver
        driver = webdriver.Chrome(chromedriver)
        try:
            for zipy in zipcodes:
                # url of YMCA's page
                driver.get("http://www.ymca.net/find-your-y/")

                # box where the zipcode is entered
                query = driver.find_element(By.ID, "address")
                query.clear()  # the browser is reused, so start from an empty box
                query.send_keys(zipy)

                # equivalent to hitting enter on the keyboard
                query.send_keys(Keys.RETURN)

                # NOTE(review): page_source is read right after submitting; if
                # the results load asynchronously an explicit wait may be
                # needed here -- confirm against the live site.
                soup = BeautifulSoup(driver.page_source, "html.parser")

                # section with the (up to 20) closest locations
                entries = soup.find_all(
                    style="padding-left: 17px; text-indent: -17px;")

                for item in entries:
                    record = _parse_location(item)
                    if record is None or record['adds'] in seen_addresses:
                        continue
                    seen_addresses.add(record['adds'])
                    records.append(record)
        finally:
            # quit() (not close()) also terminates the chromedriver process
            driver.quit()

    # Build the frame in one go instead of filling cells through chained
    # indexing, which is unreliable (SettingWithCopy / Copy-on-Write).
    y_df = pd.DataFrame(records, columns=columns)

    # this is how we will sum later
    y_df['locations'] = 1

    # get rid of locations in nearby states
    return y_df.loc[y_df['state'] == state_name]

In [179]:
# Call the function to scrape your locations.
# Pass the state as a 2-letter abbreviation string; locations whose parsed
# state differs from it are dropped from the result.
# %time is an IPython/Jupyter magic, so this line only runs inside a notebook
# or IPython session.
%time LA_y = get_ymca_locations('LA')

CPU times: user 15 µs, sys: 5 µs, total: 20 µs
Wall time: 775 µs


In [182]:
# Don't forget to save! The scraped DataFrame is pickled into the same
# directory that holds the zipcode files.
with open(path + 'LA_y.pkl', 'wb') as picklefile:
        pickle.dump(LA_y, picklefile)     