## Web Scraping - Virginia Example

In [None]:
# interacting with websites and web-APIs
import requests # easy way to interact with web sites and services
import json # read/write JavaScript Object Notation (JSON)
from bs4 import BeautifulSoup

# from selenium import webdriver
# browser = webdriver.Firefox()

In [None]:
# Start from the page URL, copied from the web browser and saved as a variable:

## The real world version of this website can be found here:
# url = "http://www.virginia.gov/localagency/index.html"

## In our development environment, it is here:
url = "http://deepdish.adrf.info/contrib/virginia.html"

# Always pass a timeout: without one, requests waits indefinitely when the
# server never answers, which would leave this cell hanging.
response = requests.get(url, timeout=30)
print(type(response))

## If HTML doesn't load entirely:
# browser.get(url)
# soup = BeautifulSoup(browser.page_source)

In [None]:
# Check the status code (we use str() since status_code is an int):
print("Status code " + str(response.status_code))
## A status of 200 means the request succeeded.

# Header - Content Type
print("Content type " + response.headers['content-type'])
## We are expecting HTML here.

# Check the encoding. response.encoding may be None when the server does not
# declare a charset, so wrap it in str() to avoid a TypeError on concatenation.
print("Encoding is " + str(response.encoding))
# UTF-8 is a common encoding that we can easily work with.

In [None]:
# We can also print out all the text from the response
# (the raw, unformatted HTML of the page as a single string):
print(response.text)

In [None]:
# Parse the raw HTML into a BeautifulSoup object that we can search and navigate.
# Name the parser explicitly: when it is omitted, BeautifulSoup picks whichever
# parser happens to be installed (and emits a warning), so the same page can be
# parsed differently on different machines. "html.parser" ships with Python.
soup = BeautifulSoup(response.text, "html.parser")
print(type(soup))

In [None]:
## Right off the bat, this gives us new methods, like prettify, that make our HTML a lot easier to work with.
## prettify() returns the parsed document as an indented string, which we print here:
print(soup.prettify())

In [None]:
# We can also ask our BeautifulSoup object for specific tags, like the title:
soup.title

In [None]:
# Notice that the previous result included the tag itself, but we could get just the text with:
soup.title.text

In [None]:
# Or alternatively, just the name of the tag (rather than its contents):
soup.title.name

In [None]:
# Attribute access works well for <title> since there should only be one for any webpage.
# For more common tags, we can use the "find" method to grab the first tag of a certain type (and its contents):
soup.find("p")

In [None]:
# Alternatively, you can find all the tags of a certain type with "find_all":
soup.find_all("button")

In [None]:
# To get more specific, you can find HTML tags by both their type and attributes.
# Here: the first <button> whose class attribute is "btn_eastern".
soup.find("button", {"class": "btn_eastern"})

In [None]:
# Grab the first <tbody> tag on the page and print it.
# Note that find() returns None when no matching tag exists, in which case
# the cells below that call methods on 'table' would fail.
table = soup.find("tbody")
print(table)

In [None]:
## Instead of printing that out, let's save every table row (<tr>) inside it as a variable called rows:
rows = table.find_all("tr")
## Show what kind of object find_all gave us back:
type(rows)

In [None]:
# And we can look within each row (below, just the first row) for table elements <td>.
# The result is a list-like collection with one entry per <td> cell in that row:
rows[0].find_all("td")

In [None]:
# The 'findChildren' method is another way to collect the <td> cells of a row.
# Print the cells of each of the first five rows:
for row_index in range(5):
    print(rows[row_index].findChildren('td'))
# Note that 'child' is a relative term, referring to a tag within a tag.
# The container tag is called the 'parent' tag, likewise relative to the child tag.

In [None]:
## Some of these rows have a <span> element, and some do not.
## Print the <span> tags found in each of the first five rows to compare:
for row_index in range(5):
    print(rows[row_index].findChildren("span"))

In [None]:
# Build a list with the director name found in each table row.
# NOTE(review): the slice below leaves out the final row, exactly as the
# original range(0, len(rows) - 1) did -- confirm that skipping the last
# <tr> is intended and not an off-by-one.
directors = []

for row in rows[:-1]:
    cells = row.find_all("td")  # every <td> cell in this row
    directors.append(cells[4].text)  # text of the fifth <td> tag

# And now we have a list of the directors of Virginia Social Services agencies:
print(directors)

In [None]:
# Build parallel lists holding the agency name, address and phone number
# of each table row (the final row is left out, see the slice below).
agencies, addresses, phone_numbers = [], [], []

for row in rows[:-1]:
    cells = row.find_all("td")  # look the <td> cells up once per row
    address = cells[1].text
    # NOTE(review): index 4 is the same column the directors loop reads for
    # the director name -- confirm which <td> really holds the phone number.
    phone_number = cells[4].text

    # The agency name sits in a <span class="ng-scope"> when the row has one,
    # otherwise in an <a class="ng-scope">; rows with neither get None.
    name_tag = (row.find("span", {"class" : "ng-scope"})
                or row.find("a", {"class" : "ng-scope"}))
    agency = name_tag.text if name_tag else None

    agencies.append(agency)
    addresses.append(address)
    phone_numbers.append(phone_number)

In [None]:
## Combine the scraped lists into a new pandas dataframe

import pandas as pd

# One pandas Series per column, keyed by the column name
d = dict(
    agency_name=pd.Series(agencies),
    address=pd.Series(addresses),
    phone_number=pd.Series(phone_numbers),
    director_name=pd.Series(directors),
)

# Turn the dictionary into a pandas dataframe (tabular data):
df = pd.DataFrame(d)

# Inspect both objects and the dataframe's dimensions, then show the first ten rows
print(type(d))
print(type(df))
print(df.shape)
df.head(10)

In [None]:
## Save our scraped data as a csv:
## Writes the dataframe to "va-social-services.csv" in the current working
## directory, encoded as UTF-8. The row index is written as the first column
## unless index=False is passed.
df.to_csv("va-social-services.csv", encoding="UTF-8")