In [1]:
# Widen the notebook container to the full browser width.
# display and HTML come from IPython.display, the public import location;
# importing display from IPython.core.display is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [2]:
import requests                #For downloading the HTML content using HTTP GET request
from bs4 import BeautifulSoup  #For parsing the HTML content and searching through the HTML
import os
import pandas as pd

# STAGE 1: extract all state URLs from the states page
## Stage 1 pseudocode
1. Use requests module to send a GET request to "https://simple.wikipedia.org/wiki/List_of_U.S._states"
2. Don't forget to call r.raise_for_status() to make sure the request succeeded (it raises an exception for 4xx/5xx error status codes)
3. Explore what r.text gives you

In [3]:
url = "https://simple.wikipedia.org/wiki/List_of_U.S._states"
# Without a timeout, requests can wait indefinitely on an unresponsive server.
r = requests.get(url, timeout=30)
# Fail fast on an HTTP error status instead of parsing an error page.
r.raise_for_status()
print(r.text)

<!DOCTYPE html>
<html class="client-nojs" lang="en" dir="ltr">
<head>
<meta charset="UTF-8"/>
<title>List of U.S. states - Simple English Wikipedia, the free encyclopedia</title>
<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"113a2ec4-22b4-41eb-810f-a3675a09d120","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_U.S._states","wgTitle":"List of U.S. states","wgCurRevisionId":7172995,"wgRevisionId":7172995,"wgArticleId":3023,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["States of the United States","Lists about U.S. states"],"wgPageContentLanguage":"en","wgPageContentModel":"wikitext","wg

## Stage 1 pseudocode continued...
4. Check out what type you are getting from r.text

In [4]:
# Check the type of the downloaded page content.
print(type(r.text))

<class 'str'>


## Stage 1 pseudocode continued...
5. Create BeautifulSoup object by passing r.text, "html.parser" as arguments and capture return value into a variable called doc
6. Try prettify() method call --- still not that pretty, right?

In [5]:
# Parse the downloaded HTML string using the "html.parser" backend.
doc = BeautifulSoup(r.text, "html.parser")
# prettify() gives the markup back as re-indented text.
print(doc.prettify())

<!DOCTYPE html>
<html class="client-nojs" dir="ltr" lang="en">
 <head>
  <meta charset="utf-8"/>
  <title>
   List of U.S. states - Simple English Wikipedia, the free encyclopedia
  </title>
  <script>
   document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"113a2ec4-22b4-41eb-810f-a3675a09d120","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"List_of_U.S._states","wgTitle":"List of U.S. states","wgCurRevisionId":7172995,"wgRevisionId":7172995,"wgArticleId":3023,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["States of the United States","Lists about U.S. states"],"wgPageContentLanguage":"en","wgPageContentMod

## Stage 1 pseudocode continued...
7. (Not a code step) Open "https://simple.wikipedia.org/wiki/List_of_U.S._states" on Google Chrome.
    - Right click on one of the state pages
    - Click on "Inspect" --- this opens developer tools
    - This tool lets you explore the html source code
    - Explore the \<table\> and sub tags like \<th\>, \<tr\>, \<td\>
    - Let's go back to coding

## Stage 1 pseudocode continued...
7. Find all "table" elements in the document by using doc.find_all(...) function and capture return value into a variable "tables"
    - explore the length of the value returned from find_all(...) function
    - check out the type of the value returned from find_all(...) function
8. Add an assert to check that there is only one table - futuristic assert to make sure the html format hasn't changed on the website
9. Extract the first table into tbl variable
    - explore type of tbl
    - try printing the content of tbl --- looks like just a string

In [6]:
# Collect every <table> element in the parsed page.
tables = doc.find_all("table")
print(len(tables)) # only one table on the states page!
print(type(tables))
# Guard against future changes to the page layout: the rest of the notebook
# assumes the states table is the only table on the page.
assert len(tables) == 1 
tbl = tables[0]
print(type(tbl))

1
<class 'bs4.element.ResultSet'>
<class 'bs4.element.Tag'>


In [7]:
# Printing a Tag shows its HTML markup.
print(tbl)

<table class="wikitable sortable plainrowheaders" style="text-align: center;">
<caption>States of the United States of America
</caption>
<tbody><tr>
<th colspan="2" rowspan="2" scope="col">Name &amp;<br/><a class="mw-redirect" href="/wiki/List_of_U.S._state_abbreviations" title="List of U.S. state abbreviations">postal abbs.</a>
<p><sup class="reference" id="cite_ref-USPSabbreviations_1-0"><a href="#cite_note-USPSabbreviations-1">[1]</a></sup>
</p>
</th>
<th colspan="2" scope="col">Cities
</th>
<th rowspan="2" scope="col">Established<sup class="reference" id="cite_ref-3"><a href="#cite_note-3">[upper-alpha 1]</a></sup>
</th>
<th rowspan="2" scope="col">Population<br/><sup class="reference" id="cite_ref-4"><a href="#cite_note-4">[upper-alpha 2]</a></sup><sup class="reference" id="cite_ref-AnnualEstUS_5-0"><a href="#cite_note-AnnualEstUS-5">[3]</a></sup>
</th>
<th colspan="2" scope="col">Total area<sup class="reference" id="cite_ref-areameasurements_6-0"><a href="#cite_note-areameasurem

## Stage 1 pseudocode continued...
10. Find all the tr elements by using tbl.find_all(...) function and capture return value into a variable trs.
    - explore length of trs, type of trs
    - Add an assert checking that length of trs is at least 50 (For 50 US states)

In [8]:
# Every table row (header rows included) of the states table.
trs = tbl.find_all("tr")
print(len(trs))
print(type(trs))
# There must be at least one row per US state.
assert len(trs) >= 50

52
<class 'bs4.element.ResultSet'>


## Stage 1 pseudocode continued...
11. Iterate over each item in trs (going to be a lengthy step!)
    - print each item (tr tag)
    - call tr.find(..) to find "th" elements --- this finds th element for every tr element.
    - capture return value into a variable called th
    - print th and explore what you are getting.
    - find each hyperlinks within each th element: call th.find_all("a") and capture return value into a variable called links
    - explore length of links by printing it --- some of the states have 2 links; go back and explore why that is the case and figure out which link you want
        - some have 0 links, skip over those entries!
        - extract first of the hyperlinks into a variable called link
        - print link to confirm you are able to extract the correct link
        - explore type of link
        - print link.get_text() method and get attrs of link by saying link.attrs
        - capture link.get_text() into a variable state
        - link.attrs is a dict; the relative URL is stored under its "href" key. We need a full URL, so define a prefix variable holding "https://simple.wikipedia.org" and capture prefix + link.attrs["href"] into a variable called state_url
        - create a new dictionary called state_links --- we are going to use this dict to track each state and its URL. Think carefully about where you have to create this empty dict.

#### Congrats :) stage 1 is done

In [9]:
prefix = "https://simple.wikipedia.org"
state_links = {} #KEY: state name; VALUE: link to state page

for tr in trs:
    th = tr.find("th")
    # find() returns None when a row has no <th>; skip such rows instead of
    # failing with AttributeError on None.find_all(...).
    if th is None:
        continue
    links = th.find_all("a")
    if not links:
        continue
    # A header cell may hold more than one link; the first one is used.
    link = links[0]
    state = link.get_text()
    # href is site-relative (e.g. "/wiki/Alabama"), so prepend the site prefix.
    state_url = prefix + link.attrs["href"]
    state_links[state] = state_url

# Note: the table's own header row also contains a link, so a non-state
# "postal abbs." entry ends up in this dict as well.
state_links

{'postal abbs.': 'https://simple.wikipedia.org/wiki/List_of_U.S._state_abbreviations',
 'Alabama': 'https://simple.wikipedia.org/wiki/Alabama',
 'Alaska': 'https://simple.wikipedia.org/wiki/Alaska',
 'Arizona': 'https://simple.wikipedia.org/wiki/Arizona',
 'Arkansas': 'https://simple.wikipedia.org/wiki/Arkansas',
 'California': 'https://simple.wikipedia.org/wiki/California',
 'Colorado': 'https://simple.wikipedia.org/wiki/Colorado',
 'Connecticut': 'https://simple.wikipedia.org/wiki/Connecticut',
 'Delaware': 'https://simple.wikipedia.org/wiki/Delaware',
 'Florida': 'https://simple.wikipedia.org/wiki/Florida',
 'Georgia': 'https://simple.wikipedia.org/wiki/Georgia_(U.S._state)',
 'Hawaii': 'https://simple.wikipedia.org/wiki/Hawaii',
 'Idaho': 'https://simple.wikipedia.org/wiki/Idaho',
 'Illinois': 'https://simple.wikipedia.org/wiki/Illinois',
 'Indiana': 'https://simple.wikipedia.org/wiki/Indiana',
 'Iowa': 'https://simple.wikipedia.org/wiki/Iowa',
 'Kansas': 'https://simple.wikipedia.

# STAGE 2: download the html page for each state
## Stage 2 pseudocode
1. Create a directory called "html_files_for_states". Make sure to use try except block to catch FileExistsError exception
2. Initially convert the keys of state_links dict into a list and work with just first 3 items in the list of keys
3. Iterate over each key (initially just use 3):
    1. If key is "postal abbs.", skip processing. What keyword allows you to skip current iteration of the loop?
    2. To create each state's html file name, concatenate the directory name "html_files_for_states" with current key and add a ".html" to the end.
    3. Add the html file name into a new dictionary called "state_files". Think carefully about where you have to create this empty dict.
    4. Use requests module get(...) function call to download the contents of the state URL page.
    5. Open the state html file in write mode and write r.text into the state html file.
    
#### Congrats :) stage 2 is done

In [10]:
html_dir = "html_files_for_states"
state_files = {} #KEY: state; VALUE: state file

# Re-running the cell must not fail just because the directory already exists.
try:
    os.mkdir(html_dir)
except FileExistsError:
    pass

for state, state_url in state_links.items():
    # The header row of the states table contributes this non-state entry.
    if state == "postal abbs.":
        continue

    #html file name
    state_file = os.path.join(html_dir, state + ".html")
    state_files[state] = state_file

    #Optimization: a file that already exists is treated as a cached download
    if os.path.exists(state_file):
        continue

    #Download
    # Without a timeout, requests can wait indefinitely on an unresponsive server.
    r = requests.get(state_url, timeout=30)
    # raise_for_status must be *called*; a bare attribute reference does
    # nothing, which would let an HTTP error page be saved as if it were the
    # state page (and then be reused by the cache check above on later runs).
    r.raise_for_status()
    print(state_file)

    #Save to a file; the with-block closes the handle even if write() raises
    with open(state_file, "w", encoding = "utf-8") as f:
        f.write(r.text)

# STAGE 3: extract details from each state page
## Stage 3 pseudocode
1. Write a function state_stats. Input path to 1 state file. Output dict of stats for that state
2. Open state html file, read its content.
3. Create a BeautifulSoup object called doc.
4. doc.find_all("tr") - capture return value into a variable called trs
5. Iterate over each tr element
    1. You can retrieve a pair of elements by saying: cells = tr.find_all(["th", "td"])
    2. Explore the length of cells. Notice that some rows do not have exactly 2 cells. Let's skip over those.
    3. Create a dict called stats, where key is the th element's text and the value is td element's text
6. Don't forget to return the stats dict
7. Call state_stats with state_files["Wisconsin"]

In [11]:

def state_stats(path):
    """Extract key/value facts from a saved state HTML page.

    Parameters
    ----------
    path : str
        Path to an HTML file; it is read as UTF-8 text.

    Returns
    -------
    dict
        For every <tr> that contains exactly two th/td cells, maps the text
        of the first cell to the text of the second cell. If the same key
        text occurs in more than one row, the last row wins.
    """
    # The with-block closes the file even if read() raises.
    with open(path, encoding = "utf-8") as f:
        html_string = f.read()

    doc = BeautifulSoup(html_string, "html.parser")
    stats = {}
    for tr in doc.find_all("tr"):
        cells = tr.find_all(["th", "td"])
        # Only rows with exactly two cells are read as a label/value pair.
        if len(cells) == 2:
            key = cells[0].get_text()
            value = cells[1].get_text()
            stats[key] = value
    return stats

# Spot-check the parser on a single state before running it on all of them.
wi_stats = state_stats(state_files["Wisconsin"])
# Indexing raises KeyError if the page has no two-cell row with this label.
print("WI state drink:", wi_stats["Beverage"])
print("WI state dance:", wi_stats["Dance"])

WI state drink: Milk
WI state dance: Polka


## Stage 3 pseudocode continued
- Iterate over all the state files, call state_stats function, and save the return value into a variable.
- Keep track of each state's stats in a dict called states_details
- Create a pandas DataFrame from the state_details dict
- Explore the DataFrame.

In [12]:
# KEY: state name; VALUE: dict of stats parsed from that state's saved page
states_details = {
    name: state_stats(html_path)
    for name, html_path in state_files.items()
}

In [13]:
# Dict of dicts -> DataFrame: the outer keys (state names) become columns and
# the inner keys (row labels from the state pages) become the index.
states_df = pd.DataFrame(states_details)
states_df

Unnamed: 0,Alabama,Alaska,Arizona,Arkansas,California,Colorado,Connecticut,Delaware,Florida,Georgia,...,South Dakota,Tennessee,Texas,Utah,Vermont,Virginia,Washington,West Virginia,Wisconsin,Wyoming
Country,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States,...,United States,United States,United States,United States,United States,United States,United States,United States,United States,United States
Before statehood,Alabama Territory,Territory of Alaska,Arizona Territory,Arkansas Territory,California Republic,,Connecticut Colony,"Delaware Colony, New Netherland, New Sweden",Florida Territory,Province of Georgia,...,Dakota Territory,Southwest Territory,Republic of Texas,Utah Territory,Vermont Republic,Colony of Virginia,Washington Territory,Part of Virginia,Wisconsin Territory,Wyoming Territory
Admitted to the Union,"December 14, 1819 (22nd)","January 3, 1959 (49th)","February 14, 1912 (48th)","June 15, 1836 (25th)","September 9, 1850 (31st)","August 1, 1876 (38th)","January 9, 1788 (5th)","December 7, 1787 (1st)","March 3, 1845 (27th)","January 2, 1788 (4th)",...,"November 2, 1889 (39th or 40th)","June 1, 1796 (16th)","December 29, 1845 (28th)","January 4, 1896 (45th)","March 4, 1791 (14th)","June 25, 1788 (10th)","November 11, 1889 (42nd)","June 20, 1863 (35th)","May 29, 1848 (30th)","July 10, 1890 (44th)"
Capital,Montgomery,Juneau,,,Sacramento,,Hartford[1],Dover,Tallahassee,,...,Pierre,,Austin,,Montpelier,Richmond,Olympia,,Madison,
Largest city,Birmingham,Anchorage,,,Los Angeles,,Bridgeport,Wilmington,Jacksonville,,...,Sioux Falls,,Houston,,Burlington,Virginia Beach,Seattle,,Milwaukee,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Lowest elevation (Lake Michigan[6][7]),,,,,,,,,,,...,,,,,,,,,579 ft (176 m),
East North Central,,,,,,,,,,,...,,,,,,,,,Illinois · Indiana · Michigan · Ohio · Wisconsin,
West North Central,,,,,,,,,,,...,,,,,,,,,Iowa · Kansas · Minnesota · Missouri · Nebrask...,
Highest elevation (Gannett Peak[2][3][4]),,,,,,,,,,,...,,,,,,,,,,"13,809 ft (4,209.1 m)"


In [14]:
# Select the "Capital" row: one value per state column. A state shows NaN when
# its stats dict has no key that is exactly "Capital".
states_df.loc["Capital"]

Alabama                                                  Montgomery
Alaska                                                       Juneau
Arizona                                                         NaN
Arkansas                                                        NaN
California                                               Sacramento
Colorado                                                        NaN
Connecticut                                             Hartford[1]
Delaware                                                      Dover
Florida                                                 Tallahassee
Georgia                                                         NaN
Hawaii                                                          NaN
Idaho                                                           NaN
Illinois                                                        NaN
Indiana                                                         NaN
Iowa                                            