In [None]:
"""

Scrape the Wikipedia page on counties in the United States:
https://en.wikipedia.org/wiki/County_(United_States)

1. Collect each state's (or territory's) name together with the link to its
   Wikipedia page, and write the result to a pickle file.

2. Collect the table of county aggregates by state and write it to a second
   pickle file.

"""

In [1]:
from bs4 import BeautifulSoup
import requests

import numpy as np
import pandas as pd

import pickle

In [2]:
# Wikipedia article that holds both the per-state links and the county
# summary table scraped below.
url = 'https://en.wikipedia.org/wiki/County_(United_States)'

# Without a timeout, requests can wait indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

# Stop here on an HTTP error status instead of parsing an error page later on.
response.raise_for_status()

In [3]:
# Show the HTTP status code returned for the request; 200 means success.
response.status_code

200

In [4]:
# Peek at the first 1000 characters of the returned HTML.
response.text[:1000]

'<!DOCTYPE html>\n<html class="client-nojs" lang="en" dir="ltr">\n<head>\n<meta charset="UTF-8"/>\n<title>County (United States) - Wikipedia</title>\n<script>document.documentElement.className="client-js";RLCONF={"wgBreakFrames":!1,"wgSeparatorTransformTable":["",""],"wgDigitTransformTable":["",""],"wgDefaultDateFormat":"dmy","wgMonthNames":["","January","February","March","April","May","June","July","August","September","October","November","December"],"wgRequestId":"YAHX8wpAADsAAGmDOB4AAAAL","wgCSPNonce":!1,"wgCanonicalNamespace":"","wgCanonicalSpecialPageName":!1,"wgNamespaceNumber":0,"wgPageName":"County_(United_States)","wgTitle":"County (United States)","wgCurRevisionId":1000221236,"wgRevisionId":1000221236,"wgArticleId":88366,"wgIsArticle":!0,"wgIsRedirect":!1,"wgAction":"view","wgUserName":null,"wgUserGroups":["*"],"wgCategories":["All articles with dead external links","Articles with dead external links from April 2020","Webarchive template wayback links","Articles with dead e

In [5]:
# Keep the response body (the page's HTML as text) under a shorter name.
page = response.text

In [6]:
# Name the parser explicitly: with no parser argument Beautiful Soup picks
# whichever one happens to be installed (and warns about it), so the parse
# tree can differ between machines. 'html.parser' ships with Python.
soup = BeautifulSoup(page, 'html.parser')

In [10]:
#soup

In [11]:
#print(soup.prettify())

In [19]:
# Every element on the page that carries the "flagicon" class.
table = soup.find_all(attrs={'class': 'flagicon'})


In [13]:
# Keys used for each record collected below.
headers = ['state', 'wiki_link']

state_pages = []

for st in table:
    # Look the flag's anchor up once and reuse it for both fields.
    flag_link = st.find('a')
    if flag_link is None:
        # A flag icon with no anchor has neither a title nor a following
        # link to read, so skip it rather than fail on None.
        continue

    # find_next is the current spelling of the deprecated findNext alias; it
    # returns the first <a> that follows the flag's anchor in the document.
    page_link = flag_link.find_next('a')
    if page_link is None:
        continue

    # NOTE(review): the name comes from the flag anchor's title, not from the
    # text of the page link. In the recorded output this yields
    # "Washington (state)" and labels the U.S. Minor Outlying Islands row as
    # "United States" -- confirm that is acceptable downstream.
    state = flag_link.get('title')
    state_link = page_link.get('href')

    state_pages.append(dict(zip(headers, [state, state_link])))


In [18]:
#state_pages

In [15]:
# put the list of links into a df
df_state_links = pd.DataFrame(state_pages)

df_state_links.head()

Unnamed: 0,state,wiki_link
0,Alabama,/wiki/List_of_counties_in_Alabama
1,Alaska,/wiki/List_of_boroughs_and_census_areas_in_Alaska
2,Arizona,/wiki/List_of_counties_in_Arizona
3,Arkansas,/wiki/List_of_counties_in_Arkansas
4,California,/wiki/List_of_counties_in_California


In [16]:
# Number of rows (states and territories) collected.
df_state_links.shape[0]

57

In [17]:
# Inspect the last ten rows, which is where the territories appear.
df_state_links.tail(n=10)

Unnamed: 0,state,wiki_link
47,Washington (state),/wiki/List_of_counties_in_Washington
48,West Virginia,/wiki/List_of_counties_in_West_Virginia
49,Wisconsin,/wiki/List_of_counties_in_Wisconsin
50,Wyoming,/wiki/List_of_counties_in_Wyoming
51,American Samoa,/wiki/Administrative_divisions_of_American_Samoa
52,Guam,/wiki/List_of_populated_places_in_Guam
53,Northern Mariana Islands,/wiki/Northern_Mariana_Islands#Administrative_...
54,Puerto Rico,/wiki/Municipalities_of_Puerto_Rico
55,United States,/wiki/U.S._Minor_Outlying_Islands
56,United States Virgin Islands,/wiki/United_States_Virgin_Islands


In [20]:
# Serialize the state/link DataFrame to a pickle file in the working directory.
with open('state_wiki_links.pickle', 'wb') as pickle_file:
    pickle.dump(df_state_links, pickle_file)

In [None]:
# For each state and territory listed, scrape these fields from the page's table:

# state
# 2019 total population
# land area
# counties
# equivalents
# total counties + equivalents
# avg population 
# avg land area 

In [17]:
# Locate the first element whose class is 'wikitable sortable', then gather
# every <td> cell nested inside it.
summary_table = soup.find(class_='wikitable sortable')
table_data = summary_table.find_all('td')

In [18]:
# Reduce each table cell to its text, trimmed of surrounding whitespace.
state_data = [cell.get_text().strip() for cell in table_data]

In [25]:
# Count the extracted cell strings; the total has to divide evenly into
# 8-column rows for the reshape that follows.
len(state_data)

472

In [19]:
# The flat list holds the table cells row by row, 8 cells per row (one per
# column title assigned further down). Passing -1 lets NumPy infer the row
# count instead of pinning it to 59, so this cell keeps working when a row is
# added to or removed from the Wikipedia table. reshape still raises
# ValueError if the number of cells is not a multiple of 8.
state_data_array = np.array(state_data).reshape(-1, 8)

In [28]:
# Wrap the 2-D array in a DataFrame; the columns carry default integer labels
# at this point.
df_st_data = pd.DataFrame(data=state_data_array)

# Preview the first five rows.
df_st_data.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7
0,Alabama,4903185,"50,645 sq mi131,171 km2",67,—,67,73182,"756 sq mi1,958 km2"
1,Alaska[g],731545,"570,641 sq mi1,477,953 km2",—,30,30,24385,"19,677 sq mi50,964 km2"
2,Arizona,7278717,"113,594 sq mi294,207 km2",15,—,15,485248,"7,573 sq mi19,614 km2"
3,Arkansas,3017825,"52,035 sq mi134,771 km2",75,—,75,40238,"694 sq mi1,797 km2"
4,California,39512223,"155,779 sq mi403,466 km2",58,—,58,681245,"2,686 sq mi6,956 km2"


In [38]:
# Descriptive headings for the eight columns of the county summary table,
# listed in column order.
col_titles = [
    'State, federal district, or territory',
    'Total 2019 population',
    'Total Land area',
    'Counties',
    'Equivalents',
    'Total',
    'Average Population',
    'Average Land area',
]

# Replace the default integer column labels with the descriptive titles.
# This modifies df_st_data in place.
df_st_data.columns = col_titles

In [39]:
# Preview the first five rows again, now with the descriptive column titles.
df_st_data.head(5)

Unnamed: 0,"State, federal district, or territory",Total 2019 population,Total Land area,Counties,Equivalents,Total,Average Population,Average Land area
0,Alabama,4903185,"50,645 sq mi131,171 km2",67,—,67,73182,"756 sq mi1,958 km2"
1,Alaska[g],731545,"570,641 sq mi1,477,953 km2",—,30,30,24385,"19,677 sq mi50,964 km2"
2,Arizona,7278717,"113,594 sq mi294,207 km2",15,—,15,485248,"7,573 sq mi19,614 km2"
3,Arkansas,3017825,"52,035 sq mi134,771 km2",75,—,75,40238,"694 sq mi1,797 km2"
4,California,39512223,"155,779 sq mi403,466 km2",58,—,58,681245,"2,686 sq mi6,956 km2"


In [41]:
# Serialize the county summary DataFrame to a pickle file in the working
# directory.
with open('state_wiki_data.pickle', 'wb') as pickle_file:
    pickle.dump(df_st_data, pickle_file)