In [None]:
"""

Get data for features from each of the US county Wikipedia pages
Loop over list of URLs of Wikipedia pages for each US county 

features to collect:
major highways
city or cities
town or towns

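output: county_features.pickle, plus review_features_scrape.csv for manual review

Run time: about 1 hour (one request per county page, with a 1-second pause between requests)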

"""

In [1]:
from bs4 import BeautifulSoup
import requests

import numpy as np
import pandas as pd

import pickle
import time

import regex as re

from datetime import datetime 

In [2]:
# Load the table of county Wikipedia links from its pickle file.
# NOTE: unpickling can execute arbitrary code, so only load a file you trust.
with open('county_wiki_links.pickle', mode='rb') as links_file:
    county_links = pickle.load(links_file)

In [3]:
# Preview the first five rows of the links table.
county_links.head(n=5)

Unnamed: 0,state,FIPS_code,county_name,wiki_link
0,Alabama,1,"Autauga County, Alabama","/wiki/Autauga_County,_Alabama"
1,Alabama,3,"Baldwin County, Alabama","/wiki/Baldwin_County,_Alabama"
2,Alabama,5,"Barbour County, Alabama","/wiki/Barbour_County,_Alabama"
3,Alabama,7,"Bibb County, Alabama","/wiki/Bibb_County,_Alabama"
4,Alabama,9,"Blount County, Alabama","/wiki/Blount_County,_Alabama"


In [7]:
# Number of rows in the links table, i.e. how many pages will be requested.
len(county_links.index)

3094

In [42]:
# Site root that each relative wiki_link is appended to when building a page URL.
base_url = "https://en.wikipedia.org"


In [70]:
# Print the wall-clock time (HH:MM:SS) just before the scraping loop.
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print("Current Time =", current_time)

Current Time = 16:03:18


In [60]:
# Scrape the highway, city and town listings from every county's Wikipedia page.
# Each county that is processed without error contributes one dict to cnty_data.
# A section that cannot be located is recorded as an empty list with a count of 0.

# Column names of one scraped record (loop-invariant, so built once).
headers = ['state', 'county_name', 'FIPS_code', 'county_page_url', 'highways', 'hwy_count',
           'cities', 'city_count', 'towns', 'town_count']

# Patterns matched against element ids to locate each section heading.
highway_id = re.compile('highway')
city_id = re.compile(r"(\bCity\b|\bCities\b)")
town_id = re.compile(r"(\bTown\b|\bTowns\b)")

cnty_data = []

# Iterate the columns in lockstep rather than indexing by label with a
# positional counter, which only works when the index is 0..n-1.
for state, county_name, FIPS_code, wiki_link in zip(county_links['state'],
                                                    county_links['county_name'],
                                                    county_links['FIPS_code'],
                                                    county_links['wiki_link']):

    county_page_url = base_url + wiki_link

    try:
        # The timeout stops a single stalled connection from hanging the run;
        # fetching inside the try means one failed request is reported and
        # skipped instead of aborting the whole loop.
        response = requests.get(county_page_url, timeout=30)
        page = response.text
        # NOTE(review): no parser is named, so bs4 uses whichever one is
        # installed; consider pinning one so results do not vary by machine.
        soup = BeautifulSoup(page)

        # In the lookups below, a missing heading, list or link makes find()
        # or findNext() return None, so the next attribute access raises
        # AttributeError; that is the expected "section not present" signal.
        try:
            highway_find = soup.find(id=highway_id).findNext('ul').find_all('a')
            highway_list = [x.text for x in highway_find]
            # remove blanks for the hwy sign images
            highways = [name for name in highway_list if len(name)]
        except AttributeError:
            highways = []
        hwy_count = len(highways)

        try:
            city_list = soup.find(id=city_id).findNext('ul').find_all('li')
            cities = [x.find('a').text for x in city_list]
        except AttributeError:
            cities = []
        # Always derived from this county's list, so a missing section can
        # never carry over the previous county's count.
        city_count = len(cities)

        try:
            town_list = soup.find(id=town_id).findNext('ul').find_all('li')
            towns = [x.find('a').text for x in town_list]
        except AttributeError:
            towns = []
        town_count = len(towns)

        features_dict = dict(zip(headers, [state, county_name, FIPS_code, county_page_url, highways,
                                           hwy_count, cities, city_count, towns, town_count]))

        cnty_data.append(features_dict)

    except Exception as err:
        # Best effort: report the page that failed and move on to the next one.
        # Exception (not a bare except) lets Ctrl-C still stop the run.
        print('ERROR with URL: ', county_page_url, err)

    # Pause between requests, including after a failure.
    time.sleep(1)
        


In [61]:
# Print the wall-clock time (HH:MM:SS) just after the scraping loop.
now = datetime.now()
current_time = format(now, "%H:%M:%S")
print("Current Time =", current_time)

Current Time = 17:06:59


In [63]:
# Number of records collected by the scraping loop.
len(cnty_data)

3094

In [64]:
# Turn the list of per-county dicts into a DataFrame (one row per record).
county_features = pd.DataFrame(data=cnty_data)


In [65]:
# Row count of the assembled features table.
len(county_features.index)

3094

In [66]:
# Preview the first five rows of the scraped features.
county_features.head(n=5)

Unnamed: 0,state,county_name,FIPS_code,county_page_url,highways,hwy_count,cities,city_count,towns,town_count
0,Alabama,"Autauga County, Alabama",1,"https://en.wikipedia.org/wiki/Autauga_County,_...","[Interstate 65, U.S. Highway 31, U.S. Highway ...",6,"[Millbrook, Prattville]",2,"[Autaugaville, Billingsley]",2
1,Alabama,"Baldwin County, Alabama",3,"https://en.wikipedia.org/wiki/Baldwin_County,_...","[Interstate 10, Interstate 65, U.S. Highway 31...",12,"[Bay Minette, Daphne, Fairhope, Foley, Gulf Sh...",8,"[Elberta, Loxley, Magnolia Springs, Perdido Be...",6
2,Alabama,"Barbour County, Alabama",5,"https://en.wikipedia.org/wiki/Barbour_County,_...","[U.S. Highway 82, U.S. Highway 431, State Rout...",11,"[Clio, Eufaula]",2,"[Bakerhill, Blue Springs, Clayton, Louisville]",4
3,Alabama,"Bibb County, Alabama",7,"https://en.wikipedia.org/wiki/Bibb_County,_Ala...","[U.S. Highway 11, U.S. Highway 82, State Route...",8,"[Brent, Centreville]",2,"[Vance, West Blocton, Woodstock]",3
4,Alabama,"Blount County, Alabama",9,"https://en.wikipedia.org/wiki/Blount_County,_A...","[Interstate 65, U.S. Highway 31, U.S. Highway ...",9,"[Oneonta, Warrior]",2,"[Allgood, Altoona, Blountsville, Cleveland, Co...",14


In [67]:
# Write every row to a CSV (without the index column) for manual review.
county_features.to_csv(path_or_buf='review_features_scrape.csv', index=False)

In [68]:
# Persist the features DataFrame as a pickle file.
with open('county_features.pickle', mode='wb') as features_file:
    pickle.dump(county_features, features_file)