# Pulling FIPS codes from website

### Libraries

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import pickle
import feather

### Processing site

In [2]:
# NRCS page whose data table lists the FIPS codes
site = (
    'https://www.nrcs.usda.gov/wps/portal/nrcs/detail/'
    '?cid=nrcs143_013697'
)

In [3]:
# Fetch the page. Without a timeout requests can block forever on a stalled
# connection, and without raise_for_status() an HTTP error page would be
# parsed as if it were the data page.
r = requests.get(site, timeout=30)
r.raise_for_status()

In [4]:
r.content[:2000]  # peek at the first 2000 raw bytes: lots of EOL characters

b'\r\n\r\n\r\n\r\n \r\n  \r\n\r\n  \r\n\r\n <!DOCTYPE html>\r\n   \r\n\r\n \r\n             \r\n    \r\n      \r\n    \r\n   \r\n \r\n \r\n \r\n   \r\n \r\n \r\n \r\n \r\n \r\n     \r\n\r\n      \r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n\r\n                \r\n \r\n\r\n\r\n\r\n                                                  \r\n                                                                                                                                                                                                      \r\n                                                                                                                                                                                                                                                                                                                                                                                           \r\n                                  \r\n                                   

In [5]:
# Parse the raw response bytes using Python's html.parser backend
soup = BeautifulSoup(r.content, features='html.parser')

In [6]:
# Locate the first <table class="data">. find() returns None when nothing
# matches, so stop here with a clear message instead of failing later
# inside the table parser.
table = soup.find('table', {"class" : "data"})
if table is None:
    raise ValueError('No <table class="data"> found on the page; the site layout may have changed')

In [7]:
def tableDataText(table):
    """
    Convert an HTML table element into a list of rows of cell text.

    Adapted from eusoubrasileiro on stackoverflow
    https://stackoverflow.com/questions/2935658/beautifulsoup-get-the-contents-of-a-specific-table

    `table` is an element exposing ``find_all`` (e.g. the BeautifulSoup tag
    for a <table>) that contains <tr> rows with inner <th>/<td> cells.

    Returns a list with one inner list of stripped cell strings per row.
    If the first row has <th> cells, their text becomes the first entry
    (the header row). Every remaining row contributes the text of its
    <td> cells only. A table without any <tr> yields an empty list.
    """
    def rowgetDataText(tr, coltag='td'):  # td (data) or th (header)
        return [cell.get_text(strip=True) for cell in tr.find_all(coltag)]

    trs = table.find_all('tr')
    if not trs:
        # Nothing to parse; indexing trs[0] below would raise IndexError
        return []

    rows = []
    headerow = rowgetDataText(trs[0], 'th')
    if headerow:
        # The first row is a header: emit it once and skip it as a data row
        rows.append(headerow)
        trs = trs[1:]
    rows.extend(rowgetDataText(tr, 'td') for tr in trs)
    return rows

In [8]:
# Parse the table into rows of text, then preview the first five rows
list_table_data = tableDataText(table)
list_table_data[:5]

[['FIPS', 'Name', 'State'],
 ['01001', 'Autauga', 'AL'],
 ['01003', 'Baldwin', 'AL'],
 ['01005', 'Barbour', 'AL'],
 ['01007', 'Bibb', 'AL']]

In [9]:
# The first parsed row supplies the column labels; the rest are the records
df_fips = pd.DataFrame(data=list_table_data[1:], columns=list_table_data[0])
df_fips.head()  # matches with site table head

Unnamed: 0,FIPS,Name,State
0,1001,Autauga,AL
1,1003,Baldwin,AL
2,1005,Barbour,AL
3,1007,Bibb,AL
4,1009,Blount,AL


In [10]:
df_fips.tail()  # last five rows; matches with site table tail

Unnamed: 0,FIPS,Name,State
3227,72151,Yabucoa,PR
3228,72153,Yauco,PR
3229,78010,St. Croix,VI
3230,78020,St. John,VI
3231,78030,St. Thomas,VI


In [11]:
# Show the dtype of each column (all three display as object)
df_fips.dtypes

FIPS     object
Name     object
State    object
dtype: object

In [12]:
# Relabel the columns by position: FIPS -> fips, Name -> county, State -> state
df_fips.columns = ['fips','county','state']

In [13]:
# Persist the DataFrame as a pickle. The context manager guarantees the file
# handle is flushed and closed, even if dump() raises.
with open('df_fips.pkl', 'wb') as pkl_file:
    pickle.dump(df_fips, pkl_file)

In [16]:
# Also save the same DataFrame in Feather format as fips_codes.feather
feather.write_dataframe(df_fips,'fips_codes.feather')