In [36]:
import pandas as pd
import numpy as np
from pathlib import Path
import os
import sys
import csv

fpath = Path.cwd().parent.as_posix()
sys.path.append(fpath)

# Set Up Dictionaries for Converting States to Abbreviations

In [72]:
states_to_abbr = pd.read_csv(f'{fpath}/data/dictionary_maps/states_to_abbr.csv', 
                             names=['State', 'Abbreviation'], 
                             index_col=0).to_dict()['Abbreviation']

abbr_to_states = pd.read_csv(f'{fpath}/data/dictionary_maps/abbr_to_states.csv', 
                             names=['Abbreviation', 'State'], 
                             index_col=0).to_dict()['State']

# Set Up Dictionaries for Converting Counties to Fips

In [47]:
county_state_to_fips = pd.read_csv(f'{fpath}/data/dictionary_maps/county_state_to_fips.csv',
                                    names=['UID', 'fips'],
                                    index_col=0).to_dict()['fips']

fips_to_county_state = pd.read_csv(f'{fpath}/data/dictionary_maps/fips_to_county_state.csv', 
                                   names=['fips', 'UID'], 
                                   index_col=0).to_dict()['UID']

In [62]:
county_state_to_fips

{'Snohomish, WA': 53061.0,
 'Cook, IL': 17031.0,
 'Orange, CA': 6059.0,
 'Maricopa, AZ': 4013.0,
 'Los Angeles, CA': 6037.0,
 'Santa Clara, CA': 6085.0,
 'Suffolk, MA': 25025.0,
 'San Francisco, CA': 6075.0,
 'Dane, WI': 55025.0,
 'San Diego, CA': 6073.0,
 'Bexar, TX': 48029.0,
 'Douglas, NE': 31055.0,
 'Humboldt, CA': 6023.0,
 'Sacramento, CA': 6067.0,
 'Solano, CA': 6095.0,
 'Spokane, WA': 53063.0,
 'Salt Lake, UT': 49035.0,
 'Marin, CA': 6041.0,
 'Napa, CA': 6055.0,
 'Sonoma, CA': 6097.0,
 'Washington, OR': 41067.0,
 'King, WA': 53033.0,
 'Alameda, CA': 6001.0,
 'Hillsborough, FL': 12057.0,
 'Manatee, FL': 12081.0,
 'New York City, NY': nan,
 'Unknown, RI': nan,
 'Placer, CA': 6061.0,
 'San Mateo, CA': 6081.0,
 'Fulton, GA': 13121.0,
 'Norfolk, MA': 25021.0,
 'Grafton, NH': 33009.0,
 'Walla Walla, WA': 53071.0,
 'Contra Costa, CA': 6013.0,
 'Wake, NC': 37183.0,
 'Bergen, NJ': 34003.0,
 'Westchester, NY': 36119.0,
 'Fort Bend, TX': 48157.0,
 'Chelan, WA': 53007.0,
 'Douglas, CO': 803

# Create County Fips Map

In [64]:
df_temp = pd.read_csv(f'{fpath}/data/kaggle_covid_challenge_wk_5/train.csv')
df = df_temp[(df_temp['Country_Region']=='US') & (df_temp['Target']=='ConfirmedCases')].dropna()

In [76]:
df['UID'] = df['County'] + ', ' + df['Province_State'].map(states_to_abbr)
df

Unnamed: 0,Id,County,Province_State,Country_Region,Population,Weight,Date,Target,TargetValue,UID
60500,67761,Autauga,Alabama,US,55869,0.091485,2020-01-23,ConfirmedCases,0.0,"Autauga, AL"
60502,67763,Autauga,Alabama,US,55869,0.091485,2020-01-24,ConfirmedCases,0.0,"Autauga, AL"
60504,67765,Autauga,Alabama,US,55869,0.091485,2020-01-25,ConfirmedCases,0.0,"Autauga, AL"
60506,67767,Autauga,Alabama,US,55869,0.091485,2020-01-26,ConfirmedCases,0.0,"Autauga, AL"
60508,67769,Autauga,Alabama,US,55869,0.091485,2020-01-27,ConfirmedCases,0.0,"Autauga, AL"
60510,67771,Autauga,Alabama,US,55869,0.091485,2020-01-28,ConfirmedCases,0.0,"Autauga, AL"
60512,67773,Autauga,Alabama,US,55869,0.091485,2020-01-29,ConfirmedCases,0.0,"Autauga, AL"
60514,67775,Autauga,Alabama,US,55869,0.091485,2020-01-30,ConfirmedCases,0.0,"Autauga, AL"
60516,67777,Autauga,Alabama,US,55869,0.091485,2020-01-31,ConfirmedCases,0.0,"Autauga, AL"
60518,67779,Autauga,Alabama,US,55869,0.091485,2020-02-01,ConfirmedCases,0.0,"Autauga, AL"


In [83]:
county_state_to_fips.keys()

dict_keys(['Snohomish, WA', 'Cook, IL', 'Orange, CA', 'Maricopa, AZ', 'Los Angeles, CA', 'Santa Clara, CA', 'Suffolk, MA', 'San Francisco, CA', 'Dane, WI', 'San Diego, CA', 'Bexar, TX', 'Douglas, NE', 'Humboldt, CA', 'Sacramento, CA', 'Solano, CA', 'Spokane, WA', 'Salt Lake, UT', 'Marin, CA', 'Napa, CA', 'Sonoma, CA', 'Washington, OR', 'King, WA', 'Alameda, CA', 'Hillsborough, FL', 'Manatee, FL', 'New York City, NY', 'Unknown, RI', 'Placer, CA', 'San Mateo, CA', 'Fulton, GA', 'Norfolk, MA', 'Grafton, NH', 'Walla Walla, WA', 'Contra Costa, CA', 'Wake, NC', 'Bergen, NJ', 'Westchester, NY', 'Fort Bend, TX', 'Chelan, WA', 'Douglas, CO', 'Jefferson, CO', 'Santa Rosa, FL', 'Montgomery, MD', 'Middlesex, MA', 'Clark, NV', 'Washoe, NV', 'Nassau, NY', 'Williamson, TN', 'Harris, TX', 'Pinal, AZ', 'Yolo, CA', 'Denver, CO', 'Eagle, CO', 'El Paso, CO', 'Broward, FL', 'Lee, FL', 'Polk, GA', 'Honolulu, HI', 'Marion, IN', 'Fayette, KY', 'Ramsey, MN', 'Camden, NJ', 'Rockland, NY', 'Chatham, NC', 'Tulsa,

In [81]:
df[df['UID'].map(county_state_to_fips).isna()]

Unnamed: 0,Id,County,Province_State,Country_Region,Population,Weight,Date,Target,TargetValue,UID
77500,86801,Aleutians East,Alaska,US,3337,0.123257,2020-01-23,ConfirmedCases,0.0,"Aleutians East, AK"
77502,86803,Aleutians East,Alaska,US,3337,0.123257,2020-01-24,ConfirmedCases,0.0,"Aleutians East, AK"
77504,86805,Aleutians East,Alaska,US,3337,0.123257,2020-01-25,ConfirmedCases,0.0,"Aleutians East, AK"
77506,86807,Aleutians East,Alaska,US,3337,0.123257,2020-01-26,ConfirmedCases,0.0,"Aleutians East, AK"
77508,86809,Aleutians East,Alaska,US,3337,0.123257,2020-01-27,ConfirmedCases,0.0,"Aleutians East, AK"
77510,86811,Aleutians East,Alaska,US,3337,0.123257,2020-01-28,ConfirmedCases,0.0,"Aleutians East, AK"
77512,86813,Aleutians East,Alaska,US,3337,0.123257,2020-01-29,ConfirmedCases,0.0,"Aleutians East, AK"
77514,86815,Aleutians East,Alaska,US,3337,0.123257,2020-01-30,ConfirmedCases,0.0,"Aleutians East, AK"
77516,86817,Aleutians East,Alaska,US,3337,0.123257,2020-01-31,ConfirmedCases,0.0,"Aleutians East, AK"
77518,86819,Aleutians East,Alaska,US,3337,0.123257,2020-02-01,ConfirmedCases,0.0,"Aleutians East, AK"


# Pull Data From NY Times, Add FIPS, and Compare to the Assigned FIPS value

In [11]:
df_ny_times = pd.read_csv('https://raw.githubusercontent.com/nytimes/covid-19-data/master/us-counties.csv')
df_ny_times['UID'] = df_ny_times['county'] + ' County, ' + df_ny_times['state'].map(inv_states)
df_ny_times['FIPS'] = df_ny_times['UID'].map(fips_map)
df_ny_times

Unnamed: 0,date,county,state,fips,cases,deaths,UID,FIPS
0,2020-01-21,Snohomish,Washington,53061.0,1,0,"Snohomish County, WA",53061
1,2020-01-22,Snohomish,Washington,53061.0,1,0,"Snohomish County, WA",53061
2,2020-01-23,Snohomish,Washington,53061.0,1,0,"Snohomish County, WA",53061
3,2020-01-24,Cook,Illinois,17031.0,1,0,"Cook County, IL",17031
4,2020-01-24,Snohomish,Washington,53061.0,1,0,"Snohomish County, WA",53061
5,2020-01-25,Orange,California,6059.0,1,0,"Orange County, CA",06059
6,2020-01-25,Cook,Illinois,17031.0,1,0,"Cook County, IL",17031
7,2020-01-25,Snohomish,Washington,53061.0,1,0,"Snohomish County, WA",53061
8,2020-01-26,Maricopa,Arizona,4013.0,1,0,"Maricopa County, AZ",04013
9,2020-01-26,Los Angeles,California,6037.0,1,0,"Los Angeles County, CA",06037


In [12]:
# Do they match?

np.sum(df_ny_times['fips'] - df_ny_times['FIPS'].astype(float)) # yes, yes they do!

0.0

In [84]:
pd.read_html("https://www.nrcs.usda.gov/wps/portal/nrcs/detail/national/home/?cid=nrcs143_013697")

[          0              1      2
 0      FIPS           Name  State
 1     01001        Autauga     AL
 2     01003        Baldwin     AL
 3     01005        Barbour     AL
 4     01007           Bibb     AL
 5     01009         Blount     AL
 6     01011        Bullock     AL
 7     01013         Butler     AL
 8     01015        Calhoun     AL
 9     01017       Chambers     AL
 10    01019       Cherokee     AL
 11    01021        Chilton     AL
 12    01023        Choctaw     AL
 13    01025         Clarke     AL
 14    01027           Clay     AL
 15    01029       Cleburne     AL
 16    01031         Coffee     AL
 17    01033        Colbert     AL
 18    01035        Conecuh     AL
 19    01037          Coosa     AL
 20    01039      Covington     AL
 21    01041       Crenshaw     AL
 22    01043        Cullman     AL
 23    01045           Dale     AL
 24    01047         Dallas     AL
 25    01049        De Kalb     AL
 26    01051         Elmore     AL
 27    01053       E

In [110]:
import bs4 as bs
import urllib.request

sauce = urllib.request.urlopen("https://www.nrcs.usda.gov/wps/portal/nrcs/detail/national/home/?cid=nrcs143_013697")
soup = bs.BeautifulSoup(sauce, 'lxml')
table = soup.find('table', {'class': 'data'})
df_fips = pd.read_html(str(table), header=0)[0]

In [112]:
df_fips.to_csv(f'{fpath}/data/dictionary_maps/fips_master.csv', index=False)

In [113]:
len(df_fips)

3232