This script loads in area (in square meters and square miles) of all counties in the US using the 2020 county definitions 

more info on this data here: https://www.census.gov/quickfacts/fact/note/US/LND110220

and here: https://www.census.gov/programs-surveys/geography/technical-documentation/records-layout/gaz-record-layouts.html

data dict: https://www.census.gov/programs-surveys/geography/technical-documentation/records-layout/gaz-record-layouts.2020.html#list-tab-1913338080

data source download link: https://www.census.gov/geographies/reference-files/time-series/geo/gazetteer-files.2020.html

In this script, 

GEOID = State fips code and county fips code

ALAND = land area of the county in square meters

ALAND_SQMI = land area of the county in square miles

In [1]:
import pandas as pd
import numpy as np
import zipfile
import requests, zipfile, io
import os


In [2]:
#download url for each neccesary file 

urls = [ 'https://www2.census.gov/geo/docs/maps-data/data/gazetteer/county2k.zip', 
       'https://www2.census.gov/geo/docs/maps-data/data/gazetteer/Gaz_counties_national.zip', 
       'https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2012_Gazetteer/2012_Gaz_counties_national.zip', 
       'https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2013_Gazetteer/2013_Gaz_counties_national.zip', 
       'https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2014_Gazetteer/2014_Gaz_counties_national.zip', 
       'https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2015_Gazetteer/2015_Gaz_counties_national.zip', 
       'https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2016_Gazetteer/2016_Gaz_counties_national.zip', 
       'https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2017_Gazetteer/2017_Gaz_counties_national.zip', 
       'https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2018_Gazetteer/2018_Gaz_counties_national.zip', 
       'https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2019_Gazetteer/2019_Gaz_counties_national.zip', 
       'https://www2.census.gov/geo/docs/maps-data/data/gazetteer/2020_Gazetteer/2020_Gaz_counties_national.zip'] 



In [3]:
#downloads all context for all zip files into the data folder
for i in range(len(urls)):
    r = requests.get(urls[i])
    z = zipfile.ZipFile(io.BytesIO(r.content))
    z.extractall("C:/Users/hrowe/Documents/FHWA mobility trend report/T4 - Forecasting/Year 2/modeling code/etl/data")

In [4]:
#stores the name of all the files in the data folder
file_names = os.listdir('./data')

In [5]:
file_names

['2012_Gaz_counties_national.txt',
 '2013_Gaz_counties_national.txt',
 '2014_Gaz_counties_national.txt',
 '2015_Gaz_counties_national.txt',
 '2016_Gaz_counties_national.txt',
 '2017_Gaz_counties_national.txt',
 '2018_Gaz_counties_national.txt',
 '2019_Gaz_counties_national.txt',
 '2020_Gaz_counties_national.txt',
 'county2k.txt',
 'Gaz_counties_national.txt']

In [6]:
def read_in(file_name, year):
    data = pd.read_csv('data/' + file_name, sep='\t', encoding='latin')
    data['GEOID'] = data['GEOID'].astype(str)    #convert GEOID to string
    
    #make sure all FIPS codes are 5 digits long 
    for i in range(len(data)):
        if (len(data.iloc[i,1]) < 5):
            data.iloc[i,1] = data.iloc[i,1].zfill(5)
    
    #create new column for year        
    data['YEAR'] = year
    return(data)

In [7]:
df_2012 = read_in(file_names[0], 2012)
df_2013 = read_in(file_names[1], 2013)
df_2014 = read_in(file_names[2], 2014)
df_2015 = read_in(file_names[3], 2015)
df_2016 = read_in(file_names[4], 2016)
df_2017 = read_in(file_names[5], 2017)
df_2018 = read_in(file_names[6], 2018)
df_2019 = read_in(file_names[7], 2019)
df_2020 = read_in(file_names[8], 2020)
df_2010 = read_in(file_names[10], 2010)

In [8]:
with open("data/county2k.txt") as file:
    lines = [line for line in file]

In [9]:
#lines
lines[0][2:7]

'01001'

In [10]:
lines[0][90:103]

'   1543550050'

In [27]:
lines[219][90:103]

'  51935723751'

In [24]:
lines[219][117:129]

'20052.495900'

In [25]:
lines[0][117:129]

'  595.968032'

In [12]:
lines

['AL01001Autauga County                                                      43671    17662    1543550050      21959029  595.968032    8.478429 32.523283 -86.577176\n',
 'AL01003Baldwin County                                                     140415    74285    4134518782    1115203109 1596.346694  430.582346 30.592781 -87.748260\n',
 'AL01005Barbour County                                                      29038    12461    2291889572      50799584  884.903549   19.613830 31.856515 -85.331312\n',
 'AL01007Bibb County                                                         20826     8345    1613627425       8124469  623.025058    3.136875 33.040054 -87.123243\n',
 'AL01009Blount County                                                       51024    21158    1672057889      12998871  645.585188    5.018892 33.978461 -86.554768\n',
 'AL01011Bullock County                                                      11714     4727    1618775183       2703063  625.012619    1.043659 32.098285 -

In [28]:
#needed a unique solution for 2000 file (county2k.txt)
GEOID = []
ALAND = []
ALAND_SQMI = []
YEAR = []


for line in lines:
    GEOID.append(line[2:7])
    ALAND.append(line[90:103])
    ALAND_SQMI.append(line[117:129])
    YEAR.append(2000)

In [29]:
zipped = list(zip(GEOID, ALAND, ALAND_SQMI, YEAR))

df_2000 = pd.DataFrame(zipped, columns=['GEOID', 'ALAND', 'ALAND_SQMI', 'YEAR'])
df_2000 

Unnamed: 0,GEOID,ALAND,ALAND_SQMI,YEAR
0,01001,1543550050,595.968032,2000
1,01003,4134518782,1596.346694,2000
2,01005,2291889572,884.903549,2000
3,01007,1613627425,623.025058,2000
4,01009,1672057889,645.585188,2000
...,...,...,...,...
3214,72145,118858686,45.891597,2000
3215,72147,131645368,50.828563,2000
3216,72149,91798136,35.443460,2000
3217,72151,143105464,55.253331,2000


for 2011 -->  apply 2010 values

for 2001 --> 2009, apply 2000 values 

In [31]:
df_2011 = df_2010.assign(YEAR = 2011)
df_2001 = df_2000.assign(YEAR = 2001)
df_2002 = df_2000.assign(YEAR = 2002)
df_2003 = df_2000.assign(YEAR = 2003)
df_2004 = df_2000.assign(YEAR = 2004)
df_2005 = df_2000.assign(YEAR = 2005)
df_2006 = df_2000.assign(YEAR = 2006)
df_2007 = df_2000.assign(YEAR = 2007)
df_2008 = df_2000.assign(YEAR = 2008)
df_2009 = df_2000.assign(YEAR = 2009)

In [32]:
#concat all df together
pdList = [df_2000, df_2001, df_2002, df_2003, df_2004, df_2005, df_2006, df_2007, df_2008, df_2009,
          df_2010, df_2011, df_2012, df_2013, df_2014, df_2015, df_2016, df_2017, df_2018, df_2019, df_2020]  # List of dataframes
df_merged = pd.concat(pdList)

In [33]:
df_merged = df_merged[['GEOID','YEAR' ,'ALAND_SQMI']]

In [34]:
df_merged 

Unnamed: 0,GEOID,YEAR,ALAND_SQMI
0,01001,2000,595.968032
1,01003,2000,1596.346694
2,01005,2000,884.903549
3,01007,2000,623.025058
4,01009,2000,645.585188
...,...,...,...
3216,72145,2020,45.848
3217,72147,2020,50.788
3218,72149,2020,35.637
3219,72151,2020,55.215


In [40]:
df_merged.to_csv('year2_indicators/county_area.csv')

Becasue there is no file for 2011, checking to how different ALAND_SQMI is for 2010 and 2012. There is only a slight difference between the years. Going to assign the 2010 values for the year 2011.

In [18]:
df_merged.loc[ (df_merged['GEOID'] == '01001') & (df_merged['YEAR'] < 2013 ) ]

Unnamed: 0,GEOID,YEAR,ALAND_SQMI
0,1001,2000,595.968032
0,1001,2001,595.968032
0,1001,2002,595.968032
0,1001,2003,595.968032
0,1001,2004,595.968032
0,1001,2005,595.968032
0,1001,2006,595.968032
0,1001,2007,595.968032
0,1001,2008,595.968032
0,1001,2009,595.968032


In [20]:
df_merged.loc[(df_merged['GEOID'] == '01001')]

Unnamed: 0,GEOID,YEAR,ALAND_SQMI
0,1001,2000,595.968032
0,1001,2001,595.968032
0,1001,2002,595.968032
0,1001,2003,595.968032
0,1001,2004,595.968032
0,1001,2005,595.968032
0,1001,2006,595.968032
0,1001,2007,595.968032
0,1001,2008,595.968032
0,1001,2009,595.968032


Now I want to check where the NA values are distributed for each year and each county

In [52]:
df_merged.dtypes

GEOID         object
YEAR           int64
ALAND_SQMI    object
dtype: object

In [54]:
#convert ALAND_SQMI to numeric
df_merged['ALAND_SQMI'] = pd.to_numeric(df_merged['ALAND_SQMI'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_merged['ALAND_SQMI'] = pd.to_numeric(df_merged['ALAND_SQMI'])


No NAs or zeros in the dataset

In [37]:
df_merged[df_merged['GEOID'] == '06071']

Unnamed: 0,GEOID,YEAR,ALAND_SQMI
219,6071,2000,20052.4959
219,6071,2001,20052.4959
219,6071,2002,20052.4959
219,6071,2003,20052.4959
219,6071,2004,20052.4959
219,6071,2005,20052.4959
219,6071,2006,20052.4959
219,6071,2007,20052.4959
219,6071,2008,20052.4959
219,6071,2009,20052.4959


In [55]:
df_merged.isna().sum()

GEOID         0
YEAR          0
ALAND_SQMI    0
dtype: int64

In [59]:
df_merged[(df_merged['ALAND_SQMI'] == 0)]

Unnamed: 0,GEOID,YEAR,ALAND_SQMI


In [42]:
df_merged[df_merged['GEOID'].startswith("01")]

AttributeError: 'Series' object has no attribute 'startswith'