In [2]:
"""
This script is used to extract the 2000 - 2020 BLS QCEW Industry files for NAICS code 493 (General warehouse and storage) for all US counties. 
There is a significant memory cost to pull this data. 
"""
import pandas as pd
import os
import sys
from datetime import datetime
import yaml
import time
import requests
import urllib
import zipfile
import pprint
from tqdm import tqdm
from glob import glob
import urllib.request
#from platforms.connect.snowpy import SnowPy

Key notes about the data: 
If a county does not the estabishment counts are truly zero, they are not reported 

A dash "-" means the macro cell field (data point combining area/ownership/industry) does not exist for that specific quarter and year.

Suppressed data fields are published with an "N" in the disclosure code field. Only establishment counts are disclosed for these cells, based on approval from this Federal Register Notice, while all other data items for the cell are suppressed (zero-filled).


https://data.bls.gov/cew/doc/access/csv_data_slices.htm#ANNUAL_LAYOUT

https://data.bls.gov/cew/doc/access/data_access_examples.htm#PYTHON

https://www.bls.gov/cew/downloadable-data-files.htm

https://www.bls.gov/cew/questions-and-answers.htm -> suppression addressed in questions 13 & 14


Notes:
BLS does not disclose their detailed methodology for suppression because they want to prevent anyone from reverse-engineering the data in order to get to the suppressed numbers. So basically, we don't really know all the little things they are doing to protect confidentiality.

Primary suppression (dubbed the 80/3 rule) occurs when one of the following conditions is true:

1. There are fewer than three establishments in the given industry for a geographic area.
2. One firm constitutes more than 80 percent of area employment in a given industry

For post 2013, the indsutry csv file for NAICS 493 is accesible by a single url

In [3]:
#URL structure for the csv files from 2014 on: http://data.bls.gov/cew/data/api/[YEAR]/1/area/[NAICS CODE].csv
#replace '1' with any quarter you want or 'a' for annual averages 
#example URL: http://data.bls.gov/cew/data/api/2017/1/area/US000.csv
#Source: https://data.bls.gov/cew/doc/access/csv_data_slices.htm#ANNUAL_LAYOUT

df_2020 = pd.read_csv('https://data.bls.gov/cew/data/api/2020/a/industry/493.csv')
df_2019 = pd.read_csv('https://data.bls.gov/cew/data/api/2019/a/industry/493.csv')
df_2018 = pd.read_csv('https://data.bls.gov/cew/data/api/2018/a/industry/493.csv')
df_2017 = pd.read_csv('https://data.bls.gov/cew/data/api/2017/a/industry/493.csv')
df_2016 = pd.read_csv('https://data.bls.gov/cew/data/api/2016/a/industry/493.csv')
df_2015 = pd.read_csv('https://data.bls.gov/cew/data/api/2015/a/industry/493.csv')
df_2014 = pd.read_csv('https://data.bls.gov/cew/data/api/2014/a/industry/493.csv')

In [4]:
df_2015

Unnamed: 0,area_fips,own_code,industry_code,agglvl_code,size_code,year,qtr,disclosure_code,annual_avg_estabs,annual_avg_emplvl,...,oty_total_annual_wages_chg,oty_total_annual_wages_pct_chg,oty_taxable_annual_wages_chg,oty_taxable_annual_wages_pct_chg,oty_annual_contributions_chg,oty_annual_contributions_pct_chg,oty_annual_avg_wkly_wage_chg,oty_annual_avg_wkly_wage_pct_chg,oty_avg_annual_pay_chg,oty_avg_annual_pay_pct_chg
0,01000,2,493,55,0,2015,A,N,2,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,01000,3,493,55,0,2015,A,N,2,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,01000,5,493,55,0,2015,A,,259,11793,...,14036991,3.0,5788845,4.4,-266851,-9.7,12,1.6,661,1.7
3,01001,5,493,75,0,2015,A,N,1,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
4,01003,5,493,75,0,2015,A,,6,383,...,11709268,225.1,3311302,484.6,40171,328.8,-355,-29.5,-18468,-29.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2565,C4974,5,493,45,0,2015,A,,10,114,...,-553096,-9.3,-357723,-20.8,-10822,-10.3,44,5.1,2270,5.0
2566,US000,1,493,15,0,2015,A,,13,1372,...,-696264,-1.5,0,0.0,0,0.0,-15,-2.2,-784,-2.2
2567,US000,2,493,15,0,2015,A,,8,149,...,1843991,30.8,0,0.0,0,0.0,48,5.0,2497,5.0
2568,US000,3,493,15,0,2015,A,,254,6899,...,45693533,14.6,-904770,-34.9,-19223,-42.3,50,5.3,2630,5.3


In [9]:
x = '000'
df_2015[df_2015['area_fips'].str.contains('000')]

Unnamed: 0,area_fips,own_code,industry_code,agglvl_code,size_code,year,qtr,disclosure_code,annual_avg_estabs,annual_avg_emplvl,...,oty_total_annual_wages_chg,oty_total_annual_wages_pct_chg,oty_taxable_annual_wages_chg,oty_taxable_annual_wages_pct_chg,oty_annual_contributions_chg,oty_annual_contributions_pct_chg,oty_annual_avg_wkly_wage_chg,oty_annual_avg_wkly_wage_pct_chg,oty_avg_annual_pay_chg,oty_avg_annual_pay_pct_chg
0,01000,2,493,55,0,2015,A,N,2,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,01000,3,493,55,0,2015,A,N,2,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2,01000,5,493,55,0,2015,A,,259,11793,...,14036991,3.0,5788845,4.4,-266851,-9.7,12,1.6,661,1.7
48,02000,1,493,55,0,2015,A,,1,3,...,24254,21.8,0,0.0,0,0.0,49,6.9,2573,6.9
49,02000,5,493,55,0,2015,A,,29,497,...,-1222176,-4.0,215003,1.2,-43148,-9.2,-57,-4.8,-2957,-4.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2162,72000,5,493,55,0,2015,A,,60,1680,...,2670571,6.1,289422,2.2,-24921,-5.7,38,7.6,1959,7.6
2566,US000,1,493,15,0,2015,A,,13,1372,...,-696264,-1.5,0,0.0,0,0.0,-15,-2.2,-784,-2.2
2567,US000,2,493,15,0,2015,A,,8,149,...,1843991,30.8,0,0.0,0,0.0,48,5.0,2497,5.0
2568,US000,3,493,15,0,2015,A,,254,6899,...,45693533,14.6,-904770,-34.9,-19223,-42.3,50,5.3,2630,5.3


In [8]:
df_2015.dtypes

area_fips                            object
own_code                              int64
industry_code                         int64
agglvl_code                           int64
size_code                             int64
year                                  int64
qtr                                  object
disclosure_code                      object
annual_avg_estabs                     int64
annual_avg_emplvl                     int64
total_annual_wages                    int64
taxable_annual_wages                  int64
annual_contributions                  int64
annual_avg_wkly_wage                  int64
avg_annual_pay                        int64
lq_disclosure_code                   object
lq_annual_avg_estabs                float64
lq_annual_avg_emplvl                float64
lq_total_annual_wages               float64
lq_taxable_annual_wages             float64
lq_annual_contributions             float64
lq_annual_avg_wkly_wage             float64
lq_avg_annual_pay               

Sometimes, suppressed cells (county, year, industry, ownership code combos) are 0 for all values. Sometimes they are zero for everythign except establishments will have a non zero value 

In [24]:
df_2015[(df_2015['disclosure_code']  == 'N') & (df_2015['annual_avg_emplvl'] == 0 )]

Unnamed: 0,area_fips,own_code,industry_code,agglvl_code,size_code,year,qtr,disclosure_code,annual_avg_estabs,annual_avg_emplvl,...,oty_total_annual_wages_chg,oty_total_annual_wages_pct_chg,oty_taxable_annual_wages_chg,oty_taxable_annual_wages_pct_chg,oty_annual_contributions_chg,oty_annual_contributions_pct_chg,oty_annual_avg_wkly_wage_chg,oty_annual_avg_wkly_wage_pct_chg,oty_avg_annual_pay_chg,oty_avg_annual_pay_pct_chg
0,01000,2,493,55,0,2015,A,N,2,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
1,01000,3,493,55,0,2015,A,N,2,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
3,01001,5,493,75,0,2015,A,N,1,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
5,01005,5,493,75,0,2015,A,N,2,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
6,01007,5,493,75,0,2015,A,N,1,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2551,C4826,5,493,45,0,2015,A,N,9,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2552,C4830,5,493,45,0,2015,A,N,4,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2553,C4854,5,493,45,0,2015,A,N,2,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0
2555,C4866,5,493,45,0,2015,A,N,3,0,...,0,0.0,0,0.0,0,0.0,0,0.0,0,0.0


For pre 2013 files, we need to download the industry zip file for hte year and extract the csv for NAICS code 493

In [6]:
print(os.getcwd())
datapath = os.getcwd() + '/qcew_zips/'

C:\Users\hrowe\Documents\FHWA mobility trend report\T4 - Forecasting\Year 2\modeling code\etl


In [4]:
class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)

In [5]:
# Define function to download URL to a file
def download_url(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

In [6]:
def try_download(url, datapath, file):
    #makedir_if_needed(datapath)
    try:
        download_url(url, file)
        print(f"Downloaded {url} to {file}")
    except urllib.error.HTTPError as e:
        print(f"Couldn't find {url}, Exception: {e}")

In [7]:
def uncompress(filepath):
    # Uncompress if zip file
    if filepath[-4:].lower() == '.zip':
        zipfolder = filepath.split('/')[-1].split('.')[0]
        print(f'                Uncompressing zip file to folder {zipfolder}')
        with zipfile.ZipFile(filepath, 'r') as zip_ref:
            zip_ref.extractall(datapath + zipfolder)

In [14]:
#states = pd.read_excel('ETL_Pipeline/ACS/State_Abbrev_FIPS.xlsx')
#states.head()

#https://data.bls.gov/cew/data/files/2009/csv/2009_annual_by_industry.zip

urlbase = 'https://data.bls.gov/cew/data/files/'

def download_qcew_files():
    for year in range(2000, 2014, 1):
        print(year)
        file = f'{datapath}{year}_annual_by_industry.zip'
        if os.path.exists(file) == False:
            url = f'{urlbase}{year}/csv/{year}_annual_by_industry.zip'
            try_download(url, datapath, file)
            #url = f'{urlbase}{year}/csv/{Year}_annual_by_industry.zip'
            #try_download(url, datapath, file)
        
        file = f'{datapath}{year}_annual_by_industry.zip' 
        if os.path.exists(file) == False:
            url = f'{urlbase}{year}/csv/{year}_annual_by_industry.zip'
            try_download(url, datapath, file)
            #url = f'{urlbase}{year}/csv_pus.zip'
            #try_download(url, datapath, file)

In [13]:
download_qcew_files()

2000


2000_annual_by_industry.zip: 60.4MB [00:04, 12.8MB/s]                            


Downloaded https://data.bls.gov/cew/data/files/2000/csv/2000_annual_by_industry.zip to C:\Users\hrowe\Documents\FHWA mobility trend report\T4 - Forecasting\Year 2\modeling code\etl/qcew_zips/2000_annual_by_industry.zip
2001


2001_annual_by_industry.zip: 92.7MB [00:14, 6.60MB/s]                            


Downloaded https://data.bls.gov/cew/data/files/2001/csv/2001_annual_by_industry.zip to C:\Users\hrowe\Documents\FHWA mobility trend report\T4 - Forecasting\Year 2\modeling code\etl/qcew_zips/2001_annual_by_industry.zip
2002


2002_annual_by_industry.zip: 134MB [00:10, 12.9MB/s]                            


Downloaded https://data.bls.gov/cew/data/files/2002/csv/2002_annual_by_industry.zip to C:\Users\hrowe\Documents\FHWA mobility trend report\T4 - Forecasting\Year 2\modeling code\etl/qcew_zips/2002_annual_by_industry.zip
2003


2003_annual_by_industry.zip: 137MB [00:10, 12.6MB/s]                            


Downloaded https://data.bls.gov/cew/data/files/2003/csv/2003_annual_by_industry.zip to C:\Users\hrowe\Documents\FHWA mobility trend report\T4 - Forecasting\Year 2\modeling code\etl/qcew_zips/2003_annual_by_industry.zip
2004


2004_annual_by_industry.zip: 148MB [00:11, 13.1MB/s]                            

Downloaded https://data.bls.gov/cew/data/files/2004/csv/2004_annual_by_industry.zip to C:\Users\hrowe\Documents\FHWA mobility trend report\T4 - Forecasting\Year 2\modeling code\etl/qcew_zips/2004_annual_by_industry.zip





In [17]:
def uncompress_qcew_files():
    for year in range(2000,2014,1):
        try:
            uncompress(datapath + str(year) + '_annual_by_industry.zip' )
            os.remove(datapath + str(year) + '_annual_by_industry.zip')
        except:
            print('unable to uncompress')

        #try:
            #uncompress(datapath + 'pums_' + str(year) + '_csv_pus.zip')
            #os.remove(datapath + 'pums_' + str(year) + '_csv_pus.zip')
        #except:
            #print('unable to uncompress')

In [16]:
uncompress_qcew_files()

                Uncompressing zip file to folder 2000_annual_by_industry
                Uncompressing zip file to folder 2001_annual_by_industry
                Uncompressing zip file to folder 2002_annual_by_industry
                Uncompressing zip file to folder 2003_annual_by_industry
                Uncompressing zip file to folder 2004_annual_by_industry


In [7]:
path_2000 = datapath + '2000_annual_by_industry/2000.annual.by_industry/2000.annual 493 Warehousing and storage.csv'
path_2001 = datapath + '2001_annual_by_industry/2001.annual.by_industry/2001.annual 493 Warehousing and storage.csv'
path_2002 = datapath + '2002_annual_by_industry/2002.annual.by_industry/2002.annual 493 Warehousing and storage.csv'
path_2003 = datapath + '2003_annual_by_industry/2003.annual.by_industry/2003.annual 493 Warehousing and storage.csv'
path_2004 = datapath + '2004_annual_by_industry/2004.annual.by_industry/2004.annual 493 Warehousing and storage.csv'
path_2005 = datapath + '2005_annual_by_industry/2005.annual.by_industry/2005.annual 493 Warehousing and storage.csv'
path_2006 = datapath + '2006_annual_by_industry/2006.annual.by_industry/2006.annual 493 Warehousing and storage.csv'
path_2007 = datapath + '2007_annual_by_industry/2007.annual.by_industry/2007.annual 493 Warehousing and storage.csv'
path_2008 = datapath + '2008_annual_by_industry/2008.annual.by_industry/2008.annual 493 Warehousing and storage.csv'
path_2009 = datapath + '2009_annual_by_industry/2009.annual.by_industry/2009.annual 493 Warehousing and storage.csv'
path_2010 = datapath + '2010_annual_by_industry/2010.annual.by_industry/2010.annual 493 Warehousing and storage.csv'
path_2011 = datapath + '2011_annual_by_industry/2011.annual.by_industry/2011.annual 493 Warehousing and storage.csv'
path_2012 = datapath + '2012_annual_by_industry/2012.annual.by_industry/2012.annual 493 Warehousing and storage.csv'
path_2013 = datapath + '2013_annual_by_industry/2013.annual.by_industry/2013.annual 493 Warehousing and storage.csv'



In [8]:
df_2000 = pd.read_csv(path_2000)
df_2001 = pd.read_csv(path_2001)
df_2002 = pd.read_csv(path_2002)
df_2003 = pd.read_csv(path_2003)
df_2004 = pd.read_csv(path_2004)
df_2005 = pd.read_csv(path_2005)
df_2006 = pd.read_csv(path_2006)
df_2007 = pd.read_csv(path_2007)
df_2008 = pd.read_csv(path_2008)
df_2009 = pd.read_csv(path_2009)
df_2010 = pd.read_csv(path_2010)
df_2011 = pd.read_csv(path_2011)
df_2012 = pd.read_csv(path_2012)
df_2013 = pd.read_csv(path_2013)

In [9]:
#we want to keep the area_fips, industry_code, 'year', and annual_avg_emplvl columns 
df_2011.columns

Index(['area_fips', 'own_code', 'industry_code', 'agglvl_code', 'size_code',
       'year', 'qtr', 'disclosure_code', 'area_title', 'own_title',
       'industry_title', 'agglvl_title', 'size_title',
       'annual_avg_estabs_count', 'annual_avg_emplvl', 'total_annual_wages',
       'taxable_annual_wages', 'annual_contributions', 'annual_avg_wkly_wage',
       'avg_annual_pay', 'lq_disclosure_code', 'lq_annual_avg_estabs_count',
       'lq_annual_avg_emplvl', 'lq_total_annual_wages',
       'lq_taxable_annual_wages', 'lq_annual_contributions',
       'lq_annual_avg_wkly_wage', 'lq_avg_annual_pay', 'oty_disclosure_code',
       'oty_annual_avg_estabs_count_chg',
       'oty_annual_avg_estabs_count_pct_chg', 'oty_annual_avg_emplvl_chg',
       'oty_annual_avg_emplvl_pct_chg', 'oty_total_annual_wages_chg',
       'oty_total_annual_wages_pct_chg', 'oty_taxable_annual_wages_chg',
       'oty_taxable_annual_wages_pct_chg', 'oty_annual_contributions_chg',
       'oty_annual_contributions_pct_

In [10]:
#we want to keep the area_fips, industry_code, 'year', and 'annual_avg_emplvl' columns 
df_2016.columns

Index(['area_fips', 'own_code', 'industry_code', 'agglvl_code', 'size_code',
       'year', 'qtr', 'disclosure_code', 'annual_avg_estabs',
       'annual_avg_emplvl', 'total_annual_wages', 'taxable_annual_wages',
       'annual_contributions', 'annual_avg_wkly_wage', 'avg_annual_pay',
       'lq_disclosure_code', 'lq_annual_avg_estabs', 'lq_annual_avg_emplvl',
       'lq_total_annual_wages', 'lq_taxable_annual_wages',
       'lq_annual_contributions', 'lq_annual_avg_wkly_wage',
       'lq_avg_annual_pay', 'oty_disclosure_code', 'oty_annual_avg_estabs_chg',
       'oty_annual_avg_estabs_pct_chg', 'oty_annual_avg_emplvl_chg',
       'oty_annual_avg_emplvl_pct_chg', 'oty_total_annual_wages_chg',
       'oty_total_annual_wages_pct_chg', 'oty_taxable_annual_wages_chg',
       'oty_taxable_annual_wages_pct_chg', 'oty_annual_contributions_chg',
       'oty_annual_contributions_pct_chg', 'oty_annual_avg_wkly_wage_chg',
       'oty_annual_avg_wkly_wage_pct_chg', 'oty_avg_annual_pay_chg',
      

In [11]:
pdList = [df_2000, df_2001, df_2002, df_2003, df_2004, df_2005, df_2006, df_2007, df_2008, df_2009, df_2010, df_2011, df_2012, df_2013, df_2014, df_2015, df_2016, df_2017, df_2018, df_2019, df_2020]  # List of dataframes
df_merged = pd.concat(pdList)

In [12]:
df_merged = df_merged[['area_fips', 'industry_code', 'own_code', 'year', 'disclosure_code', 'annual_avg_emplvl']]

In [13]:
df_warehouse_emply = df_merged.rename(columns = {'annual_avg_emplvl': 'annual_avg_warehouse_emply'})

In [14]:
df_warehouse_emply

Unnamed: 0,area_fips,industry_code,own_code,year,disclosure_code,annual_avg_warehouse_emply
0,01000,493,3,2000,,94
1,01000,493,5,2000,,6729
2,01055,493,5,2000,,447
3,01069,493,5,2000,,75
4,01073,493,5,2000,,983
...,...,...,...,...,...,...
2613,C4974,493,5,2020,,120
2614,US000,493,1,2020,,1476
2615,US000,493,2,2020,,181
2616,US000,493,3,2020,,6539


Filters down to only private employment (own_code ==5)

In [15]:
df_warehouse_emply = df_warehouse_emply[df_warehouse_emply['own_code'] == 5]

In [16]:
df_warehouse_emply

Unnamed: 0,area_fips,industry_code,own_code,year,disclosure_code,annual_avg_warehouse_emply
1,01000,493,5,2000,,6729
2,01055,493,5,2000,,447
3,01069,493,5,2000,,75
4,01073,493,5,2000,,983
5,01089,493,5,2000,,407
...,...,...,...,...,...,...
2610,C4962,493,5,2020,,4207
2611,C4966,493,5,2020,,425
2612,C4970,493,5,2020,N,0
2613,C4974,493,5,2020,,120


In [25]:
df_warehouse_emply.to_csv('QCEW_naics_493.csv')

26k out of 47k (about half) cells were suppressed for this QCEW data, making annual_avg_warehouse_emply show up as a zero

In [18]:
len(df_warehouse_emply)

47008

In [17]:
df_warehouse_emply[df_warehouse_emply['disclosure_code'] == 'N']

Unnamed: 0,area_fips,industry_code,own_code,year,disclosure_code,annual_avg_warehouse_emply
3,01001,493,5,2001,N,0
5,01003,493,5,2001,N,0
6,01009,493,5,2001,N,0
8,01017,493,5,2001,N,0
9,01019,493,5,2001,N,0
...,...,...,...,...,...,...
2595,C4790,493,5,2020,N,0
2597,C4806,493,5,2020,N,0
2599,C4826,493,5,2020,N,0
2605,C4890,493,5,2020,N,0


No cases where data was suppressed and employment was still published (as expected)

In [19]:
df_warehouse_emply[ (df_warehouse_emply['disclosure_code'] == 'N' ) & (df_warehouse_emply['annual_avg_warehouse_emply'] > 0)]

Unnamed: 0,area_fips,industry_code,own_code,year,disclosure_code,annual_avg_warehouse_emply


Suppression is evenly distributed accross all years

In [21]:
df_warehouse_emply[df_warehouse_emply['disclosure_code'] == 'N'].groupby('year').size()

year
2001    1397
2002    1342
2003    1326
2004    1302
2005    1343
2006    1296
2007    1284
2008    1321
2009    1314
2010    1302
2011    1333
2012    1354
2013    1362
2014    1388
2015    1372
2016    1358
2017    1366
2018    1363
2019    1355
2020    1387
dtype: int64

In [22]:
df_warehouse_emply[df_warehouse_emply['disclosure_code'].isna()].groupby('year').size()

year
2000     697
2001     843
2002     895
2003     899
2004     916
2005     916
2006     964
2007     980
2008     976
2009     980
2010     994
2011     966
2012     976
2013     976
2014     977
2015     995
2016    1027
2017    1032
2018    1040
2019    1057
2020    1037
dtype: int64

This df has the annual average employment in the 493 NAICS code by county for years 2000 - 2020

In [32]:
#making sure the industry code is the same for all entires - it is 
df_warehouse_emply.industry_code.unique()

array([493], dtype=int64)

In [33]:
df_warehouse_emply.dtypes

area_fips                     object
industry_code                  int64
own_code                       int64
year                           int64
annual_avg_warehouse_emply     int64
dtype: object

In [34]:
df_warehouse_emply.isna().sum()

area_fips                     0
industry_code                 0
own_code                      0
year                          0
annual_avg_warehouse_emply    0
dtype: int64

In [35]:
df_warehouse_emply[df_warehouse_emply['annual_avg_warehouse_emply'] > 0].groupby('year').size()

year
2000     697
2001     843
2002     895
2003     899
2004     916
2005     916
2006     964
2007     980
2008     976
2009     980
2010     994
2011     966
2012     976
2013     976
2014     977
2015     995
2016    1027
2017    1032
2018    1040
2019    1057
2020    1037
dtype: int64

In [36]:
df_warehouse_emply[(df_warehouse_emply['annual_avg_warehouse_emply'] == 0) & (df_warehouse_emply['year'] == 2020)].groupby('area_fips').size()

area_fips
01005    1
01009    1
01017    1
01019    1
01023    1
        ..
C4790    1
C4806    1
C4826    1
C4890    1
C4970    1
Length: 1387, dtype: int64

Next step:
make sure industry code is the same for all years
check for NAs
do the same for couriers 
need to account for state, local, and private employment
do due dilligence on the change in method for getting the data from 2013 to 2014
Merge this onto the main df, but only want to keep rows (counties) that exists in the main df

In [37]:
df_warehouse_emply[df_warehouse_emply['area_fips'] == '01000']

Unnamed: 0,area_fips,industry_code,own_code,year,annual_avg_warehouse_emply
1,1000,493,5,2000,6729
2,1000,493,5,2001,6730
2,1000,493,5,2002,5789
2,1000,493,5,2003,5935
2,1000,493,5,2004,6214
2,1000,493,5,2005,7133
2,1000,493,5,2006,7639
2,1000,493,5,2007,8286
2,1000,493,5,2008,8424
2,1000,493,5,2009,8041


In [38]:
df_warehouse_emply[df_warehouse_emply['area_fips'] == '25009']

Unnamed: 0,area_fips,industry_code,own_code,year,annual_avg_warehouse_emply
205,25009,493,5,2000,467
836,25009,493,5,2001,584
833,25009,493,5,2002,508
814,25009,493,5,2003,496
826,25009,493,5,2004,446
850,25009,493,5,2005,436
853,25009,493,5,2006,454
856,25009,493,5,2007,499
870,25009,493,5,2008,428
865,25009,493,5,2009,368


In [39]:
df_warehouse_emply[df_warehouse_emply['area_fips'] == '12031']

Unnamed: 0,area_fips,industry_code,own_code,year,annual_avg_warehouse_emply
90,12031,493,5,2000,3141
262,12031,493,5,2001,3182
267,12031,493,5,2002,3023
266,12031,493,5,2003,2466
267,12031,493,5,2004,2562
271,12031,493,5,2005,2461
275,12031,493,5,2006,3254
272,12031,493,5,2007,3469
278,12031,493,5,2008,3400
271,12031,493,5,2009,3273


In [42]:
df_warehouse_emply[df_warehouse_emply['area_fips'] == '08001']

Unnamed: 0,area_fips,industry_code,own_code,year,annual_avg_warehouse_emply
70,8001,493,5,2000,1527
199,8001,493,5,2001,1636
199,8001,493,5,2002,1693
199,8001,493,5,2003,1631
198,8001,493,5,2004,1323
203,8001,493,5,2005,1147
206,8001,493,5,2006,1069
203,8001,493,5,2007,973
203,8001,493,5,2008,1391
199,8001,493,5,2009,1277
