This script loads the State Nonemployer Statistics (NES) data from 2000 - 2020 from https://www.census.gov/programs-surveys/nonemployer-statistics/data/datasets.html

NES includes the number of businesses (with no paid employees) and total receipts by state and NAICS code. 

We are specifically interested in NAICS code 4853 for Taxi and Limousine Service (https://www.bls.gov/iag/tgs/iag485.htm)

NES Documentation: https://www.census.gov/programs-surveys/nonemployer-statistics/technical-documentation/methodology.html

In [2]:
import pandas as pd
import os
import sys
from datetime import datetime
import yaml
import time
import requests
import urllib
import zipfile
import pprint
from tqdm import tqdm
from glob import glob
import numpy as np
import matplotlib.pyplot as plt
from urllib.request import urlopen
#from platforms.connect.snowpy import SnowPy

Suppression is indicated by a "D" or "S" in the ESTAB_F or RCPTOT_F

Establishment Flag

                                  " " - Number of establishments shown.
                                  "D" - Number of establishments 
                                        withheld to avoid disclosing 
                                        data for individual businesses; 
                                        data are included in broader 
                                        industry totals.
                                  "S" - Number of establishments 
                                        withheld to avoid disclosing 
                                        data that do not meet publication 
                                        standards; data are included in 
                                        broader industry totals.
                                        
Receipts Flag

                                " " - Receipts data are shown.
                                "D" - Receipts data are withheld to 
                                      avoid disclosing data for 
                                      individual businesses; 
                                      data are included in broader 
                                      industry totals.
                                "S" - Receipts data are withheld to 
                                      avoid disclosing data that do 
                                      not meet publication standards; 
                                      data are included in broader 
                                      industry totals.

### ETL 

In [3]:
print(os.getcwd())
datapath = os.getcwd() + '/nes_zips_state/'

C:\Users\hrowe\Documents\FHWA mobility trend report\T4 - Forecasting\Year 2\modeling code\etl\NES


In [4]:
class DownloadProgressBar(tqdm):
    def update_to(self, b=1, bsize=1, tsize=None):
        if tsize is not None:
            self.total = tsize
        self.update(b * bsize - self.n)
        
def download_url(url, output_path):
    with DownloadProgressBar(unit='B', unit_scale=True,
                             miniters=1, desc=url.split('/')[-1]) as t:
        urllib.request.urlretrieve(url, filename=output_path, reporthook=t.update_to)

def try_download(url, file):
    #makedir_if_needed(datapath)
    try:
        download_url(url, file)
        print(f"Downloaded {url} to {file}")
    except urllib.error.HTTPError as e:
        print(f"Couldn't find {url}, Exception: {e}")
        
def uncompress(filepath):
    # Uncompress if zip file
    if filepath[-4:].lower() == '.zip':
        zipfolder = filepath.split('/')[-1].split('.')[0]
        print(f'                Uncompressing zip file to folder {zipfolder}')
        with zipfile.ZipFile(filepath, 'r') as zip_ref:
            zip_ref.extractall(datapath + zipfolder)
            
            
#example url https://www2.census.gov/programs-surveys/nonemployer-statistics/datasets/2019/historical-datasets/combine19_txt.zip
#            https://www2.census.gov/programs-surveys/nonemployer-statistics/datasets/2016/combine16_txt.zip
#            https://www2.census.gov/programs-surveys/nonemployer-statistics/datasets/2012/historical-datasets/combine12_txt.zip
#            https://www2.census.gov/programs-surveys/nonemployer-statistics/datasets/2020/historical-datasets/nonemp20co.zip


In [57]:
def download_txt(t_url, file_path, year):
    response = urlopen(t_url)
    data = response.read()
    txt_str = str(data)
    lines = txt_str.split("\\n")
    des_url = file_path + 'Nonemp' + str(year)[2:] + 'st.txt'
    fx = open(des_url,"w")
    for line in lines:
        fx.write(line+ "\n")
    fx.close()

In [56]:
def download_nes_state_files():
    for year in range(2008,2021):
        print(year)
        url = 'https://www2.census.gov/programs-surveys/nonemployer-statistics/datasets/' + str(year) + '/historical-datasets/nonemp' + str(year)[2:] + 'st.zip'
        file_name = 'NES_State_' + str(year) + '.zip'
        download_url(url, datapath + file_name)
        uncompress(datapath + file_name)
        os.remove(datapath + file_name)
    for year in range(2000,2008):
        print(year)
        directory = 'NES_State_' + str(year)
        path = os.path.join(datapath, directory)
        os.mkdir(path)
        url = 'https://www2.census.gov/programs-surveys/nonemployer-statistics/datasets/' + str(year) + '/historical-datasets/nonemp' + str(year)[2:] + 'st.txt'
        download_txt(url, path +'/', year)

In [58]:
download_nes_state_files()

2000
2001
2002
2003
2004
2005
2006
2007


In [91]:
year = 2000
df_2000_test = pd.read_csv(datapath + 'NES_State_' + str(year) + '/Nonemp' + str(year)[2:] + 'st.txt', 
                    dtype = str)

In [92]:
df_2000_test[df_2000_test['NAICS'] == '492']

Unnamed: 0,"b'""ST""",NAICS,ESTABF,ESTAB,RCPTOT_F,RCPTOT
261,1,492,,1130.0,,19162.0
716,2,492,D,0.0,D,0.0
1180,4,492,,2790.0,,49580.0
1646,5,492,,832.0,,12886.0
2113,6,492,,13581.0,,264257.0
2580,8,492,,2202.0,,34104.0
3045,9,492,,1994.0,,39773.0
3493,10,492,,239.0,,5058.0
3923,11,492,,378.0,,5699.0
4389,12,492,,12555.0,,211287.0


In [89]:
#df_2000_test = df_2000_test.rename(columns = {b'ST' : 'STATE'}) 
#df_2000_test = df_2000_test.rename(columns = {'STATE' : b'ST'}) 

In [98]:
df_2000_test.columns = ['STATE_CODE', 'NAICS', 'ESTABF', 'ESTAB', 'RCPTOT_F', 'RCPTOT']

In [99]:
df_2000_test

Unnamed: 0,STATE_CODE,NAICS,ESTABF,ESTAB,RCPTOT_F,RCPTOT
0,01,00,,223103.0,,8827169.0
1,01,11,,4473.0,,222166.0
2,01,113,,1858.0,,116335.0
3,01,114,,1505.0,,65375.0
4,01,1141,,1339.0,,61406.0
...,...,...,...,...,...,...
23563,56,81292,D,0.0,D,0.0
23564,56,81293,D,0.0,D,0.0
23565,56,81299,,1907.0,,27892.0
23566,56,813,,284.0,,3288.0


In [186]:
#function that takes the year and naics code a returns a df from NES
def make_nes_df(year, naics):
    year = year
    df = pd.read_csv(datapath + 'NES_State_' + str(year) + '/Nonemp' + str(year)[2:] + 'st.txt', 
                    dtype = str)
    if (year < 2007):
        df.columns = ['STATE_CODE', 'NAICS', 'ESTAB_F', 'ESTAB', 'RCPTOT_F', 'RCPTOT']
    if(year == 2007):
        df.columns = ['STATE_CODE', 'NAICS', 'ESTAB_F', 'ESTAB', 'RCPTOT_N_F', 'RCPTOT_F', 'RCPTOT']
    if (year > 2007):
        df.columns = ['STATE_CODE', 'NAICS', 'LFO', 'ESTAB_F', 'ESTAB', 'RCPTOT_N_F', 'RCPTOT_F', 'RCPTOT']
    df = df[df['NAICS'] == str(naics)]        #filter to only the naics code you want
    df['YEAR'] = year                         #add a column for the year
    
    return(df)

### Creating df for NAICS 4853 (Taxi and Limousine Service)

Creating df for 2000 to 2020 for Taxi and Limousine Service: NAICS 4853

In [132]:
df_2000 = make_nes_df(2000, 4853)
df_2001 = make_nes_df(2001, 4853)
df_2002 = make_nes_df(2002, 4853)
df_2003 = make_nes_df(2003, 4853)
df_2004 = make_nes_df(2004, 4853)
df_2005 = make_nes_df(2005, 4853)
df_2006 = make_nes_df(2006, 4853)
df_2007 = make_nes_df(2007, 4853)
df_2008 = make_nes_df(2008, 4853)
df_2009 = make_nes_df(2009, 4853)
df_2010 = make_nes_df(2010, 4853)
df_2011 = make_nes_df(2011, 4853)
df_2012 = make_nes_df(2012, 4853)
df_2013 = make_nes_df(2013, 4853)
df_2014 = make_nes_df(2014, 4853)
df_2015 = make_nes_df(2015, 4853)
df_2016 = make_nes_df(2016, 4853)
df_2017 = make_nes_df(2017, 4853)
df_2018 = make_nes_df(2018, 4853)
df_2019 = make_nes_df(2019, 4853)
df_2020 = make_nes_df(2020, 4853)

In [133]:
df_2020

Unnamed: 0,STATE_CODE,NAICS,LFO,ESTAB_F,ESTAB,RCPTOT_N_F,RCPTOT_F,RCPTOT,YEAR
989,01,4853,-,,3470,G,,46407,2020
990,01,4853,C,,3,J,,139,2020
991,01,4853,Z,,8,J,,1483,2020
992,01,4853,S,,3456,G,,44686,2020
993,01,4853,P,,3,H,,99,2020
...,...,...,...,...,...,...,...,...,...
95187,55,4853,S,,7939,G,,115787,2020
95188,55,4853,P,,13,H,,340,2020
96991,56,4853,-,,365,G,,5479,2020
96992,56,4853,S,,359,G,,5311,2020


In [158]:
pdList = [df_2000, df_2001, df_2002, df_2003, df_2004, df_2005, df_2006, df_2007, df_2008, df_2009, df_2010, df_2011, df_2012, df_2013, df_2014, df_2015, df_2016, df_2017, df_2018, df_2019, df_2020]  # List of dataframes
df_merged = pd.concat(pdList)

In [138]:
#len(df_merged[(df_merged['ESTAB_F'] == 'D') | (df_merged['ESTAB_F'] == 'S')].FULL_FIPS.unique())

In [139]:
#len(df_merged[(df_merged['RCPTOT_F'] == 'D') | (df_merged['RCPTOT_F'] == 'S')].FULL_FIPS.unique())

In [140]:
#len(df_merged[(df_merged['ESTAB_F'] == 'D') | (df_merged['ESTAB_F'] == 'S')])

In [141]:
#len(df_merged[(df_merged['RCPTOT_F'] == 'D') | (df_merged['RCPTOT_F'] == 'S')])

2000-2004 and 2008-2014 have the most casses of suppression

In [142]:
#df_merged[(df_merged['ESTAB_F'] == 'D') | (df_merged['ESTAB_F'] == 'S')].groupby('YEAR').size()

In [143]:
#df_merged[(df_merged['RCPTOT_F'] == 'D') | (df_merged['RCPTOT_F'] == 'S')].groupby('YEAR').size()

In [159]:
df_merged = df_merged[['STATE_CODE', 'YEAR', 'NAICS', 'ESTAB_F', 'ESTAB', 'RCPTOT_F', 'RCPTOT', 'RCPTOT_N_F', 'LFO']]

In [154]:
df_merged.dtypes

STATE_CODE    object
YEAR           int64
NAICS         object
ESTAB_F       object
ESTAB          int64
RCPTOT_F      object
RCPTOT        object
RCPTOT_N_F    object
LFO           object
dtype: object

In [160]:
#convert ESTAB and RCPTOT into numerics
df_merged['ESTAB'] = pd.to_numeric(df_merged['ESTAB'])

In [161]:
df_merged['RCPTOT'] = pd.to_numeric(df_merged['RCPTOT'])

In [149]:
#df_merged.dtypes

Receipt totals are given in thousands, adjusting the units here

In [162]:
df_merged['RCPTOT'] = df_merged['RCPTOT'] * 1000

Seeing if there are any unexpected behaviors between zeros and suppressed data 

In [163]:
df_merged[df_merged['ESTAB'] == 0].ESTAB_F.unique() #expecting only D and S, no nan

array(['D', 'S'], dtype=object)

In [164]:
df_merged[df_merged['ESTAB'] > 0].ESTAB_F.unique()  #expecting only nan

array([nan], dtype=object)

Final df

In [165]:
df_merged

Unnamed: 0,STATE_CODE,YEAR,NAICS,ESTAB_F,ESTAB,RCPTOT_F,RCPTOT,RCPTOT_N_F,LFO
251,01,2000,4853,,202,,3861000,,
707,02,2000,4853,,463,,14150000,,
1171,04,2000,4853,,1002,,26613000,,
1636,05,2000,4853,,88,,1894000,,
2103,06,2000,4853,,10019,,296422000,,
...,...,...,...,...,...,...,...,...,...
95187,55,2020,4853,,7939,,115787000,G,S
95188,55,2020,4853,,13,,340000,H,P
96991,56,2020,4853,,365,,5479000,G,-
96992,56,2020,4853,,359,,5311000,G,S


In [168]:
df_merged.to_csv('NES_naics_4853_st.csv')

### Code to create df for NAICS code 492 (Couriers)

In [187]:
df_courier_2000 = make_nes_df(2000, 492)
df_courier_2001 = make_nes_df(2001, 492)
df_courier_2002 = make_nes_df(2002, 492)
df_courier_2003 = make_nes_df(2003, 492)
df_courier_2004 = make_nes_df(2004, 492)
df_courier_2005 = make_nes_df(2005, 492)
df_courier_2006 = make_nes_df(2006, 492)
df_courier_2007 = make_nes_df(2007, 492)
df_courier_2008 = make_nes_df(2008, 492)
df_courier_2009 = make_nes_df(2009, 492)
df_courier_2010 = make_nes_df(2010, 492)
df_courier_2011 = make_nes_df(2011, 492)
df_courier_2012 = make_nes_df(2012, 492)
df_courier_2013 = make_nes_df(2013, 492)
df_courier_2014 = make_nes_df(2014, 492)
df_courier_2015 = make_nes_df(2015, 492)
df_courier_2016 = make_nes_df(2016, 492)
df_courier_2017 = make_nes_df(2017, 492)
df_courier_2018 = make_nes_df(2018, 492)
df_courier_2019 = make_nes_df(2019, 492)
df_courier_2020 = make_nes_df(2020, 492)

In [188]:
pdListCourier = [df_courier_2000, df_courier_2001, df_courier_2002, df_courier_2003, df_courier_2004,
                 df_courier_2005, df_courier_2006, df_courier_2007, df_courier_2008, df_courier_2009, 
                 df_courier_2010, df_courier_2011, df_courier_2012, df_courier_2013, df_courier_2014, 
                 df_courier_2015, df_courier_2016, df_courier_2017, df_courier_2018, df_courier_2019, df_courier_2020]  # List of dataframes
df_courier_merged = pd.concat(pdListCourier)

In [189]:
df_courier_merged = df_courier_merged[['STATE_CODE', 'YEAR', 'NAICS', 'ESTAB_F', 'ESTAB', 'RCPTOT_F', 'RCPTOT', 'RCPTOT_N_F', 'LFO']]

In [190]:
df_courier_merged.dtypes

STATE_CODE    object
YEAR           int64
NAICS         object
ESTAB_F       object
ESTAB         object
RCPTOT_F      object
RCPTOT        object
RCPTOT_N_F    object
LFO           object
dtype: object

In [191]:
df_courier_merged['ESTAB'] = pd.to_numeric(df_courier_merged['ESTAB'])
df_courier_merged['RCPTOT'] = pd.to_numeric(df_courier_merged['RCPTOT'])

In [192]:
df_courier_merged.dtypes

STATE_CODE    object
YEAR           int64
NAICS         object
ESTAB_F       object
ESTAB          int64
RCPTOT_F      object
RCPTOT         int64
RCPTOT_N_F    object
LFO           object
dtype: object

Receipt totals are given in thousands, adjusting the units here

In [193]:
df_courier_merged['RCPTOT'] = df_courier_merged['RCPTOT'] * 1000

Making sure suppression flags are working as we would expect 

In [194]:
df_courier_merged[(df_courier_merged['ESTAB'] == 0) & (df_courier_merged['ESTAB_F'].isna()) ]  #expect a df with zero rows 

Unnamed: 0,STATE_CODE,YEAR,NAICS,ESTAB_F,ESTAB,RCPTOT_F,RCPTOT,RCPTOT_N_F,LFO


In [195]:
df_courier_merged

Unnamed: 0,STATE_CODE,YEAR,NAICS,ESTAB_F,ESTAB,RCPTOT_F,RCPTOT,RCPTOT_N_F,LFO
261,01,2000,492,,1130,,19162000,,
716,02,2000,492,D,0,D,0,,
1180,04,2000,492,,2790,,49580000,,
1646,05,2000,492,,832,,12886000,,
2113,06,2000,492,,13581,,264257000,,
...,...,...,...,...,...,...,...,...,...
95225,55,2020,492,,10148,,97452000,G,S
95226,55,2020,492,,12,,2192000,H,P
97010,56,2020,492,,1011,,12862000,G,-
97011,56,2020,492,,1001,,10958000,G,S


In [196]:
df_courier_merged.to_csv('NES_naics_492_st.csv')