# ETL Process
### Extract, Transform, Load

In [1]:
# Importing relevant packages

import json
import os

import numpy as np
import pandas as pd
import requests
from IPython.display import clear_output

pd.set_option('display.max_rows', 50)
pd.set_option('display.max_columns', 50)

### Extract

We gathered our data from the 2019 Annual Business Survey (ABS) through their [APIs](https://www.census.gov/data/developers/data-sets/abs.2019.html).

In order to pull data from the API, you need to use an API key, which can be requested from the [API User Guide](https://www.census.gov/data/developers/guidance/api-user-guide.Help_&_Contact_Us.html).

With our API key, we generate urls to the various datasets including all the parameters of interest in our get variables. We probably won't use all of this information but it's better to have it available and trim it down later.

In [2]:
API_KEY = '2e3a785a0323fb22d89acae1264b47cb449782aa'

#Setting up Url for API calls
get = 'GEO_ID,NAME,NAICS2017,NAICS2017_LABEL,SEX,SEX_LABEL,ETH_GROUP,ETH_GROUP_LABEL,RACE_GROUP,RACE_GROUP_LABEL,VET_GROUP,VET_GROUP_LABEL,EMPSZFI,EMPSZFI_LABEL,YEAR,FIRMPDEMP,FIRMPDEMP_F,RCPPDEMP,RCPPDEMP_F,EMP,EMP_F,PAYANN,PAYANN_F'
summary_us_url = f'https://api.census.gov/data/2018/abscs?get={get}&for=us:*&key={API_KEY}'
summary_state_url = f'https://api.census.gov/data/2018/abscs?get={get}&for=state:*&key={API_KEY}'

get = 'GEO_ID,NAME,NAICS2017,NAICS2017_LABEL,SEX,SEX_LABEL,ETH_GROUP,ETH_GROUP_LABEL,RACE_GROUP,RACE_GROUP_LABEL,VET_GROUP,VET_GROUP_LABEL,QDESC,BUSCHAR,BUSCHAR_LABEL,YEAR,FIRMPDEMP,FIRMPDEMP_F,RCPPDEMP,RCPPDEMP_F,EMP,EMP_F,PAYANN,PAYANN_F'
business_char_workers_us_url = f'https://api.census.gov/data/2018/abscb?get={get}&for=us:*&QDESC_LABEL=WORKERS&key={API_KEY}'
business_char_workers_state_url = f'https://api.census.gov/data/2018/abscb?get={get}&for=state:*&QDESC_LABEL=WORKERS&key={API_KEY}'
business_char_cust_us_url = f'https://api.census.gov/data/2018/abscb?get={get}&for=us:*&QDESC_LABEL=CUST&key={API_KEY}'
business_char_cust_state_url = f'https://api.census.gov/data/2018/abscb?get={get}&for=state:*&QDESC_LABEL=CUST&key={API_KEY}'
business_char_family_us_url = f'https://api.census.gov/data/2018/abscb?get={get}&for=us:*&QDESC_LABEL=FAMOWN&key={API_KEY}'
business_char_family_state_url = f'https://api.census.gov/data/2018/abscb?get={get}&for=state:*&QDESC_LABEL=FAMOWN&key={API_KEY}'

get = 'GEO_ID,NAME,NAICS2017,NAICS2017_LABEL,OWNER_SEX,OWNER_SEX_LABEL,OWNER_ETH,OWNER_ETH_LABEL,OWNER_RACE,OWNER_RACE_LABEL,OWNER_VET,OWNER_VET_LABEL,QDESC,OWNCHAR,OWNCHAR_LABEL,YEAR,OWNPDEMP,OWNPDEMP_F'
owner_educ_us_url = f'https://api.census.gov/data/2018/abscbo?get={get}&for=us:*&QDESC_LABEL=EDUC&key={API_KEY}'
owner_educ_state_url = f'https://api.census.gov/data/2018/abscbo?get={get}&for=state:*&QDESC_LABEL=EDUC&key={API_KEY}'
owner_age_us_url = f'https://api.census.gov/data/2018/abscbo?get={get}&for=us:*&QDESC_LABEL=OWNRAGE&key={API_KEY}'
owner_age_state_url = f'https://api.census.gov/data/2018/abscbo?get={get}&for=state:*&QDESC_LABEL=OWNRAGE&key={API_KEY}'

get = 'GEO_ID,NAME,NAICS2017,NAICS2017_LABEL,NSFSZFI,NSFSZFI_LABEL,TECHUSE,TECHUSE_LABEL,FACTORS_U,FACTORS_U_LABEL,IMPACTWF_U,IMPACTWF_U_LABEL,MOTUSETECH,MOTUSETECH_LABEL,FIRMPDEMP,FIRMPDEMP_F,RCPPDEMP,RCPPDEMP_F,EMP,EMP_F,PAYANN,PAYANN_F'
tech_us_url = f'https://api.census.gov/data/2018/abstcb?get={get}&for=us:*&key={API_KEY}'
tech_state_url = f'https://api.census.gov/data/2018/abstcb?get={get}&for=state:*&key={API_KEY}'



Once we have all the urls, we can make the requests using<br>
    
    requests.get()
We stored all the requests in variables so that we don't have to continuously make calls, since we are limited to 500 calls per day. This also makes it easier to focus on specific datasets because we know exactly what datasets we are extracting and transforming as well as the information contained in those datasets.

In [3]:
# Making API calls
#
# Every response is kept in its own variable so that later cells can parse it
# without calling the API again.

# Summary statistics: the revenue cells further down parse both of these responses,
# so the calls have to run for the notebook to work from a fresh kernel.
summary_us_request = requests.get(summary_us_url)
summary_state_request = requests.get(summary_state_url)

# Business characteristics: left disabled, as before. Re-enable a line when a cell
# needs that response.
# business_char_workers_us_request = requests.get(business_char_workers_us_url)
# business_char_workers_state_request = requests.get(business_char_workers_state_url)
# business_char_cust_us_request = requests.get(business_char_cust_us_url)
# business_char_cust_state_request = requests.get(business_char_cust_state_url)
# business_char_family_us_request = requests.get(business_char_family_us_url)
# business_char_family_state_request = requests.get(business_char_family_state_url)

# Owner characteristics
owner_educ_us_request = requests.get(owner_educ_us_url)
owner_educ_state_request = requests.get(owner_educ_state_url)
owner_age_us_request = requests.get(owner_age_us_url)
owner_age_state_request = requests.get(owner_age_state_url)

# Technology characteristics
tech_us_request = requests.get(tech_us_url)
tech_state_request = requests.get(tech_state_url)


### Transform

In this step, we build a function to do some preliminary cleaning across the datasets. It takes in the stored request from the API calls to clean the data.

It accomplishes this task by:
+ Removing the brackets and quotes
+ Changing fields with extra commas
+ Splitting each entry into its own row
+ Transforming the split string into a list
+ Extracting the headers and values
+ Creating the DataFrame

Once the DataFrame is created, we then take a few more steps to prep our data for use:
+ Changing the capitalized header names to lower case
+ Changing the null strings into NaN, which is actually recognized as a null value in Pandas
+ Renaming columns to more descriptive names
+ Changing numerical columns from string type to numerical

In [23]:
def to_dataframe(response):
    '''Turn an API response into a cleaned pandas DataFrame.

    The body of the response (`response.text`) is expected to be a JSON array of
    rows whose first row holds the column headers.

    Cleaning steps:
      * the body is parsed as JSON, so commas inside a value can no longer shift
        the columns of a row
      * a fixed set of long labels is rewritten to the short spellings used by the
        rest of the project (see `label_fixes`)
      * JSON nulls and the literal string 'null' become NaN
      * header names are lower-cased
      * 'emp', 'firmpdemp', 'payann' and 'rcppdemp' are converted to numbers when
        the dataset contains them; datasets without them are left untouched
      * columns are renamed to more descriptive names

    Parameters
    ----------
    response : object with a `text` attribute (e.g. the result of requests.get)

    Returns
    -------
    pandas.DataFrame with one row per data row of the response.

    Raises
    ------
    ValueError
        If the body is not valid JSON (json.JSONDecodeError is a ValueError) or
        does not contain a header row.
    '''
    # (old, new) pairs applied to every string value, in this order.
    label_fixes = [
        ('Agriculture, forestry, fishing and hunting', 'Agriculture/Forestry/Fishing/Hunting'),
        ('Arts, entertainment, and recreation', 'Arts/Entertainment/Recreation'),
        ('Mining, quarrying, and oil and gas extraction', 'Mining/Quarrying/Oil&Gas Extraction'),
        ('Not applicable,', 'N/A'),
        ('Professional, scientific, and technical services', 'Professional/Scientific/Technical Services'),
        ('Some college, but no degree', 'Some college'),
        ('Technical, trade, or vocational', 'Technical/Trade/Vocational'),
        ('Tested,', 'Tested'),
    ]
    numeric_columns = ['emp', 'firmpdemp', 'payann', 'rcppdemp']

    def clean(value):
        '''Normalise a single cell: nulls become NaN, known labels are shortened.'''
        if value is None or value == 'null':
            return np.nan
        if isinstance(value, str):
            # Replacing inside one value (not the whole body) means a pattern that
            # ends in a comma can never swallow the separator between two fields.
            for old, new in label_fixes:
                value = value.replace(old, new)
        return value

    rows = json.loads(response.text)
    if not rows:
        raise ValueError('API response does not contain a header row')

    headers, *records = rows
    df = pd.DataFrame(
        [[clean(value) for value in record] for record in records],
        columns=[str(header).lower() for header in headers],
    )

    # Change numeric columns from str to numeric (only those this dataset has)
    for column in numeric_columns:
        if column in df.columns:
            df[column] = pd.to_numeric(df[column])

    # Renaming columns to more descriptive names. 'sex' and 'sex_label' are renamed
    # in the same call, so the code column becomes 'sex_code' and the label column
    # takes over the name 'sex'.
    return df.rename(columns={
        'emp': 'number_of_employees',
        'emp_pct': 'percent_of_employees',
        'eth_group_label': 'ethnicity',
        'factors_u_label': 'usage_adverse_factors',
        'firmpdemp': 'number_of_firms',
        'firmpdemp_pct': 'percent_of_firms',
        'impactwf_u_label': 'tech_usage_worker_impact',
        'motusetech_label': 'tech_usage_purpose',
        'naics2017_label': 'industry',
        'nsfszfi_label': 'firm_size_category',
        'payann': 'annual_payroll',
        'payann_pct': 'percent_annual_payroll',
        'race_group_label': 'race',
        'rcppdemp': 'revenue',
        'rcppdemp_f': 'revenue_ranges',
        'rcppdemp_pct': 'percent_revenue_of_firms',
        'sex': 'sex_code',
        'sex_label': 'sex',
        'techuse_label': 'tech_usage',
    })

### Initial Data Analysis + Exporting to CSV

Every group member was given the freedom to explore the topics/questions they were interested in. In the following sections, the members each make dataframes from the datasets created from the API calls they are interested in. We then do some initial data analysis, and then export the dataframes to CSV files, which we can use to make visualizations in a separate notebook.

In [22]:
def owner_dataframe(response):
    '''Turn an owner-characteristics API response into a cleaned pandas DataFrame.

    The body of the response (`response.text`) is expected to be a JSON array of
    rows whose first row holds the column headers.

    Cleaning steps:
      * the body is parsed as JSON, so labels that contain commas (for example
        'Some college, but no degree') stay in one column
      * a fixed set of long labels is rewritten to the short spellings used by the
        rest of the project (see `label_fixes`)
      * JSON nulls and the literal string 'null' become NaN
      * header names are lower-cased
      * columns are renamed to more descriptive names

    No numeric conversion is performed: every non-null value stays a string.

    Parameters
    ----------
    response : object with a `text` attribute (e.g. the result of requests.get)

    Returns
    -------
    pandas.DataFrame with one row per data row of the response.

    Raises
    ------
    ValueError
        If the body is not valid JSON (json.JSONDecodeError is a ValueError) or
        does not contain a header row.
    '''
    # (old, new) pairs applied to every string value, in this order.
    label_fixes = [
        ('Agriculture, forestry, fishing and hunting', 'Agriculture/Forestry/Fishing/Hunting'),
        ('Arts, entertainment, and recreation', 'Arts/Entertainment/Recreation'),
        ('Mining, quarrying, and oil and gas extraction', 'Mining/Quarrying/Oil&Gas Extraction'),
        ('Not applicable,', 'N/A'),
        ('Professional, scientific, and technical services', 'Professional/Scientific/Technical Services'),
    ]

    def clean(value):
        '''Normalise a single cell: nulls become NaN, known labels are shortened.'''
        if value is None or value == 'null':
            return np.nan
        if isinstance(value, str):
            # Replacing inside one value (not the whole body) means a pattern that
            # ends in a comma can never swallow the separator between two fields.
            for old, new in label_fixes:
                value = value.replace(old, new)
        return value

    rows = json.loads(response.text)
    if not rows:
        raise ValueError('API response does not contain a header row')

    headers, *records = rows
    df = pd.DataFrame(
        [[clean(value) for value in record] for record in records],
        columns=[str(header).lower() for header in headers],
    )

    # Renaming columns to more descriptive names. Keys that are absent from this
    # dataset are simply ignored by rename.
    return df.rename(columns={
        'emp': 'number_of_employees',
        'emp_pct': 'percent_of_employees',
        'eth_group_label': 'ethnicity',
        'factors_u_label': 'usage_adverse_factors',
        'firmpdemp': 'number_of_firms',
        'firmpdemp_pct': 'percent_of_firms',
        'impactwf_u_label': 'tech_usage_worker_impact',
        'motusetech_label': 'tech_usage_purpose',
        'naics2017_label': 'industry',
        'nsfszfi_label': 'firm_size_category',
        'payann': 'annual_payroll',
        'payann_pct': 'percent_annual_payroll',
        'race_group_label': 'race',
        'rcppdemp': 'revenue',
        'rcppdemp_f': 'revenue_ranges',
        'rcppdemp_pct': 'percent_revenue_of_firms',
        'sex': 'sex_code',
        'sex_label': 'sex',
        'techuse_label': 'tech_usage',
    })

In [24]:
# Chris

# Tech usage by state: number of firms per state for five TECHUSE codes.
usage_by_state = to_dataframe(tech_state_request)
usage_by_state['state'] = usage_by_state['name']

# TECHUSE codes kept for this table, mapped to the column name used in the CSV.
usage_columns = {
    'T1E03B99': 'ai',
    'T2E03B99': 'cloud',
    'T3E03B99': 'software',
    'T4E03B99': 'robotics',
    'T5E03B99': 'equipment',
}
total_usage = list(usage_columns)
usage_by_state = usage_by_state[usage_by_state['techuse'].isin(total_usage)]

usage_by_state = usage_by_state[[
    'state', 'industry', 'techuse', 'tech_usage', 'number_of_firms']]

# One grouped sum keeps every total attached to its own state label. The previous
# version took the labels from unique() (order of appearance) and the totals from
# groupby (sorted by state), so the two only matched when the API happened to
# return the states in sorted order. Rows are now sorted by state, and a state
# without rows for one of the codes gets NaN in that column instead of breaking
# the assignment.
usage = (
    usage_by_state
    .groupby(['state', 'techuse'])['number_of_firms']
    .sum()
    .unstack('techuse')
    .reindex(columns=total_usage)
    .rename(columns=usage_columns)
    .rename_axis(columns=None)
)

usage.to_csv('CSV/usage_by_state.csv')

In [25]:
# Impact of technology usage on workers, by industry (national data)
tech_us = to_dataframe(tech_us_request)

impact_filter = (
    # rows carrying a flag in firmpdemp_f don't provide any useful information
    tech_us['firmpdemp_f'].isna()
    # keep only the '001' firm size code (all firm sizes)
    & (tech_us['nsfszfi'] == '001')
    # drop the '00' rows, which are the total of tech_usage_worker_impact
    & (tech_us['impactwf_u'] != '00')
)

# columns that make up the exported table
impact_columns = [
    'industry', 'naics2017', 'techuse', 'tech_usage', 'impactwf_u',
    'tech_usage_worker_impact', 'number_of_firms', 'number_of_employees',
    'annual_payroll',
]
worker_impact = tech_us.loc[impact_filter, impact_columns]

# export to CSV
worker_impact.to_csv('CSV/worker_impact.csv')

In [27]:
# Owner age by state: parse the stored response with owner_dataframe.
owners_age = owner_dataframe(owner_age_state_request)

In [None]:
# Sara

In [None]:
# Jasleen

summary_us = to_dataframe(summary_us_request)

# Values of the 'race' column that are left out of the export.
excluded_races = [
    'Total', 'Minority', 'Equally minority/nonminority',
    'Nonminority', 'Classifiable', 'Unclassifiable',
]

# 'revenue' is numeric after to_dataframe, so it has to be compared with the
# number 0. The earlier comparison with the string '0' was True for every row and
# therefore never removed anything.
revenue_disclosed = summary_us[
    (summary_us['revenue'] != 0)
    & ~summary_us['race'].isin(excluded_races)
]
revenue1 = revenue_disclosed[['revenue', 'race', 'industry']]

revenue1.to_csv('CSV/revenues_disclosed.csv')

In [None]:
summary_state = to_dataframe(summary_state_request)

# State-level rows, leaving out the listed 'race' values and the rows whose
# revenue_ranges flag is "S" or "D".
state_excluded_races = [
    'Total', "Minority", "Equally minority/nonminority",
    'Nonminority', 'Classifiable', 'Unclassifiable',
]
state_excluded_flags = ["S", "D"]

state_rows_kept = (
    ~summary_state['race'].isin(state_excluded_races)
    & ~summary_state['revenue_ranges'].isin(state_excluded_flags)
)
revenue_state = summary_state[state_rows_kept]

revenue_state.to_csv('CSV/revenue_states.csv')

In [None]:
# Quick look at the last rows of the revenue and industry columns.
check = summary_us.loc[:, ['revenue', 'industry']]
check.tail()

In [None]:
# Rows whose revenue is reported as 0 but that still carry a revenue_ranges value
# other than "S" or "D".

# Values of the 'race' column that are left out of the export.
hidden_excluded_races = [
    'Total', 'Minority', 'Equally minority/nonminority',
    'Nonminority', 'Classifiable', 'Unclassifiable',
]

# 'revenue' is numeric after to_dataframe, so it has to be compared with the
# number 0. The earlier comparison with the string '0' matched no row at all and
# left both exports empty.
revenue_hidden = summary_us[
    (summary_us['revenue'] == 0)
    & ~summary_us['revenue_ranges'].isin(['S', 'D'])
    & (summary_us['industry'] != "Total for all sectors")
    & ~summary_us['race'].isin(hidden_excluded_races)
]

revenue2 = revenue_hidden[['industry', 'race', 'revenue_ranges']]

# The same table is written under both file names, as before.
revenue2.to_csv('CSV/revenue_industry.csv')
revenue2.to_csv('CSV/revenues_hidden.csv')

In [18]:
# Armin

# NOTE(review): this cell only evaluates a string literal (it looks like a pasted
# response repr); no request or transformation happens here.
'<Response [200]>'