# ETL Process
### Extract, Transform, Load

In [44]:
# Importing relevant packages

import json
import os

import numpy as np
import pandas as pd
import requests

# Cap rendered DataFrames at 50 rows and 50 columns so wide survey tables stay readable.
pd.set_option('display.max_rows', 50, 'display.max_columns', 50)

### Extract

We gathered our data from the 2019 Annual Business Survey (ABS) through the U.S. Census Bureau's [APIs](https://www.census.gov/data/developers/data-sets/abs.2019.html).

In order to pull data from the API, you need to use an API key, which can be requested from the [API User Guide](https://www.census.gov/data/developers/guidance/api-user-guide.Help_&_Contact_Us.html).

With our API key, we generate URLs for the various datasets, including all the parameters of interest in our `get` variables. We probably won't use all of this information, but it's better to have it available and trim it down later.

In [75]:
# The Census API key is read from the CENSUS_API_KEY environment variable so
# the credential is never stored in (or shared with) the notebook.
# NOTE(review): a key was previously committed in this cell; treat it as
# compromised and request a new one.
# When the variable is unset the key parameter is left off the URLs.
# NOTE(review): the Census API is understood to accept a limited number of
# keyless requests per day - confirm against the API User Guide.
API_KEY = os.environ.get('CENSUS_API_KEY', '')

# Every ABS endpoint used below lives under this path.
ABS_BASE_URL = 'https://api.census.gov/data/2018'


def build_abs_url(dataset, variables, geography, qdesc_label=None):
    '''Builds the request URL for one ABS dataset.

    dataset     -- endpoint name appended to ABS_BASE_URL (e.g. 'abscs')
    variables   -- comma separated variable names for the `get` parameter
    geography   -- 'us' or 'state'; every area of that level is requested
    qdesc_label -- optional value for the QDESC_LABEL filter

    Returns the URL as a string. The `key` parameter is appended only when
    API_KEY is non-empty.'''
    url = f'{ABS_BASE_URL}/{dataset}?get={variables}&for={geography}:*'
    if qdesc_label is not None:
        url += f'&QDESC_LABEL={qdesc_label}'
    if API_KEY:
        url += f'&key={API_KEY}'
    return url


# Summary dataset (abscs)
SUMMARY_VARIABLES = 'GEO_ID,NAME,NAICS2017,NAICS2017_LABEL,SEX,SEX_LABEL,ETH_GROUP,ETH_GROUP_LABEL,RACE_GROUP,RACE_GROUP_LABEL,VET_GROUP,VET_GROUP_LABEL,EMPSZFI,EMPSZFI_LABEL,YEAR,FIRMPDEMP,FIRMPDEMP_F,RCPPDEMP,RCPPDEMP_F,EMP,EMP_F,PAYANN,PAYANN_F'
summary_us_url = build_abs_url('abscs', SUMMARY_VARIABLES, 'us')
summary_state_url = build_abs_url('abscs', SUMMARY_VARIABLES, 'state')

# Business characteristics dataset (abscb), one question per URL
BUSINESS_CHAR_VARIABLES = 'GEO_ID,NAME,NAICS2017,NAICS2017_LABEL,SEX,SEX_LABEL,ETH_GROUP,ETH_GROUP_LABEL,RACE_GROUP,RACE_GROUP_LABEL,VET_GROUP,VET_GROUP_LABEL,QDESC,BUSCHAR,BUSCHAR_LABEL,YEAR,FIRMPDEMP,FIRMPDEMP_F,FIRMPDEMP_PCT,FIRMPDEMP_PCT_F,RCPPDEMP,RCPPDEMP_F,RCPPDEMP_PCT,RCPPDEMP_PCT_F,EMP,EMP_F,EMP_PCT,EMP_PCT_F,PAYANN,PAYANN_F,PAYANN_PCT,PAYANN_PCT_F'
business_char_workers_us_url = build_abs_url('abscb', BUSINESS_CHAR_VARIABLES, 'us', 'WORKERS')
business_char_workers_state_url = build_abs_url('abscb', BUSINESS_CHAR_VARIABLES, 'state', 'WORKERS')
business_char_cust_us_url = build_abs_url('abscb', BUSINESS_CHAR_VARIABLES, 'us', 'CUST')
business_char_cust_state_url = build_abs_url('abscb', BUSINESS_CHAR_VARIABLES, 'state', 'CUST')
business_char_family_us_url = build_abs_url('abscb', BUSINESS_CHAR_VARIABLES, 'us', 'FAMOWN')
business_char_family_state_url = build_abs_url('abscb', BUSINESS_CHAR_VARIABLES, 'state', 'FAMOWN')

# Owner characteristics dataset (abscbo), one question per URL
OWNER_VARIABLES = 'GEO_ID,NAME,NAICS2017,NAICS2017_LABEL,OWNER_SEX,OWNER_SEX_LABEL,OWNER_ETH,OWNER_ETH_LABEL,OWNER_RACE,OWNER_RACE_LABEL,OWNER_VET,OWNER_VET_LABEL,QDESC,OWNCHAR,OWNCHAR_LABEL,YEAR,OWNPDEMP,OWNPDEMP_F,OWNPDEMP_PCT,OWNPDEMP_PCT_F'
owner_educ_us_url = build_abs_url('abscbo', OWNER_VARIABLES, 'us', 'EDUC')
owner_educ_state_url = build_abs_url('abscbo', OWNER_VARIABLES, 'state', 'EDUC')
owner_age_us_url = build_abs_url('abscbo', OWNER_VARIABLES, 'us', 'OWNRAGE')
owner_age_state_url = build_abs_url('abscbo', OWNER_VARIABLES, 'state', 'OWNRAGE')

# Technology dataset (abstcb)
TECH_VARIABLES = 'GEO_ID,NAME,NAICS2017,NAICS2017_LABEL,NSFSZFI,NSFSZFI_LABEL,TECHUSE,TECHUSE_LABEL,FACTORS_U,FACTORS_U_LABEL,IMPACTWF_U,IMPACTWF_U_LABEL,MOTUSETECH,MOTUSETECH_LABEL,FIRMPDEMP,FIRMPDEMP_F,FIRMPDEMP_PCT,FIRMPDEMP_PCT_F,RCPPDEMP,RCPPDEMP_F,RCPPDEMP_PCT,RCPPDEMP_PCT_F,EMP,EMP_F,EMP_PCT,EMP_PCT_F,PAYANN,PAYANN_F,PAYANN_PCT,PAYANN_PCT_F'
tech_us_url = build_abs_url('abstcb', TECH_VARIABLES, 'us')
tech_state_url = build_abs_url('abstcb', TECH_VARIABLES, 'state')

# Kept so any cell that still reads `get` sees the same value it held before.
get = TECH_VARIABLES



Once we have all the URLs, we can make the requests using<br>
    
    requests.get()
We store all the responses so we don't have to call the API repeatedly.<br> (API calls are limited to 500 a day)

In [76]:
# Making API calls

# Upper bound, in seconds, on how long a single request may wait on the server;
# without it a stalled connection would hang the kernel indefinitely.
REQUEST_TIMEOUT_SECONDS = 60


def fetch_response(url):
    '''Sends a GET request to url and returns the requests.Response.

    Raises requests.HTTPError when the server answers with a 4xx/5xx status,
    so a failed call is reported here rather than as a confusing parsing
    error further down the notebook.'''
    response = requests.get(url, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()
    return response


# Only the technology datasets are requested for now; uncomment a line below
# to pull the corresponding dataset as well.
# summary_us_request = fetch_response(summary_us_url)
# summary_state_request = fetch_response(summary_state_url)
# business_char_workers_us_request = fetch_response(business_char_workers_us_url)
# business_char_workers_state_request = fetch_response(business_char_workers_state_url)
# business_char_cust_us_request = fetch_response(business_char_cust_us_url)
# business_char_cust_state_request = fetch_response(business_char_cust_state_url)
# business_char_family_us_request = fetch_response(business_char_family_us_url)
# business_char_family_state_request = fetch_response(business_char_family_state_url)
# owner_educ_us_request = fetch_response(owner_educ_us_url)
# owner_educ_state_request = fetch_response(owner_educ_state_url)
# owner_age_us_request = fetch_response(owner_age_us_url)
# owner_age_state_request = fetch_response(owner_age_state_url)
tech_us_request = fetch_response(tech_us_url)
tech_state_request = fetch_response(tech_state_url)


### Transform

In this step, we build a function to do some preliminary cleaning across the datasets. It takes in the stored request from the API calls to clean the data.

It accomplishes this task by:
+ Removing the brackets and quotes
+ Changing fields with extra commas
+ Splitting each entry into its own row
+ Transforming the split string into a list
+ Extracting the headers and values
+ Creating the DataFrame

Once the DataFrame is created, we then take a few more steps to prep our data for use:
+ Changing the capitalized header names to lower case
+ Renaming columns to more descriptive names
+ Changing numerical columns from string type to numerical

In [83]:
# (old, new) pairs applied, in order, to every string cell. They keep the
# shortened, comma-free labels the notebook has always produced.
_LABEL_REPLACEMENTS = (
    ('Agriculture, forestry, fishing and hunting', 'Agriculture/Forestry/Fishing/Hunting'),
    ('Arts, entertainment, and recreation', 'Arts/Entertainment/Recreation'),
    ('Mining, quarrying, and oil and gas extraction', 'Mining/Quarrying/Oil&Gas Extraction'),
    ('Not applicable,', 'N/A'),
    ('Professional, scientific, and technical services', 'Professional/Scientific/Technical Services'),
    ('Some college, but no degree', 'Some college'),
    ('Technical, trade, or vocational', 'Technical/Trade/Vocational'),
    ('Tested,', 'Tested'),
)

# Lower-cased API columns that hold numbers; converted only when present.
_NUMERIC_COLUMNS = (
    'emp',
    'emp_pct',
    'firmpdemp',
    'firmpdemp_pct',
    'payann',
    'payann_pct',
    'rcppdemp',
    'rcppdemp_pct',
)

# Lower-cased API column name -> descriptive name.
_DESCRIPTIVE_NAMES = {
    'emp' : 'number_of_employees',
    'emp_pct' : 'percent_of_employees',
    'eth_group_label' : 'ethnicity',
    'factors_u_label' : 'usage_adverse_factors',
    'firmpdemp' : 'number_of_firms',
    'firmpdemp_pct' : 'percent_of_firms',
    'impactwf_u_label' : 'tech_usage_worker_impact',
    'motusetech_label' : 'tech_usage_purpose',
    'naics2017_label' : 'industry',
    'nsfszfi_label' : 'firm_size_category',
    'payann' : 'annual_payroll',
    'payann_pct' : 'percent_annual_payroll',
    'race_group_label' : 'race',
    'rcppdemp' : 'revenue',
    'rcppdemp_pct' : 'percent_revenue_of_firms',
    'sex' : 'sex_code',
    'sex_label' : 'sex',
    'techuse_label' : 'tech_usage',
}


def _clean_cell(value):
    '''Maps missing values to NaN and applies the label replacements to strings.'''
    if value is None or value == 'null':
        return np.nan
    if isinstance(value, str):
        for old, new in _LABEL_REPLACEMENTS:
            value = value.replace(old, new)
    return value


def to_dataframe(response):
    '''Takes API response, parses its text as JSON, cleans it and places it
    into a pandas dataframe which is returned to the user for analysis

    response -- any object whose `.text` holds a JSON array of rows, the
                first row being the column headers

    Returns a DataFrame with lower-cased, descriptively renamed columns.
    Missing values are NaN, and the known numeric columns that are present
    are converted to numeric dtypes.

    Raises ValueError when the text is not valid JSON or holds no header row.'''

    # Parsing the payload as JSON keeps commas inside a value from being
    # mistaken for field separators, so every row matches the header length.
    rows = json.loads(response.text)
    if not isinstance(rows, list) or not rows:
        raise ValueError('API response does not contain a header row')

    # First row holds the headers, the rest are the values
    headers, *records = rows
    records = [[_clean_cell(cell) for cell in record] for record in records]

    # Creating DataFrame
    df = pd.DataFrame(records, columns = headers)
    # Convert names into lower case
    df.columns = df.columns.str.lower()

    # Change numeric columns from str to numeric. Not every dataset carries
    # every measure, so columns absent from this response are skipped.
    for column in _NUMERIC_COLUMNS:
        if column in df.columns:
            df[column] = pd.to_numeric(df[column])

    # Renaming columns to more descriptive names
    return df.rename(columns = _DESCRIPTIVE_NAMES)

In [90]:
# Chris

# Parse the stored national ("us") technology response into a DataFrame.
tech = to_dataframe(tech_us_request)
# Dtypes and non-null counts for the columns at positions 10 through 29.
tech.iloc[:,10:30].info()
# Last expression in the cell, so the notebook renders the first 20 rows.
tech.head(20)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4440 entries, 0 to 4439
Data columns (total 20 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   impactwf_u                4440 non-null   object 
 1   tech_usage_worker_impact  4440 non-null   object 
 2   motusetech                4440 non-null   object 
 3   tech_usage_purpose        4440 non-null   object 
 4   number_of_firms           4440 non-null   int64  
 5   firmpdemp_f               193 non-null    object 
 6   percent_of_firms          4440 non-null   float64
 7   firmpdemp_pct_f           193 non-null    object 
 8   revenue                   4440 non-null   int64  
 9   rcppdemp_f                3710 non-null   object 
 10  percent_revenue_of_firms  4440 non-null   float64
 11  rcppdemp_pct_f            3710 non-null   object 
 12  number_of_employees       4440 non-null   int64  
 13  emp_f                     193 non-null    object 
 14  percent_

Unnamed: 0,geo_id,name,naics2017,industry,nsfszfi,firm_size_category,techuse,tech_usage,factors_u,usage_adverse_factors,impactwf_u,tech_usage_worker_impact,motusetech,tech_usage_purpose,number_of_firms,firmpdemp_f,percent_of_firms,firmpdemp_pct_f,revenue,rcppdemp_f,percent_revenue_of_firms,rcppdemp_pct_f,number_of_employees,emp_f,percent_of_employees,emp_pct_f,annual_payroll,payann_f,percent_annual_payroll,payann_pct_f,us
0,0100000US,United States,0,Total for all sectors,1,All firms,0,All firms,T1E19Q01,Artificial Intelligence: Technology was too ex...,0,All firms,0,All firms,364212,,7.7,,3611480651,,12.1,,9431290,,10.0,,547672280,,10.9,,1
1,0100000US,United States,0,Total for all sectors,1,All firms,0,All firms,T1E19Q02,Artificial Intelligence: Technology was not ma...,0,All firms,0,All firms,97824,,2.1,,4339811482,,14.6,,8840439,,9.4,,659811366,,13.1,,1
2,0100000US,United States,0,Total for all sectors,1,All firms,0,All firms,T1E19Q03,Artificial Intelligence: Lacked access to requ...,0,All firms,0,All firms,41326,,0.9,,2035327734,,6.8,,3435646,,3.7,,252592448,,5.0,,1
3,0100000US,United States,0,Total for all sectors,1,All firms,0,All firms,T1E19Q04,Artificial Intelligence: Required data not rel...,0,All firms,0,All firms,23983,,0.5,,2076128070,,7.0,,3499826,,3.7,,254157635,,5.0,,1
4,0100000US,United States,0,Total for all sectors,1,All firms,0,All firms,T1E19Q05,Artificial Intelligence: Lacked access to requ...,0,All firms,0,All firms,55217,,1.2,,2924464479,,9.8,,4928034,,5.2,,378066615,,7.5,,1
5,0100000US,United States,0,Total for all sectors,1,All firms,0,All firms,T1E19Q06,Artificial Intelligence: Laws and regulations,0,All firms,0,All firms,36626,,0.8,,1375991615,,4.6,,2136381,,2.3,,163479890,,3.2,,1
6,0100000US,United States,0,Total for all sectors,1,All firms,0,All firms,T1E19Q07,Artificial Intelligence: Concerns regarding sa...,0,All firms,0,All firms,50802,,1.1,,2675539382,,9.0,,5356187,,5.7,,326697581,,6.5,,1
7,0100000US,United States,0,Total for all sectors,1,All firms,0,All firms,T1E19Q08,Artificial Intelligence: Lacked access to capital,0,All firms,0,All firms,69524,,1.5,,1086906292,,3.6,,1983379,,2.1,,137964861,,2.7,,1
8,0100000US,United States,0,Total for all sectors,1,All firms,0,All firms,T1E19Q09,Artificial Intelligence: Technology not applic...,0,All firms,0,All firms,2209406,,46.6,,9854538084,,33.1,,37360534,,39.7,,1747138071,,34.7,,1
9,0100000US,United States,0,Total for all sectors,1,All firms,0,All firms,T1E19Q10,Artificial Intelligence: No factors adversely ...,0,All firms,0,All firms,2044962,,43.1,,13616954124,,45.7,,40948340,,43.5,,2301769757,,45.7,,1


rcppdemp_f  rcppdemp
B           0             81
D           0           2573
I           0            236
K           0            252
L           0            402
M           0            221
O           0            448
R           0           1067
S           0           5336
T           0            847
U           0           2371
W           0           4664
Name: rcppdemp, dtype: int64

In [None]:
# Sara

In [None]:
# Jasleen

In [None]:
# Armin